每天一个python小程序(4)--统计单词频数

给定任意一个英文纯文本文件,统计其中每个单词出现的频数。

这一题使用collections模块中的Counter类会非常简单。

#!/usr/bin/python
#encoding=utf-8

import collections
import re

#增加统计单词总数和最长行
def count_word(file_name):
    f = open(file_name)

    word_counter = collections.Counter()

    #文件对象使用迭代器
    for line in f:
        words = re.findall('\w+', line.lower())
        word_counter.update(words)

    #迭代器对象只能遍历一次,欲再次遍历需要重现构建迭代器对象
    f = open(file_name)
    #计算单词总数 p206
    word_sum = len([word for line in f for word in line.split()])


    #最长行
    f = open(file_name)
    longest = max(len(line.split()) for line in f)

    print "the longest line's length is %d" % longest
    f.close()

    return word_counter, word_sum

if __name__ == '__main__':
    # Demo entry point: analyse 'test.txt', resolved relative to the current
    # working directory.
    word_counter, word_sum = count_word('test.txt')
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('there are %d words in this article' % word_sum)
    # most_common(3) yields the three most frequent words with their counts,
    # ordered from most to least frequent.
    for key, count in word_counter.most_common(3):
        print('%s: %d' % (key, count))

代码在这里