#入门学习系列的内容均是在学习《Python编程入门(第3版)》时的学习笔记
统计一个文本文档的信息,并输出出现频率最高的10个单词
#text.py

# Characters that survive normalization: the lowercase ASCII letters, the
# space, the hyphen and the apostrophe. Built from a string so that no letter
# can be lost to a missing comma (adjacent string literals are silently
# concatenated, which previously turned 'p' and 'q' into one element 'pq').
keep = set("abcdefghijklmnopqrstuvwxyz -'")


# Normalize text.
def normalize(s):
    """Return s lowercased, with every character not in ``keep`` removed."""
    return ''.join(c for c in s.lower() if c in keep)


# Print basic information about a text file.
def file_stats(fname):
    """Print the character, line and word counts of the file fname.

    Lines are counted as the number of newline characters; words are the
    whitespace-separated pieces of the normalized text.
    """
    # The context manager closes the file even if reading fails.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)


# Convert a string into a word-frequency dictionary.
def make_freq_dict(s):
    """Return a dict mapping each word of the normalized s to its count."""
    d = {}
    for w in normalize(s).split():
        d[w] = d.get(w, 0) + 1
    return d


# Print basic information about a text file, plus its most frequent words.
def file_stats2(fname):
    """Print counts for the file fname and its 10 most frequent words.

    Words are ranked by count, highest first; words with equal counts are
    listed in reverse alphabetical order.
    """
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())
    # (count, word) pairs sort by count first, then by word.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    print("\nThe top 10 most frequent words are:")
    # Slice to 10 so the listing matches the heading above.
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %4s %s' % (i, count, word))
>>> file_stats2('a.txt')
The file a.txt has:
 12927 characters
 297 lines
 1645 words

The top 10 most frequent words are:
 1.   62 to
 2.   62 the
 3.   47 is
 4.   42 a
 5.   41 of
 6.   40 it
 7.   36 that
 8.   35 and
 9.   32 as
10.   24 so
进一步完善的代码:
#text.py

# Characters that survive normalization: the lowercase ASCII letters, the
# space, the hyphen and the apostrophe. Built from a string so that no letter
# can be lost to a missing comma (adjacent string literals are silently
# concatenated, which previously turned 'p' and 'q' into one element 'pq').
keep = set("abcdefghijklmnopqrstuvwxyz -'")


# Normalize text.
def normalize(s):
    """Return s lowercased, with every character not in ``keep`` removed."""
    return ''.join(c for c in s.lower() if c in keep)


# Print basic information about a text file.
def file_stats(fname):
    """Print the character, line and word counts of the file fname.

    Lines are counted as the number of newline characters; words are the
    whitespace-separated pieces of the normalized text.
    """
    # The context manager closes the file even if reading fails.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)


# Convert a string into a word-frequency dictionary.
def make_freq_dict(s):
    """Return a dict mapping each word of the normalized s to its count."""
    d = {}
    for w in normalize(s).split():
        d[w] = d.get(w, 0) + 1
    return d


# Print extended information about a text file.
def file_stats2(fname):
    """Print counts, vocabulary statistics and the top 10 words of fname.

    Reported values: characters, lines (number of newline characters),
    total words, words occurring exactly once, number of different words,
    and the average length of the different words. Words are ranked by
    count, highest first; words with equal counts are listed in reverse
    alphabetical order.
    """
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_different_words = len(d)
    num_words = sum(d.values())
    # A file without any words has no meaningful average; report 0.0 rather
    # than dividing by zero.
    if num_different_words:
        words_average_length = sum(len(w) for w in d) / num_different_words
    else:
        words_average_length = 0.0
    num_once = sum(1 for count in d.values() if count == 1)
    # (count, word) pairs sort by count first, then by word.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    print(" %s words appreance one time" % num_once)
    print(" %s different words" % int(num_different_words))
    print(" %s average length" % words_average_length)
    print("\nThe top 10 most frequent words are:")
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %4s %s' % (i, count, word))


def main():
    """Print the statistics of the file a.txt in the current directory."""
    file_stats2('a.txt')


if __name__ == '__main__':
    main()
>>> ================================ RESTART ================================
>>>
The file a.txt has:
 12927 characters
 297 lines
 1645 words
 515 words appreance one time
 699 different words
 6.539341917024321 average length

The top 10 most frequent words are:
 1.   62 to
 2.   62 the
 3.   47 is
 4.   42 a
 5.   41 of
 6.   40 it
 7.   36 that
 8.   35 and
 9.   32 as
10.   24 so
时间: 2024-11-08 11:01:50