# Lecture Notes on 7 August 2013

from collections import Counter, defaultdict

# Characters treated as word separators. The apostrophe is included so that
# contractions and possessives are split into their parts ("don't" -> "don t").
_SEPARATORS = '.,;:?!-()[]{}"@#$%^&*+=~`|/<>_' + "'"

# Built once at import time so pre_process is a single C-level pass per call.
_SEPARATORS_TO_SPACE = str.maketrans(dict.fromkeys(_SEPARATORS, ' '))


def pre_process(s):
    """Return *s* with every punctuation mark replaced by one space.

    Each separator character (the punctuation marks listed in
    ``_SEPARATORS``, apostrophe included) becomes exactly one space, so the
    result always has the same length as the input. All other characters,
    including existing whitespace, are left untouched.

    >>> pre_process("don't stop!")
    'don t stop '
    """
    return s.translate(_SEPARATORS_TO_SPACE)


def main(filename='copperfield.txt'):
    """Print word-frequency statistics for the text file *filename*.

    The file is read line by line; each line is stripped, lower-cased and
    passed through ``pre_process`` before being split on whitespace.

    Three sections are printed, in this order:

    1. every distinct word with its count, in alphabetical order;
    2. the line ``Maximum frequency = N`` (N is 0 for an empty file);
    3. for each distinct count, from highest to lowest, the count followed
       by the alphabetically sorted list of words having that count.

    Raises:
        OSError: if *filename* cannot be opened for reading.
    """
    word_freq = Counter()
    # The context manager closes the file even if reading raises part-way.
    with open(filename, 'r') as in_file:
        for line in in_file:
            cleaned = pre_process(line.strip().lower())
            word_freq.update(cleaned.split())

    word_list = sorted(word_freq)
    for word in word_list:
        print(word, word_freq[word])

    # default=0 keeps the summary line meaningful when no words were found.
    max_freq = max(word_freq.values(), default=0)
    print('Maximum frequency = ' + str(max_freq))

    # Group words by frequency. Walking word_list (already sorted) keeps
    # each group in alphabetical order.
    freq_list = defaultdict(list)
    for word in word_list:
        freq_list[word_freq[word]].append(word)

    for freq in sorted(freq_list, reverse=True):
        print(freq, freq_list[freq])


if __name__ == '__main__':
    main()