# Lecture Notes on 25 Nov 2013
#
# Word-frequency analysis of a book: counts the total and unique words in
# 'hard_times.txt', then prints every word with its count and the words
# grouped by how often they occur.

from collections import Counter, defaultdict


def filter_string(st):
    """Return st with every character outside 'a'..'z' replaced by a space.

    Only lowercase ASCII letters are kept, so callers that want to retain
    uppercase letters must lower-case the text before calling this function.
    The result always has the same length as the input.
    """
    return ''.join(ch if 'a' <= ch <= 'z' else ' ' for ch in st)


def _count_words(lines):
    """Return a Counter mapping each word found in lines to its occurrences.

    Each line is stripped, lower-cased and passed through filter_string
    before being split on whitespace. The Counter keeps words in order of
    first appearance.
    """
    word_counts = Counter()
    for line in lines:
        # lower-case first: filter_string blanks out anything but a-z
        cleaned = filter_string(line.strip().lower())
        word_counts.update(cleaned.split())
    return word_counts


def _group_by_frequency(word_counts):
    """Return a dict mapping each frequency to the list of words having it.

    Words inside each list keep the iteration order of word_counts.
    """
    freq_groups = defaultdict(list)
    for word, freq in word_counts.items():
        freq_groups[freq].append(word)
    return freq_groups


def main():
    """Read 'hard_times.txt' and print its word statistics to stdout.

    Prints the total word count, the number of unique words, their ratio,
    every word with its count (alphabetical order) and finally the words
    grouped by frequency (highest frequency first).

    Raises whatever open() raises if the book cannot be opened.
    """
    # the with-block guarantees the book is closed even if reading fails
    with open('hard_times.txt', 'r') as book:
        word_counts = _count_words(book)

    # print total number of words
    total_words = sum(word_counts.values())
    print('Total words used = ', total_words)

    # print unique number of words
    num_unique_words = len(word_counts)
    print('Number of unique words = ', num_unique_words)

    # ratio of unique words to total words; an empty book has no words,
    # so report 0.0 instead of dividing by zero
    word_ratio = num_unique_words / total_words if total_words else 0.0
    print('Unique words / Total words = ', word_ratio)

    # print word frequency
    for word in sorted(word_counts):
        print(word + " : " + str(word_counts[word]))

    # print according to frequency, most frequent first
    freq_groups = _group_by_frequency(word_counts)
    for freq in sorted(freq_groups, reverse=True):
        print(str(freq) + " : " + str(freq_groups[freq]))


# run the analysis only when executed as a script, not when imported
if __name__ == "__main__":
    main()