# Lecture Notes on 6 Aug 2014

def filter_string (st):
  """Return a copy of st with every character outside 'a'..'z' replaced
  by a single space.

  Upper-case letters, digits, punctuation and whitespace all become
  spaces, so the result always has the same length as the input.
  """
  # Decide character by character, then assemble the result in one join.
  pieces = [ch if 'a' <= ch <= 'z' else ' ' for ch in st]
  return ''.join (pieces)

def _count_words (lines):
  """Return (word_dict, total_words) for an iterable of text lines.

  word_dict maps each word to the number of times it occurs and
  total_words is the number of words seen, duplicates included.
  """
  word_dict = {}
  total_words = 0
  for line in lines:
    # lower-case before filtering: filter_string only keeps 'a'..'z', so
    # upper-case letters would otherwise be turned into spaces
    cleaned = filter_string (line.strip().lower())

    # every non-letter is now a space, so split() yields the words
    for word in cleaned.split():
      total_words += 1
      word_dict[word] = word_dict.get (word, 0) + 1
  return word_dict, total_words


def _group_by_frequency (word_dict):
  """Return a dict mapping each count to the list of words having it."""
  freq_dict = {}
  for word, freq in word_dict.items():
    freq_dict.setdefault (freq, []).append (word)
  return freq_dict


def main (path = './hard_times.txt'):
  """Print word statistics for the text file at path.

  The report contains, in order: the total number of words, the number
  of unique words, the ratio of unique to total words, every word with
  its count in alphabetical order, and finally the words grouped by
  count from the most frequent to the least frequent.

  path -- file to analyse; defaults to './hard_times.txt'.

  Raises whatever open() raises when the file cannot be read.
  """
  # the with-block closes the book even if reading fails part-way
  with open (path, 'r') as book:
    word_dict, total_words = _count_words (book)

  # print total number of words
  print ('Total words used = ', total_words)

  # print unique number of words
  num_unique = len (word_dict)
  print ('Number of unique words = ', num_unique)

  # ratio of unique words to total words; a file without any words would
  # otherwise divide by zero, so report 0.0 in that case
  word_ratio = num_unique / total_words if total_words else 0.0
  print ('Unique words / Total words = ', word_ratio)

  # print word frequency in alphabetical order
  for word in sorted (word_dict):
    print (word + " : " + str (word_dict[word]))

  print ("\n")

  # print according to frequency, highest count first
  freq_dict = _group_by_frequency (word_dict)
  for freq in sorted (freq_dict, reverse = True):
    print (str(freq) + " : " + str (freq_dict[freq]))

# Run the word-frequency report. NOTE: there is no __name__ guard here, so
# this call also executes whenever the module is imported.
main()