# Lecture Notes on 25 Nov 2013

# keeps lower case letters and blanks out everything else
def filter_string (st):
  """Return a copy of st in which only the characters a-z survive.

  Every other character (upper case letters, digits, punctuation,
  whitespace, ...) is replaced by a single space, so the result has
  one piece per element of st.
  """
  cleaned = [ch if 'a' <= ch <= 'z' else ' ' for ch in st]
  return ''.join (cleaned)

def main():
  """Print word statistics for the book stored in 'hard_times.txt'.

  Each line of the file is stripped, lower-cased, passed through
  filter_string and split on whitespace to obtain its words.  The
  function then prints, in this order:

    * the total number of words,
    * the number of distinct words,
    * the ratio distinct / total (0.0 when the book has no words),
    * every distinct word with its count, in alphabetical order,
    * every count with the list of words having that count, highest
      count first.

  Raises whatever open() raises if the file cannot be opened.
  """
  # word -> number of times it occurs in the book
  word_dict = {}

  # track total number of words
  total_words = 0

  # the with-statement closes the book even if reading fails part-way
  with open ('hard_times.txt', 'r') as book:
    # read book line by line
    for line in book:
      # trim, lower-case and filter the line, then get its words
      word_list = filter_string (line.strip().lower()).split()

      # update the running total and the per-word counts
      total_words += len (word_list)
      for word in word_list:
        word_dict [word] = word_dict.get (word, 0) + 1

  # print total number of words
  print ('Total words used = ', total_words)

  # print unique number of words (the dictionary has one key per word)
  num_unique_words = len (word_dict)
  print ('Number of unique words = ', num_unique_words)

  # ratio of unique words to total words; a book without any words
  # would otherwise cause a division by zero
  if total_words > 0:
    word_ratio = num_unique_words / total_words
  else:
    word_ratio = 0.0
  print ('Unique words / Total words = ', word_ratio)

  # print word frequency in alphabetical order
  for word in sorted (word_dict):
    print (word + " : " + str (word_dict [word]))

  # get distribution according to frequency: count -> list of words,
  # each list in the order the words first appeared in the book
  freq_dict = {}
  for word, freq in word_dict.items():
    freq_dict.setdefault (freq, []).append (word)

  # print according to frequency, most frequent first
  for freq in sorted (freq_dict, reverse = True):
    print (str (freq) + " : " +  str(freq_dict[freq]))

# run the analysis only when this file is executed as a script,
# not when it is imported from another module
if __name__ == '__main__':
  main()