/* * This program prints the twenty most frequent words in a text file. * * Run on William Shakespeare's King Henry VIII, the top twenty words * are: * * the 943 * and 735 * to 600 * i 579 * of 539 * a 457 * you 416 * my 371 * that 323 * in 298 * your 270 * his 268 * this 255 * me 233 * king 216 * for 210 * he 201 * is 193 * him 192 * not 191 * */ #include #include #include #define MAXWORDS 10000 #define MAXSTRING 100 /* structure holding word frequency information */ typedef struct _word { char s[MAXSTRING]; /* the word */ int count; /* number of times word occurs */ } word; /* * This function inserts a word into the list of words. If the word is * not yet in the list, a new slot is used with the count set to 1. * Otherwise, the count is simply incremented. */ void insert_word (word *words, int *n, char *s) { int i; /* linear search for the word */ for (i=0; i<*n; i++) if (strcmp (s, words[i].s) == 0) { /* found it? increment and return. */ words[i].count++; return; } /* error conditions... */ if (strlen (s) >= MAXSTRING) { fprintf (stderr, "word too long!\n"); exit (1); } if (*n >= MAXWORDS) { fprintf (stderr, "too many words!\n"); exit (1); } /* copy the word into the structure at the first available slot, * i.e., *n */ strcpy (words[*n].s, s); /* this word has occured once up to now, so count = 1 */ words[*n].count = 1; /* one more word */ (*n)++; } /* comparison function for quicksort. this lets quicksort sort words * by descending order of count, i.e., from most to least frequent */ int wordcmp (word *a, word *b) { if (a->count < b->count) return +1; if (a->count > b->count) return -1; return 0; } /* return 1 if c is alphabetic (a..z or A..Z), 0 otherwise */ int is_alpha (char c) { if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') return 1; return 0; } /* remove the i'th character from the string s */ void remove_char (char *s, int i) { while (s[i]) { i++; s[i-1] = s[i]; } s[i] = 0; } /* remove non-alphabetic characters from the string s */ void remove_non_alpha (char *s) { int i; for (i=0; s[i]; i++) if (!is_alpha (s[i])) remove_char (s, i); } /* make all the letters in s lowercase */ void make_lowercase (char *s) { int i; for (i=0; s[i]; i++) s[i] = tolower (s[i]); } /* main program */ int main () { word words[MAXWORDS]; char s[1000]; int i, n, m; n = 0; /* read all the words in the file... */ while (!feof (stdin)) { scanf ("%s", s); /* only insert the word if it's not punctuation */ if (is_alpha (s[0])) { /* get rid of non-letters */ remove_non_alpha (s); /* make all letters lowercase */ make_lowercase (s); /* put this word in the list */ insert_word (words, &n, s); } } /* sort the list of words by descending frequency */ qsort((void *) words, n, sizeof (word), (int (*) (const void *, const void *)) wordcmp); /* if fewer than 20 words in total, just print up the the * first n words */ if (n < 20) m = n; else m = 20; /* print the words with their frequencies */ for (i=0; i