I am trying to speed up my project to count word frequencies. I have 360+ text files, and I need to get the total number of words and the number of times each word from anot
One possible implementation (using Counter)...
Instead of printing the output, I think it would be simpler to write to a csv file and import that into Excel. Look at http://docs.python.org/2/library/csv.html and replace print_summary.
import os
from collections import Counter
import glob
def word_frequency(fileobj, words):
"""Build a Counter of specified words in fileobj"""
# initialise the counter to 0 for each word
ct = Counter(dict((w, 0) for w in words))
file_words = (word for line in fileobj for word in line.split())
filtered_words = (word for word in file_words if word in words)
return Counter(filtered_words)
def count_words_in_dir(dirpath, words, action=None):
"""For each .txt file in a dir, count the specified words"""
for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
with open(filepath) as f:
ct = word_frequency(f, words)
if action:
action(filepath, ct)
def print_summary(filepath, ct):
words = sorted(ct.keys())
counts = [str(ct[k]) for k in words]
print('{0}\n{1}\n{2}\n\n'.format(
filepath,
', '.join(words),
', '.join(counts)))
words = set(['inflation', 'jobs', 'output'])
count_words_in_dir('./', words, action=print_summary)