I am trying to speed up my project to count word frequencies. I have 360+ text files, and I need to get the total number of words and the number of times each word from another list appears in each file.
Here is simple working code that counts word frequencies in a single text file:
import string

def process_file(filename):
    hist = dict()
    with open(filename, 'r') as f:
        for line in f:
            process_line(line, hist)
    return hist

def process_line(line, hist):
    line = line.replace('-', ' ')   # split hyphenated words
    for word in line.split():
        word = word.strip(string.punctuation + string.whitespace)
        word = word.lower()         # lower() returns a new string, so reassign it
        hist[word] = hist.get(word, 0) + 1

hist = process_file(filename)       # filename is the path of the file to count
print(hist)
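To cover many files, one possible sketch (reusing process_file above, and assuming the .txt files live in the folder used later in my code) is to merge the per-file histograms with collections.Counter:

import glob
from collections import Counter

totals = Counter()
for filename in glob.glob('C:\\Users\\user\\Desktop\\sentiment2020\\POSITIVE\\*.txt'):
    totals.update(process_file(filename))   # Counter.update adds the per-file counts

print(sum(totals.values()))      # total number of words across all files
print(totals.most_common(10))    # the ten most frequent words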
My current code, which walks the folder and reads each file, is below:

import re, os, sys, codecs, fnmatch
import decimal
import zipfile
import glob
import csv

path = 'C:\\Users\\user\\Desktop\\sentiment2020\\POSITIVE'
files = []
for r, d, f in os.walk(path):
    for file in f:
        if '.txt' in file:
            files.append(os.path.join(r, file))

for f in files:
    print(f)
    file1 = codecs.open(f, 'r', 'utf8', errors='ignore')
    content = file1.read()
    words = content.split()
    for x in words:
        print(x)

dicts = []
if __name__ == "__main__":
    str = words        # note: this shadows the built-in str and only holds the last file's words
    str2 = []
    for i in str:
        if i not in str2:
            str2.append(i)
    for i in range(0, len(str2)):
        a = {str2[i]: str.count(str2[i])}
        dicts.append(a)
for i in dicts:
    print(dicts)

# for i in range(len(files)):
#     with codecs.open('C:\\Users\\user\\Desktop\\sentiment2020\\NEGETIVE1\\sad1%s.txt' % i, 'w', "utf8") as filehandle:
#         filehandle.write('%s\n' % dicts)
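The unique-word loop above is the slow part: list.count() rescans the whole word list once per distinct word, which is quadratic. As a rough sketch (using the words list from the loop above, which only holds the last file read), collections.Counter does the same tally in a single pass:

from collections import Counter

word_counts = Counter(words)          # one pass over the word list
print(len(words))                     # total number of words in that file
print(word_counts.most_common(10))    # the ten most frequent words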
collections.Counter() has this covered, if I understand your problem correctly.
The example from the docs seems to match what you are doing.
# Tally occurrences of words in a list
from collections import Counter

cnt = Counter()
for word in ['red', 'blue', 'red', 'green', 'blue', 'blue']:
    cnt[word] += 1
print(cnt)

# Find the ten most common words in Hamlet
import re
words = re.findall(r'\w+', open('hamlet.txt').read().lower())
Counter(words).most_common(10)
From the example above you should be able to do:
import re
import collections

words = re.findall(r'\w+', open('1976.03.txt').read().lower())
print(collections.Counter(words))
EDIT: a naive approach to show one way of counting only the words you want.
import re
from collections import Counter

wanted = {"fish", "chips", "steak"}   # use a set: "in" on a string would do substring matching
cnt = Counter()
words = re.findall(r'\w+', open('1976.03.txt').read().lower())
for word in words:
    if word in wanted:
        cnt[word] += 1
print(cnt)
One possible implementation (using Counter)...
Instead of printing the output, I think it would be simpler to write to a csv file and import that into Excel. Look at http://docs.python.org/2/library/csv.html and replace print_summary with a csv-writing action (see the sketch after the code below).
import os
from collections import Counter
import glob

def word_frequency(fileobj, words):
    """Build a Counter of specified words in fileobj"""
    # initialise the counter to 0 for each word, so absent words still appear
    ct = Counter(dict((w, 0) for w in words))
    file_words = (word for line in fileobj for word in line.split())
    filtered_words = (word for word in file_words if word in words)
    ct.update(filtered_words)
    return ct

def count_words_in_dir(dirpath, words, action=None):
    """For each .txt file in a dir, count the specified words"""
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            ct = word_frequency(f, words)
            if action:
                action(filepath, ct)

def print_summary(filepath, ct):
    words = sorted(ct.keys())
    counts = [str(ct[k]) for k in words]
    print('{0}\n{1}\n{2}\n\n'.format(
        filepath,
        ', '.join(words),
        ', '.join(counts)))

words = set(['inflation', 'jobs', 'output'])
count_words_in_dir('./', words, action=print_summary)
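As a rough sketch of the CSV idea mentioned above (the helper name make_csv_action and the file name counts.csv are my own, not part of the code above), an action can append one row per file instead of printing:

import csv

def make_csv_action(csv_path, words):
    """Return an action(filepath, ct) that appends one row per file to csv_path."""
    columns = sorted(words)
    with open(csv_path, 'w', newline='') as out:
        csv.writer(out).writerow(['file'] + columns)   # header row

    def action(filepath, ct):
        with open(csv_path, 'a', newline='') as out:
            csv.writer(out).writerow([filepath] + [ct[w] for w in columns])
    return action

words = set(['inflation', 'jobs', 'output'])
count_words_in_dir('./', words, action=make_csv_action('counts.csv', words))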