Python - Finding word frequencies of list of words in text file

渐次进展 2020-12-01 12:46

I am trying to speed up my project to count word frequencies. I have 360+ text files, and I need to get the total number of words and the number of times each word from another list of words appears in each file.

4 Answers
  • 2020-12-01 13:23

    Some simple, functional code to count word frequencies in a text file:

    import string
    
    def process_file(filename):
        hist = dict()
        with open(filename) as f:
            for line in f:
                process_line(line, hist)
        return hist
    
    def process_line(line, hist):
        # split hyphenated words, then strip surrounding punctuation
        line = line.replace('-', ' ')
        for word in line.split():
            word = word.strip(string.punctuation + string.whitespace)
            word = word.lower()
            hist[word] = hist.get(word, 0) + 1
    
    hist = process_file('1976.03.txt')  # path to one of your text files
    print(hist)
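
    Since the question also asks for the total word count and the counts of a specific list of words, a minimal follow-up sketch (assuming the hist returned above, with a hypothetical wanted list) could be:

    total = sum(hist.values())                 # total number of words in the file
    wanted = ['inflation', 'jobs', 'output']   # hypothetical list of words to look up
    for w in wanted:
        print(w, hist.get(w, 0))               # 0 if the word never appears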
    
  • 2020-12-01 13:27
    import os
    import codecs
    
    path = 'C:\\Users\\user\\Desktop\\sentiment2020\\POSITIVE'
    
    # collect every .txt file under path
    files = []
    for r, d, f in os.walk(path):
        for file in f:
            if file.endswith('.txt'):
                files.append(os.path.join(r, file))
    
    # build one {word: count} dict per file
    dicts = []
    for f in files:
        print(f)
        with codecs.open(f, 'r', 'utf8', errors='ignore') as file1:
            content = file1.read()
    
        counts = {}
        for word in content.split():
            counts[word] = counts.get(word, 0) + 1
        dicts.append(counts)
    
    for d in dicts:
        print(d)
    
    #  for i in range(len(files)):
    #      with codecs.open('C:\\Users\\user\\Desktop\\sentiment2020\\NEGETIVE1\\sad1%s.txt' % i, 'w', 'utf8') as filehandle:
    #          filehandle.write('%s\n' % dicts)
    
  • 2020-12-01 13:43

    collections.Counter() has this covered if I understand your problem.

    The example from the docs would seem to match your problem.

    from collections import Counter
    
    # Tally occurrences of words in a list
    cnt = Counter()
    for word in ['red', 'blue', 'red', 'green', 'blue', 'blue']:
        cnt[word] += 1
    print(cnt)
    
    
    # Find the ten most common words in Hamlet
    import re
    words = re.findall(r'\w+', open('hamlet.txt').read().lower())
    Counter(words).most_common(10)
    

    From the example above you should be able to do:

    import re
    import collections
    words = re.findall(r'\w+', open('1976.03.txt').read().lower())
    print(collections.Counter(words))
    

    EDIT: a naive approach to show one way.

    import re
    from collections import Counter
    
    wanted = {"fish", "chips", "steak"}   # a set, so only whole words match
    cnt = Counter()
    words = re.findall(r'\w+', open('1976.03.txt').read().lower())
    for word in words:
        if word in wanted:
            cnt[word] += 1
    print(cnt)
    
  • 2020-12-01 13:49

    One possible implementation (using Counter)...

    Instead of printing the output, I think it would be simpler to write to a csv file and import that into Excel. Look at http://docs.python.org/2/library/csv.html and replace print_summary (a minimal csv-writing sketch follows the code below).

    import os
    from collections import Counter
    import glob
    
    def word_frequency(fileobj, words):
        """Build a Counter of specified words in fileobj"""
        # initialise the counter to 0 for each word, so absent words still appear
        ct = Counter(dict((w, 0) for w in words))
        file_words = (word for line in fileobj for word in line.split())
        filtered_words = (word for word in file_words if word in words)
        ct.update(filtered_words)
        return ct
    
    
    def count_words_in_dir(dirpath, words, action=None):
        """For each .txt file in a dir, count the specified words"""
        for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
            with open(filepath) as f:
                ct = word_frequency(f, words)
                if action:
                    action(filepath, ct)
    
    
    def print_summary(filepath, ct):
        words = sorted(ct.keys())
        counts = [str(ct[k]) for k in words]
        print('{0}\n{1}\n{2}\n\n'.format(
            filepath,
            ', '.join(words),
            ', '.join(counts)))
    
    
    words = set(['inflation', 'jobs', 'output'])
    count_words_in_dir('./', words, action=print_summary)
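
    As suggested above, print_summary can be swapped for an action that writes one csv row per file; a minimal sketch, assuming a hypothetical output file name word_counts.csv:

    import csv
    
    def write_summary(filepath, ct, outfile='word_counts.csv'):
        """Append one row per input file: the file path followed by each word's count."""
        keys = sorted(ct.keys())
        with open(outfile, 'a', newline='') as out:
            csv.writer(out).writerow([filepath] + [ct[k] for k in keys])
    
    count_words_in_dir('./', words, action=write_summary)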
    