Efficiently count word frequencies in Python

走了就别回头了 2020-11-29 04:33

I'd like to count frequencies of all words in a text file.

>>> countInFile('test.txt')

should return {'aaa':1, 'bbb':

8 Answers
  •  臣服心动
    2020-11-29 05:04

    Instead of decoding the whole byte string read from the URL, I process the binary data directly. Because bytes.translate expects its second argument to be a byte string, I UTF-8 encode the punctuation first. After removing the punctuation, I UTF-8 decode each line.

    The function freq_dist expects an iterable of lines, which is why I pass data.splitlines().

    # Python 2 code: urllib2 became urllib.request in Python 3.
    from urllib2 import urlopen
    from collections import Counter
    from string import punctuation
    from time import time
    from pprint import pprint
    
    url = 'https://raw.githubusercontent.com/Simdiva/DSL-Task/master/data/DSLCC-v2.0/test/test.txt'
    
    data = urlopen(url).read()
    
    def freq_dist(data):
        """
        :param data: file-like object opened in binary mode or
                     sequence of byte strings separated by '\n'
        :type data: an iterable sequence
        """
        # For readability:
        # return Counter(word for line in data
        #                for word in line.translate(
        #                    None, punctuation.encode('utf-8')).decode('utf-8').split())

        # translate(None, punc) deletes every punctuation byte from the line;
        # the cleaned bytes are then decoded to unicode and split on whitespace.
        punc = punctuation.encode('utf-8')
        words = (word for line in data
                 for word in line.translate(None, punc).decode('utf-8').split())
        return Counter(words)
    
    
    start = time()
    word_dist = freq_dist(data.splitlines())
    print('elapsed: {}'.format(time() - start))
    pprint(word_dist.most_common(10))
    

    Output:

    elapsed: 0.806480884552
    
    [(u'de', 11106),
     (u'a', 6742),
     (u'que', 5701),
     (u'la', 4319),
     (u'je', 4260),
     (u'se', 3938),
     (u'\u043d\u0430', 3929),
     (u'na', 3623),
     (u'da', 3534),
     (u'i', 3487)]
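
    For reference, here is a minimal Python 3 sketch of the same approach (an untested port): urllib2 becomes urllib.request, while bytes.translate keeps the (table, delete) signature in Python 3, so the translate/decode pipeline carries over unchanged.

    # Minimal Python 3 sketch of the same approach.
    from urllib.request import urlopen
    from collections import Counter
    from string import punctuation
    from pprint import pprint

    url = 'https://raw.githubusercontent.com/Simdiva/DSL-Task/master/data/DSLCC-v2.0/test/test.txt'

    def freq_dist_py3(data):
        punc = punctuation.encode('utf-8')
        # bytes.translate(None, punc) still deletes the punctuation bytes.
        return Counter(word for line in data
                       for word in line.translate(None, punc).decode('utf-8').split())

    data = urlopen(url).read()
    pprint(freq_dist_py3(data.splitlines()).most_common(10))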
    

    In these timings, a plain dict appears to be more efficient than a Counter object.

    def freq_dist(data):
        """
        :param data: file-like object opened in binary mode or
                     sequence of byte strings separated by '\n'
        :type data: an iterable sequence
        """
        d = {}
        punc = punctuation.encode('utf-8')
        words = (word for line in data
                 for word in line.translate(None, punc).decode('utf-8').split())
        # dict.get with a default avoids a KeyError on a word's first occurrence.
        for word in words:
            d[word] = d.get(word, 0) + 1
        return d
    
    start = time()
    word_dist = freq_dist(data.splitlines())
    print('elapsed: {}'.format(time() - start))
    pprint(sorted(word_dist.items(), key=lambda x: (x[1], x[0]), reverse=True)[:10])
    

    Output:

    elapsed: 0.642680168152
    
    [(u'de', 11106),
     (u'a', 6742),
     (u'que', 5701),
     (u'la', 4319),
     (u'je', 4260),
     (u'se', 3938),
     (u'\u043d\u0430', 3929),
     (u'na', 3623),
     (u'da', 3534),
     (u'i', 3487)]
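
    That said, the gap is version dependent: in Python 2, Counter.update is a pure-Python loop, while modern CPython 3 accelerates it with a C helper, so it is worth timing both on your own interpreter. A minimal benchmark sketch over a synthetic (hypothetical) word list:

    from collections import Counter
    from timeit import timeit

    words = ['de', 'a', 'que', 'la', 'je'] * 200000  # synthetic corpus

    def count_with_dict(ws):
        d = {}
        for w in ws:
            d[w] = d.get(w, 0) + 1
        return d

    print('dict:   ', timeit(lambda: count_with_dict(words), number=5))
    print('Counter:', timeit(lambda: Counter(words), number=5))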
    

    To be more memory efficient with a huge file, pass just the opened URL object, so that lines are streamed one at a time rather than read into memory all at once. Note that the timing will then include the download time as well.

    data = urlopen(url)
    word_dist = freq_dist(data)
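
    The same streaming works for a local file opened in binary mode; a small sketch, assuming a local test.txt:

    # Hypothetical local-file variant: iterating the file object yields one
    # line at a time, so the whole file never sits in memory at once.
    with open('test.txt', 'rb') as f:  # binary mode: freq_dist expects bytes
        word_dist = freq_dist(f)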
    
