Simple implementation of N-Gram, tf-idf and Cosine similarity in Python

前端 未结 5 1706
逝去的感伤
逝去的感伤 2020-11-28 17:58

I need to compare documents stored in a DB and come up with a similarity score between 0 and 1.

The method I need to use has to be very simple. Implementing a vanil

5条回答
  •  情歌与酒
    2020-11-28 18:36

    Here's an answer using just Python + NumPy. In short:

    Cosine:

    def cosine_sim(u,v):
        """
        Cosine similarity of two equal-length numeric vectors.

        Computes dot(u, v) / (|u| * |v|). Relies on `np` (numpy) and
        `sqrt` (math) being imported by the surrounding code.

        The cosine is undefined when either vector has zero magnitude;
        0.0 is returned in that case instead of dividing by zero.
        """
        magnitude = sqrt(np.dot(u, u)) * sqrt(np.dot(v, v))
        if magnitude == 0:
            # An all-zero (or empty) vector shares no direction with anything.
            return 0.0
        return np.dot(u, v) / magnitude
    

    Ngrams:

    def ngrams(sentence, n):
        """
        Return the word n-grams of a sentence as a list of tuples.

        The sentence is split on whitespace and every run of `n`
        consecutive words becomes one tuple. If the sentence has fewer
        than `n` words (or `n` is 0) the result is an empty list.

        The result is always a concrete list, so it prints and indexes
        the same way on Python 2 and Python 3.
        """
        # Split once; each shifted slice below reuses the same word list.
        words = sentence.split()
        return list(zip(*[words[i:] for i in range(n)]))
    

    TF-IDF (it's a little unusual, because the idf here is the plain ratio N/df with no logarithm, but it works):

    def tfidf(corpus, vocab):
        """
        Weight every document's term counts by tf * idf.

        tf  = count of the term in the document / total count in the document
        idf = number of documents / number of documents containing the term
              (a plain ratio, no logarithm; this is what yields the
              numbers shown below)

        Either factor is 0 when its denominator is 0. The result has one
        row per document, with columns in `vocab` order.

        INPUT:

        corpus = [('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]),
        ('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]),
        ('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]

        vocab = ['a', 'bar', 'black', 'foo', 'is', 'sentence',
        'sheep', 'this']

        OUTPUT:

        [[0.300, 0.300, 0.0, 0.300, 0.300, 0.0, 0.0, 0.300],
        [0.0, 0.600, 0.600, 0.300, 0.0, 0.0, 0.600, 0.0],
        [0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]

        """
        # One {term: count} mapping per document, aligned through vocab.
        counts = [dict(zip(vocab, entry[1])) for entry in corpus]
        num_docs = float(len(counts))

        # idf depends only on the term, so compute it once per term
        # instead of once per (document, term) pair.
        idf = {}
        for term in vocab:
            docfreq = sum(1 for row in counts if row[term] > 0)
            idf[term] = num_docs / docfreq if docfreq else 0

        weighted = []
        for row in counts:
            total = float(sum(row.values()))
            weighted.append(
                [(row[term] / total if total else 0) * idf[term] for term in vocab]
            )
        return weighted
    

    Here's the long answer with the tests:

    import numpy as np
    from math import sqrt, log
    from itertools import chain, product
    from collections import defaultdict
    
    def cosine_sim(u,v):
        """
        Cosine similarity of two equal-length numeric vectors.

        Computes dot(u, v) / (|u| * |v|).

        The cosine is undefined when either vector has zero magnitude;
        0.0 is returned in that case instead of dividing by zero.
        """
        magnitude = sqrt(np.dot(u, u)) * sqrt(np.dot(v, v))
        if magnitude == 0:
            # An all-zero (or empty) vector shares no direction with anything.
            return 0.0
        return np.dot(u, v) / magnitude
    
    def ngrams(sentence, n):
        """
        Return the word n-grams of a sentence as a list of tuples.

        The sentence is split on whitespace and every run of `n`
        consecutive words becomes one tuple. If the sentence has fewer
        than `n` words (or `n` is 0) the result is an empty list.

        The result is always a concrete list, so it prints and indexes
        the same way on Python 2 and Python 3.
        """
        # Split once; each shifted slice below reuses the same word list.
        words = sentence.split()
        return list(zip(*[words[i:] for i in range(n)]))
    
    def tfidf(corpus, vocab):
        """
        Weight every document's term counts by tf * idf.

        tf  = count of the term in the document / total count in the document
        idf = number of documents / number of documents containing the term
              (a plain ratio, no logarithm; this is what yields the
              numbers shown below)

        Either factor is 0 when its denominator is 0. The result has one
        row per document, with columns in `vocab` order.

        INPUT:

        corpus = [('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]),
        ('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]),
        ('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]

        vocab = ['a', 'bar', 'black', 'foo', 'is', 'sentence',
        'sheep', 'this']

        OUTPUT:

        [[0.300, 0.300, 0.0, 0.300, 0.300, 0.0, 0.0, 0.300],
        [0.0, 0.600, 0.600, 0.300, 0.0, 0.0, 0.600, 0.0],
        [0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]

        """
        # One {term: count} mapping per document, aligned through vocab.
        counts = [dict(zip(vocab, entry[1])) for entry in corpus]
        num_docs = float(len(counts))

        # idf depends only on the term, so compute it once per term
        # instead of once per (document, term) pair.
        idf = {}
        for term in vocab:
            docfreq = sum(1 for row in counts if row[term] > 0)
            idf[term] = num_docs / docfreq if docfreq else 0

        weighted = []
        for row in counts:
            total = float(sum(row.values()))
            weighted.append(
                [(row[term] / total if total else 0) * idf[term] for term in vocab]
            )
        return weighted
    
    
    def corpus2vectors(corpus):
        """
        Turn a list of sentences into bag-of-words count vectors.

        The vocabulary is the sorted set of lowercased, whitespace-split
        tokens across all sentences. Each sentence is lowercased the same
        way before counting, so a mixed-case word is counted against its
        (lowercase) vocabulary entry rather than silently scoring 0.

        Returns (vectorized_corpus, vocab), where vectorized_corpus is a
        list of (original_sentence, counts) tuples and counts[i] is the
        number of occurrences of vocab[i] in that sentence.
        """
        def vectorize(sentence, vocab):
            # Tokenise once, using the same normalisation as the vocabulary.
            tokens = sentence.lower().split()
            return [tokens.count(term) for term in vocab]

        vocab = sorted(set(chain(*[i.lower().split() for i in corpus])))
        vectorized_corpus = [(i, vectorize(i, vocab)) for i in corpus]
        return vectorized_corpus, vocab
    
    def create_test_corpus():
        """
        Build the three-sentence sample corpus used by the tests.

        Returns whatever corpus2vectors returns for those sentences:
        the vectorized corpus and its vocabulary.
        """
        sample_sentences = [
            "this is a foo bar",
            "foo bar bar black sheep",
            "this is a sentence",
        ]
        return corpus2vectors(sample_sentences)
    
    def test_cosine():
        """
        Print the cosine similarity of every ordered pair of sample sentences.

        Each pair is printed as the two sentences, a "cosine = <value>"
        line, and a blank separator line.
        """
        corpus, vocab = create_test_corpus()

        for sentx, senty in product(corpus, corpus):
            # Single-argument print(...) calls produce the same output under
            # the Python 2 print statement and the Python 3 print function.
            print(sentx[0])
            print(senty[0])
            print("cosine = %s" % cosine_sim(sentx[1], senty[1]))
            print("")
    
    def test_ngrams():
        """
        Print the bigrams and trigrams of every sample sentence.

        Each sentence is printed, followed by its list of 2-grams, its
        list of 3-grams, and a blank separator line.
        """
        corpus, vocab = create_test_corpus()
        for sentx in corpus:
            # Single-argument print(...) calls produce the same output under
            # the Python 2 print statement and the Python 3 print function.
            print(sentx[0])
            # list(...) makes the printed value a list even when ngrams
            # hands back a lazy iterator.
            print(list(ngrams(sentx[0], 2)))
            print(list(ngrams(sentx[0], 3)))
            print("")
    
    def test_tfidf():
        """Print the sample corpus, its vocabulary, and its tf-idf matrix."""
        corpus, vocab = create_test_corpus()
        # Single-argument print(...) calls produce the same output under
        # the Python 2 print statement and the Python 3 print function.
        print(corpus)
        print(vocab)
        print(tfidf(corpus, vocab))
    
    # Run the three demos in order, with a blank line after each one.
    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Testing cosine...")
    test_cosine()
    print("")
    print("Testing ngrams...")
    test_ngrams()
    print("")
    print("Testing tfidf...")
    test_tfidf()
    print("")
    

    [out]:

    Testing cosine...
    this is a foo bar
    this is a foo bar
    cosine = 1.0
    
    this is a foo bar
    foo bar bar black sheep
    cosine = 0.507092552837
    
    this is a foo bar
    this is a sentence
    cosine = 0.67082039325
    
    foo bar bar black sheep
    this is a foo bar
    cosine = 0.507092552837
    
    foo bar bar black sheep
    foo bar bar black sheep
    cosine = 1.0
    
    foo bar bar black sheep
    this is a sentence
    cosine = 0.0
    
    this is a sentence
    this is a foo bar
    cosine = 0.67082039325
    
    this is a sentence
    foo bar bar black sheep
    cosine = 0.0
    
    this is a sentence
    this is a sentence
    cosine = 1.0
    
    
    Testing ngrams...
    this is a foo bar
    [('this', 'is'), ('is', 'a'), ('a', 'foo'), ('foo', 'bar')]
    [('this', 'is', 'a'), ('is', 'a', 'foo'), ('a', 'foo', 'bar')]
    
    foo bar bar black sheep
    [('foo', 'bar'), ('bar', 'bar'), ('bar', 'black'), ('black', 'sheep')]
    [('foo', 'bar', 'bar'), ('bar', 'bar', 'black'), ('bar', 'black', 'sheep')]
    
    this is a sentence
    [('this', 'is'), ('is', 'a'), ('a', 'sentence')]
    [('this', 'is', 'a'), ('is', 'a', 'sentence')]
    
    
    Testing tfidf...
    [('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]), ('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]), ('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]
    ['a', 'bar', 'black', 'foo', 'is', 'sentence', 'sheep', 'this']
    [[0.30000000000000004, 0.30000000000000004, 0.0, 0.30000000000000004, 0.30000000000000004, 0.0, 0.0, 0.30000000000000004], [0.0, 0.6000000000000001, 0.6000000000000001, 0.30000000000000004, 0.0, 0.0, 0.6000000000000001, 0.0], [0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]
    

提交回复
热议问题