I am looking for a module in sklearn that lets you derive the word-word co-occurrence matrix.
I can get the document-term matrix, but I am not sure how to go about obtaining the word-word co-occurrence matrix.
This can be done with numpy, where the corpus is a list of lists (each inner list being one tokenized document):
# Example corpus: one inner list of tokens per document.
corpus = [
    ["", "All", "that", "glitters", "isn't", "gold", ""],
    ["", "All's", "well", "that", "ends", "well", ""],
]
The function below returns the co-occurrence matrix together with a word -> row/col index mapping:
def compute_co_occurrence_matrix(corpus, window_size):
    """Build a word-word co-occurrence matrix with a symmetric context window.

    For every token in every document, each token located at most
    ``window_size`` positions to its left or right (the token itself
    excluded) is counted once as a co-occurrence with it.

    Args:
        corpus: Iterable of tokenized documents, each a list of hashable,
            sortable tokens (e.g. strings).
        window_size: Number of neighbouring tokens considered on each side
            of the focus token. Expected to be a non-negative integer.

    Returns:
        A tuple ``(M, word2ind)``:

        * ``M`` -- square numpy array of shape ``(num_words, num_words)``
          where ``M[i, j]`` is the number of times word ``i`` appeared in
          the window around word ``j``.
        * ``word2ind`` -- dict mapping each distinct word to its row/column
          index in ``M``; indices follow the sorted order of the words.
    """
    # Sorted vocabulary gives a deterministic word -> index assignment.
    vocabulary = sorted({word for doc in corpus for word in doc})
    num_words = len(vocabulary)
    word2ind = {word: index for index, word in enumerate(vocabulary)}
    M = np.zeros((num_words, num_words))

    for doc in corpus:
        doc_len = len(doc)
        for position, focus_word in enumerate(doc):
            focus_idx = word2ind[focus_word]
            # Clamp the window to the document boundaries.
            left = max(position - window_size, 0)
            right = min(position + window_size + 1, doc_len)
            # Neighbours on both sides, skipping the focus token itself.
            context = doc[left:position] + doc[position + 1:right]
            for word in context:
                M[word2ind[word], focus_idx] += 1
    return M, word2ind