I am looking for a module in sklearn that lets you derive the word-word co-occurrence matrix.
I can get the document-term matrix but not sure how to go about obtain
I used the below code for creating co-occurrance matrix with window size:
#https://stackoverflow.com/questions/4843158/check-if-a-python-list-item-contains-a-string-inside-another-string
import pandas as pd
def co_occurance_matrix(input_text,top_words,window_size):
co_occur = pd.DataFrame(index=top_words, columns=top_words)
for row,nrow in zip(top_words,range(len(top_words))):
for colm,ncolm in zip(top_words,range(len(top_words))):
count = 0
if row == colm:
co_occur.iloc[nrow,ncolm] = count
else:
for single_essay in input_text:
essay_split = single_essay.split(" ")
max_len = len(essay_split)
top_word_index = [index for index, split in enumerate(essay_split) if row in split]
for index in top_word_index:
if index == 0:
count = count + essay_split[:window_size + 1].count(colm)
elif index == (max_len -1):
count = count + essay_split[-(window_size + 1):].count(colm)
else:
count = count + essay_split[index + 1 : (index + window_size + 1)].count(colm)
if index < window_size:
count = count + essay_split[: index].count(colm)
else:
count = count + essay_split[(index - window_size): index].count(colm)
co_occur.iloc[nrow,ncolm] = count
return co_occur
then i used the below code to perform test:
corpus = ['ABC DEF IJK PQR','PQR KLM OPQ','LMN PQR XYZ ABC DEF PQR ABC']
words = ['ABC','PQR','DEF']
window_size =2
result = co_occurance_matrix(corpus,words,window_size)
result
Output is here: