I know how to do this in R, but is there a function in pandas that transforms a DataFrame into an n×n co-occurrence matrix containing the counts of two aspects co-occurring?
To elaborate further on this question: if you want to construct a co-occurrence matrix from sentences, you can do this:
import numpy as np
import pandas as pd
def create_cooccurrence_matrix(sentences, window_size=2, tokenizer=None):
    """Build a word co-occurrence matrix from a list of sentences.

    Two tokens co-occur when they appear in the same sentence at most
    ``window_size`` positions apart. Every such ordered pair adds 1 to the
    corresponding matrix cell, so the resulting matrix is symmetric.

    Args:
        sentences: Iterable of sentence strings.
        window_size: Maximum distance (in tokens) between two co-occurring
            tokens.
        tokenizer: Optional callable mapping a sentence string to a sequence
            of tokens. Defaults to ``nltk.tokenize.word_tokenize``, which
            needs NLTK and its tokenizer data to be installed.

    Returns:
        A ``(vocabulary, cooccurrence_matrix_sparse)`` tuple:

        - vocabulary: dict mapping each token to its row/column index, in
          order of first appearance.
        - cooccurrence_matrix_sparse: ``scipy.sparse.coo_matrix`` of shape
          ``(len(vocabulary), len(vocabulary))`` holding float counts.
          Duplicate entries are summed on conversion (e.g. ``todense()``).

    Example:
        sentences = ['I love nlp', 'I love to learn',
                     'nlp is future', 'nlp is cool']
        vocabs, co_occ = create_cooccurrence_matrix(sentences)
        df_co_occ = pd.DataFrame(co_occ.todense(),
                                 index=list(vocabs),
                                 columns=list(vocabs))
        df_co_occ = df_co_occ.sort_index()[sorted(vocabs)]
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.sparse`` is loaded on every SciPy version.
    from scipy.sparse import coo_matrix

    if tokenizer is None:
        # Imported lazily so NLTK is only needed for the default tokenizer.
        import nltk
        tokenizer = nltk.tokenize.word_tokenize

    vocabulary = {}
    row = []
    col = []
    for sentence in sentences:
        tokens = [token for token in tokenizer(sentence.strip()) if token != ""]
        # Assign indices in order of first appearance.
        token_ids = [
            vocabulary.setdefault(token, len(vocabulary)) for token in tokens
        ]
        for pos, i in enumerate(token_ids):
            start = max(0, pos - window_size)
            end = min(len(token_ids), pos + window_size + 1)
            for pos2 in range(start, end):
                if pos2 == pos:
                    continue
                row.append(i)
                col.append(token_ids[pos2])

    data = [1.0] * len(row)
    # Pass the shape explicitly: otherwise it is inferred from the largest
    # index present, which drops trailing words that have no neighbour
    # (one-token sentences) and fails outright on empty input.
    size = len(vocabulary)
    cooccurrence_matrix_sparse = coo_matrix((data, (row, col)), shape=(size, size))
    return vocabulary, cooccurrence_matrix_sparse
# Demo: build the co-occurrence table for a few toy sentences and show it
# as a DataFrame with alphabetically ordered rows and columns.
sentences = [
    'I love nlp',
    'I love to learn',
    'nlp is future',
    'nlp is cool',
]
vocabs, co_occ = create_cooccurrence_matrix(sentences)

# Row and column labels share the vocabulary's insertion order, which is
# the index order used inside the matrix.
words = list(vocabs.keys())
df_co_occ = pd.DataFrame(co_occ.todense(), index=words, columns=words)

# Sort the rows by label, then pick the columns in the same sorted order.
alphabetical = sorted(words)
df_co_occ = df_co_occ.sort_index()[alphabetical]

# Render non-zero counts in red (the styled table is displayed by Jupyter).
df_co_occ.style.applymap(
    lambda x: 'color: red' if x > 0 else ''
)
# Outside a Jupyter notebook, use print(df_co_occ) instead.