Constructing a co-occurrence matrix in python pandas

匿名 (未验证) 提交于 2019-12-03 02:44:02

问题:

I know how to do this in R. But, is there any function in pandas that transforms a dataframe to an nxn co-occurrence matrix containing the counts of two aspects co-occurring.

For example a matrix df:

import pandas as pd  df = pd.DataFrame({'TFD' : ['AA', 'SL', 'BB', 'D0', 'Dk', 'FF'],                     'Snack' : ['1', '0', '1', '1', '0', '0'],                     'Trans' : ['1', '1', '1', '0', '0', '1'],                     'Dop' : ['1', '0', '1', '0', '1', '1']}).set_index('TFD')  print df  >>>      Dop Snack Trans TFD                 AA    1     1     1 SL    0     0     1 BB    1     1     1 D0    0     1     0 Dk    1     0     0 FF    1     0     1  [6 rows x 3 columns] 

would yield:

    Dop Snack Trans  Dop   0     2     3 Snack 2     0     2 Trans 3     2     0 

Since the matrix is mirrored on the diagonal I guess there would be a way to optimize code.

回答1:

It's a simple linear algebra, you multiply matrix with its transpose (your example contains strings, don't forget to convert them to integer):

>>> df_asint = df.astype(int) >>> coocc = df_asint.T.dot(df_asint) >>> coocc        Dop  Snack  Trans Dop      4      2      3 Snack    2      3      2 Trans    3      2      4 

if, as in R answer, you want to reset diagonal, you can use numpy's fill_diagonal:

>>> import numpy as np >>> np.fill_diagonal(coocc.values, 0) >>> coocc        Dop  Snack  Trans Dop      0      2      3 Snack    2      0      2 Trans    3      2      0 


回答2:

Demo in NumPy:

import numpy as np np.random.seed(3) # for reproducibility  # Generate data: 5 labels, 10 examples, binary. label_headers = 'Alice Bob Carol Dave Eve'.split(' ') label_data = np.random.randint(0,2,(10,5)) # binary here but could be any integer. print('labels:\n{0}'.format(label_data))  # Compute cooccurrence matrix  cooccurrence_matrix = np.dot(label_data.transpose(),label_data) print('\ncooccurrence_matrix:\n{0}'.format(cooccurrence_matrix))   # Compute cooccurrence matrix in percentage # FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element #      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804 cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix) with np.errstate(divide='ignore', invalid='ignore'):     cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None])) print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage)) 

Output:

labels: [[0 0 1 1 0]  [0 0 1 1 1]  [0 1 1 1 0]  [1 1 0 0 0]  [0 1 1 0 0]  [0 1 0 0 0]  [0 1 0 1 1]  [0 1 0 0 1]  [1 0 0 1 0]  [1 0 1 1 1]]  cooccurrence_matrix: [[3 1 1 2 1]  [1 6 2 2 2]  [1 2 5 4 2]  [2 2 4 6 3]  [1 2 2 3 4]]  cooccurrence_matrix_percentage: [[ 1.          0.33333333  0.33333333  0.66666667  0.33333333]  [ 0.16666667  1.          0.33333333  0.33333333  0.33333333]  [ 0.2         0.4         1.          0.8         0.4       ]  [ 0.33333333  0.33333333  0.66666667  1.          0.5       ]  [ 0.25        0.5         0.5         0.75        1.        ]] 

With a heatmap using matplotlib:

import numpy as np np.random.seed(3) # for reproducibility  import matplotlib.pyplot as plt   def show_values(pc, fmt="%.2f", **kw):     '''     Heatmap with text in each cell with matplotlib's pyplot     Source: http://stackoverflow.com/a/25074150/395857      By HYRY     '''     from itertools import izip     pc.update_scalarmappable()     ax = pc.get_axes()     for p, color, value in izip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):         x, y = p.vertices[:-2, :].mean(0)         if np.all(color[:3] > 0.5):             color = (0.0, 0.0, 0.0)         else:             color = (1.0, 1.0, 1.0)         ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)  def cm2inch(*tupl):     '''     Specify figure size in centimeter in matplotlib     Source: http://stackoverflow.com/a/22787457/395857     By gns-ank     '''     inch = 2.54     if type(tupl[0]) == tuple:         return tuple(i/inch for i in tupl[0])     else:         return tuple(i/inch for i in tupl)  def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels):     '''     Inspired by:     - http://stackoverflow.com/a/16124677/395857      - http://stackoverflow.com/a/25074150/395857     '''      # Plot it out     fig, ax = plt.subplots()         c = ax.pcolor(AUC, edgecolors='k', linestyle= 'dashed', linewidths=0.2, cmap='RdBu', vmin=0.0, vmax=1.0)      # put the major ticks at the middle of each cell     ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)     ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)      # set tick labels     #ax.set_xticklabels(np.arange(1,AUC.shape[1]+1), minor=False)     ax.set_xticklabels(xticklabels, minor=False)     ax.set_yticklabels(yticklabels, minor=False)      # set title and x/y labels     plt.title(title)     plt.xlabel(xlabel)     plt.ylabel(ylabel)            # Remove last blank column     plt.xlim( (0, AUC.shape[1]) )      # Turn off all the ticks     ax = plt.gca()         for t in ax.xaxis.get_major_ticks():         t.tick1On = False         t.tick2On = False     for t in ax.yaxis.get_major_ticks():         t.tick1On = False         t.tick2On = False      # Add color bar     plt.colorbar(c)      # Add text in each cell      show_values(c)      # Proper orientation (origin at the top left instead of bottom left)     ax.invert_yaxis()     ax.xaxis.tick_top()      # resize      fig = plt.gcf()     fig.set_size_inches(cm2inch(40, 20))    def main():      # Generate data: 5 labels, 10 examples, binary.     label_headers = 'Alice Bob Carol Dave Eve'.split(' ')     label_data = np.random.randint(0,2,(10,5)) # binary here but could be any integer.     print('labels:\n{0}'.format(label_data))      # Compute cooccurrence matrix      cooccurrence_matrix = np.dot(label_data.transpose(),label_data)     print('\ncooccurrence_matrix:\n{0}'.format(cooccurrence_matrix))       # Compute cooccurrence matrix in percentage     # FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element     #      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804     cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)     with np.errstate(divide='ignore', invalid='ignore'):         cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))     print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage))      # Add count in labels     label_header_with_count = [ '{0} ({1})'.format(label_header, cooccurrence_matrix_diagonal[label_number]) for label_number, label_header in enumerate(label_headers)]       print('\nlabel_header_with_count: {0}'.format(label_header_with_count))      # Plotting     x_axis_size = cooccurrence_matrix_percentage.shape[0]     y_axis_size = cooccurrence_matrix_percentage.shape[1]     title = "Co-occurrence matrix\n"     xlabel= ''#"Labels"     ylabel= ''#"Labels"     xticklabels = label_header_with_count     yticklabels = label_header_with_count     heatmap(cooccurrence_matrix_percentage, title, xlabel, ylabel, xticklabels, yticklabels)     plt.savefig('image_output.png', dpi=300, format='png', bbox_inches='tight') # use format='svg' or 'pdf' for vectorial pictures     #plt.show()   if __name__ == "__main__":     main()     #cProfile.run('main()') # if you want to do some profiling 

(PS: a neat visualization of a co-occurrence matrix in D3.js.)



回答3:

In case that you have larger corpus and term-frequency matrix, using sparse matrix multiplication might be more efficient. I use the same trick of matrix multiplication refered to algo answer on this page.

import scipy.sparse as sp X = sp.csr_matrix(df.astype(int).values) # convert dataframe to sparse matrix Xc = X.T * X # multiply sparse matrix #  Xc.setdiag(0) # reset diagonal print(Xc.todense()) # to print co-occurence matrix in dense format 

Xc here will be the co-occurence matrix in sparse csr format



易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!