reordering cluster numbers for correct correspondence

后端 未结 2 800
借酒劲吻你
借酒劲吻你 2020-12-21 10:27

I have a dataset that I clustered using two different clustering algorithms. The results are about the same, but the cluster numbers are permuted. Now for displaying the col

2条回答
  •  無奈伤痛
    2020-12-21 11:09

    I have a function that works for me. But it may fail when the two cluster results are very inconsistent, which leads to duplicated max values in the contingency matrix. If your cluster results are about the same, it should work.

    Here is my code:

    from sklearn.metrics.cluster import contingency_matrix
    
    def align_cluster_index(ref_cluster, map_cluster):
    """
    remap cluster index according the the ref_cluster.
    both inputs must be nparray and have same number of unique cluster index values.
    
    Xin Niu Jan-15-2020
    """
    
    ref_values = np.unique(ref_cluster)
    map_values = np.unique(map_cluster)
    
    print(ref_values)
    print(map_values)
    
    num_values = ref_values.shape[0]
    
    if ref_values.shape[0]!=map_values.shape[0]:
        print('error: both inputs must have same number of unique cluster index values.')
        return()
    
    switched_col = set()
    while True:
        cont_mat = contingency_matrix(ref_cluster, map_cluster)
        print(cont_mat)
        # divide contingency_matrix by its row and col sums to avoid potential duplicated values:
        col_sum = np.matmul(np.ones((num_values, 1)), np.sum(cont_mat, axis = 0).reshape(1, num_values))
        row_sum = np.matmul(np.sum(cont_mat, axis = 1).reshape(num_values, 1), np.ones((1, num_values)))
        print(col_sum)
        print(row_sum)
    
        cont_mat = cont_mat/(col_sum+row_sum)
        print(cont_mat)
    
        # ignore columns that have been switched:
        cont_mat[:, list(switched_col)]=-1
    
        print(cont_mat)
    
        sort_0 = np.argsort(cont_mat, axis = 0)
        sort_1 = np.argsort(cont_mat, axis = 1)
    
        print('argsort contmat:')
        print(sort_0)
        print(sort_1)
    
        if np.array_equal(sort_1[:,-1], np.array(range(num_values))):
            break
    
        # switch values according to the max value in the contingency matrix:
        # get the position of max value:
        idx_max = np.unravel_index(np.argmax(cont_mat, axis=None), cont_mat.shape)
        print(cont_mat)
        print(idx_max)
    
        if (cont_mat[idx_max]>0) and (idx_max[0] not in switched_col):
            cluster_tmp = map_cluster.copy()
            print('switch', map_values[idx_max[1]], 'and:', ref_values[idx_max[0]])
            map_cluster[cluster_tmp==map_values[idx_max[1]]]=ref_values[idx_max[0]]
            map_cluster[cluster_tmp==map_values[idx_max[0]]]=ref_values[idx_max[1]]
    
            switched_col.add(idx_max[0])
            print(switched_col)
    
        else:
            break
    
    print('final argsort contmat:')
    print(sort_0)
    print(sort_1)
    
    print('final cont_mat:')
    cont_mat = contingency_matrix(ref_cluster, map_cluster)
    col_sum = np.matmul(np.ones((num_values, 1)), np.sum(cont_mat, axis = 0).reshape(1, num_values))
    row_sum = np.matmul(np.sum(cont_mat, axis = 1).reshape(num_values, 1), np.ones((1, num_values)))
    cont_mat = cont_mat/(col_sum+row_sum)
    
    print(cont_mat)
    
    return(map_cluster)
    

    And here is some test code:

    ref_cluster = np.array([2,2,3,1,0,0,0,1,2,1,2,2,0,3,3,3,3])
    map_cluster = np.array([0,0,0,1,1,3,2,3,2,2,0,0,0,2,0,3,3])
    
    c = align_cluster_index(ref_cluster, map_cluster)
    print(ref_cluster)
    print(c)
    
    >>>[2 2 3 1 0 0 0 1 2 1 2 2 0 3 3 3 3]
    >>>[2 2 2 1 1 3 0 3 0 0 2 2 2 0 2 3 3]
    

提交回复
热议问题