Vectorized groupby with NumPy

前端 未结 4 2221
自闭症患者
自闭症患者 2020-12-31 08:11

Pandas has a widely-used groupby facility to split up a DataFrame based on a corresponding mapping, from which you can apply a calculation on each subgroup and recombine the results.

4条回答
  •  北荒
    北荒 (楼主)
    2020-12-31 08:52

    How about using scipy sparse matrix

    import numpy as np
    from scipy import sparse
    import time
    
    x_len = 500000
    g_len = 100
    
    X = np.arange(x_len * 2).reshape(x_len, 2)
    groups = np.random.randint(0, g_len, x_len)
    
    # original
    s = time.time()
    
    a = np.array([X[groups==i].sum() for i in np.unique(groups)])
    
    print(time.time() - s)
    
    # using scipy sparse matrix
    s = time.time()
    
    x_sum = X.sum(axis=1)
    b = np.array(sparse.coo_matrix(
        (
            x_sum,
            (groups, np.arange(len(x_sum)))
        ),
        shape=(g_len, x_len)
    ).sum(axis=1)).ravel()
    
    print(time.time() - s)
    
    #compare
    print(np.abs((a-b)).sum())
    

    result on my PC

    0.15915322303771973
    0.012875080108642578
    0
    

    More than 10 times faster.


    Update!

Let's benchmark the answers of @Paul Panzer and @Daniel F. It is a summation-only benchmark.

    import numpy as np
    from scipy import sparse
    import time
    
    # by @Daniel F
def groupby_np(X, groups, axis = 0, uf = np.add, out = None, minlength = 0, identity = None):
    """Group-wise reduction of X along `axis` with the ufunc `uf`.

    Entries along `axis` are first collapsed over all *other* axes with
    ``uf.reduce``, then scatter-accumulated per group id with ``uf.at``.

    Parameters
    ----------
    X : ndarray
        Data to reduce.
    groups : ndarray of int
        Group id for each entry along `axis`
        (presumably the same length as X.shape[axis] -- not checked here).
    axis : int
        Axis of X that `groups` indexes (default 0).
    uf : numpy ufunc
        Reduction ufunc; default np.add gives per-group sums.
    out : ndarray or None
        Optional preallocated output. When supplied, results are
        accumulated into it in place and the function returns None.
    minlength : int
        Minimum number of output slots (analogous to np.bincount's
        `minlength`).
    identity : scalar or None
        Seed value for each output slot; defaults to ``uf.identity``.

    Returns
    -------
    ndarray of length max(minlength, groups.max() + 1), only when `out`
    was not supplied; otherwise None.
    """
    # Output needs one slot per group id, and at least `minlength` slots.
    if minlength < groups.max() + 1:
        minlength = groups.max() + 1
    if identity is None:
        identity = uf.identity
    # `i` becomes the tuple of all axes except `axis`; uf.reduce(X, i)
    # below collapses those axes, leaving one value per group entry.
    i = list(range(X.ndim))
    del i[axis]
    i = tuple(i)
    n = out is None
    if n:
        if identity is None:  # fallback to loops over 0-index for identity
            # The ufunc has no identity element (e.g. np.maximum), so the
            # output cannot be pre-filled; instead each slot is seeded from
            # the group's 0-indexed slice. Every group id in range must
            # then actually occur in `groups`, or its slot has no seed.
            assert np.all(np.in1d(np.arange(minlength), groups)), "No valid identity for unassinged groups"
            s = [slice(None)] * X.ndim
            for i_ in i:
                s[i_] = 0
            # NOTE(review): uf.at below re-applies the reduction on top of
            # this seed; that appears intended to be harmless for ufuncs
            # like maximum (seed <= full reduction) -- confirm before using
            # with non-idempotent identity-less ufuncs.
            out = np.array([uf.reduce(X[tuple(s)][groups == i]) for i in range(minlength)])
        else:
            out = np.full((minlength,), identity, dtype = X.dtype)
    # Collapse the non-group axes, then scatter-accumulate per group id.
    uf.at(out, groups, uf.reduce(X, i))
    # Only return when we allocated the output ourselves; with a
    # caller-supplied `out` the accumulation above is the whole effect.
    if n:
        return out
    
    x_len = 500000
    g_len = 200
    
    X = np.arange(x_len * 2).reshape(x_len, 2)
    groups = np.random.randint(0, g_len, x_len)
    
    print("original")
    s = time.time()
    
    a = np.array([X[groups==i].sum() for i in np.unique(groups)])
    
    print(time.time() - s)
    
    print("use scipy coo matrix")
    s = time.time()
    
    x_sum = X.sum(axis=1)
    b = np.array(sparse.coo_matrix(
        (
            x_sum,
            (groups, np.arange(len(x_sum)))
        ),
        shape=(g_len, x_len)
    ).sum(axis=1)).ravel()
    
    print(time.time() - s)
    
    #compare
    print(np.abs((a-b)).sum())
    
    
    print("use scipy csr matrix @Daniel F")
    s = time.time()
    x_sum = X.sum(axis=1)
    c = np.array(sparse.csr_matrix(
        (
            x_sum,
            groups,
            np.arange(len(groups)+1)
        ),
        shape=(len(groups), g_len)
    ).sum(axis=0)).ravel()
    
    print(time.time() - s)
    
    #compare
    print(np.abs((a-c)).sum())
    
    
    print("use bincount @Paul Panzer @Daniel F")
    s = time.time()
    d = np.bincount(groups, X.sum(axis=1), g_len)
    print(time.time() - s)
    
    #compare
    print(np.abs((a-d)).sum())
    
    print("use ufunc @Daniel F")
    s = time.time()
    e = groupby_np(X, groups)
    print(time.time() - s)
    
    #compare
    print(np.abs((a-e)).sum())
    

    STDOUT

    original
    0.2882847785949707
    use scipy coo matrix
    0.012301445007324219
    0
    use scipy csr matrix @Daniel F
    0.01046299934387207
    0
    use bincount @Paul Panzer @Daniel F
    0.007468223571777344
    0.0
    use ufunc @Daniel F
    0.04431319236755371
    0
    

    The winner is the bincount solution. But the csr matrix solution is also very interesting.

提交回复
热议问题