Is there a simple process-based parallel map for python?

前端 未结 4 1244
小蘑菇
小蘑菇 2020-11-29 18:50

I'm looking for a simple process-based parallel map for Python, that is, a function

parmap(function,[data])

that would run function on each element of [data] in a separate process.

4条回答
  •  無奈伤痛
    2020-11-29 19:28

    For those who are looking for a Python equivalent of R's mclapply(), here is my implementation. It improves on the following two examples:

    • "Parallelize Pandas map() or apply()", as mentioned by @Rafael Valero.
    • How to apply map to functions with multiple arguments.

    It can be applied to map functions that take a single argument or multiple arguments.

    import numpy as np, pandas as pd
    from scipy import sparse
    import functools, multiprocessing
    from multiprocessing import Pool
    
    # Size of the worker pool created by parallelize_dataframe: one process
    # per CPU reported by the operating system.
    num_cores = multiprocessing.cpu_count()
    
    def parallelize_dataframe(df, func, U=None, V=None, num_partitions=5,
                              num_workers=None):
        """Apply ``func`` to row-wise chunks of ``df`` in parallel.

        The frame is cut into ``num_partitions`` consecutive row chunks, each
        chunk is handed to a worker process, and the returned frames are
        concatenated in their original order.

        Args:
            df: pandas DataFrame to process.
            func: callable taking a DataFrame chunk (plus ``U`` and ``V``
                keyword arguments when both are supplied) and returning a
                DataFrame. It is sent to worker processes by ``Pool.map``,
                so it must be picklable (e.g. a module-level function).
            U, V: extra arguments forwarded to ``func`` as ``U=`` / ``V=``.
                They are forwarded only when BOTH are not None; if just one
                is given, both are ignored and ``func`` receives the chunk
                alone (behaviour kept from the original implementation).
            num_partitions: number of chunks to cut ``df`` into (default 5).
            num_workers: size of the process pool; defaults to the
                module-level ``num_cores``.

        Returns:
            The concatenation (``pd.concat``) of the per-chunk results.

        Raises:
            ValueError: if ``num_partitions`` is smaller than 1.
        """
        if num_partitions < 1:
            raise ValueError("num_partitions must be a positive integer")
        if num_workers is None:
            num_workers = num_cores

        # Chunk boundaries with the same sizes np.array_split would produce:
        # the first (len % n) chunks receive one extra row. Slicing with
        # .iloc avoids calling np.array_split on a DataFrame, which goes
        # through DataFrame.swapaxes (deprecated in recent pandas releases).
        base, extra = divmod(len(df), num_partitions)
        bounds = [0]
        for k in range(num_partitions):
            bounds.append(bounds[-1] + base + (1 if k < extra else 0))
        blocks = [df.iloc[lo:hi] for lo, hi in zip(bounds, bounds[1:])]

        if V is not None and U is not None:
            # func involves extra arguments (e.g. several columns plus U, V)
            worker = functools.partial(func, U=U, V=V)
        else:
            # func only needs the DataFrame chunk itself
            worker = func

        pool = Pool(num_workers)
        try:
            results = pool.map(worker, blocks)
        finally:
            # Always release the worker processes, even when func raised.
            pool.close()
            pool.join()

        return pd.concat(results)
    
    def square(x):
        """Return ``x`` raised to the second power."""
        return pow(x, 2)
    
    def test_func(data):
        """Add a ``squareV`` column holding the square of each ``testV`` value.

        Prints the shape of the chunk being processed, writes the new column
        into ``data`` itself and returns that same object.
        """
        print("Process working on: ", data.shape)
        squared = data["testV"].apply(square)
        data["squareV"] = squared
        return data
    
    def vecProd(row, U, V):
        """Return the sum of the element-wise product of one row of U and one of V.

        The row of ``U`` is selected by ``row["obsI"]`` and the row of ``V``
        by ``row["obsJ"]``; both indices are converted with ``int`` first.
        """
        i = int(row["obsI"])
        j = int(row["obsJ"])
        products = np.multiply(U[i, :], V[j, :])
        return np.sum(products)
    
    def mProd_func(data, U, V):
        """Fill the ``predV`` column by applying ``vecProd`` to every row.

        The column is written into ``data`` itself, which is then returned.
        """
        def _predict(row):
            # Bind the shared U and V so DataFrame.apply only supplies the row.
            return vecProd(row, U, V)

        data["predV"] = data.apply(_predict, axis=1)
        return data
    
    def generate_simulated_data(N=302, D=184, nnz=5000, K=5, seed=None):
        """Build a random sparse matrix and two random dense factor matrices.

        Args:
            N: number of rows of the sparse matrix (and of ``U``).
            D: number of columns of the sparse matrix (and rows of ``V``).
            nnz: number of (row, column, value) triplets drawn. Row and
                column indices are sampled with replacement, so the same
                position can be drawn more than once; scipy's triplet
                constructor sums such duplicates, and the matrix may end up
                with fewer than ``nnz`` stored entries.
            K: number of columns of ``U`` and ``V``.
            seed: optional seed. When given, a private
                ``np.random.RandomState(seed)`` makes the output
                reproducible; when None, values come from the global
                ``np.random`` state, as in the original implementation.

        Returns:
            Tuple ``(sparseY, U, V)``: a CSC matrix of shape ``(N, D)`` and
            dense arrays of shape ``(N, K)`` and ``(D, K)``.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)

        I = rng.choice(N, size=nnz, replace=True)
        J = rng.choice(D, size=nnz, replace=True)
        vals = rng.random_sample(nnz)

        sparseY = sparse.csc_matrix((vals, (I, J)), shape=(N, D))

        # Generate parameters U and V which could be used to reconstruct the matrix Y
        U = rng.random_sample(N * K).reshape([N, K])
        V = rng.random_sample(D * K).reshape([D, K])

        return sparseY, U, V
    
    def main():
        """Run the demo: simulate data, then fill two columns in parallel.

        Builds a DataFrame with one row per non-zero entry of the simulated
        sparse matrix, computes ``squareV`` with ``test_func`` and ``predV``
        with ``mProd_func`` (both through ``parallelize_dataframe``), and
        prints shapes and the first rows along the way.
        """
        Y, U, V = generate_simulated_data()

        # Row indices, column indices and observed values of the non-zero
        # entries of the sparse matrix Y.
        testI, testJ, testV = sparse.find(Y)
        n_obs = len(testV)

        dtypes = {"obsI":int, "obsJ":int, "testV":float, "predV":float, "squareV": float}
        colNames = list(dtypes)

        # Start from an all-zero frame, then overwrite the observed columns.
        obsValDF = pd.DataFrame(np.zeros((n_obs, len(colNames))), columns=colNames)
        for column, values in (("obsI", testI), ("obsJ", testJ), ("testV", testV)):
            obsValDF[column] = values
        obsValDF = obsValDF.astype(dtype=dtypes)

        print("Y.shape: {!s}, #obsVals: {}, obsValDF.shape: {!s}".format(Y.shape, n_obs, obsValDF.shape))

        # Square of the observed values (single-argument worker).
        obsValDF = parallelize_dataframe(obsValDF, test_func)

        # Predictions rebuilt from the parameters U and V (multi-argument worker).
        obsValDF = parallelize_dataframe(obsValDF, mProd_func, U, V)

        print("obsValDF.shape after reconstruction: {!s}".format(obsValDF.shape))
        print("First 5 elements of obsValDF:\n", obsValDF.head(5))
    
    # Run the demo only when this file is executed as a script, not when it
    # is imported. NOTE(review): the multiprocessing documentation asks for
    # this guard around pool-creating code so that worker processes can
    # import the module safely -- keep it in place.
    if __name__ == '__main__':
        main()
    

提交回复
热议问题