Parallelize this nested for loop in Python

狂风中的少年 提交于 2019-12-04 07:56:14
from concurrent.futures import ProcessPoolExecutor, Future, wait
from itertools import combinations
from functools import partial

# Square word_count x word_count grid of zeros; every row is a distinct list.
similarity_matrix = [[0 for _ in range(word_count)] for _ in range(word_count)]

def callback(i, j, future):
    """Store a finished future's result in both symmetric matrix cells.

    The result of ``future`` is written to ``similarity_matrix[i][j]`` and
    ``similarity_matrix[j][i]``. If the future failed, ``future.result()``
    re-raises its exception and nothing is written.
    """
    score = future.result()
    similarity_matrix[i][j] = score
    similarity_matrix[j][i] = score

# Submit one similarity job per unordered index pair (i, j) with i < j.
# Each job's done-callback stores the result in both symmetric cells.
with ProcessPoolExecutor(max_workers=4) as executor:
    fs = []
    for i, j in combinations(range(word_count), 2):
        future = executor.submit(
            calculate_similarity,
            t_matrix[i],
            t_matrix[j],
        )

        # partial() binds this pair's indices; the future itself is passed
        # to the callback as the remaining argument once the job finishes.
        future.add_done_callback(partial(callback, i, j))
        fs.append(future)

    # Block until every submitted job has finished.
    wait(fs)
Blckknght

Here's an alternative implementation of the same general algorithm as in Matt's answer, just using multiprocessing.Pool instead of concurrent.futures.ProcessPoolExecutor. It may be more efficient than his code, since the values of the input (t_matrix) are only serialized once and passed to the initializer function in each worker process.

import multiprocessing
import itertools

def worker_init(matrix):
    """Store ``matrix`` in the module-level global ``worker_matrix``.

    Passed as the ``initializer`` of the ``multiprocessing.Pool`` created in
    ``main``, which supplies ``matrix`` through ``initargs``; ``worker`` then
    reads rows from ``worker_matrix``.
    """
    global worker_matrix
    worker_matrix = matrix

def worker(i, j):
    """Compute the similarity of rows ``i`` and ``j`` of ``worker_matrix``.

    Returns the tuple ``(i, j, similarity)`` so the caller knows which cells
    the value belongs to.
    """
    row_i = worker_matrix[i]
    row_j = worker_matrix[j]
    return i, j, calculate_similarity(row_i, row_j)

def main(matrix):
    """Build the symmetric similarity matrix for ``matrix`` using a process pool.

    Every unordered pair of row indices is handed to ``worker`` through
    ``Pool.starmap``; each returned value is written to both ``[i][j]`` and
    ``[j][i]``. Diagonal cells are left at 0. Returns the result as a list of
    lists with ``len(matrix)`` rows and columns.
    """
    n = len(matrix)
    result = [[0 for _ in range(n)] for _ in range(n)]
    index_pairs = itertools.combinations(range(n), 2)

    # Each worker process receives the matrix once, via the initializer.
    with multiprocessing.Pool(initializer=worker_init, initargs=(matrix,)) as pool:
        scores = pool.starmap(worker, index_pairs)

    # starmap has already collected every result, so fill the grid afterwards.
    for row, col, score in scores:
        result[row][col] = score
        result[col][row] = score
    return result

if __name__ == "__main__":
    # Entry-point guard: main() runs only when this file is executed as a
    # script, not when it is imported.
    # NOTE(review): presumably also required so that worker processes which
    # re-import this module do not start pools of their own -- confirm against
    # the multiprocessing programming guidelines.
    # get t_matrix from somewhere
    main(t_matrix)

You are using too many list comprehensions for this amount of data. I would strongly recommend the numpy module. If that is an option, you can do:

import numpy as np
import itertools

# t[i] is row i of t_matrix (assumes t_matrix is a rectangular 2-D table of
# numbers with word_count rows -- TODO confirm against the caller).
t = np.array(t_matrix)

# Per-row totals; broadcasting gives denom[i, j] = s[i] + s[j].
s = np.sum(t, axis=1)
denom = s[:, None] + s[None, :]

# num[i, j] is the sum of the element-wise minimum of rows i and j. The value
# is symmetric in (i, j), so each unordered pair is computed once and
# mirrored instead of being computed twice.
num = np.zeros((word_count, word_count))
for i, j in itertools.combinations_with_replacement(range(word_count), 2):
    num[i, j] = num[j, i] = np.minimum(t[i], t[j]).sum()

# similarity = 2 * num / denom, and 0 wherever denom is 0. Dividing only
# where denom is non-zero avoids the divide-by-zero RuntimeWarning (and the
# throwaway NaNs) produced by dividing first and masking afterwards.
similarity_matrix = np.divide(
    2.0 * num,
    denom,
    out=np.zeros_like(num),
    where=denom != 0,
)
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!