Python - Using threads or a queue to iterate over a for loop that calls a function


Subclass threading.Thread and put your work function in that class as part of run().

import threading
import time
import random

class Worker(threading.Thread):
    def __init__(self, srcfile, printlock, **kwargs):
        super(Worker, self).__init__(**kwargs)
        self.srcfile = srcfile
        self.lock = printlock # so threads don't step on each other's prints

    def run(self):
        with self.lock:
            print("starting %s on %s" % (self.ident,self.srcfile))
        # do whatever you need to, return when done
        # example, sleep for a random interval up to 10 seconds
        time.sleep(random.random()*10)
        with self.lock:
            print("%s done" % self.ident)


def threadme(srcfiles):
    printlock = threading.Lock()
    threadpool = []
    for file in srcfiles:
        threadpool.append(Worker(file, printlock))

    for thr in threadpool:
        thr.start()

    # this loop blocks until all threads are done
    # (join() visits the threads in list order, not necessarily
    # the order in which they finish, but the result is the same)
    for thr in threadpool:
        thr.join()

    print("all threads are done")

if __name__ == "__main__":
    threadme(["abc","def","ghi"])

To limit the number of threads running at once, use the following:

def threadme(infiles, threadlimit=None, timeout=0.01):
    assert threadlimit is None or threadlimit > 0, \
           "need at least one thread"
    printlock = threading.Lock()
    srcfiles = list(infiles)
    threadpool = []

    # keep going while work to do or being done
    while srcfiles or threadpool:

        # while there's room, remove source files
        # and add to the pool
        while srcfiles and (threadlimit is None
                            or len(threadpool) < threadlimit):
            file = srcfiles.pop()
            wrkr = Worker(file, printlock)
            wrkr.start()
            threadpool.append(wrkr)

        # remove completed threads from the pool
        # (iterate over a copy: removing items from a list
        # while looping over it skips the following element)
        for thr in threadpool[:]:
            thr.join(timeout=timeout)
            if not thr.is_alive():
                threadpool.remove(thr)

    print("all threads are done")

if __name__ == "__main__":
    for lim in (1, 2, 3, 4):
        print("--- Running with thread limit %i ---" % lim)
        threadme(("abc", "def", "ghi"), threadlimit=lim)

Note that this will actually process the sources in reverse order, because list.pop() takes items from the end of the list. If you need them started in order, reverse the list first, or use a collections.deque and popleft(), as sketched below.
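Here is a minimal sketch of that deque variant, reusing the Worker class from the first example (the threadme_ordered name is just for illustration):

import collections

def threadme_ordered(infiles, threadlimit=None, timeout=0.01):
    assert threadlimit is None or threadlimit > 0, \
           "need at least one thread"
    printlock = threading.Lock()
    srcfiles = collections.deque(infiles)  # FIFO instead of a list
    threadpool = []

    while srcfiles or threadpool:
        # popleft() takes from the front, so sources start in order
        while srcfiles and (threadlimit is None
                            or len(threadpool) < threadlimit):
            wrkr = Worker(srcfiles.popleft(), printlock)
            wrkr.start()
            threadpool.append(wrkr)

        # reap finished threads (iterate over a copy, as above)
        for thr in threadpool[:]:
            thr.join(timeout=timeout)
            if not thr.is_alive():
                threadpool.remove(thr)

    print("all threads are done")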

I would recommend using mrjob for this.

mrjob is a Python implementation of MapReduce.

Below is the mrjob code to do a parallel word count over a lot of text files:

from mrjob.job import MRJob

class MRWordCounter(MRJob):
    def get_words(self, key, line):
        for word in line.split():
            yield word, 1

    def sum_words(self, word, occurrences):
        yield word, sum(occurrences)

    def steps(self):
        # one map step (get_words) feeding one reduce step (sum_words);
        # self.mr() is the legacy steps API (newer mrjob uses MRStep)
        return [self.mr(self.get_words, self.sum_words)]

if __name__ == '__main__':
    MRWordCounter.run()

This code maps all the files in parallel (counting the words in each file), then reduces the per-word counts into a single total for each word.
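If you want to drive the job from Python rather than the command line, a sketch using the runner API from the same era of mrjob as the code above might look like this. It assumes the MRWordCounter class is saved in its own file, word_counter.py (a name chosen here for illustration), since mrjob requires the job class to live in a separate module from the driver; note that stream_output() and parse_output_line() were replaced by cat_output() and parse_output() in later mrjob releases.

from word_counter import MRWordCounter

if __name__ == '__main__':
    # pass the input files exactly as they would appear on the command line
    job = MRWordCounter(args=['file1.txt', 'file2.txt'])
    with job.make_runner() as runner:
        runner.run()
        # collect and decode the reducer output (word, total count)
        for line in runner.stream_output():
            word, count = job.parse_output_line(line)
            print("%s: %d" % (word, count))

The more common pattern, though, is simply to run the job script itself (python word_counter.py file1.txt file2.txt), which uses mrjob's default inline runner.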
