Scraping concurrently with Selenium in Python


Here is a different approach that I've had success with: create the workers in __main__ and have each one pull jobs from a shared task_q, so every worker runs its own Selenium driver in its own process.

import multiprocessing
import traceback

class scrapeWorker(multiprocessing.Process):
    def __init__(self, worker_num, task_q, result_q):
        super().__init__()
        self.worker_num = worker_num
        self.task_q = task_q
        self.result_q = result_q

        self.scraper = my_scraper_class() # this contains driver code, methods, etc.
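        # note: __init__ runs in the parent process; the Selenium
        # driver itself is only started in run(), in the child process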

    def handleWork(self, work):
        assert isinstance(work, (tuple, list)), "work should be a tuple or list, found {}".format(type(work))
        assert len(work) == 2, "len(work) != 2, found {}".format(work)
        assert isinstance(work[1], dict), "work[1] should be a dict, found {}".format(type(work[1]))

        # call the named scraper method with the given kwargs
        result = getattr(self.scraper, work[0])(**work[1])

        self.result_q.put(result)

    # worker.run() is actually called via worker.start()
    def run(self):
        try:
            self.scraper.startDriving()

            while True:
                work = self.task_q.get()

                if work == 'KILL':
                    self.scraper.driver.quit()
                    break

                self.handleWork(work)
        except Exception:
            print(traceback.format_exc())
            raise

if __name__ == "__main__":
    num_workers = 4

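    # queues from a Manager are proxy objects that can be
    # shared safely between the worker processes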
    manager = multiprocessing.Manager()
    task_q = manager.Queue()
    result_q = manager.Queue()

    workers = []
    for worker_num in range(num_workers):
        worker = scrapeWorker(worker_num, task_q, result_q)
        worker.start()
        workers.append(worker)

    # you decide what job_stuff is: a list of work items,
    # each work item == ('method_name', {'kw_1': val_1, ...})
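    # for example (hypothetical method name and kwargs, matching the
    # my_scraper_class sketch below):
    # job_stuff = [('scrape_page', {'url': 'http://example.com/a'}),
    #              ('scrape_page', {'url': 'http://example.com/b'})]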
    for work in job_stuff:
        task_q.put(work)

    results = []
    while len(results) < len(job_stuff):
        results.append(result_q.get())

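    # send one 'KILL' sentinel per worker so each process
    # sees exactly one and can quit its driver cleanly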
    for worker in workers:
        task_q.put("KILL")

    for worker in workers:
        worker.join()

    print("finished!")
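
The worker code above only assumes that my_scraper_class exposes a startDriving() method and a driver attribute, and that work methods are dispatched by name via getattr. As a minimal sketch (assuming Chrome; the scrape_page method is hypothetical, not part of the original answer), such a class could look like:

from selenium import webdriver

class my_scraper_class:
    def startDriving(self):
        # runs inside the worker process, so each worker gets
        # its own independent browser instance
        self.driver = webdriver.Chrome()

    def scrape_page(self, url):
        # hypothetical work method, dispatched as
        # getattr(scraper, 'scrape_page')(url=...)
        self.driver.get(url)
        return self.driver.title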

