Python Webdriver Multithread

社会主义新天地 提交于 2019-12-31 00:44:17

问题


I'm trying to spawn multiple webdriver instances with the code from: http://www.ibm.com/developerworks/aix/library/au-threadingpython/

import time
import Queue
import urllib2
import threading
from selenium import webdriver
from BeautifulSoup import BeautifulSoup
# Sites to be opened; main() puts each one on the work queue.
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
    "http://ibm.com", "http://apple.com"]
# NOTE(review): there are no call parentheses here, so both names are bound
# to the Queue.Queue class itself rather than to queue instances.
queue = Queue.Queue
out_queue = Queue.Queue

class Login_Driver(threading.Thread):
    """Thread that takes hosts from ``queue`` and puts page chunks on ``out_queue``."""
    def __init__(self, queue, out_queue, driver):
        # queue: where hosts are read from; out_queue: where page chunks are
        # written; driver: webdriver object stored on the instance.
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue
        self.driver = driver
        print driver.title
    def run(self):
        while True:
            # take the next host from the queue
            host = self.queue.get()
            # load the host, then grab the page content
            # NOTE(review): these two lines use a bare `driver`, not
            # `self.driver`, and call page_source() as a method - confirm
            # both are intended.
            driver.get(host)
            chunk = driver.page_source()
            # hand the chunk over to the out queue
            self.out_queue.put(chunk)
            # tell the queue that this job is done
            self.queue.task_done()
class Poster(threading.Thread):
    """Thread that takes page chunks from ``out_queue`` and prints their title tags."""
    def __init__(self, driver, out_queue):
        # NOTE(review): unlike Login_Driver.__init__, this does not call
        # threading.Thread.__init__ - confirm whether that is intended.
        self.out_queue = out_queue
        self.driver = driver
        print driver.name
    def run(self):
        while True:
            # take the next page chunk from the out queue
            chunk = self.out_queue.get()
            # parse the chunk
            soup = BeautifulSoup(chunk)
            print soup.findAll(['title'])
            # tell the out queue that this chunk is done
            self.out_queue.task_done()
# Reference timestamp for the "Elapsed Time" line printed at the end of the script.
start = time.time()
def main():
    """Start the webdriver threads, queue the hosts, start the parser threads, then wait on both queues."""
    # spawn a pool of threads, each with its own Firefox webdriver
    for i in range(5):
        driver = webdriver.Firefox()
        t = Login_Driver(queue, out_queue, driver)
        t.setDaemon(True)
        t.start()
        time.sleep(20)
    # populate the queue with the hosts to open
    for host in hosts:
        queue.put(host)
    for i in range(5):
        # NOTE(review): Poster.__init__ is declared as (driver, out_queue),
        # but only one argument is passed here.
        dt = Poster(out_queue)
        dt.setDaemon(True)
        dt.start()
    # wait on both queues until everything has been processed
    queue.join()
    out_queue.join()
# Run the whole pipeline, then report the time elapsed since `start` was taken.
main()
print "Elapsed Time: %s" % (time.time() - start)

It fails with: TypeError: unbound method get() must be called with Queue instance as first argument (got nothing instead)

I'm new to threads, classes, and processes. Can you please tell me which is better to use here, threads or processes? An example would be great. Thank you.

UPDATE

Working code:

import time
import Queue
import urllib2
import threading
from selenium import webdriver
from BeautifulSoup import BeautifulSoup

# Sites to be opened; main() puts each one on the work queue.
hosts = [
    "http://yahoo.com",
    "http://google.com",
    "http://amazon.com",
    "http://ibm.com",
    "http://apple.com",
]
# `queue` carries hosts to the webdriver threads; `out_queue` carries the
# resulting page source to the parser threads.
queue, out_queue = Queue.Queue(), Queue.Queue()

class Login_Driver(threading.Thread):
    """Worker thread that opens queued hosts in a webdriver.

    Each host taken from ``queue`` is opened with ``driver`` and the page
    source is put on ``out_queue`` for the parser threads.
    """

    def __init__(self, queue, out_queue, driver):
        """Store the two queues and the webdriver used by this thread.

        queue     -- queue the hosts are read from
        out_queue -- queue the page source is written to
        driver    -- webdriver object used to open each host
        """
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue
        self.driver = driver
        print("In init first class..")

    def run(self):
        """Open hosts from the queue in an endless loop.

        A host that fails to load is reported and skipped. task_done() is
        called for every host taken from the queue, whether or not it loaded,
        so a failing page cannot leave queue.join() waiting forever.
        """
        while True:
            # get() waits here until a host is available
            host = self.queue.get()
            try:
                self.driver.get(host)
                chunk = self.driver.page_source
                # pass the page source on to the parser threads
                self.out_queue.put(chunk)
                print(self.driver.title)
            except Exception as exc:
                # Keep the worker alive: an uncaught error would end the
                # thread and leave the remaining hosts unprocessed.
                print("Failed to open %s: %s" % (host, exc))
            finally:
                # always balance the get() above
                self.queue.task_done()
class Poster(threading.Thread):
    """Parser thread that prints the title tags of queued page sources."""

    def __init__(self, out_queue, driver):
        """Store the page-source queue and a webdriver.

        out_queue -- queue the page source is read from
        driver    -- webdriver object; only its ``name`` is printed by run()
        """
        threading.Thread.__init__(self)
        self.out_queue = out_queue
        self.driver = driver
        print("In init a second class..")

    def run(self):
        """Parse page sources from the queue in an endless loop.

        A chunk that cannot be parsed is reported and skipped. task_done()
        is called for every chunk taken from the queue, whether or not it
        parsed, so a bad page cannot leave out_queue.join() waiting forever.
        """
        while True:
            # get() waits here until a page source is available
            chunk = self.out_queue.get()
            try:
                soup = BeautifulSoup(chunk)
                print(soup.findAll(['title']))
                print(self.driver.name)
            except Exception as exc:
                # Keep the thread alive so the remaining chunks still get processed.
                print("Failed to parse page: %s" % (exc,))
            finally:
                # always balance the get() above
                self.out_queue.task_done()
# Reference timestamp for the "Elapsed Time" line printed at the end of the script.
start = time.time()
def main():
    """Run the crawl: start the workers, queue the hosts, wait, clean up.

    Starts five Login_Driver threads, each with its own Firefox webdriver,
    puts every entry of ``hosts`` on ``queue``, starts five Poster threads
    and waits until both queues have been fully processed. Every webdriver
    that was started is quit before returning, including when an error
    interrupts the run, so no browser is left running.
    """
    drivers = []
    try:
        # spawn a pool of threads, each with its own webdriver
        for i in range(5):
            driver = webdriver.Firefox()
            drivers.append(driver)
            t = Login_Driver(queue, out_queue, driver)
            t.setDaemon(True)
            t.start()
            print("Started webdriver: --- "+str(i)+" --- from main")
        print("All started")
        # NOTE(review): the fixed pauses are kept from the original version;
        # presumably they only pace the console output - confirm before
        # removing them.
        time.sleep(3)
        # populate the queue with the hosts to open
        for host in hosts:
            queue.put(host)
            print("Opening website: "+host)
        print("All sites passed for opening..")
        time.sleep(3)
        # The parser threads all receive the most recently created webdriver,
        # which is what the leaked loop variable used to supply implicitly.
        for i in range(5):
            dt = Poster(out_queue, drivers[-1])
            dt.setDaemon(True)
            dt.start()
            print("Starting second class/title and name beautifull soup and webdriver: --- "+str(i)+" --- from main")
        print("Started secound class..")
        time.sleep(3)
        # wait on both queues until everything has been processed
        queue.join()
        out_queue.join()
        print("out_queue.join()")
    finally:
        # Close every browser that was started; one failing quit() must not
        # stop the others from being closed.
        for started in drivers:
            try:
                started.quit()
            except Exception as exc:
                print("Failed to quit webdriver: %s" % (exc,))
# Run the whole pipeline, then report the time elapsed since `start` was taken.
main()
elapsed = time.time() - start
print("Elapsed Time: %s" % elapsed)

回答1:


You need to use Queue.Queue() instead of Queue.Queue




回答2:


You are not instantiating the Queue correctly. Instead of,

# binds the Queue.Queue class itself to each name; nothing is instantiated
queue = Queue.Queue
out_queue = Queue.Queue

it should be

# calling the class creates a separate queue instance for each name
queue = Queue.Queue()
out_queue = Queue.Queue()


来源:https://stackoverflow.com/questions/37615350/python-webdriver-multithread

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!