I'm using Scrapy and I'm trying to use Celery to manage multiple spiders on one machine. The problem I have (which is a bit difficult to explain) is that the spiders get multiplied: if my first spider has started and I then start a second spider, the first spider executes twice.
See my code here:
ProcessJob.py
from multiprocessing import Process

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor

import CrawlerSettings
import MysqlConnector
import spider


class ProcessJob():
    def processJob(self, job):
        # Mark the job as running in the database
        mysql = MysqlConnector.Mysql()
        db = mysql.getConnection()
        cur = db.cursor()
        job.status = 1
        update = "UPDATE job SET status=1 WHERE id=" + str(job.id)
        cur.execute(update)
        db.commit()
        db.close()

        # Start a new crawler for this job
        configure_logging()
        webspider = spider.MySpider
        if job.ajax == 1:
            webspider.custom_settings = CrawlerSettings.ajax_settings
        else:
            webspider.custom_settings = CrawlerSettings.normal_settings
        crawler = UrlCrawlerScript(webspider, job)
        crawler.start()
        crawler.join()
        reactor.stop()


class UrlCrawlerScript(Process):
    def __init__(self, spider, job):
        Process.__init__(self)
        self.crawler = CrawlerRunner()
        self.crawler.crawl(spider, job=job)

    def run(self):
        # Wait for all queued crawls to finish, then stop the reactor
        d = self.crawler.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run(0)  # installSignalHandlers=0
And the Spider:
from urlparse import urlparse

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# Item is my project's item class; its rough shape is shown below


class MySpider(CrawlSpider):

    def __init__(self, job):
        self.job = job

        # Derive allowed domains and start URLs from the job's
        # comma-separated URL list
        allowedDomainsPre = job.url.split(",")
        allowedDomains = []
        for domains in allowedDomainsPre:
            parsed_uri = urlparse(domains)
            domain = '{uri.netloc}'.format(uri=parsed_uri)
            print domain
            allowedDomains.append(domain)
        self.allowed_domains = allowedDomains
        self.start_urls = allowedDomainsPre

        # Split the job's processing patterns into allow/deny lists;
        # a leading '-' marks a pattern as denied
        jobPatterns = job.processing_patterns.split(",")
        allowedPatterns = []
        deniedPatterns = []
        for pattern in jobPatterns:
            if '-' in pattern:
                deniedPatterns.append(pattern.replace("-", ""))
            else:
                allowedPatterns.append(pattern)

        self._rules = [
            Rule(LinkExtractor(allow=(allowedPatterns), deny=(deniedPatterns)),
                 callback=self.parse_items, follow=True)
        ]
        self.name = job.id

    def parse_items(self, response):
        # Store the raw page together with the id of the job that produced it
        item = Item()
        item['html'] = response.body
        item['url'] = response.url
        item['job_id'] = self.job.id
        return item
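The Item used in parse_items is my own item class, imported from my items module; simplified, it looks roughly like this:

import scrapy


class Item(scrapy.Item):
    # raw page body, source URL, and the id of the job that produced it
    html = scrapy.Field()
    url = scrapy.Field()
    job_id = scrapy.Field()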
What this does: I retrieve new jobs from my database (not shown here; it would be a bit too much code). Then I process each of them by running a spider. As I said, the problem is that when I execute two jobs at a time, the first spider is "doubled" (it executes twice in parallel).
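For context, the Celery side is roughly the following; the broker URL and task name here are placeholders, not my exact setup:

from celery import Celery

from ProcessJob import ProcessJob

app = Celery('tasks', broker='amqp://localhost')  # placeholder broker URL


@app.task
def run_job(job):
    # One Celery task per job; each task hands its job to ProcessJob,
    # which starts a spider for it
    ProcessJob().processJob(job)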
Any suggestions on how to fix this? It's possibly a problem with the reactor again :(
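In case it matters, the direction I've been thinking about is to give every crawl a completely fresh process with its own CrawlerProcess (and therefore its own reactor), instead of sharing reactor state between jobs. A minimal sketch of that idea (the helper names are mine, and I haven't verified this works cleanly with Celery):

from multiprocessing import Process

from scrapy.crawler import CrawlerProcess

import CrawlerSettings
import spider


def _run_crawl(job):
    # Runs in a child process, so each crawl gets a fresh reactor
    if job.ajax == 1:
        settings = CrawlerSettings.ajax_settings
    else:
        settings = CrawlerSettings.normal_settings
    process = CrawlerProcess(settings)
    process.crawl(spider.MySpider, job=job)
    process.start()  # blocks until the crawl finishes


def run_isolated(job):
    p = Process(target=_run_crawl, args=(job,))
    p.start()
    p.join()

Is that the right way to keep two jobs from interfering with each other, or is there a cleaner pattern for Scrapy + Celery?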