I\'d like to implement some unit tests in a Scrapy (screen scraper/web crawler). Since a project is run through the \"scrapy crawl\" command I can run it through something
I'm using Twisted's trial to run tests, similar to Scrapy's own tests. It already starts a reactor, so I make use of the CrawlerRunner without worrying about starting and stopping one in the tests.
Stealing some ideas from the check and parse Scrapy commands I ended up with the following base TestCase class to run assertions against live sites:
from twisted.trial import unittest
from scrapy.crawler import CrawlerRunner
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.spider import iterate_spider_output
class SpiderTestCase(unittest.TestCase):
def setUp(self):
self.runner = CrawlerRunner()
def make_test_class(self, cls, url):
"""
Make a class that proxies to the original class,
sets up a URL to be called, and gathers the items
and requests returned by the parse function.
"""
class TestSpider(cls):
# This is a once used class, so writing into
# the class variables is fine. The framework
# will instantiate it, not us.
items = []
requests = []
def start_requests(self):
req = super(TestSpider, self).make_requests_from_url(url)
req.meta["_callback"] = req.callback or self.parse
req.callback = self.collect_output
yield req
def collect_output(self, response):
try:
cb = response.request.meta["_callback"]
for x in iterate_spider_output(cb(response)):
if isinstance(x, (BaseItem, dict)):
self.items.append(x)
elif isinstance(x, Request):
self.requests.append(x)
except Exception as ex:
print("ERROR", "Could not execute callback: ", ex)
raise ex
# Returning any requests here would make the crawler follow them.
return None
return TestSpider
Example:
@defer.inlineCallbacks
def test_foo(self):
tester = self.make_test_class(FooSpider, 'https://foo.com')
yield self.runner.crawl(tester)
self.assertEqual(len(tester.items), 1)
self.assertEqual(len(tester.requests), 2)
or perform one request in the setup and run multiple tests against the results:
@defer.inlineCallbacks
def setUp(self):
super(FooTestCase, self).setUp()
if FooTestCase.tester is None:
FooTestCase.tester = self.make_test_class(FooSpider, 'https://foo.com')
yield self.runner.crawl(self.tester)
def test_foo(self):
self.assertEqual(len(self.tester.items), 1)