insert multiple input fields before running scrapy

﹥>﹥吖頭↗ 提交于 2021-02-19 08:30:06

问题


I'm referencing a Stack Overflow answer that is similar to my GUI app, but my Scrapy application is a bit different: when executing the app, the user is prompted to enter keywords for Scrapy to search for.

looks like this

I'm trying to put this logic into the GUI, but I'm unsure how to do it.

here is what the gui looks like as of now.

I want to add input fields where a user can enter the information needed before the Scrapy script is run.

here is a bit of the scrapy script

my_spider.py

import scrapy
import sys
import random
import csv
from scrape.items import Item
from var_dump import var_dump


# Ask on the console for the search term and six locations.
# NOTE(review): these prompts sit at module level, so they run as soon as the
# module is imported and block until every one of them has been answered.
search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second Location:")
third_location = input("Third Location:")
fourth_location = input("Fourth Location:")
fifth_location = input("Fifth Location:")
sixth_location = input("Sixth Location:")




# city = [
#     "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "Fort Worth", 
#     "San Diego", "Dallas", "San Jose", "Austin", "Columbus", "Indianapolis",  "Seattle", "St. Paul", "Nashville", 
#     "Louisville", "Plano"
# ]

# rancity = random.choice(city)


class YellowSpider(scrapy.Spider):
    """Spider named "yellow" that searches yellowpages.com in six locations.

    The search term and the locations are read from the module-level
    variables ``search_item``, ``location`` ... ``sixth_location``.
    The callbacks ``parse4``-``parse6`` and ``businessprofile*`` referenced
    below are not part of this excerpt.
    """


    name = "yellow"

    # start_urls = [
    #     "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location
    #     # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location,
    #     # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location
    # ]

    def start_requests(self):
        """Yield one search request per location, each with its own callback."""
        # The values are concatenated into the URL as typed, without any
        # query-string encoding.
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location, self.parse)
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + second_location, self.parse2)
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location, self.parse3)
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location, self.parse4)
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fifth_location, self.parse5)
        yield scrapy.Request("https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + sixth_location, self.parse6)
        # yield scrapy.Request('http://www.example.com/3.html', self.parse)

    def __init__(self):
        """Initialise the four ``seen_*`` lists as empty."""
        # NOTE(review): the base class __init__ is not called here -- confirm
        # that this is intended.
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def parse(self, response):
        """Follow business links to ``businessprofile`` and pagination links to ``parse``."""
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile)

        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse2(self, response):
        """Same as ``parse`` but routed to ``businessprofile2`` / ``parse2``."""
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile2)

        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse2)

    def parse3(self, response):
        """Same as ``parse`` but routed to ``businessprofile3`` / ``parse3``."""
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile3)

        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse3)
        ........

here is the GUI

main.py

from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets

class ScrapyWorker(QtCore.QObject):
    """Drive the ``scrapy`` command line tool through a ``QProcess``.

    Signals:
        logChanged(str): text read from the crawl process (stdout and
            stderr are merged into one channel).
        started(): re-emitted when the crawl process starts.
        finished(): re-emitted when the crawl process finishes.
    """

    logChanged = QtCore.pyqtSignal(str)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        """Create the crawl process and wire its signals to this object."""
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.setProgram('scrapy')
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, spider):
        """Start ``scrapy crawl <spider>`` with ``project`` as working directory."""
        self._process.setWorkingDirectory(project)
        self._process.setArguments(['crawl', spider])
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        """Forward newly available process output through ``logChanged``."""
        # errors="replace": a chunk may end in the middle of a multi-byte
        # character, and a strict decode would raise inside this slot.
        data = self._process.readAllStandardOutput().data().decode(errors="replace")
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        """Kill the crawl process."""
        self._process.kill()

    def spiders(self, project):
        """Return the spider names printed by ``scrapy list`` run in ``project``.

        A local event loop is used while the command runs. An empty list is
        returned when the command cannot be started, crashes, or exits with
        a non-zero code, so error text is never mistaken for spider names.
        """
        process = QtCore.QProcess()
        process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        process.setWorkingDirectory(project)
        loop = QtCore.QEventLoop()
        process.finished.connect(loop.quit)
        # finished() is not emitted when the program fails to start; without
        # this connection the loop below would never return in that case.
        process.errorOccurred.connect(loop.quit)
        process.start('scrapy', ['list'])
        # A start failure can already have been reported inside start(); a
        # quit() issued before exec_() has no effect, so skip the loop then.
        if process.state() != QtCore.QProcess.NotRunning:
            loop.exec_()
        failed = (
            process.error() == QtCore.QProcess.FailedToStart
            or process.exitStatus() != QtCore.QProcess.NormalExit
            or process.exitCode() != 0
        )
        if failed:
            return []
        return process.readAllStandardOutput().data().decode(errors="replace").split()

class MainWindow(QtWidgets.QMainWindow):
    """Window that selects a Scrapy project and starts/stops one of its spiders.

    The search/location line edits are displayed but their text is not read
    anywhere in this class, and ``spider_combobox`` is filled with spider
    names without being placed in a layout.
    """

    def __init__(self, parent=None):
        """Create the widgets, arrange them and connect the signals."""
        super(MainWindow, self).__init__(parent)

        self.project_le = QtWidgets.QLineEdit()
        self.project_button = QtWidgets.QPushButton('Select Project')
        self.spider_combobox = QtWidgets.QComboBox()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()
        # One line edit for the search item, then one per location.
        self.input = QtWidgets.QLineEdit()
        self.input1 = QtWidgets.QLineEdit()
        self.input2 = QtWidgets.QLineEdit()
        self.input3 = QtWidgets.QLineEdit()
        self.input4 = QtWidgets.QLineEdit()
        self.input5 = QtWidgets.QLineEdit()
        self.input6 = QtWidgets.QLineEdit()
        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)

        lay = QtWidgets.QVBoxLayout(central_widget)

        # Top row: project path + "Select Project" button.
        project_row = QtWidgets.QHBoxLayout()
        project_row.addWidget(self.project_le)
        project_row.addWidget(self.project_button)
        lay.addLayout(project_row)

        # One "label + stretching line edit" row per input field.
        labelled_inputs = (
            ("Input The Search Item :", self.input),
            ("Location :", self.input1),
            ("Location 2 :", self.input2),
            ("Location 3 :", self.input3),
            ("Location 4 :", self.input4),
            ("Location 5 :", self.input5),
            ("Location 6 :", self.input6),
        )
        for caption, line_edit in labelled_inputs:
            row = QtWidgets.QHBoxLayout()
            row.addWidget(QtWidgets.QLabel(caption))
            row.addWidget(line_edit, 1)
            lay.addLayout(row)

        lay.addWidget(self.start_stop_button)
        lay.addWidget(self.text_edit)

        # Start stays disabled until a project with spiders has been chosen.
        self.start_stop_button.setEnabled(False)

        self.scrapy_worker = ScrapyWorker(self)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))

        self.start_stop_button.toggled.connect(self.on_checked)
        self.project_button.clicked.connect(self.select_project)
        self.resize(640, 480)

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Start the selected spider when checked, stop the worker when unchecked."""
        if not state:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()
            return

        cfg_info = QtCore.QFileInfo(self.project_le.text())
        project_dir = cfg_info.dir().absolutePath()
        self.scrapy_worker.run(project_dir, self.spider_combobox.currentText())
        self.start_stop_button.setText('Stop')

    @QtCore.pyqtSlot()
    def select_project(self):
        """Let the user pick a ``.cfg`` file and load the spiders of its directory."""
        filename, _ = QtWidgets.QFileDialog.getOpenFileName(
            self,
            "Select .cfg file",
            QtCore.QDir.currentPath(),
            "Configure File (*.cfg)"
        )
        if not filename:
            return

        self.project_le.setText(filename)
        project_dir = QtCore.QFileInfo(filename).dir().absolutePath()
        spider_names = self.scrapy_worker.spiders(project_dir)
        self.spider_combobox.clear()
        self.spider_combobox.addItems(spider_names)
        self.start_stop_button.setEnabled(bool(spider_names))

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Append ``text`` to the log view, then restore the previous cursor."""
        saved_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(saved_cursor)

if __name__ == '__main__':
    # Script entry point: create the application, show the window and run
    # the Qt event loop until it exits.
    import sys

    application = QtWidgets.QApplication(sys.argv)
    application.setStyle('fusion')
    window = MainWindow()
    window.show()
    sys.exit(application.exec_())

回答1:


First you have to modify your spider to accept arguments passed directly from the console, instead of using the input() method:

yellowpage_spider.py

import json
import scrapy
from scrape.items import Item

class YellowSpider(scrapy.Spider):
    """Yellow Pages spider driven by a JSON ``parameters`` spider argument.

    ``parameters`` is a JSON object with the keys ``search_item`` (string)
    and ``locations`` (list of strings), for example::

        scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa"]}'
    """

    name = "yellow"

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base spider and create the ``seen_*`` lists."""
        super(YellowSpider, self).__init__(*args, **kwargs)
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def start_requests(self):
        """Yield one search request per location listed in ``parameters``.

        Nothing is requested when the spider was started without the
        ``parameters`` argument.
        """
        # Function-scope import so this block does not rely on a new
        # module-level import.
        from urllib.parse import urlencode

        raw_parameters = getattr(self, 'parameters', None)
        if raw_parameters is None:
            return
        parameters = json.loads(raw_parameters)
        search_item = parameters['search_item']
        for location in parameters['locations']:
            # urlencode escapes characters such as '&', '#' or '+' that would
            # otherwise corrupt the query string when typed by the user.
            query = urlencode({
                'search_terms': search_item,
                'geo_location_terms': location,
            })
            url = "https://www.yellowpages.com/search?" + query
            yield scrapy.Request(url=url, callback=self.parse, meta={'location': location})

    def parse(self, response):
        """Follow business links to ``businessprofile`` and pagination links to ``parse``.

        The location stored in the request meta is passed on unchanged.
        """
        location = response.meta['location']
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile, meta={'location': location})

        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse, meta={'location': location})

    def businessprofile(self, response):
        """Build an ``Item`` from a business page and yield it at most once.

        An item is yielded only when it has a phone number, a website and at
        least one e-mail address that is not yet in ``self.seen_emails``.
        """
        location = response.meta['location']
        for business in response.css('header#main-header'):
            item = Item()
            item['business_name'] = business.css('div.sales-info h1::text').extract()

            website_links = business.css('a.secondary-btn.website-link::attr(href)').extract()
            # NOTE(review): str(list).strip('[]') keeps the repr quotes around
            # the URL ("'http://...'"); left unchanged because consumers of
            # the item may depend on this format -- confirm.
            item['website'] = str(website_links).strip('[]')

            item['location'] = location

            email_links = business.css('a.email-business::attr(href)').extract()
            # Drops the first 7 characters of every href -- presumably the
            # "mailto:" prefix; confirm against the page markup.
            item['email'] = [link[7:] for link in email_links]

            item['phonenumber'] = business.css('p.phone::text').extract_first()

            if not (item['phonenumber'] and item['website']):
                continue
            new_emails = [
                email for email in item['email']
                if email not in self.seen_emails
            ]
            if not new_emails:
                continue
            # Record each new address once (order kept) and yield the item a
            # single time instead of once per new address.
            self.seen_emails.extend(dict.fromkeys(new_emails))
            yield item

Then the previous code expects a parameter called parameters:

scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany", "brazil"]}'

So in the GUI we must now form the entry using the GUI inputs:

gui.py

import os
import json
from functools import partial
from PyQt5 import QtCore, QtGui, QtWidgets
import utils


# Absolute path of the directory that contains this file.
dir_path = os.path.dirname(os.path.abspath(__file__))
# "assets/icons" below that directory; the add/remove button icons are loaded from here.
icons_dir = os.path.join(dir_path, 'assets', 'icons')


class ScrapyWorker(QtCore.QObject):
    """Run an external crawl command in a ``QProcess`` and relay its output.

    Signals:
        logChanged(str): text read from the process (stdout and stderr are
            merged into one channel).
        started(): re-emitted when the process starts.
        finished(): re-emitted when the process finishes.
    """

    logChanged = QtCore.pyqtSignal(str)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    def __init__(self, parent=None):
        """Create the process and wire its signals to this object."""
        super(ScrapyWorker, self).__init__(parent)
        self._process = QtCore.QProcess(self)
        self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput)
        self._process.started.connect(self.started)
        self._process.finished.connect(self.finished)

    def run(self, project, program, arguments):
        """Start ``program`` with ``arguments`` using ``project`` as working directory.

        Args:
            project: directory the process is started in.
            program: executable to launch (for example ``'scrapy'``).
            arguments: list of command line arguments.
        """
        self._process.setWorkingDirectory(project)
        # Honour the ``program`` argument; it used to be ignored in favour of
        # a hard-coded 'scrapy'.
        self._process.setProgram(program)
        self._process.setArguments(arguments)
        self._process.start()

    @QtCore.pyqtSlot()
    def on_readyReadStandardOutput(self):
        """Forward newly available process output through ``logChanged``."""
        # errors="replace": a chunk may end in the middle of a multi-byte
        # character, and a strict decode would raise inside this slot.
        data = self._process.readAllStandardOutput().data().decode(errors="replace")
        self.logChanged.emit(data)

    @QtCore.pyqtSlot()
    def stop(self):
        """Kill the process."""
        self._process.kill()

class LocationWidget(QtWidgets.QWidget):
    """Vertical list of location rows, each a line edit plus a tool button.

    The button of the last row adds a new row; the button of any other row
    removes that row.
    """

    def __init__(self, parent=None):
        """Set up the margin-free vertical layout and create the first row."""
        super(LocationWidget, self).__init__(parent)
        self.setContentsMargins(0, 0, 0, 0)
        self.lay = QtWidgets.QVBoxLayout(self)
        self.lay.setContentsMargins(0, 0, 0, 0)
        self.lay.addStretch()
        self.widgets = []
        self.create_row()

    def create_row(self):
        """Append a new row (line edit + "add" button) at the end of the layout."""
        row = QtWidgets.QWidget()
        row.setContentsMargins(0, 0, 0, 0)
        row_layout = QtWidgets.QHBoxLayout(row)
        row_layout.setContentsMargins(0, 0, 0, 0)

        line_edit = QtWidgets.QLineEdit()
        tool_button = QtWidgets.QToolButton()
        tool_button.clicked.connect(self.on_clicled)
        tool_button.setFocusPolicy(QtCore.Qt.NoFocus)
        tool_button.setIconSize(QtCore.QSize(24, 24))
        tool_button.setIcon(QtGui.QIcon(os.path.join(icons_dir, 'add.png')))

        row_layout.addWidget(line_edit)
        row_layout.addWidget(tool_button)

        self.widgets.append(row)
        self.lay.insertWidget(-1, row)

    @QtCore.pyqtSlot()
    def on_clicled(self):
        """Add a row if the clicked button is in the last row, else remove its row."""
        row = self.sender().parentWidget()
        clicked_last_row = self.lay.indexOf(row) == self.lay.count() - 1
        if clicked_last_row:
            self.create_row()
        else:
            self.lay.removeWidget(row)
            row.deleteLater()
            self.widgets.remove(row)

        # Refresh the icons: every row shows "remove" except the last one,
        # which shows "add".
        remove_icon = QtGui.QIcon(os.path.join(icons_dir, 'remove.png'))
        add_icon = QtGui.QIcon(os.path.join(icons_dir, 'add.png'))
        for other_row in self.widgets[:-1]:
            other_row.findChild(QtWidgets.QToolButton).setIcon(remove_icon)
        self.widgets[-1].findChild(QtWidgets.QToolButton).setIcon(add_icon)

    def get_locations(self):
        """Return the non-empty texts of the rows' line edits, in row order."""
        texts = (
            row.findChild(QtWidgets.QLineEdit).text()
            for row in self.widgets
        )
        return [text for text in texts if text]

class YellowWidget(QtWidgets.QMainWindow):
    """Main window: search field, location rows, Start/Stop button and log view."""

    def __init__(self, parent=None):
        """Create the widgets, place them in a grid and connect the signals."""
        super(YellowWidget, self).__init__(parent)
        self.setWindowTitle('Yellow Pages Scrapper')

        self.scrapy_worker = ScrapyWorker(self)
        self.search_item_le = QtWidgets.QLineEdit()
        self.location_widget = LocationWidget()
        self.start_stop_button = QtWidgets.QPushButton("Start", checkable=True)
        self.text_edit = QtWidgets.QTextBrowser()

        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)

        grid = QtWidgets.QGridLayout(central_widget)
        search_label = QtWidgets.QLabel("<b>Search:</b>")
        locations_label = QtWidgets.QLabel("<b>Locations:</b>")
        top_left = QtCore.Qt.AlignTop | QtCore.Qt.AlignLeft
        grid.addWidget(search_label, 0, 0)
        grid.addWidget(self.search_item_le, 0, 1)
        grid.addWidget(locations_label, 1, 0, alignment=top_left)
        grid.addWidget(self.location_widget, 1, 1, alignment=QtCore.Qt.AlignTop)
        # The button and the log view span both columns.
        grid.addWidget(self.start_stop_button, 2, 0, 1, 2)
        grid.addWidget(self.text_edit, 3, 0, 1, 2)

        self.start_stop_button.toggled.connect(self.on_checked)
        self.scrapy_worker.logChanged.connect(self.insert_log)
        self.scrapy_worker.started.connect(self.text_edit.clear)
        self.scrapy_worker.finished.connect(partial(self.start_stop_button.setChecked, False))

    @QtCore.pyqtSlot(bool)
    def on_checked(self, state):
        """Start a crawl from the GUI inputs when checked, stop the worker otherwise."""
        if not state:
            self.start_stop_button.setText('Start')
            self.scrapy_worker.stop()
            return

        # Equivalent command line:
        # scrapy crawl yellow -a parameters='{"search_item": "house", "locations": ["usa", "germany"]}'
        directory, program, args = utils.create_arguments(
            self.search_item_le.text(),
            self.location_widget.get_locations(),
        )
        self.scrapy_worker.run(directory, program, args)
        self.start_stop_button.setText('Stop')

    @QtCore.pyqtSlot(str)
    def insert_log(self, text):
        """Append ``text`` to the log view, then restore the previous cursor."""
        saved_cursor = self.text_edit.textCursor()
        self.text_edit.moveCursor(QtGui.QTextCursor.End)
        self.text_edit.insertPlainText(text)
        self.text_edit.setTextCursor(saved_cursor)

if __name__ == '__main__':
    # Script entry point: create the application, show a 640x480 window and
    # run the Qt event loop until it exits.
    import sys

    application = QtWidgets.QApplication(sys.argv)
    application.setStyle('fusion')
    window = YellowWidget()
    window.resize(640, 480)
    window.show()
    sys.exit(application.exec_())

I used a function that is in the utils.py file:

import os
import json

def create_arguments(search_item, locations):
    """Build what is needed to launch the "yellow" spider.

    Args:
        search_item: search term to pass to the spider.
        locations: list of location strings to pass to the spider.

    Returns:
        A ``(directory, program, arguments)`` tuple: the ``scrape`` folder
        next to this file, the program name ``'scrapy'`` and the argument
        list ``['crawl', 'yellow', '-a', 'parameters=<json>']``.
    """
    here = os.path.abspath(__file__)
    project_dir = os.path.join(os.path.dirname(here), 'scrape')
    # The spider receives both values as one JSON document in "parameters".
    payload = json.dumps({"search_item": search_item, "locations": locations})
    crawl_args = ['crawl', 'yellow', "-a", 'parameters=' + payload]
    return project_dir, 'scrapy', crawl_args

Obtaining the following:

The complete project is here.



来源:https://stackoverflow.com/questions/55212321/insert-multiple-input-fields-before-running-scrapy

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!