Scrape multiple urls using QWebPage

后端 未结 1 1494
天命终不由人
天命终不由人 2020-11-22 13:20

I'm using Qt's QWebPage to render a page that uses javascript to update its content dynamically - so a library that just downloads a static version of the page (such as ur

相关标签:
1条回答
  • 2020-11-22 13:55

    The problem with your program is that you are attempting to create a new QApplication with every url you fetch.

    Instead, you should create one QApplication, and handle all the loading and processing of web pages within the WebPage class itself. The key concept is to use the loadFinished signal to create a loop by fetching a new url after the current one has been loaded and processed.

    The two demo scripts below (for PyQt4 and PyQt5) are simplified examples that show how to structure the program. Hopefully, it should be fairly obvious how to adapt them for your own use:

    import sys
    from PyQt4 import QtCore, QtGui, QtWebKit
    
    class WebPage(QtWebKit.QWebPage):
        """Web page that loads a series of urls one after another.

        The main frame's loadFinished signal drives the loop: every finished
        load processes the current page and then starts the next url, and the
        application is quit once no urls are left.
        """

        def __init__(self):
            super(WebPage, self).__init__()
            frame = self.mainFrame()
            frame.loadFinished.connect(self.handleLoadFinished)

        def start(self, urls):
            """Store an iterator over *urls* and start loading the first one."""
            self._urls = iter(urls)
            self.fetchNext()

        def fetchNext(self):
            """Load the next pending url.

            Returns True when a load was started, False when the url iterator
            is exhausted.
            """
            try:
                target = next(self._urls)
            except StopIteration:
                return False
            self.mainFrame().load(QtCore.QUrl(target))
            return True

        def processCurrentPage(self):
            """Read the url and html of the main frame and report the load."""
            frame = self.mainFrame()
            url = frame.url().toString()
            html = frame.toHtml()
            # do stuff with html...
            print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

        def handleLoadFinished(self):
            """Slot for loadFinished: process the page, then continue or quit."""
            self.processCurrentPage()
            if self.fetchNext():
                return
            # nothing left to fetch, so end the event loop
            QtGui.qApp.quit()
    
    if __name__ == '__main__':

        # generate some test urls: one documentation page for every
        # QtWebKit attribute whose name starts with 'Q'
        url = 'http://pyqt.sourceforge.net/Docs/PyQt4/%s.html'
        urls = [
            url % name.lower()
            for name in dir(QtWebKit)
            if name.startswith('Q')
        ]

        # a single QApplication serves every url that gets fetched
        app = QtGui.QApplication(sys.argv)
        webpage = WebPage()
        webpage.start(urls)
        sys.exit(app.exec_())
    

    Here is a PyQt5/QWebEngine version of the above script:

    import sys
    from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
    
    class WebPage(QtWebEngineWidgets.QWebEnginePage):
        """Web engine page that loads a series of urls one after another.

        When a load finishes, toHtml() is called with processCurrentPage as
        its callback; that callback reports the page and then either starts
        the next load or quits the application when no urls are left.
        """

        def __init__(self):
            super(WebPage, self).__init__()
            finished = self.loadFinished
            finished.connect(self.handleLoadFinished)

        def start(self, urls):
            """Store an iterator over *urls* and start loading the first one."""
            self._urls = iter(urls)
            self.fetchNext()

        def fetchNext(self):
            """Load the next pending url.

            Returns True when a load was started, False when the url iterator
            is exhausted.
            """
            try:
                target = next(self._urls)
            except StopIteration:
                return False
            self.load(QtCore.QUrl(target))
            return True

        def processCurrentPage(self, html):
            """Callback given to toHtml(): report the page, then continue or quit."""
            url = self.url().toString()
            # do stuff with html...
            print('loaded: [%d chars] %s' % (len(html), url))
            if self.fetchNext():
                return
            # nothing left to fetch, so end the event loop
            QtWidgets.qApp.quit()

        def handleLoadFinished(self):
            """Slot for loadFinished: request the html of the loaded page."""
            self.toHtml(self.processCurrentPage)
    
    if __name__ == '__main__':

        # generate some test urls: one documentation page for every
        # QtWebEngineWidgets attribute whose name starts with 'Q'
        url = 'http://pyqt.sourceforge.net/Docs/PyQt5/%s.html'
        urls = [
            url % name.lower()
            for name in dir(QtWebEngineWidgets)
            if name.startswith('Q')
        ]

        # a single QApplication serves every url that gets fetched
        app = QtWidgets.QApplication(sys.argv)
        webpage = WebPage()
        webpage.start(urls)
        sys.exit(app.exec_())
    
    0 讨论(0)
提交回复
热议问题