I'm using Qt's QWebPage to render a page that uses JavaScript to update its content dynamically - so a library that just downloads a static version of the page (such as ur
The problem with your program is that you are attempting to create a new QApplication for every URL you fetch.
Instead, you should create one QApplication, and handle all the loading and processing of web pages within the WebPage class itself. The key concept is to use the loadFinished signal to create a loop by fetching a new URL after the current one has been loaded and processed.
The two demo scripts below (for PyQt4 and PyQt5) are simplified examples that show how to structure the program. Hopefully, it should be fairly obvious how to adapt them for your own use:
import sys
from PyQt4 import QtCore, QtGui, QtWebKit
class WebPage(QtWebKit.QWebPage):
    """Web page that loads a sequence of urls one after another.

    The main frame's ``loadFinished`` signal drives the loop: each time a
    load completes, the current page is processed and the next url is
    requested. When the urls run out, the application is asked to quit.
    """

    def __init__(self):
        super(WebPage, self).__init__()
        # Every completed load triggers the next step of the fetch loop.
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Begin loading ``urls`` (any iterable of url strings) in order."""
        self._urls = iter(urls)
        if not self.fetchNext():
            # Nothing to load, so loadFinished would never fire and nothing
            # would ever ask the application to quit. The quit is deferred
            # through a zero-delay timer because start() is normally called
            # before exec_(), and a direct quit() at that point is a no-op.
            QtCore.QTimer.singleShot(0, QtGui.qApp.quit)

    def fetchNext(self):
        """Request the next pending url.

        Returns True if a load was started, or False if no urls remain.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QtCore.QUrl(url))
            return True

    def processCurrentPage(self):
        """Handle the page currently held by the main frame."""
        url = self.mainFrame().url().toString()
        html = self.mainFrame().toHtml()
        # do stuff with html...
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self):
        """Slot for ``loadFinished``: process the page, then move on.

        Any arguments carried by the signal are not inspected, so every
        finished load is processed in the same way.
        """
        self.processCurrentPage()
        if not self.fetchNext():
            QtGui.qApp.quit()
if __name__ == '__main__':
    # Build the test urls: one documentation page for each name in
    # QtWebKit that starts with 'Q'.
    template = 'http://pyqt.sourceforge.net/Docs/PyQt4/%s.html'
    urls = [
        template % name.lower()
        for name in dir(QtWebKit)
        if name.startswith('Q')
    ]

    # A single application instance serves every page load.
    app = QtGui.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    sys.exit(app.exec_())
Here is a PyQt5/QWebEngine version of the above script:
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Web engine page that loads a sequence of urls one after another.

    The page's ``loadFinished`` signal drives the loop: each time a load
    completes, the page's html is requested and handed to
    ``processCurrentPage``, which then requests the next url. When the
    urls run out, the application is asked to quit.
    """

    def __init__(self):
        super(WebPage, self).__init__()
        # Every completed load triggers the next step of the fetch loop.
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Begin loading ``urls`` (any iterable of url strings) in order."""
        self._urls = iter(urls)
        if not self.fetchNext():
            # Nothing to load, so loadFinished would never fire and nothing
            # would ever ask the application to quit. The quit is deferred
            # through a zero-delay timer because start() is normally called
            # before exec_(), and a direct quit() at that point is a no-op.
            QtCore.QTimer.singleShot(0, QtWidgets.qApp.quit)

    def fetchNext(self):
        """Request the next pending url.

        Returns True if a load was started, or False if no urls remain.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
            return True

    def processCurrentPage(self, html):
        """Callback given to ``toHtml``: handle ``html``, then fetch the next url.

        The next fetch is started from here rather than from
        ``handleLoadFinished`` so that it only begins once the html of the
        current page has been delivered.
        """
        url = self.url().toString()
        # do stuff with html...
        print('loaded: [%d chars] %s' % (len(html), url))
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Slot for ``loadFinished``: request the html of the loaded page.

        Any arguments carried by the signal are not inspected, so every
        finished load is processed in the same way.
        """
        self.toHtml(self.processCurrentPage)
if __name__ == '__main__':
    # Build the test urls: one documentation page for each name in
    # QtWebEngineWidgets that starts with 'Q'.
    template = 'http://pyqt.sourceforge.net/Docs/PyQt5/%s.html'
    urls = [
        template % name.lower()
        for name in dir(QtWebEngineWidgets)
        if name.startswith('Q')
    ]

    # A single application instance serves every page load.
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    sys.exit(app.exec_())