I\'m using PhantomJS v1.4.1 to load some web pages. I don\'t have access to their server-side, I just getting links pointing to them. I\'m using obsolete version of Phantom
I would rather periodically check for document.readyState
status (https://developer.mozilla.org/en-US/docs/Web/API/document.readyState). Although this approach is a bit clunky, you can be sure that inside onPageReady
function you are using fully loaded document.
var page = require("webpage").create(),
url = "http://example.com/index.html";
function onPageReady() {
var htmlContent = page.evaluate(function () {
return document.documentElement.outerHTML;
});
console.log(htmlContent);
phantom.exit();
}
page.open(url, function (status) {
function checkReadyState() {
setTimeout(function () {
var readyState = page.evaluate(function () {
return document.readyState;
});
if ("complete" === readyState) {
onPageReady();
} else {
checkReadyState();
}
});
}
checkReadyState();
});
Additional explanation:
Using nested setTimeout
instead of setInterval
prevents checkReadyState
from "overlapping" and race conditions when its execution is prolonged for some random reasons. setTimeout
has a default delay of 4ms (https://stackoverflow.com/a/3580085/1011156) so active polling will not drastically affect program performance.
document.readyState === "complete"
means that document is completely loaded with all resources (https://html.spec.whatwg.org/multipage/dom.html#current-document-readiness).