问题
I'm scraping data from AJAX-based pages using PhantomJS through the npm-phantom
module. Sometimes the data isn't loaded yet when phantom starts DOM traversal. How can I insert something like window.onload = function() { ... } into page.evaluate? It returns a function to me, not the data.
var phantom = require('phantom');
exports.main = function (url, callback) {
phantom.create(function (ph) {
ph.createPage(function (page) {
page.open(pref + url, function (status) {
page.evaluate(function () {
// here
var data = {};
data.one = document.getElementById("first").innerText;
data.two = document.getElementById("last").innerText;
return data;
},
function (res) {
callback(null, res);
ph.exit();
});
});
});
});
}
On the PhantomJS API page I found onLoadFinished, but how does it apply here?
回答1:
page.open(url, function(status){...})
is just another notation for
page.onLoadFinished = function(status){...};
page.open(url);
The PhantomJS documentation for onLoadFinished says as much:
"Also see WebPage#open for an alternate hook for the onLoadFinished callback."
Since this is an AJAX-based page, you need to wait for the data to appear. You can only do that by repeatedly checking a specific portion of the page.
A ready-made waitFor helper for this is available as an example (waitfor.js) in the examples directory of the PhantomJS installation, or in the online examples. This will probably also work for PhantomJS through npm-phantom.
In your case this will look like this (abbreviated):
// Open the page, then poll until the AJAX-loaded elements exist before
// scraping them. `waitFor(check, onReady, timeout)` is the helper from the
// PhantomJS examples; `ph`, `pref`, `url` and `callback` come from the
// enclosing code (this snippet is abbreviated).
page.open(pref + url, function (status) {
  waitFor(function check() {
    // NOTE(review): this relies on page.evaluate returning its result
    // synchronously, as in plain PhantomJS. If the npm bridge only reports
    // results through a callback, waitFor needs a callback-based variant —
    // confirm against the bridge version in use.
    return page.evaluate(function () {
      // ensure #first and #last are in the DOM
      return !!document.getElementById("first") &&
        !!document.getElementById("last");
    });
  }, function onReady() {
    page.evaluate(function () {
      var data = {};
      data.one = document.getElementById("first").innerText;
      data.two = document.getElementById("last").innerText;
      return data;
    }, function (res) {
      // Hand over the scraped data only once evaluate has produced it;
      // previously `res` was never defined and the result was discarded.
      callback(null, res);
      ph.exit();
    });
  }, 5000); // some timeout
});
来源:https://stackoverflow.com/questions/24865467/use-window-onload-in-phantomjs