PhantomJS has two really handy callbacks, onLoadStarted and onLoadFinished, which allow you to essentially pause execution while the page is loading.
I think the onLoadStarted and onLoadFinished functions are everything you need. Take, for example, the following script:
// Demonstration script for PhantomJS: it logs every page event it is told
// about while http://example.com loads and while the first link on that page
// is clicked.
var page = require('webpage').create();

// Delay, in milliseconds, between dispatching the click and quitting.
var EXIT_DELAY_MS = 10000;

// Runs inside the page (via page.evaluate): builds a synthetic mouse click
// and dispatches it on the first <a> element of the document.
function clickFirstLink() {
    var clickEvent = document.createEvent('MouseEvents');
    clickEvent.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
    document.querySelector("a").dispatchEvent(clickEvent);
}

// Ends the PhantomJS process.
function quit() {
    phantom.exit();
}

// Only responses whose stage is "end" are logged; every other stage is skipped.
page.onResourceReceived = function (response) {
    if (response.stage === "end") {
        console.log('Response (#' + response.id + ', stage "' + response.stage + '"): ' + response.url);
    }
};

// Logs the id and URL of every resource request.
page.onResourceRequested = function (requestData, networkRequest) {
    console.log('Request (#' + requestData.id + '): ' + requestData.url);
};

// Logs the URL the page has changed to.
page.onUrlChanged = function (targetUrl) {
    console.log('New URL: ' + targetUrl);
};

// Logs the status passed in when a load finishes.
page.onLoadFinished = function (status) {
    console.log('Load Finished: ' + status);
};

// Logs that a load has begun.
page.onLoadStarted = function () {
    console.log('Load Started');
};

// Logs every URL a navigation is requested for.
page.onNavigationRequested = function (url, type, willNavigate, main) {
    console.log('Trying to navigate to: ' + url);
};

// Once the start page has been opened, click its first link and keep the
// process alive for EXIT_DELAY_MS so the events that follow can be logged.
page.open("http://example.com", function (status) {
    page.evaluate(clickFirstLink);
    setTimeout(quit, EXIT_DELAY_MS);
});
It prints:
Trying to navigate to: http://example.com/
Request (#1): http://example.com/
Load Started
New URL: http://example.com/
Response (#1, stage "end"): http://example.com/
Load Finished: success
Trying to navigate to: http://www.iana.org/domains/example
Request (#2): http://www.iana.org/domains/example
Load Started
Trying to navigate to: http://www.iana.org/domains/reserved
Request (#3): http://www.iana.org/domains/reserved
Response (#2, stage "end"): http://www.iana.org/domains/example
New URL: http://www.iana.org/domains/reserved
Request (#4): http://www.iana.org/_css/2013.1/screen.css
Request (#5): http://www.iana.org/_js/2013.1/jquery.js
Request (#6): http://www.iana.org/_js/2013.1/iana.js
Response (#3, stage "end"): http://www.iana.org/domains/reserved
Response (#6, stage "end"): http://www.iana.org/_js/2013.1/iana.js
Response (#4, stage "end"): http://www.iana.org/_css/2013.1/screen.css
Response (#5, stage "end"): http://www.iana.org/_js/2013.1/jquery.js
Request (#7): http://www.iana.org/_img/2013.1/iana-logo-header.svg
Request (#8): http://www.iana.org/_img/2013.1/icann-logo.svg
Response (#8, stage "end"): http://www.iana.org/_img/2013.1/icann-logo.svg
Response (#7, stage "end"): http://www.iana.org/_img/2013.1/iana-logo-header.svg
Request (#9): http://www.iana.org/_css/2013.1/print.css
Response (#9, stage "end"): http://www.iana.org/_css/2013.1/print.css
Load Finished: success
It shows that clicking a link emits the LoadStarted event once and the NavigationRequested event twice, because there is a redirect. The trick is to add the event handlers before performing the action:
// Opens http://example.com, clicks the first link on it and, once the page
// that the click leads to has finished loading, renders it to
// "test37_next_page.png" and exits.
var page = require('webpage').create();

page.open("http://example.com", function (status) {
    // If the start page did not load there is nothing to click and no later
    // load event would ever arrive, so quit instead of hanging.
    if (status !== 'success') {
        console.log('Unable to load the start page: ' + status);
        phantom.exit(1);
        return;
    }

    // The handlers are attached BEFORE the click is dispatched so that the
    // load triggered by the click cannot be missed.
    page.onLoadFinished = function (nextStatus) {
        console.log('Load Finished: ' + nextStatus);
        page.render("test37_next_page.png");
        phantom.exit();
    };
    page.onLoadStarted = function () {
        console.log('Load Started');
    };

    // Runs inside the page. Returns true when a link was found and clicked,
    // false when the document contains no <a> element.
    var clicked = page.evaluate(function () {
        var link = document.querySelector("a");
        if (!link) {
            return false;
        }
        var e = document.createEvent('MouseEvents');
        e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
        link.dispatchEvent(e);
        return true;
    });

    // Without a click no navigation starts, so onLoadFinished would never
    // fire and the script would never exit.
    if (!clicked) {
        console.log('No link found to click');
        phantom.exit(1);
    }
});
If you need to do those things, maybe it is time to try something else like CasperJS. It runs on top of PhantomJS, but has a much better API for navigating web pages.
Here is my code based on some other answers. In my case, I didn't need to specifically evaluate any other javascript. I just needed to wait for the page to finish loading.
// Prints the HTML content of the page at the URL given as the first
// command-line argument, once that page has finished loading.
//
// Exit code: 0 when the content was printed, 1 when no URL was supplied or
// the page could not be loaded.
var system = require('system');

if (system.args.length === 1) {
    console.log('Try to pass some arguments when invoking this script!');
    // PhantomJS keeps running until phantom.exit() is called, so exit
    // explicitly here; otherwise the command never returns.
    phantom.exit(1);
}
else {
    var page = require('webpage').create();
    var address = system.args[1];
    // The callback passed to page.open runs when the load has finished, so
    // the content can be read right here. Registering page.onLoadFinished
    // inside this callback would only catch a later, second load and would
    // leave the script hanging on pages that load just once.
    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Unable to load ' + address + ' (status: ' + status + ')');
            phantom.exit(1);
            return;
        }
        console.log(page.content);
        phantom.exit();
    });
}
Save the above in a file called "scrape.js" and call it this way:
# Run scrape.js against the target URL. The two SSL options are passed to
# avoid certificate loading problems seen with certain HTTPS sites.
phantomjs --ssl-protocol=any --ignore-ssl-errors=true scrape.js https://www.example.com
The SSL-related params are added to avoid other issues that I was having with certain HTTPS sites (related to certificate loading issues).
Hope this helps someone!
Use the high-level wrapper, nightmarejs.
You can easily click there and wait afterwards.
Here is the code (Examples section):
// Example of driving a browser session with Nightmare: open a page, type a
// query into its search box, click the submit button, then report the result.
var Nightmare = require('nightmare');

// Completion callback handed to run(): logs the error when there is one,
// otherwise logs 'Done!'.
function onRunComplete(err, nightmare) {
    if (err) {
        console.log(err);
        return;
    }
    console.log('Done!');
}

var session = new Nightmare();

session
    .goto('http://yahoo.com')
    .type('input[title="Search"]', 'github nightmare')
    .click('.searchsubmit')
    .run(onRunComplete);
More examples and API usage can be found on GitHub.