问题
Can PhantomJS be used as an alternative to BeautifulSoup?
I am trying to search on Etsy and visit all the links in turn. In Python, I know how to do this (with BeautifulSoup), but today I want to see if I can do the same with PhantomJS. I'm not getting very far.
This script should search for "hello kitty" on Etsy and return all of the products
<a class="listing-thumb" href=...></a>
and print them in the console. Ideally I'd visit them later on and get the information I need. Right now it just freezes. Any ideas?
// Load the Etsy search results for "hello kitty" and try to print every product link.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
// NOTE(review): `status` is never inspected, so a failed load is not reported.
page.open(url, function(status){
// list all the a.href links in the hello kitty etsy page
// NOTE(review): this returns the raw NodeList from the page context. evaluate()
// reportedly cannot serialize DOM nodes back to the PhantomJS side, so `link`
// will not hold usable elements here -- likely why the script stalls.
var link = page.evaluate(function() {
return document.querySelectorAll('a.listing-thumb');
});
// Print the href of each returned entry, then terminate PhantomJS.
for(var i = 0; i < link.length; i++){ console.log(link[i].href); }
phantom.exit();
});
I have toyed with using CasperJS, which may be better designed for this.
回答1:
PhantomJS evaluate()
cannot serialize and return complex objects like HTMLElements or NodeLists, so you have to map them to serializable values first:
// Print the href attribute of every product thumbnail link on the Etsy
// "hello kitty" search results page, one per line.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
page.open(url, function(status) {
  // A failed load has nothing useful to query; report it and exit non-zero
  // instead of silently printing an empty result.
  if (status !== 'success') {
    console.log('Failed to load ' + url);
    phantom.exit(1);
    // phantom.exit() does not stop the current callback, so return explicitly.
    return;
  }
  // list all the a.href links in the hello kitty etsy page
  // Only serializable values cross the evaluate() boundary, so map each
  // element to its href string inside the page context.
  var links = page.evaluate(function() {
    return [].map.call(document.querySelectorAll('a.listing-thumb'), function(link) {
      return link.getAttribute('href');
    });
  });
  // evaluate() may hand back null (e.g. if the in-page function throws);
  // fall back to an empty list so phantom.exit() is always reached.
  console.log((links || []).join('\n'));
  phantom.exit();
});
Note: here we use [].map.call() in order to treat a NodeList as a standard Array.
回答2:
The only problem with your code is that you do not understand phantomjs scopes. You have phantom and page scopes. You tried to return JavaScript DOM object references (those can't be serialized) from page scope (page.evaluate runs in page scope) to phantom main scope. I think that is not possible. Here follows code that works:
// Load the Etsy "hello kitty" search page, log debug information about the
// main response, and print the absolute URL of every product thumbnail link.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

// for debug (to see if page returns status code 200)
// Only the response for the requested URL itself is logged; sub-resources
// (images, scripts, ...) are ignored.
page.onResourceReceived = function(response) {
  if (response.url !== url) {
    return;
  }
  console.log('Resorce: "' + response.url + '" status: ' + response.status);
  if (response.status === 200) {
    console.log(response.url);
    for (var i = 0; i < response.headers.length; i++) {
      console.log(response.headers[i].name + ': ' + response.headers[i].value);
    }
  }
};

// Runs once the page load has finished: extracts the links and exits.
page.onLoadFinished = function(status) {
  console.log('Status: ' + status);
  // Do not scrape a page that failed to load; signal the failure to the caller.
  if (status !== 'success') {
    console.log('Failed to load: ' + url);
    phantom.exit(1);
    // phantom.exit() does not stop the current callback, so return explicitly.
    return;
  }
  console.log('Starting evaluate...');
  // DOM nodes cannot be passed out of the page context, so collect plain
  // href strings inside evaluate() and return those instead.
  var links = page.evaluate(function() {
    var nodes = [],
        matches = document.querySelectorAll("a.listing-thumb");
    for (var i = 0; i < matches.length; ++i) {
      nodes.push(matches[i].href);
    }
    return nodes;
  }) || [];
  // The `|| []` above guards the .length read below: the original read
  // links.length before checking that links was non-null.
  console.log('Done evaluate... count: ' + links.length);
  if (links.length > 0) {
    for (var i = 0; i < links.length; ++i) {
      console.log('(' + i + ') ' + links[i]);
    }
  } else {
    console.log("No match found!");
  }
  phantom.exit(0);
};

page.open(url);
回答3:
Here is some code I recently wrote that scrapes URLs using PhantomJS. If you provide only a URL, it will display all URLs on the page; if you supply an argument of class|id
followed by a "class/id name", it will display only the URLs found inside that class/id.
//////////////////////////////////////////////////////////
///// PhantomJS URL Scraper v.1.3 /////
//
// Copyrighted by +A.M.Danischewski 2016+ (c)
// This program may be reutilized without limits, provided this
// notice remain intact.
//
// Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]
//
// Argument 1: URL -- "https://www.youtube.com/watch?v=8TniRMwL2Vg"
// Argument 2: "class" or "id"
// Argument 3: If Argument 2 was provided, "class name" or "id name"
//
// By default this program will display ALL urls from a user supplied URL.
// If a class name or id name is provided then only URL's from the class
// or id are displayed.
//
///////////////////////////////////
var page = require('webpage').create(),
    system = require('system'),
    address,
    querytype,
    queryclass;

var USAGE = ' Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]';

/**
 * Collects the href attribute of anchors on the current page.
 *
 * This function is handed to page.evaluate(), so it executes inside the web
 * page and must be self-contained: it may only use its own parameters and
 * page globals such as `document`.
 *
 * @param {string} queryType - "class" or "id"; any other value selects every
 *     anchor in the document.
 * @param {string} queryName - class name or element id to restrict the search to.
 * @returns {Array} href attribute values (null for anchors without an href).
 */
function collectHrefs(queryType, queryName) {
  var roots;
  if (queryType === 'class') {
    roots = [].slice.call(document.getElementsByClassName(queryName));
  } else if (queryType === 'id') {
    // getElementById yields null when nothing matches; treat that as "no links"
    // rather than throwing inside the page.
    var match = document.getElementById(queryName);
    roots = match ? [match] : [];
  } else {
    roots = [document];
  }
  var hrefs = [];
  roots.forEach(function(root) {
    [].forEach.call(root.querySelectorAll('a'), function(link) {
      hrefs.push(link.getAttribute('href'));
    });
  });
  return hrefs;
}

address = system.args[1];
querytype = system.args[2];
queryclass = system.args[3];

if (system.args.length === 1) {
  console.log(USAGE);
  phantom.exit();
} else if ((querytype === 'class' || querytype === 'id') && !queryclass) {
  // A class/id filter without a name cannot match anything meaningful.
  console.log(USAGE);
  phantom.exit(1);
} else {
  // phantom.exit() does not halt the script immediately, so the page is only
  // opened on this branch, never after the usage message has been printed.

  // Forward console output produced inside the page to the PhantomJS console.
  // Registered before open() so no early message is missed.
  page.onConsoleMessage = function(msg) {
    console.log(msg);
  };

  page.open(address, function(status) {
    if (status !== 'success') {
      console.log('Error loading address: '+address);
      phantom.exit(1);
      return;
    }
    // The query name is passed as an evaluate() argument instead of being
    // spliced into function source text, so quotes in it cannot break or
    // alter the code that runs in the page.
    var links = page.evaluate(collectHrefs, querytype, queryclass) || [];
    // Join the array directly; turning it into a comma-separated string and
    // replacing commas would split any URL that itself contains a comma.
    console.log(links.join('\n'));
    phantom.exit();
  });
}
来源:https://stackoverflow.com/questions/13944518/how-to-scrape-links-with-phantomjs