Dynamic scraping using nodejs and phantomjs

你说的曾经没有我的故事 提交于 2019-12-10 23:36:59

问题


First of all, I've successfully installed both PhantomJS and its npm interface, phantom. I've set up the code to load my page with the new syntax (all the other questions posted here were based on the old code syntax, or I'm missing something). This is the source I'm trying to scrape.

Now, the right sidebar, the one with the fake select near "Comune", and the other one are generated dynamically, and I can't understand why PhantomJS isn't picking them up. Here is my code:

// Loads a page in PhantomJS through the `phantom` npm wrapper and prints the
// load status followed by the page's `content` property.
const phantom = require('phantom');

// URL of the page to load.
const sito = "http://bicincitta.tobike.it/";

// Kept outside the promise chain so that later `.then` steps and the error
// handler can reach the page and the PhantomJS instance.
let sitepage = null;
let phInstance = null;

phantom.create()
    .then((instance) => {
        phInstance = instance;
        return instance.createPage();
    })
    .then((page) => {
        sitepage = page;
        return page.open(sito);
    })
    .then((status) => {
        console.log(status);
        return sitepage.property('content');
    })
    .then((content) => {
        console.log(content);
        sitepage.close();
        phInstance.exit();
    })
    .catch((error) => {
        console.log(error);
        // Either handle may still be null if the failure happened before it
        // was assigned (e.g. `phantom.create()` itself rejected); calling a
        // method on null here would throw and mask the original error.
        if (sitepage !== null) {
            sitepage.close();
        }
        if (phInstance !== null) {
            phInstance.exit();
        }
    });

I'm hitting my head hard on a wall right now. Am I supposed to get in some way the site's scripts and execute them? Am I missing an instruction?

Also, as a side note: it's not really clear how I should chain additional methods onto page, if page is scoped inside the second ".then".


回答1:


I've spent the past week working with PhantomJS, trying to get it to snapshot a page with data that is rendered with Angular. The easiest thing I found to do was to use page.injectJs('../script.js') for any local scripts, and page.includeJs('http://jquery.com...') for any external scripts. Since Phantom is sandboxed, it won't execute the JavaScript on the page it's capturing unless you give it the JS to execute. This will allow you to screenshot a page that has data rendered with JavaScript.




回答2:


There is a CDATA script at the bottom of the HTML that cannot be parsed by Phantom. This is where the items are being propagated from.

<script type="text/javascript">
//<![CDATA[
// Start-up script quoted from the bottom of the scraped page's HTML.
// After Sys.Application.initialize(), each add_init callback calls $create for
// one Telerik.Web.UI component, passing a settings object and the element
// looked up with $get(<id>).
// NOTE(review): Sys.Application / $create / $get look like the ASP.NET AJAX
// client library; they are not defined in this snippet — confirm.
Sys.Application.initialize();
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxManager, {"_updatePanels":"","ajaxSettings":[],"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"defaultLoadingPanelID":"","enableAJAX":true,"enableHistory":false,"links":[],"styles":[],"uniqueID":"RadAjaxManager1","updatePanelsRenderMode":0}, null, null, $get("RadAjaxManager1"));
});
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajCheckLoginUser"}, null, null, $get("ajCheckLoginUser"));
});
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajCheckLoginAdmin"}, null, null, $get("ajCheckLoginAdmin"));
});
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajLogoutUser"}, null, null, $get("ajLogoutUser"));
});
// The RadWindow below is the only component given an event map; its "close"
// entry references OnClientClosePortal, which is not defined in this snippet.
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadWindow, {"_dockMode":false,"behaviors":0,"clientStateFieldID":"radPortal_ClientState","destroyOnClose":true,"formID":"form1","height":"180px","iconUrl":"","left":"","minimizeIconUrl":"","modal":true,"name":"radPortal","reloadOnShow":true,"showContentDuringLoad":false,"skin":"Office2007","top":"","visibleStatusbar":false,"width":"450px"}, {"close":OnClientClosePortal}, null, $get("radPortal"));
});
// The RadWindowManager's settings list 'radPortal' under "windowControls" and
// pass {"child":"radPortal"} as a further argument.
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadWindowManager, {"behaviors":4,"clientStateFieldID":"windowManagerPortal_ClientState","destroyOnClose":true,"formID":"form1","iconUrl":"","left":"","minimizeIconUrl":"","modal":true,"name":"windowManagerPortal","reloadOnShow":true,"showContentDuringLoad":false,"skin":"Office2007","top":"","visibleStatusbar":false,"windowControls":"['radPortal']"}, null, {"child":"radPortal"}, $get("windowManagerPortal"));
    });
//]]>
</script>

These items will also be destroyed as soon as you leave communication with this site's server. There are methods to get around this, but I think you are better off trying something else. I used npm cheerio to load the CDATA HTML.



来源:https://stackoverflow.com/questions/36654763/dynamic-scraping-using-nodejs-and-phantomjs

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!