CasperJS loop or iterate through multiple web pages?

∥☆過路亽.° 提交于 2019-11-27 18:51:46
Artjom B.

Since there exists a next page button, you can use it to traverse all pages recursively:

/**
 * Scrapes the ratings and dates of the currently loaded reviews page,
 * appends them to the output file as "rating: date" lines, and, while a
 * "next page" link is visible, clicks it and schedules itself again.
 *
 * Intended to be used as a CasperJS step: casper.then(getRatingsAndWrite).
 * Relies on `casper`, `fs`, `getRatings` and `getDate` being defined by the
 * surrounding script.
 */
function getRatingsAndWrite(){
    var outputPath = "C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt";
    var nextLink = ".BVRRPageLink.BVRRNextPage > a";

    // Keep the results local (no implicit globals) and fall back to an empty
    // list so a missing evaluate() result cannot crash the step.
    var ratings = casper.evaluate(getRatings) || [];
    var dates = casper.evaluate(getDate) || [];

    casper.echo(ratings);
    casper.echo(ratings.length + ' ratings found:');

    var lines = [];
    for (var i = 0; i < ratings.length; i++) {
        // A rating without a matching date gets an empty date instead of "undefined".
        lines.push(ratings[i] + ': ' + (i < dates.length ? dates[i] : ''));
    }
    casper.echo(lines);

    if (lines.length > 0) {
        // The file is opened in append mode once per page: end every page with
        // a newline so its last line does not run into the next page's first.
        fs.write(outputPath, lines.join("\n") + "\n", 'a');
    }

    casper.echo(dates.length + ' dates found:');

    if (casper.visible(nextLink)) {
        casper.thenClick(nextLink);
        // Give the next page time to load; otherwise the same page is scraped
        // again while the new content is still loading.
        casper.wait(3000);
        casper.then(getRatingsAndWrite);
    } else {
        casper.echo("END");
    }
}

// Open the first reviews page, queue the scraper as the first step and run
// the queued steps. The scraper re-queues itself for every following page.
casper
    .start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm')
    .then(getRatingsAndWrite)
    .run();

A related answer is A: CasperJS parse next page after button click.

Fanch

This code may help: you define the wanted URLs and the selectors for each page in an array of objects, then loop over that array and do whatever you need with those properties.

Inside the loop you can also click a link instead of opening a URL.

// One entry per page to visit: the URL to open plus the CSS selectors of the
// rating image and of the review date on that page.
var navigation = [
    {
        url: 'http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',
        selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
        selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
    },
    {
        url: 'yourSecondUrl, etc...',
        selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
        selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
    }
];

// Text collected from all visited pages.
var content = "";

    // For every entry of `navigation`, queue "open the page, wait for it, scrape it";
    // the scraped text is appended to `content`.
    casper.start()
    .then(function(){
        // loop on the array: `page` is one entry of `navigation`
        navigation.forEach(function(page){
            // open url : property url
            casper.thenOpen(page.url)
            // wait for the page to load -> probably redundant, since thenOpen() already waits
            .waitForUrl(page.url, function(){
                // title attribute of the element matching the ratings selector
                var ratings = this.getElementAttribute(page.selectorRatings, 'title');
                // HTML of the element matching the date selector
                // (the property is named `selectorDate` in `navigation`)
                var dates = this.getHTML(page.selectorDate);
                this.echo(ratings);
                this.echo(dates);
                content = content + ' ' + ratings + ' ' + dates;
            });
        });
    })
    .run(function() {
            this.echo('----------- All steps done ------------\n');
            this.exit();
    });

Thanks Fanch and Artjom B. Both of your answers led me to the working solution. I used the recursive walk through the 'next' pages of the pagination, as given by Artjom B. Then I added a wait() call to make sure the next ratings page had loaded before scraping it. Without this wait(), the same page gets scraped multiple times between the moment 'next' is clicked and the moment the next page has finished loading. See the working code below:

// CasperJS instance used by the whole script: verbose debug logging, with
// image and plugin loading turned off in the page settings.
var casper = require('casper').create({
    verbose: true,
    logLevel: "debug",
    pageSettings: {
        loadPlugins: false,
        loadImages: false
    }
});

// File-level holders for the values scraped from the current page.
var dates = [];
var ratings = [];

var fs = require('fs');

/**
 * Collects the `title` attribute of every rating image on the page.
 * It reads `document` and is handed to casper.evaluate(), so it must stay
 * self-contained.
 *
 * @returns {Array} one entry per matched image, in document order
 */
function getRatings() {
    var images = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
    var titles = [];
    for (var i = 0; i < images.length; i++) {
        titles.push(images[i].getAttribute('title'));
    }
    return titles;
}

/**
 * Collects the inner HTML of every review-date element on the page.
 * It reads `document` and is handed to casper.evaluate(), so it must stay
 * self-contained.
 *
 * @returns {Array} one entry per matched element, in document order
 */
function getDate() {
    var dateNodes = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
    var found = [];
    Array.prototype.forEach.call(dateNodes, function(node) {
        found.push(node.innerHTML);
    });
    return found;
}

/**
 * Scrapes the ratings and dates of the currently loaded reviews page,
 * appends them to the output file as "<first character of rating>: <date>"
 * lines, and, while a "next page" link is visible, clicks it, waits and
 * schedules itself again.
 *
 * Intended to be used as a CasperJS step: casper.then(getRatingsAndWrite).
 * Stores the current page's scraped values in the file-level `ratings` and
 * `dates` variables.
 */
function getRatingsAndWrite(){
    var nextLink = ".BVRRPageLink.BVRRNextPage > a";

    // Fall back to an empty list so a missing evaluate() result cannot crash the step.
    ratings = casper.evaluate(getRatings) || [];
    dates = casper.evaluate(getDate) || [];

    casper.echo(ratings.length + ' ratings found:');

    var lines = [];
    for (var i = 0; i < ratings.length; i++) {
        // Only the first character of the title is kept as the rating.
        // An image without a title yields an empty rating instead of throwing.
        var rating = String(ratings[i] || '').charAt(0);
        // A rating without a matching date gets an empty date instead of "undefined".
        var date = i < dates.length ? dates[i] : '';
        lines.push(rating + ': ' + date);
    }

    if (lines.length > 0) {
        // The file is opened in append mode once per page: end every page with
        // a newline so its last line does not run into the next page's first.
        fs.write("<filepath to write content>", lines.join("\n") + "\n", 'a');
    }

    casper.echo(dates.length + ' dates found:');

    if (casper.visible(nextLink)) {
        casper.thenClick(nextLink);
        // Give the next page time to load before scraping it; without the
        // wait the same page is scraped again.
        casper.wait(3000);
        casper.then(getRatingsAndWrite);
    } else {
        casper.echo("END");
    }
}

casper.start('http://www.t-mobile.com/cell-phones/htc-one-m8.html');

casper.then(getRatingsAndWrite);

casper.run();
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!