How to save the current webpage with casperjs/phantomjs?

前端 未结 2 584
野性不改
野性不改 2020-12-09 22:52

Is there a way to save the current webpage by using casperjs or phantomjs? I tried to get the html and save it into a file. But the resulting file was a lot different from t

2条回答
  •  失恋的感觉
    2020-12-09 23:52

    Andrey Borisko suggested to use the disk cache to retrieve the resources. My solution is not that efficient, but you don't need to decompress text files.

    I use XMLHttpRequest to retrieve all resources after I registered them with the resource.received event handler. I then filter the resources into images, css and fonts. The current limitation is that remote resource paths that contain something like ../ or ./ are not handled correctly.

    I retrieve the current page content with getHTML and iterate over all captured resources to replace the path used in the markup, that is identified by a portion of the complete resource URL, with a randomly generated file name. The file extension is created from the content type of the resource. It is converted using mimeType from this gist.

    Since CSS files may contain background images or fonts, they have to be processed before saving to disk. The provided loadResource function loads the resource, but does not save it.

    Since XMLHttpRequest is used to download the resources, the script has to be invoked with the --web-security=false flag:

    casperjs script.js --web-security=false
    

    script.js

    var casper = require("casper").create();
    var utils = require('utils');
    var fs = require('fs');
    var mimetype = require('./mimetype'); // URL provided below
    // Captured resources, grouped by kind. The "resource.received" handler
    // appends entries of the form {obj: <resource>, file: false} to these lists.
    var cssResources = [];
    var imgResources = [];
    var fontResources = [];
    // Directory prefix used when building the local file names of saved resources.
    var resourceDirectory = "resources";
    // When true, additional progress messages are echoed.
    var debug = false;
    
    // Remove the resource directory tree before starting
    // (presumably to discard output of an earlier run — TODO confirm).
    fs.removeTree(resourceDirectory);
    
    // Echo messages reported through the "remote.message" event.
    casper.on("remote.message", function(text){
        var line = "remote.msg: " + text;
        this.echo(line);
    });
    
    // Echo resource errors, serialized as JSON.
    casper.on("resource.error", function(details){
        var serialized = JSON.stringify(details);
        this.echo("res.err: " + serialized);
    });
    
    // Echo page errors, serialized as JSON.
    casper.on("page.error", function(details){
        var serialized = JSON.stringify(details);
        this.echo("page.err: " + serialized);
    });
    
    // Echo the target path of each downloaded file, but only in debug mode.
    casper.on("downloaded.file", function(savedPath){
        if (!debug) {
            return;
        }
        this.echo("dl.file: " + savedPath);
    });
    
    // Record every completely received remote resource that has to be stored
    // locally, sorted into stylesheets, images and fonts by its content type.
    // Each entry starts with file: false, meaning "not saved to disk yet".
    casper.on("resource.received", function(resource){
        // don't try to download data:* URI and only use stage == "end"
        if (resource.url.indexOf("data:") == 0 || resource.stage != "end") {
            return;
        }

        // contentType can be null/undefined (e.g. a response without a
        // Content-Type header); calling indexOf on it would throw, and such a
        // resource cannot be classified anyway.
        var contentType = resource.contentType;
        if (typeof contentType != "string") {
            return;
        }

        if (contentType == "text/css") {
            cssResources.push({obj: resource, file: false});
        }
        if (contentType.indexOf("image/") == 0) {
            imgResources.push({obj: resource, file: false});
        }
        if (contentType.indexOf("application/x-font-") == 0) {
            fontResources.push({obj: resource, file: false});
        }
    });
    
    // based on http://docs.casperjs.org/en/latest/modules/casper.html#download
    // Fetches a resource and returns its decoded content instead of writing it
    // to disk: the resource is retrieved base64-encoded and then decoded with a
    // clientutils instance created from this casper's options.
    casper.loadResource = function loadResource(url, method, data) {
        "use strict";
        this.checkStarted();
        var clientUtilsModule = require('clientutils');
        var clientUtils = clientUtilsModule.create(utils.mergeObjects({}, this.options));
        var encodedContent = this.base64encode(url, method, data);
        return clientUtils.decode(encodedContent);
    };
    
    
    /**
     * Escapes regular-expression metacharacters so that the text can be placed
     * in a RegExp pattern and matched literally.
     * Adapted from https://stackoverflow.com/a/1144788/1816580
     *
     * @param {string} string - text to escape
     * @returns {string} the text with every special character backslash-prefixed
     */
    function escapeRegExp(string) {
        var specialChars = /[.*+?^=!:${}()|\[\]\/\\]/g;
        return string.replace(specialChars, function (ch) {
            return "\\" + ch;
        });
    }
    
    /**
     * Replaces every occurrence of `find` in `str` with `replace`.
     * Adapted from https://stackoverflow.com/a/1144788/1816580
     *
     * A plain String.prototype.replace call only replaces the first match when
     * given a string or a non-global RegExp, so both are turned into a global
     * pattern first. A RegExp that is already global is used unchanged.
     *
     * @param {string|RegExp} find - literal text (matched verbatim) or a pattern
     * @param {string|Function} replace - replacement handed to
     *        String.prototype.replace; patterns such as $& or $1 keep their meaning
     * @param {string} str - text to search in
     * @returns {string} the text with all matches replaced
     */
    function replaceAll(find, replace, str) {
        var pattern = find;
        if (typeof find === "string") {
            // escape metacharacters so the string is matched literally
            pattern = new RegExp(find.replace(/[.*+?^=!:${}()|\[\]\/\\]/g, "\\$&"), "g");
        } else if (find instanceof RegExp && !find.global) {
            // keep the existing flags and add the global one;
            // engines without RegExp#flags only know i and m besides g
            var flags = typeof find.flags === "string"
                ? find.flags
                : (find.ignoreCase ? "i" : "") + (find.multiline ? "m" : "");
            pattern = new RegExp(find.source, flags + "g");
        }
        return str.replace(pattern, replace);
    }
    
    // Each wrapper surrounds a path with one pair of delimiters:
    // double quotes, single quotes or parentheses.
    var wrapFunctions = (function () {
        // Concatenates the opening delimiter, the text and the closing delimiter.
        function enclose(opening, text, closing) {
            return opening + text + closing;
        }

        return [
            function wrapQuot1(s){
                return enclose('"', s, '"');
            },
            function wrapQuot2(s){
                return enclose("'", s, "'");
            },
            function csswrap(s){
                return enclose('(', s, ')');
            }
        ];
    }());
    
    /**
     * Points references to captured resources inside `doc` at local files.
     *
     * Every entry without a local file gets a random file name inside
     * resourceDirectory, with the extension looked up in mimetype.ext by the
     * resource's content type. The resource is then either downloaded directly
     * or, when resourcesReplacer is a function, handed to that callback, which
     * is expected to load, process and write it. Afterwards the longest tail of
     * the resource URL that occurs in `doc` in wrapped form (see wrapFunctions)
     * is replaced everywhere by the equally wrapped local file name.
     *
     * @param {string} doc - text whose resource references are rewritten
     * @param {Array} resources - entries {obj: <resource>, file: <path or false>};
     *        `file` is set in place for entries that had none
     * @param {Function} [resourcesReplacer] - called with the entry instead of
     *        performing a plain download
     * @returns {string} the rewritten text
     */
    function findAndReplace(doc, resources, resourcesReplacer) {
        var useDirectDownload = typeof resourcesReplacer != "function";
        var rewritten = doc;

        // Assigns a random local file name and downloads or post-processes the resource.
        function materialize(entry) {
            var remoteUrl = entry.obj.url;
            var randomName = Math.random().toString(36).slice(2);
            entry.file = resourceDirectory+"/"+randomName+"."+mimetype.ext[entry.obj.contentType];
            if (useDirectDownload) {
                if (debug) casper.echo("download resource (" + entry.obj.contentType + "): " + remoteUrl + " to " + entry.file);
                casper.download(remoteUrl, entry.file, "GET");
                return;
            }
            resourcesReplacer(entry);
        }

        // Returns the longest tail of remoteUrl whose wrapped form occurs in text.
        // The search starts with the complete URL and drops one leading character
        // per step; the shortest tail that is tried is 6 characters long.
        function longestWrappedTail(text, remoteUrl, wrap) {
            for (var start = 0; start < remoteUrl.length-5; start++) {
                var tail = remoteUrl.substring(start);
                if (text.indexOf(wrap(tail)) != -1) {
                    return tail;
                }
            }
            return undefined;
        }

        resources.forEach(function (entry) {
            // don't download again
            if (!entry.file) {
                materialize(entry);
            }

            wrapFunctions.forEach(function (wrap) {
                var tail = longestWrappedTail(rewritten, entry.obj.url, wrap);
                if (!tail) {
                    return;
                }
                if (debug) casper.echo("replace " + tail + " with " + entry.file);
                var everyOccurrence = new RegExp(escapeRegExp(wrap(tail)), "g");
                rewritten = replaceAll(everyOccurrence, wrap(entry.file), rewritten);
            });
        });

        return rewritten;
    }
    
    function capturePage(){
    
        // remove all 
    
                                     
                  
提交回复
热议问题