href field missing when I get the page using jsoup or htmlunit

后端 未结 1 468
慢半拍i
慢半拍i 2020-12-10 22:37

I'm trying to parse Google Images search results.

I'm trying to get the href attribute of an element. I've noticed that the href field …

相关标签:
1条回答
  • 2020-12-10 23:17

    For each search result there is a <div class="rg_meta"> containing a JSON object, which also holds the URL. Using a JSON parser such as json-simple to parse the object, the following code prints the image URLs:

    // Fetches the Google image search page for searchTerm with jsoup and prints the
    // "ou" value found in the JSON payload of every div.rg_meta element.
    String searchTerm = "naruto shippuden";
    
    try {
        // URL-encode the term so characters such as '&' or '#' cannot corrupt the
        // query string; a plain space is still sent as '+', as before.
        String searchUrl = "https://www.google.com/search?site=imghp&tbm=isch&source=hp&biw=1920&bih=955&q="
                + java.net.URLEncoder.encode(searchTerm, "UTF-8") + "&gws_rd=cr";
    
        // The request carries a desktop browser user agent and a Google referrer.
        Document doc = Jsoup.connect(searchUrl)
                .userAgent("Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36")
                .referrer("https://www.google.com/").get();
    
        // A single parser instance is reused for all results.
        JSONParser parser = new JSONParser();
    
        for (Element result : doc.select("div.rg_meta")) {
    
            // div.rg_meta contains a JSON object, which also holds the image url
            JSONObject obj = (JSONObject) parser.parse(result.text());
    
            String imageUrl = (String) obj.get("ou");
    
            // just printing out the url to demonstrate the approach
            System.out.println("imageUrl: " + imageUrl);
        }
    
    } catch (IOException e1) {
        // Also covers UnsupportedEncodingException from URLEncoder.encode.
        e1.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    }
    

    Output:

    imageUrl: http://ib3.huluim.com/show_key_art/1603?size=1600x600&region=US
    imageUrl: http://cdn.zonarutoppuden.com/ns/peliculas-naruto-shippuden.jpg
    imageUrl: http://www.saiyanisland.com/news/wp-content/uploads2/2014/12/Naruto-Sasuke.jpg
    ...
    

    Update

    Since jsAction doesn't seem to play nicely with HtmlUnit, I would suggest using PhantomJS instead. Just download the binary for your OS and create a script file.

    Create a page.js file:

    // PhantomJS script: opens a Google image search, keeps scrolling down until the
    // last result div stops changing, then writes the rendered page to output.html.
    // Kept in ES5 style (var, function expressions) because it runs under PhantomJS.
    var page = require('webpage').create();
    var fs = require('fs');
    
    page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';
    
    // NOTE(review): presumably zoomed out so that more results fit per viewport - confirm.
    page.zoomFactor = 0.1;
    
    page.viewportSize = {
      width: 1920,
      height: 1080
    };
    
    var divCount="-1";       // data-ri of the last result div seen on the previous tick
    var topPosition=0;       // current vertical scroll offset
    var unchangedCounter=0;  // consecutive ticks on which the last result div did not change
    
    page.open('https://www.google.com/search?site=imghp&tbm=isch&source=hp&q=naruto+shippuden&gws_rd=cr', function(status) {
        console.log("Status: " + status);
        if(status !== "success") {
            // Without an explicit exit PhantomJS keeps running after a failed load.
            phantom.exit(1);
            return;
        }
    
        window.setInterval(function() {
    
            var newDivCount = page.evaluate(function() {
                var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
                // No result divs (yet): report null instead of dereferencing divs[-1].
                if (divs.length === 0) {
                    return null;
                }
                return divs[divs.length-1].getAttribute("data-ri");
            });
    
            // Scroll one viewport height further on every tick.
            topPosition = topPosition + 1080;
    
            page.scrollPosition = {
                top: topPosition,
                left: 0
            };
    
            if(newDivCount===divCount){
                // Nothing new was appended; click #smb if the page has it.
                // NOTE(review): #smb is presumably the "show more results" button - confirm.
                page.evaluate(function() {
                    // querySelector returns null (never undefined) when nothing matches.
                    var button = document.querySelector("#smb");
                    if(button !== null) {
                        button.click();
                        return true;
                    }
                    return false;
                });
    
                if(unchangedCounter===5){
                    // Still unchanged after repeated attempts: save the page and stop.
                    console.log(newDivCount);
                    var path = 'output.html';
                    fs.write(path, page.content, 'w');
                    phantom.exit();
                }else{
                    unchangedCounter=unchangedCounter+1;
                }
            }else{
                unchangedCounter=0;
            }
            divCount = newDivCount;
    
        }, 500);
    });
    

    Now we execute the script file with phantomJs and parse the result as before with jsoup:

    // Runs page.js with PhantomJS, waits for it to finish, then prints the href of
    // every result link found in the saved page together with the number of links.
    try {
        // change path to phantomjs binary and your script file
        // inheritIO forwards the script's console output to this process, so it is
        // visible and the child can never block on a full, unread output pipe.
        Process process = new ProcessBuilder("bin\\phantomjs", "page.js")
                .inheritIO()
                .start();
        process.waitFor();
    
        // page.js writes its result to output.html (a relative path)
        Document doc = Jsoup.parse(new File("output.html"),"UTF-8");
    
        // Query the document once and count while printing.
        int resultCount = 0;
        for (Element element : doc.select("div.rg_di.rg_bx.rg_el.ivg-i a")) {
            System.out.println(element.attr("href"));
            resultCount++;
        }
        System.out.println("Number of results: " + resultCount);
    } catch (IOException | InterruptedException e) {
        e.printStackTrace();
    }
    

    Output:

    /imgres?imgurl=http%3A%2F%2Fib3.huluim.com%2Fshow_key_art%2F1603%3Fsize%3D1600x600%26region%3DUS&imgrefurl=http%3A%2F%2Fwww.hulu.com%2Fnaruto-shippuden&docid=OgW4j66rp7CKkM&tbnid=SElXvYDJj9cR6M%3A&w=1600&h=600&bih=10800&biw=19200&ved=0ahUKEwjX2PXmptPOAhULVxoKHXfmDg8QMwgzKAAwAA&iact=mrc&uact=8
    /imgres?imgurl=http%3A%2F%2Fcdn.zonarutoppuden.com%2Fns%2Fpeliculas-naruto-shippuden.jpg&imgrefurl=http%3A%2F%2Fwww.zonarutoppuden.com%2F2010%2F10%2Fnaruto-shippuden-peliculas.html&docid=JR8NPqKrF3ac_M&tbnid=0EPPOYQcflXkMM%3A&w=900&h=600&bih=10800&biw=19200&ved=0ahUKEwjX2PXmptPOAhULVxoKHXfmDg8QMwg0KAEwAQ&iact=mrc&uact=8
    ...
    Number of results: 463
    

    Update: passing the url as a parameter to the script

    Script page.js

    // PhantomJS script, invoked as: phantomjs page.js <url> <searchParameter>
    // Opens <url>, keeps scrolling down until the last result div stops changing,
    // then writes the rendered page to <searchParameter>.html.
    // Kept in ES5 style (var, function expressions) because it runs under PhantomJS.
    var page = require('webpage').create();
    var fs = require('fs');
    var system = require('system');
    
    var url = "";
    var searchParameter = "";
    
    // system.args[0] is the script name, followed by the two expected arguments.
    if (system.args.length === 3) {
        url=system.args[1];
        searchParameter=system.args[2];
    }
    
    if(url==="" || searchParameter===""){
        console.log("Usage: phantomjs " + system.args[0] + " <url> <searchParameter>");
        phantom.exit(1);
    }else{
        // Only start scraping with valid arguments: phantom.exit() does not stop the
        // remainder of the script from running, so the work must not follow it.
        scrape();
    }
    
    // Loads the page and scrolls until no further results appear, then saves it.
    function scrape() {
        page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';
    
        // NOTE(review): presumably zoomed out so that more results fit per viewport - confirm.
        page.zoomFactor = 0.1;
    
        page.viewportSize = {
          width: 1920,
          height: 1080
        };
    
        var divCount="-1";       // data-ri of the last result div seen on the previous tick
        var topPosition=0;       // current vertical scroll offset
        var unchangedCounter=0;  // consecutive ticks on which the last result div did not change
    
        page.open(url, function(status) {
            console.log("Status: " + status);
            if(status !== "success") {
                phantom.exit(1);
                return;
            }
    
            window.setInterval(function() {
    
                var newDivCount = page.evaluate(function() {
                    var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
                    // No result divs (yet): report null instead of dereferencing divs[-1].
                    if (divs.length === 0) {
                        return null;
                    }
                    return divs[divs.length-1].getAttribute("data-ri");
                });
    
                // Scroll one viewport height further on every tick.
                topPosition = topPosition + 1080;
    
                page.scrollPosition = {
                    top: topPosition,
                    left: 0
                };
    
                if(newDivCount===divCount){
                    // Nothing new was appended; click #smb if the page has it.
                    // NOTE(review): #smb is presumably the "show more results" button - confirm.
                    page.evaluate(function() {
                        // querySelector returns null (never undefined) when nothing matches.
                        var button = document.querySelector("#smb");
                        if(button !== null) {
                            button.click();
                            return true;
                        }
                        return false;
                    });
    
                    if(unchangedCounter===5){
                        // Still unchanged after repeated attempts: save the page and stop.
                        var path = searchParameter+'.html';
                        fs.write(path, page.content, 'w');
                        phantom.exit();
                    }else{
                        unchangedCounter=unchangedCounter+1;
                    }
                }else{
                    unchangedCounter=0;
                }
                divCount = newDivCount;
    
            }, 500);
        });
    }
    

    Java code

    // Runs page.js with PhantomJS for the given search URL, waits for it to finish,
    // then prints the href of every result link in the saved page and their count.
    try {
        //change path to phantomjs binary and your script file
        String phantomJSPath = "phantomjs" + File.separator + "bin" + File.separator + "phantomjs";
        String scriptFile = "page.js";
    
        String searchTerm = "naruto+shippuden";
        String urlParameter = "https://www.google.com/search?site=imghp&tbm=isch&source=hp&gws_rd=cr&q="+searchTerm;
    
        // Each argument is passed separately, so a path or term containing whitespace
        // is not split into several arguments. inheritIO forwards the script's console
        // output and keeps the child from blocking on a full, unread output pipe.
        Process process = new ProcessBuilder(phantomJSPath, scriptFile, urlParameter, searchTerm)
                .inheritIO()
                .start();
        process.waitFor();
    
        // page.js writes its result to <searchTerm>.html (a relative path)
        Document doc = Jsoup.parse(new File(searchTerm + ".html"),"UTF-8");
    
        // Query the document once and count while printing.
        int resultCount = 0;
        for (Element element : doc.select("div.rg_di.rg_bx.rg_el.ivg-i a")) {
            System.out.println(element.attr("href"));
            resultCount++;
        }
        System.out.println("Number of results: " + resultCount);
    } catch (IOException | InterruptedException e) {
        e.printStackTrace();
    }
    
    0 讨论(0)
提交回复
热议问题