Selenium get .har file

与世无争的帅哥 提交于 2019-11-27 15:46:15

You can use browsermob proxy to capture all the request and response data See here

Set preferences in your Selenium code:

    profile.setPreference("devtools.netmonitor.har.enableAutoExportToFile", true);
profile.setPreference("devtools.netmonitor.har.defaultLogDir", String.valueOf(dir));
profile.setPreference("devtools.netmonitor.har.defaultFileName", "network-log-file-%Y-%m-%d-%H-%M-%S");

and open console:

Actions keyAction = new Actions(driver);
keyAction.keyDown(Keys.LEFT_CONTROL).keyDown(Keys.LEFT_SHIFT).sendKeys("q").keyUp(Keys.LEFT_CONTROL).keyUp(Keys.LEFT_SHIFT).perform();

Use PhantomJS with BrowserMobProxy. PhantomJS helps us for JavaScript enables pages. The following code works for HTTPS web addresses, too.

Place 'phantomjs.exe' in C drive and you get the 'HAR-Information.har' file in C drive itself.

Make sure you DO NOT put a ' / ' at the end of the url, like

driver.get("https://www.google.co.in/")

It should be

driver.get("https://www.google.co.in");

Otherwise, it won't work.

package makemyhar;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import net.lightbody.bmp.BrowserMobProxy;
import net.lightbody.bmp.BrowserMobProxyServer;
import net.lightbody.bmp.core.har.Har;
import net.lightbody.bmp.proxy.CaptureType;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.DesiredCapabilities;

public class MakeMyHAR {
    public static void main(String[] args) throws IOException, InterruptedException {

        //BrowserMobProxy
        BrowserMobProxy server = new BrowserMobProxyServer();
        server.start(0);
        server.setHarCaptureTypes(CaptureType.getAllContentCaptureTypes());
        server.enableHarCaptureTypes(CaptureType.REQUEST_CONTENT, CaptureType.RESPONSE_CONTENT);
        server.newHar("Google");

        //PHANTOMJS_CLI_ARGS
        ArrayList<String> cliArgsCap = new ArrayList<>();
        cliArgsCap.add("--proxy=localhost:"+server.getPort());
        cliArgsCap.add("--ignore-ssl-errors=yes");

        //DesiredCapabilities
        DesiredCapabilities capabilities = new DesiredCapabilities();
        capabilities.setCapability(CapabilityType.ACCEPT_SSL_CERTS, true);
        capabilities.setCapability(CapabilityType.SUPPORTS_JAVASCRIPT, true);
        capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, cliArgsCap);
        capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,"C:\\phantomjs.exe");

        //WebDriver
        WebDriver driver = new PhantomJSDriver(capabilities);
        driver.get("https://www.google.co.in");

        //HAR
        Har har = server.getHar();
        FileOutputStream fos = new FileOutputStream("C:\\HAR-Information.har");
        har.writeTo(fos);
        server.stop();
        driver.close();
    }
}

I have tried as well to get the har file using a proxy like browsermob proxy

I did a lot of research because the file which I've received was always empty.

What I did was to enable the browser performance log.

Note this will work only with chrome driver.

This is my driver class (in python)

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium import webdriver
from lib.config import config


class Driver:

    global performance_log
    capabilities = DesiredCapabilities.CHROME
    capabilities['loggingPrefs'] = {'performance': 'ALL'}

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument("--headless")
    mobile_emulation = {"deviceName": "Nexus 5"}

    if config.Env().is_mobile():
        chrome_options.add_experimental_option(
            "mobileEmulation", mobile_emulation)
    else:
        pass

    chrome_options.add_experimental_option(
        'perfLoggingPrefs', {"enablePage": True})

    def __init__(self):
        self.instance = webdriver.Chrome(
            executable_path='/usr/local/bin/chromedriver', options=self.chrome_options)

    def navigate(self, url):
        if isinstance(url, str):
            self.instance.get(url)
            self.performance_log = self.instance.get_log('performance')
        else:
            raise TypeError("URL must be a string.")

The amount of information which is found the in output is huge so you'll have to filter the raw data and get the network received and send objects only.

import json
import secrets


def digest_log_data(performance_log):
    # write all raw data in a file
    with open('data.json', 'w', encoding='utf-8') as outfile:
        json.dump(performance_log, outfile)
    # open the file and real it with encoding='utf-8'
    with open('data.json', encoding='utf-8') as data_file:
        data = json.loads(data_file.read())
        return data


def digest_raw_data(data, mongo_object={}):
    for idx, val in enumerate(data):
        data_object = json.loads(data[idx]['message'])
        if (data_object['message']['method'] == 'Network.responseReceived') or (data_object['message']['method'] == 'Network.requestWillBeSent'):
            mongo_object[secrets.token_hex(30)] = data_object
        else:
            pass

We choose to push this data into a mongo db which will be analyse later by an etl and pushed into a redshift database to create statistics .

I hope is what you are looking for.

The way Im running the script is :

import codecs
from pprint import pprint
import urllib
from lib import mongo_client
from lib.test_data import test_data as data
from jsonpath_ng.ext import parse
from IPython import embed
from lib.output_data import process_output_data as output_data
from lib.config import config
from lib import driver

browser = driver.Driver()

# get the list of urls which we need to navigate
urls = data.url_list()

for url in urls:
    browser.navigate(config.Env().base_url() + url)
    print('Visiting ' + url)
    # get performance log
    performance_log = browser.performance_log
    # digest the performace log
    data = output_data.digest_log_data(performance_log)
    # initiate an empty dict
    mongo_object = {}
    # prepare the data for the mongo document
    output_data.digest_raw_data(data, mongo_object)
    # load data into the mongo db
    mongo_client.populate_mongo(mongo_object)


browser.instance.quit()

My main source was this one which I've adjusted it to my needs. https://www.reddit.com/r/Python/comments/97m9iq/headless_browsers_export_to_har/ Thanks

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!