How to extract data from HTML page source of (a tab within) a webpage?

依然范特西╮ 提交于 2021-02-20 06:20:34

问题


I have tried several solutions specified in other answers, like experimenting with different user agents (Chrome, Safari, etc.) and getting the HTML directly using HTTPClient and BufferedReader, but none of them work. How do I make the Android output match the web output? Here is the web output I am looking for (view the page source of https://finance.yahoo.com/quote/AAPL/financials?p=AAPL for the full output; it contains the data for the AJAX tab named "Quarterly", which holds a table. I need to get that data, but the Android HTML source doesn't have it while the web source does):

root.App.main = {"context":{"dispatcher":{"stores":{"PageStore":{"currentPageName":"quote","currentRenderTargetId":"default","pagesConfigRaw":{"base":{"quote":{"layout":{"bundleName":"yahoodotcom-layout.TwoColumnLayout","name":"TwoColumnLayout","config":{"enableHeaderCollapse":true,"Header":{"isFixed":true,"uhContainerClasses":"Bgi($uhGrayGradient)","navContainerClasses":"Bgi($navrailGrayGradient) Bxsh($navrailShadow) Pos(r) hasScrolled_Bxsh(headerShadow) Panel-open_Bxsh(headerShadow)","navTransitionClasses":"HideNavrail_Translate3d(0,-46px,0) Panel-open_Translate3d(0,-46px,0)","secondaryNavContainerClasses":"hasScrolled_Bdbw(0px) Bxsh($navrailShadow)","height":135},"fetchNewAttribution":true},"meta":{"property":{"twitter:site":"@YahooFinance"}}},"meta":{"property":{"twitter:site":"@YahooFinance","fb:pages":"90376669494"}},"regions":{"SecondaryNav":[{"bundleName":"react-finance","name":"SecondaryNav","config":{"ui":{"enableRelativeUrl":true}},"props":{"key":"SecondaryNav-0-SecondaryNav","id":"SecondaryNav-0-SecondaryNav"},"isPageComposite":true}],"Overlay":[{"bundleName":"react-lightbox","name":"Lightbox","props":{"key":"Overlay-0-Lightbox","id":"Overlay-0-Lightbox"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-1-Null","id":"Overlay-1-Null"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-2-Null","id":"Overlay-2-Null"},"isPageComposite":true}],"Lead":[{"bundleName":"react-finance","name":"FinanceHeader","props":{"className":"Bxz(bb) H(100%) Pos(r) Maw($newGridWidth) Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mstart(a) Mend(a) Px(20px) My(10px)","showAds":true,"adsConfig":{"positions":["FB2A","FB2B","FB2C","FB2D"]},"key":"Lead-0-FinanceHeader","id":"Lead-0-FinanceHeader"},"isPageComposite":true},{"bundleName":"tdv2-applet-featurebar","name":"FeatureBar","config":{"ui":{"container_classnames":"W(100%) Bxz(bb) Bdrs(2px) Mb(10px) Maw($maxModuleWidth) Miw($minGridWidth) 
Miw(a)!--tab768 Miw(a)!--tab1024 Mx(a)","prerender":{"enabled":true,"renderTargetId":"modal"}},"site":"finance"},"props":{"key":"Lead-1-FeatureBar","id":"Lead-1-FeatureBar"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteHeader","props":{"key":"Lead-2-QuoteHeader","id":"Lead-2-QuoteHeader"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteNav","props":{"key":"Lead-3-QuoteNav","id":"Lead-3-QuoteNav"},"isPageComposite":true}],"Col1":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LDRB","style":{"marginBottom":"8px","paddingTop":"0px","marginLeft":"auto","marginRight":"auto","textAlign":"center","lineHeight":"0px","position":"relative","zIndex":"5"},"key":"Col1-0-Ad","id":"Col1-0-Ad"},"isPageComposite":true},{"bundleName":"Quote.financials","name":"Financials","props":{"key":"Col1-1-Financials","id":"Col1-1-Financials"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-foot","positions":["FOOT"],"key":"Col1-2-AdUnitWithTdAds","id":"Col1-2-AdUnitWithTdAds"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-fsrvy","positions":["FSRVY"],"key":"Col1-3-AdUnitWithTdAds","id":"Col1-3-AdUnitWithTdAds"},"isPageComposite":true}],"Col2":[{"bundleName":"td-app-finance","name":"ExtPromoButton","props":{"className":"btn Bds(s) Bdc($c-fuji-grey-c) Bdrs(4px) Bgc($white) Bdw(1px) Bgc($ExtButtonHov):h C($white):h C($ExtButtonHov) Cur(p) Fz(s) Fw(b) H(44px) Lh(40px) Mb(20px) Ta(c) Td(n) 
W(100%)","sec":"ext-promo-all-mkt-submit","titleId":"EXTENSION_PROMO_TITLE","url":"https:\u002F\u002Fchrome.google.com\u002Fwebstore\u002Fdetail\u002Fdoojmkhhplhicnghmafjbhncmgjiohma","enabled":true,"key":"Col2-0-ExtPromoButton","id":"Col2-0-ExtPromoButton"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"eventPromo","key":"Col2-1-QuoteModule","id":"Col2-1-QuoteModule"},"isPageComposite":true},{"bundleName":"td-ads","name":"ComboAd","props":{"adparseStyle":{"marginBottom":"20px"},"finishedStyle":{"marginBottom":"20px"},"children":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LREC"}},{"bundleName":"td-ads","name":"Ad","props":{"pos":"MON"}}],"serverHeight":true,"key":"Col2-2-ComboAd","id":"Col2-2-ComboAd"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"similarCompanies","key":"Col2-3-QuoteModule","id":"Col2-3-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"earningsChart","key":"Col2-4-QuoteModule","id":"Col2-4-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"financialsChart","key":"Col2-5-QuoteModule","id":"Col2-5-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"react-finance",..."}}}};

Here is the Android output I get;

(root.App.main = {"context":{"dispatcher":{"stores":{"PageStore":{"currentPageName":"quote","currentRenderTargetId":"default","pagesConfigRaw":{"base":{"quote":{"layout":{"bundleName":"yahoodotcom-layout.TwoColumnLayout","name":"TwoColumnLayout","config":{"enableHeaderCollapse":true,"Header":{"isFixed":true,"uhContainerClasses":"Bgi($uhGrayGradient)","navContainerClasses":"Bgi($navrailGrayGradient) Bxsh($navrailShadow) Pos(r) hasScrolled_Bxsh(headerShadow) Panel-open_Bxsh(headerShadow)","navTransitionClasses":"HideNavrail_Translate3d(0,-46px,0) Panel-open_Translate3d(0,-46px,0)","secondaryNavContainerClasses":"hasScrolled_Bdbw(0px) Bxsh($navrailShadow)","height":135},"fetchNewAttribution":true},"meta":{"property":{"twitter:site":"@YahooFinance"}}},"meta":{"property":{"twitter:site":"@YahooFinance","fb:pages":"90376669494"}},"regions":{"SecondaryNav":[{"bundleName":"react-finance","name":"SecondaryNav","config":{"ui":{"enableRelativeUrl":true}},"props":{"key":"SecondaryNav-0-SecondaryNav","id":"SecondaryNav-0-SecondaryNav"},"isPageComposite":true}],"Overlay":[{"bundleName":"react-lightbox","name":"Lightbox","props":{"key":"Overlay-0-Lightbox","id":"Overlay-0-Lightbox"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-1-Null","id":"Overlay-1-Null"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-2-Null","id":"Overlay-2-Null"},"isPageComposite":true}],"Lead":[{"bundleName":"react-finance","name":"FinanceHeader","props":{"className":"Bxz(bb) H(100%) Pos(r) Maw($newGridWidth) Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mstart(a) Mend(a) Px(20px) My(10px)","showAds":true,"adsConfig":{"positions":["FB2A","FB2B","FB2C","FB2D"]},"key":"Lead-0-FinanceHeader","id":"Lead-0-FinanceHeader"},"isPageComposite":true},{"bundleName":"tdv2-applet-featurebar","name":"FeatureBar","config":{"ui":{"container_classnames":"W(100%) Bxz(bb) Bdrs(2px) Mb(10px) Maw($maxModuleWidth) 
Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mx(a)","prerender":{"enabled":true,"renderTargetId":"modal"}},"site":"finance"},"props":{"key":"Lead-1-FeatureBar","id":"Lead-1-FeatureBar"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteHeader","props":{"key":"Lead-2-QuoteHeader","id":"Lead-2-QuoteHeader"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteNav","props":{"key":"Lead-3-QuoteNav","id":"Lead-3-QuoteNav"},"isPageComposite":true}],"Col1":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LDRB","style":{"marginBottom":"8px","paddingTop":"0px","marginLeft":"auto","marginRight":"auto","textAlign":"center","lineHeight":"0px","position":"relative","zIndex":"5"},"key":"Col1-0-Ad","id":"Col1-0-Ad"},"isPageComposite":true},{"bundleName":"Quote.financials","name":"Financials","props":{"key":"Col1-1-Financials","id":"Col1-1-Financials"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-foot","positions":["FOOT"],"key":"Col1-2-AdUnitWithTdAds","id":"Col1-2-AdUnitWithTdAds"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-fsrvy","positions":["FSRVY"],"key":"Col1-3-AdUnitWithTdAds","id":"Col1-3-AdUnitWithTdAds"},"isPageComposite":true}],"Col2":[{"bundleName":"td-app-finance","name":"ExtPromoButton","props":{"className":"btn Bds(s) Bdc($c-fuji-grey-c) Bdrs(4px) Bgc($white) Bdw(1px) Bgc($ExtButtonHov):h C($white):h C($ExtButtonHov) Cur(p) Fz(s) Fw(b) H(44px) Lh(40px) Mb(20px) Ta(c) Td(n) W(100%)","sec":"ext-promo-all-mkt-submit","titleId":"EXTENSION_PROMO_TITLE","url":"https:\u002F\u002Fchrome.google.com\u002Fwebstore\u002Fdetail\u002Fdoojmkhhplhicnghmafjbhncmgjiohma","enabled":true,"key":"Col2-0-ExtPromoButton","id":"Col2-0-ExtPromoButton"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"eventPromo","key":"Col2-1-QuoteModule","id":"Col2-1-QuoteModule"},"isPageComposite":true}

Do you have any suggestions? Thanks. My code;

// Fetch the page while presenting a desktop browser user agent; the timeout
// value is passed to jsoup unchanged (600000).
Document doc = Jsoup.connect(requestURL)
        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.43")
        .timeout(600000)
        .get();

// Elements whose class attribute equals myClassName (not used further below).
Elements tableDivs = doc.getElementsByAttributeValue("class", myClassName);

// Dump the raw content of every <script> element to Logcat.
// NOTE(review): Logcat caps the size of a single log entry (roughly 4 KB), so a
// long script will look cut off in the log even when script.data() holds the
// whole text -- confirm before concluding that the response itself is incomplete.
for (Element script : doc.getElementsByTag("script")) {
    Log.e("ONE", script.data());
}

回答1:


Yahoo Finance redirects to guce.oath.com, which informs us about the use of cookies and other data, and requires us to click 'accept' before it provides the content. We can also observe this in a browser if we clear the cookies and refresh the page.

We could scrape the link from guce.oath.com, but I've noticed that the final URL has a guccounter=2 parameter, and if we use that URL we can get the required response.

// Request the URL that already carries the guccounter=2 query parameter, so the
// financials page is fetched directly instead of the consent page.
final String requestURL = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL&guccounter=2";
final String userAgent = "My UAString";

Document doc = Jsoup
        .connect(requestURL)
        .userAgent(userAgent)
        .get();

Since the data is not HTML but JavaScript code, we can't parse it with jsoup, but we can use regular expressions.

Elements scriptTags = doc.getElementsByTag("script");

// Matches "root.App.main = <payload>;" followed by "}(this));" and captures the
// payload lazily in group 1. DOTALL lets '.' span line breaks inside the payload.
String re = "root\\.App\\.main\\s*\\=\\s*(.*?);\\s*\\}\\(this\\)\\)\\s*;";

// The pattern does not depend on the script being inspected, so compile it once
// instead of once per <script> element.
Pattern pattern = Pattern.compile(re, Pattern.DOTALL);

// Stays null when no script contains the assignment.
String data = null;

for (Element script : scriptTags) {
    Matcher matcher = pattern.matcher(script.html());

    if (matcher.find()) {
        data = matcher.group(1);
        // Only the first matching script is needed.
        break;
    }
}

The data string should contain the dictionary from the JavaScript code, which is a valid json string that can be parsed with JSONObject.


On Android Studio, however, there are no redirects as far as I can tell. I've tried several user-agent strings, but it seems that the page loads directly. Still, the JavaScript dictionary that contains the data is present, and we can extract it and parse it with JSONObject.

Code for Android Studio:

String requestURL = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL";
String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.43";
// JSON key of the table row to print.
String row = "totalRevenue";

try {
    Document doc = Jsoup.connect(requestURL).userAgent(userAgent).get();

    Elements scriptTags = doc.getElementsByTag("script");

    // Matches "root.App.main = <payload>;" followed by "}(this));" and captures
    // the payload lazily in group 1.
    String re = "root\\.App\\.main\\s*\\=\\s*(.*?);\\s*\\}\\(this\\)\\)\\s*;";
    // Compiled once up front: the pattern is the same for every <script> element.
    Pattern pattern = Pattern.compile(re, Pattern.DOTALL);

    for (Element script : scriptTags) {
        Matcher matcher = pattern.matcher(script.html());

        if (matcher.find()) {
            String data = matcher.group(1);
            //Log.d("data", data);

            JSONObject jo = new JSONObject(data);
            JSONArray table = getTable(jo);
            //Log.d("table", table.toString());

            String[] tableRow = getRow(table, row);
            String values = TextUtils.join(", ", tableRow);
            Log.d("values", values);

            // The payload has been found and processed; skip the remaining scripts.
            break;
        }
    }
} catch (Exception e) {
    // Network, regex/JSON parsing and lookup failures all end up here and are logged.
    Log.e("err", "err", e);
}

This should parse the data and select the 'Total Revenue' values. The getTable and getRow methods I used:

/**
 * Returns the "incomeStatementHistory" array found under
 * context / dispatcher / stores / QuoteSummaryStore / incomeStatementHistoryQuarterly.
 *
 * @param json the parsed root.App.main object
 * @return the array stored under "incomeStatementHistory"
 * @throws JSONException propagated from the org.json accessors used on the path
 */
private JSONArray getTable(JSONObject json) throws JSONException {
    JSONObject stores = json.getJSONObject("context")
            .getJSONObject("dispatcher")
            .getJSONObject("stores");
    JSONObject quarterly = stores.getJSONObject("QuoteSummaryStore")
            .getJSONObject("incomeStatementHistoryQuarterly");
    return quarterly.getJSONArray("incomeStatementHistory");
}

/**
 * Collects one value per table entry for the given key.
 *
 * For each entry, the value is the "longFmt" member of the object stored under
 * {@code name}; "-" is used when the entry has no such key or the object has no
 * "longFmt" member.
 *
 * @param table array of entries, as returned by getTable
 * @param name  JSON key of the row to read, e.g. "totalRevenue"
 * @return one string per entry of {@code table}, in array order
 * @throws JSONException propagated from the org.json accessors
 */
private String[] getRow(JSONArray table, String name) throws JSONException {
    String[] values = new String[table.length()];
    for (int col = 0; col < table.length(); col++) {
        JSONObject entry = table.getJSONObject(col);
        // Placeholder used unless a formatted value is found below.
        String cell = "-";
        if (entry.has(name)) {
            JSONObject field = entry.getJSONObject(name);
            if (field.has("longFmt")) {
                cell = field.get("longFmt").toString();
            }
        }
        values[col] = cell;
    }
    return values;
}

/**
 * Reads the "fmt" member of each entry's "endDate" object.
 *
 * @param table array of entries, as returned by getTable
 * @return one date string per entry of {@code table}, in array order
 * @throws JSONException propagated from the org.json accessors
 */
private String[] getDates(JSONArray table) throws JSONException {
    String[] dates = new String[table.length()];
    for (int col = 0; col < table.length(); col++) {
        JSONObject entry = table.getJSONObject(col);
        JSONObject endDate = entry.getJSONObject("endDate");
        dates[col] = endDate.get("fmt").toString();
    }
    return dates;
}

I think the best way to get the table data is to map each html row name to a json key. Furthermore, the main table has five sub-tables, so we could map each nested table to the rows it contains.

/**
 * Builds the lookup from displayed names to JSON keys.
 *
 * The outer map is keyed by sub-table title ("Revenue", "Operating Expenses");
 * each inner map goes from the row label shown on the page to the key used in
 * the JSON data. Both levels keep insertion order.
 *
 * @return a freshly created, mutable map of sub-table title to (row label to JSON key)
 */
Map<String, Map<String, String>> getTableNames() {
    // Plain LinkedHashMap instances are used instead of double-brace
    // initialization: the latter creates an anonymous subclass per map, and each
    // such instance keeps a hidden reference to the enclosing object.
    Map<String, String> revenue = new LinkedHashMap<>();
    revenue.put("Total Revenue", "totalRevenue");
    revenue.put("Cost of Revenue", "costOfRevenue");
    revenue.put("Gross Profit", "grossProfit");

    Map<String, String> operatingExpenses = new LinkedHashMap<>();
    operatingExpenses.put("Research Development", "researchDevelopment");
    operatingExpenses.put("Selling General and Administrative", "sellingGeneralAdministrative");
    operatingExpenses.put("Non Recurring", "nonRecurring");
    operatingExpenses.put("Others", "otherOperatingExpenses");
    operatingExpenses.put("Total Operating Expenses", "totalOperatingExpenses");
    operatingExpenses.put("Operating Income or Loss", "operatingIncome");

    Map<String, Map<String, String>> allTableNames = new LinkedHashMap<>();
    allTableNames.put("Revenue", revenue);
    allTableNames.put("Operating Expenses", operatingExpenses);
    return allTableNames;
}

We can use this map to select a single cell, for example the 'Total Revenue' of 6/30/2018 (which is on the first row and column),

// Parse the extracted JavaScript payload and locate the table in it.
JSONArray table = getTable(new JSONObject(jsData));

// Translate the displayed names ("Revenue" -> "Total Revenue") into the JSON key,
// then read that row; index 0 is the first column.
Map<String, String> revenueRows = getTableNames().get("Revenue");
String[] totalRevenueValues = getRow(table, revenueRows.get("Total Revenue"));
String value = totalRevenueValues[0];

or we could iterate over the table names and build a list or string, containing all the table data.

// Flattened table: for every sub-table, its title and the dates, and then for
// every row its label followed by that row's values.
List<String> tableData = new ArrayList<>();
Map<String, Map<String, String>> tableNames = getTableNames();
String[] dates = getDates(table);

for (Map.Entry<String, Map<String, String>> section : tableNames.entrySet()) {
    // Header: sub-table title, then one entry per date.
    tableData.add(section.getKey());
    for (String date : dates) {
        tableData.add(date);
    }

    for (Map.Entry<String, String> labelToKey : section.getValue().entrySet()) {
        // Read the values before adding the label, so a failing lookup leaves no
        // dangling label in the list.
        String[] cells = getRow(table, labelToKey.getValue());
        tableData.add(labelToKey.getKey());
        tableData.addAll(Arrays.asList(cells));
    }
}
String tableDataString = TextUtils.join(", ", tableData);

I've tried to match the html table as much as possible, so the tableData list and the resulting string are formatted as "table name, date,date,date,date" and "row name, price,price,price,price", but it may be best to include only the numbers (in that case we should add only the tableRow items to tableData).



来源:https://stackoverflow.com/questions/52939611/how-to-extract-data-from-html-page-source-of-a-tab-within-a-webpage

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!