Cannot download file in R - status 503

后端 未结 2 855
半阙折子戏
半阙折子戏 2020-12-22 01:16

I'm trying to download a file:

> URL <- \"https://www.bitmarket.pl/graphs/BTCPLN/90m.json\"
> download.file(URL, destfile = \"res.json\", method = \"         


        
相关标签:
2条回答
  • 2020-12-22 01:59

    No need to leave R. We can use the V8 package for this and make a special GET function:

    #' Work around Cloudflare anti-DDoS protection
    #'
    #' SUPER FRAGILE AS IT NEEDS TO BE MODIFIED WHENEVER CLOUDFLARE CHANGES THE CHALLENGE CODE
    #'
    #' Issues a GET with a browser-like user agent. A reply whose status is not
    #' 503 is returned as-is. Otherwise the challenge form fields and the
    #' challenge JavaScript are pulled out of the returned page, the JavaScript
    #' is evaluated in a V8 context, and the computed answer is submitted with a
    #' second GET after a 10 second pause.
    #'
    #' @param cf_url the URL you want
    #' @param ... other params passed to all `httr::GET` calls (headers, verbose, etc)
    #' @return an `httr` response object
    cf_GET <- function(cf_url, ...) {
    
      # library() instead of require(): a missing package stops right here with a
      # clear error instead of returning FALSE and failing later. The packages
      # are still attached, as they were before.
      library(urltools)
      library(stringi)
      library(rvest)
      library(httr)
      library(V8)
    
      agents <- c(
        ua_macos_chrome = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
        ua_ios_safari = "Mozilla/5.0 (iPad; CPU OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0 Mobile/14C92 Safari/602.1",
        ua_win7_firefox = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
      )
    
      # use a valid browser user-agent but don't always use the same one
      cf_agent <- unname(sample(agents, 1))
    
      res <- httr::GET(
        url = cf_url,
        httr::user_agent(cf_agent),
        ...
      )
    
      # sometimes you get lucky and the page comes back
      if (httr::status_code(res) != 503) {
        return(res) # return now if no cf redirect
      }
    
      # get the page
      cf_pg <- httr::content(res, as = "parsed")
    
      # get form/form variables we'll need later
      jschl_vc <- html_attr(html_node(cf_pg, "input[name='jschl_vc']"), "value")
      pass <- html_attr(html_node(cf_pg, "input[name='pass']"), "value")
      action <- html_attr(html_node(cf_pg, "form[id='challenge-form']"), "action")
    
      # html_attr() yields NA for a missing node/attribute; without the form
      # there is nothing to solve, so fail with a readable message
      if (anyNA(c(jschl_vc, pass, action))) {
        stop("503 response does not contain the expected challenge form", call. = FALSE)
      }
    
      # get the page as just lines of text
      cf_code <- stri_split_lines(httr::content(res, as = "text"))[[1]]
    
      # find the javascript
      decl <- cf_code[which(stri_detect_fixed(cf_code, "s,t,o,p,b"))]
      if (length(decl) == 0L) {
        stop("503 response does not contain the expected challenge script", call. = FALSE)
      }
      init_line <- stri_match_first_regex(decl, "s,t,o,p,b[[:alpha:], ]+ (.*$)")[, 2]
      var_name <- stri_match_first_regex(init_line, "([[:alnum:]]+)")[, 2]
      # the second line mentioning the variable is the one that is executed
      exec_line <- cf_code[which(stri_detect_fixed(cf_code, var_name))[2]]
      if (anyNA(exec_line)) {
        stop("could not locate the challenge computation in the 503 response", call. = FALSE)
      }
    
      # tweak and execute the javascript
      ctx <- v8()
      ctx$eval(sprintf("var a = {}; t = '%s';%s\n%s", domain(cf_url), decl, exec_line))
      jschl_answer <- ctx$get("a.value")
    
      # pause before answering the challenge
      message("Waiting 10 seconds...")
      Sys.sleep(10)
    
      # solve the DDoS challenge and make the request; leading slashes are
      # stripped from the form action so the URL does not contain "//" after
      # the host
      res <- httr::GET(
        url = sprintf("%s://%s/%s", scheme(cf_url), domain(cf_url), sub("^/+", "", action)),
        httr::user_agent(cf_agent),
        httr::add_headers(
          `Referer` = cf_url
        ),
        query = list(
          `jschl-answer` = jschl_answer,
          jschl_vc = jschl_vc,
          pass = pass
        ),
        ...
      )
    
      res
    
    }
    

    And, it works:

    # fetch the JSON through cf_GET()
    res <- cf_GET("https://www.bitmarket.pl/graphs/BTCPLN/90m.json")
    
    # parse the response body, then show its structure
    parsed <- content(res, as = "parsed")
    str(parsed)
    ## List of 90
    ##  $ :List of 6
    ##   ..$ time : int 1512906360
    ##   ..$ open : chr "48303.78770000"
    ##   ..$ high : chr "48303.78770000"
    ##   ..$ low  : chr "48303.78770000"
    ##   ..$ close: chr "48303.78770000"
    ##   ..$ vol  : chr "0.13550275"
    ##  $ :List of 6
    ##   ..$ time : int 1512906420
    ##   ..$ open : chr "48303.78770000"
    ##   ..$ high : chr "48303.78770000"
    ##   ..$ low  : chr "48000.10000000"
    ##   ..$ close: chr "48000.10000000"
    ##   ..$ vol  : chr "1.12078334"
    ## ...
    

    UPDATE:

    I wrapped this in a package:

    # install the packaged version from GitHub, then attach it
    devtools::install_github("hrbrmstr/cfhttr")
    library(cfhttr)
    
    # same call as before, now using the installed package
    res <- cf_GET("https://www.bitmarket.pl/graphs/BTCPLN/90m.json")
    

    (same output)

    0 讨论(0)
  • 2020-12-22 02:04

    That's because the page is using a DDoS protection service. On the first load, the page itself does a JavaScript-initiated redirect after 5 seconds to fetch the final content, so the process fails with tools like wget/curl, which do not interpret JavaScript. If you think it is justifiable to do so, one option is to use, for example, PhantomJS and supply a custom script (say, save.js):

    // Usage: phantomjs save.js <url>
    // Opens <url>, waits 6 seconds for the page to reload, then prints the text
    // of the first <pre> element on the page to stdout. Exits with status 1 if
    // no URL is given, the page fails to load, or no <pre> element is present.
    var system = require('system');
    var page = require('webpage').create();
    
    if (system.args.length < 2) {
        // system.args[0] is the script name; the URL must be the next argument
        system.stderr.writeLine('Usage: phantomjs save.js <url>');
        phantom.exit(1);
    } else {
        page.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5';
    
        page.open(system.args[1], function(status){
            if (status !== 'success') {
                system.stderr.writeLine('Failed to load ' + system.args[1]);
                phantom.exit(1);
                return;
            }
            setTimeout(function(){
                var json = page.evaluate(function(){
                    //gets the JSON from the first <pre> element rendered on the page
                    var pre = document.getElementsByTagName('pre')[0];
                    return pre ? pre.textContent : null;
                });
                if (json === null || json === undefined) {
                    system.stderr.writeLine('No <pre> element found on the page');
                    phantom.exit(1);
                } else {
                    console.log(json);
                    phantom.exit();
                }
            }, 6000); //waits 6 seconds for the page to reload
        });
    }
    

    and then use it instead of wget as:

    # run save.js under PhantomJS; the URL is read by the script as system.args[1]
    phantomjs save.js https://www.bitmarket.pl/graphs/BTCPLN/90m.json
    
    0 讨论(0)
提交回复
热议问题