I'm trying to download a file:
> URL <- "https://www.bitmarket.pl/graphs/BTCPLN/90m.json"
> download.file(URL, destfile = "res.json", method = "
No need to leave R. We can use the V8 package for this and make a special GET function:
#' Work around cloudflare anti-DDoS protection
#'
#' SUPER FRAGILE AS IT NEEDS TO BE MODIFIED WHENEVER CLOUDFLARE CHANGES THE CHALLENGE CODE
#'
#' @param cf_url the URL you want
#' @param ... other params passed to all `httr::GET` calls (headers, verbose, etc.)
#' @return an `httr::response` object
cf_GET <- function(cf_url, ...) {

  require(httr)
  require(rvest)
  require(stringi)
  require(urltools)
  require(V8)

  # use a valid browser user-agent but don't always use the same one
  c(
    ua_macos_chrome = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
    ua_ios_safari = "Mozilla/5.0 (iPad; CPU OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0 Mobile/14C92 Safari/602.1",
    ua_win7_firefox = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
  ) -> agents

  cf_agent <- unname(sample(agents, 1))

  # make the initial request
  httr::GET(
    url = cf_url,
    httr::user_agent(cf_agent),
    ...
  ) -> res

  # sometimes you get lucky and the page comes back
  if (httr::status_code(res) != 503) return(res) # return now if no cf challenge

  # parse the challenge page
  cf_pg <- httr::content(res, as = "parsed")

  # grab the hidden form fields we'll need to submit later
  jschl_vc <- html_attr(html_node(cf_pg, "input[name='jschl_vc']"), "value")
  pass     <- html_attr(html_node(cf_pg, "input[name='pass']"), "value")
  action   <- html_attr(html_node(cf_pg, "form[id='challenge-form']"), "action")

  # get the page as just lines of text
  cf_code <- httr::content(res, as = "text")
  writeLines(cf_code, "/tmp/a.html") # (debug) keep a copy of the challenge page
  cf_code <- stri_split_lines(cf_code)[[1]]

  # find the javascript: the line that declares the challenge object and
  # the later line that computes the answer from it
  decl <- cf_code[which(stri_detect_fixed(cf_code, "s,t,o,p,b"))]
  init_line <- stri_match_first_regex(decl, "s,t,o,p,b[[:alpha:], ]+ (.*$)")[,2]
  var_name  <- stri_match_first_regex(init_line, "([[:alnum:]]+)")[,2]
  exec_line <- cf_code[which(stri_detect_fixed(cf_code, var_name))[2]]

  # tweak and execute the javascript; the challenge reads the hostname
  # from `t`, so we set it before evaluating
  ctx <- v8()
  ctx$eval(sprintf("var a = {}; t = '%s';%s\n%s", domain(cf_url), decl, exec_line))

  # cloudflare makes browsers wait ~5 seconds before submitting; waiting
  # a bit longer is safer
  message("Waiting 10 seconds...")
  Sys.sleep(10)

  # submit the solved challenge and make the real request
  httr::GET(
    url = sprintf("%s://%s/%s", scheme(cf_url), domain(cf_url), action),
    httr::user_agent(cf_agent),
    httr::add_headers(`Referer` = cf_url),
    query = list(
      `jschl-answer` = ctx$get("a.value"),
      jschl_vc = jschl_vc,
      pass = pass
    ),
    ...
  ) -> res

  res

}
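For context on the `ctx$eval()` step: the challenge is just obfuscated arithmetic whose final answer also folds in the length of the hostname (held in `t`). Here's a toy stand-in for the scraped decl/exec lines (my illustration, not the real challenge, which changes constantly):
library(V8)
ctx <- v8()
# hypothetical stand-in for the code scraped from the challenge page:
# some arithmetic plus the hostname length
ctx$eval("var a = {}; t = 'www.bitmarket.pl';")
ctx$eval("a.value = (10 + 21) + t.length;")
ctx$get("a.value")
## [1] 47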
And, it works:
res <- cf_GET("https://www.bitmarket.pl/graphs/BTCPLN/90m.json")
str(content(res, as="parsed"))
## List of 90
## $ :List of 6
## ..$ time : int 1512906360
## ..$ open : chr "48303.78770000"
## ..$ high : chr "48303.78770000"
## ..$ low : chr "48303.78770000"
## ..$ close: chr "48303.78770000"
## ..$ vol : chr "0.13550275"
## $ :List of 6
## ..$ time : int 1512906420
## ..$ open : chr "48303.78770000"
## ..$ high : chr "48303.78770000"
## ..$ low : chr "48000.10000000"
## ..$ close: chr "48000.10000000"
## ..$ vol : chr "1.12078334"
## ...
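If you'd rather have a data frame than nested lists, a quick base-R reshape (my addition, not part of the original answer):
dat <- content(res, as = "parsed")
ohlc <- do.call(rbind, lapply(dat, function(x) {
  data.frame(
    time  = as.POSIXct(x$time, origin = "1970-01-01", tz = "UTC"),
    open  = as.numeric(x$open),
    high  = as.numeric(x$high),
    low   = as.numeric(x$low),
    close = as.numeric(x$close),
    vol   = as.numeric(x$vol)
  )
}))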
UPDATE:
I wrapped this in a package:
devtools::install_github("hrbrmstr/cfhttr")
library(cfhttr)
res <- cf_GET("https://www.bitmarket.pl/graphs/BTCPLN/90m.json")
(same output)
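Since the question's end goal was a res.json on disk, you can write the response body straight out (my addition, not part of the package):
res <- cf_GET("https://www.bitmarket.pl/graphs/BTCPLN/90m.json")
writeLines(httr::content(res, as = "text", encoding = "UTF-8"), "res.json")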
That's because the page is behind a DDoS-protection service. On the first load, the page serves a JavaScript challenge that redirects to the real content after about 5 seconds, so the download fails with tools like wget and curl, which do not interpret JavaScript. If you think it is justifiable to work around this, one option is to use phantomjs with a custom script (say, save.js):
var system = require('system');
var page = require('webpage').create();

page.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5';

page.open(system.args[1], function(){
  setTimeout(function(){
    console.log(page.evaluate(function(){
      // gets the JSON from the first <pre> element rendered on the page
      return document.getElementsByTagName('pre')[0].textContent;
    }));
    phantom.exit();
  }, 6000); // waits 6 seconds for the page to reload
});
and then use it instead of wget as:
phantomjs save.js https://www.bitmarket.pl/graphs/BTCPLN/90m.json
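If you want to stay in R for this route too, one way (a sketch; assumes phantomjs is on the PATH and jsonlite is installed) is to capture the script's stdout:
out <- system2("phantomjs",
               c("save.js", "https://www.bitmarket.pl/graphs/BTCPLN/90m.json"),
               stdout = TRUE)
writeLines(out, "res.json") # the file the question wanted
dat <- jsonlite::fromJSON(paste(out, collapse = "\n"))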