loop across multiple urls in r with rvest [duplicate]

孤者浪人 提交于 2019-11-28 20:59:03

You are attempting to vectorize a method that cannot take multiple items in one call. Specifically, read_html() requires one page per call since it needs to read in web data one at a time and expects a scalar value. Consider looping through the site list with lapply then bind all dfs together:

# Offsets for the paginated draft-finder results: the site serves 100 rows
# per page, so offsets 0, 100, ..., 800 cover nine pages.
jump <- seq(0, 800, by = 100)

# Build one URL per offset. paste0() is the idiomatic form of
# paste(..., sep = ""); `jump` is recycled against the constant pieces,
# so this vectorizes to a character vector of nine full URLs.
site <- paste0('http://www.basketball-reference.com/play-index/draft_finder.cgi?',
               'request=1&year_min=2001&year_max=2014&round_min=&round_max=',
               '&pick_overall_min=&pick_overall_max=&franch_id=&college_id=0',
               '&is_active=&is_hof=&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y',
               '&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&c1stat=&c1comp=&c1val=&c2stat=&c2comp=',
               '&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=year_id',
               '&order_by_asc=&offset=', jump)

# Fetch each page and extract its first HTML table as a data frame.
# Guard against pages that contain no <table> at all: html_table() would
# return an empty list and `[[1]]` would error. Returning NULL instead is
# safe because do.call(rbind, ...) silently drops NULL elements.
dfList <- lapply(site, function(url) {
    webpage <- read_html(url)              # one page per call; read_html is not vectorized
    draft_table <- html_nodes(webpage, 'table')
    tabs <- html_table(draft_table)
    if (length(tabs) == 0) NULL else tabs[[1]]
})

finaldf <- do.call(rbind, dfList)             # ASSUMING ALL DFs MAINTAIN SAME COLS

You can use curl to run all of the requests at once. Be nice to sites that may have small servers, and don't overwhelm them. With this code you can use the lapply at the end to clean up the tables so you can stack them with do.call(rbind, AllOut), but I will leave that to you.

library(rvest)
library(stringr)
library(tidyr)

# Ten page offsets: 0, 100, ..., 900 (the site paginates 100 rows at a time).
OffSet <- 100 * (0:9)

# One query URL per offset; paste0() recycles OffSet across the constant prefix,
# yielding a character vector of ten complete request URLs.
Sites <- paste0('http://www.basketball-reference.com/play-index/draft_finder.cgi?request=1&year_min=2001&year_max=2014&round_min=&round_max=&pick_overall_min=&pick_overall_max=&franch_id=&college_id=0&is_active=&is_hof=&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&c1stat=&c1comp=&c1val=&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=year_id&order_by_asc=&offset=', OffSet)


library(curl)

# Accumulator for completed responses. Plain `<-` is correct here: at the
# top level there is no enclosing scope for `<<-` to walk up, so the
# original `out <<- list()` was a misleading no-op-equivalent.
out <- list()

# Success callback invoked by curl_fetch_multi() for each finished request.
# `<<-` is required inside the callback: curl calls it with no way to
# capture a return value, so the response must be appended to the outer
# `out` list as a side effect.
complete = function(res){
  # cat("Request done! Status:", res$status, "\n")
  out <<- c(out, list(res))
}

# Queue one asynchronous GET per page. Nothing is fetched yet; requests
# only execute when multi_run() is called below.
# seq_along() is the safe index sequence: 1:length(Sites) would yield
# c(1, 0) if Sites were ever empty.
for (i in seq_along(Sites)) {
  curl_fetch_multi(
    Sites[i]
    , done = complete               # success callback: appends response to `out`
    , fail = print                  # failure callback: just print the error
    , handle = new_handle(customrequest = "GET")
    )
}

# Perform all queued requests concurrently; callbacks fire as each completes.
multi_run()

# Parse every fetched response body and pull out all HTML tables.
# A page with no tables maps to NULL; otherwise the (uncleaned) list of
# tables is kept for later stacking.
AllOut <- lapply(out, function(resp) {
  page <- read_html(resp$content)
  found <- html_table(html_nodes(page, 'table'))
  if (length(found) > 0) found else NULL
})
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!