What if I want to web scrape with R for a page with parameters?

后端 未结 3 659
不知归路
不知归路 2020-12-10 09:51

The page I would like to scrape here: http://stoptb.org/countries/tbteam/searchExperts.asp requires the submission of parameters in this page: http://stoptb.org/countries/tb

相关标签:
3条回答
  • 2020-12-10 10:11

    You can use RHTMLForms

    You may need to install it first:

    # install.packages("RHTMLForms", repos = "http://www.omegahat.org/R")
    

    or under windows you may need

    # install.packages("RHTMLForms", repos = "http://www.omegahat.org/R", type = "source")
    
    
     # Load the packages with library() so that a missing package stops the
     # script right away; require() only returns FALSE and lets it run on.
     library(RHTMLForms)
     library(RCurl)
     library(XML)

     # Read the form descriptions from the search page and turn the `sExperts`
     # entry into an R function; the form fields are passed as named arguments.
     forms <- getHTMLFormDescription("http://stoptb.org/countries/tbteam/experts.asp")
     fun <- createFunction(forms$sExperts)

     # find experts with expertise in "Infection control: Engineering Consultant"
     results <- fun(Expertise = "Infection control: Engineering Consultant")

     # Pick the <table class="data"> nodes out of the returned page and convert
     # the first one. Fail with a clear message when the page holds no such
     # table instead of a bare "subscript out of bounds" error.
     tableData <- getNodeSet(htmlParse(results), "//*/table[@class = 'data']")
     if (length(tableData) == 0) {
       stop("No table with class 'data' found in the result page", call. = FALSE)
     }
     readHTMLTable(tableData[[1]])
    
    #                              V1                   V2                     V3
    #1                                                <NA>                   <NA>
    #2                 Name of Expert Country of Residence                  Email
    #3               Girmay, Desalegn             Ethiopia    deskebede@yahoo.com
    #4            IVANCHENKO, VARVARA              Estonia v.ivanchenko81@mail.ru
    #5                   JAUCOT, Alex              Belgium  alex.jaucot@gmail.com
    #6 Mulder, Hans Johannes Henricus              Namibia        hmulder@iway.na
    #7                    Walls, Neil            Australia        neil@nwalls.com
    #8                 Zuccotti, Thea                Italy     thea_zuc@yahoo.com
    #                  V4
    #1               <NA>
    #2 Number of Missions
    #3                  0
    #4                  3
    #5                  0
    #6                  0
    #7                  0
    #8                  1
    

    or create a reader to return a table

     # Reader for the generated form function: parses the returned page and
     # converts its first <table class="data"> node with readHTMLTable().
     returnTable <- function(results) {
       parsed_page <- htmlParse(results)
       data_tables <- getNodeSet(parsed_page, "//*/table[@class = 'data']")
       first_table <- data_tables[[1]]
       readHTMLTable(first_table)
     }

     # Rebuild the query function so each response is handed to returnTable.
     fun <- createFunction(forms$sExperts, reader = returnTable)

     # find experts based in Bhutan
     fun(CBased = "Bhutan")
    #                 V1                   V2                      V3
    #1                                   <NA>                    <NA>
    #2    Name of Expert Country of Residence                   Email
    #3 Wangchuk, Lungten               Bhutan drlungten@health.gov.bt
    #                  V4
    #1               <NA>
    #2 Number of Missions
    #3                  2
    
    0 讨论(0)
  • 2020-12-10 10:26

    Sending a form to a web server normally is done via what is called an HTTP POST request (getting an ordinary web page is an HTTP GET request).

    A POST request carries the form parameters in the body of the HTTP request (usually URL-encoded), rather than in the URL's query string as a GET request does.

    RCurl can do this, or you might try the httr package which has a POST function that takes an R list of parameters to pass on with the POST request.

    Another tip: use Firebug or other browser debugger to inspect what a page has sent as parameters to POST requests.

    0 讨论(0)
  • 2020-12-10 10:37

    I tried to prepare a full working example for "http://hp2010.nhlbihin.net/atpiii/calculator.asp?usertype=prof".

     # Example of the data required for a single query.
     # The trailing comments list the values each field takes; the numeric
     # ranges are the ones this script checks before it queries the calculator.
     medication <- 0    # c("0", "1")
     sex <- "male"      # c("female", "male")
     smoker <- 1        # c("0", "1")
     age <- 20          # 20 to 99
     cholesterol <- 130 # 130 to 320
     hdl <- 20          # 20 to 100
     systolic <- 130    # 90 to 200

     # thanks to http://www.omegahat.org/RHTMLForms/
     # Install RHTMLForms from its source tarball, but only when it is missing,
     # so the script does not download and reinstall it on every run.
     if (!requireNamespace("RHTMLForms", quietly = TRUE)) {
       # download.file() needs a file path as destination, not a directory;
       # the very same path is then handed to install.packages().
       tarball <- file.path(tempdir(), "RHTMLForms_0.6-0.tar")
       download.file(
         "http://www.omegahat.org/RHTMLForms/RHTMLForms_0.6-0.tar",
         destfile = tarball,
         mode = "wb" # binary mode keeps the archive intact on Windows
       )
       install.packages(tarball, repos = NULL, type = "source")
     }
    
    #----------------------------------------------------------------------------
     #libraries
     library(RHTMLForms)
     library(xlsx)
     library(XML)
     library(RCurl)
    
     # http://stackoverflow.com/questions/5396461/how-to-automate-multiple-requests-to-a-web-search-form-using-r
     # The input file is read from this directory, and the results file written
     # at the end of the script lands here too (both use relative paths).
     setwd("C:\\MyPath")
     data <- read.csv("MyData.csv")

     # get form description
     url <- "http://hp2010.nhlbihin.net/atpiii/calculator.asp?usertype=prof"
     forms <- getHTMLFormDescription(url)

     # create a function using form description, to query the url
     efun <- createFunction(forms[[1]])

     # One row per input record: the identifier, the seven submitted fields and
     # the risk read back from the page. seq_len() keeps the row names valid
     # for an empty input, where 1:0 would supply two names for zero rows.
     n_records <- nrow(data)
     result <- matrix(
       NA,
       nrow = n_records,
       ncol = 9,
       dimnames = list(
         seq_len(n_records),
         c("IDNO", "medication", "sex", "smoker", "age", "cholesterol", "hdl",
           "systolic", "risk_persent")
       )
     )
    
    # Query the calculator once per row of `data` and store the reported risk.
    # The loop must be active: the body indexes with `i` and uses `next`,
    # neither of which works outside a loop.
    # NOTE(review): the columns read from `data` (IDNO, gender1, cig1c, age1c,
    # chol1, hdl1, sbp1c) are those of the original author's file; adapt them
    # to your own input.
    for (i in seq_len(nrow(data))) {
      # ifelse() is kept on purpose: it passes a missing value through as NA,
      # which the check below then catches.
      medication <- 0                                       # c("0", "1")
      sex <- ifelse(data$gender1[i] == 0, "female", "male") # c("female", "male")
      smoker <- ifelse(data$cig1c[i] < 2, 0, 1)             # c("0", "1")
      age <- data$age1c[i]
      cholesterol <- data$chol1[i]
      hdl <- data$hdl1[i]
      systolic <- round(data$sbp1c[i])

      # Identifier plus the submitted values, in the column order of `result`.
      inputs <- c(data$IDNO[i], medication, sex, smoker, age, cholesterol, hdl,
                  systolic)

      # Records with a missing value or a value outside the checked ranges are
      # stored with an NA risk and never sent to the server. The NA test comes
      # first so the range comparisons only ever see complete values.
      has_missing <- anyNA(c(sex, smoker, age, cholesterol, hdl, systolic))
      if (has_missing ||
          age < 20 || age > 99 ||
          systolic < 90 || systolic > 200 ||
          cholesterol < 130 || cholesterol > 320 ||
          hdl < 20 || hdl > 100) {
        result[i, ] <- c(inputs, NA)
        next
      }

      # extract webpage by passing required arguments to function
      page <- efun(
        medication = toString(medication),
        sex = toString(sex),
        smoker = toString(smoker),
        age = toString(age),
        cholesterol = toString(cholesterol),
        hdl = toString(hdl),
        systolic = toString(systolic)
      )
      # pause the algorithm, so that you do not request too often from the server
      Sys.sleep(.1)
      # parse webpage and return html tree
      doc <- htmlTreeParse(page, asText = TRUE, useInternalNodes = TRUE)
      # extract table from the html tree
      tab <- readHTMLTable(doc)
      # The risk is taken from row 11, column 2 of the first table; the percent
      # sign, letters and spaces are stripped before converting to a number.
      risk <- as.numeric(gsub("[%a-zA-Z ]", "", tab[[1]][11, 2]))
      result[i, ] <- c(inputs, risk)
    }

    write.csv(result, file = "MyResults.csv")
    
    0 讨论(0)
提交回复
热议问题