What if I want to web scrape with R for a page with parameters?

后端 未结 3 667
不知归路
不知归路 2020-12-10 09:51

The page I would like to scrape, http://stoptb.org/countries/tbteam/searchExperts.asp, requires parameters to be submitted first on this page: http://stoptb.org/countries/tb

3条回答
  •  醉酒成梦
    2020-12-10 10:37

    I tried to prepare a full working example for "http://hp2010.nhlbihin.net/atpiii/calculator.asp?usertype=prof".

     # Example input values for a single query; the trailing comments list the
     # values each categorical field accepts.
     medication  <- 0       # c("0", "1")
     sex         <- "male"  # c("female", "male")
     smoker      <- 1       # c("0", "1")
     age         <- 20
     cholesterol <- 130
     hdl         <- 20
     systolic    <- 130
    
     # Thanks to http://www.omegahat.org/RHTMLForms/
     # Fetch the RHTMLForms source tarball and install it from the local file.
     # `destfile` has to be a file path (not a directory), and the same path is
     # then handed to install.packages(); file.path() builds it with the correct
     # separator instead of the spaces that paste() would insert.
     rhtmlforms_tar <- file.path(tempdir(), "RHTMLForms_0.6-0.tar")
     download.file(
       "http://www.omegahat.org/RHTMLForms/RHTMLForms_0.6-0.tar",
       destfile = rhtmlforms_tar,
       mode = "wb"  # binary transfer so the archive is not corrupted on Windows
     )
     install.packages(rhtmlforms_tar, repos = NULL, type = "source")
    
    #----------------------------------------------------------------------------
     #libraries
     library(RHTMLForms)
     library(xlsx)
     library(XML)
     library(RCurl)
    
     # http://stackoverflow.com/questions/5396461/how-to-automate-multiple-requests-to-a-web-search-form-using-r
     # Switch to the folder holding the input file and load it.
     setwd("C:\\MyPath")
     data <- read.csv("MyData.csv")

     # Get the description of the HTML forms on the calculator page.
     url <- "http://hp2010.nhlbihin.net/atpiii/calculator.asp?usertype=prof"
     forms <- getHTMLFormDescription(url)

     # Create a function from the first form description; it is used to query
     # the url with the form's fields passed as arguments.
     efun <- createFunction(forms[[1]])

     # One row per input record, pre-filled with NA. seq_len() is used for the
     # row names because 1:n would yield c(1, 0) for an empty input file and
     # make matrix() fail on mismatched dimnames. The column names (including
     # the "risk_persent" spelling) are kept as-is: they become the CSV header.
     n_records <- nrow(data)
     result <- matrix(
       NA,
       nrow = n_records,
       ncol = 9,
       dimnames = list(
         seq_len(n_records),
         c("IDNO", "medication", "sex", "smoker", "age",
           "cholesterol", "hdl", "systolic", "risk_persent")
       )
     )
    
    # Query the calculator once for every row of `data` and store the inputs and
    # the value scraped from the result page in `result`.
    # The loop must be active: the body indexes rows with `i` and skips rejected
    # rows with `next`, and neither works outside a loop. The per-row values
    # assigned here replace any values set earlier in the script.
    # NOTE(review): assumes MyData.csv has the columns IDNO, gender1, cig1c,
    # age1c, chol1, hdl1 and sbp1c -- confirm against your file.
    for (i in seq_len(nrow(data))) {
      medication <- 0                                        # c("0", "1")
      sex <- ifelse(data$gender1[i] == 0, "female", "male")  # c("female", "male")
      smoker <- ifelse(data$cig1c[i] < 2, 0, 1)              # c("0", "1")
      age <- data$age1c[i]
      cholesterol <- data$chol1[i]
      hdl <- data$hdl1[i]
      systolic <- round(data$sbp1c[i])

      # Rows with a missing input or a value outside the bounds below are not
      # submitted; they are recorded with NA in the last column. The NA test
      # runs first so the range comparisons never see a missing value.
      has_missing <- anyNA(c(sex, smoker, age, cholesterol, hdl, systolic))
      if (has_missing ||
          age < 20 || age > 99 ||
          systolic < 90 || systolic > 200 ||
          cholesterol < 130 || cholesterol > 320 ||
          hdl < 20 || hdl > 100) {
        result[i, ] <- c(data$IDNO[i], medication, sex, smoker, age,
                         cholesterol, hdl, systolic, NA)
        next
      }

      # Extract the webpage by passing the required arguments to the form
      # function; every field is sent as a string.
      page <- efun(
        medication = toString(medication),
        sex = toString(sex),
        smoker = toString(smoker),
        age = toString(age),
        cholesterol = toString(cholesterol),
        hdl = toString(hdl),
        systolic = toString(systolic)
      )
      # Pause so that the server is not queried too often.
      Sys.sleep(.1)

      # Parse the returned page into an HTML tree and read its tables.
      doc <- htmlTreeParse(page, asText = TRUE, useInternalNodes = TRUE)
      tab <- readHTMLTable(doc)

      # Cell [11, 2] of the first table is stripped of letters, spaces and "%"
      # and stored as a number -- presumably the risk percentage; confirm
      # against the live page layout.
      risk <- as.numeric(gsub('[%a-zA-Z ]', '', (tab[[1]][11, 2])))
      result[i, ] <- c(data$IDNO[i], medication, sex, smoker, age,
                       cholesterol, hdl, systolic, risk)
    }

    write.csv(result, file = "MyResults.csv")
    

提交回复
热议问题