# The page I would like to scrape, http://stoptb.org/countries/tbteam/searchExperts.asp,
# requires submitting form parameters from its search page. Below is a full working
# example for a similar form: http://hp2010.nhlbihin.net/atpiii/calculator.asp?usertype=prof
#Example the data required.
medication <- 0; #c("0", "1")
sex <- "male"; #c("female", "male")
smoker <- 1; # c("0", "1")
age <- 20; #
cholesterol<-130;#
hdl <- 20; #
systolic <- 130; #
#thanks to http://www.omegahat.org/RHTMLForms/
download.file("http://www.omegahat.org/RHTMLForms/RHTMLForms_0.6-0.tar", tempdir())
install.packages(file.path(paste(tempdir(),"RHTMLForms_0.6-0.tar", 'RHTMLForms_0.6-0.tar')),repos=NULL, type='source')
#----------------------------------------------------------------------------
#libraries
library(RHTMLForms)
library(xlsx)
library(XML)
library(RCurl)
# http://stackoverflow.com/questions/5396461/how-to-automate-multiple-requests-to-a-web-search-form-using-r
setwd("C:\\MyPath")
data<-read.csv("MyData.csv")
# get form description
url<-"http://hp2010.nhlbihin.net/atpiii/calculator.asp?usertype=prof"
forms <- getHTMLFormDescription(url);
# create a function using form description, to query the url
efun <- createFunction(forms[[1]]);
result<-matrix(NA,nrow=dim(data)[1],ncol=9,dimnames=list(1:dim(data)[1],c("IDNO","medication","sex","smoker","age","cholesterol","hdl","systolic","risk_persent")))
# If you had an actual file you could use this for loop, for now you can use the example data
# for(i in 1: dim(data)[1]){
# medication <- 0;#c("0", "1")
# sex <- ifelse(data$gender1[i]==0,"female","male") ;#c("female", "male")
# smoker <- ifelse(data$cig1c[i]<2,0,1);# c("0", "1")
# age <- data$age1c[i];#
# cholesterol <- data$chol1[i];#
# hdl <- data$hdl1[i];#
# systolic <- round(data$sbp1c[i]);#
if(age<20||age>99||systolic>200||systolic<90||cholesterol<130||cholesterol>320||hdl<20||hdl>100||is.na(sex)||is.na(smoker)||is.na(age)||is.na(cholesterol)||is.na(hdl)||is.na(systolic)){
result[i,]<-c(data$IDNO[i],medication,sex,smoker,age,cholesterol,hdl,systolic,NA)
next;
}
# extract webpage by passing required arguments to function
page <- efun(medication = toString(medication), sex = toString(sex), smoker = toString(smoker), age = toString(age), cholesterol = toString(cholesterol), hdl = toString(hdl), systolic = toString(systolic));
#pause the algorithm, so that you do not request too often from the server
Sys.sleep(.1)
# parse webpage and return html tree
doc <- htmlTreeParse(page, asText = T, useInternalNodes = T);
# extract table from the html tree
tab <- readHTMLTable(doc);
result[i,]<-c(data$IDNO[i],medication,sex,smoker,age,cholesterol,hdl,systolic,as.numeric(gsub('[%a-zA-Z ]','',(tab[[1]][11,2]))))
#system.time
# }#end of for loop
write.csv(result,file="MyResults.csv")