问题
I am trying to scrape some hotel reviews from TripAdvisor. The code below gives me all the reviews for the Giant's Causeway from all pages. Yet when I add the quote, rating and date fields (the three commented-out blocks in the code), I get only the first page of reviews. To reiterate, without these 3 items I get all the reviews, from the 1st page to the 60th, etc. Can someone help me?
#Load library
library(tm)
library(stringr)
library(rvest)
# Scrape the id and text of the Giant's Causeway reviews from TripAdvisor,
# one results page per request, and collect them in the data frame `df`
# (columns `id` and `review`).
page_count <- 500L   # number of result pages requested
offset_step <- 10L   # the "-or<offset>-" URL component advances by 10 per page

first_page_url <- "https://www.tripadvisor.co.uk/Attraction_Review-g209948-d189773-Reviews-Giant_s_Causeway-Bushmills_County_Antrim_Northern_Ireland.html"

# Preallocate one slot per page and bind once at the end instead of calling
# rbind() on every iteration, which copies the whole data frame each time.
pages <- vector("list", page_count)

for (i in seq_len(page_count)) {
  # Integer arithmetic keeps the offset from ever being formatted in
  # scientific notation when it is pasted into the URL.
  offset <- (i - 1L) * offset_step

  # The first page has no offset component in its URL.
  if (offset == 0L) {
    url <- first_page_url
  } else {
    url <- paste0(
      "https://www.tripadvisor.com/Attraction_Review-g209948-d189773-Reviews-or",
      offset,
      "-Giant_s_Causeway-Bushmills_County_Antrim_Northern_Ireland.html#REVIEWS"
    )
  }

  # One node per review on the page.
  reviews <- url %>%
    read_html() %>%
    html_nodes("#REVIEWS .innerBubble")

  # html_node() (singular) returns exactly one match, or NA, per review, so
  # `id` and `review` always have the same length.
  id <- reviews %>%
    html_node(".quote a") %>%
    html_attr("id")

  review <- reviews %>%
    html_node(".entry .partial_entry") %>%
    html_text()

  # Extra fields, disabled by the author (see the question text):
  # quote <- reviews %>%
  #   html_node(".quote span") %>%
  #   html_text()
  # rating <- reviews %>%
  #   html_node(".rating span") %>%
  #   html_attr("class") %>%
  #   gsub("ui_bubble_rating bubble_", "", .) %>%
  #   gsub("0", "", .) %>%
  #   as.integer()
  # date <- reviews %>%
  #   html_node(".rating .ratingDate") %>%
  #   html_attr("title") %>%
  #   strptime("%d %b %Y") %>%
  #   as.POSIXct()

  pages[[i]] <- data.frame(id, review, stringsAsFactors = FALSE)
}

# Stack the per-page data frames into the final result.
df <- do.call(rbind, pages)
来源:https://stackoverflow.com/questions/48301436/scraping-issues-from-tripadvisor