问题
How can I loop the rvest::follow_link()
function to scrape linked webpages?
Use Case:
- Identify all Lego Movie cast members
- Follow all Lego Movie cast member links
- Grab a table of each movie (+ year) for all cast members
The selectors I need are below:
library(rvest)
# Parse the Lego Movie title page. read_html() is used in place of the
# deprecated html() parser.
lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")
# Extract the text of the cast / character nodes. Note that this overwrites
# the parsed page with a character vector, so the selectors below cannot be
# applied to `lego_movie` after this point.
lego_movie <- lego_movie %>%
  html_nodes(".itemprop , .character a") %>%
  html_text()
# follow cast links
(".itemprop .itemprop")
# grab tables of all movies and dates for each cast member
(".year_column , b a")
Desired Output:
castMember movie year
Will Arnett Lego 2017
Will Arnett BoJack 2014
Will Arnett Wander 2014
............
Elizabeth Banks Moonbeam 2015
Elizabeth Banks Wet Hot 2015
............
Alison Brie Get Hard 2015
Alison Brie GetaJob 2015
.....etc.....
回答1:
Perhaps something like this could work.
library(rvest)
library(stringr)
library(data.table)
# Parse the movie page and pull the cast member names out of the cast table.
lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")
cast <- lego_movie %>%
  html_nodes("#titleCast .itemprop span") %>%
  html_text()
cast

# A browsing session is needed so follow_link() can navigate from the movie
# page to each actor page by link text.
# NOTE(review): newer rvest releases rename html_session()/follow_link() to
# session()/session_follow_link() -- confirm against the installed version.
s <- html_session("http://www.imdb.com/title/tt1490017/")
cast_movies <- list()

# head() instead of cast[1:3]: indexing past the end of a short cast vector
# would produce NA names and a failing follow_link(NA).
for (i in head(cast, 3)) {
  actorpage <- s %>% follow_link(i) %>% read_html()

  movies <- actorpage %>%
    html_nodes("b a") %>%
    html_text() %>%
    head(10)
  years <- actorpage %>%
    html_nodes("#filmography .year_column") %>%
    html_text() %>%
    head(10) %>%
    str_extract("[0-9]{4}")

  # The two selectors are scoped differently and may return different numbers
  # of nodes; trim both to a common length so every actor's entry is
  # rectangular and can be turned into a data frame / bound by rbindlist().
  n <- min(length(movies), length(years))
  keep <- seq_len(n)
  cast_movies[[i]] <- list(
    movies = movies[keep],
    years = years[keep],
    name = rep(i, n)
  )
}

cast_movies
as.data.frame(cast_movies[[1]])
rbindlist(cast_movies)
回答2:
This is untested, so it may be wrong; go through it step by step and verify each result. I'm not sure how to use follow_link in this context, so instead this builds each actor's URL from the link's href and scrapes that page directly:
library("rvest")
library("stringr")
# Parse the movie page once. read_html() is used in place of the deprecated
# html(), and the already-parsed document is not re-parsed.
lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")

# Select the candidate nodes a single time so that link targets and link
# texts are guaranteed to line up element by element.
nodes <- lego_movie %>% html_nodes(".itemprop , a")
links <- nodes %>% html_attr("href")
links[is.na(links)] <- ""  # nodes without an href (non-anchor .itemprop)
actors <- nodes %>% html_text()

df <- data.frame(name = actors, link = links, stringsAsFactors = FALSE)

# Keep only links whose path starts with "<one char>name", i.e. "/name/...".
df <- df[substring(df$link, 2, 5) == "name", ]

# Clean the names BEFORE dropping empty ones, so links whose text is only
# whitespace do not survive as an actor called "".
df$name <- str_trim(gsub("\\n", "", df$name))
df <- df[df$name != "", ]

# One row per actor, in alphabetical order.
df <- df[order(df$name), ]
df <- df[!duplicated(df$name), ]
#' Scrape the filmography of one cast member
#'
#' Downloads the page at `http://www.imdb.com` + `link`, extracts the text of
#' the `.year_column` and `b a` nodes, and pairs them up as (date, movie).
#'
#' @param name Cast member name; repeated on every returned row.
#' @param link Site-relative link to the cast member's page.
#' @return A data frame with character columns `name`, `date` and `movie`;
#'   zero rows when no matching nodes are found.
get_movies <- function(name, link) {
  url <- paste0("http://www.imdb.com", link)
  movies <- url %>%
    read_html() %>%
    html_nodes(".year_column , b a") %>%
    html_text()

  # take care of random date at top of some actors stuff...
  if (length(movies) %% 2 == 1) {
    movies <- movies[-1]
  }
  movies <- str_trim(gsub("\\n", "", movies))

  # Entries alternate date, movie, date, movie, ... A logical mask (rather
  # than seq(1, n, 2)) also works when nothing was matched and n is 0.
  is_date <- seq_along(movies) %% 2 == 1
  dates <- movies[is_date]
  titles <- movies[!is_date]

  # Build the frame in one call so `name` stays a character column.
  data.frame(
    name = rep(name, length(dates)),
    date = dates,
    movie = titles,
    stringsAsFactors = FALSE
  )
}
# Scrape every cast member, then bind the per-actor tables in a single step.
# seq_len() makes this a no-op when `df` has no rows (1:nrow(df) would
# iterate over 1 and 0), and binding once avoids re-copying the accumulated
# result on every iteration.
per_actor <- lapply(seq_len(nrow(df)), function(i) {
  get_movies(df$name[i], df$link[i])
})
final_df <- if (length(per_actor) > 0) {
  do.call(rbind, per_actor)
} else {
  data.frame()
}
来源:https://stackoverflow.com/questions/28863775/scraping-linked-html-webpages-by-looping-the-rvestfollow-link-function