Scraping linked HTML webpages by looping the rvest::follow_link() function

前提是你 提交于 2019-12-03 08:45:43

Perhaps something like this could work.

library(rvest)
library(stringr)
library(data.table)
# Scrape the cast names from the IMDb title page of "The Lego Movie".
movie_url <- "http://www.imdb.com/title/tt1490017/"
lego_movie <- read_html(movie_url)
cast <- lego_movie %>%
    html_nodes("#titleCast .itemprop span") %>%
    html_text()
cast

# Fail early with a readable message if the selector matched nothing
# (for example because the page layout changed), instead of failing later
# inside follow_link().
if (length(cast) == 0) {
    stop("No cast names matched '#titleCast .itemprop span'", call. = FALSE)
}

# A session is needed so that links on the title page can be followed.
s <- html_session(movie_url)

# One list entry per actor, keyed by the actor's name.
cast_movies <- list()

# head() instead of cast[1:3]: indexing past the end of a short vector pads
# it with NA, and follow_link(NA) cannot match any link.
for (i in head(cast, 3)) {
    # Follow the link whose text contains the actor's name.
    actorpage <- s %>% follow_link(i) %>% read_html()
    movies <- actorpage %>%
        html_nodes("b a") %>%
        html_text() %>%
        head(10)
    years <- actorpage %>%
        html_nodes("#filmography .year_column") %>%
        html_text() %>%
        head(10) %>%
        str_extract("[0-9]{4}")
    # The two selectors are independent, so they may return a different
    # number of nodes. Trim both to the common length so every entry is
    # rectangular and can be converted by as.data.frame() / rbindlist().
    # NOTE(review): titles and years are paired by position only - verify
    # against the live page that the two selectors line up.
    n <- min(length(movies), length(years))
    cast_movies[[i]] <- list(
        movies = movies[seq_len(n)],
        years = years[seq_len(n)],
        name = rep(i, n)
    )
}

cast_movies
as.data.frame(cast_movies[[1]])
# Stack the per-actor lists into a single data.table.
rbindlist(cast_movies)

This is untested, so it may be wrong. I'd go through it step by step and verify that each step is correct. I'm not sure how to use `follow_link()` in this context, so this version builds the actor URLs directly from the links on the page instead. Here's what I came up with:

library("rvest")
library("stringr")
# Download and parse the title page once. read_html() replaces the
# deprecated html(), and the parsed document is reused rather than being
# passed through the parser a second time.
lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")

# Select the candidate nodes once so that links and names stay aligned.
nodes <- lego_movie %>%
  html_nodes(".itemprop , a")

# href of every node; nodes without an href become "".
links <- nodes %>%
  html_attr("href")
links[is.na(links)] <- ""

# Visible text of the same nodes.
actors <- nodes %>%
  html_text()

df <- data.frame(name = actors, link = links, stringsAsFactors = FALSE)
# Keep only links whose characters 2-5 spell "name" (e.g. "/name/...").
df <- subset(df, substring(link, 2, 5) == "name")
# Clean the names before filtering, so that entries consisting only of
# newlines or whitespace are dropped along with the truly empty ones.
df$name <- gsub("\\n", "", df$name)
df$name <- str_trim(df$name)
df <- subset(df, name != "")
# Sort by name and keep the first row for each distinct name.
df <- df[order(df$name), ]
df <- subset(df, !duplicated(df$name))

# Scrape the filmography listed on one actor's IMDb page.
#
# name: the actor's name; repeated in the `name` column of the result.
# link: site-relative path of the actor's page; appended to
#       "http://www.imdb.com".
#
# Returns a data.frame with character columns `name`, `date` and `movie`,
# one row per (date, movie) pair; zero rows if nothing usable was matched.
# NOTE(review): assumes the matched nodes alternate date, title, date,
# title, ... - verify against the live page.
get_movies <- function(name, link){
  url <- paste0("http://www.imdb.com", link)
  movies <- url %>%
    read_html() %>%
    html_nodes(".year_column , b a") %>%
    html_text()
  # take care of random date at top of some actors stuff...
  if (length(movies) %% 2 == 1) {
    movies <- movies[-1]
  }
  movies <- str_trim(gsub("\\n", "", movies))
  # Index of the second element of each pair. seq_len() yields an empty
  # index when no nodes are left, whereas seq(1, 0, 2) would raise an error.
  pair_end <- seq_len(length(movies) %/% 2) * 2
  data.frame(name = rep(name, length(pair_end)),
             date = movies[pair_end - 1],
             movie = movies[pair_end],
             stringsAsFactors = FALSE)
}

# Scrape each actor's filmography, then bind all tables in a single call.
# seq_len() gives an empty sequence when df has no rows (1:nrow(df) would
# iterate over c(1, 0)), and binding once avoids re-copying the growing
# result on every iteration.
movie_tables <- lapply(seq_len(nrow(df)), function(i) {
  get_movies(df$name[i], df$link[i])
})
# The leading empty data.frame keeps final_df a data.frame even when
# movie_tables is empty.
final_df <- do.call(rbind, c(list(data.frame()), movie_tables))
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!