Scrape tables by passing multiple search requests using R

问题

I'm trying to run multiple searches on a website (https://npiregistry.cms.hhs.gov/registry/) using first and last names, and then combine the output into a single data frame.

I figured out that this is similar to what is described in "How to automate multiple requests to a web search form using R", but for some reason I keep getting the error "Error: failed to load external entity".

Below is the code that I'm using to pull records

# Names to look up: position i of `fn` pairs with position i of `ln`.
fn <- c("HARVEY", "HARVEY")
ln <- c("BIDWELL", "ADELSON")

# One row per person, columns `fn` (first name) and `ln` (last name).
mydf <- data.frame(fn = fn, ln = ln)


# Build a registry search URL from a first/last name pair and read the HTML
# tables found at that URL.
#
# df: an object whose `fn` and `ln` elements (accessed with `$`) hold the first
#     and last name to search for.
# Returns: the value of readHTMLTable(url). Because the last expression is an
#     assignment, the value is returned invisibly.
get_data = function(df){

  # Loaded inside the function, so XML is attached on every call.
  library(XML);

  # NOTE(review): this root uses plain http://. The question reports
  # "failed to load external entity" for this code, and the answer that
  # follows attributes it to the call not going to https://.
  root = 'http://npiregistry.cms.hhs.gov/'


  # Concatenate root, path and the two query parameters with no separator.
  u = paste(root,'registry/search-results-table?','first_name=', df$fn, '&last_name=', 
            df$ln, sep = "");

  # Percent-encode the assembled URL as a whole.
  url  = URLencode(u);

  # Parse the HTML tables at `url`; this is the function's (invisible) result.
  data = readHTMLTable(url);

}


library(plyr)
# Call get_data() once per row of mydf (margin 1) and let adply() combine the
# per-row results.
mydata <- adply(.data = mydf, .margins = 1, .fun = get_data)

Thanks for the help

回答1:

The call needs to go to https: and not http:. I also removed the plyr library and used just base R for the iteration:

library(rvest)
# First and last names to search for, paired by position.
fn <- c("HARVEY", "HARVEY")
ln <- c("BIDWELL", "ADELSON")
# Lookup table: one row per person, columns `fn` and `ln`.
mydf <- data.frame(fn = fn, ln = ln)

# Fetch the first results table from the NPI registry for one name pair.
#
# df: a first-name / last-name pair with the first name in position 1 and the
#     last name in position 2 (for example one row of `mydf` as handed over by
#     apply(), or a one-row data frame).
# Returns: the first <table> element of the search-results page, converted to
#     a data frame.
get_data <- function(df) {
  root <- "https://npiregistry.cms.hhs.gov/"

  # `[[` extracts the bare value from a character vector and from a one-row
  # data frame alike; as.character() guards against factor columns.
  first_name <- as.character(df[[1]])
  last_name <- as.character(df[[2]])

  # Encode each value on its own with reserved = TRUE so that characters such
  # as "&", "=" or "#" inside a name cannot break the query string. Encoding
  # the assembled URL as a whole would leave those characters untouched.
  url <- paste0(
    root, "registry/search-results-table?",
    "first_name=", URLencode(first_name, reserved = TRUE),
    "&last_name=", URLencode(last_name, reserved = TRUE)
  )

  page <- read_html(url)

  # Keep only the first table on the page; html_table() on a node set
  # returns a list of data frames.
  newresult <- html_nodes(page, "table")[1] %>% html_table()

  # Flatten the one-element list into a plain data frame and return it
  # visibly.
  as.data.frame(newresult)
}

# Run get_data() on every row of mydf; each row arrives as a character vector.
mydata <- apply(mydf, MARGIN = 1, FUN = get_data)
# mydata is a list of data frames; rbind them all into a single data frame.
finalanswer <- do.call(what = rbind, args = mydata)
# finalanswer still needs some clean up.

回答2:

It has an unauthenticated API… why not use it?

library(httr)
library(jsonlite)
library(tidyverse)

# Look up one provider by name through the NPI registry JSON API.
#
# f_name: first name to search for.
# l_name: last name to search for.
# Returns: the `results` element of the parsed JSON response as a tibble.
# Stops with an error if the HTTP request reports a failure status.
npi_query <- function(f_name, l_name) {
  response <- GET(
    "https://npiregistry.cms.hhs.gov/api/",
    query = list(first_name = f_name, last_name = l_name)
  )

  # Turn HTTP error statuses into R errors before touching the body.
  stop_for_status(response)

  # Read the body as UTF-8 text, then parse it, flattening nested records
  # into columns.
  body_text <- content(response, as = "text", encoding = "UTF-8")
  parsed <- fromJSON(body_text, flatten = TRUE)

  as_tibble(parsed$results)
}

# Name pairs to look up, one row per person. tibble() replaces the deprecated
# data_frame() constructor.
lkp <- tibble(
  fn = c("HARVEY", "HARVEY"),
  ln = c("BIDWELL", "ADELSON")
)

# Query the API once per (fn, ln) pair, row-bind the results into a single
# tibble, and print a compact overview of its columns.
map2_df(lkp$fn, lkp$ln, npi_query) %>%
  glimpse()
## Observations: 2
## Variables: 19
## $ taxonomies             <list> [<MA, 207R00000X, TRUE, 36065, Interna...
## $ addresses              <list> [<c("DORCHESTER", "DORCHESTER"), c("23...
## $ created_epoch          <int> 1152230400, 1168992000
## $ identifiers            <list> [[], []]
## $ other_names            <list> [[], []]
## $ number                 <int> 1336171859, 1205988342
## $ last_updated_epoch     <int> 1183852800, 1183852800
## $ enumeration_type       <chr> "NPI-1", "NPI-1"
## $ basic.status           <chr> "A", "A"
## $ basic.credential       <chr> "M.D.", "DMD"
## $ basic.first_name       <chr> "HARVEY", "HARVEY"
## $ basic.last_name        <chr> "BIDWELL", "ADELSON"
## $ basic.middle_name      <chr> "W", "JEROME"
## $ basic.name             <chr> "BIDWELL HARVEY", "ADELSON HARVEY"
## $ basic.gender           <chr> "M", "M"
## $ basic.sole_proprietor  <chr> "NO", "NO"
## $ basic.last_updated     <chr> "2007-07-08", "2007-07-08"
## $ basic.enumeration_date <chr> "2006-07-07", "2007-01-17"
## $ basic.name_prefix      <chr> NA, "DR."

来源：https://stackoverflow.com/questions/43528714/scrape-tables-by-passing-multiple-search-requests-using-r

标签

web-scraping