问题
I have a data.table
containing some state name abbreviations and county names. I want to get approx. coordinates from ggplot2::map_data('county')
for each row.
I can do this sequentially with multiple lines of code using :=
but I would like to make only one function call.
Below is what I've tried:
Data:
library(data.table)
library(ggplot2)
> dput(dt[1:20, .(state, county, prime_mover)])
structure(list(state = c("AZ", "AZ", "CA", "CA", "CA", "CT",
"FL", "IN", "MA", "MA", "MA", "MN", "NJ", "NJ", "NJ", "NY", "NC",
"SC", "TN", "TX"), county = c("Maricopa", "Maricopa", "Los Angeles",
"Orange", "Los Angeles", "Fairfield", "Hillsborough", "Morgan",
"Barnstable", "Nantucket", "Essex", "Dakota", "Cape May", "Salem",
"Middlesex", "Kings", "Buncombe", "Anderson", "Shelby", "Tarrant"
), prime_mover = c("GT", "GT", "CT", "CT", "CT", "CT", "GT",
"CT", "GT", "GT", "GT", "GT", "CT", "GT", "CT", "GT", "CT", "CT",
"CT", "CT")), .Names = c("state", "county", "prime_mover"), row.names = c(NA,
-20L), class = c("data.table", "data.frame"))
coord_data <- as.data.table(map_data('county'))
CODE:
getCoords <- function(state, county){
prov <- state.name[grep(state, state.abb)]
ck <- coord_data[region == tolower(prov) & subregion == tolower(county),
.(lon = mean(long), lat = mean(lat))]
return(list(unname(unlist(ck))))
}
# Testing getCoords
> getCoords('AZ', 'Maricopa')
[[1]]
[1] -111.88668 33.58126
ERRORS:
> dt[, c('lon', 'lat') := lapply(.SD, getCoords), .SDcols = c('state', 'county')]
Error in tolower(county) : argument "county" is missing, with no default
In addition: Warning message:
In grep(state, state.abb) :
argument 'pattern' has length > 1 and only the first element will be used
I've seen the following answers but am not able to quite get what I'm doing wrong:
- Loop through data.table and create new columns basis some condition
- R data.table create new columns with standard names
- Add new columns to a data.table containing many variables
- Add multiple columns to R data.table in one function call?
- Assign multiple columns using := in data.table, by group
- Dynamically create new columns in data.table
I am able to achieve what I want by other means (multiple lines, dplyr
or even base R) but I prefer to use the data.table
approach for this.
回答1:
I would go with two update joins:
library(data.table)
# aggregate coordinates
cols <- c("long", "lat")
agg_coord <- coord_data[, lapply(.SD, mean), .SDcols = cols, by = .(region, subregion)]
# coerce to data.table by reference
setDT(dt)[
# 1st update join to append region/state.name
.(state = state.abb, state.name = tolower(state.name)),
on = "state", region := state.name][
# append subregion
, subregion := tolower(county)][
# 2nd update join to append coordinates
agg_coord, on = .(region, subregion), (cols) := .(long, lat)][
# remove helper columns
, c("region", "subregion") := NULL]
# print updated dt
dt[]
state county prime_mover long lat 1: AZ Maricopa GT -111.88668 33.58126 2: AZ Maricopa GT -111.88668 33.58126 3: CA Los Angeles CT -118.29410 34.06683 4: CA Orange CT -117.73632 33.69611 5: CA Los Angeles CT -118.29410 34.06683 6: CT Fairfield CT -73.35118 41.29633 7: FL Hillsborough GT -82.47527 27.87826 8: IN Morgan CT -86.49791 39.52721 9: MA Barnstable GT -70.21598 41.79520 10: MA Nantucket GT -70.05841 41.29880 11: MA Essex GT -70.98384 42.64042 12: MN Dakota GT -93.04962 44.70344 13: NJ Cape May CT -74.80790 39.15476 14: NJ Salem GT -75.36532 39.58720 15: NJ Middlesex CT -74.42345 40.45429 16: NY Kings GT -73.95052 40.64792 17: NC Buncombe CT -82.50883 35.62002 18: SC Anderson CT -82.61956 34.57094 19: TN Shelby CT -89.99297 35.22379 20: TX Tarrant CT -97.29396 32.79856
来源:https://stackoverflow.com/questions/51931636/create-new-data-table-columns-based-on-other-columns