I want to calculate the average geographical distance between a number of houses per province.
Suppose I have the following data.
df1 <- data.fram
You can use a vectorized version of the haversine distance, such as:
# Vectorized haversine (great-circle) distance between paired coordinates.
#
# Arguments:
#   df_x, df_y  data.frame-like objects that each contain numeric columns
#               "lat" and "lon", expressed in degrees. Row i of df_x is
#               paired with row i of df_y. Other columns are ignored.
#   lat         unused; kept only so that existing calls keep working.
#   r           radius of the sphere. The result is expressed in the same
#               unit as r.
#
# Returns a numeric vector holding one distance per row pair.
# Stops with an error when either input lacks a "lat" or "lon" column.
dist_haversine_for_dfs <- function (df_x, df_y, lat, r = 6378137)
{
  if (!all(c("lat", "lon") %in% names(df_x))) {
    stop("parameter df_x does not have column 'lat' and 'lon'")
  }
  if (!all(c("lat", "lon") %in% names(df_y))) {
    stop("parameter df_y does not have column 'lat' and 'lon'")
  }

  # Convert only the coordinate columns to radians, so that unrelated
  # (possibly non-numeric) columns in the inputs cannot break the arithmetic.
  to_rad <- pi / 180
  lat_x <- df_x[["lat"]] * to_rad
  lon_x <- df_x[["lon"]] * to_rad
  lat_y <- df_y[["lat"]] * to_rad
  lon_y <- df_y[["lon"]] * to_rad

  d_lat <- lat_y - lat_x
  d_lon <- lon_y - lon_x

  h <- sin(d_lat / 2)^2 + cos(lat_x) * cos(lat_y) * sin(d_lon / 2)^2
  # Rounding can push h marginally above 1, which would make sqrt(1 - h) NaN.
  h <- pmin(h, 1)

  2 * atan2(sqrt(h), sqrt(1 - h)) * r
}
Then, using the `data.table` package together with the `arrangements` package
(for faster generation of combinations), you can do the following:
library(data.table)

dt <- data.table(df1)

# Enumerate every unordered pair of houses within each province.
ids <- dt[, {
  comb_mat <- arrangements::combinations(x = house, k = 2)
  list(house_x = comb_mat[, 1],
       house_y = comb_mat[, 2])
}, by = province]

# Attach the coordinates of both houses of each pair.
# NOTE(review): dt is subset by row position here, so this assumes that the
# house ids coincide with the row numbers of dt -- confirm against df1.
jdt <- cbind(ids,
             dt[ids$house_x, .(lon_x = lon, lat_x = lat)],
             dt[ids$house_y, .(lon_y = lon, lat_y = lat)])

# The column names must match those created by cbind() above (lon_x, lat_x,
# lon_y, lat_y).
jdt[, dist := dist_haversine_for_dfs(
  df_x = jdt[, .(lon = lon_x, lat = lat_x)],
  df_y = jdt[, .(lon = lon_y, lat = lat_y)]
)]

# Average pairwise distance per province.
jdt[, .(mean_dist = mean(dist)), by = province]
which outputs
province mean_dist
1: 1 15379.21
2: 2 793612.04