问题
I have data in a long format which will be grouped by geographies. I want to calculate the difference in each group between one of the variables of interest against all the other variables of interest. I could not figure out how to do this efficiently in a single data table statement so did a workaround which also introduced some new errors along the way (I fixed those with more workarounds but help here would also be appreciated!).
I then want to pass the resulting columns into a ggplot function however cannot get the recommended methods to work so am using a deprecated method.
library(data.table)
library(ggplot2)
set.seed(1)
results <- data.table(geography = rep(1:4, each = 4),
variable = rep(c("alpha", "bravo", "charlie", "delta"), 4),
statistic = rnorm(16) )
> results[c(1:4,13:16)]
geography variable statistic
1: 1 alpha -0.62645381
2: 1 bravo 0.18364332
3: 1 charlie -0.83562861
4: 1 delta 1.59528080
5: 4 alpha -0.62124058
6: 4 bravo -2.21469989
7: 4 charlie 1.12493092
8: 4 delta -0.04493361
base_variable <- "alpha"
From this point I ideally want to write a simple piece of code that groups by the geographies, then returns this table in the same format but with the statistic for each variable being (base_variable - variable) in each group.
I could not figure out how to do this so my workaround is below, any advice on a better method is appreciated.
# Convert to a wide table so we can do the subtraction by rows
results_wide <- dcast(results, geography ~ variable, value.var = "statistic")
geography alpha bravo charlie delta
1: 1 -0.6264538 0.1836433 -0.8356286 1.59528080
2: 2 0.3295078 -0.8204684 0.4874291 0.73832471
3: 3 0.5757814 -0.3053884 1.5117812 0.38984324
4: 4 -0.6212406 -2.2146999 1.1249309 -0.04493361
this_is_a_hack <- as.data.table(lapply(results_wide[,-1], function(x) results_wide[, ..base_variable] - x))
alpha.alpha bravo.alpha charlie.alpha delta.alpha
1: 0 -0.8100971 0.2091748 -2.2217346
2: 0 1.1499762 -0.1579213 -0.4088169
3: 0 0.8811697 -0.9359998 0.1859381
4: 0 1.5934593 -1.7461715 -0.5763070
Names are now messed up and we don't have a geography. Why are the names like this? Also, need to re-add geography.
this_is_a_hack[, geography := results_wide[, geography] ]
normalise_these_names <- colnames(this_is_a_hack)
#Regex approach. Hacky and situational.
new_names <- sub("\\.(.*)", "", normalise_these_names[normalise_these_names != "geography"] )
normalise_these_names[normalise_these_names != "geography"] <- new_names
#Makes use of the fact that geographies will appear last in the data.table, not generalisable approach.
colnames(this_is_a_hack) <- normalise_these_names
I dont need the base variable anymore as all the values are zero so I try to drop it however I cant seem to do this the usual way I do it:
this_is_a_hack[, ..base_variable := NULL]
Warning message:
In `[.data.table`(this_is_a_hack, , `:=`(..base_variable, NULL)) :
Column '..base_variable' does not exist to remove
library(dplyr)
this_is_a_hack <- select(this_is_a_hack, -base_variable)
final_result <- melt(this_is_a_hack, id.vars = "geography")
> final_result[c(1:4,9:12)]
geography variable value
1: 1 bravo -0.8100971
2: 2 bravo 1.1499762
3: 3 bravo 0.8811697
4: 4 bravo 1.5934593
5: 1 delta -2.2217346
6: 2 delta -0.4088169
7: 3 delta 0.1859381
8: 4 delta -0.5763070
Data is now ready to be visualised. I'm trying to pass these variables into a plotting function however referencing data.table columns seems to be difficult compared to dataframes. Apparently you should be using quosures to pass data.table variables into functions however this just errored out so I'm using the deprecated 'aes_string' function instead - help on this is also appreciated.
plott <- function(dataset, varx, vary, fillby) {
# varx <- ensym(varx)
# vary <- ensym(vary)
# vary <- ensym(fillby)
ggplot(dataset,
aes_string(x = varx, y = vary, color = fillby)) +
geom_point()
}
plott(dataset = final_result,
varx = "geography",
vary = "value",
fillby = "variable")
# Error I get when I try the ensym(...) method in the function:
Don't know how to automatically pick scale for object of type name. Defaulting to continuous. (this message happens 3 times)
Error: Aesthetics must be valid data columns. Problematic aesthetic(s): x = varx, y = vary, colour = fillby.
Did you mistype the name of a data column or forget to add stat()?
回答1:
An option is to subset the 'statistic' by creating a logical condition based on 'variable' with 'base_variable' element grouped by 'geography'
results[, .(variable, diff = statistic - statistic[variable == base_variable]),
by = geography][variable != base_variable]
# geography variable diff
# 1: 1 bravo 0.8100971
# 2: 1 charlie -0.2091748
# 3: 1 delta 2.2217346
# 4: 2 bravo -1.1499762
# 5: 2 charlie 0.1579213
# 6: 2 delta 0.4088169
# 7: 3 bravo -0.8811697
# 8: 3 charlie 0.9359998
# 9: 3 delta -0.1859381
#10: 4 bravo -1.5934593
#11: 4 charlie 1.7461715
#12: 4 delta 0.5763070
回答2:
This kind of thing can also be done with joins. In my experience the "subset variables + grouping" approach is usually faster for smaller tables (like this example), and the join approach is faster when you have millions of rows.
results[variable != base_variable
][results[variable == base_variable], on = 'geography',
diff := statistic - i.statistic][]
# geography variable statistic diff
# 1: 1 bravo 0.18364332 0.8100971
# 2: 1 charlie -0.83562861 -0.2091748
# 3: 1 delta 1.59528080 2.2217346
# 4: 2 bravo -0.82046838 -1.1499762
# 5: 2 charlie 0.48742905 0.1579213
# 6: 2 delta 0.73832471 0.4088169
# 7: 3 bravo -0.30538839 -0.8811697
# 8: 3 charlie 1.51178117 0.9359998
# 9: 3 delta 0.38984324 -0.1859381
# 10: 4 bravo -2.21469989 -1.5934593
# 11: 4 charlie 1.12493092 1.7461715
# 12: 4 delta -0.04493361 0.5763070
Two benchmarks
library(microbenchmark)
microbenchmark(
use_group =
results[, .(variable, diff = statistic - statistic[variable == base_variable]),
by = geography][variable != base_variable],
use_join =
results[variable != base_variable
][results[variable == base_variable], on = 'geography',
diff := statistic - i.statistic][],
times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# use_group 1.624204 1.801434 2.143670 2.212306 2.391793 2.654357 10 a
# use_join 6.297842 6.808610 7.626004 7.729634 8.337635 8.708916 10 b
results <- results[rep(1:.N, 1e4)][, geography := rleid(geography)]
microbenchmark(
use_group =
results[, .(variable, diff = statistic - statistic[variable == base_variable]),
by = geography][variable != base_variable],
use_join =
results[variable != base_variable
][results[variable == base_variable], on = 'geography',
diff := statistic - i.statistic][],
times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# use_group 97.42187 106.80935 132.42537 120.64893 143.03045 208.1996 10 b
# use_join 19.88511 21.86214 26.22012 25.82972 29.29885 36.0853 10 a
来源:https://stackoverflow.com/questions/57794449/data-table-apply-function-over-groups-with-reference-to-set-value-in-each-group