问题
set.seed(3)
library(dplyr)

# Example data: six AD groups, each with two Poisson-distributed observations
# (`_1` drawn with mean 10, `_2` drawn with mean 9), for four measures.
# Columns are generated in the order AD1_1, AD1_2, AD2_1, ... so the random
# draws are consumed in that same sequence.
ad_cols <- paste0("AD", rep(1:6, each = 2), "_", rep(1:2, times = 6))
ad_means <- rep(c(10, 9), times = 6)
ad_counts <- lapply(ad_means, function(lambda) rpois(4, lambda))
names(ad_counts) <- ad_cols

dat <- tibble(
  Measure = c("Height", "Weight", "Width", "Length"),
  !!!ad_counts
)
Suppose I have data that looks like this. For each AD group, I wish to calculate the difference between its two paired columns (the ones suffixed `_1` and `_2`), i.e., AD1diff, AD2diff, AD3diff, and so on.
Instead of writing
# Manual approach: one hand-written subtraction per AD pair.
dat %>%
mutate(AD1diff = AD1_1 - AD1_2,
AD2diff = AD2_1 - AD2_2,
...) # `...` is a placeholder for the remaining AD pairs, not runnable code
what would be an efficient way to write this?
回答1:
One dplyr option could be:
# Subtract every "_2" column from its matching "_1" column in one step.
# Each across() call returns a data frame; `-` pairs their columns by
# position, so this relies on the "_1" and "_2" columns appearing in the same
# AD order. The result keeps the names produced by the left-hand across().
# `.fns = identity` is supplied explicitly because calling across() without
# `.fns` is deprecated in dplyr >= 1.1.0.
dat %>%
  mutate(
    across(ends_with("_1"), .fns = identity, .names = "{.col}_diff") -
      across(ends_with("_2"), .fns = identity)
  ) %>%
  # "AD1_1_diff" -> "AD1_diff": drop the observation suffix from the new names.
  rename_with(~ sub("_\\d+", "", .), ends_with("_diff"))
Measure AD1_1 AD1_2 AD2_1 AD2_2 AD3_1 AD3_2 AD4_1 AD4_2 AD5_1 AD5_2 AD6_1 AD6_2 AD1_diff AD2_diff AD3_diff AD4_diff AD5_diff AD6_diff
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 Height 6 10 10 3 12 8 7 5 7 5 8 9 -4 7 4 2 2 -1
2 Weight 8 9 13 6 14 7 8 7 13 11 10 9 -1 7 7 1 2 1
3 Width 10 9 11 5 12 8 7 11 9 5 5 6 1 6 4 -4 4 -1
4 Length 8 9 8 7 8 13 8 7 6 11 14 6 -1 1 -5 1 -5 8
回答2:
The "tidy" way to do this would be to convert your data from wide to long, do a grouped subtraction, and then go back to wide format:
library(tidyr)
# Reshape to long format: one row per Measure / AD group / observation.
# Column names such as "AD1_2" are split at "_" into group = "AD1", obs = "2".
dat_long <- dat %>%
  pivot_longer(
    cols = starts_with("AD"),
    names_to = c("group", "obs"),
    names_sep = "_"
  )
head(dat_long)
# # A tibble: 48 x 4
# Measure group obs value
# <chr> <chr> <chr> <int>
# 1 Height AD1 1 6
# 2 Height AD1 2 10
# 3 Height AD2 1 10
# 4 Height AD2 2 3
# 5 Height AD3 1 12
# 6 Height AD3 2 8
# For every Measure / AD group, subtract observation 2 from observation 1,
# then spread the groups back into columns named "<group>diff".
dat_long %>%
  group_by(Measure, group) %>%
  # `obs` is a character column (it was parsed from the column names), so it
  # is compared against strings. `.groups = "drop"` returns an ungrouped
  # tibble instead of one that is still grouped by Measure.
  summarize(
    diff = value[obs == "1"] - value[obs == "2"],
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = "group", values_from = "diff") %>%
  rename_with(.fn = ~ paste0(., "diff"), .cols = starts_with("AD"))
# # A tibble: 4 x 7
#   Measure AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
#   <chr>     <int>   <int>   <int>   <int>   <int>   <int>
# 1 Height       -4       7       4       2       2      -1
# 2 Length       -1       1      -5       1      -5       8
# 3 Weight       -1       7       7       1       2       1
# 4 Width         1       6       4      -4       4      -1
回答3:
Here is a data.table option:
# Add one "<group>diff" column per AD group to `dat` by reference.
setDT(dat)

# Only the paired columns ("AD1_1", "AD1_2", ...) take part, so re-running
# this after the diff columns exist does not treat them as inputs.
pair_cols <- grep("_\\d+$", names(dat), value = TRUE)

# Group label for each paired column ("AD1_2" -> "AD1"). The factor levels are
# fixed to first-appearance order: split.default() returns its pieces in level
# order, and the default (sorted) levels would put e.g. "AD10" before "AD2",
# misaligning the new column names with their values.
ad_group <- sub("_\\d+$", "", pair_cols)
ad_group <- factor(ad_group, levels = unique(ad_group))

dat[
  ,
  paste0(levels(ad_group), "diff") := lapply(
    split.default(.SD, ad_group),
    # Each piece holds the two columns of one group: first minus second.
    function(x) do.call("-", x)
  ),
  .SDcols = pair_cols
]
which gives
> dat
Measure AD1_1 AD1_2 AD2_1 AD2_2 AD3_1 AD3_2 AD4_1 AD4_2 AD5_1 AD5_2 AD6_1
1: Height 6 10 10 3 12 8 7 5 7 5 8
2: Weight 8 9 13 6 14 7 8 7 13 11 10
3: Width 10 9 11 5 12 8 7 11 9 5 5
4: Length 8 9 8 7 8 13 8 7 6 11 14
AD6_2 AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
1: 9 -4 7 4 2 2 -1
2: 9 -1 7 7 1 2 1
3: 6 1 6 4 -4 4 -1
4: 6 -1 1 -5 1 -5 8
or
# Return a new table holding only Measure and one "<group>diff" column per AD
# group; `dat` itself gains no columns.
setDT(dat)

# Only the paired columns ("AD1_1", "AD1_2", ...) take part, so diff columns
# that an earlier in-place step may have added to `dat` are ignored.
pair_cols <- grep("_\\d+$", names(dat), value = TRUE)

# Group label for each paired column ("AD1_2" -> "AD1"). The factor levels are
# fixed to first-appearance order: split.default() returns its pieces in level
# order, and the default (sorted) levels would put e.g. "AD10" before "AD2",
# misaligning the result names with their values.
ad_group <- sub("_\\d+$", "", pair_cols)
ad_group <- factor(ad_group, levels = unique(ad_group))

dat[
  ,
  c(
    .(Measure = Measure),
    setNames(
      lapply(
        split.default(.SD, ad_group),
        # Each piece holds the two columns of one group: first minus second.
        function(x) do.call("-", x)
      ),
      paste0(levels(ad_group), "diff")
    )
  ),
  .SDcols = pair_cols
]
gives
Measure AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
1: Height -4 7 4 2 2 -1
2: Weight -1 7 7 1 2 1
3: Width 1 6 4 -4 4 -1
4: Length -1 1 -5 1 -5 8
回答4:
Use the tidyverse package tidyr to rearrange your data before mutating:
require(dplyr)
require(tidyr)
#> Loading required package: tidyr
First, use tidyr::pivot_longer on the data frame so that there's a separate row for every column:
# Long format: every AD column becomes rows, with its name split at "_" into
# the AD group ("AD_number") and the observation index ("observation").
new_dat <- dat %>%
  pivot_longer(
    cols = starts_with("AD"),
    names_to = c("AD_number", "observation"),
    names_sep = "_"
  ) %>%
  arrange(AD_number, Measure, observation)

head(new_dat, n = 9)
#> # A tibble: 9 x 4
#> Measure AD_number observation value
#> <chr> <chr> <chr> <int>
#> 1 Height AD1 1 6
#> 2 Height AD1 2 10
#> 3 Length AD1 1 8
#> 4 Length AD1 2 9
#> 5 Weight AD1 1 8
#> 6 Weight AD1 2 9
#> 7 Width AD1 1 10
#> 8 Width AD1 2 9
#> 9 Height AD2 1 10
Then, use tidyr::pivot_wider (the functional opposite of pivot_longer) to make a separate column for each value in observation. This will be very compatible with the upcoming mutate operation.
# Put the two observations of each Measure / AD group side by side as
# `value_1` and `value_2`.
new_dat <- new_dat %>%
  pivot_wider(
    names_from = observation,
    names_prefix = "value_",
    values_from = value
  )

head(new_dat, n = 5)
#> # A tibble: 5 x 4
#> Measure AD_number value_1 value_2
#> <chr> <chr> <int> <int>
#> 1 Height AD1 6 10
#> 2 Length AD1 8 9
#> 3 Weight AD1 8 9
#> 4 Width AD1 10 9
#> 5 Height AD2 10 3
Finally, mutate the data:
# Paired difference: first observation minus second.
new_dat <- new_dat %>%
  mutate(diff = value_1 - value_2)

head(new_dat, n = 4)
#> # A tibble: 4 x 5
#> Measure AD_number value_1 value_2 diff
#> <chr> <chr> <int> <int> <int>
#> 1 Height AD1 6 10 -4
#> 2 Length AD1 8 9 -1
#> 3 Weight AD1 8 9 -1
#> 4 Width AD1 10 9 1
Created on 2021-01-22 by the reprex package (v0.3.0)
Getting back to your original data format is possible, but it might not make the data any easier to work with:
# Rebuild the original wide layout, now with an extra "<AD>_diff" column per
# group: AD1_1, AD1_2, AD1_diff, AD2_1, ...
new_dat %>%
  # Shorten the value columns to "1" / "2" so the glued names match the
  # original "AD1_1" / "AD1_2" pattern.
  rename(`1` = value_1, `2` = value_2) %>%
  pivot_wider(
    names_from = AD_number,
    values_from = c(`1`, `2`, diff),
    names_glue = "{AD_number}_{.value}"
  ) %>%
  # Measure first, then every other column in alphabetical order.
  select(Measure, all_of(sort(names(.))))
来源:https://stackoverflow.com/questions/65849383/finding-the-differences-of-paired-columns-using-dplyr