Finding the differences of paired-columns using dplyr

问题

set.seed(3)
library(dplyr)
dat <- tibble(Measure = c("Height","Weight","Width","Length"),
             AD1_1= rpois(4,10),
             AD1_2= rpois(4,9),
             AD2_1= rpois(4,10),
             AD2_2= rpois(4,9),
             AD3_1= rpois(4,10),
             AD3_2= rpois(4,9),
             AD4_1= rpois(4,10),
             AD4_2= rpois(4,9),
             AD5_1= rpois(4,10),
             AD5_2= rpois(4,9),
             AD6_1= rpois(4,10),
             AD6_2= rpois(4,9))

Suppose I have data that looks like this. I wish to calculate the difference for each AD, paired with underscored number, i.e., AD1diff, AD2diff,AD3diff.

Instead of writing

dat %>%
mutate(AD1diff = AD1_1 - AD1_2,
       AD2diff = AD2_1 - AD2_2,
...)

what would be an efficient way to write this?

回答1:

One dplyr option could be:

dat %>%
 mutate(across(ends_with("_1"), .names = "{col}_diff") - across(ends_with("_2"))) %>%
 rename_with(~ sub("_\\d+", "", .), ends_with("_diff"))

  Measure AD1_1 AD1_2 AD2_1 AD2_2 AD3_1 AD3_2 AD4_1 AD4_2 AD5_1 AD5_2 AD6_1 AD6_2 AD1_diff AD2_diff AD3_diff AD4_diff AD5_diff AD6_diff
  <chr>   <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>    <int>    <int>    <int>    <int>    <int>    <int>
1 Height      6    10    10     3    12     8     7     5     7     5     8     9       -4        7        4        2        2       -1
2 Weight      8     9    13     6    14     7     8     7    13    11    10     9       -1        7        7        1        2        1
3 Width      10     9    11     5    12     8     7    11     9     5     5     6        1        6        4       -4        4       -1
4 Length      8     9     8     7     8    13     8     7     6    11    14     6       -1        1       -5        1       -5        8

回答2:

The "tidy" way to do this would be to convert your data from wide to long, do a grouped subtraction, and then go back to wide format:

library(tidyr)
dat_long = dat %>% pivot_longer(
  cols = starts_with("AD"),
  names_sep = "_",
  names_to = c("group", "obs")
) 

dat_long %>% head
# # A tibble: 48 x 4
#    Measure group obs   value
#    <chr>   <chr> <chr> <int>
#  1 Height  AD1   1         6
#  2 Height  AD1   2        10
#  3 Height  AD2   1        10
#  4 Height  AD2   2         3
#  5 Height  AD3   1        12
#  6 Height  AD3   2         8

dat_long %>%
  group_by(Measure, group) %>% 
  summarize(diff = value[obs == 1] - value[obs == 2]) %>%
  pivot_wider(names_from = "group", values_from = "diff") %>%
  rename_with(.fn = ~ paste0(., "diff"), .cols = starts_with("AD"))
# # A tibble: 4 x 7
# # Groups:   Measure [4]
#   Measure AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
#   <chr>     <int>   <int>   <int>   <int>   <int>   <int>
# 1 Height       -4       7       4       2       2      -1
# 2 Length       -1       1      -5       1      -5       8
# 3 Weight       -1       7       7       1       2       1
# 4 Width         1       6       4      -4       4      -1

回答3:

Here is a data.table option

setDT(dat)[
  ,
  paste0(
    unique(gsub("_\\d+", "", names(dat)[-1])),
    "diff"
  ) := lapply(
    split.default(.SD[, -1], gsub("_\\d+", "", names(dat)[-1])),
    function(x) do.call("-", x)
  )
]

which gives

> dat
   Measure AD1_1 AD1_2 AD2_1 AD2_2 AD3_1 AD3_2 AD4_1 AD4_2 AD5_1 AD5_2 AD6_1
1:  Height     6    10    10     3    12     8     7     5     7     5     8
2:  Weight     8     9    13     6    14     7     8     7    13    11    10
3:   Width    10     9    11     5    12     8     7    11     9     5     5
4:  Length     8     9     8     7     8    13     8     7     6    11    14
   AD6_2 AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
1:     9      -4       7       4       2       2      -1
2:     9      -1       7       7       1       2       1
3:     6       1       6       4      -4       4      -1
4:     6      -1       1      -5       1      -5       8

setDT(dat)[
  ,
  c(.(Measure = Measure), setNames(lapply(
    split.default(.SD[, -1], gsub("_\\d+", "", names(dat)[-1])),
    function(x) do.call("-", x)
  ), paste0(
    unique(gsub("_\\d+", "", names(dat)[-1])),
    "diff"
  )))
]

gives

   Measure AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
1:  Height      -4       7       4       2       2      -1
2:  Weight      -1       7       7       1       2       1
3:   Width       1       6       4      -4       4      -1
4:  Length      -1       1      -5       1      -5       8

回答4:

Use tidyverse package `tidyr` to rearrange your data before mutating

require(dplyr)
require(tidyr)

#> Loading required package: tidyr

First, tidyr::pivot_longer the data frame so that there's a separate row for every column:

new_dat <- 
  pivot_longer(dat, cols = starts_with("AD"), # For columns whose names start with 'AD'...
               names_sep = "_", # separate columns using '_' in colname
               names_to = c("AD_number", "observation")) %>% 
  arrange(AD_number, Measure, observation)

head(new_dat, 9)
#> # A tibble: 9 x 4
#>   Measure AD_number observation value
#>   <chr>   <chr>     <chr>       <int>
#> 1 Height  AD1       1               6
#> 2 Height  AD1       2              10
#> 3 Length  AD1       1               8
#> 4 Length  AD1       2               9
#> 5 Weight  AD1       1               8
#> 6 Weight  AD1       2               9
#> 7 Width   AD1       1              10
#> 8 Width   AD1       2               9
#> 9 Height  AD2       1              10

Then, use tidyr::pivot_wider (the functional opposite of pivot_longer) to make a separate column for each value in observation. This will be very compatible with the upcoming mutate operation.

new_dat <-
  pivot_wider(new_dat,
            names_from = observation,
            values_from = value,
            names_prefix = "value_")

head(new_dat, 5)
#> # A tibble: 5 x 4
#>   Measure AD_number value_1 value_2
#>   <chr>   <chr>       <int>   <int>
#> 1 Height  AD1             6      10
#> 2 Length  AD1             8       9
#> 3 Weight  AD1             8       9
#> 4 Width   AD1            10       9
#> 5 Height  AD2            10       3

Finally, mutate the data:

new_dat <- 
  mutate(new_dat, diff = value_1 - value_2)

head(new_dat, 4)
#> # A tibble: 4 x 5
#>   Measure AD_number value_1 value_2  diff
#>   <chr>   <chr>       <int>   <int> <int>
#> 1 Height  AD1             6      10    -4
#> 2 Length  AD1             8       9    -1
#> 3 Weight  AD1             8       9    -1
#> 4 Width   AD1            10       9     1

^{Created on 2021-01-22 by the reprex package (v0.3.0)}

Getting back to your original data format is possible, but it might not make the data any easier to work with:

rename(new_dat, 
       c(`1` = "value_1", `2` = "value_2")) %>% 
  pivot_wider(names_from = AD_number,
              values_from = c(`1`, `2`, diff),
              names_glue = "{AD_number}_{.value}") %>% 
  {.[,order(names(.))]} %>% 
  relocate(Measure)

来源：https://stackoverflow.com/questions/65849383/finding-the-differences-of-paired-columns-using-dplyr

标签

dplyr