How to use map from purrr with dplyr::mutate to create multiple new columns based on column pairs

孤者浪人 提交于 2019-11-28 07:02:15

Here is one option with purrr. We get the unique prefix of the names of the dataset ('nm1'), use map (from purrr) to loop through the unique names, select the column that matches the prefix value of 'nm1', add the rows using reduce and the bind the columns (bind_cols) with the original dataset

library(tidyverse)
nm1 <- names(df) %>% 
          substr(1, 1) %>%
          unique 
nm1 %>% 
     map(~ df %>% 
            select(matches(.x)) %>%
            reduce(`+`)) %>%
            set_names(paste0("sum_", nm1)) %>%
     bind_cols(df, .)
#    a1 b1 c1 a2 b2 c2 sum_a sum_b sum_c
#1  1  4 10  9  3 15    10     7    25
#2  2  5 11 10  4 16    12     9    27
#3  3  6 12 11  5 17    14    11    29
#4  4  7 13 12  6 18    16    13    31
#5  5  8 14 13  7 19    18    15    33

In case you like to consider a base R approach, here's how you could do it:

cbind(df, lapply(split.default(df, substr(names(df), 0,1)), rowSums))
#  a1 b1 c1 a2 b2 c2  a  b  c
#1  1  4 10  9  3 15 10  7 25
#2  2  5 11 10  4 16 12  9 27
#3  3  6 12 11  5 17 14 11 29
#4  4  7 13 12  6 18 16 13 31
#5  5  8 14 13  7 19 18 15 33

It splits the data column-wise into a list, based on the first letter of each column name (either a, b, or c).

If you have a large number of columns and need to differentiate between all characters except the numbers at the end of each column name, you could modify the approach to:

cbind(df, lapply(split.default(df, sub("\\d+$", "", names(df))), rowSums))
df %>% 
  mutate(sum_a = pmap_dbl(select(., starts_with("a")), sum), 
         sum_b = pmap_dbl(select(., starts_with("b")), sum),
         sum_c = pmap_dbl(select(., starts_with("c")), sum))

  a1 b1 c1 a2 b2 c2 sum_a sum_b sum_c
1  1  4 10  9  3 15    10     7    25
2  2  5 11 10  4 16    12     9    27
3  3  6 12 11  5 17    14    11    29
4  4  7 13 12  6 18    16    13    31
5  5  8 14 13  7 19    18    15    33

in base R, all vectorized:

nms <- names(df)
df[paste0("sum_",unique(gsub("[1-9]","",nms)))] <-
  df[endsWith(nms,"1")] + df[endsWith(nms,"2")]

#   a1 b1 c1 a2 b2 c2 sum_a sum_b sum_c
# 1  1  4 10  9  3 15    10     7    25
# 2  2  5 11 10  4 16    12     9    27
# 3  3  6 12 11  5 17    14    11    29
# 4  4  7 13 12  6 18    16    13    31
# 5  5  8 14 13  7 19    18    15    33

For a hackish tidy solution, check this out:

library(tidyr)
library(dplyr)

df %>% 
   rownames_to_column(var = 'row') %>% 
   gather(a1:c2, key = 'key', value = 'value') %>% 
   extract(key, into = c('col.base', 'col.index'), regex = '([a-zA-Z]+)([0-9]+)') %>% 
   group_by(row, col.base) %>% 
   summarize(.sum = sum(value)) %>%
   spread(col.base, .sum) %>% 
   bind_cols(df, .) %>% 
   select(-row)

Basically, I collect all pairs of columns with their values across all rows, separate the column name in two parts, calculate the row sums for columns with the same letter, and cast it back to the wide form.

Another solution that splits df by the numbers than use Reduce to calculate the sum

library(tidyverse)

df %>% 
  split.default(., substr(names(.), 2, 3)) %>% 
  Reduce('+', .) %>% 
  set_names(paste0("sum_", substr(names(.), 1, 1))) %>% 
  cbind(df, .)

#>   a1 b1 c1 a2 b2 c2 sum_a sum_b sum_c
#> 1  1  4 10  9  3 15    10     7    25
#> 2  2  5 11 10  4 16    12     9    27
#> 3  3  6 12 11  5 17    14    11    29
#> 4  4  7 13 12  6 18    16    13    31
#> 5  5  8 14 13  7 19    18    15    33

Created on 2018-04-13 by the reprex package (v0.2.0).

1) dplyr/tidyr Convert to long form, summarize and convert back to wide form:

library(dplyr)
library(tidyr)

DF %>%
  mutate(Row = 1:n()) %>%
  gather(colname, value, -Row) %>%
  group_by(g = gsub("\\d", "", colname), Row) %>%
  summarize(sum = sum(value)) %>%
  ungroup %>%
  mutate(g = paste("sum", g, sep = "_")) %>%
  spread(g, sum) %>%
  arrange(Row) %>%
  cbind(DF, .) %>%
  select(-Row)

giving:

  a1 b1 c1 a2 b2 c2 sum_a sum_b sum_c
1  1  4 10  9  3 15    10     7    25
2  2  5 11 10  4 16    12     9    27
3  4  7 13 12  6 18    16    13    31
4  5  8 14 13  7 19    18    15    33

2) base using matrix multiplication

nms is a vector of column names without the digits and prefaced with sum_. u is a vector of the unique elements of it. Form a logical matrix using outer from that which when multiplied by DF gives the sums -- the logicals get converted to 0-1 when that is done. Finally bind it to the input.

nms <- gsub("(\\D+)\\d", "sum_\\1", names(DF))
u <- unique(nms)
sums <- as.matrix(DF) %*% outer(nms, setNames(u, u), "==")
cbind(DF, sums)

giving:

  a1 b1 c1 a2 b2 c2 sum_a sum_b sum_c
1  1  4 10  9  3 15    10     7    25
2  2  5 11 10  4 16    12     9    27
3  4  7 13 12  6 18    16    13    31
4  5  8 14 13  7 19    18    15    33

3) base with tapply

Using nms from (2) apply tapply to each row:

cbind(DF, t(apply(DF, 1, tapply, nms, sum)))

giving:

  a1 b1 c1 a2 b2 c2 sum_a sum_b sum_c
1  1  4 10  9  3 15    10     7    25
2  2  5 11 10  4 16    12     9    27
3  4  7 13 12  6 18    16    13    31
4  5  8 14 13  7 19    18    15    33

You may wish to replace nms with factor(nms, levels = unique(nms)) in the above expression if the names are not in ascending order.

A slightly different approach using base R:

cbind(df, lapply(unique(gsub("\\d+","", colnames(df))), function(li) {
   set_names(data.frame(V = apply(df[grep(li, colnames(df), val = T)], FUN = sum, MARGIN = 1)), paste0("sum_", li))
}))
#  a1 b1 c1 a2 b2 c2 sum_a sum_b sum_c
#1  1  4 10  9  3 15    10     7    25
#2  2  5 11 10  4 16    12     9    27
#3  3  6 12 11  5 17    14    11    29
#4  4  7 13 12  6 18    16    13    31
#5  5  8 14 13  7 19    18    15    33
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!