Dividing each cell in a data set by the column sum in R

后端 未结 5 946
遇见更好的自我
遇见更好的自我 2020-12-17 01:35

I am trying to divide each cell in a data frame by the sum of the column. For example, I have a data frame df:

sample   a   b   c
a2      1    4    6
a3             


        
5条回答
  •  攒了一身酷
    2020-12-17 01:59

    Solution 1

    Here are two dplyr solutions. We can use mutate_at or mutate_if to efficiently specify which column we want to apply an operation, or under what condition we want to apply an operation.

    library(dplyr)
    
    # Apply the operation to all column except sample
    dat2 <- dat %>%
      mutate_at(vars(-sample), funs(./sum(.))) 
    dat2
    #   sample         a         b   c
    # 1     a2 0.1666667 0.4444444 0.6
    # 2     a3 0.8333333 0.5555556 0.4
    
    # Apply the operation if the column is numeric 
    dat2 <- dat %>%
      mutate_if(is.numeric, funs(./sum(.))) 
    dat2
    #   sample         a         b   c
    # 1     a2 0.1666667 0.4444444 0.6
    # 2     a3 0.8333333 0.5555556 0.4
    

    Solution 2

    We can also use the map_at and map_if function from the purrr package. However, since the output is a list, we will need as.data.frame from base R or as_data_frame from dplyr to convert the list to a data frame.

    library(dplyr)
    library(purrr)
    
    # Apply the operation to column a, b, and c    
    dat2 <- dat %>%
      map_at(c("a", "b", "c"), ~./sum(.)) %>% 
      as_data_frame()
    dat2
    # # A tibble: 2 x 4
    #   sample     a     b     c
    #       
    # 1 a2     0.167 0.444 0.600
    # 2 a3     0.833 0.556 0.400
    
    # Apply the operation if the column is numeric
    dat2 <- dat %>%
      map_if(is.numeric, ~./sum(.)) %>%
      as_data_frame()
    dat2
    # # A tibble: 2 x 4
    #   sample     a     b     c
    #       
    # 1 a2     0.167 0.444 0.600
    # 2 a3     0.833 0.556 0.400
    

    Solution 3

    We can also use the .SD and .SDcols from the data.table package.

    library(data.table)
    
    # Convert to data.table
    setDT(dat)
    dat2 <- copy(dat)
    dat2[, (c("a", "b", "c")) := lapply(.SD, function(x) x/sum(x)), .SDcols = c("a", "b", "c")]
    dat2[]
    #    sample         a         b   c
    # 1:     a2 0.1666667 0.4444444 0.6
    # 2:     a3 0.8333333 0.5555556 0.4
    

    Solution 4

    We can also use the lapply function to loop through all column except the first column to perform the operation.

    dat2 <- dat
    dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))
    dat2
    #   sample         a         b   c
    # 1     a2 0.1666667 0.4444444 0.6
    # 2     a3 0.8333333 0.5555556 0.4
    

    We can also use apply to loop through all columns but add an if-else statement in the function to make sure only perform the operation on the numeric columns.

    dat2 <- dat
    dat2[] <- lapply(dat2[], function(x){
      # Check if the column is numeric
      if (is.numeric(x)){
        return(x/sum(x))
      } else{
        return(x)
      }
    })
    dat2
    #   sample         a         b   c
    # 1     a2 0.1666667 0.4444444 0.6
    # 2     a3 0.8333333 0.5555556 0.4
    

    Solution 5

    A dplyr and tidyr solution based on gather and spread.

    library(dplyr)
    library(tidyr)
    
    dat2 <- dat %>%
      gather(Column, Value, -sample) %>%
      group_by(Column) %>%
      mutate(Value = Value/sum(Value)) %>%
      spread(Column, Value)
    dat2
    # # A tibble: 2 x 4
    #   sample     a     b     c
    # *     
    # 1 a2     0.167 0.444 0.600
    # 2 a3     0.833 0.556 0.400
    

    Performance Evaluation

    I am curious about which method has the best performance. So I conduct the following performance evaluation using the microbenchmark package with a data frame having the same column names as OP's example but with 1000000 rows.

    library(dplyr)
    library(tidyr)
    library(purrr)
    library(data.table)
    library(microbenchmark)
    
    set.seed(100)
    
    dat <- data_frame(sample = paste0("a", 1:1000000),
                      a = rpois(1000000, lambda = 3),
                      b = rpois(1000000, lambda = 3),
                      c = rpois(1000000, lambda = 3))
    
    # Convert the data frame to a data.table for later perofrmance evaluation
    dat_dt <- as.data.table(dat)    
    
    head(dat)
    # # A tibble: 6 x 4
    #   sample     a     b     c
    #       
    # 1 a1         2     5     2
    # 2 a2         2     5     5
    # 3 a3         3     2     4
    # 4 a4         1     2     2
    # 5 a5         3     3     1
    # 6 a6         3     6     1
    

    In addition to all the methods I proposed, I also interested two other methods proposed by others: the prop.table method proposed by Henrik in the comments, and the apply method by Spacedman. I called all my solutions with m1_1, m1_2, m2_1, ... to m5. If there are two methods in one solution, I used _ to separate them. I also called the prop.table method as m6 and the apply method as m7. Notice that I modified m6 to have an output as a data frame so that all the methods can have data frame, tibble, or data.table output.

    Here is the code I used to assess the performance.

    per <- microbenchmark(m1_1 = {dat2 <- dat %>% mutate_at(vars(-sample), funs(./sum(.)))},
                          m1_2 = {dat2 <- dat %>% mutate_if(is.numeric, funs(./sum(.)))},
                          m2_1 = {dat2 <- dat %>%
                            map_at(c("a", "b", "c"), ~./sum(.)) %>% 
                            as_data_frame()
                          },
                          m2_2 = {dat2 <- dat %>%
                            map_if(is.numeric, ~./sum(.)) %>%
                            as_data_frame()},
                          m3 = {dat_dt2 <- copy(dat_dt)
                                dat_dt2[, c("a", "b", "c") := lapply(.SD, function(x) x/sum(x)), 
                                          .SDcols = c("a", "b", "c")]},
                          m4_1 = {dat2 <- dat
                                  dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))},
                          m4_2 = {dat2 <- dat
                                  dat2[] <- lapply(dat2[], function(x){
                            if (is.numeric(x)){
                              return(x/sum(x))
                            } else{
                              return(x)
                            }
                          })},
                          m5 = {dat2 <- dat %>%
                            gather(Column, Value, -sample) %>%
                            group_by(Column) %>%
                            mutate(Value = Value/sum(Value)) %>%
                            spread(Column, Value)},
                          m6 = {dat2 <- dat
                                dat2[-1] <- prop.table(as.matrix(dat2[-1]), margin = 2)},
                          m7 = {dat2 <- dat
                                dat2[, -1] = apply(dat2[, -1], 2, function(x) {x/sum(x)})}
                          )
    print(per)
    # Unit: milliseconds
    # expr         min          lq       mean      median          uq        max neval
    # m1_1   23.335600   24.326445   28.71934   25.134798   27.465017   75.06974   100
    # m1_2   20.373093   21.202780   29.73477   21.967439   24.897305  216.27853   100
    # m2_1    9.452987    9.817967   17.83030   10.052634   11.056073  175.00184   100
    # m2_2   10.009197   10.342819   16.43832   10.679270   11.846692  163.62731   100
    #   m3   16.195868   17.154327   34.40433   18.975886   46.521868  190.50681   100
    # m4_1    8.100504    8.342882   12.66035    8.778545    9.348634  181.45273   100
    # m4_2    8.130833    8.499926   15.84080    8.766979    9.732891  172.79242   100
    #   m5 5373.395308 5652.938528 5791.73180 5737.383894 5825.141584 6660.35354   100
    #   m6  117.038355  150.688502  191.43501  166.665125  218.837502  325.58701   100
    #   m7  119.680606  155.743991  199.59313  174.007653  215.295395  357.02775   100
    
    
    library(ggplot2)
    autoplot(per) 
    

    The result shows that methods based on lapply (m4_1 and m4_2) are the fastest, while the tidyr approach (m5) is the slowest, indicating that when row numbers are large it is not a good idea to use the gather and spread method.

    DATA

    dat <- read.table(text = "sample   a   b   c
    a2      1    4    6
                      a3      5    5    4",
                      header = TRUE, stringsAsFactors = FALSE)
    

提交回复
热议问题