How to compute the mean in different categories using left_join and nest in R?

问题

I'm trying to compute the mean values for binned data using left_join and nest.

bin.size = 100

First dataframe:

df = data.frame(x =c(300,400), 
                y = c("sca1","sca2"))
    x    y
1 300 sca1
2 400 sca2

Second dataframe:

df2 = data.frame(snp = c(1,2,10,100,1,2,14,16,399), 
                 sca = c("sca1","sca1","sca1","sca1","sca2","sca2","sca2","sca2","sca2"))

      snp   r2  sca
1   1 0.70 sca1
2   2 0.80 sca1
3  10 0.70 sca1
4 100 0.10 sca1
5   1 0.90 sca2
6   2 0.98 sca2
7  14 0.80 sca2
8  16 0.80 sca2
9 399 0.01 sca2

Code from @r2evans:

output_bin_LD = df %>%
  left_join(nest(df2, snp, .key = "snp"), by = c("y" = "sca")) %>%
  mutate(
    cuts = map(x, ~ seq(0, ., by = bin.size)),
    tbls = pmap(
      .l = list(snp, cuts),
      .f = function(xx, breaks) {
        z <- table(cut(xx$snp, breaks))
        data_frame(cut = names(z), count = z)
      }
    )
  ) %>%
  select(y, tbls) %>%
  unnest()

This code up is doing this:

     y       cut count
1 sca1   (0,100]     4
2 sca1 (100,200]     0
3 sca1 (200,300]     0
4 sca2   (0,100]     4
5 sca2 (100,200]     0
6 sca2 (200,300]     0
7 sca2 (300,400]     1

The end goal would be to have

     y       cut count  mean
1 sca1   (0,100]     4 0.575
2 sca1 (100,200]     0     0
3 sca1 (200,300]     0     0
4 sca2   (0,100]     4  0.87
5 sca2 (100,200]     0     0
6 sca2 (200,300]     0     0
7 sca2 (300,400]     1   399

So far I've tried this:

df %>%
  left_join(nest(df2, snp, r2, .key = "snp"), 
            by = c("y" = "sca")) %>%
  mutate(
    cuts = map(x, ~ seq(0, ., by = 100)),
    tbls = pmap(
      .l = list(snp, cuts),
      .f = function(xx, breaks) {
        z <- table(cut(xx$snp, breaks))
        a <- mean(cut(xx$r2, breaks))
        data_frame(cut = names(z), count = z, mean = a)
      } # .f 
    ) # closing pmap
  ) %>% # mutate
  select(y, tbls) %>%
  unnest()

But it outputs me NAs and a warning message:

     y       cut count mean
1 sca1   (0,100]     4   NA
2 sca1 (100,200]     0   NA
3 sca1 (200,300]     0   NA
4 sca2   (0,100]     4   NA
5 sca2 (100,200]     0   NA
6 sca2 (200,300]     0   NA
7 sca2 (300,400]     1   NA
Warning messages:
1: In mean.default(cut(xx$r2, breaks)) :
  argument is not numeric or logical: returning NA
2: In mean.default(cut(xx$r2, breaks)) :
  argument is not numeric or logical: returning NA

How should I fix this? Do I need to double nest the table?

回答1:

Not sure about your approach, but here's a slightly straightforward approach.. using data.table package, if you're interested. You will need the latest version (currently 1.10.0) for this to work (since it's a new feature).

require(data.table) ## v1.9.8+
and <- b[a, on=.(sca=y, snp>start, snp<=end),       ## 1
         .(count=.N, mean=mean(r2, na.rm=TRUE)),    ## 2
         by=.EACHI]                                 ## 3

For each row in a, find matching row indices in b while matching on the condition provided to on argument.
length(matching row indices) == .N gives count and mean() gives the mean of r2 for those matching indices.
The expression in (2) is run for each row in a.

where, a is:

require(data.table) ## v1.9.8+
a <- setDT(df)[, .(start=seq(0, x-1, by=bin.size), 
                   end=seq(bin.size, x, by=bin.size)), 
                 by=y]

b <- fread("snp   r2  sca
      1 0.70 sca1
      2 0.80 sca1
     10 0.70 sca1
    100 0.10 sca1
      1 0.90 sca2
      2 0.98 sca2
     14 0.80 sca2
     16 0.80 sca2
    399 0.01 sca2")

回答2:

Here's a tidyverse option that's heavier on dplyr than purrr, which makes it somewhat more readable:

library(tidyverse)

df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%   
    summarise(count = n(),    # For each group, count rows
              mean = mean(r2)) %>%    # and calculate mean
    # Add rows for every level of the cuts. Fill new rows with zeros.
    complete(cut, fill = list(count = 0L, mean = 0)) %>% 
    separate(cut, c('from', 'to'), sep = ',') %>%    # Split cut into two numbers
    mutate_at(vars(from:to), parse_number) %>%    # Extract numbers from strings
    left_join(df, c(sca = 'y')) %>%    # Join to get x value for each group
    filter(to <= x)    # Subset to rows where the max cut is within the range.

#> Source: local data frame [7 x 6]
#> Groups: sca [2]
#> 
#>     sca  from    to count  mean     x
#>   <chr> <dbl> <dbl> <int> <dbl> <dbl>
#> 1  sca1     0   100     4 0.575   300
#> 2  sca1   100   200     0 0.000   300
#> 3  sca1   200   300     0 0.000   300
#> 4  sca2     0   100     4 0.870   400
#> 5  sca2   100   200     0 0.000   400
#> 6  sca2   200   300     0 0.000   400
#> 7  sca2   300   400     1 0.010   400

You can actually avoid the join and messing with the cuts with a little regex and subsetting:

df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>% 
    summarise(count = n(), 
              mean = mean(r2)) %>% 
    complete(cut, fill = list(count = 0L, mean = 0)) %>% 
    filter(as.integer(gsub('.*,(\\d+).*', '\\1', cut)) <= df$x[unique(sca) == df$y])

#> Source: local data frame [7 x 4]
#> Groups: sca [2]
#> 
#>     sca       cut count  mean
#>   <chr>    <fctr> <int> <dbl>
#> 1  sca1   (0,100]     4 0.575
#> 2  sca1 (100,200]     0 0.000
#> 3  sca1 (200,300]     0 0.000
#> 4  sca2   (0,100]     4 0.870
#> 5  sca2 (100,200]     0 0.000
#> 6  sca2 (200,300]     0 0.000
#> 7  sca2 (300,400]     1 0.010

Data

df <- structure(list(x = c(300, 400), y = c("sca1", "sca2")), .Names = c("x", 
    "y"), row.names = c(NA, -2L), class = "data.frame")

df2 <- structure(list(snp = c(1L, 2L, 10L, 100L, 1L, 2L, 14L, 16L, 399L
    ), r2 = c(0.7, 0.8, 0.7, 0.1, 0.9, 0.98, 0.8, 0.8, 0.01), sca = c("sca1", 
    "sca1", "sca1", "sca1", "sca2", "sca2", "sca2", "sca2", "sca2"
    )), .Names = c("snp", "r2", "sca"), row.names = c(NA, -9L), class = "data.frame")

来源：https://stackoverflow.com/questions/41752268/how-to-compute-the-mean-in-different-categories-using-left-join-and-nest-in-r

标签

left-join

dplyr

tidyr