问题
I'm trying to compute the mean values for binned data using left_join
and nest
.
bin.size = 100
First dataframe:
df = data.frame(x =c(300,400),
y = c("sca1","sca2"))
x y
1 300 sca1
2 400 sca2
Second dataframe:
df2 = data.frame(snp = c(1,2,10,100,1,2,14,16,399),
sca = c("sca1","sca1","sca1","sca1","sca2","sca2","sca2","sca2","sca2"))
snp r2 sca
1 1 0.70 sca1
2 2 0.80 sca1
3 10 0.70 sca1
4 100 0.10 sca1
5 1 0.90 sca2
6 2 0.98 sca2
7 14 0.80 sca2
8 16 0.80 sca2
9 399 0.01 sca2
Code from @r2evans:
output_bin_LD = df %>%
left_join(nest(df2, snp, .key = "snp"), by = c("y" = "sca")) %>%
mutate(
cuts = map(x, ~ seq(0, ., by = bin.size)),
tbls = pmap(
.l = list(snp, cuts),
.f = function(xx, breaks) {
z <- table(cut(xx$snp, breaks))
data_frame(cut = names(z), count = z)
}
)
) %>%
select(y, tbls) %>%
unnest()
This code up is doing this:
y cut count
1 sca1 (0,100] 4
2 sca1 (100,200] 0
3 sca1 (200,300] 0
4 sca2 (0,100] 4
5 sca2 (100,200] 0
6 sca2 (200,300] 0
7 sca2 (300,400] 1
The end goal would be to have
y cut count mean
1 sca1 (0,100] 4 0.575
2 sca1 (100,200] 0 0
3 sca1 (200,300] 0 0
4 sca2 (0,100] 4 0.87
5 sca2 (100,200] 0 0
6 sca2 (200,300] 0 0
7 sca2 (300,400] 1 399
So far I've tried this:
df %>%
left_join(nest(df2, snp, r2, .key = "snp"),
by = c("y" = "sca")) %>%
mutate(
cuts = map(x, ~ seq(0, ., by = 100)),
tbls = pmap(
.l = list(snp, cuts),
.f = function(xx, breaks) {
z <- table(cut(xx$snp, breaks))
a <- mean(cut(xx$r2, breaks))
data_frame(cut = names(z), count = z, mean = a)
} # .f
) # closing pmap
) %>% # mutate
select(y, tbls) %>%
unnest()
But it outputs me NA
s and a warning message:
y cut count mean
1 sca1 (0,100] 4 NA
2 sca1 (100,200] 0 NA
3 sca1 (200,300] 0 NA
4 sca2 (0,100] 4 NA
5 sca2 (100,200] 0 NA
6 sca2 (200,300] 0 NA
7 sca2 (300,400] 1 NA
Warning messages:
1: In mean.default(cut(xx$r2, breaks)) :
argument is not numeric or logical: returning NA
2: In mean.default(cut(xx$r2, breaks)) :
argument is not numeric or logical: returning NA
How should I fix this? Do I need to double nest the table?
回答1:
Not sure about your approach, but here's a slightly straightforward approach.. using data.table
package, if you're interested. You will need the latest version (currently 1.10.0) for this to work (since it's a new feature).
require(data.table) ## v1.9.8+
and <- b[a, on=.(sca=y, snp>start, snp<=end), ## 1
.(count=.N, mean=mean(r2, na.rm=TRUE)), ## 2
by=.EACHI] ## 3
For each row in
a
, find matching row indices inb
while matching on the condition provided toon
argument.length(matching row indices)
==.N
givescount
andmean()
gives the mean ofr2
for those matching indices.The expression in
(2)
is run for each row ina
.
where, a
is:
require(data.table) ## v1.9.8+
a <- setDT(df)[, .(start=seq(0, x-1, by=bin.size),
end=seq(bin.size, x, by=bin.size)),
by=y]
b <- fread("snp r2 sca
1 0.70 sca1
2 0.80 sca1
10 0.70 sca1
100 0.10 sca1
1 0.90 sca2
2 0.98 sca2
14 0.80 sca2
16 0.80 sca2
399 0.01 sca2")
回答2:
Here's a tidyverse option that's heavier on dplyr than purrr, which makes it somewhat more readable:
library(tidyverse)
df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%
summarise(count = n(), # For each group, count rows
mean = mean(r2)) %>% # and calculate mean
# Add rows for every level of the cuts. Fill new rows with zeros.
complete(cut, fill = list(count = 0L, mean = 0)) %>%
separate(cut, c('from', 'to'), sep = ',') %>% # Split cut into two numbers
mutate_at(vars(from:to), parse_number) %>% # Extract numbers from strings
left_join(df, c(sca = 'y')) %>% # Join to get x value for each group
filter(to <= x) # Subset to rows where the max cut is within the range.
#> Source: local data frame [7 x 6]
#> Groups: sca [2]
#>
#> sca from to count mean x
#> <chr> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 sca1 0 100 4 0.575 300
#> 2 sca1 100 200 0 0.000 300
#> 3 sca1 200 300 0 0.000 300
#> 4 sca2 0 100 4 0.870 400
#> 5 sca2 100 200 0 0.000 400
#> 6 sca2 200 300 0 0.000 400
#> 7 sca2 300 400 1 0.010 400
You can actually avoid the join and messing with the cuts with a little regex and subsetting:
df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%
summarise(count = n(),
mean = mean(r2)) %>%
complete(cut, fill = list(count = 0L, mean = 0)) %>%
filter(as.integer(gsub('.*,(\\d+).*', '\\1', cut)) <= df$x[unique(sca) == df$y])
#> Source: local data frame [7 x 4]
#> Groups: sca [2]
#>
#> sca cut count mean
#> <chr> <fctr> <int> <dbl>
#> 1 sca1 (0,100] 4 0.575
#> 2 sca1 (100,200] 0 0.000
#> 3 sca1 (200,300] 0 0.000
#> 4 sca2 (0,100] 4 0.870
#> 5 sca2 (100,200] 0 0.000
#> 6 sca2 (200,300] 0 0.000
#> 7 sca2 (300,400] 1 0.010
Data
df <- structure(list(x = c(300, 400), y = c("sca1", "sca2")), .Names = c("x",
"y"), row.names = c(NA, -2L), class = "data.frame")
df2 <- structure(list(snp = c(1L, 2L, 10L, 100L, 1L, 2L, 14L, 16L, 399L
), r2 = c(0.7, 0.8, 0.7, 0.1, 0.9, 0.98, 0.8, 0.8, 0.01), sca = c("sca1",
"sca1", "sca1", "sca1", "sca2", "sca2", "sca2", "sca2", "sca2"
)), .Names = c("snp", "r2", "sca"), row.names = c(NA, -9L), class = "data.frame")
来源:https://stackoverflow.com/questions/41752268/how-to-compute-the-mean-in-different-categories-using-left-join-and-nest-in-r