I have a large data.frame
where the first three columns contain information about a marker. The remaining columns are of numeric type for that
These is an other possible solution. All solution above are valid.
My solution is create a function for your case-sensitive without the use of a new library. It's quite long and it's possible to compact, but it's useful to see each step in order to understand how the function works.
olddf <- data.frame(marker = c("kgp5209280_chr3_21902067",
"chr3_21902130_21902131_A_T",
"chr3_21902134_21902135_T_C"),
alleleA = c("T","A","T"),
alleleB = c("A","T","C"),
X818 = c(0.0000,0.8626,0.6982),
X818.1 = c(1.0000,0.1356,0.2854),
X818.2 = c(0.0000,0.0018,0.0164),
X345 = c(1.0000,0.7676, 0.5617),
X345.1 = c(0.0000, 0.2170, 0.3749),
X345.2 = c(0.0000, 0.0154, 0.0634),
X346 = c(0.0000, 0.8626, 0.6982),
X346.1 = c(1.0000,0.1356, 0.2854),
X346.2 = c(0.0000, 0.0018, 0.0164))
mergeallele <- function(arguments,threshold = 0.8){
n <- nrow(arguments)
# Creation of a results object as an empty list of length NROW
# speed for huge data.frame
new.lst <- vector(mode="list", n)
for (i in 1:n){
marker_row <- arguments[i,]
colvalue.4 <- NaN
if (max(marker_row[,c(4:6)]) < threshold){
colvalue.4 <- max(marker_row[,c(4:6)])
}
colvalue.5 <- NaN
if (max(marker_row[,c(7:9)]) < threshold){
colvalue.5 <- max(marker_row[,c(7:9)])
}
colvalue.6 <- NaN
if (max(marker_row[,c(10:12)]) < threshold){
colvalue.6 <- max(marker_row[,c(10:12)])
}
new.lst[[i]] <- data.frame(marker_row[,1],
marker_row[,2],
marker_row[,3],
colvalue.4,
colvalue.5,
colvalue.6)
}
new.df <- as.data.frame(do.call("rbind",new.lst))
names(new.df) <- c(colnames(arguments)[1],
colnames(arguments)[2],
colnames(arguments)[3],
colnames(arguments)[4],
colnames(arguments)[7],
colnames(arguments)[10])
return(new.df)
}
newdf <- mergeallele(olddf)
marker alleleA alleleB X818 X345 X346
1 kgp5209280_chr3_21902067 T A NaN NaN NaN
2 chr3_21902130_21902131_A_T A T NaN 0.7676 NaN
3 chr3_21902134_21902135_T_C T C 0.6982 0.5617 0.6982
about:
threshold = 0.8
you can set your the threshold value (ex: 0.8) avoid to change variable inside the function
new.lst <- vector(mode="list", n)
you can create a empty list of length the old data.frame and the elements of the list are then gradually filled with the loop results (much faster). See the test speed from this Blog
data.table
versions >= 1.9.0. Go here for more info.require(data.table)
require(reshape2)
dt <- as.data.table(df)
# melt data.table
dt.m <- melt(dt, id=c("marker", "alleleA", "alleleB"),
variable.name="id", value.name="val")
dt.m[, id := gsub("\\.[0-9]+$", "", id)] # replace `.[0-9]` with nothing
# aggregation
dt.m <- dt.m[, list(alleleA = alleleA[1],
alleleB = alleleB[1], val = max(val)),
keyby=list(marker, id)][val <= 0.8, val := NA]
# casting back
dt.c <- dcast.data.table(dt.m, marker + alleleA + alleleB ~ id)
# marker alleleA alleleB X345 X346 X818
# 1: chr3_21902130_21902131_A_T A T NA 0.8626 0.8626
# 2: chr3_21902134_21902135_T_C T C NA NA NA
# 3: kgp5209280_chr3_21902067 T A 1 1.0000 1.0000
Solution 1: Probably not the best way, but this is what I could think of at the moment:
mm <- t(apply(df[-(1:3)], 1, function(x) tapply(x, gl(3,3), max)))
mode(mm) <- "numeric"
mm[mm < 0.8] <- NA
# you can set the column names of mm here if necessary
out <- cbind(df[, 1:3], mm)
# marker alleleA alleleB 1 2 3
# 1 kgp5209280_chr3_21902067 T A 1.0000 1 1.0000
# 2 chr3_21902130_21902131_A_T A T 0.8626 NA 0.8626
# 3 chr3_21902134_21902135_T_C T C NA NA NA
gl(3,3)
gives a factor with values 1,1,1,2,2,2,3,3,3
with levels 1,2,3
. That is, tapply
will take the values x
3 at a time and get their max
(first 3, next 3 and the last 3). And apply
sends each row one by one.
Solution 2: A data.table
solution with melt
and cast
within data.table
without using reshape
or reshape2
:
require(data.table)
dt <- data.table(df)
# melt your data.table to long format
dt.melt <- dt[, list(id = names(.SD), val = unlist(.SD)),
by=list(marker, alleleA, alleleB)]
# replace `.[0-9]` with nothing
dt.melt[, id := gsub("\\.[0-9]+$", "", id)]
# get max value grouping by marker and id
dt.melt <- dt.melt[, list(alleleA = alleleA[1],
alleleB = alleleB[1],
val = max(val)),
keyby=list(marker, id)][val <= 0.8, val := NA]
# edit mnel (use setattr(,'names') to avoid copy by `names<-` within `setNames`
dt.cast <- dt.melt[, as.list(setattr(val,'names', id)),
by=list(marker, alleleA, alleleB)]
# marker alleleA alleleB X345 X346 X818
# 1: chr3_21902130_21902131_A_T A T NA 0.8626 0.8626
# 2: chr3_21902134_21902135_T_C T C NA NA NA
# 3: kgp5209280_chr3_21902067 T A 1 1.0000 1.0000
I think it is better here to put your data in the long format. Here a solution based on reshape2
package , maybe similar to second @Arun solution but syntactically different
library(reshape2)
dat.m <- melt(dat,id.vars=1:3)
dat.m$variable <- gsub('[.].*','',dat.m$variable)
dcast(dat.m,...~variable,fun.aggregate=function(x){
res <- NA_real_
if(length(x) > 0 && max(x)> 0.8)
res <- max(x)
res
})
marker alleleA alleleB X345 X346 X818
1 chr3_21902130_21902131_A_T A T NA 0.8626 0.8626
2 chr3_21902134_21902135_T_C T C NA NA NA
3 kgp5209280_chr3_21902067 T A 1 1.0000 1.0000
Here is my approach using the function pmax
. Note that this will give you the maximum if there are two or more values above 0.8 for each individual:
df <- read.table(textConnection(" marker alleleA alleleB X818 X818.1 X818.2 X345 X345.1 X345.2 X346 X346.1 X346.2
1 kgp5209280_chr3_21902067 T A 0.0000 1.0000 0.0000 1.0000 0.0000 0.0000 0.0000 1.0000 0.0000
2 chr3_21902130_21902131_A_T A T 0.8626 0.1356 0.0018 0.7676 0.2170 0.0154 0.8626 0.1356 0.0018
3 chr3_21902134_21902135_T_C T C 0.6982 0.2854 0.0164 0.5617 0.3749 0.0634 0.6982 0.2854 0.0164"), header=TRUE)
#data.table solution
library(data.table)
DT <- as.data.table(df)
DT[, M818 := ifelse(pmax(X818, X818.1, X818.2) > 0.8, pmax(X818, X818.1, X818.2), NA)]
DT[, M345 := ifelse(pmax(X345, X345.1, X345.2) > 0.8, pmax(X345, X345.1, X345.2), NA)]
DT[, M346 := ifelse(pmax(X346, X346.1, X346.2) > 0.8, pmax(X346, X346.1, X346.2), NA)]
#Base R solution
df$M818 <- ifelse(pmax(df$X818, df$X818.1, df$X818.2) > 0.8, pmax(df$X818, df$X818.1, df$X818.2), NA)
df$M345 <- ifelse(pmax(df$X345, df$X345.1, df$X345.2) > 0.8, pmax(df$X345, df$X345.1, df$X345.2), NA)
df$M346 <- ifelse(pmax(df$X346, df$X346.1, df$X346.2) > 0.8, pmax(df$X346, df$X346.1, df$X346.2), NA)
If you want to get rid of the other columns, just type:
DT[, list(marker, alleleA, alleleB, M818, M345, M346)]
marker alleleA alleleB M818 M345 M346
1: kgp5209280_chr3_21902067 T A 1.0000 1 1.0000
2: chr3_21902130_21902131_A_T A T 0.8626 NA 0.8626
3: chr3_21902134_21902135_T_C T C NA NA NA