Nested if else statements over a number of columns

后端 未结 4 1025
情深已故
情深已故 2020-12-03 00:07

I have a large data.frame where the first three columns contain information about a marker. The remaining columns are of numeric type for that

相关标签:
4条回答
  • 2020-12-03 00:53

    These is an other possible solution. All solution above are valid.

    My solution is create a function for your case-sensitive without the use of a new library. It's quite long and it's possible to compact, but it's useful to see each step in order to understand how the function works.

    olddf <- data.frame(marker = c("kgp5209280_chr3_21902067",
            "chr3_21902130_21902131_A_T",
            "chr3_21902134_21902135_T_C"),
            alleleA = c("T","A","T"),
            alleleB = c("A","T","C"),
            X818 = c(0.0000,0.8626,0.6982),
            X818.1 = c(1.0000,0.1356,0.2854),
            X818.2 = c(0.0000,0.0018,0.0164),
            X345 = c(1.0000,0.7676, 0.5617),
            X345.1 = c(0.0000, 0.2170, 0.3749),
            X345.2 = c(0.0000, 0.0154, 0.0634),   
            X346 = c(0.0000, 0.8626, 0.6982),
            X346.1 = c(1.0000,0.1356, 0.2854), 
            X346.2 = c(0.0000, 0.0018, 0.0164))
    
    
    mergeallele <- function(arguments,threshold = 0.8){
        n <- nrow(arguments)
        # Creation of a results object as an empty list of length NROW
        # speed for huge data.frame 
        new.lst <- vector(mode="list", n)
        for (i in 1:n){
            marker_row <- arguments[i,]
            colvalue.4 <- NaN
            if (max(marker_row[,c(4:6)]) < threshold){
                colvalue.4 <- max(marker_row[,c(4:6)])
            }
    
            colvalue.5 <- NaN       
            if (max(marker_row[,c(7:9)]) < threshold){
                colvalue.5 <- max(marker_row[,c(7:9)])
            }
    
            colvalue.6 <- NaN       
            if (max(marker_row[,c(10:12)]) < threshold){
                colvalue.6 <- max(marker_row[,c(10:12)])
            }
            new.lst[[i]]  <- data.frame(marker_row[,1],
                marker_row[,2],
                marker_row[,3],
                colvalue.4,
                colvalue.5,
                colvalue.6)     
        }   
        new.df <- as.data.frame(do.call("rbind",new.lst))
        names(new.df) <-  c(colnames(arguments)[1],
                        colnames(arguments)[2],
                        colnames(arguments)[3],
                        colnames(arguments)[4],
                        colnames(arguments)[7],
                        colnames(arguments)[10])
        return(new.df)
    }
    
    
    newdf <- mergeallele(olddf)
    
                          marker alleleA alleleB   X818   X345   X346
    1   kgp5209280_chr3_21902067       T       A    NaN    NaN    NaN
    2 chr3_21902130_21902131_A_T       A       T    NaN 0.7676    NaN
    3 chr3_21902134_21902135_T_C       T       C 0.6982 0.5617 0.6982
    

    about:

    threshold = 0.8 
    

    you can set your the threshold value (ex: 0.8) avoid to change variable inside the function

    new.lst <- vector(mode="list", n)
    

    you can create a empty list of length the old data.frame and the elements of the list are then gradually filled with the loop results (much faster). See the test speed from this Blog

    0 讨论(0)
  • 2020-12-03 00:57

    Edit: Updated solution using the fast melt/dcast methods implemented in data.table versions >= 1.9.0. Go here for more info.

    require(data.table)
    require(reshape2)
    dt <- as.data.table(df)
    
    # melt data.table
    dt.m <- melt(dt, id=c("marker", "alleleA", "alleleB"), 
                     variable.name="id", value.name="val")
    dt.m[, id := gsub("\\.[0-9]+$", "", id)] # replace `.[0-9]` with nothing
    # aggregation
    dt.m <- dt.m[, list(alleleA = alleleA[1], 
             alleleB = alleleB[1], val = max(val)), 
            keyby=list(marker, id)][val <= 0.8, val := NA]
    # casting back
    dt.c <- dcast.data.table(dt.m, marker + alleleA + alleleB ~ id)
    #                        marker alleleA alleleB X345   X346   X818
    # 1: chr3_21902130_21902131_A_T       A       T   NA 0.8626 0.8626
    # 2: chr3_21902134_21902135_T_C       T       C   NA     NA     NA
    # 3:   kgp5209280_chr3_21902067       T       A    1 1.0000 1.0000
    

    Solution 1: Probably not the best way, but this is what I could think of at the moment:

    mm <- t(apply(df[-(1:3)], 1, function(x) tapply(x, gl(3,3), max)))
    mode(mm) <- "numeric"
    mm[mm < 0.8] <- NA 
    # you can set the column names of mm here if necessary
    out <- cbind(df[, 1:3], mm)
    
    #                       marker alleleA alleleB      1  2      3
    # 1   kgp5209280_chr3_21902067       T       A 1.0000  1 1.0000
    # 2 chr3_21902130_21902131_A_T       A       T 0.8626 NA 0.8626
    # 3 chr3_21902134_21902135_T_C       T       C     NA NA     NA
    

    gl(3,3) gives a factor with values 1,1,1,2,2,2,3,3,3 with levels 1,2,3. That is, tapply will take the values x 3 at a time and get their max (first 3, next 3 and the last 3). And apply sends each row one by one.


    Solution 2: A data.table solution with melt and cast within data.table without using reshape or reshape2:

    require(data.table)
    dt <- data.table(df)
    # melt your data.table to long format
    dt.melt <- dt[, list(id = names(.SD), val = unlist(.SD)), 
                      by=list(marker, alleleA, alleleB)]
    # replace `.[0-9]` with nothing
    dt.melt[, id := gsub("\\.[0-9]+$", "", id)]
    # get max value grouping by marker and id
    dt.melt <- dt.melt[, list(alleleA = alleleA[1], 
                          alleleB = alleleB[1], 
                          val = max(val)), 
            keyby=list(marker, id)][val <= 0.8, val := NA]
    # edit mnel (use setattr(,'names') to avoid copy by `names<-` within `setNames`
    dt.cast <- dt.melt[, as.list(setattr(val,'names', id)), 
                       by=list(marker, alleleA, alleleB)]
    
    #                        marker alleleA alleleB X345   X346   X818
    # 1: chr3_21902130_21902131_A_T       A       T   NA 0.8626 0.8626
    # 2: chr3_21902134_21902135_T_C       T       C   NA     NA     NA
    # 3:   kgp5209280_chr3_21902067       T       A    1 1.0000 1.0000
    
    0 讨论(0)
  • 2020-12-03 00:58

    I think it is better here to put your data in the long format. Here a solution based on reshape2 package , maybe similar to second @Arun solution but syntactically different

    library(reshape2)
    dat.m <- melt(dat,id.vars=1:3)
    dat.m$variable <- gsub('[.].*','',dat.m$variable)
    dcast(dat.m,...~variable,fun.aggregate=function(x){
       res <- NA_real_
       if(length(x) > 0 && max(x)> 0.8)
          res <- max(x)
       res
    })
    
                          marker alleleA alleleB X345   X346   X818
    1 chr3_21902130_21902131_A_T       A       T   NA 0.8626 0.8626
    2 chr3_21902134_21902135_T_C       T       C   NA     NA     NA
    3   kgp5209280_chr3_21902067       T       A    1 1.0000 1.0000
    
    0 讨论(0)
  • 2020-12-03 00:58

    Here is my approach using the function pmax. Note that this will give you the maximum if there are two or more values above 0.8 for each individual:

    df <- read.table(textConnection("                      marker alleleA alleleB   X818 X818.1 X818.2   X345 X345.1 X345.2   X346 X346.1 X346.2
    1   kgp5209280_chr3_21902067       T       A 0.0000 1.0000 0.0000 1.0000 0.0000 0.0000 0.0000 1.0000 0.0000
    2 chr3_21902130_21902131_A_T       A       T 0.8626 0.1356 0.0018 0.7676 0.2170 0.0154 0.8626 0.1356 0.0018
    3 chr3_21902134_21902135_T_C       T       C 0.6982 0.2854 0.0164 0.5617 0.3749 0.0634 0.6982 0.2854 0.0164"), header=TRUE)
    
    #data.table solution
    library(data.table)
    DT <- as.data.table(df)
    DT[, M818 := ifelse(pmax(X818, X818.1, X818.2) > 0.8, pmax(X818, X818.1, X818.2), NA)]
    DT[, M345 := ifelse(pmax(X345, X345.1, X345.2) > 0.8, pmax(X345, X345.1, X345.2), NA)]
    DT[, M346 := ifelse(pmax(X346, X346.1, X346.2) > 0.8, pmax(X346, X346.1, X346.2), NA)]
    
    #Base R solution
    df$M818 <- ifelse(pmax(df$X818, df$X818.1, df$X818.2) > 0.8, pmax(df$X818, df$X818.1, df$X818.2), NA)
    df$M345 <- ifelse(pmax(df$X345, df$X345.1, df$X345.2) > 0.8, pmax(df$X345, df$X345.1, df$X345.2), NA)
    df$M346 <- ifelse(pmax(df$X346, df$X346.1, df$X346.2) > 0.8, pmax(df$X346, df$X346.1, df$X346.2), NA)
    

    If you want to get rid of the other columns, just type:

    DT[, list(marker, alleleA, alleleB, M818, M345, M346)]
                           marker alleleA alleleB   M818 M345   M346
    1:   kgp5209280_chr3_21902067       T       A 1.0000    1 1.0000
    2: chr3_21902130_21902131_A_T       A       T 0.8626   NA 0.8626
    3: chr3_21902134_21902135_T_C       T       C     NA   NA     NA
    
    0 讨论(0)
提交回复
热议问题