Select last non-NA column of a list of dataframes

白昼怎懂夜的黑 提交于 2019-12-24 12:19:12

问题


I am currently trying to parse RDP multiclassifier hierarchy files in R, however the problem is more generally applicable. Basically I create a list which contains data-frames of several files which contain "hierarchical" rows:

dput(corner(hierlist$hier_M2MID06_Trimmed_noGaps.fas_fixrank.txt,n=c(7,10)))
structure(list(X1 = structure(c(30L, 31L, 163L, 45L, 64L, 65L, 
66L), .Label = c("-1071", "-1102", "-1153", "-1159", "-1176", 
"-1177", "-1207", "-1241", "-1256", "-1281", "-1332", "-1353", 
"-1354", "-1502", "-1567", "-18", "-2", "-2715", "-423", "-460", 
"-463", "-471", "-567", "-568", "-828", "-842", "-843", "-871", 
"-980", "0", "1", "1031", "1069", "1070", "1093", "1101", "1126", 
"1151", "1152", "1158", "1159", "1164", "1165", "1166", "1175", 
"1176", "1195", "1200", "1206", "1207", "1215", "1216", "1217", 
"1219", "1240", "1251", "1255", "1256", "1261", "1269", "1279", 
"1280", "1282", "1330", "1331", "1339", "1341", "1343", "1348", 
"1352", "1353", "1354", "1355", "1356", "1357", "1358", "1360", 
"1501", "1566", "16", "1668", "1672", "1674", "17", "1762", "1763", 
"1764", "1767", "1883", "1884", "1885", "1891", "1893", "1894", 
"2", "2164", "2179", "2180", "2183", "2184", "2187", "2192", 
"2195", "2208", "2209", "2210", "2211", "2259", "2260", "2333", 
"2371", "2372", "254", "255", "261", "264", "2684", "2713", "2714", 
"274", "3", "35", "422", "458", "459", "46", "462", "470", "48", 
"49", "54", "565", "566", "567", "570", "577", "581", "648", 
"653", "657", "659", "804", "805", "806", "807", "808", "817", 
"818", "819", "820", "822", "824", "825", "826", "827", "829", 
"832", "834", "837", "838", "839", "840", "841", "842", "843", 
"844", "846", "848", "870", "886", "887", "908", "918", "927", 
"929", "950", "957", "978", "979", "taxid"), class = "factor"), 
X2 = structure(c(3L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Root", 
"lineage", "null"), class = "factor"), X3 = structure(c(1L, 
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Root", "name", "rootrank"
), class = "factor"), X4 = structure(c(2L, 1L, 1L, 1L, 1L, 
1L, 1L), .Label = c("Bacteria", "no rank", "rank"), class = "factor"), 
X5 = structure(c(1L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("194", 
"M2MID06_Trimmed_noGaps.fas", "domain"), class = "factor"), 
X6 = structure(c(NA, NA, 10L, 10L, 10L, 10L, 10L), .Label = c("", 
"Acidobacteria", "Actinobacteria", "Bacteroidetes", "Cyanobacteria/Chloroplast", 
"Firmicutes", "Gemmatimonadetes", "Nitrospira", "Planctomycetes", 
"Proteobacteria", "Spirochaetes", "Verrucomicrobia", "unclassified_Bacteria"
), class = "factor"), X7 = structure(c(NA, 2L, 3L, 3L, 3L, 
3L, 3L), .Label = c("", "Bacteria", "phylum"), class = "factor"), 
X8 = structure(c(NA, 21L, NA, 8L, 8L, 8L, 8L), .Label = c("", 
"Acidobacteria_Gp3", "Acidobacteria_Gp4", "Actinobacteria", 
"Alphaproteobacteria", "Bacilli", "Bacteroidetes_incertae_sedis", 
"Betaproteobacteria", "Chloroplast", "Deltaproteobacteria", 
"Flavobacteria", "Gammaproteobacteria", "Gemmatimonadetes", 
"Nitrospira", "Phycisphaerae", "Planctomycetacia", "Sphingobacteria", 
"Spirochaetes", "Subdivision3", "Verrucomicrobiae", "domain", 
"unclassified_Bacteroidetes", "unclassified_Proteobacteria"
), class = "factor"), X9 = structure(c(NA, 2L, 11L, 14L, 
14L, 14L, 14L), .Label = c("", "194", "Acidobacteria", "Actinobacteria", 
"Bacteroidetes", "Cyanobacteria/Chloroplast", "Firmicutes", 
"Gemmatimonadetes", "Nitrospira", "Planctomycetes", "Proteobacteria", 
"Spirochaetes", "Verrucomicrobia", "class", "unclassified_Bacteria"
), class = "factor"), X10 = structure(c(NA, NA, 29L, NA, 
22L, 22L, 22L), .Label = c("", "Actinobacteridae", "Bdellovibrionales", 
"Burkholderiales", "Caulobacterales", "Chloroplast", "Chromatiales", 
"Flavobacteriales", "Gemmatimonadales", "Gp3", "Gp4", "Lactobacillales", 
"Legionellales", "Methylophilales", "Nitrospirales", "Ohtaekwangia", 
"Phycisphaerales", "Planctomycetales", "Pseudomonadales", 
"Rhizobiales", "Rhodobacterales", "Rhodocyclales", "Rhodospirillales", 
"Sphingobacteriales", "Sphingomonadales", "Spirochaetales", 
"Subdivision3_genera_incertae_sedis", "Verrucomicrobiales", 
"phylum", "unclassified_Alphaproteobacteria", "unclassified_Betaproteobacteria", 
"unclassified_Deltaproteobacteria", "unclassified_Gammaproteobacteria"
), class = "factor")), .Names = c("X1", "X2", "X3", "X4", 
"X5", "X6", "X7", "X8", "X9", "X10"), row.names = 2:8, class = "data.frame")

This basically means I have progressive rows which are filled up with NA in progressive columns. However, there is no way of telling of a specific row where the first NA will be. Preceding this first NA column, I have the two columns which actually interest me: a count of the number of contigs at a specified taxonomical level, and two columns before it the taxonomic level's name.

I already created a list containing the indices for each dataframe that would select the last row by:

library(plyr)
lastcollist<-lapply(hierlist,function(p)lapply(apply(p, 1, function(x) which(!is.na(x)) ),function(x)if(length(x)>0){max(x)}else{0}))
lastcollist<-lapply(lastcollist,unlist)
lastcollist.idx<-llply(lastcollist,function(x)cbind(seq(1,length(x)),x))

Here lastcollist.idx will contain the indices of each row with the last non-NA column:

head(lastcollist.idx$hier_M2MID06_Trimmed_noGaps.fas_fixrank.txt)
        x
[1,] 1  5
[2,] 2  5
[3,] 3  9
[4,] 4 11
[5,] 5 13
[6,] 6 15

So what I would basically want to do now is create a new list which contains dataframes (or in the case of only the last column, variable x in lastcollist.idx) that have for each given row the last selected column.

This would be the desired output for the given example:

 dput(rbind(c('domain','194'),c('Proteobacteria','Phylum'),c('Betaproteobacteria','class'),c ('class','Rhodocyclales'),c('class','Rhodocyclales'),c('class','Rhodocyclales')))
structure(c("domain", "Proteobacteria", "Betaproteobacteria", 
"class", "class", "class", "194", "Phylum", "class", "Rhodocyclales", 
"Rhodocyclales", "Rhodocyclales"), .Dim = c(6L, 2L))

And I have to admit, I wouldn't immediately know how to do so. Any pointers are warmly welcomed. I am not a novice to R, so you don't have to go through great lengths in your explanation.

For a larger reproducible example consider the dataset 'khanmiss' from the bioconductor library impute (bioconductor library impute).

 source("http://bioconductor.org/biocLite.R")
 biocLite("impute")
 require(impute)
 data(khanmiss)

Which is basically a dataframe that has NAs introduced in several places. It is not exactly the same hierarchical structure as my file but it will fit the purpose. As this is a very inconvenient dataframe with 2309 observations and only 222 of it's rows contain missing values, I selected the rows with missing values and added 78 rows randomly that did not have missing values in a new data.frame. This data.frame was then split into a list of 4 dataframes of arbirtraty size (adding up to 300).

isnadf<-as.data.frame(which(is.na(khanmiss),arr.ind=T))
na.rows<-sort(unique(isnadf$row))
length(na.rows) #the dataset has 222 rows which contain NA
na.khanmiss<-khanmiss[na.rows,]
notna.rows<-setdiff(rownames(khanmiss),na.rows)
notna.rows.selected<-sort(as.numeric(sample(notna.rows,78)))
notna.selected.khanmiss<-khanmiss[notna.rows.selected,]
khanmiss.selected<-rbind(na.khanmiss,notna.selected.khanmiss)
dfsizes<-c(82,74,79,65) #arbitrarily selected, adds up to 300
khanmiss.list<-split(khanmiss.selected,rep(letters[1:4],dfsizes))

Which finally gives a list somewhat similar to my dataset.


回答1:


something along these untested lines might work:

apply(dfrm, 1, function(r) { r[ (which(is.na(r))[1]-1):(which(is.na(r))[1]-2)) ] } )

My usual ways of load text output as dataframes are failing for this example so my suggestion is to post dput output rather than screen scrapings. (It also appears to me that you should have done your data entry with header =TRUE since your first row of data does not look like data.)

With the new data (and realizing the need to test for no NA's:

 apply(hierlist, 1, function(r) { r[ 
                      if( any(is.na(r))){ 
                          (which(is.na(r))[1]-1):( which(is.na(r))[1]-2) 
                       }else{
                          (length(r)-2): (length(r)-1)}
                                      ] }
       )
#--------------------------------------
     2         3          4                5                   
[1,] "194"     "domain"   "phylum"         "class"             
[2,] "no rank" "Bacteria" "Proteobacteria" "Betaproteobacteria"
     6                    7                    8                   
[1,] "Betaproteobacteria" "Betaproteobacteria" "Betaproteobacteria"
[2,] "class"              "class"              "class" 



来源:https://stackoverflow.com/questions/21223965/select-last-non-na-column-of-a-list-of-dataframes

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!