Exclude duplicate values in certain columns using ddply

烈酒焚心 提交于 2019-12-04 11:19:07

Answering your first question: If you like, here's a data.table solution:

library(data.table)
dt <- data.table(dftest, key = "element")
# Keep all rows of an element only when its rows share exactly one seqnames
# value; otherwise the group contributes no rows.
# uniqueN() counts the values actually present in the group, whereas
# length(table(.)) on a factor also counts unused levels. na.rm = TRUE keeps
# table()'s behaviour of ignoring NA.
dt.out <- dt[, .SD[uniqueN(seqnames, na.rm = TRUE) == 1L], by = "element"]
> dt.out

#    element seqnames     start       end  width strand tx_id   tx_name
# 1:       1    chr19  58858172  58864865   6694      - 36769 NM_130786
# 2:      10     chr8  18248755  18258723   9969      + 16614 NM_000015
# 3:     100    chr20  43248163  43280376  32214      - 37719 NM_000022
# 4:    1000    chr18  25530930  25757445 226516      - 33839 NM_001792
# 5:   10000     chr1 243651535 244006584 355050      -  4182 NM_181690
# 6:   10000     chr1 243663021 244006584 343564      -  4183 NM_005465

And if you prefer the plyr solution:

library(plyr)
# For each element, keep all of its rows only when they share exactly one
# seqnames value; returning NULL makes ddply drop the group.
out <- ddply(dftest, .(element), function(x) {
    # Count the values actually present (NA ignored). length(table(.)) would
    # also count unused factor levels when seqnames is a factor.
    if (length(unique(na.omit(x$seqnames))) == 1L) {
        x
    } else {
        NULL
    }
})

#   element seqnames     start       end  width strand tx_id   tx_name
# 1       1    chr19  58858172  58864865   6694      - 36769 NM_130786
# 2      10     chr8  18248755  18258723   9969      + 16614 NM_000015
# 3     100    chr20  43248163  43280376  32214      - 37719 NM_000022
# 4    1000    chr18  25530930  25757445 226516      - 33839 NM_001792
# 5   10000     chr1 243651535 244006584 355050      -  4182 NM_181690
# 6   10000     chr1 243663021 244006584 343564      -  4183 NM_005465

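Edit: For your second question, the logic is the same as in the first solution, except that when the condition is not satisfied (the rows of an element do not all share a single seqnames value) you return only the first row of that element instead of dropping it.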

plyr solution (without summarise):

# For each element: keep all rows when they share exactly one seqnames value,
# otherwise keep only the first row of the group.
out <- ddply(dftest, .(element), function(x) {
    # Count the values actually present (NA ignored). length(table(.)) would
    # also count unused factor levels when seqnames is a factor.
    if (length(unique(na.omit(x$seqnames))) == 1L) {
        x
    } else {
        x[1, ]
    }
})

> out
#     element seqnames     start       end  width strand tx_id   tx_name
# 1         1    chr19  58858172  58864865   6694      - 36769 NM_130786
# 2        10     chr8  18248755  18258723   9969      + 16614 NM_000015
# 3       100    chr20  43248163  43280376  32214      - 37719 NM_000022
# 4      1000    chr18  25530930  25757445 226516      - 33839 NM_001792
# 5     10000     chr1 243651535 244006584 355050      -  4182 NM_181690
# 6     10000     chr1 243663021 244006584 343564      -  4183 NM_005465
# 7 100302285    chr12  12264886  12264967     82      + 24050 NR_036052

data.table solution:

dt <- data.table(dftest, key = "element")
# For each element: keep all rows when they share exactly one seqnames value,
# otherwise keep only the first row. The result is assigned to dt.out so that
# printing dt.out shows this result.
# uniqueN() counts the values actually present in the group, whereas
# length(table(.)) on a factor also counts unused levels. na.rm = TRUE keeps
# table()'s behaviour of ignoring NA.
dt.out <- dt[
    ,
    .SD[if (uniqueN(seqnames, na.rm = TRUE) == 1L) seq_len(.N) else 1L],
    by = element
]

> dt.out
#      element seqnames     start       end  width strand tx_id   tx_name
# 1:         1    chr19  58858172  58864865   6694      - 36769 NM_130786
# 2:        10     chr8  18248755  18258723   9969      + 16614 NM_000015
# 3:       100    chr20  43248163  43280376  32214      - 37719 NM_000022
# 4:      1000    chr18  25530930  25757445 226516      - 33839 NM_001792
# 5:     10000     chr1 243651535 244006584 355050      -  4182 NM_181690
# 6:     10000     chr1 243663021 244006584 343564      -  4183 NM_005465
# 7: 100302285    chr12  12264886  12264967     82      + 24050 NR_036052
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!