问题
I have a large data set and I want to write a custom merge function to use with apply but I can't solve a certain issue. I can't use a loop as it will take too long. The data roughly looks like this;
# [ Name, Strand, Start, End ]
R1 = c( 'GeneA', '+', 1000, 1500 )
R2 = c( 'GeneA', '+', 1510, 2000 )
R3 = c( 'GeneA', '+', 2001, 2500 )
R4 = c( 'GeneB', '-', 3100, 4000 )
The data is a data.frame with rows R1:R4
So far I can get a function which compares Ri and Rj (j = i +1) and merges them if Name is the same, Strand is the same, and the gap between them is less then 100.
GAP = Ri[End] - Rj[Start]
If I apply the function to each row and build up the output. The function output then should create
R1 = c( 'GeneA', '+', 1000, 2500 )
R4 = c( 'GeneB', '-', 3100, 4000 )
I can get a working function which can use apply to merge two consecutive elements into one but I can't figure out how to merge three consecutive elements. One ugly solution was to run the two consecutive element merger function until no additional mergers are made but this is not efficient. I'm a bit stuck for ideas and any insights would be appreciated.
EDIT: to clarify, the data is ordered by Start position on consecutive chromosomes (not shown) and each gene name occurs multiple times in the data set at different positions (ie. GeneA can be 1000-2500 and then again at 4000-5000, and I don't want to merge those two genes, only consecutive ones).
EDIT 2: I have used Tim P solution below. A problem has arisen in the efficiency of Merging. Also is there a way to post text files to stack overflow so I can show what the data really looks like and then post my script so far?
# Let ValidMerge be a logical vector of MergeDown corresponding to the rows in the data
# as defined by TimP below
# The data.frame is called RMDB
# TeMerge function is previously defined to merge two rows Ri and Rj into one entry
# which spans the start of Ri to the end of Rj (with same name and strand)
COUNT = 0
RMDB.OUT = 0
while (COUNT < nrow(RMDB)) { # Cycle through all rows of RMDB
COUNT = COUNT + 1
# Is this position a merger start?
# If yes, then returns position in startpt
# which will be the end position in endpt
# If no, returns 0
Merge = match(COUNT,startpt,nomatch=0)
if ( Merge == 0 ){
# No merge starts at this position
RMDB.OUT = rbind(RMDB.OUT,RMDB[COUNT,]) # Append COUNT Row to output
}else{
# Merge COUNT row in RMDB to its endpoint
RMDB.OUT = rbind(RMDB.OUT,
TeMerge(RMDB[COUNT,],RMDB[endpt[Merge],]))
#print(paste('Merging Row',COUNT,' and Row',endpt[Merge]))
COUNT = endpt[Merge] # Move Count to the endpoint
}
}
This script gives exactly the results I am interested in, the only problem is that my data.frame has 5 000 000 entries and I would like to run the analysis using several parameters for GAP size to compare the results. Is there a way which I can rewrite this part of the code to be more efficient? Everything up until this point runs in reasonable amount of time (~couple of minutes). This part of has been going for 3+ hours on a 700 000 subset of the data.
EDIT 3: The spiked data which has all cases at the top (MIR3). Ignore Columns 1:4,8,11:15.
dput(RMDB) structure(list(V1 = c(3612L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 741L, 444L, 741L, 407L, 2059L, 407L, 656L, 2058L, 257L, 4051L, 456L, 351L, 850L, 335L, 1000L, 1566L, 236L, 588L, 3877L, 750L, 2292L, 783L, 747L, 666L, 260L, 1118L, 341L, 7010L, 320L, 7010L, 249L, 458L, 24L, 631L, 631L, 875L), V2 = c(11.4, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 18.7, 11.6, 24.9, 28.8, 14.1, 28.8, 23.6, 18.3, 25, 7, 8.2, 23.6, 24.9, 29.5, 13.5, 19.4, 34.8, 17.4, 22.9, 27.6, 12, 26.6, 30.4, 12.9, 38.5, 35.4, 27.8, 19.2, 0, 17.3, 21.2, 19.3, 0, 3.9, 26.6, 22.6), V3 = c(27, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 4.5, 0, 0, 7.3, 0.3, 7.3, 12.2, 4.9, 0, 0.4, 0, 1.8, 15.1, 12.7, 0, 3.6, 3, 7.5, 14, 9.4, 0, 14.1, 4.1, 4, 1.4, 9.4, 5.9, 2.7, 0, 9.3, 1, 0, 0, 0, 1.8, 9.5), V4 = c(1.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.1, 1.1, 3, 0.3, 3, 3, 0, 0, 0, 0, 3.6, 0.8, 2.7, 0, 2.8, 0.4, 0.8, 1.6, 6.4, 0, 3.8, 3.7, 0, 0, 2.6, 1.5, 2.5, 2.4, 1.4, 2, 5, 0, 0, 0.6, 0.5), V5 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"), V6 = c(10469L, 20001L, 20210L, 21000L, 21201L, 22000L, 22201L, 23000L, 23201L, 20000L, 20001L, 24001L, 24205L, 24405L, 0L, 30855L, 30953L, 31293L, 31436L, 31734L, 32841L, 33048L, 33466L, 33529L, 34048L, 34451L, 34565L, 35217L, 35367L, 37045L, 37733L, 38059L, 38256L, 39465L, 39624L, 39925L, 40333L, 40629L, 40736L, 41380L, 42370L, 43243L, 44836L, 44877L, 45887L, 46079L, 46217L, 46416L, 46553L, 46893L), V7 = c(11447L, 20200L, 20400L, 21200L, 21400L, 22200L, 22400L, 23200L, 23400L, 20001L, 20200L, 24200L, 24400L, 24600L, 0L, 30952L, 31131L, 31435L, 31733L, 31754L, 33037L, 33456L, 33509L, 34041L, 34108L, 34560L, 34921L, 35366L, 35499L, 37431L, 37861L, 38191L, 39464L, 39623L, 39924L, 40294L, 40626L, 40729L, 40878L, 42285L, 42504L, 44835L, 44876L, 45753L, 45987L, 46198L, 46240L, 46493L, 46722L, 47092L), V8 = structure(c(38L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 36L, 35L, 34L, 33L, 32L, 31L, 30L, 29L, 28L, 27L, 26L, 25L, 24L, 23L, 22L, 21L, 20L, 19L, 18L, 17L, 16L, 15L, 14L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("(249203529)", "(249203899)", "(249204128)", "(249204381)", "(249204423)", "(249204634)", "(249204868)", "(249205745)", "(249205786)", "(249208117)", "(249208336)", "(249209743)", "(249209892)", "(249209995)", "(249210327)", "(249210697)", "(249210998)", "(249211157)", "(249212430)", "(249212760)", "(249213190)", "(249215122)", "(249215255)", "(249215700)", "(249216061)", "(249216513)", "(249216580)", "(249217112)", "(249217165)", "(249217584)", "(249218867)", "(249218888)", "(249219186)", "(249219490)", "(249219669)", "(249219773)", "(249235266)", "(249239174)"), class = "factor"), V9 = structure(c(2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("+", "C"), class = "factor"), V10 = structure(c(32L, 23L, 23L, 23L, 23L, 23L, 24L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L, 22L, 12L, 5L, 14L, 13L, 16L, 30L, 25L, 2L, 7L, 16L, 15L, 29L, 28L, 3L, 28L, 15L, 4L, 18L, 8L, 19L, 10L, 31L, 10L, 9L, 11L, 6L, 17L, 20L, 21L ), .Label = c("AluJo", "AluJr", "AluSx", "AluSz6", "AluYc", "AT_rich", "Charlie5", "ERVL-E-int", "L1M5", "L1MA8", "L1MA9", "L1MB5", "L1P1", "L1PA6", "L2a", "L2c", "LTR12F", "LTR16C", "MamRep1527", "MER45A", "MER58A", "MIR", "MIR3", "MIR3a", "MIRb", "MIRc", "MLT1A", "MLT1E1A", "MLT1E1A-int", "MLT1J2", "(TAAA)n", "TAR1", "(TC)n", "XXXXX"), class = "factor"), V11 = structure(c(10L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 11L, 9L, 13L, 12L, 13L, 13L, 3L, 12L, 3L, 3L, 4L, 9L, 13L, 12L, 1L, 4L, 4L, 9L, 9L, 12L, 9L, 4L, 12L, 8L, 8L, 6L, 3L, 11L, 3L, 3L, 3L, 5L, 7L, 2L, 1L), .Label = c("DNA/hAT-Charlie", "DNA/hAT-Tip100", "LINE/L1", "LINE/L2", "Low_complexity", "LTR", "LTR/ERV1", "LTR/ERVL", "LTR/ERVL-MaLR", "Satellite/telo", "Simple_repeat", "SINE/Alu", "SINE/MIR", "XXXXXXXX"), class = "factor"), V12 = structure(c(19L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 9L, 8L, 25L, 2L, 10L, 9L, 23L, 2L, 1L, 15L, 12L, 1L, 6L, 2L, 11L, 16L, 13L, 7L, 2L, 2L, 8L, 14L, 1L, 3L, 26L, 17L, 18L, 9L, 21L, 22L, 24L, 2L, 4L, 27L, 20L), .Label = c("(0)", "1", "(113)", "(115)", "(119)", "(13)", "131", "173", "2", "218", "2234", "(231)", "2705", "2923", "2970", "3242", "359", "3715", "(399)", "(4)", "5306", "5334", "5746", "6167", "67", "(685)", "7"), class = "factor"), V13 = c(1712L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 162L, 96L, 349L, 217L, 298L, 238L, 216L, 6174L, 44L, 6154L, 3030L, 3156L, 448L, 255L, 133L, 2623L, 3375L, 2846L, 1489L, 172L, 301L, 666L, 3217L, 312L, 376L, 4982L, 499L, 5305L, 41L, 6290L, 5433L, 6280L, 24L, 404L, 178L, 220L), V14 = structure(c(23L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 8L, 1L, 10L, 25L, 4L, 13L, 21L, 1L, 11L, 26L, 15L, 14L, 20L, 30L, 5L, 2L, 1L, 27L, 1L, 18L, 3L, 4L, 7L, 6L, 9L, 19L, 22L, 29L, 1L, 2L, 28L, 16L, 1L, 17L, 1L, 12L), .Label = c("(0)", "(1)", "(11)", "(14)", "(179)", "208", "(209)", "(212)", "232", "(25)", "(255)", "3", "(30)", "3049", "(3116)", "(32)", "327", "(388)", "4016", "41", "(46)", "(470)", "483", "49", "(51)", "5640", "(573)", "(691)", "(838)", "91"), class = "factor"), V15 = c(2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 22L, 23L, 22L, 24L, 25L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 38L, 39L, 38L, 37L, 40L, 41L, 42L, 43L, 44L, 45L, 44L, 46L, 47L, 48L, 49L, 50L, 51L)), .Names = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14", "V15"), class = "data.frame", row.names = c(NA, -50L))
And here is the output after ValidMerge is applied.
dput(RMDB.OUT) structure(list(V1 = c(3612L, NA, NA, 318L, 318L, 318L, 318L, 318L, 318L, NA, 741L, 444L, 741L, 407L, 2059L, 407L, 656L, 2058L, 257L, 4051L, 456L, 351L, 850L, 335L, 1000L, 1566L, 236L, 588L, 3877L, 750L, 2292L, 783L, 747L, 666L, 260L, 1118L, 341L, 7010L, 320L, 7010L, 249L, 458L, 24L, 631L, 631L, 875L), V2 = c(11.4, NA, NA, 23, 23, 23, 23, 23, 23, NA, 18.7, 11.6, 24.9, 28.8, 14.1, 28.8, 23.6, 18.3, 25, 7, 8.2, 23.6, 24.9, 29.5, 13.5, 19.4, 34.8, 17.4, 22.9, 27.6, 12, 26.6, 30.4, 12.9, 38.5, 35.4, 27.8, 19.2, 0, 17.3, 21.2, 19.3, 0, 3.9, 26.6, 22.6), V3 = c(27, NA, NA, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, NA, 4.5, 0, 0, 7.3, 0.3, 7.3, 12.2, 4.9, 0, 0.4, 0, 1.8, 15.1, 12.7, 0, 3.6, 3, 7.5, 14, 9.4, 0, 14.1, 4.1, 4, 1.4, 9.4, 5.9, 2.7, 0, 9.3, 1, 0, 0, 0, 1.8, 9.5 ), V4 = c(1.3, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, 3.1, 1.1, 3, 0.3, 3, 3, 0, 0, 0, 0, 3.6, 0.8, 2.7, 0, 2.8, 0.4, 0.8, 1.6, 6.4, 0, 3.8, 3.7, 0, 0, 2.6, 1.5, 2.5, 2.4, 1.4, 2, 5, 0, 0, 0.6, 0.5), V5 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"), V6 = c(10469L, 20001L, 21000L, 22000L, 22201L, 23000L, 23201L, 20000L, 20001L, 24001L, 0L, 30855L, 30953L, 31293L, 31436L, 31734L, 32841L, 33048L, 33466L, 33529L, 34048L, 34451L, 34565L, 35217L, 35367L, 37045L, 37733L, 38059L, 38256L, 39465L, 39624L, 39925L, 40333L, 40629L, 40736L, 41380L, 42370L, 43243L, 44836L, 44877L, 45887L, 46079L, 46217L, 46416L, 46553L, 46893L), V7 = c(11447L, 20400L, 21400L, 22200L, 22400L, 23200L, 23400L, 20001L, 20200L, 24600L, 0L, 30952L, 31131L, 31435L, 31733L, 31754L, 33037L, 33456L, 33509L, 34041L, 34108L, 34560L, 34921L, 35366L, 35499L, 37431L, 37861L, 38191L, 39464L, 39623L, 39924L, 40294L, 40626L, 40729L, 40878L, 42285L, 42504L, 44835L, 44876L, 45753L, 45987L, 46198L, 46240L, 46493L, 46722L, 47092L), V8 = structure(c(38L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 36L, 35L, 34L, 33L, 32L, 31L, 30L, 29L, 28L, 27L, 26L, 25L, 24L, 23L, 22L, 21L, 20L, 19L, 18L, 17L, 16L, 15L, 14L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("(249203529)", "(249203899)", "(249204128)", "(249204381)", "(249204423)", "(249204634)", "(249204868)", "(249205745)", "(249205786)", "(249208117)", "(249208336)", "(249209743)", "(249209892)", "(249209995)", "(249210327)", "(249210697)", "(249210998)", "(249211157)", "(249212430)", "(249212760)", "(249213190)", "(249215122)", "(249215255)", "(249215700)", "(249216061)", "(249216513)", "(249216580)", "(249217112)", "(249217165)", "(249217584)", "(249218867)", "(249218888)", "(249219186)", "(249219490)", "(249219669)", "(249219773)", "(249235266)", "(249239174)"), class = "factor"), V9 = structure(c(2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("+", "C"), class = "factor"), V10 = structure(c(32L, 23L, 23L, 23L, 24L, 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L, 22L, 12L, 5L, 14L, 13L, 16L, 30L, 25L, 2L, 7L, 16L, 15L, 29L, 28L, 3L, 28L, 15L, 4L, 18L, 8L, 19L, 10L, 31L, 10L, 9L, 11L, 6L, 17L, 20L, 21L), .Label = c("AluJo", "AluJr", "AluSx", "AluSz6", "AluYc", "AT_rich", "Charlie5", "ERVL-E-int", "L1M5", "L1MA8", "L1MA9", "L1MB5", "L1P1", "L1PA6", "L2a", "L2c", "LTR12F", "LTR16C", "MamRep1527", "MER45A", "MER58A", "MIR", "MIR3", "MIR3a", "MIRb", "MIRc", "MLT1A", "MLT1E1A", "MLT1E1A-int", "MLT1J2", "(TAAA)n", "TAR1", "(TC)n", "XXXXX"), class = "factor"), V11 = structure(c(10L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 11L, 9L, 13L, 12L, 13L, 13L, 3L, 12L, 3L, 3L, 4L, 9L, 13L, 12L, 1L, 4L, 4L, 9L, 9L, 12L, 9L, 4L, 12L, 8L, 8L, 6L, 3L, 11L, 3L, 3L, 3L, 5L, 7L, 2L, 1L), .Label = c("DNA/hAT-Charlie", "DNA/hAT-Tip100", "LINE/L1", "LINE/L2", "Low_complexity", "LTR", "LTR/ERV1", "LTR/ERVL", "LTR/ERVL-MaLR", "Satellite/telo", "Simple_repeat", "SINE/Alu", "SINE/MIR", "XXXXXXXX"), class = "factor"), V12 = structure(c(19L, NA, NA, 5L, 5L, 5L, 5L, 5L, 5L, NA, 2L, 9L, 8L, 25L, 2L, 10L, 9L, 23L, 2L, 1L, 15L, 12L, 1L, 6L, 2L, 11L, 16L, 13L, 7L, 2L, 2L, 8L, 14L, 1L, 3L, 26L, 17L, 18L, 9L, 21L, 22L, 24L, 2L, 4L, 27L, 20L), .Label = c("(0)", "1", "(113)", "(115)", "(119)", "(13)", "131", "173", "2", "218", "2234", "(231)", "2705", "2923", "2970", "3242", "359", "3715", "(399)", "(4)", "5306", "5334", "5746", "6167", "67", "(685)", "7"), class = "factor"), V13 = c(1712L, NA, NA, 143L, 143L, 143L, 143L, 143L, 143L, NA, 162L, 96L, 349L, 217L, 298L, 238L, 216L, 6174L, 44L, 6154L, 3030L, 3156L, 448L, 255L, 133L, 2623L, 3375L, 2846L, 1489L, 172L, 301L, 666L, 3217L, 312L, 376L, 4982L, 499L, 5305L, 41L, 6290L, 5433L, 6280L, 24L, 404L, 178L, 220L), V14 = structure(c(23L, NA, NA, 24L, 24L, 24L, 24L, 24L, 24L, NA, 8L, 1L, 10L, 25L, 4L, 13L, 21L, 1L, 11L, 26L, 15L, 14L, 20L, 30L, 5L, 2L, 1L, 27L, 1L, 18L, 3L, 4L, 7L, 6L, 9L, 19L, 22L, 29L, 1L, 2L, 28L, 16L, 1L, 17L, 1L, 12L), .Label = c("(0)", "(1)", "(11)", "(14)", "(179)", "208", "(209)", "(212)", "232", "(25)", "(255)", "3", "(30)", "3049", "(3116)", "(32)", "327", "(388)", "4016", "41", "(46)", "(470)", "483", "49", "(51)", "5640", "(573)", "(691)", "(838)", "91"), class = "factor"), V15 = c(2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 22L, 23L, 22L, 24L, 25L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 38L, 39L, 38L, 37L, 40L, 41L, 42L, 43L, 44L, 45L, 44L, 46L, 47L, 48L, 49L, 50L, 51L)), .Names = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14", "V15"), class = "data.frame", row.names = c(NA, -46L))
Edit 4: Sorry about that, here is the simplified version: Initial Data.frame
dput(RMDB.cut)
structure(list(Chromosome = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"), Start = c(10469L, 20001L, 20210L, 21000L, 21201L, 22000L, 22201L, 23000L, 23201L, 20000L, 20001L, 24001L, 24205L, 24405L, 0L, 30855L, 30953L, 31293L, 31436L, 31734L), End = c(11447L, 20200L, 20400L, 21200L, 21400L, 22200L, 22400L, 23200L, 23400L, 20001L, 20200L, 24200L, 24400L, 24600L, 0L, 30952L, 31131L, 31435L, 31733L, 31754L),
(Left)
= structure(c(38L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 36L, 35L, 34L, 33L, 32L, 31L), .Label = c("(249203529)", "(249203899)", "(249204128)", "(249204381)", "(249204423)", "(249204634)", "(249204868)", "(249205745)", "(249205786)", "(249208117)", "(249208336)", "(249209743)", "(249209892)", "(249209995)", "(249210327)", "(249210697)", "(249210998)", "(249211157)", "(249212430)", "(249212760)", "(249213190)", "(249215122)", "(249215255)", "(249215700)", "(249216061)", "(249216513)", "(249216580)", "(249217112)", "(249217165)", "(249217584)", "(249218867)", "(249218888)", "(249219186)", "(249219490)", "(249219669)", "(249219773)", "(249235266)", "(249239174)"), class = "factor"), Strand = structure(c(2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("+", "C"), class = "factor"), repName = structure(c(32L, 23L, 23L, 23L, 23L, 23L, 24L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L), .Label = c("AluJo", "AluJr", "AluSx", "AluSz6", "AluYc", "AT_rich", "Charlie5", "ERVL-E-int", "L1M5", "L1MA8", "L1MA9", "L1MB5", "L1P1", "L1PA6", "L2a", "L2c", "LTR12F", "LTR16C", "MamRep1527", "MER45A", "MER58A", "MIR", "MIR3", "MIR3a", "MIRb", "MIRc", "MLT1A", "MLT1E1A", "MLT1E1A-int", "MLT1J2", "(TAAA)n", "TAR1", "(TC)n", "XXXXX"), class = "factor"), ValidMerge = c(FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE)), .Names = c("Chromosome", "Start", "End", "(Left)", "Strand", "repName", "ValidMerge"), row.names = c(NA, 20L), class = "data.frame")
And the output after merger
dput(RMDB.out.cut)
structure(list(Chromosome = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"), Start = c("10469", "20001", "21000", "22000", "22201", "23000", "23201", "20000", "20001", "24001", "0", "30855", "30953", "31293", "31436", "31734"), End = c("11447", "20400", "21400", "22200", "22400", "23200", "23400", "20001", "20200", "24600", "0", "30952", "31131", "31435", "31733", "31754"),
(Left)
= structure(c(38L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 36L, 35L, 34L, 33L, 32L, 31L), .Label = c("(249203529)", "(249203899)", "(249204128)", "(249204381)", "(249204423)", "(249204634)", "(249204868)", "(249205745)", "(249205786)", "(249208117)", "(249208336)", "(249209743)", "(249209892)", "(249209995)", "(249210327)", "(249210697)", "(249210998)", "(249211157)", "(249212430)", "(249212760)", "(249213190)", "(249215122)", "(249215255)", "(249215700)", "(249216061)", "(249216513)", "(249216580)", "(249217112)", "(249217165)", "(249217584)", "(249218867)", "(249218888)", "(249219186)", "(249219490)", "(249219669)", "(249219773)", "(249235266)", "(249239174)" ), class = "factor"), Strand = structure(c(2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("+", "C"), class = "factor"), repName = structure(c(32L, 23L, 23L, 23L, 24L, 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L), .Label = c("AluJo", "AluJr", "AluSx", "AluSz6", "AluYc", "AT_rich", "Charlie5", "ERVL-E-int", "L1M5", "L1MA8", "L1MA9", "L1MB5", "L1P1", "L1PA6", "L2a", "L2c", "LTR12F", "LTR16C", "MamRep1527", "MER45A", "MER58A", "MIR", "MIR3", "MIR3a", "MIRb", "MIRc", "MLT1A", "MLT1E1A", "MLT1E1A-int", "MLT1J2", "(TAAA)n", "TAR1", "(TC)n", "XXXXX"), class = "factor"), ValidMerge = c(FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE )), .Names = c("Chromosome", "Start", "End", "(Left)", "Strand", "repName", "ValidMerge"), row.names = c("2", "3", "4", "6", "7", "8", "9", "10", "11", "111", "15", "16", "17", "18", "19", "20" ), class = "data.frame")
回答1:
I think the strategy should be to generate another column called DoMerge - and for each row R_i (where i ranges 1 to n-1), DoMerge is TRUE if Name and Strand match between R_i and R_{i+1}, and End for R_i is sufficiently close to Start for R_{i+1} (within 100, if that's the right value). DoMerge for row n is FALSE by convention. Intuitively, DoMerge being TRUE means that it is valid to merge that row with the following row.
Then, we merge together all rows where there is a consecutive string of TRUEs. I can knock up some quick code for that if we agree that's the best strategy! :)
UPDATE:
Here's code for the task, assuming that mydf is the data frame of information with columns Name, Strand, Start and End... the output of the below is the start and end points over which you need to merge - though once you know what needs merging it should be a cinch :)
distanceThresh = 100
isSameName=(mydf$Name==c(mydf$Name[-1],"void"))
isSameStrand=(mydf$Strand==c(mydf$Strand[-1],"void"))
isWithinDistance=(c(mydf$Start[-1],max(mydf$End)+2*distanceThresh)
-mydf$End) <= distanceThresh
validMerge = isSameName & isSameStrand & isWithinDistance
fthent=which(!validMerge & c(validMerge[-1],FALSE))
tthenf=which(validMerge & !c(validMerge[-1],TRUE))
startpt = fthent+1; if (validMerge[1]) {startpt=c(1,startpt)}
endpt = tthenf+1
instructions=NULL
for (kk in seq_along(startpt)) {
instructions = c(instructions,
paste("Merge",startpt[kk],"to",endpt[kk],"inclusive",sep=" "))
}
Let me know if this all makes sense! :)
MERGING METHOD (8 June):
How about something like this (has had some testing, but not on the real data)...
doMerge = unlist(mapply(function(x,y) {seq(x,y,1)},startpt,endpt))
doNotMerge = setdiff(seq_along(validMerge),doMerge)
dataMerged=data.frame(Name=RMDB$Name[startpt], Strand=RMDB$Strand[startpt],
Start=RMDB$Start[startpt], End=RMDB$End[endpt],
stringsAsFactors=FALSE)
dataUnmerged=RMDB[doNotMerge,]
Basically doMerge
tells you the lines that require merging (just set up a sequence running from each startpt
to the corresponding endpt
). And doNotMerge
is all the other lines (assuming the length of validMerge
is the length of the data).
Then dataMerged
just constructs directly what the merged data should look like - obviously Name
, Strand
and Start
are inherited from row startpt
and End
is inherited from row endpt
. (If there are other columns of interest, you'll have to decide where they come from, obviously...) The number of rows in dataMerged
matches the length of startpt
and endpt
, of course. Finally, dataUnmerged
is all rows that were ineligible for merging.
Hopefully the above all makes sense, and it's clear that if you combine dataMerged
and dataUnmerged
and reorder to get everything back in the original sequence (presumably there's an index column that can be used for this), then you have the desired outcome.
And I expect the above will run very, very fast indeed :)
回答2:
Here's the GenomicRanges solution. The first step, one-time only, is to install the package and its dependencies
source("http://bioconductor.org/biocLite.R")
biocLite("GenomicRanges")
Then load the package and create a GRanges
instance from your data, which I've put in a data frame called df
library(GenomicRanges)
gr = with(df, GRanges(Chromosome, IRanges(Start, End), Strand, repName=repName))
Your data is actually a little more complicated, a 'list of GRanges', where each list element is defined by a gene, so
grl = split(gr, values(gr)$repName)
You'd like to reduce
this on an element-by-element basis, allowing for a reduction when the minimum gap width between adjacent elements is 100. So
merged = reduce(grl, min.gapwidth=100L)
You could coerce this back to a data.frame
with as(merged, "data.frame")
. The result prior to coercion looks like
> merged
GRangesList of length 8:
$AluJo
GRanges with 1 range and 0 elementMetadata cols:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr1 [31436, 31733] +
$MIR3
GRanges with 7 ranges and 0 elementMetadata cols:
seqnames ranges strand
[1] chr1 [20001, 20400] +
[2] chr1 [23201, 23400] +
[3] chr1 [24001, 24600] +
[4] chr1 [20000, 20001] -
[5] chr1 [21000, 21400] -
[6] chr1 [22000, 22200] -
[7] chr1 [23000, 23200] -
and as a data.frame
> as(merged, "data.frame")
element seqnames start end width strand
1 AluJo chr1 31436 31733 298 +
2 MIR3 chr1 20001 20400 400 +
3 MIR3 chr1 23201 23400 200 +
4 MIR3 chr1 24001 24600 600 +
5 MIR3 chr1 20000 20001 2 -
6 MIR3 chr1 21000 21400 401 -
7 MIR3 chr1 22000 22200 201 -
8 MIR3 chr1 23000 23200 201 -
For a million rows, arranged into 100000 genes, we have
> length(grl)
[1] 100000
> table(elementLengths(grl))
10
100000
> system.time(reduce(grl, min.gapwidth=100))
user system elapsed
9.468 0.064 9.553
回答3:
I would first re-vectorize the different columns from the data-frame (df
). Do the necessary operations as below and then put them inside a data-frame
name_strand=paste(df$Name,df$Strand,sep=",") #for grouping
st=df$Start
en=df$End
unq=unique(name_strand) #get the unique Name+Strand tags
ls1=list()
for(i in 1:length(unq)){
index=which(name_strand %in% unq[i]) #get the index ids or group by unique Name+Strand
un_ls=unlist(strsplit(unq[i],split=","))
ls1[[i]]=list(Name=un_ls[1],Strand=un_ls[2],Start=min(st[index]),End=max(en[index]))
}
df=as.data.frame(do.call(rbind, ls1))
So this would also solve the situation when you have the Name
and Strand
occurring elsewhere in the data-frame
.
EDIT:
Since there are doubts in your question, I have a modified solution in case you want gap_distance
<100 and also taking into consideration that the data-frame
rows may not be ordered. So my solution takes both these into consideration (you can modify it if the rows are already ordered in the descending order of Start
and End
:
for(i in 1:length(unq)){
index=which(name_strand %in% unq[i]) #get the index ids or group by unique Name+Strand
un_ls=unlist(strsplit(unq[i],split=","))
#previous sol: ls1[[i]]=list(Name=un_ls[1],Strand=un_ls[2],Start=min(st[index]),End=max(en[index]))
new_st=st[index] #new Start vector from the "st" vector for this group by name_strand
new_en=en[index]
new_st=new_st[order(new_st)] #when data-frame rows are not ordered:
new_en=new_en[order(new_en)] #"order" method gives the index in ascending order of START
#using embed method to lag and taking the diff for checking with gap distance considerations
gap_diff=embed(new_st,2)[,1]-new_en[-length(new_en)] #subtract from End vector except the last element of End
a_ind_st=1 #index of new_st (Start) vector
b_ind_en=1 #index of new_en (End) vector
ls_ind=1 #index for list
for (j in 1:(length(new_en)-1)){ #also acts as the index for gap_diff
if (gap_diff[j] < gap_distance){ #where gap_distance = 100 (consideration #2) < or <=
b_ind_en=j+1
if (j + 1 > length(new_en)-1){ #special case for last element in vector
ls1[[ls_ind]]=list(Name=un_ls[1],Strand=un_ls[2],Start=new_st[a_ind_st],End=new_en[b_ind_en]))
ls_ind=ls_ind+1 #increment list index
}
}else{
ls1[[ls_ind]]=list(Name=un_ls[1],Strand=un_ls[2],Start=new_st[a_ind_st],End=new_en[b_ind_en]))
ls_ind=ls_ind+1 #increment list index
a_ind_st=b_ind_en+1
b_ind_en=b_ind_en+1
}
}
}
df=as.data.frame(do.call(rbind, ls1))
This modification will take everything into consideration. If you don't require any restrictions, you can modify the solution.
来源:https://stackoverflow.com/questions/10926948/custom-merge-function-in-r