how to match dna sequence pattern

前端 未结 3 474
悲哀的现实
悲哀的现实 2020-12-14 03:25

I am getting a trouble finding an approach to solve this problem.

Input-output sequences are as follows :

 **input1 :** aaagctgctagag 
 **output1 :**         


        
3条回答
  •  误落风尘
    2020-12-14 03:48

    After trying my own way for a while, my kudos to jbaylina for his beautiful algorithm and C implementation. Here's my attempted version of jbaylina's algorithm in Haskell, and below it further development of my attempt at a linear-time algorithm that attempts to compress segments that include repeated patterns in a one-by-one fashion:

    import Data.Map (fromList, insert, size, (!))
    
    compress s = (foldl f (fromList [(0,([],0)),(1,([s!!0],1))]) [1..n - 1]) ! n  
     where
      n = length s
      f b i = insert (size b) bestCandidate b where
        add (sequence, sLength) (sequence', sLength') = 
          (sequence ++ sequence', sLength + sLength')
        j' = [1..min 100 i]
        bestCandidate = foldr combCandidates (b!i `add` ([s!!i,'1'],2)) j'
        combCandidates j candidate' = 
          let nextCandidate' = comb 2 (b!(i - j + 1) 
                           `add` ((take j . drop (i - j + 1) $ s) ++ "1", j + 1))
          in if snd nextCandidate' <= snd candidate' 
                then nextCandidate' 
                else candidate' where
            comb r candidate
              | r > uBound                         = candidate
              | not (strcmp r True)                = candidate
              | snd nextCandidate <= snd candidate = comb (r + 1) nextCandidate
              | otherwise                          = comb (r + 1) candidate
             where 
               uBound = div (i + 1) j
               prev = b!(i - r * j + 1)
               nextCandidate = prev `add` 
                 ((take j . drop (i - j + 1) $ s) ++ show r, j + length (show r))
               strcmp 1   _    = True
               strcmp num bool 
                 | (take j . drop (i - num * j + 1) $ s) 
                    == (take j . drop (i - (num - 1) * j + 1) $ s) = 
                      strcmp (num - 1) True
                 | otherwise = False
    

    Output:

    *Main> compress "aaagctgctagag"
    ("a3gct2ag2",9)
    
    *Main> compress "aaabbbaaabbbaaabbbaaabbb"
    ("aaabbb4",7)
    


    Linear-time attempt:

    import Data.List (sortBy)
    
    group' xxs sAccum (chr, count)
      | null xxs = if null chr 
                      then singles
                      else if count <= 2 
                              then reverse sAccum ++ multiples ++ "1"
                      else singles ++ if null chr then [] else chr ++ show count
      | [x] == chr = group' xs sAccum (chr,count + 1)
      | otherwise = if null chr 
                       then group' xs (sAccum) ([x],1) 
                       else if count <= 2 
                               then group' xs (multiples ++ sAccum) ([x],1)
                       else singles 
                            ++ chr ++ show count ++ group' xs [] ([x],1)
     where x:xs = xxs
           singles = reverse sAccum ++ (if null sAccum then [] else "1")
           multiples = concat (replicate count chr)
    
    sequences ws strIndex maxSeqLen = repeated' where
      half = if null . drop (2 * maxSeqLen - 1) $ ws 
                then div (length ws) 2 else maxSeqLen
      repeated' = let (sequence,(sequenceStart, sequenceEnd'),notSinglesFlag) = repeated
                  in (sequence,(sequenceStart, sequenceEnd'))
      repeated = foldr divide ([],(strIndex,strIndex),False) [1..half]
      equalChunksOf t a = takeWhile(==t) . map (take a) . iterate (drop a)
      divide chunkSize b@(sequence,(sequenceStart, sequenceEnd'),notSinglesFlag) = 
        let t = take (2*chunkSize) ws
            t' = take chunkSize t
        in if t' == drop chunkSize t
              then let ts = equalChunksOf t' chunkSize ws
                       lenTs = length ts
                       sequenceEnd = strIndex + lenTs * chunkSize
                       newEnd = if sequenceEnd > sequenceEnd' 
                                then sequenceEnd else sequenceEnd'
                   in if chunkSize > 1 
                         then if length (group' (concat (replicate lenTs t')) [] ([],0)) > length (t' ++ show lenTs)
                                 then (((strIndex,sequenceEnd,chunkSize,lenTs),t'):sequence, (sequenceStart,newEnd),True)
                                 else b
                         else if notSinglesFlag
                                 then b
                                 else (((strIndex,sequenceEnd,chunkSize,lenTs),t'):sequence, (sequenceStart,newEnd),False)
              else b
    
    addOne a b
      | null (fst b) = a
      | null (fst a) = b
      | otherwise = 
          let (((start,end,patLen,lenS),sequence):rest,(sStart,sEnd)) = a 
              (((start',end',patLen',lenS'),sequence'):rest',(sStart',sEnd')) = b
          in if sStart' < sEnd && sEnd < sEnd'
                then let c = ((start,end,patLen,lenS),sequence):rest
                         d = ((start',end',patLen',lenS'),sequence'):rest'
                     in (c ++ d, (sStart, sEnd'))
                else a
    
    segment xs baseIndex maxSeqLen = segment' xs baseIndex baseIndex where
      segment' zzs@(z:zs) strIndex farthest
        | null zs                              = initial
        | strIndex >= farthest && strIndex > 0 = ([],(0,0))
        | otherwise                            = addOne initial next
       where
         next@(s',(start',end')) = segment' zs (strIndex + 1) farthest'
         farthest' | null s = farthest
                   | otherwise = if start /= end && end > farthest then end else farthest
         initial@(s,(start,end)) = sequences zzs strIndex maxSeqLen
    
    areExclusive ((a,b,_,_),_) ((a',b',_,_),_) = (a' >= b) || (b' <= a)
    
    combs []     r = [r]
    combs (x:xs) r
      | null r    = combs xs (x:r) ++ if null xs then [] else combs xs r
      | otherwise = if areExclusive (head r) x
                       then combs xs (x:r) ++ combs xs r
                            else if l' > lowerBound
                                    then combs xs (x: reduced : drop 1 r) ++ combs xs r
                                    else combs xs r
     where lowerBound = l + 2 * patLen
           ((l,u,patLen,lenS),s) = head r
           ((l',u',patLen',lenS'),s') = x
           reduce = takeWhile (>=l') . iterate (\x -> x - patLen) $ u
           lenReduced = length reduce
           reduced = ((l,u - lenReduced * patLen,patLen,lenS - lenReduced),s)
    
    buildString origStr sequences = buildString' origStr sequences 0 (0,"",0)
       where
        buildString' origStr sequences index accum@(lenC,cStr,lenOrig)
          | null sequences = accum
          | l /= index     = 
              buildString' (drop l' origStr) sequences l (lenC + l' + 1, cStr ++ take l' origStr ++ "1", lenOrig + l')
          | otherwise      = 
              buildString' (drop u' origStr) rest u (lenC + length s', cStr ++ s', lenOrig + u')
         where
           l' = l - index
           u' = u - l  
           s' = s ++ show lenS       
           (((l,u,patLen,lenS),s):rest) = sequences
    
    compress []         _         accum = reverse accum ++ (if null accum then [] else "1")
    compress zzs@(z:zs) maxSeqLen accum
      | null (fst segment')                      = compress zs maxSeqLen (z:accum)
      | (start,end) == (0,2) && not (null accum) = compress zs maxSeqLen (z:accum)
      | otherwise                                =
          reverse accum ++ (if null accum || takeWhile' compressedStr 0 /= 0 then [] else "1")
          ++ compressedStr
          ++ compress (drop lengthOriginal zzs) maxSeqLen []
     where segment'@(s,(start,end)) = segment zzs 0 maxSeqLen
           combinations = combs (fst $ segment') []
           takeWhile' xxs count
             | null xxs                                             = 0
             | x == '1' && null (reads (take 1 xs)::[(Int,String)]) = count 
             | not (null (reads [x]::[(Int,String)]))               = 0
             | otherwise                                            = takeWhile' xs (count + 1) 
            where x:xs = xxs
           f (lenC,cStr,lenOrig) (lenC',cStr',lenOrig') = 
             let g = compare ((fromIntegral lenC + if not (null accum) && takeWhile' cStr 0 == 0 then 1 else 0) / fromIntegral lenOrig) 
                             ((fromIntegral lenC' + if not (null accum) && takeWhile' cStr' 0 == 0 then 1 else 0) / fromIntegral lenOrig')
             in if g == EQ 
                   then compare (takeWhile' cStr' 0) (takeWhile' cStr 0)
                   else g
           (lenCompressed,compressedStr,lengthOriginal) = 
             head $ sortBy f (map (buildString (take end zzs)) (map reverse combinations))
    

    Output:

    *Main> compress "aaaaaaaaabbbbbbbbbaaaaaaaaabbbbbbbbb" 100 []
    "a9b9a9b9"
    
    *Main> compress "aaabbbaaabbbaaabbbaaabbb" 100 []
    "aaabbb4"
    

提交回复
热议问题