Fastest Way To Find Mismatch Positions Between Two Strings of the Same Length

前端 未结 9 1386
后悔当初
后悔当初 2020-12-28 22:08

I have a millions of pairs of string of same length which I want to compare and find the position where it has mismatches.

For example for each $str1 a

9条回答
  •  夕颜
    夕颜 (楼主)
    2020-12-28 22:38

    I don't know how efficient it is, but you could always xor the two strings you are matching, and find the index of the first mismatch.

    #! /usr/bin/env perl
    use strict;
    use warnings;
    use 5.10.1;
    
    my $str_source = "ATTCCGGG";
    
    my $str1       = "ATTGCGGG";
    my $str2       = "ATACCGGC";
    my $str3       = "GTTCCGGG";
    
    # this returns the index of all of the mismatches (zero based)
    # it returns an empty list if the two strings match.
    sub diff_index{
      my($a,$b) = @_;
      my $cmp = $a^$b;
    
      my @cmp;
      while( $cmp =~ /[^\0]/g ){ # match non-zero byte
        push @cmp, pos($cmp) - 1;
      }
      return @cmp;
    }
    
    for my $str ( $str_source, $str1, $str2, $str3 ){
      say '# "', $str, '"';
      my @ret = diff_index $str_source, $str;
      if( @ret ){
        say '[ ', join( ', ', @ret), ' ]';
      }else{
        say '#   match';
      }
    }
    
    # "ATTCCGGG"
    #   match
    # "ATTGCGGG"
    [ 3 ]
    # "ATACCGGC"
    [ 2, 7 ]
    # "GTTCCGGG"
    [ 0 ]
    

    Running it through B::Concise shows that the CPU expensive operations, happen as single opcodes. Which means that those operations are run in C.

    perl -MO=Concise,-exec,-compact,-src,diff_index test.pl |
    perl -pE's/^[^#].*? \K([^\s]+)$/# $1/' # To fix highlighting bugs
    
    main::diff_index:
    # 15:   my($a,$b) = @_;
    1  <;> nextstate(main 53 test.pl:15) # v:%,*,&,$
    2  <0> pushmark # s
    3  <$> gv(*_) # s
    4  <1> rv2av[t3] # lK/3
    5  <0> pushmark # sRM*/128
    6  <0> padsv[$a:53,58] # lRM*/LVINTRO
    7  <0> padsv[$b:53,58] # lRM*/LVINTRO
    8  <2> aassign[t4] # vKS
    # 16:   my $cmp = $a^$b;
    9  <;> nextstate(main 54 test.pl:16) # v:%,*,&,$
    a  <0> padsv[$a:53,58] # s
    b  <0> padsv[$b:53,58] # s
    c  <2> bit_xor[t6] # sK                     <-----  Single OP -----
    d  <0> padsv[$cmp:54,58] # sRM*/LVINTRO
    e  <2> sassign # vKS/2
    # 18:   my @cmp;
    f  <;> nextstate(main 55 test.pl:18) # v:%,*,&,{,$
    g  <0> padav[@cmp:55,58] # vM/LVINTRO
    # 20:   while( $cmp =~ /[^\0]/g ){ # match non-zero byte
    h  <;> nextstate(main 57 test.pl:20) # v:%,*,&,{,$
    i  <{> enterloop(next->r last->v redo->j) # v
    s  <0> padsv[$cmp:54,58] # s
    t   match(/"[^\\0]"/) # sKS/RTIME        <-----  Single OP -----
    u  <|> and(other->j) # vK/1
    # 21:     push @cmp, pos($cmp) - 1;
    j      <;> nextstate(main 56 test.pl:21) # v:%,*,&,$
    k      <0> pushmark # s
    l      <0> padav[@cmp:55,58] # lRM
    m      <0> padsv[$cmp:54,58] # sRM
    n      <1> pos[t8] # sK/1
    o      <$> const(IV 1) # s
    p      <2> subtract[t9] # sK/2
    q      <@> push[t10] # vK/2
    r      <0> unstack # v
               goto # s
    v  <2> leaveloop # vK/2
    # 24:   return @cmp;
    w  <;> nextstate(main 58 test.pl:24) # v:%,*,&,{,$
    x  <0> pushmark # s
    y  <0> padav[@cmp:55,58] 
    z  <@> return # K
    10 <1> leavesub[1 ref] # K/REFC,1
    

提交回复
热议问题