I have millions of pairs of strings of the same length, which I want to compare to find the positions where they mismatch.
For example for each $str1 a
Here is a benchmarking script to measure the differences in speed between the various approaches. Just keep in mind that there will be a lag the first time a script using Inline::C is run, because the C compiler has to be invoked, etc. So, run the script once, and then benchmark.
#!/usr/bin/perl
use strict;
use warnings;
use Benchmark qw( cmpthese );
# Repetition factor for the test strings: first command-line argument,
# falling back to 1 when it is missing or false (e.g. 0 or the empty string).
my $copies = $ARGV[0] || 1;

# $x is the reference string; $y and $z are variants of the same 8-character
# motif that each differ from it in a few positions. All three are repeated
# $copies times so the benchmark can be scaled up.
my ($x, $y, $z) = map { $_ x $copies } qw( ATTCCGGG ATTGCGGG ATACCGGC );
# Run one comparison routine against the file-level reference string $x.
#
#   $func - code ref taking two strings and returning an array ref of
#           mismatch positions
#   @args - strings to compare against $x, one call per string
#
# Returns nothing. The result of each call is inspected so that the work
# being benchmarked is actually used rather than thrown away.
sub wrapper {
    my ($func, @args) = @_;

    foreach my $candidate (@args) {
        my $differences = $func->($x, $candidate);

        # Touch the result so it is not simply discarded.
        print "There is no difference\n" unless @$differences;
    }

    return;
}
# The three implementations under test, keyed by the label that appears in
# the benchmark report. Each one compares $x against both $y and $z.
my %contenders = (
    explode  => sub { wrapper(\&where_do_they_differ, $y, $z) },
    mism_pos => sub { wrapper(\&mism_pos, $y, $z) },
    inline_c => sub {
        wrapper(\&i_dont_know_how_to_do_stuff_with_inline_c, $y, $z);
    },
);

# A negative count asks Benchmark to run each entry for at least that many
# CPU seconds (here 5) instead of a fixed number of iterations.
cmpthese(-5, \%contenders);
# "Explode" approach: split both strings into arrays of single characters
# and compare them index by index.
#
#   $str1, $str2 - the strings to compare (expected to be the same length;
#                  the range of indices checked is taken from $str1)
#
# Returns an array ref of the zero-based positions at which the two strings
# differ; the array is empty when they match everywhere.
sub where_do_they_differ {
    my ($str1, $str2) = @_;

    my @chars1 = split //, $str1;
    my @chars2 = split //, $str2;
    my $last   = length($str1) - 1;

    return [ grep { $chars1[$_] ne $chars2[$_] } 0 .. $last ];
}
# substr approach: walk the strings by index and compare one character at a
# time without building intermediate arrays.
#
#   $str1, $str2 - the strings to compare (expected to be the same length;
#                  the range of indices checked is taken from $str1)
#
# Returns an array ref of the zero-based positions at which the two strings
# differ; the array is empty when they match everywhere.
sub mism_pos {
    my ($str1, $str2) = @_;

    my $last_index = length($str1) - 1;
    my @positions;

    for my $offset (0 .. $last_index) {
        # Matching characters are of no interest; move on.
        next if substr($str1, $offset, 1) eq substr($str2, $offset, 1);
        push @positions, $offset;
    }

    return \@positions;
}
# Adapter around the C routine find_diffs: forwards its arguments unchanged,
# collects the list of positions that find_diffs returns, and hands them back
# as an array ref so the result has the same shape as the pure-Perl versions.
sub i_dont_know_how_to_do_stuff_with_inline_c {
    my @positions = find_diffs(@_);
    return \@positions;
}
# C implementation of the mismatch search, embedded through Inline::C. The
# heredoc is single-quoted, so Perl passes the C source through verbatim.
#
# find_diffs(x, y) advances a single index through both strings at once and
# stops as soon as either string reaches its terminating NUL, so only the
# common prefix is compared. For every index where the two bytes differ it
# pushes a mortal integer SV holding that zero-based index onto the return
# stack; nothing is pushed for strings that agree. Called in list context
# from Perl, this yields the list of mismatch positions.
#
# NOTE(review): the comparison is byte-wise on char*, so strings containing
# embedded NUL bytes or multi-byte characters presumably will not give
# per-character positions -- confirm if inputs are ever not plain ASCII.
use Inline C => << 'EOC';
void find_diffs(char* x, char* y) {
int i;
Inline_Stack_Vars;
Inline_Stack_Reset;
for(i=0; x[i] && y[i]; ++i) {
if(x[i] != y[i]) {
Inline_Stack_Push(sv_2mortal(newSViv(i)));
}
}
Inline_Stack_Done;
}
EOC
Results (using VC++ 9 on Windows XP with AS Perl 5.10.1) with $copies = 1:
Rate explode mism_pos inline_c
explode 15475/s -- -64% -84%
mism_pos 43196/s 179% -- -56%
inline_c 98378/s 536% 128% --
Results with $copies = 100:
Rate explode mism_pos inline_c
explode 160/s -- -86% -99%
mism_pos 1106/s 593% -- -90%
inline_c 10808/s 6667% 877% --