Match sub-string within a string with tolerance of 1 character mismatch

前端未结

关注

 5  1660

I was going through some Amazon interview questions on CareerCup.com, and I came across this interesting question which I haven\'t been able to figure out how to do. I have

With arbitary no. of tolerance levels.

Worked for all the test cases I could think of. Loosely based on |/|ad's solution.

#include<stdio.h>
#include<string.h>

report (int x, char* str, char* sstr, int[] t) {
    if ( x )
        printf( "%s is a substring of %s for a tolerance[%d]\n",sstr,str[i],t[i] );
    else
        printf ( "%s is NOT a substring of %s for a tolerance[%d]\n",sstr,str[i],t[i] );
}

int find_with_tolerance (char *str, char *sstr, int tol) {

    if ( (*sstr) == '\0' ) //end of substring, and match
        return 1;

    if ( (*str) == '\0' ) //end of string
        if ( tol >= strlen(sstr) ) //but tol saves the day
            return 1;
        else    //there's nothing even the poor tol can do
            return 0;

    if ( *sstr == *str ) { //current char match, smooth
        return find_with_tolerance ( str+1, sstr+1, tol );
    } else {
        if ( tol <= 0 ) //that's it. no more patience
            return 0;
        for(int i=1; i<=tol; i++) {
            if ( *(str+i) == *sstr ) //insertioan of a foreign character
                return find_with_tolerance ( str+i+1, sstr+1, tol-i );
            if ( *str == *(sstr+i) ) //deal with dletion
                return find_with_tolerance ( str+1, sstr+i+1, tol-i );
            if ( *(str+i) == *(sstr+i)  ) //deal with riplacement
                return find_with_tolerance ( str+i+1, sstr+i+1, tol-i );
            if ( *(sstr+i) == '\0' ) //substr ends, thanks to tol & this loop
                return 1;
        }
        return 0; //when all fails
    }
}

int find (char *str, char *sstr, int tol ) {
    int w = 0;
    while (*str!='\0')
        w |= find_with_tolerance ( str++, sstr, tol );
    return (w) ? 1 : 0;
}

int main() {
    const int n=3; //no of test cases
    char *sstr = "dog"; //the substr
    char *str[n] = { "doox", //those cases
                    "xxxxxd",
                    "xxdogxx" };
    int t[] = {1,1,0}; //tolerance levels for those cases
    for(int i = 0; i < n; i++) {
        report( find ( *(str+i), sstr, t[i] ), *(str+i), sstr, t[i] );
    }
    return 0;
}

0 讨论(0)

感情败类

2020-12-28 11:33

This seems to work, let me know if you find any errors and I'll try to fix them:

int findHelper(const char *str, const char *substr, int mustMatch = 0)
{
    if ( *substr == '\0' )
        return 1;

    if ( *str == '\0' )
        return 0;

    if ( *str == *substr )
        return findHelper(str + 1, substr + 1, mustMatch);
    else
    {
        if ( mustMatch )
            return 0;

        if ( *(str + 1) == *substr )
            return findHelper(str + 1, substr, 1);
        else if ( *str == *(substr + 1) )
            return findHelper(str, substr + 1, 1);
        else if ( *(str + 1) == *(substr + 1) )
            return findHelper(str + 1, substr + 1, 1);
        else if ( *(substr + 1) == '\0' )
            return 1;
        else
            return 0;
    }
}

int find(const char *str, const char *substr)
{
    int ok = 0;
    while ( *str != '\0' )
        ok |= findHelper(str++, substr, 0);

    return ok;
}


int main()
{
    printf("%d\n", find("xxxdoogyyyy", "dog"));
    printf("%d\n", find("xxxdgyyyy", "dog"));
    printf("%d\n", find("xxxdigyyyy", "dog"));
}

Basically, I make sure only one character can differ, and run the function that does this for every suffix of the haystack.

0 讨论(0)

旧巷少年郎

2020-12-28 11:39

This is related to a classical problem of IT, referred to as Levenshtein distance. See Wikibooks for a bunch of implementations in different languages.

0 讨论(0)
发布评论:

提交评论
- 加载中...

轮回少年

2020-12-28 11:40

This is slightly different than the earlier solution, but I was intrigued by the problem and wanted to give it a shot. Obviously optimize if desired, I just wanted a solution.

char *match(char *str, char *substr, int tolerance)
{
  if (! *substr) return str;
  if (! *str) return NULL;

  while (*str)
  {
    char *str_p;
    char *substr_p;
    char *matches_missing;
    char *matches_mismatched;

    str_p = str;
    substr_p = substr;

    while (*str_p && *substr_p && *str_p == *substr_p)
    {
      str_p++;
      substr_p++;
    }
    if (! *substr_p) return str;
    if (! tolerance)
    {
      str++;
      continue;
    }

    if (strlen(substr_p) <= tolerance) return str;

    /* missed due to a missing letter */
    matches_missing = match(str_p, substr_p + 1, tolerance - 1);
    if (matches_missing == str_p) return str;

    /* missed due to a mismatch of letters */
    matches_mismatched = match(str_p + 1, substr_p + 1, tolerance - 1);
    if (matches_mismatched == str_p + 1) return str;

    str++;
  }
  return NULL;
}

0 讨论(0)

感情败类

2020-12-28 11:45

Is the problem to do this efficiently?

The naive solution is to loop over every substring of size substr in str, from left to right, and return true if the current substring if only one of the characters is different in a comparison.

Let n = size of str Let m = size of substr

There are O(n) substrings in str, and the matching step takes time O(m). Ergo, the naive solution runs in time O(n*m)

0 讨论(0)
发布评论:

提交评论
- 加载中...