Algorithms for “fuzzy matching” strings

后端 未结 6 552
傲寒
傲寒 2020-12-12 13:23

By fuzzy matching I don't mean similar strings by Levenshtein distance or something similar, but the way it's used in TextMate/Ido/Icicles: given a list of strings, find t

6条回答
  •  情书的邮戳
    2020-12-12 13:43

    Levenshtein 'Edit Distance' algorithms will definitely work on what you're trying to do: they will give you a measurement of how closely two words or addresses or phone numbers, psalms, monologues and scholarly articles match each other, allowing you to rank the results and choose the best match.

    A more lightweight approach is to count up the common substrings: it's not as good as Levenshtein, but it provides usable results and runs quickly in slow languages which have access to fast 'InString' functions.

    I published an Excel 'Fuzzy Lookup' in Excellerando a few years ago, using a 'FuzzyMatchScore' function that is, as far as I can tell, exactly what you need:

    http://excellerando.blogspot.com/2010/03/vlookup-with-fuzzy-matching-to-get.html

    It is, of course, in Visual Basic for Applications. Proceed with caution, crucifixes and garlic:

    Public Function SumOfCommonStrings( _
                                ByVal s1 As String, _
                                ByVal s2 As String, _
                                Optional Compare As VBA.VbCompareMethod = vbTextCompare, _
                                Optional iScore As Integer = 0 _
                                    ) As Integer
    
    ' N.Heffernan 06 June 2006
    ' THIS CODE IS IN THE PUBLIC DOMAIN
    '
    ' Measures how much of the shorter string is made up of substrings
    ' that also occur in the longer string.
    '
    ' Parameters:
    '   s1, s2    The two strings to compare. They are swapped internally
    '             so that s1 is always the shorter one; the arguments are
    '             ByVal, so the caller's strings are never modified.
    '   Compare   Comparison method handed to InStr (default vbTextCompare).
    '   iScore    Running total carried through the recursion. External
    '             callers should leave it at the default of 0. It is passed
    '             ByRef (the VBA default), so a variable supplied here is
    '             updated in place.
    '
    ' Returns:
    '   iScore plus the summed lengths of the common substrings found.
    '
    ' This function uses a modified Longest Common String algorithm.
    ' Simple LCS algorithms are unduly sensitive to single-letter
    ' deletions/changes near the midpoint of the test words, eg:
    ' Wednesday is obviously closer to WedXesday on an edit-distance
    ' basis than it is to WednesXXX. So it is better to score the 'Wed'
    ' as well as the 'esday' and add up the total matched.
    '
    ' Watch out for strings of differing lengths:
    '
    '    SumOfCommonStrings("Wednesday", "WednesXXXday")
    '
    ' This scores the same as:
    '
    '    SumOfCommonStrings("Wednesday", "Wednesday")
    '
    ' So make sure the calling function uses the length of the longest
    ' string when calculating the degree of similarity from this score.
    '
    ' This is coded for clarity, not for performance.
    
    Dim n As Integer        ' length of s1
    Dim m As Integer        ' length of s2
    Dim i As Integer        ' start position of the candidate in s1
    Dim j As Integer        ' position of the candidate in s2 (0 = not found)
    Dim len1 As Integer     ' length of the candidate substring
    Dim subs1 As String     ' candidate substring of s1
    Dim sSwap As String     ' scratch variable for swapping s1 and s2
    
    Application.Volatile False
    
    ' Default result: whatever has been scored so far.
    SumOfCommonStrings = iScore
    
    n = Len(s1)
    m = Len(s2)
    
    ' Nothing can match an empty string.
    If n = 0 Or m = 0 Then
        Exit Function
    End If
    
    ' s1 should always be the shorter of the two strings:
    If n > m Then
        sSwap = s2
        s2 = s1
        s1 = sSwap
        n = Len(s1)
        m = Len(s2)
    End If
    
    ' Special case: s1 is an exact substring of s2 (this also covers
    ' s1 = s2, and honours the Compare argument). The whole of s1 is
    ' matched, so add its length to the running total rather than
    ' replacing the total with it.
    If InStr(1, s2, s1, Compare) > 0 Then
        SumOfCommonStrings = iScore + n
        Exit Function
    End If
    
    ' Try progressively shorter substrings of s1, longest first. The
    ' full-length candidate was already ruled out by the test above.
    For len1 = n - 1 To 1 Step -1
    
        For i = 1 To n - len1 + 1
    
            subs1 = Mid$(s1, i, len1)
            j = InStr(1, s2, subs1, Compare)
    
            If j > 0 Then
    
                ' We've found a matching substring...
                iScore = iScore + len1
    
                ' Now clip out this substring from s1 and s2, and search
                ' the fragments before and after this excision.
    
                ' Fragments to the left of the match (both must be non-empty):
                If i > 1 And j > 1 Then
                    iScore = SumOfCommonStrings(Left$(s1, i - 1), _
                                                Left$(s2, j - 1), _
                                                Compare, _
                                                iScore)
                End If
    
                ' Fragments to the right of the match. The first character
                ' after the match is at i + len1 (resp. j + len1), so a
                ' fragment exists whenever that position is still inside
                ' the string - including a one-character remainder.
                If i + len1 <= n And j + len1 <= m Then
                    iScore = SumOfCommonStrings(Mid$(s1, i + len1), _
                                                Mid$(s2, j + len1), _
                                                Compare, _
                                                iScore)
                End If
    
                SumOfCommonStrings = iScore
                Exit Function
    
            End If
    
        Next i
    
    Next len1
    
    End Function
    
    
    Private Function Minimum(ByVal a As Integer, _
                             ByVal b As Integer, _
                             ByVal c As Integer) As Integer
    ' Returns the smallest of the three Integer arguments.
    
    Dim smallest As Integer
    
      ' Pick the lesser of the first two values...
      If b < a Then
            smallest = b
      Else
            smallest = a
      End If
    
      ' ...then let the third value displace it if it is lower still.
      If c < smallest Then smallest = c
    
      Minimum = smallest
    
    End Function
    
    

提交回复
热议问题