By fuzzy matching I don\'t mean similar strings by Levenshtein distance or something similar, but the way it\'s used in TextMate/Ido/Icicles: given a list of strings, find t
Levenshtein 'Edit Distance' algorithms will definitely work on what you're trying to do: they will give you a measurement of how closely two words or addresses or phone numbers, psalms, monologues and scholarly articles match each other, allowing you you rank the results and choose the best match.
A more lightweight appproach is to count up the common substrings: it's not as good as Levenshtein, but it provides usable results and runs quickly in slow languages which have access to fast 'InString' functions.
I published an Excel 'Fuzzy Lookup' in Excellerando a few years ago, using 'FuzzyMatchScore' function that is, as far as I can tell, exactly what you need:
http://excellerando.blogspot.com/2010/03/vlookup-with-fuzzy-matching-to-get.html
It is, of course, in Visual Basic for Applications. Proceed with caution, crucifixes and garlic:
Public Function SumOfCommonStrings( _
ByVal s1 As String, _
ByVal s2 As String, _
Optional Compare As VBA.VbCompareMethod = vbTextCompare, _
Optional iScore As Integer = 0 _
) As Integer
Application.Volatile False
' N.Heffernan 06 June 2006
' THIS CODE IS IN THE PUBLIC DOMAIN
' Function to measure how much of String 1 is made up of substrings found in String 2
' This function uses a modified Longest Common String algorithm.
' Simple LCS algorithms are unduly sensitive to single-letter
' deletions/changes near the midpoint of the test words, eg:
' Wednesday is obviously closer to WedXesday on an edit-distance
' basis than it is to WednesXXX. So it would be better to score
' the 'Wed' as well as the 'esday' and add up the total matched
' Watch out for strings of differing lengths:
'
' SumOfCommonStrings("Wednesday", "WednesXXXday")
'
' This scores the same as:
'
' SumOfCommonStrings("Wednesday", "Wednesday")
'
' So make sure the calling function uses the length of the longest
' string when calculating the degree of similarity from this score.
' This is coded for clarity, not for performance.
Dim arr() As Integer ' Scoring matrix
Dim n As Integer ' length of s1
Dim m As Integer ' length of s2
Dim i As Integer ' start position in s1
Dim j As Integer ' start position in s2
Dim subs1 As String ' a substring of s1
Dim len1 As Integer ' length of subs1
Dim sBefore1 ' documented in the code
Dim sBefore2
Dim sAfter1
Dim sAfter2
Dim s3 As String
SumOfCommonStrings = iScore
n = Len(s1)
m = Len(s2)
If s1 = s2 Then
SumOfCommonStrings = n
Exit Function
End If
If n = 0 Or m = 0 Then
Exit Function
End If
's1 should always be the shorter of the two strings:
If n > m Then
s3 = s2
s2 = s1
s1 = s3
n = Len(s1)
m = Len(s2)
End If
n = Len(s1)
m = Len(s2)
' Special case: s1 is n exact substring of s2
If InStr(1, s2, s1, Compare) Then
SumOfCommonStrings = n
Exit Function
End If
For len1 = n To 1 Step -1
For i = 1 To n - len1 + 1
subs1 = Mid(s1, i, len1)
j = 0
j = InStr(1, s2, subs1, Compare)
If j > 0 Then
' We've found a matching substring...
iScore = iScore + len1
' Now clip out this substring from s1 and s2...
' And search the fragments before and after this excision:
If i > 1 And j > 1 Then
sBefore1 = left(s1, i - 1)
sBefore2 = left(s2, j - 1)
iScore = SumOfCommonStrings(sBefore1, _
sBefore2, _
Compare, _
iScore)
End If
If i + len1 < n And j + len1 < m Then
sAfter1 = right(s1, n + 1 - i - len1)
sAfter2 = right(s2, m + 1 - j - len1)
iScore = SumOfCommonStrings(sAfter1, _
sAfter2, _
Compare, _
iScore)
End If
SumOfCommonStrings = iScore
Exit Function
End If
Next
Next
End Function
Private Function Minimum(ByVal a As Integer, _
ByVal b As Integer, _
ByVal c As Integer) As Integer
Dim min As Integer
min = a
If b < min Then
min = b
End If
If c < min Then
min = c
End If
Minimum = min
End Function