How to parse freeform street/postal address out of text, and into components

后端 未结 9 1202
感动是毒
感动是毒 2020-11-22 13:40

We do business largely in the United States and are trying to improve user experience by combining all the address fields into a single text area. But there are a few proble

9条回答
  •  温柔的废话
    2020-11-22 14:14

    I'm late to the party, here is an Excel VBA script I wrote years ago for Australia. It can be easily modified to support other Countries. I've made a GitHub repository of the C# code here. I've hosted it on my site and you can download it here: http://jeremythompson.net/rocks/ParseAddress.xlsm

    Strategy

    For any country with a PostCode that's numeric or can be matched with a RegEx my strategy works very well:

    1. First we detect the First and Surname which are assumed to be the top line. Its easy to skip the name and start with the address by unticking the checkbox (called 'Name is top row' as shown below).

    2. Next its safe to expect the Address consisting of the Street and Number come before the Suburb and the St, Pde, Ave, Av, Rd, Cres, loop, etc is a separator.

    3. Detecting the Suburb vs the State and even Country can trick the most sophisticated parsers as there can be conflicts. To overcome this I use a PostCode look up based on the fact that after stripping Street and Apartment/Unit numbers as well as the PoBox,Ph,Fax,Mobile etc, only the PostCode number will remain. This is easy to match with a regEx to then look up the suburb(s) and country.

    Your National Post Office Service will provide a list of post codes with Suburbs and States free of charge that you can store in an excel sheet, db table, text/json/xml file, etc.

    1. Finally, since some Post Codes have multiple Suburbs we check which suburb appears in the Address.

    Example

    VBA Code

    DISCLAIMER, I know this code is not perfect, or even written well however its very easy to convert to any programming language and run in any type of application.The strategy is the answer depending on your country and rules, take this code as an example:

    Option Explicit
    
    Private Const TopRow As Integer = 0
    
    Public Sub ParseAddress()
    Dim strArr() As String
    Dim sigRow() As String
    Dim i As Integer
    Dim j As Integer
    Dim k As Integer
    Dim Stat As String
    Dim SpaceInName As Integer
    Dim Temp As String
    Dim PhExt As String
    
    On Error Resume Next
    
    Temp = ActiveSheet.Range("Address")
    
    'Split info into array
    strArr = Split(Temp, vbLf)
    
    'Trim the array
    For i = 0 To UBound(strArr)
    strArr(i) = VBA.Trim(strArr(i))
    Next i
    
    'Remove empty items/rows    
    ReDim sigRow(LBound(strArr) To UBound(strArr))
    For i = LBound(strArr) To UBound(strArr)
        If Trim(strArr(i)) <> "" Then
            sigRow(j) = strArr(i)
            j = j + 1
        End If
    Next i
    ReDim Preserve sigRow(LBound(strArr) To j)
    
    'Find the name (MUST BE ON THE FIRST ROW UNLESS CHECKBOX UNTICKED)
    i = TopRow
    If ActiveSheet.Shapes("chkFirst").ControlFormat.Value = 1 Then
    
    SpaceInName = InStr(1, sigRow(i), " ", vbTextCompare) - 1
    
    If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
    ActiveSheet.Range("FirstName") = VBA.Left(sigRow(i), SpaceInName)
    Else
     If MsgBox("First Name: " & VBA.Mid$(sigRow(i), 1, SpaceInName), vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("FirstName") = VBA.Left(sigRow(i), SpaceInName)
    End If
    
    If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
    ActiveSheet.Range("Surname") = VBA.Mid(sigRow(i), SpaceInName + 2)
    Else
      If MsgBox("Surame: " & VBA.Mid(sigRow(i), SpaceInName + 2), vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("Surname") = VBA.Mid(sigRow(i), SpaceInName + 2)
    End If
    sigRow(i) = ""
    End If
    
    'Find the Street by looking for a "St, Pde, Ave, Av, Rd, Cres, loop, etc"
    For i = 1 To UBound(sigRow)
    If Len(sigRow(i)) > 0 Then
        For j = 0 To 8
        If InStr(1, VBA.UCase(sigRow(i)), Street(j), vbTextCompare) > 0 Then
    
        'Find the position of the street in order to get the suburb
        SpaceInName = InStr(1, VBA.UCase(sigRow(i)), Street(j), vbTextCompare) + Len(Street(j)) - 1
    
        'If its a po box then add 5 chars
        If VBA.Right(Street(j), 3) = "BOX" Then SpaceInName = SpaceInName + 5
    
        If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
        ActiveSheet.Range("Street") = VBA.Mid(sigRow(i), 1, SpaceInName)
        Else
          If MsgBox("Street Address: " & VBA.Mid(sigRow(i), 1, SpaceInName), vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("Street") = VBA.Mid(sigRow(i), 1, SpaceInName)
        End If
        'Trim the Street, Number leaving the Suburb if its exists on the same line
        sigRow(i) = VBA.Mid(sigRow(i), SpaceInName) + 2
        sigRow(i) = Replace(sigRow(i), VBA.Mid(sigRow(i), 1, SpaceInName), "")
    
        GoTo PastAddress:
        End If
        Next j
    End If
    Next i
    PastAddress:
    
    'Mobile
    For i = 1 To UBound(sigRow)
    If Len(sigRow(i)) > 0 Then
        For j = 0 To 3
        Temp = Mb(j)
            If VBA.Left(VBA.UCase(sigRow(i)), Len(Temp)) = Temp Then
            If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
            ActiveSheet.Range("Mobile") = VBA.Mid(sigRow(i), Len(Temp) + 2)
            Else
              If MsgBox("Mobile: " & VBA.Mid(sigRow(i), Len(Temp) + 2), vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("Mobile") = VBA.Mid(sigRow(i), Len(Temp) + 2)
            End If
        sigRow(i) = ""
        GoTo PastMobile:
        End If
        Next j
    End If
    Next i
    PastMobile:
    
    'Phone
    For i = 1 To UBound(sigRow)
    If Len(sigRow(i)) > 0 Then
        For j = 0 To 1
        Temp = Ph(j)
            If VBA.Left(VBA.UCase(sigRow(i)), Len(Temp)) = Temp Then
    
                'TODO: Detect the intl or national extension here.. or if we can from the postcode.
                If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
                ActiveSheet.Range("Phone") = VBA.Mid(sigRow(i), Len(Temp) + 3)
                Else
                  If MsgBox("Phone: " & VBA.Mid(sigRow(i), Len(Temp) + 3), vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("Phone") = VBA.Mid(sigRow(i), Len(Temp) + 3)
                End If
    
            sigRow(i) = ""
            GoTo PastPhone:
            End If
        Next j
    End If
    Next i
    PastPhone:
    
    
    'Email
    For i = 1 To UBound(sigRow)
        If Len(sigRow(i)) > 0 Then
            'replace with regEx search
            If InStr(1, sigRow(i), "@", vbTextCompare) And InStr(1, VBA.UCase(sigRow(i)), ".CO", vbTextCompare) Then
            Dim email As String
            email = sigRow(i)
            email = Replace(VBA.UCase(email), "EMAIL:", "")
            email = Replace(VBA.UCase(email), "E-MAIL:", "")
            email = Replace(VBA.UCase(email), "E:", "")
            email = Replace(VBA.UCase(Trim(email)), "E ", "")
            email = VBA.LCase(email)
    
                If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
                ActiveSheet.Range("Email") = email
                Else
                  If MsgBox("Email: " & email, vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("Email") = email
                End If
            sigRow(i) = ""
            Exit For
            End If
        End If
    Next i
    
    'Now the only remaining items will be the postcode, suburb, country
    'there shouldn't be any numbers (eg. from PoBox,Ph,Fax,Mobile) except for the Post Code
    
    'Join the string and filter out the Post Code
    Temp = Join(sigRow, vbCrLf)
    Temp = Trim(Temp)
    
    For i = 1 To Len(Temp)
    
    Dim postCode As String
    postCode = VBA.Mid(Temp, i, 4)
    
    'In Australia PostCodes are 4 digits
    If VBA.Mid(Temp, i, 1) <> " " And IsNumeric(postCode) Then
    
        If ActiveSheet.Shapes("chkConfirm").ControlFormat.Value = 0 Then
        ActiveSheet.Range("PostCode") = postCode
        Else
          If MsgBox("Post Code: " & postCode, vbQuestion + vbYesNo, "Confirm Details") = vbYes Then ActiveSheet.Range("PostCode") = postCode
        End If
    
        'Lookup the Suburb and State based on the PostCode, the PostCode sheet has the lookup
        Dim mySuburbArray As Range
        Set mySuburbArray = Sheets("PostCodes").Range("A2:B16670")
    
        Dim suburbs As String
        For j = 1 To mySuburbArray.Columns(1).Cells.Count
        If mySuburbArray.Cells(j, 1) = postCode Then
            'Check if the suburb is listed in the address
            If InStr(1, UCase(Temp), mySuburbArray.Cells(j, 2), vbTextCompare) > 0 Then
    
            'Set the Suburb and State
            ActiveSheet.Range("Suburb") = mySuburbArray.Cells(j, 2)
            Stat = mySuburbArray.Cells(j, 3)
            ActiveSheet.Range("State") = Stat
    
            'Knowing the State - for Australia we can get the telephone Ext
            PhExt = PhExtension(VBA.UCase(Stat))
            ActiveSheet.Range("PhExt") = PhExt
    
            'remove the phone extension from the number
            Dim prePhone As String
            prePhone = ActiveSheet.Range("Phone")
            prePhone = Replace(prePhone, PhExt & " ", "")
            prePhone = Replace(prePhone, "(" & PhExt & ") ", "")
            prePhone = Replace(prePhone, "(" & PhExt & ")", "")
            ActiveSheet.Range("Phone") = prePhone
            Exit For
            End If
        End If
        Next j
    Exit For
    End If
    Next i
    
    End Sub
    
    
    Private Function PhExtension(ByVal State As String) As String
    Select Case State
    Case Is = "NSW"
    PhExtension = "02"
    Case Is = "QLD"
    PhExtension = "07"
    Case Is = "VIC"
    PhExtension = "03"
    Case Is = "NT"
    PhExtension = "04"
    Case Is = "WA"
    PhExtension = "05"
    Case Is = "SA"
    PhExtension = "07"
    Case Is = "TAS"
    PhExtension = "06"
    End Select
    End Function
    
    Private Function Ph(ByVal Num As Integer) As String
    Select Case Num
    Case Is = 0
    Ph = "PH"
    Case Is = 1
    Ph = "PHONE"
    'Case Is = 2
    'Ph = "P"
    End Select
    End Function
    
    Private Function Mb(ByVal Num As Integer) As String
    Select Case Num
    Case Is = 0
    Mb = "MB"
    Case Is = 1
    Mb = "MOB"
    Case Is = 2
    Mb = "CELL"
    Case Is = 3
    Mb = "MOBILE"
    'Case Is = 4
    'Mb = "M"
    End Select
    End Function
    
    Private Function Fax(ByVal Num As Integer) As String
    Select Case Num
    Case Is = 0
    Fax = "FAX"
    Case Is = 1
    Fax = "FACSIMILE"
    'Case Is = 2
    'Fax = "F"
    End Select
    End Function
    
    Private Function State(ByVal Num As Integer) As String
    Select Case Num
    Case Is = 0
    State = "NSW"
    Case Is = 1
    State = "QLD"
    Case Is = 2
    State = "VIC"
    Case Is = 3
    State = "NT"
    Case Is = 4
    State = "WA"
    Case Is = 5
    State = "SA"
    Case Is = 6
    State = "TAS"
    End Select
    End Function
    
    Private Function Street(ByVal Num As Integer) As String
    Select Case Num
    Case Is = 0
    Street = " ST"
    Case Is = 1
    Street = " RD"
    Case Is = 2
    Street = " AVE"
    Case Is = 3
    Street = " AV"
    Case Is = 4
    Street = " CRES"
    Case Is = 5
    Street = " LOOP"
    Case Is = 6
    Street = "PO BOX"
    Case Is = 7
    Street = " STREET"
    Case Is = 8
    Street = " ROAD"
    Case Is = 9
    Street = " AVENUE"
    Case Is = 10
    Street = " CRESENT"
    Case Is = 11
    Street = " PARADE"
    Case Is = 12
    Street = " PDE"
    Case Is = 13
    Street = " LANE"
    Case Is = 14
    Street = " COURT"
    Case Is = 15
    Street = " BLVD"
    Case Is = 16
    Street = "P.O. BOX"
    Case Is = 17
    Street = "P.O BOX"
    Case Is = 18
    Street = "PO BOX"
    Case Is = 19
    Street = "POBOX"
    End Select
    End Function
    

提交回复
热议问题