Handle large delimited text files in VBA

后端 未结 4 2086
醉酒成梦
醉酒成梦 2021-01-16 15:33

Using VBA, I need to \"unpivot\" data that is currently in delimited text files (as hundreds of columns by tens of thousands of rows) into a normalized form (four columns by

4条回答
  •  梦谈多话
    2021-01-16 16:39

    This should be fast enough (it takes 8 secs on a 18MB file on my machine, but I only copy data, I don't restructure it - if you don't do calculations but only reorder stuff you should get the same kind of performance). It works even if the number of rows / columns would not fit in a spreadsheet.

    TODO: it is a bit long but you should be able to (a) copy paste it (b) change the file names and (c) amend the manipulateData function to suit your needs. The rest of the code is a bunch of reusable utilities functions that you should not need to change.

    I am not sure you can get much faster using VBA - if you need faster, you should consider an alternative language. Typically, the same code in Java or C# would much shorter because they already have standard libraries to read / write files etc. and would be faster too.

    Option Explicit
    
    Public Sub doIt()
        Dim sourceFile As String
        Dim destinationFile As String
        Dim data As Variant
        Dim result As Variant
    
        sourceFile = "xxxxxxx"
        destinationFile = "xxxxxxx"
    
        data = getDataFromFile(sourceFile, ",")
        If Not isArrayEmpty(data) Then
           result = manipulateData(data)
           writeToCsv result, destinationFile, ","
        Else
           MsgBox ("Empty file")
        End If
    End Sub
    
    Function manipulateData(sourceData As Variant) As Variant
        Dim result As Variant
        Dim i As Long
        Dim j As Long
        Dim k As Long
        Dim m As Long
    
        'redim the result array to the right size - here I only copy so same size as source
        ReDim result(1 To UBound(sourceData, 1), 1 To UBound(sourceData, 2)) As Variant
    
        For i = LBound(sourceData, 1) To UBound(sourceData, 1)
            For j = LBound(sourceData, 2) To UBound(sourceData, 2)
                k = i 'k to be defined - here I only copy data
                m = j 'm to be defined - here I only copy data
                result(k, m) = sourceData(i, j)
            Next j
        Next i
    
        manipulateData = result
    End Function
    
    Public Sub writeToCsv(parData As Variant, parFileName As String, parDelimiter As String)
    
        If getArrayNumberOfDimensions(parData) <> 2 Then Exit Sub
    
        Dim i As Long
        Dim j As Long
        Dim fileNum As Long
        Dim locLine As String
        Dim locCsvString As String
    
        fileNum = FreeFile
        If Dir(parFileName) <> "" Then Kill (parFileName)
        Open parFileName For Binary Lock Read Write As #fileNum
    
        For i = LBound(parData, 1) To UBound(parData, 1)
          locLine = ""
          For j = LBound(parData, 2) To UBound(parData, 2)
            If IsError(parData(i, j)) Then
              locLine = locLine & "#N/A" & parDelimiter
            Else
              locLine = locLine & parData(i, j) & parDelimiter
            End If
          Next j
          locLine = Left(locLine, Len(locLine) - 1)
          If i <> UBound(parData, 1) Then locLine = locLine & vbCrLf
          Put #fileNum, , locLine
        Next i
    
    error_handler:
        Close #fileNum
    
    End Sub
    
    Public Function isArrayEmpty(parArray As Variant) As Boolean
    'Returns false if not an array or dynamic array that has not been initialised (ReDim) or has been erased (Erase)
    
      If IsArray(parArray) = False Then isArrayEmpty = True
      On Error Resume Next
      If UBound(parArray) < LBound(parArray) Then isArrayEmpty = True: Exit Function Else: isArrayEmpty = False
    
    End Function
    
    Public Function getArrayNumberOfDimensions(parArray As Variant) As Long
    'Returns the number of dimension of an array - 0 for an empty array.
    
        Dim i As Long
        Dim errorCheck As Long
    
        If isArrayEmpty(parArray) Then Exit Function 'returns 0
    
        On Error GoTo FinalDimension
        'Visual Basic for Applications arrays can have up to 60000 dimensions
        For i = 1 To 60001
            errorCheck = LBound(parArray, i)
        Next i
    
        'Not supposed to happen
        getArrayNumberOfDimensions = 0
        Exit Function
    
    FinalDimension:
        getArrayNumberOfDimensions = i - 1
    
    End Function
    
    Private Function getDataFromFile(parFileName As String, parDelimiter As String, Optional parExcludeCharacter As String = "") As Variant
    'parFileName is supposed to be a delimited file (csv...)
    'parDelimiter is the delimiter, "," for example in a comma delimited file
    'Returns an empty array if file is empty or can't be opened
    'number of columns based on the line with the largest number of columns, not on the first line
    'parExcludeCharacter: sometimes csv files have quotes around strings: "XXX" - if parExcludeCharacter = """" then removes the quotes
    
    
      Dim locLinesList() As Variant
      Dim locData As Variant
      Dim i As Long
      Dim j As Long
      Dim locNumRows As Long
      Dim locNumCols As Long
      Dim fso As Variant
      Dim ts As Variant
      Const REDIM_STEP = 10000
    
      Set fso = CreateObject("Scripting.FileSystemObject")
    
      On Error GoTo error_open_file
      Set ts = fso.OpenTextFile(parFileName)
      On Error GoTo unhandled_error
    
      'Counts the number of lines and the largest number of columns
      ReDim locLinesList(1 To 1) As Variant
      i = 0
      Do While Not ts.AtEndOfStream
        If i Mod REDIM_STEP = 0 Then
          ReDim Preserve locLinesList(1 To UBound(locLinesList, 1) + REDIM_STEP) As Variant
        End If
        locLinesList(i + 1) = Split(ts.ReadLine, parDelimiter)
        j = UBound(locLinesList(i + 1), 1) 'number of columns
        If locNumCols < j Then locNumCols = j
        If j = 13 Then
          j = j
        End If
        i = i + 1
      Loop
    
      ts.Close
    
      locNumRows = i
    
      If locNumRows = 0 Then Exit Function 'Empty file
    
      ReDim locData(1 To locNumRows, 1 To locNumCols + 1) As Variant
    
      'Copies the file into an array
      If parExcludeCharacter <> "" Then
    
        For i = 1 To locNumRows
          For j = 0 To UBound(locLinesList(i), 1)
            If Left(locLinesList(i)(j), 1) = parExcludeCharacter Then
              If Right(locLinesList(i)(j), 1) = parExcludeCharacter Then
                locLinesList(i)(j) = Mid(locLinesList(i)(j), 2, Len(locLinesList(i)(j)) - 2)       'If locTempArray = "", Mid returns ""
              Else
                locLinesList(i)(j) = Right(locLinesList(i)(j), Len(locLinesList(i)(j)) - 1)
              End If
            ElseIf Right(locLinesList(i)(j), 1) = parExcludeCharacter Then
              locLinesList(i)(j) = Left(locLinesList(i)(j), Len(locLinesList(i)(j)) - 1)
            End If
            locData(i, j + 1) = locLinesList(i)(j)
          Next j
        Next i
    
      Else
    
        For i = 1 To locNumRows
          For j = 0 To UBound(locLinesList(i), 1)
            locData(i, j + 1) = locLinesList(i)(j)
          Next j
        Next i
    
      End If
    
      getDataFromFile = locData
    
      Exit Function
    
    error_open_file:                 'returns empty variant
    unhandled_error:                 'returns empty variant
    
    End Function
    

提交回复
热议问题