Using VBA, I need to \"unpivot\" data that is currently in delimited text files (as hundreds of columns by tens of thousands of rows) into a normalized form (four columns by
This should be fast enough (it takes 8 secs on a 18MB file on my machine, but I only copy data, I don't restructure it - if you don't do calculations but only reorder stuff you should get the same kind of performance). It works even if the number of rows / columns would not fit in a spreadsheet.
TODO: it is a bit long but you should be able to (a) copy paste it (b) change the file names and (c) amend the manipulateData function to suit your needs. The rest of the code is a bunch of reusable utilities functions that you should not need to change.
I am not sure you can get much faster using VBA - if you need faster, you should consider an alternative language. Typically, the same code in Java or C# would much shorter because they already have standard libraries to read / write files etc. and would be faster too.
Option Explicit
Public Sub doIt()
Dim sourceFile As String
Dim destinationFile As String
Dim data As Variant
Dim result As Variant
sourceFile = "xxxxxxx"
destinationFile = "xxxxxxx"
data = getDataFromFile(sourceFile, ",")
If Not isArrayEmpty(data) Then
result = manipulateData(data)
writeToCsv result, destinationFile, ","
Else
MsgBox ("Empty file")
End If
End Sub
Function manipulateData(sourceData As Variant) As Variant
Dim result As Variant
Dim i As Long
Dim j As Long
Dim k As Long
Dim m As Long
'redim the result array to the right size - here I only copy so same size as source
ReDim result(1 To UBound(sourceData, 1), 1 To UBound(sourceData, 2)) As Variant
For i = LBound(sourceData, 1) To UBound(sourceData, 1)
For j = LBound(sourceData, 2) To UBound(sourceData, 2)
k = i 'k to be defined - here I only copy data
m = j 'm to be defined - here I only copy data
result(k, m) = sourceData(i, j)
Next j
Next i
manipulateData = result
End Function
Public Sub writeToCsv(parData As Variant, parFileName As String, parDelimiter As String)
If getArrayNumberOfDimensions(parData) <> 2 Then Exit Sub
Dim i As Long
Dim j As Long
Dim fileNum As Long
Dim locLine As String
Dim locCsvString As String
fileNum = FreeFile
If Dir(parFileName) <> "" Then Kill (parFileName)
Open parFileName For Binary Lock Read Write As #fileNum
For i = LBound(parData, 1) To UBound(parData, 1)
locLine = ""
For j = LBound(parData, 2) To UBound(parData, 2)
If IsError(parData(i, j)) Then
locLine = locLine & "#N/A" & parDelimiter
Else
locLine = locLine & parData(i, j) & parDelimiter
End If
Next j
locLine = Left(locLine, Len(locLine) - 1)
If i <> UBound(parData, 1) Then locLine = locLine & vbCrLf
Put #fileNum, , locLine
Next i
error_handler:
Close #fileNum
End Sub
Public Function isArrayEmpty(parArray As Variant) As Boolean
'Returns false if not an array or dynamic array that has not been initialised (ReDim) or has been erased (Erase)
If IsArray(parArray) = False Then isArrayEmpty = True
On Error Resume Next
If UBound(parArray) < LBound(parArray) Then isArrayEmpty = True: Exit Function Else: isArrayEmpty = False
End Function
Public Function getArrayNumberOfDimensions(parArray As Variant) As Long
'Returns the number of dimension of an array - 0 for an empty array.
Dim i As Long
Dim errorCheck As Long
If isArrayEmpty(parArray) Then Exit Function 'returns 0
On Error GoTo FinalDimension
'Visual Basic for Applications arrays can have up to 60000 dimensions
For i = 1 To 60001
errorCheck = LBound(parArray, i)
Next i
'Not supposed to happen
getArrayNumberOfDimensions = 0
Exit Function
FinalDimension:
getArrayNumberOfDimensions = i - 1
End Function
Private Function getDataFromFile(parFileName As String, parDelimiter As String, Optional parExcludeCharacter As String = "") As Variant
'parFileName is supposed to be a delimited file (csv...)
'parDelimiter is the delimiter, "," for example in a comma delimited file
'Returns an empty array if file is empty or can't be opened
'number of columns based on the line with the largest number of columns, not on the first line
'parExcludeCharacter: sometimes csv files have quotes around strings: "XXX" - if parExcludeCharacter = """" then removes the quotes
Dim locLinesList() As Variant
Dim locData As Variant
Dim i As Long
Dim j As Long
Dim locNumRows As Long
Dim locNumCols As Long
Dim fso As Variant
Dim ts As Variant
Const REDIM_STEP = 10000
Set fso = CreateObject("Scripting.FileSystemObject")
On Error GoTo error_open_file
Set ts = fso.OpenTextFile(parFileName)
On Error GoTo unhandled_error
'Counts the number of lines and the largest number of columns
ReDim locLinesList(1 To 1) As Variant
i = 0
Do While Not ts.AtEndOfStream
If i Mod REDIM_STEP = 0 Then
ReDim Preserve locLinesList(1 To UBound(locLinesList, 1) + REDIM_STEP) As Variant
End If
locLinesList(i + 1) = Split(ts.ReadLine, parDelimiter)
j = UBound(locLinesList(i + 1), 1) 'number of columns
If locNumCols < j Then locNumCols = j
If j = 13 Then
j = j
End If
i = i + 1
Loop
ts.Close
locNumRows = i
If locNumRows = 0 Then Exit Function 'Empty file
ReDim locData(1 To locNumRows, 1 To locNumCols + 1) As Variant
'Copies the file into an array
If parExcludeCharacter <> "" Then
For i = 1 To locNumRows
For j = 0 To UBound(locLinesList(i), 1)
If Left(locLinesList(i)(j), 1) = parExcludeCharacter Then
If Right(locLinesList(i)(j), 1) = parExcludeCharacter Then
locLinesList(i)(j) = Mid(locLinesList(i)(j), 2, Len(locLinesList(i)(j)) - 2) 'If locTempArray = "", Mid returns ""
Else
locLinesList(i)(j) = Right(locLinesList(i)(j), Len(locLinesList(i)(j)) - 1)
End If
ElseIf Right(locLinesList(i)(j), 1) = parExcludeCharacter Then
locLinesList(i)(j) = Left(locLinesList(i)(j), Len(locLinesList(i)(j)) - 1)
End If
locData(i, j + 1) = locLinesList(i)(j)
Next j
Next i
Else
For i = 1 To locNumRows
For j = 0 To UBound(locLinesList(i), 1)
locData(i, j + 1) = locLinesList(i)(j)
Next j
Next i
End If
getDataFromFile = locData
Exit Function
error_open_file: 'returns empty variant
unhandled_error: 'returns empty variant
End Function