I have to work with a collection of 120 files of ~2 GB each (525,600 lines x 302 columns). The goal is to compute some statistics and put the results in a clean SQLite database.
I've found another way to do it that is much faster, using awk instead of sed. Here is another example:
library(data.table)
# create a path to a new temporary file
origData <- tempfile(pattern="origData", fileext=".txt")
# write a table with irregular blank-space separators
write(paste0(" YYYY MM DD HH mm 19490 40790", "\n",
             paste(rep(" 1991 10 1 1 0 1.046465E+00 1.568405E+00", 5e6),
                   collapse="\n"), "\n"),
      file=origData
)
# function awkFread : first awk, then fread. Argument colNums = selection of columns.
awkFread <- function(file, colNums, ...){
  require(data.table)
  if(is.vector(colNums)){
    tmpPath <- tempfile(pattern='tmp', fileext='.txt')
    # build the awk print list and drop the trailing "," :
    # for colNums = 1:5 the result is  $1",",$2",",$3",",$4",",$5
    colGen <- paste0("$", colNums, "\",\"", collapse=",")
    colGen <- substr(colGen, 1, nchar(colGen)-3)
    # full command, e.g. : awk '{print $1",",$2",",$3",",$4",",$5 }' origData > tmpPath
    cmdAwk <- paste("awk '{print", colGen, "}'", file, '>', tmpPath)
    try(system(cmdAwk))
    # read the comma-separated temporary file, then remove it
    DT <- fread(tmpPath, ...)
    try(system(paste('rm', tmpPath)))
    return(DT)
  }
}
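As a side note, recent versions of data.table (1.11.6 and later) let fread() run the shell command itself through its cmd argument, so the temporary file and its clean-up can be skipped. Here is a minimal sketch of the same idea (awkFread2 is just an illustrative name, not benchmarked on the real files):
# sketch : let fread() run the awk command directly (needs data.table >= 1.11.6)
awkFread2 <- function(file, colNums, ...){
  require(data.table)
  # for colNums = 1:5 this gives  $1","$2","$3","$4","$5  (awk concatenates fields with a comma)
  colGen <- paste0("$", colNums, collapse = "\",\"")
  fread(cmd = paste0("awk '{print ", colGen, "}' ", file), ...)
}
# usage, same as above :
# DT4 <- awkFread2(origData, 1:5, header=TRUE)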
# check read time :
system.time(
  DT3 <- awkFread(origData, c(1:5), header=T)
)
>    user  system elapsed
>   6.230   0.408   6.644
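Once the statistics are computed, appending them to the SQLite database mentioned at the top could look like this. It is only a minimal sketch assuming the DBI and RSQLite packages; the database file name "results.sqlite", the table name "stats" and the summary columns are made up for illustration:
library(DBI)
library(RSQLite)
# hypothetical summary : one row of statistics per input file
stats <- DT3[, .(file = basename(origData), n = .N, mean_HH = mean(HH), max_HH = max(HH))]
con <- dbConnect(SQLite(), "results.sqlite")
if (!dbExistsTable(con, "stats")) {
  dbWriteTable(con, "stats", stats)        # first file : create the table
} else {
  dbAppendTable(con, "stats", stats)       # next files : append rows
}
dbDisconnect(con)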