R data.table fread command : how to read large files with irregular separators?

迷失自我 2020-12-06 20:49

I have to work with a collection of 120 files of ~2 GB (525600 lines x 302 columns). The goal is to make some statistics and put the results in a clean SQLite database.
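
For reference, here is a minimal sketch of the intended workflow, with hypothetical file and column names, assuming the DBI and RSQLite packages for the database side:

library(data.table)
library(DBI)

files <- list.files("data/", pattern = "\\.txt$", full.names = TRUE)  # the 120 raw files
con   <- dbConnect(RSQLite::SQLite(), "results.sqlite")

for (f in files) {
  DT <- fread(f)                                          # the step this question is about
  stats <- DT[, .(mean_val = mean(V6), sd_val = sd(V6))]  # placeholder statistics on a hypothetical column V6
  stats[, source := basename(f)]                          # keep track of the source file
  dbWriteTable(con, "stats", as.data.frame(stats), append = TRUE)
}
dbDisconnect(con)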

5 Answers
  • 2020-12-06 20:51

    If peak memory is not an issue, or you can stream the file in manageable chunks, the following gsub()/fread() hybrid should work: it converts all consecutive whitespace characters to a single delimiter of your choosing (e.g. "\t") before parsing with fread():

    library(data.table)

    # Read the file into memory, collapse runs of whitespace into a single
    # delimiter, then hand the resulting text to fread().
    fread_blank = function(inputFile, spaceReplace = "\t", n = -1, ...){
      fread(
        input = paste0(
          gsub(pattern = "[[:space:]]+",
               replacement = spaceReplace,
               x = readLines(inputFile, n = n)),
          collapse = "\n"),
        ...)
    }
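
    A quick usage sketch (the file name here is hypothetical):

    # collapse runs of whitespace to tabs, then parse with fread()
    DT <- fread_blank("messy_table.txt", spaceReplace = "\t", header = TRUE)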
    

    I must agree with others that space-delimited files are not an ideal choice, but I come across them pretty often whether I like it or not.

  • 2020-12-06 20:52

    Building on the answers from NeronLeVelu and Clayton Stanley, I completed the answer with a custom function, example data and some system.time() calls for comparison purposes. These tests were made on Mac OS 10.9 and R 3.0.2. However, I've run the same tests on a Linux machine, and there the sed command was really slow compared with read.table() with nrows and colClasses pre-calculated. The fread part was really fast, about 5 seconds for 5e6 rows on both systems.

    library(data.table)
    
    
    # create path to new temporary file
    origData <- tempfile(pattern="origData",fileext=".txt")
    # write table with irregular blank spaces separators.
    write(paste0(" YYYY MM DD HH mm             19490             40790","\n",
                     paste(rep(" 1991 10  1  1  0      1.046465E+00      1.568405E+00", 5e6), 
                           collapse="\n"),"\n"),
          file=origData
    )
    
    # define column classes for read.table() optimization
    colClasses <- c(rep('integer',5),rep('numeric',2))
    
    # Function to count rows with command wc for read.table() optimization.
    fileRowsCount <- function(file){
        if(file.exists(file)){
            sysCmd <- paste("wc -l", file)
            rowCount <- system(sysCmd, intern=TRUE)
            rowCount <- sub('^\\s', '', rowCount)
            as.numeric(strsplit(rowCount, '\\s')[[1]][1])
        }
    }
    
    # Function to sed the data into a temp file before importing with fread.
    sedFread <- function(file, sedCmd=NULL, ...){
        require(data.table)
        if(is.null(sedCmd)){
            # default: sed to convert a blank-separated table to csv. Thanks NeronLeVelu!
            sedCmd <- "'s/^[[:blank:]]*//;s/[[:blank:]]\\{1,\\}/,/g'"
        }
        # sed into temp file
        tmpPath <- tempfile(pattern='tmp', fileext='.txt')
        sysCmd <- paste('sed', sedCmd, file, '>', tmpPath)
        try(system(sysCmd))
        DT <- fread(tmpPath, ...)
        try(system(paste('rm', tmpPath)))
        return(DT)
    }
    

    Mac OS results:

    # First sed into temp file and then fread.
    system.time(
    DT<-sedFread(origData, header=TRUE)
    )
    > user  system elapsed
    > 23.847   0.628  24.514
    
    # Sed directly in fread command :
    system.time(
    DT <- fread(paste("sed 's/^[[:blank:]]*//;s/[[:blank:]]\\{1,\\}/,/g'", origData),
                header=T)
    )
    > user  system elapsed
    > 23.606   0.515  24.219
    
    
    # read.table without nrows and colClasses
    system.time(
    DF<-read.table(origData, header=TRUE)
    )
    > user  system elapsed
    > 38.053   0.512  38.565
    
    # read.table with nrows and colClasses
    system.time(
    DF<-read.table(origData, header=TRUE, nrows=fileRowsCount(origData), colClasses=colClasses)
    )
    > user  system elapsed
    > 33.813   0.309  34.125
    

    Linux results:

    # First sed into temp file and then fread.
    system.time(
      DT<-sedFread(origData, header=TRUE)
    )
    > Read 5000000 rows and 7 (of 7) columns from 0.186 GB file in 00:00:05
    > user  system elapsed 
    > 47.055   0.724  47.789 
    
    # Sed directly in fread command :
    system.time(
    DT <- fread(paste("sed 's/^[[:blank:]]*//;s/[[:blank:]]\\{1,\\}/,/g'", origData),
                header=T)
    )
    > Read 5000000 rows and 7 (of 7) columns from 0.186 GB file in 00:00:05
    > user  system elapsed 
    > 46.088   0.532  46.623 
    
    # read.table without nrows and colClasses
    system.time(
    DF<-read.table(origData, header=TRUE)
    )
    > user  system elapsed 
    > 32.478   0.436  32.912 
    
    # read.table with nrows and colClasses
    system.time(
    DF<-read.table(origData,
                   header=TRUE, 
                   nrows=fileRowsCount(origData),
                   colClasses=colClasses)
     )
    > user  system elapsed 
    > 21.665   0.524  22.192 
    
    # Check that DT and DF are identical:
    setnames(DT, old=names(DT), new=names(DF))
    identical(as.data.frame(DT), DF)                                                              
    >[1] TRUE
    

    In the end, the method I used in the first place was the most efficient in this case.

    Thanks to NeronLeVelu, Matt Dowle and Clayton Stanley!

  • 2020-12-06 20:59

    I've found another, much faster way to do it, with awk instead of sed. Here is another example:

    library(data.table)
    
    # create path to new temporary file
    origData <- tempfile(pattern="origData",fileext=".txt")
    
    # write table with irregular blank spaces separators.
    write(paste0(" YYYY MM DD HH mm             19490             40790","\n",
                paste(rep(" 1991 10  1  1  0      1.046465E+00      1.568405E+00", 5e6),
                collapse="\n"),"\n"),
                file=origData
      )
    
    
    # Function awkFread: first awk, then fread. Argument colNums = selection of columns.
    awkFread <- function(file, colNums, ...){
        require(data.table)
        if(is.vector(colNums)){
            tmpPath <- tempfile(pattern='tmp', fileext='.txt')
            # build the awk field list, e.g. $1",",$2",",... and drop the trailing separator
            colGen <- paste0("$", colNums, "\",\"", collapse=",")
            colGen <- substr(colGen, 1, nchar(colGen)-3)
            cmdAwk <- paste("awk '{print", colGen, "}'", file, '>', tmpPath)
            try(system(cmdAwk))
            DT <- fread(tmpPath, ...)
            try(system(paste('rm', tmpPath)))
            return(DT)
        }
    }
    
    # check read time :
    system.time(
                DT3 <- awkFread(origData,c(1:5),header=T)
                )
    
    > user  system elapsed 
    > 6.230   0.408   6.644
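
    The same idea should also work without the temporary file, by passing the awk command directly to fread() as in the sed example above (a sketch, not timed here):

    # pipe awk straight into fread(), keeping only the first five columns
    DT4 <- fread(paste("awk '{print $1\",\"$2\",\"$3\",\"$4\",\"$5}'", origData),
                 header=TRUE)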
    
  • 2020-12-06 21:06

    Just committed to devel, v1.9.5. fread() gains a strip.white argument with default TRUE (unlike base::read.table(), because it's the more desirable behaviour). The example data has been added to the tests.

    With this recent commit:

    require(data.table) # v1.9.5, commit 0e7a835 or more recent
    ans <- fread(" YYYY MM DD HH mm             19490             40790\n   1991 10  1  1  0      1.046465E+00      1.568405E+00")
    #      V1 V2 V3 V4 V5           V6           V7
    # 1: YYYY MM DD HH mm 19490.000000 40790.000000
    # 2: 1991 10  1  1  0     1.046465     1.568405
    sapply(ans, class)
    #          V1          V2          V3          V4          V5          V6          V7 
    # "character" "character" "character" "character" "character"   "numeric"   "numeric" 
    
  • sed 's/^[[:blank:]]*//;s/[[:blank:]]\{1,\}/,/g' 
    

    for your sed command

    Is it not possible to collect all of the results into one (temporary) file (adding the source reference), and then treat this single file with sed (or another tool), to avoid forking the tools at every iteration?
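
    A rough sketch of one way to read that suggestion (file names are hypothetical): preprocess each raw file into a single combined CSV, tagging rows with their source file, so that fread() is called only once:

    library(data.table)

    files   <- c("file001.txt", "file002.txt")             # hypothetical input files
    tmpPath <- tempfile(pattern="combined", fileext=".csv")

    for (f in files) {
      # drop the header line, convert blanks to commas, append the source file name
      cmd <- sprintf(
        "sed '1d;s/^[[:blank:]]*//;s/[[:blank:]]\\{1,\\}/,/g' %s | sed 's|$|,%s|' >> %s",
        f, basename(f), tmpPath)
      system(cmd)
    }

    DT <- fread(tmpPath, header=FALSE)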
