Parsing a CSV file using gawk

后端 未结 9 1254
感动是毒
感动是毒 2020-11-29 12:12

How do you parse a CSV file using gawk? Simply setting FS="," is not enough, as a quoted field with a comma inside will be treated as multiple fields.

<
9条回答
  •  无人及你
    2020-11-29 12:56

    csv2delim.awk

    # csv2delim.awk converts comma delimited files with optional quotes to delim separated file
    #     delim can be any character, defaults to tab
    # assumes no repl characters in text, any delim in line converts to repl
    #     repl can be any character, defaults to ~
    # changes two consecutive quotes within quotes to '
    
    # usage: gawk -f csv2delim.awk [-v delim=d] [-v repl=r] input-file > output-file
    #       -v delim    delimiter, defaults to tab
    #       -v repl     replacement char, defaults to ~
    
    # e.g. gawk -v delim=; -v repl=` -f csv2delim.awk test.csv > test.txt
    
    # abe 2-28-7
    # abe 8-8-8 1.0 fixed empty fields, added replacement option
    # abe 8-27-8 1.1 used split
    # abe 8-27-8 1.2 inline rpl and "" = '
    # abe 8-27-8 1.3 revert to 1.0 as it is much faster, split most of the time
    # abe 8-29-8 1.4 better message if delim present
    
    # replace_literal(text, from, to)
    #   Returns text with every occurrence of the string `from` replaced by
    #   the string `to`.  Both are treated literally: unlike gsub(), regex
    #   metacharacters in `from` (| . * + ...) and the special characters
    #   & and \ in `to` have no special meaning.
    #   result and pos are locals (awk convention: extra parameters).
    function replace_literal(text, from, to,    result, pos) {
        if (from == "") return text     # nothing to search for; avoid an endless loop
        result = ""
        while ((pos = index(text, from)) > 0) {
            result = result substr(text, 1, pos - 1) to
            text = substr(text, pos + length(from))
        }
        return result text
    }

    # Apply defaults for the command-line variables and log a start banner.
    #   delim  output field delimiter, defaults to tab
    #   repl   character substituted for any delim already present in the input,
    #          defaults to ~
    BEGIN {
        if (delim == "") delim = "\t"
        if (repl == "") repl = "~"
        print "csv2delim.awk v.m 1.4 run at " strftime() > "/dev/stderr" ###########################################
    }

    # Convert one CSV record to a delim-separated record.
    {
        # The output delimiter must not already occur in the data; if it does,
        # warn (showing the offending record) and substitute repl for it.
        # index() is used instead of a regex match so that delimiters such as
        # | or . are compared literally.
        if (index($0, delim) > 0) {
            print "Temp delimiter character " delim " is on line " FNR ":" $0 ";" > "/dev/stderr"
            print "    replaced by " repl > "/dev/stderr"
            $0 = replace_literal($0, delim, repl)
        }

        # A doubled quote that follows any character other than a comma is an
        # escaped quote inside a field: turn it into a single apostrophe.
        $0 = gensub(/([^,])\"\"/, "\\1'", "g")

        # Walk the record character by character: quotes are dropped and toggle
        # the in-string state; commas outside quotes become delim; everything
        # else is copied through.
        # NOTE: inString is not reset between records, so an unbalanced quote
        # carries its state into the next record (same as the original script).
        out = ""
        n = length($0)
        for (i = 1;  i <= n;  i++) {
            ch = substr($0, i, 1)
            if (ch == "\"")
                inString = (inString) ? 0 : 1 # toggle inString
            else
                out = out ((ch == "," && ! inString) ? delim : ch)
        }
        print out
    }

    # Log a summary of the run to stderr.
    END {
        print NR " records processed from " FILENAME " at " strftime() > "/dev/stderr"
    }
    
    

    test.csv

    "first","second","third"
    "fir,st","second","third"
    "first","sec""ond","third"
    " first ",sec   ond,"third"
    "first" , "second","th  ird"
    "first","sec;ond","third"
    "first","second","th;ird"
    1,2,3
    ,2,3
    1,2,
    ,2,
    1,,2
    1,"2",3
    "1",2,"3"
    "1",,"3"
    1,"",3
    "","",""
    "","""aiyn","oh"""
    """","""",""""
    11,2~2,3
    

    test.bat

    rem test csv2delim
    rem default is: -v delim={tab} -v repl=~
    gawk                      -f csv2delim.awk test.csv > test.txt
    gawk -v delim=;           -f csv2delim.awk test.csv > testd.txt
    gawk -v delim=; -v repl=` -f csv2delim.awk test.csv > testdr.txt
    gawk            -v repl=` -f csv2delim.awk test.csv > testr.txt
    

提交回复
热议问题