I am currently trying to import the following large tab-delimited file into a dataframe-like structure within Python---naturally I am using pandas dataframe, th
Another version, which either takes a separate column file as a parameter or uses the first record of the input as the header. Run it in any of these ways:
awk -f pandas2.awk pandas.txt # first record as header
awk -f pandas2.awk cols.txt pandas.txt # first record from cols.txt
awk -v cols="cols.txt" -f pandas2.awk pandas.txt # read cols from cols.txt
Or even:
awk -v cols="pandas.txt" -f pandas2.awk pandas.txt # extract the keys from pandas.txt itself to build the header
Code:
$ cat > pandas2.awk
# BEGIN: fix the column ordering and, when a column file is named with
# -v cols="file", build and print the header before any input is read.
#
# The column file may contain bare column names or "key: value" records; in
# the latter case the values are stripped and only the keys are kept.
# Requires GNU awk (PROCINFO["sorted_in"]).
BEGIN {
    # Make every "for (k in arr)" visit indices in ascending string order, so
    # the header and every data row list the columns in the same order.
    PROCINFO["sorted_in"] = "@ind_str_asc"

    if (cols != "") {                 # -v cols="column_file.txt" or even "pandas.txt"
        while ((rc = (getline line < cols)) > 0) {    # read it in line by line
            # Remove the value of each "key: value" pair. The value ends at
            # any whitespace character, so tab-separated pairs are handled
            # the same way as space-separated ones.
            gsub(/: [^[:space:]]+/, "", line)
            n = split(line, a)        # split the remaining keys on FS
            for (i = 1; i <= n; i++)
                col[a[i]]             # merely referencing the element creates the key
        }
        if (rc < 0) {                 # getline returns -1 when the file cannot be read
            printf "pandas2.awk: cannot read column file \"%s\"\n", cols > "/dev/stderr"
            exit 1
        }
        close(cols)                   # done with the file; it may also be the main input

        for (i in col)                # output the header in column order
            printf "%6s%s", i, OFS
        print ""
    }
}
# Header rule: when no column file was given (-v cols is unset), the first
# input record holds the column names.
#
# The names are first collected into col[] and the header is then printed by
# traversing col[] itself. The data rows are printed by traversing col[] too,
# so the header is guaranteed to line up with the values. Printing straight
# from the split array would follow field position instead and misalign the
# header whenever the names are not already sorted, are duplicated, or number
# ten or more.
NR == 1 && cols == "" {
    # A single-space separator requests awk's default whitespace splitting:
    # runs of blanks and tabs separate names, leading/trailing blanks are ignored.
    n = split($0, a, " ")
    for (i = 1; i <= n; i++)
        col[a[i]]                     # referencing the element creates the key
    for (i in col)                    # output the header in column order
        printf "%6s%s", i, OFS
    print ""
}
# Record 1 never reaches the data rule below: stop processing it here and
# move on to the next record.
# NOTE(review): this rule is unconditional, so when the columns come from
# -v cols="file" the first record of the input is skipped as well -- confirm
# that the data file is expected to start with a header line in that case.
(NR == 1) { next }
# Data rule: turn one record of "key: value" pairs into one output row.
#
# For every known column (the indices of col[]) the record's value is
# printed, or "NaN" when the record has no such key. Columns are visited in
# the traversal order of col[], the same order used for the header.
{
    gsub(/: /, "=")                   # join each "key: value" into a single "key=value" field
    n = split($0, pair, FS)           # split the record into key=value fields

    # Parsed values go into their own array. Storing them back into the array
    # that holds the split fields would let a numeric key overwrite a field
    # not yet visited, and would make a column named like a field index
    # (e.g. "1") look present even when the record does not contain it.
    delete val
    for (i = 1; i <= n; i++) {
        eq = index(pair[i], "=")      # position of the first "=" only
        if (eq)
            # everything after the first "=" is the value, so values that
            # themselves contain "=" are kept intact
            val[substr(pair[i], 1, eq - 1)] = substr(pair[i], eq + 1)
        else
            val[pair[i]] = ""         # bare key without a value: present but empty
    }

    for (k in col)                    # go through the headers in col[] and print from val[]
        printf "%6s%s", (k in val ? val[k] : "NaN"), OFS
    print ""                          # terminate the row
}