Is there a pythonic way to figure out which rows in a CSV file contain headers and values and which rows contain trash and then get the headers/values rows into data frames?
This program might help. It is essentially a wrapper around the csv.reader() object that filters out the trash rows and passes only the good data (the header and value rows) through.
import csv
import io
import sys

import pandas as pd
def ignore_comments(fp, start_fn, end_fn, keep_initial):
    """Yield only the items of *fp* that fall inside wanted sections.

    The generator is a three-state machine driven by two predicates:

    * ``'start'`` -- nothing has been kept yet. Items are discarded until
      ``start_fn(item)`` is true; that item is yielded and the state
      becomes ``'keep'``.
    * ``'keep'`` -- items are yielded until ``end_fn(item)`` is true. The
      terminating item is discarded and the state becomes ``'drop'``.
    * ``'drop'`` -- items are discarded until ``start_fn(item)`` is true.
      That item is also discarded, and the state returns to ``'keep'``.

    Because a start item met in the ``'drop'`` state is swallowed, only the
    first start item (for example the first header row) is ever yielded.

    Args:
        fp: Any iterable; each item is passed to the predicates unchanged.
        start_fn: Callable returning true for an item that opens a section.
        end_fn: Callable returning true for an item that closes a section.
        keep_initial: If true, begin in the ``'keep'`` state so that items
            before the first start item are yielded as well.

    Yields:
        The items of *fp* selected by the rules above, in their original
        order.
    """
    state = 'keep' if keep_initial else 'start'
    for line in fp:
        if state == 'start' and start_fn(line):
            state = 'keep'
            yield line
        elif state == 'keep':
            if end_fn(line):
                state = 'drop'
            else:
                yield line
        elif state == 'drop':
            # The re-opening line is consumed, not yielded.
            if start_fn(line):
                state = 'keep'
if __name__ == "__main__":
    # Read 'x.in' as CSV, keep only the rows between the header row and the
    # next empty row, and load those rows into a DataFrame.
    # newline='' is what the csv module asks for when opening its input.
    with open('x.in', newline='') as source:
        reader = csv.reader(source, skipinitialspace=True)
        good_rows = ignore_comments(
            reader,
            lambda row: row == ['header1', 'header2', 'header3'],
            lambda row: row == [],
            False)
        # pd.read_csv needs a path or an object with read(); a generator of
        # row lists is rejected. Re-serialise the kept rows into an
        # in-memory CSV so pandas can parse them and infer column types.
        # The generator is lazy, so it must be drained while the file is
        # still open.
        buffer = io.StringIO()
        csv.writer(buffer, lineterminator='\n').writerows(good_rows)
    buffer.seek(0)
    df = pd.read_csv(buffer)
    print(df)