The CSV file that I want to read does not fit into main memory. How can I read a few (~10K) random lines of it and do some simple statistics on the selected data frame?
class magic_checker:
def __init__(self,target_count):
self.target = target_count
self.count = 0
def __eq__(self,x):
self.count += 1
return self.count >= self.target
min_target=100000
max_target = min_target*2
nlines = randint(100,1000)
seek_target = randint(min_target,max_target)
with open("big.csv") as f:
f.seek(seek_target)
f.readline() #discard this line
rand_lines = list(iter(lambda:f.readline(),magic_checker(nlines)))
#do something to process the lines you got returned .. perhaps just a split
print rand_lines
print rand_lines[0].split(",")
something like that should work I think