问题
I would like to adapt the post here (Parse CSV file and aggregate the values) to sum multiple columns instead of just one.
So for these data:
CITY,AMOUNT,AMOUNT2,AMOUNTn
London,20,21,22
Tokyo,45,46,47
London,55,56,57
New York,25,26,27
How can I get this:
CITY,AMOUNT,AMOUNT2,AMOUNTn
London,75,77,79
Tokyo,45,46,47
New York,25,26,27
I will have several thousand columns eventually, and unfortunately I cannot use the pandas package for this task. Here is the code I have; it just aggregates all three AMOUNT columns into one, which is not what I am after:
from __future__ import division
import csv
from collections import defaultdict
def default_factory():
    """Return a fresh per-city accumulator: ``[sum, max, min, count]``.

    A new list is built on every call so that each key of the
    ``defaultdict`` gets its own independent record. ``max`` and ``min``
    start as ``None`` to mark "no value seen yet".
    """
    return [0, None, None, 0]
# Fold every numeric column of every row into ONE running record per city:
# [sum, max, min, count]. Columns are not kept apart, so each city ends up
# with a single set of statistics rather than one total per column.
cities = defaultdict(default_factory)
with open('test_in.txt') as infile:  # closed automatically, even on error
    reader = csv.DictReader(infile)
    for row in reader:
        stats = cities[row["CITY"]]
        for column, value in row.items():
            if column == 'CITY':
                continue  # the grouping key is not an amount
            amount = int(value)
            stats[0] += amount
            # Explicit comparisons avoid locals named max/min, which would
            # shadow the builtins.
            if stats[1] is None or amount > stats[1]:
                stats[1] = amount
            if stats[2] is None or amount < stats[2]:
                stats[2] = amount
            stats[3] += 1

# Slot 3 held the count while reading; replace it with the mean. True
# division on Python 2 relies on the `from __future__ import division`
# line that precedes this snippet.
for stats in cities.values():
    stats[3] = stats[0] / stats[3]

# NOTE: 'wb' is the Python 2 convention for csv output files; on Python 3
# open the file with open('test_out.txt', 'w', newline='') instead.
with open('test_out.txt', 'wb') as myfile:
    writer = csv.writer(myfile, delimiter="\t")
    # NOTE(review): the header lists seven names, but each data row below
    # holds five values (city, sum, max, min, mean) - confirm the intended
    # header before relying on this output.
    writer.writerow(["CITY", "AMOUNT", "AMOUNT2", "AMOUNTn", "max", "min", "mean"])
    writer.writerows([city] + cities[city] for city in cities)
Thank you for any help
回答1:
Here is one way, using itertools.groupby.
import StringIO
import csv
import itertools
data = """CITY,AMOUNT,AMOUNT2,AMOUNTn
London,20,21,22
Tokyo,45,46,47
London,55,56,57
New York,25,26,27"""

# StringIO stands in for a real file object for demo purposes.
f = StringIO.StringIO(data)
reader = csv.reader(f)
# Read the header through the csv module as well, so it is parsed by the
# same rules (quoting, delimiters) as the data rows.
fieldnames = next(reader)


def key(row):
    """Return the grouping key of a row: its first column (the city)."""
    return row[0]


# itertools.groupby only merges *adjacent* items with equal keys, so the rows
# must be sorted by city before grouping.
rows_sorted = sorted(reader, key=key)

outfile = StringIO.StringIO('')
writer = csv.DictWriter(outfile, fieldnames=fieldnames, lineterminator='\n')
writer.writeheader()

for city, rows in itertools.groupby(rows_sorted, key=key):
    # Drop the city column and convert the remaining values to ints.
    rows = [[int(x) for x in row[1:]] for row in rows]
    # zip(*rows) transposes the group's rows into columns; it stops at the
    # shortest row, so ragged rows are truncated.
    agg = [sum(column) for column in zip(*rows)]
    writer.writerow(dict(zip(fieldnames, [city] + agg)))

# Parenthesised form prints the same text on Python 2 and Python 3.
print(outfile.getvalue())
# CITY,AMOUNT,AMOUNT2,AMOUNTn
# London,75,77,79
# New York,25,26,27
# Tokyo,45,46,47
回答2:
Here is how I would do it.
import csv
from StringIO import StringIO
# Sample input. Note that the second London row carries one more value than
# the header has columns.
data = (
    'CITY,AMOUNT,AMOUNT2,AMOUNTn\n'
    'London,20,21,22\n'
    'Tokyo,45,46,47\n'
    'London,55,56,57,99\n'
    'New York,25,26,27'
)

# Accumulator: maps each city to its running list of column totals.
rows = dict()

# Wrap the text in a file-like object and consume the header row up front,
# leaving the reader positioned on the first data row.
file_ = StringIO(data)
reader = csv.reader(file_)
headers = next(reader)
def add(col1, col2):
    """Add ``col2`` onto ``col1`` element-wise, in place, and return ``col1``.

    Positions present in both lists are summed. If ``col2`` is longer, its
    surplus values are appended to ``col1`` unchanged; if it is shorter, the
    tail of ``col1`` is left as it was.
    """
    shared = min(len(col1), len(col2))
    for i in range(shared):
        col1[i] += col2[i]
    # Anything col2 has beyond col1's original length is carried over as-is.
    col1.extend(col2[shared:])
    return col1
# Accumulate per-city column totals: the first column is the grouping key,
# the remaining columns are the integer amounts.
for row in reader:
    key = row[0]
    # A list comprehension yields a real list on both Python 2 and 3; on
    # Python 3, map() returns an iterator, which add() cannot len() or slice.
    nums = [int(value) for value in row[1:]]
    if key in rows:
        rows[key] = add(rows[key], nums)
    else:
        # nums is already a fresh list of ints, so no second conversion.
        rows[key] = nums
来源:https://stackoverflow.com/questions/17430553/parse-csv-file-with-and-aggregate-values-multiple-columns