问题
I am trying to avoid a segmentation fault with either pandas or IOPro (still investigating), so I am looking for alternative solutions, esp. more efficient ones. The code below runs fine with small data but crashed reading in 90 monthly panels of a few GBs on a Linux server with 256 GB RAM, versions pandas 0.16.2 np19py26_0, iopro 1.7.1 np19py27_p0, and python 2.7.10 0.
What I do here is aggregate the drug purchase records (cost in TKOST) for each person (LopNr) and month, while also separating the drugs into categories based on their ATC codes.
The original data look like this, in monthly csv files (say July 2006 here; the csv files have many other columns I don't need):
LopNr TKOST ATC
1 5 N01
1 11 N01
1 6 N15
etc.
I wanted aggregate panels, with rows like
LopNr TKOST year month
1 22 2006 7
either separately for a few categories (e.g. neuro for ATCs starting with N here), or with separate summaries for these categories in a single datafile (so with a neuro column etc.).
I opted for IOPro rather than plain pandas to be more memory-efficient, but now I am getting a segmentation fault.
# -*- coding: utf-8 -*-
import iopro
from pandas import *
# One accumulator DataFrame per drug category. Every month's matching rows are
# appended to these, so all filtered rows of all months stay in memory until
# the aggregation step after the loop.
neuro = DataFrame()
cardio = DataFrame()
cancer = DataFrame()
addiction = DataFrame()
Adrugs = DataFrame()
Mdrugs = DataFrame()
Vdrugs = DataFrame()
all_drugs = DataFrame()

for year in xrange(2005,2013):
    for month in xrange(1,13):
        # Skip January-June 2005; the loop then covers July 2005 through
        # December 2012.
        if year == 2005 and month < 7:
            continue
        filename = 'PATH/lmed_' + str(year) + '_mon'+ str(month) +'.txt'
        adapter = iopro.text_adapter(filename,parser='csv',field_names=True,output='dataframe',delimiter='\t')
        # Read only the three needed columns of the tab-delimited file.
        monthly = adapter[['LopNr','ATC','TKOST']][:]
        monthly['year']=year
        monthly['month']=month
        # Keep rows with a non-null cost whose ATC code starts with the
        # category prefix. 'N07' rows also match 'N', so addiction rows are
        # included in neuro as well.
        neuro = neuro.append(monthly[(monthly.ATC.str.startswith('N')) & (~(monthly.TKOST.isnull()))])
        cardio = cardio.append(monthly[(monthly.ATC.str.startswith('C')) & (~(monthly.TKOST.isnull()))])
        cancer = cancer.append(monthly[(monthly.ATC.str.startswith('L')) & (~(monthly.TKOST.isnull()))])
        addiction = addiction.append(monthly[(monthly.ATC.str.startswith('N07')) & (~(monthly.TKOST.isnull()))])
        Adrugs = Adrugs.append(monthly[(monthly.ATC.str.startswith('A')) & (~(monthly.TKOST.isnull()))])
        Mdrugs = Mdrugs.append(monthly[(monthly.ATC.str.startswith('M')) & (~(monthly.TKOST.isnull()))])
        Vdrugs = Vdrugs.append(monthly[(monthly.ATC.str.startswith('V')) & (~(monthly.TKOST.isnull()))])
        # all_drugs is filtered on cost only, not on the ATC code.
        all_drugs = all_drugs.append(monthly[(~(monthly.TKOST.isnull()))])
        del monthly

# For each category: sum within every (LopNr, year, month) group, cast the
# result to int, write it to CSV, and drop the frame before handling the next.
all_drugs = all_drugs.groupby(['LopNr','year','month']).sum()
all_drugs = all_drugs.astype(int,copy=False)
all_drugs.to_csv('PATH/monthly_all_drugs_costs.csv')
del all_drugs

neuro = neuro.groupby(['LopNr','year','month']).sum()
neuro = neuro.astype(int,copy=False)
neuro.to_csv('PATH/monthly_neuro_costs.csv')
del neuro

cardio = cardio.groupby(['LopNr','year','month']).sum()
cardio = cardio.astype(int,copy=False)
cardio.to_csv('PATH/monthly_cardio_costs.csv')
del cardio

cancer = cancer.groupby(['LopNr','year','month']).sum()
cancer = cancer.astype(int,copy=False)
cancer.to_csv('PATH/monthly_cancer_costs.csv')
del cancer

addiction = addiction.groupby(['LopNr','year','month']).sum()
addiction = addiction.astype(int,copy=False)
addiction.to_csv('PATH/monthly_addiction_costs.csv')
del addiction

Adrugs = Adrugs.groupby(['LopNr','year','month']).sum()
Adrugs = Adrugs.astype(int,copy=False)
Adrugs.to_csv('PATH/monthly_Adrugs_costs.csv')
del Adrugs

Mdrugs = Mdrugs.groupby(['LopNr','year','month']).sum()
Mdrugs = Mdrugs.astype(int,copy=False)
Mdrugs.to_csv('PATH/monthly_Mdrugs_costs.csv')
del Mdrugs

Vdrugs = Vdrugs.groupby(['LopNr','year','month']).sum()
Vdrugs = Vdrugs.astype(int,copy=False)
Vdrugs.to_csv('PATH/monthly_Vdrugs_costs.csv')
del Vdrugs
回答1:
Your code is quite repetitive and could be simplified with dictionary and list comprehensions. This solution should eliminate your memory issues, as you only process one month's data at a time (although you have a growing list of monthly summaries which I don't believe will use much memory).
I can't test this, but I believe it will do everything in your code above.
import pandas as pd
import iopro
# Output name -> ATC prefix selecting that category. The empty prefix means
# "every drug": no ATC filter is applied for it.
items = {'neuro': 'N',
         'cardio': 'C',
         'cancer': 'L',
         'addiction': 'N07',
         'Adrugs': 'A',
         'Mdrugs': 'M',
         'Vdrugs': 'V',
         'all_drugs': ''}

# Columns that identify one person-month in the aggregated output.
group_keys = ['LopNr', 'year', 'month']

# 1. Create data container using dictionary comprehension.
monthly_summaries = {name: [] for name in items}

# 2. Perform monthly groupby operations.
for year in xrange(2005, 2013):
    for month in xrange(1, 13):
        # Skip January-June 2005.
        if year == 2005 and month < 7:
            continue
        filename = 'PATH/lmed_' + str(year) + '_mon'+ str(month) +'.txt'
        adapter = iopro.text_adapter(filename,
                                     parser='csv',
                                     field_names=True,
                                     output='dataframe',
                                     delimiter='\t')
        monthly = adapter[['LopNr','ATC','TKOST']][:]
        monthly['year'] = year
        monthly['month'] = month

        # The cost mask is the same for every category, so compute it once.
        has_cost = monthly.TKOST.notnull()

        # Filter and aggregate one category at a time so that only a single
        # filtered copy of the month exists at any moment.
        for name, code in items.items():
            if code:
                # na=False: rows with a missing ATC code match no category.
                mask = has_cost & monthly.ATC.str.startswith(code, na=False)
            else:
                mask = has_cost
            # Sum only the cost column; the text ATC column must not reach
            # the integer cast.
            summary = monthly[mask].groupby(group_keys)[['TKOST']].sum()
            monthly_summaries[name].append(summary.astype(int, copy=False))

        # Release the raw month before the next file is read.
        del monthly

# 3. Now concatenate all of the monthly summaries into separate DataFrames.
#    The (LopNr, year, month) index is kept because step 4 groups on it.
dfs = {name: pd.concat(monthly_summaries[name])
       for name in items}

# 4. Now regroup the aggregate monthly summaries.
monthly_summaries = {name: dfs[name].reset_index().groupby(group_keys).sum()
                     for name in items}

# 5. Finally, save the aggregated results to files.
for name in items:
    monthly_summaries[name].to_csv('PATH/monthly_{0}_costs.csv'.format(name))
来源:https://stackoverflow.com/questions/31637289/memory-efficient-python-pandas-aggregates-of-categories-from-one-csv-file-per