memory efficient Python (pandas) aggregates of categories from one csv file per period

ぐ巨炮叔叔 提交于 2020-01-25 18:31:00

问题


I am trying to avoid a segmentation fault with either pandas or IOPro (I am still investigating which), so I am looking for alternative solutions, especially more memory-efficient ones. The code below runs fine with small data but crashes when reading in 90 monthly panels of a few GB each on a Linux server with 256 GB RAM (versions: pandas 0.16.2 np19py26_0, iopro 1.7.1 np19py27_p0, and python 2.7.10 0).

What I do here is that I aggregate accounts of drug purchase records (cost in TKOST) for each person (LopNr) and month, while also separating the drugs into categories using their ATC codes.

So while the original data would look like this, in monthly csv files (say July 2006 here, with many other columns in the csv I don't need):

LopNr TKOST ATC
1         5 N01
1        11 N01
1         6 N15

etc.

I wanted aggregate panels, with rows like

LopNr TKOST year month
1        22 2006     7

either separately for a few categories (e.g. neuro for ATCs starting with N here), or with separate summaries for these categories in a single datafile (so with a neuro column etc.).

I opted for IOPro rather than plain pandas to be more memory-efficient, but now I am getting a segmentation fault.

# -*- coding: utf-8 -*-
import iopro
from pandas import *

# One accumulator DataFrame per output file. The monthly loop below appends
# the matching raw purchase rows to each of them, selected by ATC prefix:
#   neuro -> 'N', cardio -> 'C', cancer -> 'L', addiction -> 'N07',
#   Adrugs -> 'A', Mdrugs -> 'M', Vdrugs -> 'V', all_drugs -> no ATC filter.
# All of them start empty and grow for the whole run; nothing is aggregated
# until the loop has finished.
neuro   = DataFrame()
cardio  = DataFrame()
cancer  = DataFrame()
addiction  = DataFrame()
Adrugs  = DataFrame()
Mdrugs  = DataFrame()
Vdrugs  = DataFrame()
all_drugs  = DataFrame()

# Read every monthly file from July 2005 through December 2012 (90 files) and
# append its rows, unaggregated, to the per-category accumulators.
for year in xrange(2005,2013):
    for month in xrange(1,13):
        # The first half of 2005 is skipped; the series starts in July 2005.
        if year == 2005 and month < 7:
            continue
        filename = 'PATH/lmed_' + str(year) + '_mon'+ str(month) +'.txt'
        # Tab-delimited text file read through IOPro with DataFrame output.
        # NOTE(review): field_names=True presumably takes the column names
        # from the file's header line -- confirm against the IOPro docs.
        adapter = iopro.text_adapter(filename,parser='csv',field_names=True,output='dataframe',delimiter='\t')
        # Keep only the three needed columns; [:] presumably materialises all
        # records of the file in memory at once -- TODO confirm.
        monthly = adapter[['LopNr','ATC','TKOST']][:]
        # Tag every row with its period so the later groupby can use it.
        monthly['year']=year
        monthly['month']=month
        # For each category keep rows whose ATC starts with the prefix and
        # whose TKOST is not null, and append them to the accumulator.
        # NOTE(review): DataFrame.append returns a new frame on every call,
        # so the accumulated raw rows are presumably re-copied each month;
        # this is a likely contributor to the memory pressure.
        neuro = neuro.append(monthly[(monthly.ATC.str.startswith('N')) & (~(monthly.TKOST.isnull()))])
        cardio = cardio.append(monthly[(monthly.ATC.str.startswith('C')) & (~(monthly.TKOST.isnull()))])
        cancer = cancer.append(monthly[(monthly.ATC.str.startswith('L')) & (~(monthly.TKOST.isnull()))])
        # 'N07' also starts with 'N', so these rows are in neuro as well.
        addiction = addiction.append(monthly[(monthly.ATC.str.startswith('N07')) & (~(monthly.TKOST.isnull()))])
        Adrugs = Adrugs.append(monthly[(monthly.ATC.str.startswith('A')) & (~(monthly.TKOST.isnull()))])
        Mdrugs = Mdrugs.append(monthly[(monthly.ATC.str.startswith('M')) & (~(monthly.TKOST.isnull()))])
        Vdrugs = Vdrugs.append(monthly[(monthly.ATC.str.startswith('V')) & (~(monthly.TKOST.isnull()))])
        # all_drugs has no ATC filter: every row with a non-null TKOST.
        all_drugs = all_drugs.append(monthly[(~(monthly.TKOST.isnull()))])
        # Drop the reference to this month's raw frame before the next read.
        del monthly

# For each category in turn: sum per (LopNr, year, month), cast the result to
# int, write it to its own CSV file, then delete the frame before moving on
# to the next category.
# NOTE(review): the frames still carry the string column ATC at this point;
# the sum followed by astype(int) presumably relies on this pandas version
# dropping non-numeric columns from groupby sums -- verify.

# All drugs, regardless of ATC code.
all_drugs = all_drugs.groupby(['LopNr','year','month']).sum()
all_drugs = all_drugs.astype(int,copy=False)
all_drugs.to_csv('PATH/monthly_all_drugs_costs.csv')
del all_drugs

# ATC prefix 'N'.
neuro = neuro.groupby(['LopNr','year','month']).sum()
neuro = neuro.astype(int,copy=False)
neuro.to_csv('PATH/monthly_neuro_costs.csv')
del neuro

# ATC prefix 'C'.
cardio = cardio.groupby(['LopNr','year','month']).sum()
cardio = cardio.astype(int,copy=False)
cardio.to_csv('PATH/monthly_cardio_costs.csv')
del cardio

# ATC prefix 'L'.
cancer = cancer.groupby(['LopNr','year','month']).sum()
cancer = cancer.astype(int,copy=False)
cancer.to_csv('PATH/monthly_cancer_costs.csv')
del cancer

# ATC prefix 'N07' (a subset of the 'N' rows).
addiction = addiction.groupby(['LopNr','year','month']).sum()
addiction = addiction.astype(int,copy=False)
addiction.to_csv('PATH/monthly_addiction_costs.csv')
del addiction

# ATC prefix 'A'.
Adrugs = Adrugs.groupby(['LopNr','year','month']).sum()
Adrugs = Adrugs.astype(int,copy=False)
Adrugs.to_csv('PATH/monthly_Adrugs_costs.csv')
del Adrugs

# ATC prefix 'M'.
Mdrugs = Mdrugs.groupby(['LopNr','year','month']).sum()
Mdrugs = Mdrugs.astype(int,copy=False)
Mdrugs.to_csv('PATH/monthly_Mdrugs_costs.csv')
del Mdrugs

# ATC prefix 'V'.
Vdrugs = Vdrugs.groupby(['LopNr','year','month']).sum()
Vdrugs = Vdrugs.astype(int,copy=False)
Vdrugs.to_csv('PATH/monthly_Vdrugs_costs.csv')
del Vdrugs

回答1:


Your code is quite repetitive and could be simplified with dictionary and list comprehensions. This solution should eliminate your memory issues, as you only process one month's data at a time (although you have a growing list of monthly summaries which I don't believe will use much memory).

I can't test this, but I believe it will do everything in your code above.

import pandas as pd
import iopro

# Map each output category to the ATC prefix that selects its rows.
# The empty prefix for 'all_drugs' means "no ATC filter at all".
items = {'neuro': 'N',
         'cardio': 'C',
         'cancer': 'L',
         'addiction': 'N07',
         'Adrugs': 'A',
         'Mdrugs': 'M',
         'Vdrugs': 'V',
         'all_drugs': ''}

# Columns that identify one person-month in the output panels.
group_keys = ['LopNr', 'year', 'month']

# 1. Create data container: one list of small per-month summaries per category.
monthly_summaries = {name: [] for name in items}

# 2. Perform monthly groupby operations, so that only one month of raw data
#    is held in memory at a time (July 2005 through December 2012).
#    range() and dict.items() are used so the script runs on Python 2 and 3.
for year in range(2005, 2013):
    for month in range(1, 13):
        if year == 2005 and month < 7:
            continue
        filename = 'PATH/lmed_' + str(year) + '_mon' + str(month) + '.txt'
        adapter = iopro.text_adapter(filename,
                                     parser='csv',
                                     field_names=True,
                                     output='dataframe',
                                     delimiter='\t')
        monthly = adapter[['LopNr', 'ATC', 'TKOST']][:]
        monthly['year'] = year
        monthly['month'] = month

        # Rows without a cost never contribute; compute this mask once.
        has_cost = monthly.TKOST.notnull()

        for name, code in items.items():
            if code:
                # na=False: a row with a missing ATC code belongs to no
                # prefix-based category (and keeps the mask strictly boolean).
                mask = monthly.ATC.str.startswith(code, na=False) & has_cost
            else:
                # 'all_drugs': every row with a cost, even if ATC is missing.
                mask = has_cost
            selected = monthly[mask]
            # Sum only TKOST explicitly, so the string column ATC can never
            # leak into the aggregation or break the integer cast.
            summary = selected.groupby(group_keys)[['TKOST']].sum()
            monthly_summaries[name].append(summary.astype(int, copy=False))

        # Release this month's raw data before the next file is read.
        del monthly, selected, mask, has_cost

# 3. Now concatenate all of the monthly summaries into separate DataFrames.
#    The (LopNr, year, month) index must be kept, so ignore_index is not used.
dfs = {name: pd.concat(monthly_summaries[name]) for name in items}

# 4. Now regroup the aggregate monthly summaries.
monthly_summaries = {name: dfs[name].reset_index().groupby(group_keys).sum()
                     for name in items}

# 5. Finally, save the aggregated results to files.
for name in items:
    monthly_summaries[name].to_csv('PATH/monthly_{0}_costs.csv'.format(name))


来源:https://stackoverflow.com/questions/31637289/memory-efficient-python-pandas-aggregates-of-categories-from-one-csv-file-per

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!