Does pandas iterrows have performance issues?

前端 未结 6 1832
名媛妹妹
名媛妹妹 2020-11-21 07:04

I have noticed very poor performance when using iterrows from pandas.

Is this something that is experienced by others? Is it specific to iterrows, and should this function be avoided for data of a certain size (I'm working with 2–3 million rows)?

6条回答
  •  暖寄归人
    2020-11-21 07:58

    Another option is to use to_records(), which is faster than both itertuples and iterrows.

    But for your case, there is much room for other types of improvements.

    Here's my final optimized version

    def iterthrough():
        """Rebuild the global `table3`: for each letter in `table1`, keep the
        `table2` row whose number2 * number1 product is largest."""
        global table3
        by_letter = table2.groupby('letter', sort=False)
        records = table2.to_records()
        rows = []
        for _idx, letter, n1 in table1.to_records():
            # positional lookup of this letter's rows in the raw record array
            group = records[by_letter.groups[letter].values]
            # np.multiply is in general faster than "x * y"
            best = np.multiply(group.number2, n1).argmax()
            # `[1:]` strips the record's leading index field
            rows.append(group[best].tolist()[1:])
        table3 = pd.DataFrame(rows, columns=('letter', 'number2'))
    

    Benchmark test:

    -- iterrows() --
    100 loops, best of 3: 12.7 ms per loop
      letter  number2
    0      a      0.5
    1      b      0.1
    2      c      5.0
    3      d      4.0
    
    -- itertuple() --
    100 loops, best of 3: 12.3 ms per loop
    
    -- to_records() --
    100 loops, best of 3: 7.29 ms per loop
    
    -- Use group by --
    100 loops, best of 3: 4.07 ms per loop
      letter  number2
    1      a      0.5
    2      b      0.1
    4      c      5.0
    5      d      4.0
    
    -- Avoid multiplication --
    1000 loops, best of 3: 1.39 ms per loop
      letter  number2
    0      a      0.5
    1      b      0.1
    2      c      5.0
    3      d      4.0
    

    Full code:

    import pandas as pd
    import numpy as np
    
    #%% Source data: one row per letter in t1, several candidate rows per letter in t2
    t1 = dict(letter=list('abcd'), number1=[50, -10, .5, 3])
    t2 = dict(letter=list('aabbcdc'), number2=[0.2, 0.5, 0.1, 0.4, 5, 4, 1])

    table1 = pd.DataFrame(t1)
    table2 = pd.DataFrame(t2)

    #%% Result skeleton: all-NaN frame sharing table1's row index
    table3 = pd.DataFrame(np.nan, index=table1.index, columns=['letter', 'number2'])
    
    
    print('\n-- iterrows() --')
    
    def optimize(t2info, t1info):
        """Return the row of `t2info` maximising number2 * t1info.

        Assumes `t2info` carries a default 0..n-1 index (the caller resets it),
        so the list position of the max and the `.loc` label coincide.
        """
        products = [row['number2'] * t1info for _, row in t2info.iterrows()]
        winner = products.index(max(products))
        return t2info.loc[winner]
    
    #%% Iterate through filtering relevant data, optimizing, returning info
    def iterthrough():
        """Baseline iterrows version: fill each row of the global `table3`
        with the best `table2` match for the corresponding `table1` row."""
        for i, row in table1.iterrows():
            candidates = table2[table2.letter == row['letter']].reset_index()
            table3.iloc[i, :] = optimize(candidates, row['number1'])
    
    %timeit iterthrough()
    print(table3)
    
    print('\n-- itertuple() --')
    def optimize(t2info, n1):
        """itertuples variant: tuples unpack as (index, letter, number2);
        return the positional row of `t2info` maximising number2 * n1."""
        scores = [n2 * n1 for _, _, n2 in t2info.itertuples()]
        return t2info.iloc[scores.index(max(scores))]
    
    def iterthrough():
        """itertuples-driven fill of the global `table3`; one optimize()
        call per table1 row."""
        for i, letter, n1 in table1.itertuples():
            matches = table2[table2.letter == letter]
            table3.iloc[i, :] = optimize(matches, n1)
    
    %timeit iterthrough()
    
    
    print('\n-- to_records() --')
    def optimize(t2info, n1):
        """to_records variant: scan raw numpy records (index, letter, number2)
        and return the DataFrame row with the largest number2 * n1."""
        scores = [rec[2] * n1 for rec in t2info.to_records()]
        return t2info.iloc[scores.index(max(scores))]
    
    def iterthrough():
        """to_records-driven fill of the global `table3` (table1 records
        unpack as index, letter, number1)."""
        for i, letter, n1 in table1.to_records():
            subset = table2[table2.letter == letter]
            table3.iloc[i, :] = optimize(subset, n1)
    
    %timeit iterthrough()
    
    print('\n-- Use group by --')
    
    def iterthrough():
        """Group-by variant: rebuild the global `table3` from the best
        `table2` row (max number2 * number1) for each letter in `table1`."""
        ret = []
        grouped = table2.groupby('letter', sort=False)
        for index, letter, n1 in table1.to_records():
            # NOTE(review): groups[letter] holds index *labels* but is fed to
            # .iloc (positional) — only safe while table2 keeps its default
            # 0..n-1 RangeIndex.
            t2 = table2.iloc[grouped.groups[letter]]
            calculation = t2.number2 * n1
            # last position of the ascending argsort == position of the max
            maxrow = calculation.argsort().iloc[-1]
            ret.append(t2.iloc[maxrow])
        global table3
        # rows keep their original table2 index labels (hence 1,2,4,5 in the
        # printed benchmark output above)
        table3 = pd.DataFrame(ret)
    
    %timeit iterthrough()
    print(table3)
    
    print('\n-- Even Faster --')
    def iterthrough():
        """Fastest variant: raw record arrays plus precomputed group positions."""
        global table3
        positions = table2.groupby('letter', sort=False).groups
        recs = table2.to_records()
        out = []
        for _, letter, n1 in table1.to_records():
            candidates = recs[positions[letter].values]
            winner = np.multiply(candidates.number2, n1).argmax()
            # `[1:]` strips the record's leading index field
            out.append(candidates[winner].tolist()[1:])
        table3 = pd.DataFrame(out, columns=('letter', 'number2'))
    
    %timeit iterthrough()
    print(table3)
    

    The final version is almost 10x faster than the original code. The strategy is:

    1. Use groupby to avoid repeated comparing of values.
    2. Use to_records to access raw numpy.records objects.
    3. Don't operate on DataFrame until you have compiled all the data.

提交回复
热议问题