Fastest way to calculate difference in all columns

我在风中等你 2020-12-10 09:38

I have a dataframe of all float columns. For example:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(12.0).reshape(3,4), columns=list('ABCD'))

I want to compute the difference between every pair of columns. What is the fastest way to do that?
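
For reference, this is the dataframe built above (it matches the sample run shown in the answers below):

print(df)
#      A    B     C     D
# 0  0.0  1.0   2.0   3.0
# 1  4.0  5.0   6.0   7.0
# 2  8.0  9.0  10.0  11.0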


        
4 Answers
  • I think you can do it with NumPy. Let arr=df.values. First, let's find all two-column combinations:

    from itertools import combinations    
    column_combos = combinations(range(arr.shape[1]), 2)
    

    Now, subtract columns pairwise and convert a list of arrays back to a 2D array:

    result = np.array([(arr[:,x[1]] - arr[:,x[0]]) for x in column_combos]).T
    #array([[1., 2., 3., 1., 2., 1.],
    #       [1., 2., 3., 1., 2., 1.],
    #       [1., 2., 3., 1., 2., 1.]])
    

    Another solution is somewhat (~15%) faster because it subtracts whole 2D arrays rather than columns, and has fewer Python-side iterations:

    result = np.concatenate([(arr.T - arr.T[x])[x+1:] for x in range(arr.shape[1])]).T
    #array([[ 1., 2., 3., 1., 2., 1.],
    #       [ 1., 2., 3., 1., 2., 1.],
    #       [ 1., 2., 3., 1., 2., 1.]])
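
    To see what one iteration of that loop does, here is a sketch for x = 0: column A (row 0 of arr.T) is subtracted from every row of arr.T, and only the rows after it are kept; after the final transpose these become the BA, CA, DA columns:

    (arr.T - arr.T[0])[1:]
    #array([[1., 1., 1.],
    #       [2., 2., 2.],
    #       [3., 3., 3.]])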
    

    You can convert the result back to a DataFrame if you want:

    columns = list(map(lambda x: x[1]+x[0], combinations(df.columns, 2)))
    #['BA', 'CA', 'DA', 'CB', 'DB', 'DC']
    
    pd.DataFrame(result, columns=columns)
    #    BA   CA   DA   CB   DB   DC
    #0  1.0  2.0  3.0  1.0  2.0  1.0
    #1  1.0  2.0  3.0  1.0  2.0  1.0
    #2  1.0  2.0  3.0  1.0  2.0  1.0
    
  • 2020-12-10 10:00
    import itertools
    df = pd.DataFrame(np.arange(12.0).reshape(3,4), columns=list('ABCD'))
    df_cols = df.columns.tolist()
    # build an index array of all the column pairs that need subtracting
    idx = np.asarray(list(itertools.combinations(range(len(df_cols)), 2))).T
    # build a new DataFrame from the pairwise differences and paired column names
    df_new = pd.DataFrame(data=df.values[:, idx[0]] - df.values[:, idx[1]],
                          columns=[''.join(e) for e in itertools.combinations(df_cols, 2)])
    
    df_new
    Out[43]: 
        AB   AC   AD   BC   BD   CD
    0 -1.0 -2.0 -3.0 -1.0 -2.0 -1.0
    1 -1.0 -2.0 -3.0 -1.0 -2.0 -1.0
    2 -1.0 -2.0 -3.0 -1.0 -2.0 -1.0
    
  • 2020-12-10 10:06

    I am not sure how fast this is compared to other possible methods, but here it is:

    df = pd.DataFrame(np.arange(12.0).reshape(3,4), columns=list('ABCD'))
    
    # get the columns as list
    cols = list(df.columns)
    # define output dataframe
    out = pd.DataFrame()
    
    # loop over possible periods
    for period in range(1, df.shape[1]):
        names = [l1 + l2 for l1, l2 in zip(cols, cols[period:])]
        out[names] = df.diff(periods=period, axis=1).dropna(axis=1, how='all')
    
    print(out)
    
    # column name shows which two columns are subtracted
    
        AB   BC   CD   AC   BD   AD
    0  1.0  1.0  1.0  2.0  2.0  3.0
    1  1.0  1.0  1.0  2.0  2.0  3.0
    2  1.0  1.0  1.0  2.0  2.0  3.0
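
    For clarity, df.diff(periods=period, axis=1) subtracts from each column the column that sits period positions to its left, so the leading period columns become all-NaN and are dropped by dropna(axis=1, how='all'). A small sketch of the period=2 step on the same dataframe:

    df.diff(periods=2, axis=1)
    #     A   B    C    D
    # 0 NaN NaN  2.0  2.0
    # 1 NaN NaN  2.0  2.0
    # 2 NaN NaN  2.0  2.0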
    
  • 2020-12-10 10:12

    Listed in this post are two NumPy approaches for performance - one fully vectorized, and one with a single loop.

    Approach #1

    def numpy_triu1(df):          
        a = df.values
        r,c = np.triu_indices(a.shape[1],1)
        cols = df.columns
        nm = [cols[i]+"_"+cols[j] for i,j in zip(r,c)]
        return pd.DataFrame(a[:,r] - a[:,c], columns=nm)
    

    Sample run -

    In [72]: df
    Out[72]: 
         A    B     C     D
    0  0.0  1.0   2.0   3.0
    1  4.0  5.0   6.0   7.0
    2  8.0  9.0  10.0  11.0
    
    In [78]: numpy_triu1(df)
    Out[78]: 
       A_B  A_C  A_D  B_C  B_D  C_D
    0 -1.0 -2.0 -3.0 -1.0 -2.0 -1.0
    1 -1.0 -2.0 -3.0 -1.0 -2.0 -1.0
    2 -1.0 -2.0 -3.0 -1.0 -2.0 -1.0
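
    For reference, np.triu_indices(n, 1) returns the row and column indices of the entries strictly above the diagonal of an n x n matrix, which are exactly the (i, j) pairs with i < j. A quick sketch for the four columns used here:

    r, c = np.triu_indices(4, 1)
    # r -> array([0, 0, 0, 1, 1, 2])
    # c -> array([1, 2, 3, 2, 3, 3])
    # i.e. the pairs (A,B), (A,C), (A,D), (B,C), (B,D), (C,D)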
    

    Approach #2

    If we are okay with an array as output, or a dataframe without specialized column names, here's another one -

    def pairwise_col_diffs(a): # a is df.values
        n = a.shape[1]
        N = n*(n-1)//2
        # start/stop offsets of each column's block of differences in the output
        idx = np.concatenate(( [0], np.arange(n-1,0,-1).cumsum() ))
        start, stop = idx[:-1], idx[1:]
        out = np.empty((a.shape[0],N),dtype=a.dtype)
        for i in range(n-1):
            # subtract all later columns from column i in one vectorized step
            out[:, start[i]:stop[i]] = a[:,i,None] - a[:,i+1:]
        return out
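
    A quick usage sketch on the same sample dataframe; the column pairs come out in the same (A,B), (A,C), (A,D), (B,C), (B,D), (C,D) order as Approach #1:

    pairwise_col_diffs(df.values)
    # array([[-1., -2., -3., -1., -2., -1.],
    #        [-1., -2., -3., -1., -2., -1.],
    #        [-1., -2., -3., -1., -2., -1.]])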
    

    Runtime test

    Since the OP has mentioned that a multi-dimensional array output would work for them as well, here are the array-based approaches from the other authors -

    # @Allen's soln
    def Allen(arr):
        n = arr.shape[1]
        idx = np.asarray(list(itertools.combinations(range(n),2))).T
        return arr[:,idx[0]]-arr[:,idx[1]]
    
    # @DYZ's soln
    def DYZ(arr):
        result = np.concatenate([(arr.T - arr.T[x])[x+1:] \
                for x in range(arr.shape[1])]).T
        return result
    

    The pandas-based solution from @Gerges Dib's post wasn't included, as it came out very slow compared to the others.

    Timings -

    We will use three dataset sizes - 100, 500 and 1000 columns:

    In [118]: df = pd.DataFrame(np.random.randint(0,9,(3,100)))
         ...: a = df.values
         ...: 
    
    In [119]: %timeit DYZ(a)
         ...: %timeit Allen(a)
         ...: %timeit pairwise_col_diffs(a)
         ...: 
    1000 loops, best of 3: 258 µs per loop
    1000 loops, best of 3: 1.48 ms per loop
    1000 loops, best of 3: 284 µs per loop
    
    In [121]: df = pd.DataFrame(np.random.randint(0,9,(3,500)))
         ...: a = df.values
         ...: 
    
    In [122]: %timeit DYZ(a)
         ...: %timeit Allen(a)
         ...: %timeit pairwise_col_diffs(a)
         ...: 
    100 loops, best of 3: 2.56 ms per loop
    10 loops, best of 3: 39.9 ms per loop
    1000 loops, best of 3: 1.82 ms per loop
    
    In [123]: df = pd.DataFrame(np.random.randint(0,9,(3,1000)))
         ...: a = df.values
         ...: 
    
    In [124]: %timeit DYZ(a)
         ...: %timeit Allen(a)
         ...: %timeit pairwise_col_diffs(a)
         ...: 
    100 loops, best of 3: 8.61 ms per loop
    10 loops, best of 3: 167 ms per loop
    100 loops, best of 3: 5.09 ms per loop
    