Duplicate entries and rename column row in pandas pivot table without aggregation


I'm trying to reshape this sample dataframe from long to wide format, without aggregating any of the data.

import numpy as np
import pandas as pd

# NOTE: the original sample data was cut off in the post; this is a
# placeholder using the columns the answer below relies on
df = pd.DataFrame({'Date': ['2019-01-01', '2019-01-01', '2019-01-02'],
                   'SubjectID': ['A', 'A', 'B'],
                   'Result': [1, 2, 3]})
        
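For reference, a plain pivot fails on exactly this kind of data because of the duplicate Date/SubjectID pairs; the sketch below (not part of the original post, and using the placeholder frame above) shows the error the question is working around:

# pivot cannot reshape duplicate (SubjectID, Date) pairs without aggregating,
# so this raises ValueError: "Index contains duplicate entries, cannot reshape"
df.pivot(index='SubjectID', columns='Date', values='Result')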
1 Answer
  • 2020-12-22 01:55

    You are on the correct path:

    # number the rows within each Date so duplicates stay distinct
    df['idx'] = df.groupby('Date').cumcount()

    # move idx and Date into the column index without aggregating anything
    new = df.set_index(['idx', 'Date', 'SubjectID']).unstack(level=[0, 1])

    # drop the idx level from the column MultiIndex, then flatten to value_date
    new.columns = new.columns.droplevel(1)
    new.columns = [f'{val}_{date}' for val, date in new.columns]


    I think this gives your expected output.
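
    With the placeholder frame from the question, the reshaped frame is indexed by SubjectID, and repeated Result_<Date> labels are expected since nothing is aggregated. A quick way to inspect it (exact labels depend on the assumed sample data):

    # peek at the flattened column names and the reshaped frame
    print(new.columns.tolist())
    print(new)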

    Using map looks like it will be a little faster:

    df['idx'] = df.groupby('Date').cumcount()
    # '_'.join needs string labels, so cast Date to str first
    df['Date'] = df['Date'].astype(str)
    new = df.set_index(['idx', 'Date', 'SubjectID']).unstack(level=[0, 1])
    new.columns = new.columns.droplevel(1)
    #new.columns = [f'{val}_{date}' for val, date in new.columns]
    new.columns = new.columns.map('_'.join)
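
    As a side note, the astype(str) cast above is only needed because '_'.join requires string labels; if you prefer to keep Date as a datetime, a formatting callable does the same flattening (a sketch, not from the original answer):

    # hypothetical alternative: format each (value, date) tuple directly,
    # so datetime labels don't have to be cast to str beforehand
    new.columns = new.columns.map(lambda col: f'{col[0]}_{col[1]}')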
    

    Here is a 50,000 row test example:

    # build a 50,000-row test frame
    data = pd.DataFrame(pd.date_range('2000-01-01', periods=50000, freq='D'))
    data['a'] = list('abcd')*12500
    data['b'] = 2
    data['c'] = list('ABCD')*12500
    data.rename(columns={0:'date'}, inplace=True)
    
    # list comprehension:
    %%timeit -r 3 -n 200
    new = data.set_index(['a','date','c']).unstack(level=[0,1])
    new.columns = new.columns.droplevel(0)
    new.columns = [f'{x}_{y}' for x,y in new.columns]
    
    # 98.2 ms ± 13.3 ms per loop (mean ± std. dev. of 3 runs, 200 loops each)
    
    # map with join:
    %%timeit -r 3 -n 200
    data['date'] = data['date'].astype(str)
    new = data.set_index(['a','date','c']).unstack(level=[0,1])
    new.columns = new.columns.droplevel(0)
    new.columns = new.columns.map('_'.join)
    
    # 84.6 ms ± 3.87 ms per loop (mean ± std. dev. of 3 runs, 200 loops each)
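
    As a sanity check (not in the original answer), both labelings produce identical column names once date is a string; a minimal sketch assuming the test frame above with data['date'] already cast to str:

    # both flattening styles applied to the same unstacked frame should agree
    wide = data.set_index(['a', 'date', 'c']).unstack(level=[0, 1])
    flat = wide.columns.droplevel(0)
    assert [f'{x}_{y}' for x, y in flat] == list(flat.map('_'.join))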
    