How to compute percentiles in Apache Spark

后端未结
关注
 10  537
遥遥无期 2020-12-04 22:08
I have an rdd of integers (i.e. RDD[Int]) and what I would like to do is to compute the following ten percentiles: [0th, 10th, 20th, ..., 90th, 100th]

      
      
        
          10条回答        

        
                    
            
            
                         
                
              
              
                
                   长情又很酷
                                             
                
                
                (楼主)
            
              
              
                2020-12-04 22:51
              

            
            
                        
Here is my Python implementation on Spark for calculating the percentile for a RDD containing values of interest. 

def percentile_threshold(ardd, percentile):
    assert percentile > 0 and percentile <= 100, "percentile should be larger then 0 and smaller or equal to 100"

    return ardd.sortBy(lambda x: x).zipWithIndex().map(lambda x: (x[1], x[0])) \
            .lookup(np.ceil(ardd.count() / 100 * percentile - 1))[0]

# Now test it out
import numpy as np
randlist = range(1,10001)
np.random.shuffle(randlist)
ardd = sc.parallelize(randlist)

print percentile_threshold(ardd,0.001)
print percentile_threshold(ardd,1)
print percentile_threshold(ardd,60.11)
print percentile_threshold(ardd,99)
print percentile_threshold(ardd,99.999)
print percentile_threshold(ardd,100)

# output:
# 1
# 100
# 6011
# 9900
# 10000
# 10000


Separately, I defined the following function to get the 10th to 100th percentile. 

def get_percentiles(rdd, stepsize=10):
    percentiles = []
    rddcount100 = rdd.count() / 100 
    sortedrdd = ardd.sortBy(lambda x: x).zipWithIndex().map(lambda x: (x[1], x[0]))


    for p in range(0, 101, stepsize):
        if p == 0:
            pass
            # I am not aware of a formal definition of 0 percentile, 
            # you can put a place holder like this if you want
            # percentiles.append(sortedrdd.lookup(0)[0] - 1) 
        elif p == 100:
            percentiles.append(sortedrdd.lookup(np.ceil(rddcount100 * 100 - 1))[0])
        else:
            pv = sortedrdd.lookup(np.ceil(rddcount100 * p) - 1)[0]
            percentiles.append(pv)

    return percentiles

randlist = range(1,10001)
np.random.shuffle(randlist)
ardd = sc.parallelize(randlist)
get_percentiles(ardd, 10)

# [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它10个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复