How to calculate 1st and 3rd quartiles?

前端 未结 10 1141
眼角桃花
眼角桃花 2020-12-04 19:14

I have DataFrame:

    time_diff   avg_trips
0   0.450000    1.0
1   0.483333    1.0
2   0.500000    1.0
3   0.516667    1.0
4   0.533333    2.0
相关标签:
10条回答
  • 2020-12-04 19:40

    you can use

    # Summary table per numeric column; pandas documents its 25%/50%/75% rows,
    # which correspond to Q1, the median and Q3.
    df.describe()
    

    which shows the quartiles (the 25%, 50% and 75% rows) alongside other summary statistics such as count, mean, min and max

    0 讨论(0)
  • 2020-12-04 19:44

    In my efforts to learn object-oriented programming alongside learning statistics, I made this, maybe you'll find it useful:

    # Example data: 20 integer samples, listed in ascending order.
    samplesCourse = [9, 10, 10, 11, 13, 15, 16, 19, 19, 21, 23, 28, 30, 33, 34, 36, 44, 45, 47, 60]
    
    class sampleSet:
        """Median, quartiles, IQR and upper fence for a list of numbers.

        Quartiles follow the "median of halves" method: Q1 is the median of
        the lower half of the sorted samples and Q3 is the median of the upper
        half. When the number of samples is odd, the middle element belongs to
        neither half.

        Every ``find_*`` method returns its result and also stores it on the
        instance (``median``, ``Q1``, ``Q3``, ``IQR``, ``fence``).
        """

        def __init__(self, sampleList):
            """Keep the caller's list and build a sorted working copy.

            ``sampleList`` is stored as given and never modified.
            ``interList`` is an independent, ascending copy that all
            computations read from, so unsorted input is handled correctly.
            """
            self.sampleList = sampleList
            self.interList = sorted(sampleList)

        @staticmethod
        def _median_of(values):
            """Return the median of an already sorted list.

            Raises:
                IndexError: if ``values`` is empty.
            """
            size = len(values)
            if size == 0:
                raise IndexError("cannot take the median of an empty list")

            mid = size // 2
            if size % 2 == 0:
                # Even length: average the two elements around the midpoint.
                return (values[mid - 1] + values[mid]) / 2
            return values[mid]

        def find_median(self):
            """Compute, store and return the median of the samples."""
            self.median = self._median_of(self.interList)
            return self.median

        def find_1stQuartile(self, median):
            """Compute, store and return Q1 (median of the lower half).

            ``median`` is accepted for backward compatibility only; the lower
            half is derived from positions, so no prior ``find_median`` call
            is required and the working list is never mutated.
            """
            # size // 2 elements sit strictly below the middle for both even
            # and odd lengths, so the middle element of an odd list is skipped.
            half = len(self.interList) // 2
            self.lower50List = self.interList[:half]
            self.Q1 = self._median_of(self.lower50List)
            return self.Q1

        def find_3rdQuartile(self, median):
            """Compute, store and return Q3 (median of the upper half).

            ``median`` is accepted for backward compatibility only.
            """
            # (size + 1) // 2 starts right after the middle element when the
            # length is odd and exactly at the midpoint when it is even.
            start = (len(self.interList) + 1) // 2
            self.upper50List = self.interList[start:]
            self.Q3 = self._median_of(self.upper50List)
            return self.Q3

        def find_InterQuartileRange(self, Q1, Q3):
            """Compute, store and return the interquartile range ``Q3 - Q1``."""
            self.IQR = Q3 - Q1
            return self.IQR

        def find_UpperFence(self, Q3, IQR):
            """Compute, store and return the upper fence ``Q3 + 1.5 * IQR``."""
            self.fence = Q3 + 1.5 * IQR
            return self.fence
    
    # Build the sample set, then derive each statistic; later steps are fed
    # the results of earlier ones.
    samples = sampleSet(samplesCourse)
    median = samples.find_median()
    firstQ, thirdQ = samples.find_1stQuartile(median), samples.find_3rdQuartile(median)
    iqr = samples.find_InterQuartileRange(firstQ, thirdQ)
    fence = samples.find_UpperFence(thirdQ, iqr)

    # Report every statistic, one per line, in the order it was computed.
    for label, value in (
        ("Median is: ", median),
        ("1st quartile is: ", firstQ),
        ("3rd quartile is: ", thirdQ),
        ("IQR is: ", iqr),
        ("Upper fence is: ", fence),
    ):
        print(label, value)
    
    0 讨论(0)
  • 2020-12-04 19:46

    By using pandas:

    # Request the 0.25, 0.5 and 0.75 quantiles (Q1, median, Q3) in one call.
    df.time_diff.quantile([0.25,0.5,0.75])
    
    
    Out[793]: 
    0.25    0.483333
    0.50    0.500000
    0.75    0.516667
    Name: time_diff, dtype: float64
    
    0 讨论(0)
  • 2020-12-04 19:47

    np.percentile (with its default linear interpolation) DOES NOT calculate Q1, the median and Q3 as defined by the "median of halves" method. Consider the sorted list below:

    samples = [1, 1, 8, 12, 13, 13, 14, 16, 19, 22, 27, 28, 31]
    

    running np.percentile(samples, [25, 50, 75]) returns the actual values from the list:

    Out[1]: array([12., 14., 22.])
    

    However, by the "median of halves" definition the quartiles are Q1=10.0, Median=14, Q3=24.5 (you can also use this link to find the quartiles and median online). The code below calculates the quartiles and median of a sorted list; because of the sorting step, this approach takes O(n log n) time, where n is the number of items. Alternatively, the quartiles and median can be found in O(n) time using the median-of-medians selection algorithm (order statistics).

    # Example data, sorted in place into ascending order.
    samples = [28, 12, 8, 27, 16, 31, 14, 13, 19, 1, 1, 22, 13]
    samples.sort()
    
    def find_median(sorted_list):
        """Return the median of an ascending list and the index/indices used.

        The result is a ``(median, indices)`` pair. For an odd-length list,
        ``indices`` holds the single middle index and the median is that
        element. For an even-length list, ``indices`` holds the two central
        indices and the median is the mean of those two elements.
        """
        half = len(sorted_list) // 2

        if len(sorted_list) % 2:
            # Odd length: the middle element is the median.
            return sorted_list[half], [half]

        # Even length: the two central elements sit at half - 1 and half
        # (indices are zero-based).
        lower, upper = half - 1, half
        return (sorted_list[lower] + sorted_list[upper]) / 2, [lower, upper]
    
    # Median of the whole list, plus the index/indices it was read from.
    median, median_indices = find_median(samples)

    # Everything before the first median index is the lower half; everything
    # after the last median index is the upper half.
    lower_half = samples[:median_indices[0]]
    upper_half = samples[median_indices[-1] + 1:]

    Q1, Q1_indices = find_median(lower_half)
    # NOTE: despite its name, Q2 holds the median of the upper half, i.e. the
    # value printed as Q3 below.
    Q2, Q2_indices = find_median(upper_half)

    quartiles = [Q1, median, Q2]

    print("(Q1, median, Q3): {}".format(quartiles))
    
    0 讨论(0)
  • 2020-12-04 19:49

    You can use np.percentile to calculate quartiles (including the median):

    >>> np.percentile(df.time_diff, 25)  # Q1
    0.48333300000000001
    
    >>> np.percentile(df.time_diff, 50)  # median
    0.5
    
    >>> np.percentile(df.time_diff, 75)  # Q3
    0.51666699999999999
    

    Or all at once:

    >>> np.percentile(df.time_diff, [25, 50, 75])
    array([ 0.483333,  0.5     ,  0.516667])
    
    0 讨论(0)
  • 2020-12-04 19:51

    I faced a similar problem when trying to find a package that computes quartiles. I am not saying the other answers are wrong; this is simply how I personally would define quartiles. It is similar to Shikar's results using the mid-point, but it also works on lists that have an odd length. If a quartile position falls between two indices, the average of the two neighbouring values is used (i.e. a position is always treated as either an exact index or halfway between two indices).

    import math
    
    def find_quartile_postions(size):
        """Return the (lower, median, upper) quartile positions for *size* items.

        Positions are zero-based and may be fractional; a fractional position
        lies between two neighbouring elements.
        """
        # One element: it is every quartile at once.
        if size == 1:
            return 0, 0, 0

        # Two elements: the lower quartile is the first, the upper quartile is
        # the second and the median lies halfway between them.
        # Return 0.5, 0.5, 0.5 instead if all quartiles should be the mean.
        if size == 2:
            return 0, 0.5, 1

        # General case: quartiles sit at 1/4, 1/2 and 3/4 of (size + 1),
        # shifted by one to become zero-based. They are fractional whenever
        # size + 1 is not divisible by 4.
        shifted = size + 1
        lower = shifted / 4 - 1
        middle = shifted / 2 - 1
        upper = 3 * shifted / 4 - 1
        return lower, middle, upper
    
    def find_quartiles(num_array):
        """Return ``[lower_quartile, median, upper_quartile]`` for *num_array*.

        The input is sorted into a new list first. An empty input yields
        ``[0, 0, 0]``. Each quartile is the element at its position when that
        position is a whole number; otherwise it is the mean of the element
        at the floored position and the element one position above.
        """
        if len(num_array) == 0:
            return [0, 0, 0]

        ordered = sorted(num_array)

        quartiles = []
        for position in find_quartile_postions(len(ordered)):
            # Floor so the position can be used as a list index.
            base = math.floor(position)
            if position % 1 == 0:
                quartiles.append(ordered[base])
            else:
                quartiles.append((ordered[base] + ordered[base + 1]) / 2)

        return quartiles
    
    0 讨论(0)
提交回复
热议问题