split->apply->combine
12 import numpy as npimport pandas as pd
12345 df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'], 'key2' : ['one', 'two', 'one', 'two', 'one'], 'data1' : np.random.randn(5), 'data2' : np.random.randn(5)})df
key1
key2
data1
data2
0
a
one
1.587125
-0.517650
1
a
two
0.206854
1.503013
2
b
one
1.074688
-1.310088
3
b
two
0.306591
2.236456
4
a
one
0.462624
0.643336
123 grouped = df['data1'].groupby(df['key1'])grouped #一个groupby对象
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x10637ba20>
key1
a 0.752201
b 0.690639
Name: data1, dtype: float64
count
mean
std
min
25%
50%
75%
max
key1
a
3.0
0.752201
0.734288
0.206854
0.334739
0.462624
1.024874
1.587125
b
2.0
0.690639
0.543127
0.306591
0.498615
0.690639
0.882664
1.074688
12 means = df['data1'].groupby([df['key1'],df['key2']]).mean()means
key1 key2
a one 1.024874
two 0.206854
b one 1.074688
two 0.306591
Name: data1, dtype: float64
key2
one
two
key1
a
1.024874
0.206854
b
1.074688
0.306591
123 states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])years = np.array([2005, 2005, 2006, 2005, 2006])df['data1'].groupby([states, years]).mean()
California 2005 0.206854
2006 1.074688
Ohio 2005 0.946858
2006 0.462624
Name: data1, dtype: float64
1 df.groupby('key1').mean()
data1
data2
key1
a
0.752201
0.542900
b
0.690639
0.463184
1 df.groupby(['key1','key2']).mean()
data1
data2
key1
key2
a
one
1.024874
0.062843
two
0.206854
1.503013
b
one
1.074688
-1.310088
two
0.306591
2.236456
对分组进行迭代 12345 #GroupBy对象支持迭代,可以产生一组二元元组for name,group in df.groupby('key1'): print(name) print(group)
a
key1 key2 data1 data2
0 a one 1.587125 -0.517650
1 a two 0.206854 1.503013
4 a one 0.462624 0.643336
b
key1 key2 data1 data2
2 b one 1.074688 -1.310088
3 b two 0.306591 2.236456
12345 #多个键则返回由键值组成的元组for name,group in df.groupby(['key1','key2']): print(name) print(group)
('a', 'one')
key1 key2 data1 data2
0 a one 1.587125 -0.517650
4 a one 0.462624 0.643336
('a', 'two')
key1 key2 data1 data2
1 a two 0.206854 1.503013
('b', 'one')
key1 key2 data1 data2
2 b one 1.074688 -1.310088
('b', 'two')
key1 key2 data1 data2
3 b two 0.306591 2.236456
123 #造个字典pieces = dict(list(df.groupby('key1')))pieces['b']
key1
key2
data1
data2
2
b
one
1.074688
-1.310088
3
b
two
0.306591
2.236456
选取一个或者一列 1 df['data1'].groupby(df['key1']).describe()
count
mean
std
min
25%
50%
75%
max
key1
a
3.0
0.752201
0.734288
0.206854
0.334739
0.462624
1.024874
1.587125
b
2.0
0.690639
0.543127
0.306591
0.498615
0.690639
0.882664
1.074688
1 df.groupby(['key1'])['data1'].describe() #单独选取‘data1’这一列
count
mean
std
min
25%
50%
75%
max
key1
a
3.0
0.752201
0.734288
0.206854
0.334739
0.462624
1.024874
1.587125
b
2.0
0.690639
0.543127
0.306591
0.498615
0.690639
0.882664
1.074688
通过字典或Series进行分组 12345 people = pd.DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'], index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])people.iloc[2:3, [1, 2]] = np.nan # Add a few NA valuespeople
a
b
c
d
e
Joe
-0.323368
-0.768278
-0.169238
0.308583
-0.459496
Steve
-0.597247
-0.301618
-0.439395
-1.482591
-0.019049
Wes
0.100330
NaN
NaN
0.457292
1.192268
Jim
-0.223226
-0.196377
2.264969
-0.982435
-1.851675
Travis
-1.245037
-0.080968
-0.509985
0.928633
-0.474299
12 mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}
12 by_columns = people.groupby(mapping,axis=1) #要和轴长度一致by_columns.sum()
blue
red
Joe
0.139345
-1.551143
Steve
-1.921986
-0.917914
Wes
0.457292
1.292598
Jim
1.282534
-2.271279
Travis
0.418648
-1.800303
通过函数进行分组 1 people.groupby(len).sum()
a
b
c
d
e
3
-0.446265
-0.964655
2.095731
-0.216560
-1.118904
5
-0.597247
-0.301618
-0.439395
-1.482591
-0.019049
6
-1.245037
-0.080968
-0.509985
0.928633
-0.474299
根据索引级别分组 12345 columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]], names=['cty', 'tenor'])hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)hier_df
cty
US
JP
tenor
1
3
5
1
3
0
-0.297194
0.028656
1.206242
1.330768
-0.267134
1
0.104323
-1.416412
-0.602244
-0.442782
-0.654808
2
1.199846
1.050484
0.077949
-0.214932
1.186233
3
0.214656
0.337322
-0.324943
1.581862
0.821636
数据聚合 主要讲sum(),count()等函数计算时到底发生了什么,我们也可以使用自己定义的聚合函数。
12 grouped = df.groupby('key1')grouped['data1'].quantile(0.9)
key1
a 1.362225
b 0.997878
Name: data1, dtype: float64
1 grouped.describe().stack().head()
data1
data2
key1
a
count
3.000000
3.000000
mean
0.752201
0.542900
std
0.734288
1.014069
min
0.206854
-0.517650
25%
0.334739
0.062843
分组运算和转换 apply:一般性的‘拆分-应用-合并’ 最一般化的GroupBy方法
123 tips = pd.read_csv('examples/tips.csv')tips['tip_pct'] = tips['tip'] / tips['total_bill']tips[:6]
total_bill
tip
smoker
day
time
size
tip_pct
0
16.99
1.01
No
Sun
Dinner
2
0.059447
1
10.34
1.66
No
Sun
Dinner
3
0.160542
2
21.01
3.50
No
Sun
Dinner
3
0.166587
3
23.68
3.31
No
Sun
Dinner
2
0.139780
4
24.59
3.61
No
Sun
Dinner
4
0.146808
5
25.29
4.71
No
Sun
Dinner
4
0.186240
1 grouped = tips.groupby(['day', 'smoker'])
12 grouped_pct = grouped['tip_pct']grouped_pct.agg('mean')
day smoker
Fri No 0.151650
Yes 0.174783
Sat No 0.158048
Yes 0.147906
Sun No 0.160113
Yes 0.187250
Thur No 0.160298
Yes 0.163863
Name: tip_pct, dtype: float64
1 grouped_pct.agg(['mean', 'std'])
mean
std
day
smoker
Fri
No
0.151650
0.028123
Yes
0.174783
0.051293
Sat
No
0.158048
0.039767
Yes
0.147906
0.061375
Sun
No
0.160113
0.042347
Yes
0.187250
0.154134
Thur
No
0.160298
0.038774
Yes
0.163863
0.039389
1 grouped_pct.agg([('foo', 'mean'), ('bar', np.std)]) #传入tuple
foo
bar
day
smoker
Fri
No
0.151650
0.028123
Yes
0.174783
0.051293
Sat
No
0.158048
0.039767
Yes
0.147906
0.061375
Sun
No
0.160113
0.042347
Yes
0.187250
0.154134
Thur
No
0.160298
0.038774
Yes
0.163863
0.039389
123 functions = ['count', 'mean', 'max']result = grouped['tip_pct', 'total_bill'].agg(functions)result
tip_pct
total_bill
count
mean
max
count
mean
max
day
smoker
Fri
No
4
0.151650
0.187735
4
18.420000
22.75
Yes
15
0.174783
0.263480
15
16.813333
40.17
Sat
No
45
0.158048
0.291990
45
19.661778
48.33
Yes
42
0.147906
0.325733
42
21.276667
50.81
Sun
No
57
0.160113
0.252672
57
20.506667
48.17
Yes
19
0.187250
0.710345
19
24.120000
45.35
Thur
No
45
0.160298
0.266312
45
17.113111
41.19
Yes
17
0.163863
0.241255
17
19.190588
43.11
分位数和桶分析 123456 #和pandas的cut|qcut结合frame = pd.DataFrame({'data1': np.random.randn(1000), 'data2': np.random.randn(1000)})quartiles = pd.cut(frame.data1, 4)quartiles[:10]
0 (-2.845, -1.307]
1 (0.225, 1.757]
2 (0.225, 1.757]
3 (0.225, 1.757]
4 (-1.307, 0.225]
5 (0.225, 1.757]
6 (0.225, 1.757]
7 (-1.307, 0.225]
8 (1.757, 3.29]
9 (-1.307, 0.225]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.845, -1.307] < (-1.307, 0.225] < (0.225, 1.757] < (1.757, 3.29]]
12345 def (group): return {'min': group.min(), 'max': group.max(), 'count': group.count(), 'mean': group.mean()}grouped = frame.data2.groupby(quartiles)grouped.apply(get_stats).unstack()
count
max
mean
min
data1
(-2.845, -1.307]
106.0
1.826602
0.117032
-2.382809
(-1.307, 0.225]
478.0
3.553988
0.079556
-2.690484
(0.225, 1.757]
379.0
2.647662
0.005168
-2.656434
(1.757, 3.29]
37.0
1.995249
0.024612
-1.838962
示例 后续补上