Consider the dataframe df
# Example frame: `A` is the group key and `B` counts down from 8 to 1.
# The quotes around the group pattern must be plain (unescaped) quotes.
df = pd.DataFrame(dict(
    A=list('XXYYXXYY'),
    B=range(8, 0, -1)
))
print(df)
A B
0 X 8
1 X 7
2 Y 6
3 Y 5
4 X 4
5 X 3
6 Y 2
7 Y 1
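The goal is to sort `B` within each group of `A` while keeping every group's rows in the positions that group originally occupied. The only way I figured out how to solve this efficiently was to sort twice and unwind once.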
# Work on the raw values so every step below is plain NumPy indexing.
v = df.values

# Stable (mergesort) argsort of the first column alone: rows become grouped
# by that column while each group keeps its original relative order.
a1 = np.argsort(v[:, 0], kind='mergesort')

# Build the inverse permutation of `a1`, so that indexing with `a_` undoes
# a sort that was performed with `a1`.
a_ = np.empty_like(a1)
a_[a1] = np.arange(a1.size)

# Argsort on both columns. The keys are reversed because lexsort treats the
# last key as the primary one. Grouped and sorted, but not yet in place.
a2 = np.lexsort(v.T[::-1])

# Apply `a2`, then unwind the grouping layer with `a_`.
pd.DataFrame(v[a2][a_], index=df.index[a2][a_], columns=df.columns)
A B
5 X 3
4 X 4
7 Y 1
6 Y 2
1 X 7
0 X 8
3 Y 5
2 Y 6
Testing
Code
def np_intra_sort(df):
    """Sort rows within each first-column group, keeping group slots fixed.

    Rows are ordered by all columns (first column as the primary key) and
    then placed back into the row positions that each first-column group
    occupied in ``df``.

    Args:
        df: DataFrame whose first column is the group key.

    Returns:
        A new DataFrame with the same columns as ``df`` and the index
        permuted along with the rows. It is rebuilt from ``df.values``, so
        column dtypes follow that array rather than the original frame.
    """
    v = df.values
    # Stable argsort of the group column alone: groups become contiguous
    # while each group's rows keep their original relative order.
    a1 = v[:, 0].argsort(kind='mergesort')
    # Inverse permutation of `a1`; it sends sorted rows back to the slots
    # their group originally occupied.
    a_ = np.empty_like(a1)
    a_[a1] = np.arange(len(a1))
    # Full sort; reversing the keys makes column 0 the primary lexsort key.
    a2 = np.lexsort(v.T[::-1])
    # Compose both permutations once instead of fancy-indexing twice.
    order = a2[a_]
    return pd.DataFrame(v[order], df.index[order], df.columns)
def pd_intra_sort(df):
    """Sort column ``B`` within each ``A`` group, keeping group slots fixed.

    Args:
        df: DataFrame with a group column ``A`` and a value column ``B``.

    Returns:
        ``df`` reindexed so that, inside the row positions held by each
        ``A`` group, rows appear in ascending order of ``B``.
    """
    def sub_sort(x):
        # Index labels of one group's rows, ordered by ascending value.
        return x.sort_values().index

    # For every original row position, the label of the row that belongs
    # there after the within-group sort.
    # NOTE(review): this is label-based, so it presumably needs unique index
    # labels for `reindex` to succeed; confirm against callers.
    idx = df.groupby('A').B.transform(sub_sort).values
    return df.reindex(idx)
Small data
Large data
# Large benchmark input: the 8-row group pattern repeated 10000 times, with
# `B` counting down from 80000 to 1.
df = pd.DataFrame({
    'A': list('XXYYXXYY') * 10000,
    'B': range(8 * 10000, 0, -1),
})