I\'m trying to create N balanced random subsamples of my large unbalanced dataset. Is there a way to do this simply with scikit-learn / pandas or do I have to implement it m
A version for pandas Series:
import numpy as np
def balanced_subsample(y, size=None):
subsample = []
if size is None:
n_smp = y.value_counts().min()
else:
n_smp = int(size / len(y.value_counts().index))
for label in y.value_counts().index:
samples = y[y == label].index.values
index_range = range(samples.shape[0])
indexes = np.random.choice(index_range, size=n_smp, replace=False)
subsample += samples[indexes].tolist()
return subsample