I'm trying to create N balanced random subsamples of my large unbalanced dataset. Is there a simple way to do this with scikit-learn / pandas, or do I have to implement it myself?
Here is my version of a subsampler; I hope it helps:
def subsample_indices(y, size):
    """Return a class-balanced random subsample of positions in *y*.

    The positions of *y* are grouped by label, and every group is reduced
    to the same length: ``min(size, size_of_smallest_group)``.

    Args:
        y: Iterable of hashable class labels.
        size: Maximum number of indices to keep per class.

    Returns:
        A dict mapping each distinct label to a list of indices into *y*.
        Groups larger than the common length are randomly sampled without
        replacement; groups already at that length are returned whole, in
        their original order. An empty *y* yields an empty dict.

    Raises:
        ValueError: If *size* is negative (raised by ``random.sample``).
    """
    # Imported locally so the snippet runs on its own without relying on
    # imports elsewhere in the file.
    import random
    from collections import defaultdict

    # Single pass over the labels of *y* itself (not some outer variable).
    grouped = defaultdict(list)
    for position, label in enumerate(y):
        grouped[label].append(position)

    # With no labels there is no smallest class to measure.
    if not grouped:
        return {}

    min_len = min(size, min(len(members) for members in grouped.values()))

    indices = {}
    for label, members in grouped.items():
        if len(members) > min_len:
            indices[label] = random.sample(members, min_len)
        else:
            indices[label] = members
    return indices
# Example: keep at most 2 randomly chosen indices per class label.
x = [1, 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1]
j = subsample_indices(x, 2)
# Single-argument print(...) gives the same output on Python 2 and Python 3,
# whereas the bare `print expr` statement is a SyntaxError on Python 3.
print(j)
# Map the sampled indices back to their labels to confirm the balance.
print([x[t] for t in j[-1]])
print([x[t] for t in j[1]])