I'm working on porting a MATLAB simulation into C++. To do this, I am trying to replicate MATLAB's randsample() function. I haven't figured out an efficient way to do this.
Starting from C++17, there's a standard function for that: std::sample in the <algorithm> header. It is guaranteed to run in linear time.
Sample (pun intended) usage:
#include <algorithm>
#include <iostream>
#include <iterator>
#include <random>
#include <vector>
int main()
{
    std::vector<int> population {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    std::vector<int> sample;
    std::sample(population.begin(), population.end(),
                std::back_inserter(sample),
                5,
                std::mt19937{std::random_device{}()});
    for (int i : sample)
        std::cout << i << " "; // prints 5 randomly chosen values from the population vector
}
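One detail worth noting: std::sample selects elements in a stable way (the chosen elements keep their relative order from the population), so the output above comes out in ascending population order. If, like MATLAB's randsample, you also want the sample itself in random order, shuffle the result afterwards, e.g.:
std::shuffle(sample.begin(), sample.end(), std::mt19937{std::random_device{}()});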
Bob Floyd's sampling is a good solution. Reservoir sampling, however, can be a better option when k is on the same order of magnitude as N.
Reservoir sampling:
vector<size_t> reservoir_sample(const size_t& k, const size_t& N) {
    vector<size_t> sample;
    if (k == 0) return sample;
    std::default_random_engine gen;
    size_t i;
    // fill the reservoir with the first k indices
    for (i = 0; i != k; ++i) sample.push_back(i);
    // each later index i replaces a uniformly chosen reservoir slot with probability k/(i+1)
    for (; i < N; ++i) {
        uniform_int_distribution<size_t> distr(0, i);
        const size_t j = distr(gen);
        if (j >= k) continue;
        sample[j] = i;
    }
    std::shuffle(sample.begin(), sample.end(), gen);
    return sample;
}
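Since reservoir_sample returns indices in [0, N), sampling from an existing container is just a matter of indexing back into it. A minimal usage sketch (my own illustration, relying on the includes and using-declarations listed further down):
std::vector<double> data(1000, 3.14);                        // population to sample from
std::vector<size_t> idx = reservoir_sample(10, data.size()); // 10 distinct indices into data
std::vector<double> picked;
for (size_t j : idx) picked.push_back(data[j]);              // 10 uniformly chosen elements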
Bob Floyd's sampling:
std::unordered_set<size_t> floyd_sample(const size_t& k, const size_t& N) {
    std::default_random_engine gen;
    // for the benchmark I used a faster hash table
    std::unordered_set<size_t> elems(k); // preallocation is good
    for (size_t r = N - k; r < N; ++r) {
        // pick uniformly from [0, r]; on a collision insert r, which cannot be in the set yet
        size_t v = std::uniform_int_distribution<size_t>(0, r)(gen);
        if (!elems.insert(v).second) elems.insert(r);
    }
    return elems;
}
Incomplete shuffle sampling:
#include <vector>
#include <unordered_set>
#include <random>
#include <algorithm>
#include <numeric>  // std::iota
#include <ctime>    // std::clock
#include <iostream>
#include <iomanip>
using std::vector;
using std::uniform_int_distribution;
using std::shuffle;
using std::cout;
using std::swap;
// partial Fisher-Yates shuffle: only the positions [beg, unt) end up shuffled,
// each holding a uniformly chosen element from the remaining suffix
template<class iterator, class generator>
inline void shuffle(iterator beg, iterator unt, iterator end, generator gen) {
    for (; beg != unt; ++beg) {
        const size_t i = end - beg; // number of candidates left
        size_t r = uniform_int_distribution<size_t>(0, i - 1)(gen);
        swap(*beg, *(beg + r));
    }
}
template<class iterator>
vector<size_t> sample(const size_t& k, iterator beg, iterator end) {
    vector<size_t> sample(k);
    std::default_random_engine gen;
    const size_t n = end - beg;
    if (k < n / 2) {
        // shuffle only the first k positions and keep them
        shuffle(beg, beg + k, end, gen);
        for (size_t i = 0; i != k; ++i, ++beg) sample[i] = *beg;
    } else {
        // cheaper to shuffle the n-k positions we discard and keep the rest
        const size_t l = n - k;
        shuffle(beg, beg + l, end, gen);
        beg += l;
        for (size_t i = 0; i != k; ++i, ++beg) sample[i] = *beg;
    }
    return sample;
}
int main(int argc, char** argv) {
    // argv[1] = k (sample size), argv[2] = N (population size)
    vector<size_t> samples(std::stol(argv[2]));
    auto start = std::clock();
    std::iota(samples.begin(), samples.end(), 0);
    sample(std::stol(argv[1]), samples.begin(), samples.end());
    cout << std::setw(12) << (std::clock() - start);
}
Some notes: std::shuffle always shuffles the whole range, but when you only need k items you can stop the Fisher-Yates shuffle after the k-th element, which makes it the fastest method when the set to be sampled from already exists in memory.
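For reference, here is a minimal standalone sketch of that idea, a truncated Fisher-Yates that stops after k swaps (the name partial_fisher_yates and the choice of std::mt19937 are mine, not part of the answer above):
#include <algorithm>
#include <random>
#include <vector>

// Shuffle only the first k positions of v; afterwards v[0..k) is a uniform
// random k-subset of the original elements, in random order.
template <class T>
void partial_fisher_yates(std::vector<T>& v, std::size_t k, std::mt19937& gen) {
    k = std::min(k, v.size());
    for (std::size_t i = 0; i < k; ++i) {
        std::uniform_int_distribution<std::size_t> d(i, v.size() - 1);
        std::swap(v[i], v[d(gen)]);
    }
}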
Here's an approach that doesn't require generating and shuffling a huge list, in case N is huge but k is not:
std::vector<int> pick(int N, int k) {
    std::random_device rd;
    std::mt19937 gen(rd());

    std::unordered_set<int> elems = pickSet(N, k, gen);

    // ok, now we have a set of k elements. but now
    // it's in a [unknown] deterministic order.
    // so we have to shuffle it:

    std::vector<int> result(elems.begin(), elems.end());
    std::shuffle(result.begin(), result.end(), gen);
    return result;
}
Now the naive approach of implementing pickSet is:
std::unordered_set<int> pickSet(int N, int k, std::mt19937& gen)
{
    std::uniform_int_distribution<> dis(1, N);
    std::unordered_set<int> elems;

    while (elems.size() < k) {
        elems.insert(dis(gen));
    }

    return elems;
}
But if k is large relative to N, this algorithm could lead to lots of collisions and could be pretty slow. We can do better by guaranteeing that we can add one element on each insertion (brought to you by Robert Floyd):
std::unordered_set<int> pickSet(int N, int k, std::mt19937& gen)
{
    std::unordered_set<int> elems;
    // note the bounds: r runs from N-k+1 up to N so that N itself can be picked
    for (int r = N - k + 1; r <= N; ++r) {
        int v = std::uniform_int_distribution<>(1, r)(gen);

        // there are two cases.
        // v is not in candidates ==> add it
        // v is in candidates ==> well, r is definitely not, because
        // this is the first iteration in the loop where we could've
        // picked something that big.

        if (!elems.insert(v).second) {
            elems.insert(r);
        }
    }
    return elems;
}
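A quick usage sketch (my own illustration, not part of the answer), assuming pickSet is declared before pick and the usual headers (<iostream>, <random>, <unordered_set>, <vector>, <algorithm>) are included:
int main() {
    // prints 5 distinct values from [1, 100], already in random order
    for (int v : pick(100, 5))
        std::cout << v << ' ';
    std::cout << '\n';
}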
So this is a solution I came up with that generates the samples in random order, rather than in a deterministic order that would then need to be shuffled:
vector<int> GenerateRandomSample(int range, int samples) {
    vector<int> solution;   // Populated in the order that the numbers are generated in.
    vector<int> to_exclude; // Inserted into in sorted order.
    for (int i = 0; i < samples; ++i) {
        auto raw_rand = rand() % (range - to_exclude.size());

        // This part can be optimized as a binary search
        int offset = 0;
        while (offset < to_exclude.size() &&
               (raw_rand + offset) >= to_exclude[offset]) {
            ++offset;
        }
        // Alternatively substitute Binary Search to avoid linearly
        // searching for where to put the new element. Arguably not
        // actually a benefit.
        // int offset = ModifiedBinarySearch(to_exclude, raw_rand);

        int to_insert = (raw_rand + offset);
        to_exclude.insert(to_exclude.begin() + offset, to_insert);
        solution.push_back(to_insert);
    }
    return solution;
}
I added an optional binary search for the location where the newly generated member should be inserted, but after benchmarking it over large ranges (N) and sample sizes (K) (done on codeinterview.io), I initially did not find any significant benefit over linearly traversing and exiting early.
EDIT: After further extensive testing, I've found that with sufficiently large parameters (e.g. N = 1000, K = 500, TRIALS = 10000) the binary search method does in fact offer a considerable improvement. For the given parameters:
with binary search: ~2.7 seconds
with linear search: ~5.1 seconds
deterministic (without shuffle, as proposed by Barry in the accepted answer based on Robert Floyd): ~3.8 seconds
int ModifiedBinarySearch(const vector<int>& collection, int raw_rand) {
    int offset = 0;
    int beg = 0, end = collection.size() - 1;
    bool upper_range = false;
    while (beg <= end) {
        offset = (beg + end) / 2;
        auto to_search_for = (raw_rand + offset);
        auto left = collection[offset];
        auto right = (offset + 1 < collection.size() ?
                      collection[offset + 1] :
                      collection[collection.size() - 1]);
        if (to_search_for < left) {
            upper_range = false;
            end = offset - 1;
        } else if (to_search_for + 1 >= right) {
            upper_range = true;
            beg = offset + 1;
        } else {
            upper_range = true;
            break;
        }
    }
    offset = ((beg + end) / 2) + (upper_range ? 1 : 0);
    return offset;
}
Bob Floyd created a random sampling algorithm that uses sets. The size of the intermediate structure is proportional to the sample size you want to take.
It works by randomly generating K numbers and adding them to a set. If a generated number already exists in the set, it inserts the value of a counter instead, which is guaranteed not to have been seen yet. Thus it is guaranteed to run in linear time and does not require a large intermediate structure, and it still produces a uniformly distributed sample.
This code is basically lifted from Programming Pearls with some modifications to use more modern C++.
#include <random>
#include <unordered_set>
using namespace std;

unordered_set<int> BobFloydAlgo(int sampleSize, int rangeUpperBound)
{
    unordered_set<int> sample;
    default_random_engine generator;

    // d runs over the last sampleSize values of [0, rangeUpperBound)
    for (int d = rangeUpperBound - sampleSize; d < rangeUpperBound; d++)
    {
        int t = uniform_int_distribution<>(0, d)(generator);
        if (sample.find(t) == sample.end())
            sample.insert(t); // t not seen yet: keep it
        else
            sample.insert(d); // collision: d is guaranteed to be new
    }
    return sample;
}
This code has not been tested.
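Since it hasn't been tested, here is a minimal driver I would use to sanity-check it (my own sketch, assuming the headers above plus <iostream> are included):
int main()
{
    // should print 5 distinct values from [0, 20)
    for (int x : BobFloydAlgo(5, 20))
        cout << x << ' ';
    cout << '\n';
}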