I am trying to generate random numbers within a CUDA kernel. I wish to generate the random numbers from a uniform distribution and in integer form, starting …
EDIT: I've edited my answer to fix some of the deficiencies pointed out in the other answers (@tudorturcu) and comments.
Something like this in your device code:
// One flat global index per thread; it selects that thread's generator state and range.
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Precondition: curand has been set up and my_curandstate[idx] already initialised.
// The bounds are looked up by thread index, so every thread may use its own range.
// The width is kept in double (0.999999 is a double literal) and the product is
// rounded back to float, exactly as a compound `*=` would do.
const double width = max_rand_int[idx] - min_rand_int[idx] + 0.999999;
float myrandf = curand_uniform(my_curandstate + idx);  // uniform float in (0, 1]
myrandf = (float)(myrandf * width);                    // stretch to (0, max-min+0.999999]
myrandf += min_rand_int[idx];                          // shift so the range starts at min
// Dropping the fractional part yields an integer in [min, max].
int myrand = (int)truncf(myrandf);
You should:
#include <math.h>
for truncf
Here's a fully worked example:
$ cat t527.cu
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <curand.h>
#include <curand_kernel.h>
// Smallest integer to generate (inclusive); also the value counted in histogram bin 0.
#define MIN 2
// Largest integer to generate (inclusive); the histogram has MAX-MIN+1 bins.
#define MAX 7
// Number of random integers drawn by each thread of generate_kernel.
#define ITER 10000000
// Initialises one cuRAND generator state per launched thread.
//
// Thread t seeds state[t] with the fixed seed 1234, sequence number t and
// offset 0, so runs are reproducible.
// There is no bounds guard: `state` must hold at least one element for every
// thread in the launch.
__global__ void setup_kernel(curandState *state){
  const int tid = blockIdx.x*blockDim.x + threadIdx.x;
  curandState *slot = state + tid;
  curand_init(1234, tid, 0, slot);
}
// Draws n uniformly distributed integers per thread and histograms them.
//
// Parameters:
//   my_curandstate - generator states, one per thread, already initialised
//                    (e.g. by setup_kernel); updated in place.
//   n              - number of values each thread draws.
//   max_rand_int   - per-thread inclusive upper bound.
//   min_rand_int   - per-thread inclusive lower bound.
//   result         - histogram; every draw adds 1 to
//                    result[value - min_rand_int[idx]]. The kernel only
//                    increments, so the caller zeroes it beforehand.
//
// There is no bounds guard: the state and bound arrays need one element for
// every launched thread, and `result` needs max-min+1 bins.
__global__ void generate_kernel(curandState *my_curandstate, const unsigned int n, const unsigned *max_rand_int, const unsigned *min_rand_int, unsigned int *result){
  const int idx = threadIdx.x + blockDim.x*blockIdx.x;

  // Work on a local copy of the generator state and of the bounds instead of
  // re-reading them from global memory on every iteration.
  curandState local_state = my_curandstate[idx];
  const unsigned lo = min_rand_int[idx];
  const unsigned hi = max_rand_int[idx];

  for (unsigned int count = 0; count < n; count++){
    // curand_uniform returns a float in (0, 1]; scaling by (hi-lo+0.999999)
    // instead of (hi-lo+1) keeps the 1.0 case inside the last bin.
    // NOTE(review): the scaled value is stored as float, so this margin only
    // holds for small ranges - the asserts below catch a violation.
    float myrandf = curand_uniform(&local_state);
    myrandf *= (hi - lo + 0.999999);
    myrandf += lo;
    const int myrand = (int)truncf(myrandf);
    assert(myrand <= hi);
    assert(myrand >= lo);
    // Several threads may hit the same bin, so the increment must be atomic.
    atomicAdd(&result[myrand - lo], 1u);
  }

  // Persist the advanced state so a later launch continues the sequence.
  my_curandstate[idx] = local_state;
}
// Prints a diagnostic and terminates the program when a CUDA runtime call failed.
// `what` names the operation that produced `status`.
static void check_cuda(cudaError_t status, const char *what){
  if (status != cudaSuccess){
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Draws ITER random integers in [MIN, MAX] with a single GPU thread and
// prints how often each value occurred.
int main(){
  const size_t result_bytes = (MAX-MIN+1) * sizeof(unsigned);
  // A single thread is launched, so one bound of each kind is enough.
  const unsigned h_max_rand_int = MAX;
  const unsigned h_min_rand_int = MIN;
  unsigned h_result[MAX-MIN+1];

  curandState *d_state = NULL;
  unsigned *d_result = NULL, *d_max_rand_int = NULL, *d_min_rand_int = NULL;
  check_cuda(cudaMalloc(&d_state, sizeof(curandState)), "cudaMalloc d_state");
  check_cuda(cudaMalloc(&d_result, result_bytes), "cudaMalloc d_result");
  check_cuda(cudaMalloc(&d_max_rand_int, sizeof(unsigned)), "cudaMalloc d_max_rand_int");
  check_cuda(cudaMalloc(&d_min_rand_int, sizeof(unsigned)), "cudaMalloc d_min_rand_int");
  // The kernel only increments the bins, so they have to start at zero.
  check_cuda(cudaMemset(d_result, 0, result_bytes), "cudaMemset d_result");

  setup_kernel<<<1,1>>>(d_state);
  // A kernel launch returns nothing; launch errors are fetched explicitly.
  check_cuda(cudaGetLastError(), "setup_kernel launch");

  check_cuda(cudaMemcpy(d_max_rand_int, &h_max_rand_int, sizeof(unsigned), cudaMemcpyHostToDevice), "copy max bound to device");
  check_cuda(cudaMemcpy(d_min_rand_int, &h_min_rand_int, sizeof(unsigned), cudaMemcpyHostToDevice), "copy min bound to device");

  generate_kernel<<<1,1>>>(d_state, ITER, d_max_rand_int, d_min_rand_int, d_result);
  check_cuda(cudaGetLastError(), "generate_kernel launch");
  // Faults inside the kernel (including failed device asserts) surface here.
  check_cuda(cudaDeviceSynchronize(), "generate_kernel execution");

  check_cuda(cudaMemcpy(h_result, d_result, result_bytes, cudaMemcpyDeviceToHost), "copy histogram to host");

  printf("Bin: Count: \n");
  for (int i = MIN; i <= MAX; i++)
    printf("%d %u\n", i, h_result[i-MIN]);  // the counts are unsigned

  check_cuda(cudaFree(d_min_rand_int), "cudaFree d_min_rand_int");
  check_cuda(cudaFree(d_max_rand_int), "cudaFree d_max_rand_int");
  check_cuda(cudaFree(d_result), "cudaFree d_result");
  check_cuda(cudaFree(d_state), "cudaFree d_state");
  return 0;
}
$ nvcc -arch=sm_20 -o t527 t527.cu -lcurand
$ cuda-memcheck ./t527
========= CUDA-MEMCHECK
Bin: Count:
2 1665496
3 1668130
4 1667644
5 1667435
6 1665026
7 1666269
========= ERROR SUMMARY: 0 errors
$
@Robert's example doesn't generate a perfectly uniform distribution (although all the numbers in the range are generated and all the generated numbers are in the range). Both the smallest and the largest value have only half the probability of being chosen compared with the other numbers in the range.
At step 2, you should multiply with the number of values in the range: (largest value - smallest value + 0.999999). *
At step 3, the offset should be (+ smallest value) instead of (+ smallest value + 0.5).
Steps 1 and 4 remain the same.
*As @Kamil Czerski noted, 1.0 is included in the distribution (curand_uniform returns values in the interval (0, 1]). Adding 1.0 instead of 0.999999 would sometimes result in a number outside of the desired range.