How could we generate random numbers in CUDA C with different seed on each run?


Question


I am working on a stochastic process and I wanted to generate a different series of random numbers in a CUDA kernel each time I run the program. This is similar to what we do in C++ by declaring seed = time(NULL) followed by srand(seed) and rand().

I can pass seeds from the host to the device via the kernel, but the problem with doing this is that I would have to pass an entire array of seeds into the kernel for each thread to have a different random seed each time. Is there a way I could generate a random seed, process ID, machine time, or something like that within the kernel and pass it as a seed?


Answer 1:


You don't need to pass an array of random seeds: when you use the cuRAND library, you can give each thread a different sequence number parameter in curand_init while keeping a single shared seed. For example [Disclaimer: the function is untested]:

__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        // One common seed for all threads; the thread index is used as the
        // sequence number, so each thread draws from its own independent
        // sub-sequence of the generator.
        curand_init(seed, i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}

You can also avoid passing the seed from outside if you change the curand_init call to

curand_init(clock64(), i, 0, &state);
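
To get a different seed on each run without relying on clock64(), the host can read the wall clock once and pass it to the kernel, which directly answers the original question. Below is a minimal, untested sketch of a host-side launch for the kernel above; the array size Np and block size nTPB are just example values, and <time.h> is assumed to be included.

unsigned long seed = (unsigned long)time(NULL);   // changes on every run of the program

const int Np   = 131072;
const int nTPB = 256;
float *d_numbers;
cudaMalloc((void**)&d_numbers, Np * sizeof(float));
generate_random_numbers<<<(Np + nTPB - 1) / nTPB, nTPB>>>(d_numbers, seed, Np);
cudaDeviceSynchronize();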

EDIT

Following Roger Dahl's comment, I have compared (on a Kepler K20c) four different approaches to generating arrays of 131072 elements:

  1. Single random number generation: separate kernels for initialization and random number generation;
  2. Single random number generation: a single kernel for both initialization and random number generation;
  3. Multiple random number generation: separate kernels for initialization and random number generation;
  4. Multiple random number generation: a single kernel for both initialization and random number generation.

Below is the code. The timings for generation on the K20c were the following:

  1. 861ms;
  2. 852ms;
  3. 866ms;
  4. 2556ms;

I hope I have correctly understood the performance issue raised by Roger Dahl.

#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>

#define DSIZE 8192*16
#define nTPB 256

/***********************/
/* CUDA ERROR CHECKING */
/***********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
        if (abort) exit(code);
    }
}

/*************************/
/* CURAND INITIALIZATION */
/*************************/
__global__ void initCurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}

__global__ void testrand1(curandState *state, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] = curand_uniform(&state[idx]);
}

__global__ void testrand2(unsigned long seed, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curandState state;
    curand_init(seed, idx, 0, &state);
    a[idx] = curand_uniform(&state);
}

/********/
/* MAIN */
/********/
int main() {

    int n_iter = 20;

    curandState *devState;  gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));

    float *d_a;             gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

    float time;
    cudaEvent_t start, stop;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
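
    // Case 1: separate kernels for initialization and generation,
    // one random number per thread per pass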
    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels:  %3.1f ms \n", time);

    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels:  %3.1f ms \n", time);

    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels with multiple random number generation:  %3.1f ms \n", time);

    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels for multiple random number generation:  %3.1f ms \n", time);

    getchar();

    return 0;
}

Output on GTX660:

Elapsed time for separate kernels:  1960.3 ms
Elapsed time for single kernels:  1536.9 ms
Elapsed time for separate kernels with multiple random number generation:  1576.0 ms
Elapsed time for single kernels for multiple random number generation:  4612.2 ms

Output on GTX570:

Elapsed time for separate kernels:  957.2 ms 
Elapsed time for single kernels:  947.7 ms 
Elapsed time for separate kernels with multiple random number generation:  964.6 ms 
Elapsed time for single kernels for multiple random number generation:  2839.0 ms 

Approximately the same performance as the K20c.




Answer 2:


Using a different seed on each run should be straightforward. The exact method will depend on which generator you are using, but if you're using one of the cuRAND generators then you can cast your time_t from time(NULL) to a 64-bit integer and pass it in to the seed functions.
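
For the host API, that amounts to something like the following untested sketch, where d_a is assumed to be a device array of n floats:

curandGenerator_t gen;
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, (unsigned long long)time(NULL)); // different seed on each run
curandGenerateUniform(gen, d_a, n);   // fill d_a with n uniform floats
curandDestroyGenerator(gen);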

If you're calling the generator from your kernel, you will need to pass this seed in either as a kernel argument or via a __device__ variable. You can then use an offset to curand_init() or use skipahead() to get distinct sub-sequences.
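
A minimal untested sketch of that device-side route, with the per-thread stride of 1024 draws chosen purely for illustration:

__global__ void generate(float *out, unsigned long long seed, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        curandState state;
        curand_init(seed, 0, 0, &state);                 // common seed and sequence for all threads
        skipahead((unsigned long long)i * 1024, &state); // distinct offset per thread
        out[i] = curand_uniform(&state);
    }
}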

If you have a specific generator for which this will not work, please post more information.




Answer 3:


You can create more than one global function for random number initialization and generation, or wrap the kernel launches in a loop so that the generator is re-seeded on each pass. For example (a cleaned-up, untested sketch of the original fragment; init and rndn are the poster's initialization and generation kernels, and N, devState, gpu_no, and cpu_no are assumed to be declared elsewhere):

// allocate device memory for the random numbers once, outside the loop
gpuErrchk(cudaMalloc((void**)&gpu_no, N * sizeof(double)));

for (int rns = 0; rns < 5; rns++) {          // re-seed 'loop' times
    init<<<N, 10>>>(devState, time(0));      // re-initialize the states with a fresh seed
    rndn<<<N, 10>>>(devState, gpu_no);       // generate the random numbers
    gpuErrchk(cudaMemcpy(cpu_no, gpu_no, N * sizeof(double), cudaMemcpyDeviceToHost)); // copy results to the host
}

This does not have any noticeable effect on the random numbers generated, but there is a risk that the batches become correlated and that convergence suffers in the long run. Why would you want to seed more than once per iteration anyway? You can use the library functions to generate different random number distributions, such as curand_uniform, curand_normal, curand_poisson, and so on.
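
For instance, once a thread's curandState has been initialized, switching the distribution is a one-call change (sketch):

float u = curand_uniform(&state);             // uniform in (0, 1]
float g = curand_normal(&state);              // standard normal
unsigned int p = curand_poisson(&state, 4.0); // Poisson with mean 4.0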

I don't know if this answers your question.



Source: https://stackoverflow.com/questions/22425283/how-could-we-generate-random-numbers-in-cuda-c-with-different-seed-on-each-run
