Cuda kernel returning vectors

后端 未结 1 1015
梦如初夏
梦如初夏 2020-11-30 15:49

I have a list of words; my goal is to match each word in a very, very long phrase. I'm having no problem matching each word; my only problem is to return a vector of stru

相关标签:
1条回答
  • 2020-11-30 15:56

    Something like this should work (coded in browser, not tested):

    // Capacity of the device-side result buffer (maximum number of Match records)
    #define N 10000
    
    // One recorded hit; what A, B and C stand for is up to the caller
    typedef struct {
        int A;
        int B;
        int C;
    } Match;
    
    // Fixed-size result storage in device global memory
    __device__ Match dev_data[N];
    // Number of push attempts made so far (my_push_back increments it even when
    // the buffer is full, so it can end up larger than N)
    __device__ int dev_count = 0;
    
    // Appends a copy of *mt to dev_data.
    //
    // Returns the index of the slot that was written, or -1 when the buffer
    // already holds N records (the record is then dropped).
    //
    // A slot is reserved with an atomic increment of dev_count, so concurrent
    // callers never receive the same index. The counter is incremented even
    // when the buffer is full, which lets the host detect overflow by reading
    // dev_count back and comparing it with N.
    __device__ int my_push_back(const Match * mt) {
      const int insert_pt = atomicAdd(&dev_count, 1);
      if (insert_pt >= N) {
        return -1;  // no room left
      }
      dev_data[insert_pt] = *mt;
      return insert_pt;
    }
    
    // Kernel sketch: searches the phrase for the listed words and records every
    // hit through my_push_back(). The search itself is elided ([...]).
    // NOTE(review): _matches is not used by the visible code; results are
    // handed back through dev_data / dev_count instead.
    __global__ void Find(veryLongPhrase * _phrase, Words * _word_list, vector<Match> * _matches)
    {
        int a, b, c;
    
        [...] //Parallel search for each word in the phrase
    
        if(match) //When an occurrence is found
        {
            // Build the record in a local variable. The earlier device-side
            // `new` took memory from the device heap on every hit and never
            // released it; my_push_back() copies the record anyway.
            Match hit;
            hit.A = a;
            hit.B = b;
            hit.C = c;
            my_push_back(&hit);
        }
    }
    
    
    // Host-side sketch: launches Find and copies the recorded matches back
    // into a host vector. Elided parts are marked [...].
    int main()
    {
        [...]
    
        veryLongPhrase * myPhrase = "The quick brown fox jumps over the lazy dog etc etc etc...";
    
        Words * wordList = {"the", "lazy"};
    
        // NOTE(review): both inputs have to be in device-accessible memory by
        // the time the kernel runs; presumably that setup is part of the
        // elided code above - confirm.
        // Find declares a third parameter that its visible body never reads,
        // so a null placeholder is passed for it.
        Find<<< X, Y >>>(myPhrase, wordList, NULL);
    
        // dev_count counts push attempts, so it can be larger than N when the
        // buffer overflowed; only the first N records exist in dev_data.
        int dsize = 0;
        if (cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int)) != cudaSuccess)
            return 1;
        if (dsize > N)
            dsize = N;
    
        vector<Match> results(dsize);
        // Skip the copy when nothing was found: results[0] does not exist then.
        if (dsize > 0 &&
            cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match)) != cudaSuccess)
            return 1;
    
        [...]
    
        return 0;
    }
    

    This will require compute capability 1.1 or better for the atomic operation.

    nvcc -arch=sm_11 ...
    

    Here's a worked example:

    $ cat t347.cu
    #include <iostream>
    #include <vector>
    
    // Capacity of the device-side result buffer (maximum number of Match records)
    #define N 10000
    
    // One recorded hit; what A, B and C stand for is up to the caller
    typedef struct {
        int A;
        int B;
        int C;
    } Match;
    
    // Fixed-size result storage in device global memory
    __device__ Match dev_data[N];
    // Number of push attempts made so far (my_push_back increments it even when
    // the buffer is full, so it can end up larger than N)
    __device__ int dev_count = 0;
    
    // Appends a copy of mt to dev_data.
    //
    // Returns the index of the slot that was written, or -1 when the buffer
    // already holds N records (the record is then dropped).
    //
    // A slot is reserved with an atomic increment of dev_count, so concurrent
    // callers never receive the same index. The counter is incremented even
    // when the buffer is full, which lets the host detect overflow by reading
    // dev_count back and comparing it with N.
    //
    // mt is taken by const reference because it is only read; this also
    // accepts const objects and temporaries.
    __device__ int my_push_back(const Match & mt) {
      const int insert_pt = atomicAdd(&dev_count, 1);
      if (insert_pt >= N) {
        return -1;  // no room left
      }
      dev_data[insert_pt] = mt;
      return insert_pt;
    }
    
    // Demo kernel: threads whose threadIdx.x is below 10 each record one fixed
    // Match, standing in for a real search hit.
    __global__ void Find()
    {
        // Every other thread has nothing to report
        if (threadIdx.x >= 10)
            return;
    
        Match hit = { .A = 1, .B = 2, .C = 3 };
        my_push_back(hit);
    }
    
    
    // Launches the demo kernel, copies the recorded matches back to the host
    // and prints how many there are together with the last one.
    // Returns 0 on success, 1 on a CUDA error or when the buffer overflowed.
    int main()
    {
    
        Find<<< 2, 256 >>>();
        // A launch does not return a status itself; query it explicitly.
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
            return 1;
        }
    
        int dsize = 0;
        err = cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int));
        if (err != cudaSuccess) {
            std::cerr << "reading dev_count failed: " << cudaGetErrorString(err) << std::endl;
            return 1;
        }
        // dev_count counts push attempts: exactly N means the buffer is full
        // but nothing was dropped, so only a value above N is an overflow.
        if (dsize > N) {
            std::cerr << "overflow error" << std::endl;
            return 1;
        }
    
        std::vector<Match> results(dsize);
        // With zero matches there is no results[0] to copy into.
        if (dsize > 0) {
            err = cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match));
            if (err != cudaSuccess) {
                std::cerr << "reading dev_data failed: " << cudaGetErrorString(err) << std::endl;
                return 1;
            }
        }
    
        std::cout << "number of matches = " << dsize << std::endl;
        if (dsize > 0) {
            const Match & last = results[dsize-1];
            std::cout << "A  =  " << last.A << std:: endl;
            std::cout << "B  =  " << last.B << std:: endl;
            std::cout << "C  =  " << last.C << std:: endl;
        }
    
        return 0;
    }
    $ nvcc -arch=sm_11 -o t347 t347.cu
    $ ./t347
    number of matches = 20
    A  =  1
    B  =  2
    C  =  3
    $
    

    Note that in this case my Match result struct creation is different, and I am passing by reference, but the concept is the same.

    0 讨论(0)
提交回复
热议问题