I'm working on the problem of summing the rows of a matrix in CUDA. Here is an example to illustrate.
Suppose we have the following 20 × 4 array:

1 2 3 4
4 1 2 3
3 4 1 2
.
1 2 3 4
.
.
.
2 1 3 4
After flattening the 2D array to a 1D array (in either row-major or column-major order), I need to assign each thread to a different row and calculate the cost of that row.
For example:
- thread 1 should calculate the cost for 1 2 3 4
- thread 2 should calculate the cost for 4 1 2 3

How can I do that in CUDA?
Thank you all in advance for any replies.
#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256

// one thread per row: each thread sums the elements of its row
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx < rows){
    int mycost = 0;
    for (int i = 0; i < cols; i++)
      mycost += costdata[(tidx*cols)+i];
    results[tidx] = mycost;
    }
  }

int main(){
  // define and initialize host and device storage for cost and results
  int *d_costdata, *h_costdata, *d_results, *h_results;
  h_results = (int *)malloc(MROWS*sizeof(int));
  h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = rand()%4;
  cudaMalloc((void **)&d_results, MROWS*sizeof(int));
  cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int));
  // copy cost data from host to device
  cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice);
  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
  // copy results back from device to host
  cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost);
  // verify each device result against an independent host computation
  for (int i=0; i<MROWS; i++){
    int loc_cost = 0;
    for (int j=0; j<NCOLS; j++)
      loc_cost += h_costdata[(i*NCOLS)+j];
    printf("cost[%d]: host = %d, device = %d\n", i, loc_cost, h_results[i]);
    }
  return 0;
  }
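Assuming you save this in a file such as rowsum.cu (the filename is just an example), it can be compiled and run with:

nvcc rowsum.cu -o rowsum
./rowsum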
This assumes the "cost" of each row is just the sum of its elements. If you have a different "cost" function, you can modify the computation in the kernel for-loop accordingly (a sketch follows below). This also assumes C-style row-major data storage (1 2 3 4 4 1 2 3 3 4 1 2, etc.)
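For instance, if your cost were instead the sum of squares of the row elements (a purely illustrative choice, not anything from the question), only the loop body changes:

    int mycost = 0;
    for (int i = 0; i < cols; i++){
      int val = costdata[(tidx*cols)+i];  // same row-major indexing as above
      mycost += val*val;}                 // hypothetical alternative cost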
If you instead use column-major storage (1 4 3, etc.), you can slightly improve performance, since the data reads can be fully coalesced. The kernel for-loop would then look like this:
for (int i = 0; i < cols; i++) mycost += costdata[(i*rows)+tidx];
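Note that this indexing is only correct if the host data is actually laid out in column-major order before the host-to-device copy. A minimal sketch of that flattening step, assuming a hypothetical 2D source array h_2d (not part of the code above):

    int h_2d[MROWS][NCOLS];  // hypothetical 2D source array
    // flatten in column-major order: element (i,j) goes to index j*MROWS + i
    for (int j = 0; j < NCOLS; j++)
      for (int i = 0; i < MROWS; i++)
        h_costdata[(j*MROWS)+i] = h_2d[i][j];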
You should also use proper CUDA error checking on all CUDA API calls and kernel launches.
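For example, a macro along these lines is a common pattern (the name cudaCheckErrors is my own convention here, not part of the CUDA API):

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

You would then follow each API call or kernel launch with something like cudaCheckErrors("cudaMemcpy fail"); for kernel launches, adding a cudaDeviceSynchronize() before the check also catches asynchronous execution errors.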
EDIT: As discussed in the comments below, for the row-major storage case, in some situations you can improve memory efficiency by electing to load 16-byte quantities rather than the base type. The following modified version implements this idea for arbitrary dimensions and (more or less) arbitrary base types:
#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>
#define MROWS 1742
#define NCOLS 801
#define nTPB 256

typedef double mytype;

__host__ int sizetype(){
  int size = 0;
  if ((typeid(mytype) == typeid(float)) || (typeid(mytype) == typeid(int)) || (typeid(mytype) == typeid(unsigned int)))
    size = 4;
  else if (typeid(mytype) == typeid(double))
    size = 8;
  else if ((typeid(mytype) == typeid(unsigned char)) || (typeid(mytype) == typeid(char)))
    size = 1;
  return size;
  }

template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
  int chunk = 16/size;  // assumes size is a factor of 16
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx < rows){
    T *myrowptr = (T *)(((unsigned char *)costdata) + tidx*pitch);
    T mycost = (T)0;
    int count = 0;
    while (count < cols){
      if ((cols-count) >= chunk){
        // read 16 bytes at a time and accumulate the packed elements
        int4 temp = *((int4 *)(myrowptr + count));
        int bcount = 16;
        int j = 0;
        while (bcount > 0){
          mycost += *(((T *)(&temp)) + j++);
          bcount -= size;
          count++;}
        }
      else {
        // read one quantity at a time for the remainder of the row
        for (; count < cols; count++)
          mycost += myrowptr[count];
        }
      }
    results[tidx] = mycost;  // write the result once per row
    }
  }

int main(){
  int typesize = sizetype();
  if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
  // define and initialize host and device storage for cost and results
  mytype *d_costdata, *h_costdata, *d_results, *h_results;
  h_results = (mytype *)malloc(MROWS*sizeof(mytype));
  h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = (mytype)(rand()%4);
  size_t pitch = 0;
  cudaMalloc((void **)&d_results, MROWS*sizeof(mytype));
  cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS);
  // copy cost data from host to device
  cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype), MROWS, cudaMemcpyHostToDevice);
  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
  // copy results back from device to host
  cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost);
  // verify every device result against an independent host computation
  for (int i=0; i<MROWS; i++){
    mytype loc_cost = (mytype)0;
    for (int j=0; j<NCOLS; j++)
      loc_cost += h_costdata[(i*NCOLS)+j];
    if ((i < 10) && (typesize > 1))
      std::cout << "cost[" << i << "]: host = " << loc_cost << ", device = " << h_results[i] << std::endl;
    if (loc_cost != h_results[i]){
      std::cout << "mismatch at index " << i << " should be: " << loc_cost << " was: " << h_results[i] << std::endl;
      return 1;
      }
    }
  std::cout << "Results are correct!" << std::endl;
  return 0;
  }
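A note on the design: the 16-byte (int4) loads in the kernel require each row's starting address to be 16-byte aligned, which is one reason this version uses cudaMallocPitch/cudaMemcpy2D. The pitched allocation pads each row so that row start addresses continue to meet the device's alignment requirements, which in practice satisfies the alignment the int4 loads need. The chunk arithmetic also assumes the element size evenly divides 16, which holds for the 1-, 4-, and 8-byte types handled by sizetype().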