I'm working on the problem of summing the rows of a matrix in CUDA. Here is an example to illustrate.
Suppose we have the following 20 × 4 array:

1 2 3 4
4 1 2 3
3 4 1 2
.
1 2 3 4
.
.
.
2 1 3 4
After flattening the 2D array to a 1D array (in either row-major or column-major order), I need to assign each thread to a different row and calculate the cost of that row.
For example:
- thread 1 should calculate the cost for 1 2 3 4
- thread 2 should calculate the cost for 4 1 2 3

How can I do that in CUDA?
Thank you all in advance for any replies.
#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256

// one thread per row: each thread sums the elements of its row
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx < rows){
    int mycost = 0;
    for (int i = 0; i < cols; i++)
      mycost += costdata[(tidx*cols)+i];
    results[tidx] = mycost;
    }
  }

int main(){
  // define and initialize host and device storage for cost and results
  int *d_costdata, *h_costdata, *d_results, *h_results;
  h_results = (int *)malloc(MROWS*sizeof(int));
  h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = rand()%4;
  cudaMalloc((void **)&d_results, MROWS*sizeof(int));
  cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int));
  // copy cost data from host to device
  cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice);
  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
  // copy results back from device to host
  cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost);
  // verify each device result against an independent host computation
  for (int i=0; i<MROWS; i++){
    int loc_cost = 0;
    for (int j=0; j<NCOLS; j++)
      loc_cost += h_costdata[(i*NCOLS)+j];
    printf("cost[%d]: host = %d, device = %d\n", i, loc_cost, h_results[i]);
    }
  return 0;
  }
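Assuming you save this in a file such as rowsum.cu (the filename is just an example), it can be compiled and run with:

nvcc rowsum.cu -o rowsum
./rowsum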
This assumes the "cost" of each row is just the sum of its elements. If you have a different "cost" function, you can modify the computation in the kernel for-loop accordingly (a sketch follows below). This also assumes C-style row-major data storage (1 2 3 4 4 1 2 3 3 4 1 2, etc.)
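For instance, if your cost were instead the sum of squares of the row elements (a purely illustrative choice, not anything from the question), only the loop body changes:

    int mycost = 0;
    for (int i = 0; i < cols; i++){
      int val = costdata[(tidx*cols)+i];  // same row-major indexing as above
      mycost += val*val;}                 // hypothetical alternative cost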
If you instead use column-major storage (1 4 3, etc.), you can slightly improve performance, since the data reads can be fully coalesced. The kernel for-loop would then look like this:
for (int i = 0; i < cols; i++) mycost += costdata[(i*rows)+tidx];
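Note that this indexing is only correct if the host data is actually laid out in column-major order before the host-to-device copy. A minimal sketch of that flattening step, assuming a hypothetical 2D source array h_2d (not part of the code above):

    int h_2d[MROWS][NCOLS];  // hypothetical 2D source array
    // flatten in column-major order: element (i,j) goes to index j*MROWS + i
    for (int j = 0; j < NCOLS; j++)
      for (int i = 0; i < MROWS; i++)
        h_costdata[(j*MROWS)+i] = h_2d[i][j];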
You should also use proper CUDA error checking on all CUDA API calls and kernel launches.
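For example, a macro along these lines is a common pattern (the name cudaCheckErrors is my own convention here, not part of the CUDA API):

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

You would then follow each API call or kernel launch with something like cudaCheckErrors("cudaMemcpy fail"); for kernel launches, adding a cudaDeviceSynchronize() before the check also catches asynchronous execution errors.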
EDIT: As discussed in the comments below, for the row-major storage case, in some situations you can improve memory efficiency by electing to load 16-byte quantities rather than the base type. The following modified version implements this idea for arbitrary dimensions and (more or less) arbitrary base types:
#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>
#define MROWS 1742
#define NCOLS 801
#define nTPB 256

typedef double mytype;

__host__ int sizetype(){
  int size = 0;
  if ((typeid(mytype) == typeid(float)) || (typeid(mytype) == typeid(int)) || (typeid(mytype) == typeid(unsigned int)))
    size = 4;
  else if (typeid(mytype) == typeid(double))
    size = 8;
  else if ((typeid(mytype) == typeid(unsigned char)) || (typeid(mytype) == typeid(char)))
    size = 1;
  return size;
  }

template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
  int chunk = 16/size;  // assumes size is a factor of 16
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx < rows){
    T *myrowptr = (T *)(((unsigned char *)costdata) + tidx*pitch);
    T mycost = (T)0;
    int count = 0;
    while (count < cols){
      if ((cols-count) >= chunk){
        // read 16 bytes at a time and accumulate the packed elements
        int4 temp = *((int4 *)(myrowptr + count));
        int bcount = 16;
        int j = 0;
        while (bcount > 0){
          mycost += *(((T *)(&temp)) + j++);
          bcount -= size;
          count++;}
        }
      else {
        // read one quantity at a time for the remainder of the row
        for (; count < cols; count++)
          mycost += myrowptr[count];
        }
      }
    results[tidx] = mycost;  // write the result once per row
    }
  }

int main(){
  int typesize = sizetype();
  if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
  // define and initialize host and device storage for cost and results
  mytype *d_costdata, *h_costdata, *d_results, *h_results;
  h_results = (mytype *)malloc(MROWS*sizeof(mytype));
  h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = (mytype)(rand()%4);
  size_t pitch = 0;
  cudaMalloc((void **)&d_results, MROWS*sizeof(mytype));
  cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS);
  // copy cost data from host to device
  cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype), MROWS, cudaMemcpyHostToDevice);
  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
  // copy results back from device to host
  cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost);
  // verify every device result against an independent host computation
  for (int i=0; i<MROWS; i++){
    mytype loc_cost = (mytype)0;
    for (int j=0; j<NCOLS; j++)
      loc_cost += h_costdata[(i*NCOLS)+j];
    if ((i < 10) && (typesize > 1))
      std::cout << "cost[" << i << "]: host = " << loc_cost << ", device = " << h_results[i] << std::endl;
    if (loc_cost != h_results[i]){
      std::cout << "mismatch at index " << i << " should be: " << loc_cost << " was: " << h_results[i] << std::endl;
      return 1;
      }
    }
  std::cout << "Results are correct!" << std::endl;
  return 0;
  }
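A note on the design: the 16-byte (int4) loads in the kernel require each row's starting address to be 16-byte aligned, which is one reason this version uses cudaMallocPitch/cudaMemcpy2D. The pitched allocation pads each row so that row start addresses continue to meet the device's alignment requirements, which in practice satisfies the alignment the int4 loads need. The chunk arithmetic also assumes the element size evenly divides 16, which holds for the 1-, 4-, and 8-byte types handled by sizetype().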