2D Texture from 2D array CUDA

匿名 (未验证) 提交于 2019-12-03 02:29:01

问题:

I am trying to pass an Nx3 array to a kernel, read from it through texture memory, and write the values to a second array. Here is my simplified code with N=8:

#include <cstdio>
#include "handle.h"
using namespace std;

// 2D float texture reference; kernel() samples it with tex2D().
texture<float,2> tex_w;

// One thread per output element: thread (x, y) stores the texel fetched at
// texture coordinate (x, y) into f[x][y].
// The parameter w is accepted but never read; all input comes from tex_w.
__global__ void kernel(int imax, float(*w)[3], float (*f)[3])
{
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  // Threads whose x index falls outside the data do nothing.
  if (tx >= imax)
    return;

  f[tx][ty] = tex2D(tex_w, tx, ty);
}

// Prints imax rows of the Nx3 array w to stdout, one row per line,
// prefixed by the row index.
void print_to_stdio(int imax, float (*w)[3])
{
  int row = 0;
  while (row < imax)
    {
      float *r = w[row];
      printf("%2d  %3.6f\t  %3.6f\t %3.6f\n",row, r[0], r[1], r[2]);
      ++row;
    }
}

// Fills an 8x3 host array with w[i][j] = i + 0.01*j, uploads it, binds it to
// tex_w, runs kernel() in a single block of imax x 3 threads and prints the
// array that the kernel wrote.
int main(void)
{
  int imax = 8;
  const size_t nbytes = 3*imax*sizeof(float);

  float (*host)[3] = (float (*)[3])malloc(nbytes);
  float (*d_w)[3];
  float (*d_f)[3];
  dim3 threads(imax,3);

  // Walk the array in flat order; k/3 is the row, k%3 the column.
  for (int k = 0; k < 3*imax; k++)
    {
      const int i = k / 3;
      const int j = k % 3;
      host[i][j] = i + 0.01f*j;
    }

  cudaMalloc( (void**) &d_w, nbytes );
  cudaMalloc( (void**) &d_f, nbytes );

  // The texture is bound as width = imax, height = 3 with a pitch of imax
  // floats, while the host data is laid out as imax rows of 3 floats.
  // Together with the (x = i, y = j) lookup in kernel(), f[i][j] appears to
  // receive the element at flat offset j*imax + i of d_w.
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
  HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, desc, imax, 3, sizeof(float)*imax ) );

  cudaMemcpy(d_w, host, nbytes, cudaMemcpyHostToDevice);

  // just use threads for simplicity
  kernel<<<1,threads>>>(imax, d_w, d_f);

  // Reuse the host buffer for the result.
  cudaMemcpy(host, d_f, nbytes, cudaMemcpyDeviceToHost);

  cudaUnbindTexture(tex_w);
  cudaFree(d_w);
  cudaFree(d_f);

  print_to_stdio(imax, host);

  free(host);
  return 0;
}

Running this code I would expect to get:

0  0.000000   0.010000   0.020000 1  1.000000   1.010000   1.020000 2  2.000000   2.010000   2.020000 3  3.000000   3.010000   3.020000 4  4.000000   4.010000   4.020000 5  5.000000   5.010000   5.020000 6  6.000000   6.010000   6.020000 7  7.000000   7.010000   7.020000 

but instead I get:

0  0.000000   2.020000   5.010000 1  0.010000   3.000000   5.020000 2  0.020000   3.010000   6.000000 3  1.000000   3.020000   6.010000 4  1.010000   4.000000   6.020000 5  1.020000   4.010000   7.000000 6  2.000000   4.020000   7.010000 7  2.010000   5.000000   7.020000 

I think this has something to do with the pitch parameter I give to cudaBindTexture2D but using smaller values gives an invalid argument error.

Thanks in advance!

回答1:

After brano's response and looking more into how pitch works, I'll answer my own question. Here is the modified code:

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include "handle.cu"

using namespace std;

// 2D float texture reference sampled by kernel().
// NOTE: texture references (texture<> / cudaBindTexture2D) are a legacy API
// that was removed in CUDA 12; on newer toolkits use texture objects instead.
texture<float,2,cudaReadModeElementType> tex_w;

// Copies the texture bound to tex_w into the imax x 3 array f.
//
// Launch layout: a single block of (imax, 3) threads; threadIdx.x selects the
// row i and threadIdx.y the column j of f.
//
//   imax  number of rows in f (and texture height)
//   f     device output array of imax rows of 3 floats
__global__ void kernel(int imax, float (*f)[3])
{
  int i = threadIdx.x;
  int j = threadIdx.y;

  // width = 3, height = imax
  // but we have imax threads in x, 3 in y
  // therefore height corresponds to x threads (i)
  // and width corresponds to y threads (j)
  if (i < imax && j < 3)
    {
      // linear filtering looks between indices, so sample at the texel
      // centre (+0.5) to get the stored value without interpolation
      f[i][j] = tex2D(tex_w, j+0.5f, i+0.5f);
    }
}

// Prints imax rows of the Nx3 array w to stdout, followed by a blank line.
void print_to_stdio(int imax, float (*w)[3])
{
  for (int i=0; i<imax; i++)
    {
      printf("%2d  %3.3f  %3.3f  %3.3f\n",i, w[i][0], w[i][1], w[i][2]);
    }
  printf("\n");
}

// Builds an 8x3 host array, uploads it into pitched device memory, binds it
// to tex_w, reads it back through the texture in kernel() and prints the
// array before and after the round trip.
int main(void)
{
  int imax = 8;
  float (*w)[3];
  float (*d_f)[3], *d_w;
  dim3 grid(imax,3);

  w = (float (*)[3])malloc(imax*3*sizeof(float));
  if (w == NULL)
    {
      fprintf(stderr, "host allocation of %d x 3 floats failed\n", imax);
      return 1;
    }

  for(int i=0; i<imax; i++)
    {
      for(int j=0; j<3; j++)
        {
          w[i][j] = i + 0.01f*j;
        }
    }

  print_to_stdio(imax, w);

  // Let the runtime choose a row pitch that satisfies the texture alignment
  // requirements: rows are 3 floats wide, and there are imax of them.
  size_t pitch;
  HANDLE_ERROR( cudaMallocPitch((void**)&d_w, &pitch, 3*sizeof(float), imax) );

  HANDLE_ERROR( cudaMemcpy2D(d_w,             // device destination
                             pitch,           // device pitch (calculated above)
                             w,               // src on host
                             3*sizeof(float), // pitch on src (no padding so just width of row)
                             3*sizeof(float), // width of data in bytes
                             imax,            // height of data
                             cudaMemcpyHostToDevice) );

  // Configure the sampling state before binding so the bind sees the
  // intended settings.
  tex_w.normalized = false;  // don't use normalized values
  tex_w.filterMode = cudaFilterModeLinear;
  tex_w.addressMode[0] = cudaAddressModeClamp; // don't wrap around indices
  tex_w.addressMode[1] = cudaAddressModeClamp;

  HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, tex_w.channelDesc, 3, imax, pitch) );

  // d_f will have result array
  HANDLE_ERROR( cudaMalloc( (void**)&d_f, 3*imax*sizeof(float) ) );

  // just use threads for simplicity
  kernel<<<1,grid>>>(imax, d_f);
  HANDLE_ERROR( cudaGetLastError() );  // report launch failures

  // Blocking copy: also waits for the kernel to finish.
  HANDLE_ERROR( cudaMemcpy(w, d_f, 3*imax*sizeof(float), cudaMemcpyDeviceToHost) );

  HANDLE_ERROR( cudaUnbindTexture(tex_w) );
  HANDLE_ERROR( cudaFree(d_w) );
  HANDLE_ERROR( cudaFree(d_f) );

  print_to_stdio(imax, w);

  free(w);
  return 0;
}

Instead of using cudaMemcpy() and having to deal with the pitch on the host side, I use cudaMemcpy2D(), which accepts a pitch argument for both the device data and the host data. Since the host data is a plain allocation with no padding, my understanding is that its pitch is simply the row width, i.e. 3*sizeof(float).



回答2:

I could give you a complete solution, but then you might not learn :D So here are some tips instead, and maybe you can fix the rest on your own.

Tip 1.
cudaBindTexture2D takes an offset and a pitch. Both parameters have hardware-dependent alignment restrictions. The offset is guaranteed to be 0 if you use cudaMalloc(..). The pitch is obtained by allocating with cudaMallocPitch(..). You also need to make sure that your host memory is pitched the same way; otherwise your memcpy will not work as expected.

Tip 2.
Understand indexing in 2D. When accessing elements in W[i][j] you need to know that element W[i][j+1] is the next element in memory and NOT W[i+1][j].

Tip 3.
Use 1D arrays and calculate the 2D index yourself. This will give you better control.



标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!