可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效，请关闭广告屏蔽插件后再试):

问题:

I am trying to pass an Nx3 array to a kernel, read from it through texture memory, and write the values to a second array. Here is my simplified code with N=8:

#include <cstdio>
#include "handle.h"   // presumably provides the HANDLE_ERROR macro used in main() -- not visible here
using namespace std;

// File-scope 2D texture reference of floats. It is bound to d_w in main()
// and sampled by kernel().
texture<float,2> tex_w;

// Copies texture samples into f.
//   imax : upper bound on the x thread index allowed to write
//   w    : not referenced in the body (the data is read through tex_w instead)
//   f    : output array of rows of 3 floats
// Each thread takes (i, j) = (threadIdx.x, threadIdx.y) and stores
// tex2D(tex_w, i, j) into f[i][j], i.e. i is passed as the texture x
// coordinate and j as the texture y coordinate.
__global__ void kernel(int imax, float(*w)[3], float (*f)[3])
{
  int i = threadIdx.x;
  int j = threadIdx.y;

  if(i<imax)
      f[i][j] = tex2D(tex_w, i, j);
}

// Prints imax rows of w to stdout: the row index followed by the three
// values of that row.
void print_to_stdio(int imax, float (*w)[3])
{
  for (int i=0; i<imax; i++)
    {
      printf("%2d  %3.6f\t  %3.6f\t %3.6f\n",i, w[i][0], w[i][1], w[i][2]);
    }
}

// Fills a host array of imax rows of 3 floats, uploads it, binds it to tex_w,
// runs kernel() in a single block of imax x 3 threads, copies the result back
// over w and prints it.
int main(void)
{
  int imax = 8;
  float (*w)[3];
  float (*d_w)[3], (*d_f)[3];
  dim3 grid(imax,3);   // used as the block dimensions in the launch below

  w = (float (*)[3])malloc(imax*3*sizeof(float));

  // w[i][j] = i + 0.01*j, stored as imax consecutive rows of 3 floats.
  for(int i=0; i<imax; i++)
    {
      for(int j=0; j<3; j++)
        {
          w[i][j] = i + 0.01f*j;
        }
    }

  // Plain linear device allocations; return codes are not checked here.
  cudaMalloc( (void**) &d_w, 3*imax*sizeof(float) );
  cudaMalloc( (void**) &d_f, 3*imax*sizeof(float) );

  // Binds d_w with width = imax, height = 3 and a row pitch of imax floats,
  // whereas the host data above is laid out as imax rows of 3 floats.
  // NOTE(review): under this binding texel (x, y) maps to linear element
  // imax*y + x, which is consistent with the shuffled output reported for
  // this program -- confirm against the width/height/pitch requirements of
  // cudaBindTexture2D.
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
  HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, desc, imax, 3, sizeof(float)*imax ) );

  // Tightly packed copy of all 3*imax floats into the buffer bound above.
  cudaMemcpy(d_w, w, 3*imax*sizeof(float), cudaMemcpyHostToDevice);

  // just use threads for simplicity
  kernel<<<1,grid>>>(imax, d_w, d_f);

  // The host array w is overwritten with the kernel output.
  cudaMemcpy(w, d_f, 3*imax*sizeof(float), cudaMemcpyDeviceToHost);

  cudaUnbindTexture(tex_w);
  cudaFree(d_w);
  cudaFree(d_f);

  print_to_stdio(imax, w);

  free(w);
  return 0;
}

Running this code I would expect to get:

0  0.000000   0.010000   0.020000 1  1.000000   1.010000   1.020000 2  2.000000   2.010000   2.020000 3  3.000000   3.010000   3.020000 4  4.000000   4.010000   4.020000 5  5.000000   5.010000   5.020000 6  6.000000   6.010000   6.020000 7  7.000000   7.010000   7.020000

but instead I get:

0  0.000000   2.020000   5.010000 1  0.010000   3.000000   5.020000 2  0.020000   3.010000   6.000000 3  1.000000   3.020000   6.010000 4  1.010000   4.000000   6.020000 5  1.020000   4.010000   7.000000 6  2.000000   4.020000   7.010000 7  2.010000   5.000000   7.020000

I think this has something to do with the pitch parameter I give to cudaBindTexture2D but using smaller values gives an invalid argument error.

Thanks in advance!

回答1:

After brano's response and looking more into how pitch works, I'll answer my own question. Here is the modified code:

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include "handle.cu"   // expected to provide the HANDLE_ERROR macro (project-local)

using namespace std;

// File-scope 2D texture reference of floats, read as raw element values.
// It is bound in main() to a pitched device buffer with width 3 and height imax.
// Note: texture references are a legacy API (deprecated in recent CUDA
// toolkits); they are kept here because this example is about
// cudaBindTexture2D specifically.
texture<float,2,cudaReadModeElementType> tex_w;

// Copies the bound texture into f, one element per thread.
//   imax : number of rows (texture height); threads with threadIdx.x >= imax do nothing
//   f    : device output array of imax rows of 3 floats
// Expected launch: a single block of (imax, 3) threads.
// The texture has width = 3 and height = imax, but the block has imax threads
// in x and 3 in y. Therefore the x thread index (i) selects the texture row
// (y coordinate) and the y thread index (j) selects the column (x coordinate).
__global__ void kernel(int imax, float (*f)[3])
{
  int i = threadIdx.x;
  int j = threadIdx.y;

  // Guard both indices so a larger block cannot write outside f.
  if (i < imax && j < 3)
    {
      // Linear filtering interpolates between texel centres, which sit at
      // +0.5 offsets; sampling exactly at a centre returns that texel's value.
      f[i][j] = tex2D(tex_w, j+0.5f, i+0.5f);
    }
}

// Prints imax rows of w to stdout (row index followed by the three values),
// then a blank line.
void print_to_stdio(int imax, float (*w)[3])
{
  for (int i=0; i<imax; i++)
    {
      printf("%2d  %3.3f  %3.3f  %3.3f\n",i, w[i][0], w[i][1], w[i][2]);
    }
  printf("\n");
}

// Builds an imax x 3 host array, uploads it to pitched device memory, binds
// that memory to tex_w, copies it back out through the texture with kernel(),
// and prints the array before and after the round trip.
// Returns 0 on success, 1 if the host allocation fails. CUDA failures are
// reported through HANDLE_ERROR.
int main(void)
{
  int imax = 8;
  float (*w)[3];
  float (*d_f)[3], *d_w;
  dim3 grid(imax,3);   // block dimensions: imax threads in x, 3 in y

  w = (float (*)[3])malloc(imax*3*sizeof(float));
  if (w == NULL)
    {
      fprintf(stderr, "host allocation failed\n");
      return 1;
    }

  // w[i][j] = i + 0.01*j, stored as imax tightly packed rows of 3 floats.
  for(int i=0; i<imax; i++)
    {
      for(int j=0; j<3; j++)
        {
          w[i][j] = i + 0.01f*j;
        }
    }

  print_to_stdio(imax, w);

  // Let the runtime choose a row pitch that satisfies the texture alignment
  // requirements: rows are 3 floats wide, and there are imax rows.
  size_t pitch;
  HANDLE_ERROR( cudaMallocPitch((void**)&d_w, &pitch, 3*sizeof(float), imax) );

  HANDLE_ERROR( cudaMemcpy2D(d_w,             // device destination
                             pitch,           // device pitch (calculated above)
                             w,               // src on host
                             3*sizeof(float), // pitch on src (no padding so just width of row)
                             3*sizeof(float), // width of data in bytes
                             imax,            // height of data
                             cudaMemcpyHostToDevice) );

  // Configure the texture reference BEFORE binding: the bind call is what
  // hands these attributes to the runtime, so values assigned afterwards
  // would not apply to this binding.
  tex_w.normalized = false;  // don't use normalized values
  tex_w.filterMode = cudaFilterModeLinear;
  tex_w.addressMode[0] = cudaAddressModeClamp; // don't wrap around indices
  tex_w.addressMode[1] = cudaAddressModeClamp;

  // width = 3 elements, height = imax rows, pitch in bytes from cudaMallocPitch.
  HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, tex_w.channelDesc, 3, imax, pitch) );

  // d_f will have result array
  HANDLE_ERROR( cudaMalloc( (void**)&d_f, 3*imax*sizeof(float) ) );

  // just use threads for simplicity
  kernel<<<1,grid>>>(imax, d_f);
  // A kernel launch returns no status itself; pick up launch errors here.
  HANDLE_ERROR( cudaGetLastError() );

  // Blocking copy: also waits for the kernel and surfaces execution errors.
  HANDLE_ERROR( cudaMemcpy(w, d_f, 3*imax*sizeof(float), cudaMemcpyDeviceToHost) );

  HANDLE_ERROR( cudaUnbindTexture(tex_w) );
  HANDLE_ERROR( cudaFree(d_w) );
  HANDLE_ERROR( cudaFree(d_f) );

  print_to_stdio(imax, w);

  free(w);
  return 0;
}

Instead of using cudaMemcpy() and having to deal with the pitch on the host side, I use cudaMemcpy2D(), which accepts a pitch argument for both the device data and the host data. Since the host data is a plain, tightly packed allocation, my understanding is that its pitch is simply the row width in bytes, i.e. 3*sizeof(float).

回答2:

I could give you a complete solution, but then you might not learn :D So here are some tips instead, and maybe you can fix the rest on your own.

Tip 1.
When using cudaBindTexture2D it requests an offset and pitch. Both parameters have certain hardware dependent alignment restrictions. The offset is guaranteed to be 0 if you use cudaMalloc(..). The pitch is retrieved by using cudaMallocPitch(..). You also need to make sure that your host memory is pitched the same way otherwise your memcpy will not work as expected.

Tip 2.
Understand indexing in 2D. When accessing elements in W[i][j] you need to know that element W[i][j+1] is the next element in memory and NOT W[i+1][j].

Tip 3.
Use 1D arrays and calculate the 2D index yourself. This will give you better control.

文章来源: 2D Texture from 2D array CUDA

标签

imax

sizeof

cuda