Cuda Create 3d texture and cudaArray(3d) from device memory

倖福魔咒の 提交于 2019-11-30 10:36:05

Here's a worked example showing the creation of an array of texture objects, roughly following the path of the code you provided. You can see, by comparing to the texture reference code I placed here, that the first set of texture reads from the first texture object (i.e. the first kernel call) are the same numerical values as the set of reads from the texture reference example (you may need to adjust the grid size of the two example codes to match).

Texture object usage requires compute capability 3.0 or higher.

example:

$ cat t507.cu
#include <helper_cuda.h>
#include <curand.h>
#define NUM_TEX 4

const int SizeNoiseTest = 32;
const int cubeSizeNoiseTest = SizeNoiseTest*SizeNoiseTest*SizeNoiseTest;
static cudaTextureObject_t texNoise[NUM_TEX];

__global__ void AccesTexture(cudaTextureObject_t my_tex)
{
        float test = tex3D<float>(my_tex,(float)threadIdx.x,(float)threadIdx.y,(float)threadIdx.z);//by using this the error occurs
        printf("thread: %d,%d,%d, value: %f\n", threadIdx.x, threadIdx.y, threadIdx.z, test);
}

void CreateTexture()
{

    float *d_NoiseTest;//Device Array with random floats
    cudaMalloc((void **)&d_NoiseTest, cubeSizeNoiseTest*sizeof(float));//Allocation of device Array
    for (int i = 0; i < NUM_TEX; i++){
        //curand Random Generator (needs compiler link -lcurand)
        curandGenerator_t gen;
        curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(gen,1235ULL+i);
        curandGenerateUniform(gen, d_NoiseTest, cubeSizeNoiseTest);//writing data to d_NoiseTest
        curandDestroyGenerator(gen);

        //cudaArray Descriptor
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaArray *d_cuArr;
        checkCudaErrors(cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest), 0));
        cudaMemcpy3DParms copyParams = {0};


        //Array creation
        copyParams.srcPtr   = make_cudaPitchedPtr(d_NoiseTest, SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest);
        copyParams.dstArray = d_cuArr;
        copyParams.extent   = make_cudaExtent(SizeNoiseTest,SizeNoiseTest,SizeNoiseTest);
        copyParams.kind     = cudaMemcpyDeviceToDevice;
        checkCudaErrors(cudaMemcpy3D(&copyParams));
        //Array creation End

        cudaResourceDesc    texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array  = d_cuArr;
        cudaTextureDesc     texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModeLinear;
        texDescr.addressMode[0] = cudaAddressModeClamp;   // clamp
        texDescr.addressMode[1] = cudaAddressModeClamp;
        texDescr.addressMode[2] = cudaAddressModeClamp;
        texDescr.readMode = cudaReadModeElementType;
        checkCudaErrors(cudaCreateTextureObject(&texNoise[i], &texRes, &texDescr, NULL));}
}

int main(int argc, char **argv)
{
        CreateTexture();
        AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[0]);
        AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[1]);
        AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[2]);
        checkCudaErrors(cudaPeekAtLastError());
        checkCudaErrors(cudaDeviceSynchronize());
        return 0;
}

compile with:

$ nvcc -arch=sm_30 -I/shared/apps/cuda/CUDA-v6.0.37/samples/common/inc -lcurand -o t507 t507.cu

output:

$ cuda-memcheck ./t507
========= CUDA-MEMCHECK
thread: 0,0,0, value: 0.310691
thread: 1,0,0, value: 0.627906
thread: 0,1,0, value: 0.638900
thread: 1,1,0, value: 0.665186
thread: 0,0,1, value: 0.167465
thread: 1,0,1, value: 0.565227
thread: 0,1,1, value: 0.397606
thread: 1,1,1, value: 0.503013
thread: 0,0,0, value: 0.809163
thread: 1,0,0, value: 0.795669
thread: 0,1,0, value: 0.808565
thread: 1,1,0, value: 0.847564
thread: 0,0,1, value: 0.853998
thread: 1,0,1, value: 0.688446
thread: 0,1,1, value: 0.733255
thread: 1,1,1, value: 0.649379
thread: 0,0,0, value: 0.040824
thread: 1,0,0, value: 0.087417
thread: 0,1,0, value: 0.301392
thread: 1,1,0, value: 0.298669
thread: 0,0,1, value: 0.161962
thread: 1,0,1, value: 0.316443
thread: 0,1,1, value: 0.452077
thread: 1,1,1, value: 0.477722
========= ERROR SUMMARY: 0 errors

In this case I'm using the same kernel, called multiple times, to read from the individual texture objects. It should be possible to pass multiple objects to the same kernel, however it is not advisable to have a single warp read from multiple textures, if that can be avoided in your code. The actual issue resides at the quad level, which I'd prefer not to get into. It's best if you can arrange your code so that a warp is reading from the same texture object, on any given cycle.

Note that for simplicity of presentation, this CreateTexture() function overwrites previously allocated device pointers such as d_cuArr, during the processing of the loop. This isn't illegal or a functional issue, but it raises the possibility of memory leaks.

I assume you can modify the code to handle deallocation of those if this is a concern. The purpose of this code is to demonstrate the method to get things working.

In cudaMalloc3DArray, it should be something like this make_cudaExtent(SizeNoiseTest,SizeNoiseTest,SizeNoiseTest) not make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest)

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!