纹理对象是CUDA针对纹理参考缺点而提出的升级版,其作用和纹理参考完全一致,但是使用方法更加灵活。
与纹理参考相比,CUDA对其进行各方面的升级,一方面是可以再代码中申请和销毁,另一方面则可以作为设备函数的参数进行传入;可以满足一些特殊的需求。
使用纹理对象主要包括纹理对象创建、纹理访问和纹理对象销毁。
纹理对象创建之前首先要分别对纹理资源和纹理对象属性进行确定,分别对应cudaResoruceDesc和cudaTextureDesc;然后即可利用cudaCreateTextureObject来创建纹理对象。本文通过代码来解释这个过程:
// 纹理资源 struct cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); //resType指定对应设备内存的形式,主要包括 //cudaResourceTypeArray(二维纹理内存和二维纹理对象) //cudaResourceTypeMipmappedArray(不常用) //cudaResourceTypeLinear(一维纹理内存和一维纹理对象) //cudaResourceTypePitch2D(一维纹理内存和二维纹理对象) resDesc.resType = cudaResourceTypeArray; //res是一个枚举变量,针对不同内存也有不同的形式 //cudaResourceTypeArray 对应 res.array.array //cudaResourceTypeMipmappedArray 对应res.mipmap.mipmap //cudaResourceTypeLinear 对应 res.linear.devPtr(同时还需要设置res.linear.sizeInBytes和res.linear.desc) //cudaResourceTypePitch2D 对应 res.pitch2D.devPtr(同时需要设定res.pitch2D.pitchInBytes,res.pitch2D.width,res.pitch2D.height,res.pitch2D.ddesc) resDesc.res.array.array = cuArray;//指定需要绑定的二维纹理内存 // 纹理对象的属性 // 由于与纹理参考类似,不再重复介绍 struct cudaTextureDesc texDesc; memset(&texDesc, 0, sizeof(texDesc)); texDesc.addressMode[0] = cudaAddressModeWrap; texDesc.addressMode[1] = cudaAddressModeWrap; texDesc.filterMode = cudaFilterModeLinear; texDesc.readMode = cudaReadModeElementType; texDesc.normalizedCoords = 1; // 创建纹理对象 cudaTextureObject_t texObj = 0; cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
纹理对象的纹理访问也和纹理参考一样,也是使用tex1D或tex2D等函数进行操作。
//一维纹理 template<class T> T tex1D(cudaTextureObject_t texObj, float x); //二维纹理 template<class T> T tex2D(cudaTextureObject_t texObj, float x, float y);
纹理对象的销毁直接使用cudaDestroyTextureObject即可。值得注意的是应该先销毁对象,然后再释放对应的设备内存。
// 核函数 __global__ void transformKernel(float* output, cudaTextureObject_t texObj, int width, int height, float theta) { unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; float u = x / (float)width; float v = y / (float)height; u -= 0.5f; v -= 0.5f; float tu = u * cosf(theta) - v * sinf(theta) + 0.5f; float tv = v * cosf(theta) + u * sinf(theta) + 0.5f; // 获取纹理 output[y * width + x] = tex2D<float>(texObj, tu, tv); } int main() { //实验数据 int width = 10; int height = 10; float h_data[width*height]; for(int y=0;y<height;y++) { for(int x=0;x<width;x++) { h_data[y*width+x] = x+y; } } cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); cudaArray* cuArray; cudaMallocArray(&cuArray, &channelDesc, width, height); cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice); // 创建纹理对象 struct cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); resDesc.resType = cudaResourceTypeArray; resDesc.res.array.array = cuArray; struct cudaTextureDesc texDesc; memset(&texDesc, 0, sizeof(texDesc)); texDesc.addressMode[0] = cudaAddressModeWrap; texDesc.addressMode[1] = cudaAddressModeWrap; texDesc.filterMode = cudaFilterModeLinear; texDesc.readMode = cudaReadModeElementType; texDesc.normalizedCoords = 1; cudaTextureObject_t texObj = 0; cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); float* output; cudaMalloc(&output, width * height * sizeof(float)); // 调用核函数 dim3 dimBlock(16, 16); dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y); transformKernel<<<dimGrid, dimBlock>>>(output, texObj, width, height, angle); // 销毁纹理对象 cudaDestroyTextureObject(texObj); // 释放设备内存 cudaFreeArray(cuArray); cudaFree(output); return 0; }
在使用纹理对象时还有一些特殊注意。通常而言,使用纹理对象的目的就是使用多个纹理对象,因此会申请一个cudaTextureObject_t 数组。注意在使用时必须将其拷贝到device端才能正常使用。下边给出一个示例。
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include <time.h> __global__ void kernel_set_value(const cudaTextureObject_t *texObj,float *dev_result,int width,int height) { int x = threadIdx.x + blockIdx.x*blockDim.x; int y = threadIdx.y + blockIdx.y*blockDim.y; if(x<0 || x>width || y<0 || y>height) { return; } float sum = 0; for(int i=0;i<4;i++) { sum += tex2D<float>(texObj[i],x,y); } int pos = y*width + x; dev_result[pos]= sum; } int main() { const int array_size_width = 10; const int array_size_height = 10; float random_array[array_size_width*array_size_height]; for(int i=0;i<array_size_width*array_size_height;i++) { random_array[i] = 1; } //error status cudaError_t cuda_status; //only chose one GPU cuda_status = cudaSetDevice(0); if(cuda_status != cudaSuccess) { fprintf(stderr,"cudaSetDevice failed! Do you have a CUDA-Capable GPU installed?"); return 1; } cudaArray *dev_random_array[4]; cudaTextureObject_t texObj[4]; for(int i=0;i<4;i++) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); //allocate memory on the GPU cuda_status = cudaMallocArray(&dev_random_array[i], &channelDesc, array_size_width, array_size_height); if(cuda_status != cudaSuccess) { fprintf(stderr,"cudaMallocArray Failed"); exit( EXIT_FAILURE ); } cuda_status = cudaMemcpyToArray(dev_random_array[i], 0, 0, random_array, sizeof(float)*array_size_height*array_size_width, cudaMemcpyHostToDevice); if(cuda_status != cudaSuccess) { fprintf(stderr,"cudaMemcpyToArray Failed"); exit( EXIT_FAILURE ); } // Specify texture struct cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); resDesc.resType = cudaResourceTypeArray; resDesc.res.array.array = dev_random_array[i]; // Specify texture object parameters struct cudaTextureDesc texDesc; memset(&texDesc, 0, sizeof(texDesc)); texDesc.addressMode[0] = cudaAddressModeWrap; texDesc.addressMode[1] = cudaAddressModeWrap; texDesc.filterMode = cudaFilterModePoint; texDesc.readMode = cudaReadModeElementType; texDesc.normalizedCoords = 0; cudaCreateTextureObject(&texObj[i], &resDesc, &texDesc, NULL); } //将纹理对象拷贝到设备端 cudaTextureObject_t *dev_texObj; cudaMalloc((void**)&dev_texObj,sizeof(cudaTextureObject_t)*4); cudaMemcpy(dev_texObj,texObj,sizeof(cudaTextureObject_t)*4,cudaMemcpyHostToDevice); float *dev_result; cudaMalloc((void**)&dev_result,sizeof(float)*array_size_height*array_size_width); dim3 threads(16,16); dim3 grid((array_size_width+threads.x-1)/threads.x,(array_size_height+threads.y-1)/threads.y); kernel_set_value<<<grid,threads>>>(dev_texObj,dev_result,array_size_width,array_size_height); cuda_status = cudaGetLastError(); if(cuda_status != cudaSuccess) { fprintf(stderr,"kernel_set_value Failed"); exit( EXIT_FAILURE ); } cuda_status = cudaMemcpy(random_array,dev_result,sizeof(float)*array_size_width*array_size_height,cudaMemcpyDeviceToHost);//dev_depthMap if(cuda_status != cudaSuccess) { fprintf(stderr,"cudaMemcpy Failed"); exit( EXIT_FAILURE ); } for(int i=0;i<array_size_width*array_size_height;i++) { printf("%f\n",random_array[i]); } //free cudaFree(dev_texObj); cudaFree(dev_result); for(int i=0;i<4;i++) { cudaFreeArray(dev_random_array[i]); cudaDestroyTextureObject(texObj[i]); } return 0; }
文章来源: CUDA:纹理内存入门到精通--纹理对象