I am working on data prefetching in CUDA (on a Fermi GPU) from C code. The CUDA reference manual describes prefetching at the PTX level, but not at the C level.
Can anyone show me how to issue a prefetch from C-level CUDA code?
According to this thread, below is the code for different cache prefetching techniques:
/* Qualifiers applied to every prefetch wrapper below: internal linkage
 * (static), device-side code (__device__), and forced inlining so the
 * wrapper collapses to the single asm statement it contains. */
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static __device__ __forceinline__
/* Inline-asm operand constraint used for pointer arguments.
 * 64-bit builds (64-bit MSVC, or any target defining __LP64__) use "l";
 * every other build uses "r". In NVIDIA's inline PTX syntax "l" denotes a
 * 64-bit integer register and "r" a 32-bit one, so the constraint tracks
 * the pointer width. */
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
#define PXL_GLOBAL_PTR "l"
#else
#define PXL_GLOBAL_PTR "r"
#endif
/**
 * Issue a PTX `prefetch.global.L1` instruction for the given address.
 *
 * The call produces no value; it only emits the prefetch instruction.
 *
 * @param ptr Address handed to the instruction as its address operand.
 *            NOTE(review): the `.global` qualifier suggests this must
 *            refer to global memory -- confirm against callers.
 */
DEVICE_STATIC_INTRINSIC_QUALIFIERS void __prefetch_global_l1(const void* const ptr)
{
    /* volatile: this asm has no output operands, so without it the compiler
     * may assume the statement has no effect and delete or move it. */
    asm volatile("prefetch.global.L1 [%0];" : : PXL_GLOBAL_PTR(ptr));
}
/**
 * Issue a PTX `prefetchu.L1` instruction for the given address.
 *
 * The call produces no value; it only emits the prefetch instruction.
 *
 * @param ptr Address handed to the instruction as its address operand.
 *            NOTE(review): the instruction carries no state-space
 *            qualifier, so it presumably takes a generic address --
 *            confirm against the PTX ISA for the targeted architecture.
 */
DEVICE_STATIC_INTRINSIC_QUALIFIERS void __prefetch_global_uniform(const void* const ptr)
{
    /* volatile: this asm has no output operands, so without it the compiler
     * may assume the statement has no effect and delete or move it. */
    asm volatile("prefetchu.L1 [%0];" : : PXL_GLOBAL_PTR(ptr));
}
/**
 * Issue a PTX `prefetch.global.L2` instruction for the given address.
 *
 * The call produces no value; it only emits the prefetch instruction.
 *
 * @param ptr Address handed to the instruction as its address operand.
 *            NOTE(review): the `.global` qualifier suggests this must
 *            refer to global memory -- confirm against callers.
 */
DEVICE_STATIC_INTRINSIC_QUALIFIERS void __prefetch_global_l2(const void* const ptr)
{
    /* volatile: this asm has no output operands, so without it the compiler
     * may assume the statement has no effect and delete or move it. */
    asm volatile("prefetch.global.L2 [%0];" : : PXL_GLOBAL_PTR(ptr));
}