Compile and build .cl file using NVIDIA's nvcc Compiler?

Is it possible to compile .cl file using NVIDIA's nvcc compiler?? I am trying to set up visual studio 2010 to code Opencl under CUDA platform. But when I select CUDA C/C++ Compiler to compile and build .cl file, it gives me errors like nvcc does not exist. What is the issue?

You should be able to use nvcc to compile OpenCL codes. Normally, I would suggest using a filename extension of .c for a C-compliant code, and .cpp for a C++ compliant code(*), however nvcc has filename extension override options (-x ...) so that we can modify the behavior. Here is a worked example using CUDA 8.0.61, RHEL 7, Tesla K20x:

$ cat t4.cpp
#include <CL/opencl.h>
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>
#include <stdlib.h>

const char source[] =
"__kernel void test_rotate(__global ulong *d_count, ulong loops, ulong patt)"
"{"
"  ulong n = patt;"
"  for (ulong i = 0; i<loops; i++)"
"    n &= (107 << (patt+(i%7)));"
"  d_count[0] = n + loops;"
"}"
;

int main(int argc, char *argv[])
{
  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue1, queue2;
  cl_program program;
  cl_mem mem1, mem2;
  cl_kernel kernel;

  bool two_kernels = false;
  unsigned long long loops = 1000;
  if (argc > 1) loops *= atoi(argv[1]);
  if (argc > 2) two_kernels = true;
  if (two_kernels) printf("running two kernels\n");
  else printf("running one kernel\n");
  printf("running  %lu loops\n", loops);
  unsigned long long pattern = 1;
  clGetPlatformIDs(1, &platform, NULL);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
  context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  queue1 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, NULL);
  queue2 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, NULL);

  const char *sources[1] = {source};
  program = clCreateProgramWithSource(context, 1, sources, NULL, NULL);
  clBuildProgram(program, 1, &device, NULL, NULL, NULL);
  mem1 = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(cl_ulong), NULL, NULL);
  mem2 = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(cl_ulong), NULL, NULL);
  kernel = clCreateKernel(program, "test_rotate", NULL);
  const size_t work_size[1] = {1};
  clSetKernelArg(kernel, 0, sizeof(mem1), &mem1);
  clSetKernelArg(kernel, 1, sizeof(loops), &loops);
  clSetKernelArg(kernel, 2, sizeof(pattern), &pattern);

  clEnqueueNDRangeKernel(queue1, kernel, 1, NULL, work_size, work_size, 0, NULL, NULL);
  if (two_kernels){
    clSetKernelArg(kernel, 0, sizeof(mem2), &mem2);
    clSetKernelArg(kernel, 1, sizeof(loops), &loops);
    clSetKernelArg(kernel, 2, sizeof(pattern), &pattern);

    clEnqueueNDRangeKernel(queue2, kernel, 1, NULL, work_size, work_size, 0, NULL, NULL);
    }
  cl_ulong *buf1 = (cl_ulong *)clEnqueueMapBuffer(queue1, mem1, true, CL_MAP_READ, 0, 1*sizeof(cl_ulong), 0, NULL, NULL, NULL);
  cl_ulong *buf2 = (cl_ulong *)clEnqueueMapBuffer(queue2, mem2, true, CL_MAP_READ, 0, 1*sizeof(cl_ulong), 0, NULL, NULL, NULL);
  printf("result1: %lu\n", buf1[0]);
  printf("result2: %lu\n", buf2[0]);
  clEnqueueUnmapMemObject(queue1, mem1, buf1, 0, NULL, NULL);
  clEnqueueUnmapMemObject(queue2, mem2, buf2, 0, NULL, NULL);
  return 0;
}
$ nvcc -arch=sm_35 -o t4 t4.cpp -lOpenCL
$ ./t4
running one kernel
running  1000 loops
result1: 1000
result2: 0
$ cp t4.cpp t4.cl
$ nvcc -arch=sm_35 -x cu -o t4 t4.cl -lOpenCL
$ ./t4
running one kernel
running  1000 loops
result1: 1000
result2: 0
$

Note that the code here doesn't do anything sensible or significant, so I'd prefer to avoid questions. It's just for demonstration of compilation of a C++ compliant OpenCL code.

(*)(Because such files could also be readily processed by an ordinary host compiler, e.g. gnu compilers, with appropriate switches for include and link options.)

来源：https://stackoverflow.com/questions/13062469/compile-and-build-cl-file-using-nvidias-nvcc-compiler

标签

opencl

gpgpu

nvidia

gpu-programming