Using peered GPUs in a single stream


Question


In my current project I use GPUs for signal processing and visualization. I'm already using streams to allow for asynchronous operation. The signal is processed in frames, and for each frame the processing steps in a stream are as follows:

  1. memcpy to device
  2. signal conditioning
  3. image processing
  4. visualization

Right now the steps are happening on a single GPU, but my machine has a multi-GPU card (GeForce GTX 690) and I'd like to distribute the operation between the two devices. Basically I'd like to perform steps 1 and 2 on device A and steps 3 and 4 on device B, while operations 1, 2, 3 and 4 are still executed as a single asynchronous stream. The desired outcome is a streaming layout that looks like this:

Device A Stream a 1 2       1 2  ...
         Stream b      1 2  ...
Device B Stream a    3 4       3 4 ...
         Stream b         3 4  ...

How can I do this?


Answer 1:


My previous attempt was not correct, because a stream is associated with the device it was created on. So I think the most direct answer to your question as posed in the title is "it can't be done". You cannot create a single stream and issue commands to more than one GPU from it. From the CUDA C Programming Guide:

Stream and Event Behavior

A kernel launch or memory copy will fail if it is issued to a stream that is not associated to the current device 
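
As a minimal sketch of that failure mode (the kernel k and its empty launch are hypothetical), issuing a launch into a stream that belongs to a different device fails:

// sketch only: k is a hypothetical empty kernel
cudaSetDevice(0);
cudaStream_t s;
cudaStreamCreate(&s);      // s is associated with device 0
cudaSetDevice(1);          // device 1 is now current
k<<<1,1,0,s>>>();          // launch into a device-0 stream from device 1: fails
cudaError_t err = cudaGetLastError();  // err != cudaSuccess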

However, while researching this, I noted that events are a suggested way to synchronize two streams on two different devices:

cudaStreamWaitEvent() will succeed even if the input stream and input event are associated to different devices. cudaStreamWaitEvent() can therefore be used to synchronize multiple devices with each other.

So in that vein, I created the following code to illustrate this:

#include <stdio.h>
#define SIZE 32
#define K1VAL 5
#define K3VAL 3

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


__global__ void kernel1(int *frame, int size){

  int idx = threadIdx.x + (blockDim.x * blockIdx.x);

  if (idx == 0){
   int *a = new int[10000]; // device-heap allocation, just to make this kernel take a while
   for (int i = 0; i<10000; i++)
     a[i] = 0;
   for (int i = 0; i < size; i++)
     frame[i] += K1VAL;
   delete [] a;             // free the device-heap allocation so repeated launches don't leak
   }
}


__global__ void kernel3(int *frame, int size){

  int idx = threadIdx.x + (blockDim.x * blockIdx.x);
  if (idx == 0)
   for (int i = 0; i < size; i++)
     frame[i] -= K3VAL;
}

void set_device(int dev){
  int ldev;
  cudaSetDevice(dev);
  cudaGetDevice(&ldev);
  cudaCheckErrors("set device error");
  if (ldev != dev){
    printf("set device mismatch error\n");
    exit(1);
    }
}


int main(){

  int A=0;
  int B=1;
  int framesize = SIZE*sizeof(int);
  int *h_frame;
  int *d_frame_aA,  *d_frame_bB;
  int numdev = 0;
  cudaGetDeviceCount(&numdev);
  cudaCheckErrors("can't determine number of devices");
  if (numdev < 2){
   printf("not enough devices!\n");
   return 1;
   }
  set_device(A);
  cudaMalloc((void **) &d_frame_aA, framesize);  // stream_a
  cudaMemset(d_frame_aA, 0, framesize);
  set_device(B);
  cudaMalloc((void **) &d_frame_bB, framesize);  // stream_b
  cudaMemset(d_frame_bB, 0, framesize);
  cudaHostAlloc((void **) &h_frame, framesize, cudaHostAllocDefault);
  cudaCheckErrors("allocations failure");

  set_device(A);
  cudaStream_t stream_a, stream_b;
  cudaStreamCreate(&stream_a);
  cudaEvent_t absync;
  cudaEventCreate(&absync);
  set_device(B);
  cudaStreamCreate(&stream_b);
  cudaCheckErrors("stream creation failure");


  for (int i = 0; i < SIZE; i++)
    h_frame[i] = 0;

  set_device(A);
  cudaDeviceEnablePeerAccess(B, 0);
  set_device(B);
  cudaDeviceEnablePeerAccess(A, 0);
  cudaCheckErrors("enable peer access fail");


  set_device(A);
  // steps 1 and 2 on device A, both issued into stream_a
  cudaMemcpyAsync(d_frame_aA, h_frame, framesize, cudaMemcpyHostToDevice, stream_a);
  kernel1<<<1,1,0, stream_a>>>(d_frame_aA, SIZE);
  cudaCheckErrors("kernel1 fail");
  // hand the frame off to device B, still in stream_a
  cudaMemcpyPeerAsync(d_frame_bB, B, d_frame_aA, A, framesize, stream_a );
  cudaCheckErrors("memcpypeer fail");
  // record an event in stream_a marking completion of device A's work
  cudaEventRecord(absync, stream_a);
  set_device(B);
  // make stream_b (on device B) wait on the device A event
  // comment out the next line to see the failure
  cudaStreamWaitEvent(stream_b, absync, 0);
  kernel3<<<1,1,0, stream_b>>>(d_frame_bB, SIZE);
  cudaCheckErrors("main sequence fail");



  cudaMemcpy(h_frame, d_frame_bB, framesize, cudaMemcpyDeviceToHost);
  cudaCheckErrors("results memcpy fail");
  for (int i = 0; i < SIZE; i++)
    if (h_frame[i] != (K1VAL - K3VAL)) {
      printf("results error\n");
      return 1;
      }
  printf("success\n");
  return 0;

}

If you run the code as-is, you should get a success message. If you comment out the line that forces stream b (on Device B) to wait on stream a (on Device A), then you'll see a results error message. So this demonstrates how to sync a stream on one device to a stream on another. Hope it helps. Sorry for the confusion on the first go-round.
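
To map this back onto the four steps in your pipeline, a per-frame sequence might look like the sketch below. This is only a sketch: signal_conditioning, image_processing and visualize are placeholder names for your steps 2 through 4, and the buffers, streams and event are assumed to be set up as in the example above.

  // per-frame sketch; placeholder kernel/function names, setup as above
  set_device(A);
  cudaMemcpyAsync(d_frame_aA, h_frame, framesize, cudaMemcpyHostToDevice, stream_a); // step 1
  signal_conditioning<<<grid, block, 0, stream_a>>>(d_frame_aA, SIZE);               // step 2
  cudaMemcpyPeerAsync(d_frame_bB, B, d_frame_aA, A, framesize, stream_a);            // A -> B handoff
  cudaEventRecord(absync, stream_a);         // marks completion of device A's work on this frame
  set_device(B);
  cudaStreamWaitEvent(stream_b, absync, 0);  // device B waits for the handoff
  image_processing<<<grid, block, 0, stream_b>>>(d_frame_bB, SIZE);                  // step 3
  visualize(d_frame_bB, stream_b);                                                   // step 4

To keep several frames in flight at once, as your desired timeline shows, you would rotate through more than one event (and more than one pair of frame buffers) rather than reusing a single absync event for every frame.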




Answer 2:


cudaStreamWaitEvent() enables inter-GPU synchronization, since you can insert a wait on a CUDA event belonging to another device.

So what you need for inter-GPU synchronization between the producer and the consumer is to allocate a few events (at least 2) for each of the 2 GPUs, then have the producer cudaEventRecord() and the consumer cudaStreamWaitEvent() on the same event. cudaStreamWaitEvent() inserts a command into the current device's command buffer that causes it to suspend execution until the given event has been recorded.

Below is a code fragment in which a peer-to-peer memcpy is implemented using cudaStreamWaitEvent() in this way. Once the pump is primed, the producer and consumer should both be doing PCIe transfers concurrently, each to one of the two staging buffers (which are allocated in portable pinned memory).

// CUDART_CHECK() is an error-check macro that jumps to the Error: label on
// failure; g_events, g_hostBuffers and STAGING_BUFFER_SIZE are defined in the
// full source linked below.
cudaError_t
chMemcpyPeerToPeer( 
    void *_dst, int dstDevice, 
    const void *_src, int srcDevice, 
    size_t N ) 
{
    cudaError_t status;
    char *dst = (char *) _dst;
    const char *src = (const char *) _src;
    int stagingIndex = 0;
    while ( N ) {
        size_t thisCopySize = min( N, STAGING_BUFFER_SIZE );

        // Producer: wait until the consumer has drained this staging buffer,
        // copy the next chunk device->host, then signal that it is ready.
        CUDART_CHECK( cudaSetDevice( srcDevice ) );
        CUDART_CHECK( cudaStreamWaitEvent( NULL, g_events[dstDevice][stagingIndex], 0 ) );
        CUDART_CHECK( cudaMemcpyAsync( g_hostBuffers[stagingIndex], src, thisCopySize, 
            cudaMemcpyDeviceToHost, NULL ) );
        CUDART_CHECK( cudaEventRecord( g_events[srcDevice][stagingIndex] ) );

        // Consumer: wait for the producer's signal, copy host->device, then
        // signal that the staging buffer may be reused.
        CUDART_CHECK( cudaSetDevice( dstDevice ) );
        CUDART_CHECK( cudaStreamWaitEvent( NULL, g_events[srcDevice][stagingIndex], 0 ) );
        CUDART_CHECK( cudaMemcpyAsync( dst, g_hostBuffers[stagingIndex], thisCopySize, 
            cudaMemcpyHostToDevice, NULL ) );
        CUDART_CHECK( cudaEventRecord( g_events[dstDevice][stagingIndex] ) );

        dst += thisCopySize;
        src += thisCopySize;
        N -= thisCopySize;
        stagingIndex = 1 - stagingIndex;  // ping-pong between the two staging buffers
    }
    // Wait until both devices are done
    CUDART_CHECK( cudaSetDevice( srcDevice ) );
    CUDART_CHECK( cudaDeviceSynchronize() );

    CUDART_CHECK( cudaSetDevice( dstDevice ) );
    CUDART_CHECK( cudaDeviceSynchronize() );

Error:
    return status;
}
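
The fragment depends on globals that are set up elsewhere in the full source. As a rough sketch of what that setup involves (the buffer size, device bound and function name here are illustrative, not the book's actual values):

#define STAGING_BUFFER_SIZE (1<<20)    // chunk size; illustrative value
#define MAX_DEVICES 16                 // illustrative bound

void *g_hostBuffers[2];                // two staging buffers -> double buffering
cudaEvent_t g_events[MAX_DEVICES][2];  // one event per device per staging buffer

void initPeerToPeerStaging(int numDevices)
{
    for (int i = 0; i < 2; i++)
        // portable pinned memory is accessible from every device
        cudaHostAlloc(&g_hostBuffers[i], STAGING_BUFFER_SIZE, cudaHostAllocPortable);
    for (int dev = 0; dev < numDevices; dev++) {
        cudaSetDevice(dev);
        for (int i = 0; i < 2; i++)
            // a created-but-never-recorded event is treated as already
            // complete, so the first cudaStreamWaitEvent() does not block
            cudaEventCreate(&g_events[dev][i]);
    }
}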

Full source code is at https://github.com/ArchaeaSoftware/cudahandbook/blob/master/memory/peer2peerMemcpy.cu



Source: https://stackoverflow.com/questions/14263986/using-peered-gpus-in-a-single-stream
