Structure of Arrays vs Array of Structures in CUDA

前端 未结 3 983
野性不改
野性不改 2020-11-27 11:25

From some comments that I have read here, for some reason it is preferable to use a Structure of Arrays (SoA) over an Array of Structures (AoS).

3条回答
  •  我在风中等你
    2020-11-27 12:06

    I just want to provide a simple example showing how a Struct of Arrays (SoA) performs better than an Array of Structs (AoS).

    In the example, I'm considering three different versions of the same code:

    1. AoS (v1)
    2. Straight arrays (v2)
    3. SoA (v3)

    In particular, version 2 considers the use of straight arrays. The timings of versions 2 and 3 are the same for this example and turn out to be better than those of version 1. I suspect that, in general, straight arrays could be preferable, although at the expense of readability, since, for example, loads through the read-only (uniform) cache could be enabled by qualifying the pointers as const __restrict__ in this case.

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #include <cstdio>
    
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    
    #include "Utilities.cuh"
    #include "TimingGPU.cuh"
    
    // Block-size constant; presumably the threads-per-block used for the kernel launches in main() -- TODO confirm.
    #define BLOCKSIZE   1024
    
    /**
     * A single cell record with all four fields stored side by side.
     * An array of these records is the Array-of-Structs (AoS) layout.
     */
    struct cellAoS {
        unsigned x1;      // first counter
        unsigned x2;      // second counter
        unsigned code;    // per-cell code word
        bool     done;    // per-cell flag
    };
    
    /**
     * Struct-of-Arrays (SoA) view of the cells: one pointer per field,
     * each pointing at a separate array indexed by cell number.
     * The struct only holds the pointers; it does not allocate or free them.
     */
    struct cellSoA {
        unsigned *x1;      // array of first counters
        unsigned *x2;      // array of second counters
        unsigned *code;    // array of code words
        bool     *done;    // array of flags
    };
    
    
    /**
     * Array-of-Structs kernel.
     *
     * Each thread handles one record: it copies the whole cellAoS into a
     * local variable, adds 10 to x1 and x2, and writes the whole record back.
     *
     * d_cells : array of at least N records, read and written by the kernel
     * N       : number of records; threads whose index is >= N do nothing
     */
    __global__ void AoSvsSoA_v1(cellAoS *d_cells, const int N) {

        const int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // Threads past the end of the data have nothing to do.
        if (idx >= N) return;

        // Whole-record load and store: code and done are moved along with x1/x2.
        cellAoS record = d_cells[idx];
        record.x1 += 10;
        record.x2 += 10;
        d_cells[idx] = record;

    }
    
    /**
     * Plain-array kernel.
     *
     * Each thread adds 10 to one element of d_x1 and one element of d_x2.
     *
     * d_x1, d_x2 : arrays of at least N elements, updated in place; declared
     *              __restrict__, so the caller must not pass overlapping arrays
     * N          : number of elements; threads whose index is >= N do nothing
     */
    __global__ void AoSvsSoA_v2(unsigned int * __restrict__ d_x1, unsigned int * __restrict__ d_x2, const int N) {

        const int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // Threads past the end of the data have nothing to do.
        if (idx >= N) return;

        d_x1[idx] += 10;
        d_x2[idx] += 10;

    }
    
    /**
     * Struct-of-Arrays kernel.
     *
     * Each thread adds 10 to one element of cell.x1 and one element of cell.x2.
     * The code and done pointers are never dereferenced here.
     *
     * cell : struct of per-field array pointers, passed by value
     * N    : number of elements; threads whose index is >= N do nothing
     */
    __global__ void AoSvsSoA_v3(cellSoA cell, const int N) {

        const int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // Threads past the end of the data have nothing to do.
        if (idx >= N) return;

        cell.x1[idx] += 10;
        cell.x2[idx] += 10;

    }
    
    /********/
    /* MAIN */
    /********/
    /**
     * Benchmark driver.
     *
     * Fills an array of cellAoS records and two plain arrays with the same
     * x1/x2 values, runs the AoS kernel (v1) on the records and the SoA kernel
     * (v3) on the plain arrays, prints the counter value reported by TimingGPU
     * for each, and then checks on the host that every x1/x2 was increased by
     * exactly 10.
     *
     * Returns 0 when all results match and 1 on the first mismatch.
     */
    int main() {
    
        const int N = 2048 * 2048 * 4;
    
        // One thread per element; round up so a partial last block still covers the tail.
        const int numBlocks = (N + BLOCKSIZE - 1) / BLOCKSIZE;
    
        TimingGPU timerGPU;
    
        thrust::host_vector<cellAoS>    h_cells(N);
        thrust::device_vector<cellAoS>  d_cells(N);
    
        thrust::host_vector<unsigned int>   h_x1(N);
        thrust::host_vector<unsigned int>   h_x2(N);
    
        thrust::device_vector<unsigned int> d_x1(N);
        thrust::device_vector<unsigned int> d_x2(N);
    
        // Same x1/x2 contents in both layouts so the final check can use one formula.
        for (int k = 0; k < N; k++) {
    
            h_cells[k].x1 = k + 1;
            h_cells[k].x2 = k + 2;
            h_cells[k].code = k + 3;
            h_cells[k].done = true;
    
            h_x1[k] = k + 1;
            h_x2[k] = k + 2;
    
        }
    
        d_cells = h_cells;
    
        d_x1 = h_x1;
        d_x2 = h_x2;
    
        // SoA view over the plain device arrays; code/done are not touched by the v3 kernel.
        cellSoA cell;
        cell.x1 = thrust::raw_pointer_cast(d_x1.data());
        cell.x2 = thrust::raw_pointer_cast(d_x2.data());
        cell.code = nullptr;
        cell.done = nullptr;
    
        timerGPU.StartCounter();
        AoSvsSoA_v1<<<numBlocks, BLOCKSIZE>>>(thrust::raw_pointer_cast(d_cells.data()), N);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        printf("Timing AoSvsSoA_v1 = %f\n", timerGPU.GetCounter());
    
        // v2 is left disabled: it updates the same d_x1/d_x2 arrays as v3, so running
        // both would add 20 instead of 10 and the check below would fail. Enable v2
        // in place of v3 to time it.
        //timerGPU.StartCounter();
        //AoSvsSoA_v2<<<numBlocks, BLOCKSIZE>>>(thrust::raw_pointer_cast(d_x1.data()), thrust::raw_pointer_cast(d_x2.data()), N);
        //gpuErrchk(cudaPeekAtLastError());
        //gpuErrchk(cudaDeviceSynchronize());
        //printf("Timing AoSvsSoA_v2 = %f\n", timerGPU.GetCounter());
    
        timerGPU.StartCounter();
        AoSvsSoA_v3<<<numBlocks, BLOCKSIZE>>>(cell, N);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        printf("Timing AoSvsSoA_v3 = %f\n", timerGPU.GetCounter());
    
        h_cells = d_cells;
    
        h_x1 = d_x1;
        h_x2 = d_x2;
    
        // --- Check results: every x1 must now be k + 11 and every x2 must be k + 12
        bool ok = true;
        for (int k = 0; k < N; k++) {
            // Unsigned expected values avoid signed/unsigned comparisons against the data.
            const unsigned int want1 = (unsigned int)k + 11u;
            const unsigned int want2 = (unsigned int)k + 12u;
    
            const unsigned int got1 = h_x1[k];
            const unsigned int got2 = h_x2[k];
            const cellAoS gotCell = h_cells[k];
    
            if (got1 != want1) {
                printf("h_x1[%i] = %u, expected %u\n", k, got1, want1);
                ok = false;
                break;
            }
            if (got2 != want2) {
                printf("h_x2[%i] = %u, expected %u\n", k, got2, want2);
                ok = false;
                break;
            }
            if (gotCell.x1 != want1) {
                printf("h_cells[%i].x1 = %u, expected %u\n", k, gotCell.x1, want1);
                ok = false;
                break;
            }
            if (gotCell.x2 != want2) {
                printf("h_cells[%i].x2 = %u, expected %u\n", k, gotCell.x2, want2);
                ok = false;
                break;
            }
        }
    
        return ok ? 0 : 1;
    
    }
    

    The following are the timings (runs performed on a GTX960):

    Array of struct        9.1ms (v1 kernel)
    Struct of arrays       3.3ms (v3 kernel)
    Straight arrays        3.2ms (v2 kernel)
    

提交回复
热议问题