问题
I have a matrix which is stored as a 1D array on the GPU, and I'm trying to write an OpenCL kernel which performs a reduction over every row of this matrix. For example:
Let's consider my matrix is 2x3 with the elements [1, 2, 3, 4, 5, 6], what I want to do is:
[1, 2, 3] = [ 6]
[4, 5, 6] [15]
Obviously as I'm talking about reduction, the actual return could be of more than one element per row:
[1, 2, 3] = [3, 3]
[4, 5, 6] [9, 6]
Then the final calculation I can do in another kernel or in the CPU.
So far, what I have is a kernel which does the reduction, but over all the elements of the array at once, like so:
[1, 2, 3] = [21]
[4, 5, 6]
The reduction kernel I'm currently using is the following (which I actually got from another Stack Overflow answer):
/*
 * Work-group sum reduction.
 *
 * Each work-item loads one element of inVector into local memory, the
 * work-group adds those elements together, and work-item 0 stores the
 * group's partial sum in outVector[get_group_id(0)].
 *
 * inVector       input values, read at index get_global_id(0)
 * outVector      receives one partial sum per work-group
 * inVectorSize   number of valid input elements; work-items whose global id
 *                is at or beyond this value contribute 0
 * resultScratch  local scratch buffer, indexed by local id (so it needs at
 *                least get_local_size(0) floats)
 */
__kernel void
sum2(__global float *inVector, __global float *outVector,
     const unsigned int inVectorSize, __local float *resultScratch)
{
    const unsigned int localId = get_local_id(0);
    const size_t globalId = get_global_id(0);

    /* Pad with zeros so out-of-range work-items do not affect the sum. */
    resultScratch[localId] = (globalId < inVectorSize) ? inVector[globalId] : 0.0f;

    /*
     * Tree reduction over the first `active` scratch slots. The stride is
     * rounded up, so an odd leftover slot stays in the active range and is
     * folded in on a later pass instead of being dropped. For power-of-two
     * work-group sizes this pairs elements exactly like plain halving.
     */
    unsigned int active = get_local_size(0);
    while (active > 1) {
        const unsigned int stride = (active + 1) >> 1;

        /* Make the previous pass's writes visible before reading them. */
        barrier(CLK_LOCAL_MEM_FENCE);

        if (localId + stride < active)
            resultScratch[localId] += resultScratch[localId + stride];

        active = stride;
    }

    /* Slot 0 was last written by work-item 0 itself, so no barrier is needed. */
    if (localId == 0)
        outVector[get_group_id(0)] = resultScratch[0];
}
回答1:
I suppose one solution is to modify your reduction kernel so that it can reduce just a part of the array.
/*
 * Work-group sum reduction over a sub-range of inVector.
 *
 * Each work-item loads inVector[inVectorOffset + get_global_id(0)] into
 * local memory, the work-group adds those elements together, and work-item 0
 * stores the group's partial sum in outVector[get_group_id(0)].
 *
 * inVector        input values
 * outVector       receives one partial sum per work-group
 * inVectorOffset  index of the first element of the sub-range to reduce
 * inVectorSize    number of elements in the sub-range; work-items whose
 *                 global id is at or beyond this value contribute 0
 * resultScratch   local scratch buffer, indexed by local id (so it needs at
 *                 least get_local_size(0) floats)
 */
__kernel void
sum2(__global float *inVector,
     __global float *outVector,
     unsigned int inVectorOffset,
     unsigned int inVectorSize,
     __local float *resultScratch)
{
    const unsigned int localId = get_local_id(0);
    const size_t globalId = get_global_id(0);

    /* Pad with zeros so work-items past the sub-range do not affect the sum. */
    resultScratch[localId] =
        (globalId < inVectorSize) ? inVector[inVectorOffset + globalId] : 0.0f;

    /*
     * Tree reduction over the first `active` scratch slots. The stride is
     * rounded up, so an odd leftover slot stays in the active range and is
     * folded in on a later pass instead of being dropped. For power-of-two
     * work-group sizes this pairs elements exactly like plain halving.
     */
    unsigned int active = get_local_size(0);
    while (active > 1) {
        const unsigned int stride = (active + 1) >> 1;

        /* Make the previous pass's writes visible before reading them. */
        barrier(CLK_LOCAL_MEM_FENCE);

        if (localId + stride < active)
            resultScratch[localId] += resultScratch[localId + stride];

        active = stride;
    }

    /* Slot 0 was last written by work-item 0 itself, so no barrier is needed. */
    if (localId == 0)
        outVector[get_group_id(0)] = resultScratch[0];
}
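Then you can reduce a single row of the matrix by passing the index of the row's first element as inVectorOffset and the number of elements in the row as inVectorSize. Note that each work-group writes its partial sum to outVector[get_group_id(0)], so if you launch the kernel once per row, give each launch its own output region so that the rows do not overwrite each other's results.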
来源:https://stackoverflow.com/questions/13384542/reduction-of-matrix-rows-in-opencl