问题
I'm trying to implement an odd-even sort program in CUDA C. However, whenever I give 0 as one of the elements of the input array, the resulting array is not properly sorted. For other inputs it works. I don't understand what the problem with the code is. Here is my code:
#include<stdio.h>
#include<cuda.h>
#define N 5
// Kernel as posted in the question: attempts an in-place ascending odd-even
// sort of c, with the element count read from *count. Each thread compares
// c[threadIdx.x] with its right-hand neighbour c[threadIdx.x+1].
// NOTE(review): this listing is kept exactly as asked about; the notes below
// mark the spots that make it misbehave.
__global__ void sort(int *c,int *count)
{
// l = ceil(*count / 2): number of iterations of the loop below.
int l;
if(*count%2==0)
l=*count/2;
else
l=(*count/2)+1;
for(int i=0;i<l;i++)
{
// NOTE(review): this branches on the parity of the THREAD index, not on a
// phase number, so even- and odd-indexed threads do their compare-and-swap
// in the same iteration. Thread t writes c[t+1] while thread t+1 reads and
// writes the same element, with no barrier between them.
if(threadIdx.x%2==0) //even phase
{
// NOTE(review): no upper-bound check on threadIdx.x+1; the highest-numbered
// thread reads (and may write) one element past the ones the launched
// threads cover.
if(c[threadIdx.x]>c[threadIdx.x+1])
{
int temp=c[threadIdx.x];
c[threadIdx.x]=c[threadIdx.x+1];
c[threadIdx.x+1]=temp;
}
// NOTE(review): barrier inside a branch that only part of the block takes.
__syncthreads();
}
else //odd phase
{
// Same unchecked neighbour access as in the branch above.
if(c[threadIdx.x]>c[threadIdx.x+1])
{
int temp=c[threadIdx.x];
c[threadIdx.x]=c[threadIdx.x+1];
c[threadIdx.x+1]=temp;
}
// NOTE(review): second barrier, again inside a divergent branch.
__syncthreads();
}
}//for
}
// Host driver as posted in the question: reads n and then n integers from
// stdin, copies them to the device, launches sort with one block of n
// threads, copies the result back and prints it.
int main()
{int a[N],b[N],n;
printf("enter size of array");
// NOTE(review): n is never checked against N, so n > N overruns a[] below.
scanf("%d",&n);
// NOTE(review): `print` is not declared anywhere in this listing; presumably
// printf was intended.
print("enter the elements of array");
for(int i=0;i<n;i++)
{
scanf("%d",&a[i]);
}
printf("ORIGINAL ARRAY : \n");
for(int i=0;i<n;i++)
{
printf("%d ",a[i]);
}
// c: device copy of the array; count: device copy of n.
int *c,*count;
// NOTE(review): none of the CUDA calls below have their return value checked.
cudaMalloc((void**)&c,sizeof(int)*N);
cudaMalloc((void**)&count,sizeof(int));
// Copies all N ints of a, including any elements beyond n that were never
// assigned.
cudaMemcpy(c,&a,sizeof(int)*N,cudaMemcpyHostToDevice);
cudaMemcpy(count,&n,sizeof(int),cudaMemcpyHostToDevice);
// One block, one thread per entered element.
sort<<< 1,n >>>(c,count);
cudaMemcpy(&b,c,sizeof(int)*N,cudaMemcpyDeviceToHost);
printf("\nSORTED ARRAY : \n");
// NOTE(review): this loop runs i = 1..n, so it skips b[0] and reads b[n],
// which is not one of the n entered elements (and is past the end of b when
// n == N).
for(int i=1;i<=n;i++)
{
printf("%d ",b[i]);
}
// NOTE(review): c and count are never released with cudaFree.
}
回答1:
Your kernel code had two main errors that I could see:
1. On the odd phase (for an even-length array, or the even phase for an odd-length array), your last thread will index out of bounds at c[threadIdx.x+1]. For example, 4 threads are numbered 0, 1, 2, 3. Thread 3 is odd, but if you access c[3+1], that is not a defined element in your array. We can fix this by restricting each phase to work on all threads but the last one.
2. You were using __syncthreads() inside a conditional statement that would not allow all threads to reach the barrier. This is a coding error. Read the documentation. We can fix this by adjusting what code is inside the conditional regions.
In the main code, your final printout statements were indexing incorrectly:
for(int i=1;i<=n;i++)
that should be:
for(int i=0;i<n;i++)
You also have typo here:
print("enter the elements of array");
I assume that should be printf.
The following code has the above errors fixed, and seems to run correctly for me for arrays up to length 5 (your hardcoded limit on N). Even if you increased N, I'm not sure this would work beyond the size of a warp, and it certainly would not work beyond the threadblock size, but hopefully you are aware of that already (if not, read the doc link about __syncthreads()).
"Fixed" code:
#include<stdio.h>
#include<cuda.h>
#define N 5
// Swaps two int lvalues. Wrapped in do/while(0) so that `intswap(x, y);` is a
// single statement and stays safe inside an unbraced if/else; the arguments
// are parenthesised so arbitrary lvalue expressions can be passed.
#define intswap(A,B) do { int temp_ = (A); (A) = (B); (B) = temp_; } while (0)

/*
 * Sorts c[0 .. *count-1] into ascending order, in place, with an odd-even
 * transposition sort.
 *
 * Launch layout: exactly ONE thread block, e.g. sort<<<1, n>>>(c, count).
 * Phases are separated by __syncthreads(), which only synchronises the
 * threads of a single block, so a multi-block launch is not supported.
 * Any block size is accepted: each thread strides over the pairs of the
 * current phase, so the block does not need one thread per element.
 * No shared memory is used.
 *
 * c     - device pointer to at least *count ints (read and written)
 * count - device pointer to the number of elements to sort (read only)
 */
__global__ void sort(int *c,int *count)
{
    // Read the element count from global memory once instead of in every test.
    const int n = *count;
    const int tid = (int)threadIdx.x;
    const int pairStride = 2 * (int)blockDim.x;

    // n alternating phases are enough for odd-even transposition sort.
    for (int phase = 0; phase < n; ++phase)
    {
        // Even phases compare (0,1),(2,3),...; odd phases compare (1,2),(3,4),...
        // Pairs within one phase are disjoint, so threads never touch the same
        // element, and `left + 1 < n` keeps the right neighbour in bounds.
        for (int left = 2 * tid + (phase & 1); left + 1 < n; left += pairStride)
        {
            if (c[left] > c[left + 1])
                intswap(c[left], c[left + 1]);
        }

        // Every thread of the block reaches this barrier (n is uniform), which
        // makes this phase's swaps visible before the next phase reads them.
        __syncthreads();
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns 1 when err is cudaSuccess, 0 otherwise.
static int cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 1;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return 0;
}

/*
 * Reads an element count n (0 <= n <= N) and n integers from stdin, sorts
 * them on the GPU with the sort kernel (one block of n threads) and prints
 * the array before and after sorting.
 *
 * Returns 0 on success, 1 on invalid input or on any CUDA failure.
 */
int main()
{
    int a[N], b[N], n;
    int *c = NULL, *count = NULL;   // device copies of the array and of n

    printf("enter size of array");
    if (scanf("%d", &n) != 1 || n < 0) {printf("invalid size!\n"); return 1;}
    if (n > N) {printf("too large!\n"); return 1;}

    printf("enter the elements of array");
    for (int i = 0; i < n; i++)
    {
        if (scanf("%d", &a[i]) != 1) {printf("invalid element!\n"); return 1;}
    }

    printf("ORIGINAL ARRAY : \n");
    for (int i = 0; i < n; i++)
    {
        printf("%d ", a[i]);
    }

    // A launch with zero threads is an invalid configuration, so the GPU is
    // only involved when there is something to sort.
    if (n > 0)
    {
        // Transfer only the n elements that were actually entered; the rest
        // of a[] is uninitialised.
        const size_t bytes = sizeof(int) * (size_t)n;

        int ok = cudaOk(cudaMalloc((void**)&c, bytes), "cudaMalloc(c)")
              && cudaOk(cudaMalloc((void**)&count, sizeof(int)), "cudaMalloc(count)")
              && cudaOk(cudaMemcpy(c, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> c)")
              && cudaOk(cudaMemcpy(count, &n, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(n -> count)");

        if (ok)
        {
            sort<<< 1,n >>>(c,count);
            // Launch-configuration errors show up in cudaGetLastError();
            // faults inside the kernel show up at the next synchronising call.
            ok = cudaOk(cudaGetLastError(), "sort launch")
              && cudaOk(cudaDeviceSynchronize(), "sort execution")
              && cudaOk(cudaMemcpy(b, c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c -> b)");
        }

        // cudaFree on a null pointer performs no operation, so this is safe
        // even when an allocation above failed.
        cudaFree(count);
        cudaFree(c);

        if (!ok)
            return 1;
    }

    printf("\nSORTED ARRAY : \n");
    for (int i = 0; i < n; i++)
    {
        printf("%d ", b[i]);
    }
    printf("\n");
    return 0;
}
The usual recital about proper cuda error checking belongs here.
来源:https://stackoverflow.com/questions/29581115/odd-even-sort-using-cuda-programming