问题
This question has been asked before, but the asker didn't provide enough information and it was left unanswered. I am curious about the program.
Original Question Link
I'm trying to do Sobel edge detection using both the OpenCV and CUDA libraries. The Sobel kernel for the X direction is
-1 0 1
-2 0 2
-1 0 1
I have 3 files in my project
main.cpp
CudaKernel.cu
CudaKernel.h
main.cpp
#include <stdlib.h>
#include <iostream>
#include <string.h>
#include <Windows.h>
#include <opencv2\core\core.hpp>
#include <opencv2\highgui\highgui.hpp>
#include <opencv2\gpu\gpu.hpp>
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>
#include "CudaKernel.h"
using namespace cv;
using namespace std;
// Entry point of the question's program: loads a JPEG as grayscale, prints some
// GPU device information, copies the 8-bit pixels into a 32-bit float image by
// hand, runs kernelcall() on the float buffers and shows the original image and
// the filter output in two windows.
int main(int argc, char** argv)
{
// NOTE(review): `image` is not initialised here; if the try block below exits
// through an exception before the assignment, the `if(!image )` test reads an
// indeterminate pointer.
IplImage* image;
try
{
image = cvLoadImage("4555472_460s.jpg", CV_LOAD_IMAGE_GRAYSCALE);
gpu::DeviceInfo info = gpu::getDevice();
cout << info.name() << endl;
cout << "Stream Processor : "<< info.multiProcessorCount() << endl;
// totalMemory() is divided by 1048576 (1024*1024) to print megabytes.
cout << "Total Graphic Memory :" << info.totalMemory()/1048576 << " MB" << endl;
}
// NOTE(review): this handler only matches exceptions thrown as a pointer to
// cv::Exception — confirm how OpenCV actually throws.
catch (const cv::Exception* ex)
{
cout << "Error: " << ex->what() << endl;
}
if(!image )
{
cout << "Could not open or find the image" << std::endl ;
return -1;
}
// image2 receives the filter output, image3 holds the float copy of the input.
IplImage* image2=cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
IplImage* image3=cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
unsigned char * pseudo_input=(unsigned char *)image->imageData;
float *output=(float*)image2->imageData;
float *input=(float*)image3->imageData;
// NOTE(review): `s` is derived from the 8-bit image's widthStep divided by
// sizeof(float), yet it is used below as the row stride of BOTH the unsigned
// char source and the float destination.
int s=image->widthStep/sizeof(float);
// NOTE(review): the `<=` makes the outer loop also run for w == image->height,
// i.e. one row index past the last row.
for(int w=0;w<=(image->height);w++)
for(int h=0;h<(image->width*image->nChannels);h++)
{
input[w*s+h]= pseudo_input[w*s+h];
}
// NOTE(review): this local is never used afterwards, and the type `Pixel` is
// not declared anywhere in the visible code.
Pixel *fagget = (unsigned char*) image->imageData;
// NOTE(review): the buffers passed are the float images, but the widthStep
// passed is the one of the original 8-bit image.
kernelcall(input, output, image->width,image->height, image->widthStep);
// cv::namedWindow( "Display window", CV_WINDOW_AUTOSIZE );// Create a window for display.
cvShowImage( "Original Image", image ); // Show our image inside it.
cvShowImage("Sobeled Image", image2);
waitKey(0); // Wait for a keystroke in the window
return 0;
}
CudaKernel.cu
#include<cuda.h>
#include<iostream>
#include "CudaKernel.h"
using namespace std;
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
texture <float,2,cudaReadModeElementType> tex1;
texture<unsigned char, 2> tex;
static cudaArray *array = NULL;
static cudaArray *cuArray = NULL;
//Kernel for x direction sobel
// X-direction Sobel kernel from the question: each thread derives one (x, y)
// position from its 2D block/thread indices, reads the 3x3 neighbourhood
// through the texture reference tex1 and stores the weighted sum in `output`.
//
// garbage   - never read or written inside the kernel
// output    - destination buffer, indexed as output[y*widthStep+x]
// width     - not used inside the kernel
// height    - not used inside the kernel
// widthStep - row stride of `output`, in float elements
//
// NOTE(review): there is no bounds check, so threads whose (x, y) lies outside
// the image still write to `output`.
__global__ void implement_x_sobel(float* garbage,float* output,int width,int height,int widthStep)
{
int x=blockIdx.x*blockDim.x+threadIdx.x;
int y=blockIdx.y*blockDim.y+threadIdx.y;
// Weights: -1/-2/-1 on the x-1 column, 0 on the centre column, +1/+2/+1 on the x+1 column.
float output_value=((0*tex2D(tex1,x,y))+(2*tex2D(tex1,x+1,y))+(-2*tex2D(tex1,x- 1,y))+(0*tex2D(tex1,x,y+1))+(1*tex2D(tex1,x+1,y+1))+(-1*tex2D(tex1,x-1,y+1))+ (1*tex2D(tex1,x+1,y-1))+(0*tex2D(tex1,x,y-1))+(-1*tex2D(tex1,x-1,y-1)));
output[y*widthStep+x]=output_value;
}
// Unconditional CUDA status check: when `err` is not cudaSuccess, prints the
// file, line, numeric code and error string to stderr and terminates the
// process with exit code -1. Intended to be used through the checkCudaErrors
// macro, which supplies __FILE__ and __LINE__.
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    // Fast path: the call succeeded, nothing to report.
    if( err == cudaSuccess )
        return;

    const char *description = cudaGetErrorString( err );
    fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
            file, line, (int)err, description );
    exit(-1);
}
//Host Code
// Status check behind the CudaSafeCall macro. The check is compiled in only
// when CUDA_ERROR_CHECK is defined; otherwise the function body is empty and
// the status is ignored. On failure it prints the call site and the error
// string, then terminates the process with exit code -1.
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    // Fast path: the wrapped call succeeded.
    if ( err == cudaSuccess )
        return;

    const char *description = cudaGetErrorString( err );
    printf("cudaSafeCall() failed at %s:%i : %s\n",
           file, line, description );
    exit( -1 );
#endif
}
// Check behind the CudaCheckError macro. Compiled in only when
// CUDA_ERROR_CHECK is defined; otherwise the function body is empty. Fetches
// the status via cudaGetLastError() and, if it is not cudaSuccess, prints the
// call site and the error string, then terminates the process with exit
// code -1.
inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    const cudaError lastError = cudaGetLastError();

    // Fast path: no error is pending.
    if ( lastError == cudaSuccess )
        return;

    printf("cudaCheckError() failed at %s:%i : %s\n",
           file, line, cudaGetErrorString( lastError ) );
    exit( -1 );
#endif
}
// Host wrapper from the question: copies `input` into the CUDA array cuArray,
// binds it to the texture reference tex1, launches implement_x_sobel and copies
// the device result back into `output`.
//
// input     - host buffer with the source pixels
// output    - host buffer that receives height*widthStep bytes of result
// width     - image width, used for the array size and the grid size
// height    - image height, used for the array size and the grid size
// widthStep - used as a byte count in the copies and the allocation, and
//             divided by sizeof(float) before being handed to the kernel
void kernelcall(float* input,float* output,int width,int height,int widthStep){
//cudaChannelFormatDesc channelDesc=cudaCreateChannelDesc(32,32,0,0,cudaChannelFormatKindFloat);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
//cudaArray *cuArray;
CudaSafeCall(cudaMallocArray(&cuArray,&channelDesc,width,height));
// NOTE(review): this is a linear (1D) copy of widthStep*height bytes into an
// array that was allocated as width x height floats; its return code is not checked.
cudaMemcpyToArray(cuArray,0,0,input,widthStep*height,cudaMemcpyHostToDevice);
// Texture setup: clamp addressing on both axes, linear filtering.
tex1.addressMode[0]=cudaAddressModeClamp;
tex1.addressMode[1]=cudaAddressModeClamp;
tex1.filterMode=cudaFilterModeLinear;
cudaBindTextureToArray(tex1,cuArray,channelDesc);
// NOTE(review): `normalized` is assigned after the texture has already been bound.
tex1.normalized=false;
float * D_output_x;
// `garbage` stays NULL; it is only passed through to the kernel and to cudaFree.
float * garbage=NULL;
CudaSafeCall(cudaMalloc(&D_output_x,widthStep*height));
// 16x16 threads per block; the grid size is rounded up so the whole image is covered.
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(width+blocksize.x-1)/blocksize.x;
gridsize.y=(height+blocksize.y-1)/blocksize.y;
// The kernel receives the stride in float elements (widthStep/sizeof(float)).
implement_x_sobel<<<gridsize,blocksize>>>(garbage,D_output_x,width,height,widthStep/sizeof(float));
cudaThreadSynchronize();
CudaCheckError();
CudaSafeCall(cudaMemcpy(output,D_output_x,height*widthStep,cudaMemcpyDeviceToHost));
// NOTE(review): tex1 is not unbound before its backing array is freed below.
cudaFree(D_output_x);
cudaFree(garbage);
cudaFreeArray(cuArray);
}
The result is really messed up; it didn't look like the original image at all.
Result:
I changed some lines of the code to
// Modified fragment of main() from the question: the source pointer is now a
// float* as well, and a cvNormalize call was added before displaying.
// NOTE(review): `image->imageData` is reinterpreted as float here; in the
// earlier listing `image` was loaded with CV_LOAD_IMAGE_GRAYSCALE and read
// through an unsigned char pointer.
float *pseudo_input=(float *)image->imageData;
float *output=(float*)image2->imageData;
float *input=(float*)image3->imageData;
// NOTE(review): `image4` is not declared anywhere in the visible code.
float *inputnormalized=(float *)image4->imageData;
int s=image->widthStep/sizeof(float);
// NOTE(review): the `<=` makes the outer loop also run for w == image->height.
for(int w=0;w<=(image->height);w++)
for(int h=0;h<(image->width*image->nChannels);h++)
{
input[w*s+h]= pseudo_input[w*s+h];
}
kernelcall(input, output, image->width,image->height, image->widthStep);
// NOTE(review): cvNormalize is given the raw float* buffers here; presumably it
// expects image/array objects — confirm against the OpenCV documentation.
cvNormalize(input,inputnormalized,0,255,NORM_MINMAX, CV_8UC1);
cvShowImage( "Original Image", image ); // Show our image inside it.
cvShowImage("Sobeled Image", image2);
But now I get an unhandled exception error.
回答1:
OpenCV rule number 1:
Never access the image data directly through the underlying data pointer unless absolutely necessary, e.g. when copying data to the GPU. Reference (Me :p)
Errors/Recommendations:
Instead of converting the image by looping through the image data pointer, use cvConvert to change the image data type. Looping is very prone to error.
When calling the function named kernelcall, you are passing the data pointer of float images, but passing the widthStep of the original 8-bit image. This is the main cause of the erroneous results, as it leads to incorrect indexing inside the kernel.
When performing a memory copy between two pitched pointers which have different widthSteps, ALWAYS use the 2D memory copy functions available in the CUDA Runtime, e.g. cudaMemcpy2D, cudaMemcpy2DToArray, etc. In your case, the cuArray has an unknown widthStep internally, and the input IplImage has a different widthStep than that of cuArray.
Avoid unnecessary headers, assignments and identifier declarations.
Add bounds checks inside the CUDA kernel, so that only those threads which fall inside the image perform memory reads/writes. It may cause a little divergence, but it's better than invalid memory reads/writes.
Revised Code (Tested):
Main.cpp
#include <iostream>
#include <opencv2/opencv.hpp>
#include "CudaKernel.h"
using namespace cv;
using namespace std;
int main(int argc, char** argv)
{
IplImage* image;
image = cvLoadImage("4555472_460s.jpg", CV_LOAD_IMAGE_GRAYSCALE);
if(!image )
{
cout << "Could not open or find the image" << std::endl;
return -1;
}
IplImage* image2 = cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
IplImage* image3 = cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
//Convert the input image to float
cvConvert(image,image3);
float *output = (float*)image2->imageData;
float *input = (float*)image3->imageData;
kernelcall(input, output, image->width,image->height, image3->widthStep);
//Normalize the output values from 0.0 to 1.0
cvScale(image2,image2,1.0/255.0);
cvShowImage("Original Image", image );
cvShowImage("Sobeled Image", image2);
cvWaitKey(0);
return 0;
}
CudaKernel.cu
#include<cuda.h>
#include<iostream>
#include "CudaKernel.h"
using namespace std;
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
texture <float,2,cudaReadModeElementType> tex1;
static cudaArray *cuArray = NULL;
//Kernel for x direction sobel
// X-direction Sobel kernel: one thread per output pixel.
//
// Launch layout: a 2D grid of 2D blocks that covers at least width x height
// threads; threads that fall outside the image return without touching memory.
// No shared memory is used. The source image is read through the texture
// reference tex1, which the host must have bound before the launch.
//
// output    - device buffer receiving the filter response
// width     - image width in pixels
// height    - image height in pixels
// widthStep - row stride of `output` in float ELEMENTS (not bytes)
__global__ void implement_x_sobel(float* output,int width,int height,int widthStep)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    //Make sure that thread is inside image bounds
    if(x >= width || y >= height)
        return;

    // Sample at texel centres (+0.5). With point filtering this selects the
    // same texels as the bare integer coordinates, and it stays correct if the
    // texture is ever switched to linear filtering.
    const float cx = x + 0.5f;
    const float cy = y + 0.5f;

    // The centre column of the X-Sobel kernel has zero weight, so only the
    // left (weights -1,-2,-1) and right (weights +1,+2,+1) columns are fetched.
    const float left  = tex2D(tex1,cx-1.0f,cy-1.0f)
                      + 2.0f*tex2D(tex1,cx-1.0f,cy)
                      + tex2D(tex1,cx-1.0f,cy+1.0f);
    const float right = tex2D(tex1,cx+1.0f,cy-1.0f)
                      + 2.0f*tex2D(tex1,cx+1.0f,cy)
                      + tex2D(tex1,cx+1.0f,cy+1.0f);

    // size_t arithmetic avoids int overflow of y*widthStep on very large images.
    output[(size_t)y*widthStep+x] = right - left;
}
// Unconditional CUDA status check: when `err` is not cudaSuccess, prints the
// file, line, numeric code and error string to stderr and terminates the
// process with exit code -1. Intended to be used through the checkCudaErrors
// macro, which supplies __FILE__ and __LINE__.
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    // Fast path: the call succeeded, nothing to report.
    if( err == cudaSuccess )
        return;

    const char *description = cudaGetErrorString( err );
    fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
            file, line, (int)err, description );
    exit(-1);
}
//Host Code
// Status check behind the CudaSafeCall macro. The check is compiled in only
// when CUDA_ERROR_CHECK is defined; otherwise the function body is empty and
// the status is ignored. On failure it prints the call site and the error
// string, then terminates the process with exit code -1.
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    // Fast path: the wrapped call succeeded.
    if ( err == cudaSuccess )
        return;

    const char *description = cudaGetErrorString( err );
    printf("cudaSafeCall() failed at %s:%i : %s\n",
           file, line, description );
    exit( -1 );
#endif
}
// Check behind the CudaCheckError macro. Compiled in only when
// CUDA_ERROR_CHECK is defined; otherwise the function body is empty. Fetches
// the status via cudaGetLastError() and, if it is not cudaSuccess, prints the
// call site and the error string, then terminates the process with exit
// code -1.
inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    const cudaError lastError = cudaGetLastError();

    // Fast path: no error is pending.
    if ( lastError == cudaSuccess )
        return;

    printf("cudaCheckError() failed at %s:%i : %s\n",
           file, line, cudaGetErrorString( lastError ) );
    exit( -1 );
#endif
}
// Host wrapper: runs the X-direction Sobel filter on a single-channel float
// image. Uploads `input` into a CUDA array, binds it to the texture reference
// tex1, launches implement_x_sobel and copies the result back to `output`.
// The call is synchronous: when it returns, `output` holds the result.
//
// input     - host pointer to the source rows (float pixels)
// output    - host pointer receiving the result; must hold widthStep*height bytes
// width     - image width in pixels
// height    - image height in pixels
// widthStep - row pitch in BYTES, shared by the input and output host buffers
//
// Invalid arguments (null buffer, empty image, pitch smaller than one row of
// floats) are reported on stderr and the function returns without touching
// `output`. CUDA errors are reported through CudaSafeCall / CudaCheckError,
// which only check when CUDA_ERROR_CHECK is defined.
void kernelcall(float* input,float* output,int width,int height,int widthStep)
{
    if (input == NULL || output == NULL || width <= 0 || height <= 0)
    {
        fprintf(stderr, "kernelcall: null buffer or empty image, nothing done\n");
        return;
    }

    const size_t rowBytes    = (size_t)width * sizeof(float);   // payload bytes per row
    if (widthStep <= 0 || (size_t)widthStep < rowBytes)
    {
        // The kernel would write past the end of each row of the output buffer.
        fprintf(stderr, "kernelcall: widthStep is smaller than one row of floats\n");
        return;
    }

    // Byte size of one pitched buffer; computed in size_t so large images
    // cannot overflow int arithmetic.
    const size_t bufferBytes   = (size_t)widthStep * (size_t)height;
    const int    pitchInFloats = (int)(widthStep / sizeof(float));

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    CudaSafeCall(cudaMallocArray(&cuArray,&channelDesc,width,height));

    //Never use 1D memory copy if host and device pointers have different widthStep.
    // You don't know the width step of CUDA array, so its better to use cudaMemcpy2D...
    CudaSafeCall(cudaMemcpy2DToArray(cuArray,0,0,input,widthStep,rowBytes,height,cudaMemcpyHostToDevice));
    CudaSafeCall(cudaBindTextureToArray(tex1,cuArray,channelDesc));

    float * D_output_x = NULL;
    CudaSafeCall(cudaMalloc(&D_output_x,bufferBytes));

    // 16x16 threads per block; the grid is rounded up so every pixel is covered.
    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x=(width+blocksize.x-1)/blocksize.x;
    gridsize.y=(height+blocksize.y-1)/blocksize.y;

    implement_x_sobel<<<gridsize,blocksize>>>(D_output_x,width,height,pitchInFloats);
    // Launch-configuration errors show up right away via cudaGetLastError();
    // errors raised while the kernel runs surface at the synchronisation.
    CudaCheckError();
    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
    CudaSafeCall(cudaDeviceSynchronize());

    //Don't forget to unbind the texture
    CudaSafeCall(cudaUnbindTexture(tex1));
    CudaSafeCall(cudaMemcpy(output,D_output_x,bufferBytes,cudaMemcpyDeviceToHost));

    CudaSafeCall(cudaFree(D_output_x));
    CudaSafeCall(cudaFreeArray(cuArray));
    // The array handle is a file-level static; clear it so no stale pointer survives.
    cuArray = NULL;
}
回答2:
Here:-
// Fragment quoted from the question's main(): hand-written copy of the 8-bit
// pixels into the float image.
unsigned char * pseudo_input=(unsigned char *)image->imageData;
float *output=(float*)image2->imageData;
float *input=(float*)image3->imageData;
// `s` comes from the 8-bit image's widthStep divided by sizeof(float), and is
// used below to index both the unsigned char source and the float destination.
int s=image->widthStep/sizeof(float);
// NOTE(review): the `<=` makes the outer loop also run for w == image->height.
for(int w=0;w<=(image->height);w++)
for(int h=0;h<(image->width*image->nChannels);h++)
{
input[w*s+h]= pseudo_input[w*s+h];
}
input is float* and pseudo_input is uchar*. Convert everything to float and then process. In the end, normalize between 0 and 255 using cvNormalize with NORM_MINMAX to get proper results.
来源:https://stackoverflow.com/questions/14358916/applying-sobel-edge-detection-with-cuda-and-opencv-on-a-grayscale-jpg-image