问题
I need to detect text in images..
Have some code here which works in most cases.. But not in all.. See attached input/output image
code
#include "string"
#include "fstream"
#include "/var/bin/opencv/include/opencv2/opencv.hpp"
using namespace std;
using namespace cv;
void detect_text(string input){
Mat large = imread(input);
Mat rgb;
// downsample and use it for processing
pyrDown(large, rgb);
Mat small;
cvtColor(rgb, small, CV_BGR2GRAY);
// morphological gradient
Mat grad;
Mat morphKernel = getStructuringElement(MORPH_ELLIPSE, Size(3, 3));
morphologyEx(small, grad, MORPH_GRADIENT, morphKernel);
// binarize
Mat bw;
threshold(grad, bw, 0.0, 255.0, THRESH_BINARY | THRESH_OTSU);
// connect horizontally oriented regions
Mat connected;
morphKernel = getStructuringElement(MORPH_RECT, Size(9, 1));
morphologyEx(bw, connected, MORPH_CLOSE, morphKernel);
// find contours
Mat mask = Mat::zeros(bw.size(), CV_8UC1);
vector<vector<Point> > contours;
vector<Vec4i> hierarchy;
findContours(connected, contours, hierarchy, CV_RETR_CCOMP, CV_CHAIN_APPROX_SIMPLE, Point(0, 0));
// filter contours
for(int idx = 0; idx >= 0; idx = hierarchy[idx][0]){
Rect rect = boundingRect(contours[idx]);
Mat maskROI(mask, rect);
maskROI = Scalar(0, 0, 0);
// fill the contour
drawContours(mask, contours, idx, Scalar(255, 255, 255), CV_FILLED);
// ratio of non-zero pixels in the filled region
double r = (double)countNonZero(maskROI) / (rect.width * rect.height);
// assume at least 45% of the area is filled if it contains text
if (r > 0.45 &&
(rect.height > 8 && rect.width > 8) // constraints on region size
// these two conditions alone are not very robust. better to use something
//like the number of significant peaks in a horizontal projection as a third condition
){
rectangle(rgb, rect, Scalar(0, 255, 0), 2);
}
}
imwrite(string("test_text_contours.jpg"), rgb);
}
int main(int argc, char* argv[]){
detect_text(string("input.jpg"));
}
input
output
update
/*
* Compile
* # g++ txtbin.cpp -o txtbin `pkg-config opencv --cflags --libs`
*
* Get opencv version
* # pkg-config --modversion opencv
*
* Run
* # ./txtbin input.jpg output.png
*/
#include "string"
#include "fstream"
#include "/var/bin/opencv/include/opencv2/opencv.hpp"
//#include "/usr/include/opencv2/opencv.hpp"
#include "/usr/include/boost/tuple/tuple.hpp"
using namespace std;
using namespace cv;
using namespace boost;
void CalcBlockMeanVariance(Mat& Img, Mat& Res, float blockSide=21, float contrast=0.01){
/*
* blockSide: set greater for larger fonts in image and vice versa
* contrast: set smaller for lower contrast image
*/
Mat I;
Img.convertTo(I, CV_32FC1);
Res = Mat::zeros(Img.rows / blockSide, Img.cols / blockSide, CV_32FC1);
Mat inpaintmask;
Mat patch;
Mat smallImg;
Scalar m, s;
for(int i = 0; i < Img.rows - blockSide; i += blockSide){
for(int j = 0; j < Img.cols - blockSide; j += blockSide){
patch = I(Range(i, i + blockSide + 1), Range(j, j + blockSide + 1));
meanStdDev(patch, m, s);
if(s[0] > contrast){
Res.at<float>(i / blockSide, j / blockSide) = m[0];
}
else{
Res.at<float>(i / blockSide, j / blockSide) = 0;
}
}
}
resize(I, smallImg, Res.size());
threshold(Res, inpaintmask, 0.02, 1.0, THRESH_BINARY);
Mat inpainted;
smallImg.convertTo(smallImg, CV_8UC1, 255);
inpaintmask.convertTo(inpaintmask, CV_8UC1);
inpaint(smallImg, inpaintmask, inpainted, 5, INPAINT_TELEA);
resize(inpainted, Res, Img.size());
Res.convertTo(Res, CV_32FC1, 1.0 / 255.0);
}
tuple<int, int, int, int> detect_text_box(string input, Mat& res, bool draw_contours=false){
Mat large = imread(input);
bool test_output = false;
int
top = large.rows,
bottom = 0,
left = large.cols,
right = 0;
int
rect_bottom,
rect_right;
Mat rgb;
// downsample and use it for processing
pyrDown(large, rgb);
pyrDown(rgb, rgb);
Mat small;
cvtColor(rgb, small, CV_BGR2GRAY);
// morphological gradient
Mat grad;
Mat morphKernel = getStructuringElement(MORPH_ELLIPSE, Size(3, 3));
morphologyEx(small, grad, MORPH_GRADIENT, morphKernel);
// binarize
Mat bw;
threshold(grad, bw, 0.0, 255.0, THRESH_BINARY | THRESH_OTSU);
// connect horizontally oriented regions
Mat connected;
morphKernel = getStructuringElement(MORPH_RECT, Size(9, 1));
morphologyEx(bw, connected, MORPH_CLOSE, morphKernel);
// find contours
Mat mask = Mat::zeros(bw.size(), CV_8UC1);
vector<vector<Point> > contours;
vector<Vec4i> hierarchy;
findContours(connected, contours, hierarchy, CV_RETR_CCOMP, CV_CHAIN_APPROX_SIMPLE, Point(0, 0));
Scalar color = Scalar(0, 255, 0);
Scalar color2 = Scalar(0, 0, 255);
int thickness = 2;
// filter contours
for(int idx = 0; idx >= 0; idx = hierarchy[idx][0]){
Rect rect = boundingRect(contours[idx]);
Mat maskROI(mask, rect);
maskROI = Scalar(0, 0, 0);
// fill the contour
drawContours(mask, contours, idx, Scalar(255, 255, 255), CV_FILLED);
// ratio of non-zero pixels in the filled region
double r = (double)countNonZero(maskROI) / (rect.width * rect.height);
// assume at least 25% of the area is filled if it contains text
if (r > 0.25 &&
(rect.height > 8 && rect.width > 8) // constraints on region size
// these two conditions alone are not very robust. better to use something
//like the number of significant peaks in a horizontal projection as a third condition
){
if(draw_contours){
rectangle(res, Rect(rect.x * 4, rect.y * 4, rect.width * 4, rect.height * 4), color, thickness);
}
if(test_output){
rectangle(rgb, rect, color, thickness);
}
if(rect.y < top){
top = rect.y;
}
rect_bottom = rect.y + rect.height;
if(rect_bottom > bottom){
bottom = rect_bottom;
}
if(rect.x < left){
left = rect.x;
}
rect_right = rect.x + rect.width;
if(rect_right > right){
right = rect_right;
}
}
}
if(draw_contours){
rectangle(res, Point(left * 4, top * 4), Point(right * 4, bottom * 4), color2, thickness);
}
if(test_output){
rectangle(rgb, Point(left, top), Point(right, bottom), color2, thickness);
imwrite(string("test_text_contours.jpg"), rgb);
}
return make_tuple(left * 4, top * 4, (right - left) * 4, (bottom - top) * 4);
}
int main(int argc, char* argv[]){
string input;
string output = "output.png";
int
width = 0,
height = 0,
blockside = 9;
bool
crop = false,
draw = false;
float margin = 0;
cout << "OpenCV version: " << CV_VERSION << endl;
// Return error if arguments are missing
if(argc < 3){
cerr << "\nUsage: txtbin input [options] output\n\n"
"Options:\n"
"\t-w <number> -- set max width (keeps aspect ratio)\n"
"\t-h <number> -- set max height (keeps aspect ratio)\n"
"\t-c -- crop text content contour\n"
"\t-m <number> -- add margins (number in %)\n"
"\t-b <number> -- set blockside\n"
"\t-d -- draw text content contours (debugging)\n" << endl;
return 1;
}
// Parse arguments
for(int i = 1; i < argc; i++){
if(i == 1){
input = string(argv[i]);
// Return error if input file is invalid
ifstream stream(input.c_str());
if(!stream.good()){
cerr << "Error: Input file is invalid!" << endl;
return 1;
}
}
else if(string(argv[i]) == "-w"){
width = atoi(argv[++i]);
}
else if(string(argv[i]) == "-h"){
height = atoi(argv[++i]);
}
else if(string(argv[i]) == "-c"){
crop = true;
}
else if(string(argv[i]) == "-m"){
margin = atoi(argv[++i]);
}
else if(string(argv[i]) == "-b"){
blockside = atoi(argv[++i]);
}
else if(string(argv[i]) == "-d"){
draw = true;
}
else if(i == argc - 1){
output = string(argv[i]);
}
}
Mat Img = imread(input, CV_LOAD_IMAGE_GRAYSCALE);
Mat res;
Img.convertTo(Img, CV_32FC1, 1.0 / 255.0);
CalcBlockMeanVariance(Img, res, blockside);
res = 1.0 - res;
res = Img + res;
threshold(res, res, 0.85, 1, THRESH_BINARY);
int
txt_x,
txt_y,
txt_width,
txt_height;
if(crop || draw){
tie(txt_x, txt_y, txt_width, txt_height) = detect_text_box(input, res, draw);
}
if(crop){
//res = res(Rect(txt_x, txt_y, txt_width, txt_height)).clone();
res = res(Rect(txt_x, txt_y, txt_width, txt_height));
}
if(margin){
int border = res.cols * margin / 100;
copyMakeBorder(res, res, border, border, border, border, BORDER_CONSTANT, Scalar(255, 255, 255));
}
float
width_input = res.cols,
height_input = res.rows;
bool resized = false;
// Downscale image
if(width > 0 && width_input > width){
float scale = width_input / width;
width_input /= scale;
height_input /= scale;
resized = true;
}
if(height > 0 && height_input > height){
float scale = height_input / height;
width_input /= scale;
height_input /= scale;
resized = true;
}
if(resized){
resize(res, res, Size(round(width_input), round(height_input)));
}
imwrite(output, res * 255);
return 0;
}
回答1:
Your detect_text code is very much similar to my text detection post here. If you have used that code, you will see that the input images in the original post are 1400 x 800. But your input images in this post and your previous post are usually four times as large. So, first you can try downsampling the input images twice. Also your text looks bit tilted, so you can try rotated rectangle instead of the upright rectangle. Then you can tune the parameters for your case. As I have mentioned in the code, the contour filtering criteria isn't very robust. After doing these changes to the code, I get a reasonable output as shown below. Note that I've highlighted the rotated rectangle of the detected text region in green.
Code:
void detect_text(string input){
Mat large = imread(input);
Mat rgb;
// downsample and use it for processing
pyrDown(large, rgb);
pyrDown(rgb, rgb);
Mat small;
cvtColor(rgb, small, CV_BGR2GRAY);
// morphological gradient
Mat grad;
Mat morphKernel = getStructuringElement(MORPH_ELLIPSE, Size(3, 3));
morphologyEx(small, grad, MORPH_GRADIENT, morphKernel);
// binarize
Mat bw;
threshold(grad, bw, 0.0, 255.0, THRESH_BINARY | THRESH_OTSU);
// connect horizontally oriented regions
Mat connected;
morphKernel = getStructuringElement(MORPH_RECT, Size(9, 1));
morphologyEx(bw, connected, MORPH_CLOSE, morphKernel);
// find contours
Mat mask = Mat::zeros(bw.size(), CV_8UC1);
vector<vector<Point> > contours;
vector<Vec4i> hierarchy;
findContours(connected, contours, hierarchy, CV_RETR_CCOMP, CV_CHAIN_APPROX_SIMPLE, Point(0, 0));
// filter contours
for(int idx = 0; idx >= 0; idx = hierarchy[idx][0]){
Rect rect = boundingRect(contours[idx]);
Mat maskROI(mask, rect);
maskROI = Scalar(0, 0, 0);
// fill the contour
drawContours(mask, contours, idx, Scalar(255, 255, 255), CV_FILLED);
RotatedRect rrect = minAreaRect(contours[idx]);
double r = (double)countNonZero(maskROI) / (rrect.size.width * rrect.size.height);
Scalar color;
int thickness = 1;
// assume at least 25% of the area is filled if it contains text
if (r > 0.25 &&
(rrect.size.height > 8 && rrect.size.width > 8) // constraints on region size
// these two conditions alone are not very robust. better to use something
//like the number of significant peaks in a horizontal projection as a third condition
){
thickness = 2;
color = Scalar(0, 255, 0);
}
else
{
thickness = 1;
color = Scalar(0, 0, 255);
}
Point2f pts[4];
rrect.points(pts);
for (int i = 0; i < 4; i++)
{
line(rgb, Point((int)pts[i].x, (int)pts[i].y), Point((int)pts[(i+1)%4].x, (int)pts[(i+1)%4].y), color, thickness);
}
}
imwrite("cont.jpg", rgb);
}
回答2:
You might find that using thresholding might not be the best way to determine text from background. A research group I used to work for used use Otsu's method for thresholding fluorescent cells. It doesn't work so well when the background is non-uniform, such as the shadow on the paper in the image.
Instead you might want to differentiate text from paper by comparing neighbouring pixels and look for sharp changes in colour. This worked much better for the work I did. You can then detect characters by highlighting neighbouring pixels with large changes in colour values.
来源:https://stackoverflow.com/questions/34415815/detect-text-in-images-with-opencv