- Function interface:
```cpp
void Detector::match(const std::vector<Mat>& sources, float threshold,
                     std::vector<Match>& matches,
                     const std::vector<String>& class_ids = std::vector<String>(),
                     OutputArrayOfArrays quantized_images = noArray(),
                     const std::vector<Mat>& masks = std::vector<Mat>()) const;
```
sources are the input images (one Mat per modality); threshold is a user-set score threshold between 0 and 100; matches receives the match results; class_ids names the template classes to search; quantized_images optionally returns the quantized images (the inputs to the ResponseMap); masks are optional masks.
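For context, here is a minimal usage sketch of my own (not from the source; the header path is an assumption that varies across OpenCV versions, e.g. 2.4 shipped linemod inside objdetect):

```cpp
#include <cstdio>
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/rgbd/linemod.hpp>  // header location differs across OpenCV versions

// Run matching on a single color image with a pre-trained LINE detector.
void runMatch(const cv::Ptr<cv::linemod::Detector>& detector, const cv::Mat& color)
{
    std::vector<cv::Mat> sources;
    sources.push_back(color);                  // one Mat per modality

    std::vector<cv::linemod::Match> matches;
    std::vector<cv::String> class_ids;         // empty = search every class
    std::vector<cv::Mat> quantized_images;     // optional: inspect the quantized maps

    detector->match(sources, 80.0f, matches, class_ids, quantized_images);

    for (const cv::linemod::Match& m : matches)
        std::printf("%s at (%d, %d), similarity %.1f\n",
                    m.class_id.c_str(), m.x, m.y, m.similarity);
}
```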
- Computing the linearly stored ResponseMap from the defined features
```cpp
// For each pyramid level, precompute linear memories for each modality
std::vector<Size> sizes;
for (int l = 0; l < pyramid_levels; ++l)
{
  int T = T_at_level[l];
  std::vector<LinearMemories>& lm_level = lm_pyramid[l];

  if (l > 0)
  {
    for (int i = 0; i < (int)quantizers.size(); ++i)
      quantizers[i]->pyrDown();
  }

  Mat quantized, spread_quantized;
  std::vector<Mat> response_maps;
  for (int i = 0; i < (int)quantizers.size(); ++i)
  {
    quantizers[i]->quantize(quantized);
    spread(quantized, spread_quantized, T);
    computeResponseMaps(spread_quantized, response_maps);

    LinearMemories& memories = lm_level[i];
    for (int j = 0; j < 8; ++j)
      linearize(response_maps[j], memories[j], T);

    if (quantized_images.needed())
      // use copyTo here to side step reference semantics.
      quantized.copyTo(quantized_images.getMatRef(static_cast<int>(l * quantizers.size() + i)));
  }

  sizes.push_back(quantized.size());
}
```
The steps above are fairly clear: the computed orientation-quantized image is first spread with step size T; the response maps are then computed from the spread quantized image. The input quantized image is a single image, while the output response_maps contains 8 images, one per quantized orientation.
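To make the spreading step concrete, here is a naive sketch of my own (the OpenCV source achieves the same effect by OR-ing T*T shifted copies of the whole image, with SSE): each pixel of the quantized image is a one-hot byte encoding one of the 8 orientations, and spreading ORs together all bytes in a T x T neighborhood so that a feature can later match with a positional tolerance of T.

```cpp
#include <opencv2/core.hpp>

// Naive orientation spreading: spread(y, x) = bitwise OR of quantized over
// the T x T neighborhood anchored at (y, x).
static void spreadSketch(const cv::Mat& quantized, cv::Mat& spread, int T)
{
    spread = cv::Mat::zeros(quantized.size(), CV_8U);
    for (int dy = 0; dy < T; ++dy)
        for (int dx = 0; dx < T; ++dx)
            for (int y = 0; y + dy < quantized.rows; ++y)
            {
                const uchar* src = quantized.ptr<uchar>(y + dy);
                uchar* dst = spread.ptr<uchar>(y);
                for (int x = 0; x + dx < quantized.cols; ++x)
                    dst[x] |= src[x + dx];   // accumulate orientation bits
            }
}
```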
The response maps are then rearranged into linearized storage, matching this figure from the paper:

The benefit is that during the template-matching lookup (matching is driven by the orientations of the feature points, as we will see below), the accessLinearMemory function can fetch the results for a given orientation very quickly.
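The layout itself is easy to miss in the source, so here is a condensed sketch of my own (the names are mine; the source keeps one such matrix per orientation and selects it by the feature's label): each H x W response map becomes T*T contiguous rows, where row ty*T+tx gathers every pixel whose coordinates are congruent to (tx, ty) modulo T.

```cpp
#include <opencv2/core.hpp>

// Linearize one response map: row (ty*T + tx) of the result holds every pixel
// of `response` at coordinates (tx, ty) modulo T, in raster order.
static cv::Mat linearizeSketch(const cv::Mat& response, int T)
{
    int W = response.cols / T, H = response.rows / T;   // decimated size
    cv::Mat linear(T * T, W * H, CV_8U);
    for (int ty = 0; ty < T; ++ty)
        for (int tx = 0; tx < T; ++tx)
        {
            uchar* mem = linear.ptr<uchar>(ty * T + tx);
            for (int y = ty; y < H * T; y += T)
                for (int x = tx; x < W * T; x += T)
                    *mem++ = response.at<uchar>(y, x);
        }
    return linear;
}

// Address the responses for a feature at pixel (fx, fy): pick the row matching
// the feature's offset inside its T x T cell, then skip to the decimated
// position. This mirrors what accessLinearMemory computes.
static const uchar* accessSketch(const cv::Mat& linear, int fx, int fy, int T, int W)
{
    const uchar* row = linear.ptr<uchar>((fy % T) * T + (fx % T));
    return row + (fy / T) * W + (fx / T);
}
```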
- Computing the similarity function at the top (coarsest) pyramid level
```cpp
static void similarity(const std::vector<Mat>& linear_memories, const Template& templ,
                       Mat& dst, Size size, int T)
{
  // 63 features or less is a special case because the max similarity per-feature is 4.
  // 255/4 = 63, so up to that many we can add up similarities in 8 bits without worrying
  // about overflow. Therefore here we use _mm_add_epi8 as the workhorse, whereas a more
  // general function would use _mm_add_epi16.
  CV_Assert(templ.features.size() <= 63);
  /// @todo Handle more than 255/MAX_RESPONSE features!!

  // Decimate input image size by factor of T
  int W = size.width / T;
  int H = size.height / T;

  // Feature dimensions, decimated by factor T and rounded up
  int wf = (templ.width - 1) / T + 1;
  int hf = (templ.height - 1) / T + 1;

  // Span is the range over which we can shift the template around the input image
  int span_x = W - wf;
  int span_y = H - hf;

  // Compute number of contiguous (in memory) pixels to check when sliding feature over
  // image. This allows template to wrap around left/right border incorrectly, so any
  // wrapped template matches must be filtered out!
  int template_positions = span_y * W + span_x + 1; // why add 1?
  //int template_positions = (span_y - 1) * W + span_x; // More correct?

  /// @todo In old code, dst is buffer of size m_U. Could make it something like
  /// (span_x)x(span_y) instead?
  dst = Mat::zeros(H, W, CV_8U);
  uchar* dst_ptr = dst.ptr<uchar>();

#if CV_SSE2
  volatile bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#if CV_SSE3
  volatile bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
#endif
#endif

  // Compute the similarity measure for this template by accumulating the contribution of
  // each feature
  for (int i = 0; i < (int)templ.features.size(); ++i)
  {
    // Add the linear memory at the appropriate offset computed from the location of
    // the feature in the template
    Feature f = templ.features[i];
    // Discard feature if out of bounds
    /// @todo Shouldn't actually see x or y < 0 here?
    if (f.x < 0 || f.x >= size.width || f.y < 0 || f.y >= size.height)
      continue;
    const uchar* lm_ptr = accessLinearMemory(linear_memories, f, T, W);

    // Now we do an aligned/unaligned add of dst_ptr and lm_ptr with template_positions elements
    int j = 0;
    // Process responses 16 at a time if vectorization possible
#if CV_SSE2
#if CV_SSE3
    if (haveSSE3)
    {
      // LDDQU may be more efficient than MOVDQU for unaligned load of next 16 responses
      for ( ; j < template_positions - 15; j += 16)
      {
        __m128i responses = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(lm_ptr + j));
        __m128i* dst_ptr_sse = reinterpret_cast<__m128i*>(dst_ptr + j);
        *dst_ptr_sse = _mm_add_epi8(*dst_ptr_sse, responses);
      }
    }
    else
#endif
    if (haveSSE2)
    {
      // Fall back to MOVDQU
      for ( ; j < template_positions - 15; j += 16)
      {
        __m128i responses = _mm_loadu_si128(reinterpret_cast<const __m128i*>(lm_ptr + j));
        __m128i* dst_ptr_sse = reinterpret_cast<__m128i*>(dst_ptr + j);
        *dst_ptr_sse = _mm_add_epi8(*dst_ptr_sse, responses);
      }
    }
#endif
    for ( ; j < template_positions; ++j)
      dst_ptr[j] = uchar(dst_ptr[j] + lm_ptr[j]);
  }
}
```
Before accumulating, the number of shift positions is computed from the template size and the image size ("Compute number of contiguous (in memory) pixels to check when sliding feature over image"). accessLinearMemory directly returns the entire linearized response row for a feature's specific orientation, so no actual window shifting of the template takes place here; the responses are fetched purely by linearized addressing. Adding up the responses of all features then yields the similarity map. The corresponding idea from the paper is illustrated in the figure:

Thus each pixel of the resulting similarity map is the response of the template with its anchor cell placed at that position. In the best case every feature scores the maximum response of 4, and an 8-bit image can hold at most 255; since 255/4 = 63, this is why the source limits templates to 63 features here.
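As a sanity check on that arithmetic, here is a sketch of the normalization (my assumption of how the raw sum maps to the 0-100 score compared against the user threshold; the library's exact scaling and rounding may differ):

```cpp
// Convert the raw 8-bit similarity sum into a 0-100 score. Each feature
// contributes at most 4, so 63 features all scoring 4 give raw = 252, i.e. 100%.
static float rawToPercent(int raw, int num_features)
{
    return (raw * 100.0f) / (4 * num_features);
}
```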
The author's comments around the shift count also contain a few @todo marks and open questions. W and H give the raw number of shift positions over the whole decimated image, but the template itself has extent, so the number of T-cells it covers has to be subtracted to arrive at a memory-safe count of shift positions. Personally I suspect the +1 can read out of bounds; dropping it seems safer, though whether it matters depends on the template and image sizes.
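Plugging hypothetical sizes into the same arithmetic makes the count, and the wrap-around caveat, concrete:

```cpp
#include <cstdio>

// Same arithmetic as in similarity(), with made-up sizes (not library code).
int main()
{
    int T = 4;
    int W = 640 / T, H = 480 / T;            // 160 x 120 decimated image
    int wf = (100 - 1) / T + 1;              // 100-px-wide template -> 25 cells
    int hf = (80 - 1) / T + 1;               // 80-px-tall template  -> 20 cells
    int span_x = W - wf, span_y = H - hf;    // 135 and 100
    int template_positions = span_y * W + span_x + 1;  // 100*160 + 135 + 1 = 16136
    std::printf("template_positions = %d\n", template_positions);
    // A linear position j maps back to anchor (j % W, j / W); entries with
    // j % W > span_x wrap around the right border and must be filtered out.
    return 0;
}
```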
Finally, note the SSE-accelerated accesses: j += 16 advances by 16 uchar slots, and a uchar occupies 8 bits, so each step moves exactly 128 bits of memory, matching the width of an SSE register. The final stretch of fewer than 128 bits is finished off by the scalar loop. reinterpret_cast<const __m128i*> is a forced pointer cast; lm_ptr is a plain uchar pointer.
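A standalone toy (assuming an x86 build with SSE2) showing the 16-byte step described above:

```cpp
#include <emmintrin.h>   // SSE2 intrinsics: _mm_loadu_si128, _mm_add_epi8
#include <cstdio>

int main()
{
    unsigned char dst[16] = {0};
    unsigned char lm[16];
    for (int k = 0; k < 16; ++k)
        lm[k] = (unsigned char)(k % 5);   // fake per-position responses 0..4

    // One step of the inner loop: load 16 responses (128 bits) and perform
    // 16 parallel 8-bit additions into the accumulator.
    __m128i responses = _mm_loadu_si128(reinterpret_cast<const __m128i*>(lm));
    __m128i acc       = _mm_loadu_si128(reinterpret_cast<const __m128i*>(dst));
    acc = _mm_add_epi8(acc, responses);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), acc);

    for (int k = 0; k < 16; ++k)
        std::printf("%u ", dst[k]);       // 0 1 2 3 4 0 1 2 3 4 ...
    std::printf("\n");
    return 0;
}
```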
/////////////////////////////
The above is a personal study note; please point out any mistakes.