/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

/****************************************************************************************\
                                    Base Image Filter
\****************************************************************************************/

namespace cv
{
BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
BaseRowFilter::~BaseRowFilter() {}

BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
BaseColumnFilter::~BaseColumnFilter() {}
void BaseColumnFilter::reset() {}

BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
BaseFilter::~BaseFilter() {}
void BaseFilter::reset() {}
/*
 Various border types, image boundaries are denoted with '|'

 * BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
 * BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
 * BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
 * BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
 * BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii  with some specified 'i'
 */
int borderInterpolate( int p, int len, int borderType )
{
    if( (unsigned)p < (unsigned)len )
        ;
    else if( borderType == BORDER_REPLICATE )
        p = p < 0 ? 0 : len - 1;
    else if( borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101 )
    {
        int delta = borderType == BORDER_REFLECT_101;
        if( len == 1 )
            return 0;
        do
        {
            if( p < 0 )
                p = -p - 1 + delta;
            else
                p = len - 1 - (p - len) - delta;
        }
        while( (unsigned)p >= (unsigned)len );
    }
    else if( borderType == BORDER_WRAP )
    {
        if( p < 0 )
            p -= ((p-len+1)/len)*len;
        if( p >= len )
            p %= len;
    }
    else if( borderType == BORDER_CONSTANT )
        p = -1;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported border type" );
    return p;
}
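/*
 A quick sanity check of the mapping above (illustrative values only, added for
 clarity; len = 8, so valid indices are 0..7 -- compare with the BORDER_*
 diagrams above):

     borderInterpolate(-2, 8, BORDER_REPLICATE)   == 0    // clamp to the edge
     borderInterpolate(-2, 8, BORDER_REFLECT)     == 1    // ..ba|abcdefgh
     borderInterpolate(-2, 8, BORDER_REFLECT_101) == 2    // ..cb|abcdefgh
     borderInterpolate( 9, 8, BORDER_WRAP)        == 1    // ..gh|abcdefgh|ab..
     borderInterpolate(-2, 8, BORDER_CONSTANT)    == -1   // caller substitutes the constant
*/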
FilterEngine::FilterEngine()
{
    srcType = dstType = bufType = -1;
    rowBorderType = columnBorderType = BORDER_REPLICATE;
    bufStep = startY = startY0 = endY = rowCount = dstY = 0;
    maxWidth = 0;

    wholeSize = Size(-1,-1);
}
FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
                            const Ptr<BaseRowFilter>& _rowFilter,
                            const Ptr<BaseColumnFilter>& _columnFilter,
                            int _srcType, int _dstType, int _bufType,
                            int _rowBorderType, int _columnBorderType,
                            const Scalar& _borderValue )
{
    init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
         _rowBorderType, _columnBorderType, _borderValue);
}

FilterEngine::~FilterEngine()
{
}
void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
                         const Ptr<BaseRowFilter>& _rowFilter,
                         const Ptr<BaseColumnFilter>& _columnFilter,
                         int _srcType, int _dstType, int _bufType,
                         int _rowBorderType, int _columnBorderType,
                         const Scalar& _borderValue )
{
    _srcType = CV_MAT_TYPE(_srcType);
    _bufType = CV_MAT_TYPE(_bufType);
    _dstType = CV_MAT_TYPE(_dstType);

    srcType = _srcType;
    int srcElemSize = (int)getElemSize(srcType);
    dstType = _dstType;
    bufType = _bufType;

    filter2D = _filter2D;
    rowFilter = _rowFilter;
    columnFilter = _columnFilter;

    if( _columnBorderType < 0 )
        _columnBorderType = _rowBorderType;

    rowBorderType = _rowBorderType;
    columnBorderType = _columnBorderType;

    CV_Assert( columnBorderType != BORDER_WRAP );

    if( isSeparable() )
    {
        CV_Assert( !rowFilter.empty() && !columnFilter.empty() );
        ksize = Size(rowFilter->ksize, columnFilter->ksize);
        anchor = Point(rowFilter->anchor, columnFilter->anchor);
    }
    else
    {
        CV_Assert( bufType == srcType );
        ksize = filter2D->ksize;
        anchor = filter2D->anchor;
    }

    CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
               0 <= anchor.y && anchor.y < ksize.height );

    borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
    borderTab.resize( std::max(ksize.width - 1, 1)*borderElemSize);

    maxWidth = bufStep = 0;
    constBorderRow.clear();

    if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT )
    {
        constBorderValue.resize(srcElemSize*(ksize.width - 1));
        scalarToRawData(_borderValue, &constBorderValue[0], srcType,
                        (ksize.width-1)*CV_MAT_CN(srcType));
    }

    wholeSize = Size(-1,-1);
}
static const int VEC_ALIGN = CV_MALLOC_ALIGN;
int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows)
{
    int i, j;

    wholeSize = _wholeSize;
    roi = _roi;
    CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
               roi.x + roi.width <= wholeSize.width &&
               roi.y + roi.height <= wholeSize.height );

    int esz = (int)getElemSize(srcType);
    int bufElemSize = (int)getElemSize(bufType);
    const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;

    if( _maxBufRows < 0 )
        _maxBufRows = ksize.height + 3;
    _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1);

    if( maxWidth < roi.width || _maxBufRows != (int)rows.size() )
    {
        rows.resize(_maxBufRows);
        maxWidth = std::max(maxWidth, roi.width);
        int cn = CV_MAT_CN(srcType);
        srcRow.resize(esz*(maxWidth + ksize.width - 1));
        if( columnBorderType == BORDER_CONSTANT )
        {
            constBorderRow.resize(getElemSize(bufType)*(maxWidth+VEC_ALIGN));
            uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
            int n = (int)constBorderValue.size(), N;
            N = (maxWidth + ksize.width - 1)*esz;
            tdst = isSeparable() ? &srcRow[0] : dst;

            for( i = 0; i < N; i += n )
            {
                n = std::min( n, N - i );
                for(j = 0; j < n; j++)
                    tdst[i+j] = constVal[j];
            }

            if( isSeparable() )
                (*rowFilter)(&srcRow[0], dst, maxWidth, cn);
        }

        int maxBufStep = bufElemSize*(int)alignSize(maxWidth +
            (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
        ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN);
    }

    // adjust bufstep so that the used part of the ring buffer stays compact in memory
    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);

    dx1 = std::max(anchor.x - roi.x, 0);
    dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);

    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_CONSTANT )
        {
            int nr = isSeparable() ? 1 : (int)rows.size();
            for( i = 0; i < nr; i++ )
            {
                uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i;
                memcpy( dst, constVal, dx1*esz );
                memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz );
            }
        }
        else
        {
            int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
            int* btab = (int*)&borderTab[0];

            for( i = 0; i < dx1; i++ )
            {
                int p0 = borderInterpolate(i-dx1, wholeWidth, rowBorderType)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[i*btab_esz + j] = p0 + j;
            }

            for( i = 0; i < dx2; i++ )
            {
                int p0 = borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[(i + dx1)*btab_esz + j] = p0 + j;
            }
        }
    }

    rowCount = dstY = 0;
    startY = startY0 = std::max(roi.y - anchor.y, 0);
    endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
    if( !columnFilter.empty() )
        columnFilter->reset();
    if( !filter2D.empty() )
        filter2D->reset();

    return startY;
}
int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
                        bool isolated, int maxBufRows)
{
    Rect srcRoi = _srcRoi;

    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
               srcRoi.width >= 0 && srcRoi.height >= 0 &&
               srcRoi.x + srcRoi.width <= src.cols &&
               srcRoi.y + srcRoi.height <= src.rows );

    Point ofs;
    Size wholeSize(src.cols, src.rows);
    if( !isolated )
        src.locateROI( wholeSize, ofs );
    start( wholeSize, srcRoi + ofs, maxBufRows );

    return startY - ofs.y;
}
int FilterEngine::remainingInputRows() const
{
    return endY - startY - rowCount;
}

int FilterEngine::remainingOutputRows() const
{
    return roi.height - dstY;
}
int FilterEngine::proceed( const uchar* src, int srcstep, int count,
                           uchar* dst, int dststep )
{
    CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );

    const int *btab = &borderTab[0];
    int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
    uchar** brows = &rows[0];
    int bufRows = (int)rows.size();
    int cn = CV_MAT_CN(bufType);
    int width = roi.width, kwidth = ksize.width;
    int kheight = ksize.height, ay = anchor.y;
    int _dx1 = dx1, _dx2 = dx2;
    int width1 = roi.width + kwidth - 1;
    int xofs1 = std::min(roi.x, anchor.x);
    bool isSep = isSeparable();
    bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
    int dy = 0, i = 0;

    src -= xofs1*esz;
    count = std::min(count, remainingInputRows());

    CV_Assert( src && dst && count > 0 );

    for(;; dst += dststep*i, dy += i)
    {
        int dcount = bufRows - ay - startY - rowCount + roi.y;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcstep )
        {
            int bi = (startY - startY0 + rowCount) % bufRows;
            uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            uchar* row = isSep ? &srcRow[0] : brow;

            if( ++rowCount > bufRows )
            {
                --rowCount;
                ++startY;
            }

            memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );

            if( makeBorder )
            {
                if( btab_esz*(int)sizeof(int) == esz )
                {
                    const int* isrc = (const int*)src;
                    int* irow = (int*)row;

                    for( i = 0; i < _dx1*btab_esz; i++ )
                        irow[i] = isrc[btab[i]];
                    for( i = 0; i < _dx2*btab_esz; i++ )
                        irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
                }
                else
                {
                    for( i = 0; i < _dx1*esz; i++ )
                        row[i] = src[btab[i]];
                    for( i = 0; i < _dx2*esz; i++ )
                        row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
                }
            }

            if( isSep )
                (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
        }

        int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
                                         wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
            else
            {
                CV_Assert( srcY >= startY );
                if( srcY >= startY + rowCount )
                    break;
                int bi = (srcY - startY0) % bufRows;
                brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            }
        }
        if( i < kheight )
            break;
        i -= kheight - 1;
        if( isSeparable() )
            (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
        else
            (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
    }

    dstY += dy;
    CV_Assert( dstY <= roi.height );

    return dy;
}
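/*
 Streaming sketch (illustrative, not taken from the library): proceed() may be
 fed the input in arbitrary row chunks; each call returns the number of output
 rows it produced, and the caller advances the destination pointer by that
 amount. Assuming a prepared FilterEngine 'f' with types matching src/dst:

     int y = f->start(src), dstY = 0;
     while( y < f->endY )
     {
         int n = std::min(16, f->endY - y);
         dstY += f->proceed( src.data + y*src.step, (int)src.step, n,
                             dst.data + dstY*dst.step, (int)dst.step );
         y += n;
     }
*/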
void FilterEngine::apply(const Mat& src, Mat& dst,
    const Rect& _srcRoi, Point dstOfs, bool isolated)
{
    CV_Assert( src.type() == srcType && dst.type() == dstType );

    Rect srcRoi = _srcRoi;
    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
               dstOfs.x + srcRoi.width <= dst.cols &&
               dstOfs.y + srcRoi.height <= dst.rows );

    int y = start(src, srcRoi, isolated);
    proceed( src.data + y*src.step, (int)src.step, endY - startY,
             dst.data + dstOfs.y*dst.step + dstOfs.x*dst.elemSize(), (int)dst.step );
}
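/*
 Typical one-shot usage (illustrative sketch, not part of this file): an engine
 is normally obtained from a factory such as createSeparableLinearFilter() and
 then applied to a whole image in a single call. The file name below is
 hypothetical:

     Mat src = imread("input.png", 0), dst(src.size(), src.type());
     Mat kx = getGaussianKernel(5, -1, CV_32F);
     Mat ky = getGaussianKernel(5, -1, CV_32F);
     Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(), dst.type(), kx, ky);
     f->apply(src, dst);
*/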
/****************************************************************************************\
*                                Separable linear filter                                 *
\****************************************************************************************/
int getKernelType(const Mat& _kernel, Point anchor)
{
    CV_Assert( _kernel.channels() == 1 );
    int i, sz = _kernel.rows*_kernel.cols;

    Mat kernel;
    _kernel.convertTo(kernel, CV_64F);

    const double* coeffs = (double*)kernel.data;
    double sum = 0;
    int type = KERNEL_SMOOTH + KERNEL_INTEGER;
    if( (_kernel.rows == 1 || _kernel.cols == 1) &&
        anchor.x*2 + 1 == _kernel.cols &&
        anchor.y*2 + 1 == _kernel.rows )
        type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);

    for( i = 0; i < sz; i++ )
    {
        double a = coeffs[i], b = coeffs[sz - i - 1];
        if( a != b )
            type &= ~KERNEL_SYMMETRICAL;
        if( a != -b )
            type &= ~KERNEL_ASYMMETRICAL;
        if( a < 0 )
            type &= ~KERNEL_SMOOTH;
        if( a != saturate_cast<int>(a) )
            type &= ~KERNEL_INTEGER;
        sum += a;
    }

    if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
        type &= ~KERNEL_SMOOTH;
    return type;
}
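/*
 Worked example (illustrative, added for clarity): for the 1x3 kernel [1 2 1]
 with anchor (1,0), every coefficient equals its mirror (a == b) and all
 entries are non-negative integers, but the sum is 4, so the final check
 clears KERNEL_SMOOTH and the result is KERNEL_SYMMETRICAL | KERNEL_INTEGER.
 For [-1 0 1] the mirrored coefficients satisfy a == -b instead, giving
 KERNEL_ASYMMETRICAL | KERNEL_INTEGER.
*/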
struct RowNoVec
{
    RowNoVec() {}
    RowNoVec(const Mat&) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct ColumnNoVec
{
    ColumnNoVec() {}
    ColumnNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct SymmRowSmallNoVec
{
    SymmRowSmallNoVec() {}
    SymmRowSmallNoVec(const Mat&, int) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct SymmColumnSmallNoVec
{
    SymmColumnSmallNoVec() {}
    SymmColumnSmallNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct FilterNoVec
{
    FilterNoVec() {}
    FilterNoVec(const Mat&, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

#if CV_SSE2
///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////

struct RowVec_8u32s
{
    RowVec_8u32s() { smallValues = false; }
    RowVec_8u32s( const Mat& _kernel )
    {
        kernel = _kernel;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = ((const int*)kernel.data)[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        const int* _kx = (const int*)kernel.data;
        width *= cn;

        if( smallValues )
        {
            for( ; i <= width - 16; i += 16 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
                __m128i x0, x1, x2, x3;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_loadu_si128((const __m128i*)src);
                    x2 = _mm_unpackhi_epi8(x0, z);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x3 = _mm_mulhi_epi16(x2, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    x2 = _mm_mullo_epi16(x2, f);

                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                    s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
                    s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
                    s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
                }

                _mm_store_si128((__m128i*)(dst + i), s0);
                _mm_store_si128((__m128i*)(dst + i + 4), s1);
                _mm_store_si128((__m128i*)(dst + i + 8), s2);
                _mm_store_si128((__m128i*)(dst + i + 12), s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_cvtsi32_si128(*(const int*)src);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                }
                _mm_store_si128((__m128i*)(dst + i), s0);
            }
        }
        return i;
    }

    Mat kernel;
    bool smallValues;
};
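/*
 Scalar model of what the vectorized paths below compute (illustrative comment,
 added for clarity): for a small symmetric kernel the row filter folds the
 mirrored taps together, e.g. for _ksize == 3

     dst[i] = kx[0]*src[i] + kx[1]*(src[i - cn] + src[i + cn]);

 while the asymmetric branch uses the difference (src[i + cn] - src[i - cn]).
 The special cases kx = {2, 1} and kx = {-2, 1} avoid the multiplications
 entirely, which is why they are dispatched separately.
*/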
struct SymmRowSmallVec_8u32s
{
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = ((const int*)kernel.data)[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
    {
        int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = (const int*)kernel.data + _ksize/2;
        if( !smallValues )
            return 0;

        src += (_ksize/2)*cn;
        width *= cn;

        __m128i z = _mm_setzero_si128();
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
                        y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
                        _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                            k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);
                    k2 = _mm_packs_epi32(k2, k2);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        t1 = _mm_mulhi_epi16(y0, k2);
                        t0 = _mm_mullo_epi16(y0, k2);
                        y0 = _mm_mullo_epi16(y1, k2);
                        y1 = _mm_mulhi_epi16(y1, k2);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0, y1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        x1 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(x0, x1);
                        z1 = _mm_unpackhi_epi16(x0, x1);

                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(y0, y1);
                        z3 = _mm_unpackhi_epi16(y0, y1);
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                        k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                        k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                k0 = _mm_packs_epi32(k0, k0);
                k1 = _mm_packs_epi32(k1, k1);
                k2 = _mm_packs_epi32(k2, k2);

                for( ; i <= width - 16; i += 16, src += 16 )
                {
                    __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                    x0 = _mm_loadu_si128((__m128i*)(src + cn));
                    x2 = _mm_loadu_si128((__m128i*)(src - cn));
                    y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                    x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));

                    x2 = _mm_mulhi_epi16(x0, k1);
                    x0 = _mm_mullo_epi16(x0, k1);
                    z0 = _mm_unpacklo_epi16(x0, x2);
                    z1 = _mm_unpackhi_epi16(x0, x2);
                    y1 = _mm_mulhi_epi16(y0, k1);
                    y0 = _mm_mullo_epi16(y0, k1);
                    z2 = _mm_unpacklo_epi16(y0, y1);
                    z3 = _mm_unpackhi_epi16(y0, y1);

                    x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
                    x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
                    y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                    y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                    t1 = _mm_mulhi_epi16(y0, k2);
                    t0 = _mm_mullo_epi16(y0, k2);
                    y0 = _mm_mullo_epi16(y1, k2);
                    y1 = _mm_mulhi_epi16(y1, k2);
                    z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                    z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                    z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                    z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                    _mm_store_si128((__m128i*)(dst + i), z0);
                    _mm_store_si128((__m128i*)(dst + i + 4), z1);
                    _mm_store_si128((__m128i*)(dst + i + 8), z2);
                    _mm_store_si128((__m128i*)(dst + i + 12), z3);
                }
            }
        }

        src -= (_ksize/2)*cn;
        kx -= _ksize/2;
        for( ; i <= width - 4; i += 4, src += 4 )
        {
            __m128i f, s0 = z, x0, x1;

            for( k = j = 0; k < _ksize; k++, j += cn )
            {
                f = _mm_cvtsi32_si128(kx[k]);
                f = _mm_shuffle_epi32(f, 0);
                f = _mm_packs_epi32(f, f);

                x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
                x0 = _mm_unpacklo_epi8(x0, z);
                x1 = _mm_mulhi_epi16(x0, f);
                x0 = _mm_mullo_epi16(x0, f);
                s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
            }

            _mm_store_si128((__m128i*)(dst + i), s0);
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
    bool smallValues;
};
struct SymmColumnVec_32s8u
{
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const __m128i *S, *S2;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128i x0, x1;
                S = (const __m128i*)(src[0] + i);
                s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
                s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
                s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128i x0;
                __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128i x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, s0 = d4;
                __m128i x0;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
struct SymmColumnSmallVec_32s16s
{
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        __m128 df4 = _mm_set1_ps(delta);
        __m128i d4 = _mm_cvtps_epi32(df4);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1; __m128i x0, x1;
                    s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
                    s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);

                    x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3;
                    s0 = _mm_load_si128((__m128i*)(S2 + i));
                    s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S0 + i));
                    s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
                    s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = df4, s1 = df4;
                    __m128i x0, x1;
                    x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
/////////////////////////////////////// 32f //////////////////////////////////

struct RowVec_32f
{
    RowVec_32f() {}
    RowVec_32f( const Mat& _kernel )
    {
        kernel = _kernel;
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* _kx = (const float*)kernel.data;
        width *= cn;

        for( ; i <= width - 8; i += 8 )
        {
            const float* src = (const float*)_src + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
                f = _mm_load_ss(_kx+k);
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_ps(src);
                x1 = _mm_loadu_ps(src + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
            }
            _mm_store_ps(dst + i, s0);
            _mm_store_ps(dst + i + 4, s1);
        }
        return i;
    }

    Mat kernel;
};
struct SymmRowSmallVec_32f
{
    SymmRowSmallVec_32f() {}
    SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* src = (const float*)_src + (_ksize/2)*cn;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float* kx = (const float*)kernel.data + _ksize/2;
        width *= cn;

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
                        y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn*2);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn*2);
                        y0 = _mm_loadu_ps(src - cn*2 + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn*2 + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));

                        x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                        y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);
                        x0 = _mm_sub_ps(x0, x2);
                        y0 = _mm_sub_ps(y0, y2);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);

                        x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                for( ; i <= width - 8; i += 8, src += 8 )
                {
                    __m128 x0, x2, y0, y2;
                    x0 = _mm_loadu_ps(src + cn);
                    x2 = _mm_loadu_ps(src - cn);
                    y0 = _mm_loadu_ps(src + cn + 4);
                    y2 = _mm_loadu_ps(src - cn + 4);

                    x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                    y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);

                    x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                    y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                    x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                    y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            }
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
};
struct SymmColumnVec_32f
{
    SymmColumnVec_32f() { symmetryType=0; }
    SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);

                    x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                _mm_storeu_ps(dst + i, s0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128 x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, x0, s0 = d4;

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);

                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                _mm_storeu_ps(dst + i, s0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
struct SymmColumnSmallVec_32f
{
    SymmColumnSmallVec_32f() { symmetryType=0; }
    SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, x0, x1;
                    s0 = _mm_load_ps(S1 + i);
                    s1 = _mm_load_ps(S1 + i + 4);
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
                    x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
                    x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3;
                    s0 = _mm_load_ps(S2 + i);
                    s1 = _mm_load_ps(S2 + i + 4);
                    s2 = _mm_load_ps(S0 + i);
                    s3 = _mm_load_ps(S0 + i + 4);
                    s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
                    s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = d4, s1 = d4, x0, x1;
                    x0 = _mm_sub_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
                    x1 = _mm_sub_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
/////////////////////////////// non-separable filters ///////////////////////////////

///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////

struct FilterVec_8u
{
    FilterVec_8u() {}
    FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* dst, int width) const
    {
        const float* kf = (const float*)&coeffs[0];
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            x0 = _mm_packus_epi16(x0, x1);
            _mm_storeu_si128((__m128i*)(dst + i), x0);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
        }

        return i;
    }

    int _nz;
    vector<uchar> coeffs;
    float delta;
};
struct FilterVec_8u16s
{
    FilterVec_8u16s() {}
    FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* _dst, int width) const
    {
        const float* kf = (const float*)&coeffs[0];
        short* dst = (short*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            _mm_storeu_si128((__m128i*)(dst + i), x0);
            _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            _mm_storel_epi64((__m128i*)(dst + i), x0);
        }

        return i;
    }

    int _nz;
    vector<uchar> coeffs;
    float delta;
};
struct FilterVec_32f
{
    FilterVec_32f() {}
    FilterVec_32f(const Mat& _kernel, int, double _delta)
    {
        delta = (float)_delta;
        vector<Point> coords;
        preprocess2DKernel(_kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        const float* kf = (const float*)&coeffs[0];
        const float** src = (const float**)_src;
        float* dst = (float*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);
                const float* S = src[k] + i;

                t0 = _mm_loadu_ps(S);
                t1 = _mm_loadu_ps(S + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_loadu_ps(S + 8);
                t1 = _mm_loadu_ps(S + 12);
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);
                t0 = _mm_loadu_ps(src[k] + i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }
            _mm_storeu_ps(dst + i, s0);
        }

        return i;
    }

    int _nz;
    vector<uchar> coeffs;
    float delta;
};
#else

typedef RowNoVec RowVec_8u32s;
typedef RowNoVec RowVec_32f;
typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
typedef ColumnNoVec SymmColumnVec_32s8u;
typedef ColumnNoVec SymmColumnVec_32f;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
typedef FilterNoVec FilterVec_8u;
typedef FilterNoVec FilterVec_8u16s;
typedef FilterNoVec FilterVec_32f;

#endif
template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
{
    RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
    {
        if( _kernel.isContinuous() )
            kernel = _kernel;
        else
            _kernel.copyTo(kernel);
        anchor = _anchor;
        ksize = kernel.rows + kernel.cols - 1;
        CV_Assert( kernel.type() == DataType<DT>::type &&
                   (kernel.rows == 1 || kernel.cols == 1));
        vecOp = _vecOp;
    }

    void operator()(const uchar* src, uchar* dst, int width, int cn)
    {
        int _ksize = ksize;
        const DT* kx = (const DT*)kernel.data;
        const ST* S;
        DT* D = (DT*)dst;
        int i, k;

        i = vecOp(src, dst, width, cn);
        width *= cn;

        for( ; i <= width - 4; i += 4 )
        {
            S = (const ST*)src + i;
            DT f = kx[0];
            DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];

            for( k = 1; k < _ksize; k++ )
            {
                S += cn; f = kx[k];
                s0 += f*S[0]; s1 += f*S[1];
                s2 += f*S[2]; s3 += f*S[3];
            }

            D[i] = s0; D[i+1] = s1;
            D[i+2] = s2; D[i+3] = s3;
        }

        for( ; i < width; i++ )
        {
            S = (const ST*)src + i;
            DT s0 = kx[0]*S[0];
            for( k = 1; k < _ksize; k++ )
            {
                S += cn;
                s0 += kx[k]*S[0];
            }
            D[i] = s0;
        }
    }

    Mat kernel;
    VecOp vecOp;
};
2032 template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
2033 public RowFilter<ST, DT, VecOp>
2035 SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
2036 const VecOp& _vecOp = VecOp())
2037 : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
2039 symmetryType = _symmetryType;
2040 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
2043 void operator()(const uchar* src, uchar* dst, int width, int cn)
2045 int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
2046 const DT* kx = (const DT*)this->kernel.data + ksize2;
2047 bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
2049 int i = this->vecOp(src, dst, width, cn), j, k;
2050 const ST* S = (const ST*)src + i + ksize2n;
2055 if( this->ksize == 1 && kx[0] == 1 )
2057 for( ; i <= width - 2; i += 2 )
2059 DT s0 = S[i], s1 = S[i+1];
2060 D[i] = s0; D[i+1] = s1;
2064 else if( this->ksize == 3 )
2066 if( kx[0] == 2 && kx[1] == 1 )
2067 for( ; i <= width - 2; i += 2, S += 2 )
2069 DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
2070 D[i] = s0; D[i+1] = s1;
2072 else if( kx[0] == -2 && kx[1] == 1 )
2073 for( ; i <= width - 2; i += 2, S += 2 )
2075 DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
2076 D[i] = s0; D[i+1] = s1;
2080 DT k0 = kx[0], k1 = kx[1];
2081 for( ; i <= width - 2; i += 2, S += 2 )
2083 DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
2084 D[i] = s0; D[i+1] = s1;
2088 else if( this->ksize == 5 )
2090 DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
2091 if( k0 == -2 && k1 == 0 && k2 == 1 )
2092 for( ; i <= width - 2; i += 2, S += 2 )
2094 DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
2095 DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
2096 D[i] = s0; D[i+1] = s1;
2099 for( ; i <= width - 2; i += 2, S += 2 )
2101 DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
2102 DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
2103 D[i] = s0; D[i+1] = s1;
2107 for( ; i < width; i++, S++ )
2110 for( k = 1, j = cn; k <= ksize2; k++, j += cn )
2111 s0 += kx[k]*(S[j] + S[-j]);
2117 if( this->ksize == 3 )
2119 if( kx[0] == 0 && kx[1] == 1 )
2120 for( ; i <= width - 2; i += 2, S += 2 )
2122 DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
2123 D[i] = s0; D[i+1] = s1;
2128 for( ; i <= width - 2; i += 2, S += 2 )
2130 DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
2131 D[i] = s0; D[i+1] = s1;
2135 else if( this->ksize == 5 )
2137 DT k1 = kx[1], k2 = kx[2];
2138 for( ; i <= width - 2; i += 2, S += 2 )
2140 DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
2141 DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
2142 D[i] = s0; D[i+1] = s1;
2146 for( ; i < width; i++, S++ )
2149 for( k = 1, j = cn; k <= ksize2; k++, j += cn )
2150 s0 += kx[k]*(S[j] - S[-j]);
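/*
  The column filters below receive `src` as an array of pointers to ksize
  consecutive rows of the intermediate buffer (FilterEngine keeps these in
  a ring buffer), advance it by one row per output row, and write `count`
  destination rows of `width` elements each. Conceptually (a hedged
  sketch, not code from this file):

      // rows[0..ksize-1] point at consecutive row-filtered lines
      // (*columnFilter)(rows, dstRow, dstStep, rowsToProcess, width*cn);
*/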
2160 template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
2162 typedef typename CastOp::type1 ST;
2163 typedef typename CastOp::rtype DT;
2165 ColumnFilter( const Mat& _kernel, int _anchor,
2166 double _delta, const CastOp& _castOp=CastOp(),
2167 const VecOp& _vecOp=VecOp() )
2169 if( _kernel.isContinuous() )
2172 _kernel.copyTo(kernel);
2174 ksize = kernel.rows + kernel.cols - 1;
2175 delta = saturate_cast<ST>(_delta);
2178 CV_Assert( kernel.type() == DataType<ST>::type &&
2179 (kernel.rows == 1 || kernel.cols == 1));
2182 void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2184 const ST* ky = (const ST*)kernel.data;
2188 CastOp castOp = castOp0;
2190 for( ; count--; dst += dststep, src++ )
2193 i = vecOp(src, dst, width);
2194 for( ; i <= width - 4; i += 4 )
2197 const ST* S = (const ST*)src[0] + i;
2198 ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
2199 s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
2201 for( k = 1; k < _ksize; k++ )
2203 S = (const ST*)src[k] + i; f = ky[k];
2204 s0 += f*S[0]; s1 += f*S[1];
2205 s2 += f*S[2]; s3 += f*S[3];
2208 D[i] = castOp(s0); D[i+1] = castOp(s1);
2209 D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2212 for( ; i < width; i++ )
2214 ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
2215 for( k = 1; k < _ksize; k++ )
2216 s0 += ky[k]*((const ST*)src[k])[i];
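/*
  SymmColumnFilter below exploits kernel symmetry: ky points at the center
  tap and src at the center row, so each output reduces to

      D[i] = castOp( ky[0]*src[0][i]
                     + sum_{k=1..ksize2} ky[k]*(src[k][i] + src[-k][i])
                     + delta )

  with the inner '+' replaced by '-' for asymmetrical kernels. This halves
  the multiplication count; e.g. the binomial kernel {1,4,6,4,1} costs
  6*m + 4*(u1+l1) + (u2+l2), three multiplies per pixel instead of five.
*/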
2229 template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
2231 typedef typename CastOp::type1 ST;
2232 typedef typename CastOp::rtype DT;
2234 SymmColumnFilter( const Mat& _kernel, int _anchor,
2235 double _delta, int _symmetryType,
2236 const CastOp& _castOp=CastOp(),
2237 const VecOp& _vecOp=VecOp())
2238 : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
2240 symmetryType = _symmetryType;
2241 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
2244 void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2246 int ksize2 = this->ksize/2;
2247 const ST* ky = (const ST*)this->kernel.data + ksize2;
2249 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
2250 ST _delta = this->delta;
2251 CastOp castOp = this->castOp0;
2256 for( ; count--; dst += dststep, src++ )
2259 i = (this->vecOp)(src, dst, width);
2261 for( ; i <= width - 4; i += 4 )
2264 const ST* S = (const ST*)src[0] + i, *S2;
2265 ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
2266 s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
2268 for( k = 1; k <= ksize2; k++ )
2270 S = (const ST*)src[k] + i;
2271 S2 = (const ST*)src[-k] + i;
2273 s0 += f*(S[0] + S2[0]);
2274 s1 += f*(S[1] + S2[1]);
2275 s2 += f*(S[2] + S2[2]);
2276 s3 += f*(S[3] + S2[3]);
2279 D[i] = castOp(s0); D[i+1] = castOp(s1);
2280 D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2283 for( ; i < width; i++ )
2285 ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
2286 for( k = 1; k <= ksize2; k++ )
2287 s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
2294 for( ; count--; dst += dststep, src++ )
2297 i = this->vecOp(src, dst, width);
2299 for( ; i <= width - 4; i += 4 )
2303 ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
2305 for( k = 1; k <= ksize2; k++ )
2307 S = (const ST*)src[k] + i;
2308 S2 = (const ST*)src[-k] + i;
2310 s0 += f*(S[0] - S2[0]);
2311 s1 += f*(S[1] - S2[1]);
2312 s2 += f*(S[2] - S2[2]);
2313 s3 += f*(S[3] - S2[3]);
2316 D[i] = castOp(s0); D[i+1] = castOp(s1);
2317 D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2320 for( ; i < width; i++ )
2323 for( k = 1; k <= ksize2; k++ )
2324 s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
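/*
  SymmColumnSmallFilter below special-cases the ubiquitous 3-tap kernels
  (1-2-1 smoothing, the second-derivative kernel 1,-2,1 and the
  central-difference kernel -1,0,1). Since ky points at the center tap,
  {1,2,1} appears as ky[0] == 2, ky[1] == 1, and the dedicated branches
  trade the multiplications for plain adds.
*/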
2335 template<class CastOp, class VecOp>
2336 struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
2338 typedef typename CastOp::type1 ST;
2339 typedef typename CastOp::rtype DT;
2341 SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
2342 double _delta, int _symmetryType,
2343 const CastOp& _castOp=CastOp(),
2344 const VecOp& _vecOp=VecOp())
2345 : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
2347 CV_Assert( this->ksize == 3 );
2350 void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2352 int ksize2 = this->ksize/2;
2353 const ST* ky = (const ST*)this->kernel.data + ksize2;
2355 bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
2356 bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;
2357 bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;
2358 bool is_m1_0_1 = ky[1] == 1 || ky[1] == -1;
2359 ST f0 = ky[0], f1 = ky[1];
2360 ST _delta = this->delta;
2361 CastOp castOp = this->castOp0;
2364 for( ; count--; dst += dststep, src++ )
2367 i = (this->vecOp)(src, dst, width);
2368 const ST* S0 = (const ST*)src[-1];
2369 const ST* S1 = (const ST*)src[0];
2370 const ST* S2 = (const ST*)src[1];
2376 for( ; i <= width - 4; i += 4 )
2378 ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
2379 ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
2381 D[i+1] = castOp(s1);
2383 s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
2384 s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
2385 D[i+2] = castOp(s0);
2386 D[i+3] = castOp(s1);
2389 else if( is_1_m2_1 )
2391 for( ; i <= width - 4; i += 4 )
2393 ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
2394 ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
2396 D[i+1] = castOp(s1);
2398 s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
2399 s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
2400 D[i+2] = castOp(s0);
2401 D[i+3] = castOp(s1);
2406 for( ; i <= width - 4; i += 4 )
2408 ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
2409 ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
2411 D[i+1] = castOp(s1);
2413 s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
2414 s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
2415 D[i+2] = castOp(s0);
2416 D[i+3] = castOp(s1);
2420 for( ; i < width; i++ )
2421 D[i] = castOp((S0[i] + S2[i])*f1 + S1[i]*f0 + _delta);
2430 for( ; i <= width - 4; i += 4 )
2432 ST s0 = S2[i] - S0[i] + _delta;
2433 ST s1 = S2[i+1] - S0[i+1] + _delta;
2435 D[i+1] = castOp(s1);
2437 s0 = S2[i+2] - S0[i+2] + _delta;
2438 s1 = S2[i+3] - S0[i+3] + _delta;
2439 D[i+2] = castOp(s0);
2440 D[i+3] = castOp(s1);
2448 for( ; i <= width - 4; i += 4 )
2450 ST s0 = (S2[i] - S0[i])*f1 + _delta;
2451 ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
2453 D[i+1] = castOp(s1);
2455 s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
2456 s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
2457 D[i+2] = castOp(s0);
2458 D[i+3] = castOp(s1);
2462 for( ; i < width; i++ )
2463 D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
2469 template<typename ST, typename DT> struct Cast
2474 DT operator()(ST val) const { return saturate_cast<DT>(val); }
2477 template<typename ST, typename DT, int bits> struct FixedPtCast
2481 enum { SHIFT = bits, DELTA = 1 << (bits-1) };
2483 DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
2486 template<typename ST, typename DT> struct FixedPtCastEx
2491 FixedPtCastEx() : SHIFT(0), DELTA(0) {}
2492 FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
2493 DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
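/*
  The fixed-point casts round to nearest: adding DELTA = 2^(bits-1) before
  the right shift rounds the Q-format sum instead of truncating it.
  Worked example with bits == 8: an accumulated value of 897 (about 3.504
  in Q8) yields (897 + 128) >> 8 == 4, while plain truncation would give
  897 >> 8 == 3.
*/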
2497 Ptr<BaseRowFilter> getLinearRowFilter( int srcType, int bufType,
2498 const Mat& kernel, int anchor,
2501 int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
2502 int cn = CV_MAT_CN(srcType);
2503 CV_Assert( cn == CV_MAT_CN(bufType) &&
2504 ddepth >= std::max(sdepth, CV_32S) &&
2505 kernel.type() == ddepth );
2506 int ksize = kernel.rows + kernel.cols - 1;
2508 if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
2510 if( sdepth == CV_8U && ddepth == CV_32S )
2511 return Ptr<BaseRowFilter>(new SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s>
2512 (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)));
2513 if( sdepth == CV_32F && ddepth == CV_32F )
2514 return Ptr<BaseRowFilter>(new SymmRowSmallFilter<float, float, SymmRowSmallVec_32f>
2515 (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)));
2518 if( sdepth == CV_8U && ddepth == CV_32S )
2519 return Ptr<BaseRowFilter>(new RowFilter<uchar, int, RowVec_8u32s>
2520 (kernel, anchor, RowVec_8u32s(kernel)));
2521 if( sdepth == CV_8U && ddepth == CV_32F )
2522 return Ptr<BaseRowFilter>(new RowFilter<uchar, float, RowNoVec>(kernel, anchor));
2523 if( sdepth == CV_8U && ddepth == CV_64F )
2524 return Ptr<BaseRowFilter>(new RowFilter<uchar, double, RowNoVec>(kernel, anchor));
2525 if( sdepth == CV_16U && ddepth == CV_32F )
2526 return Ptr<BaseRowFilter>(new RowFilter<ushort, float, RowNoVec>(kernel, anchor));
2527 if( sdepth == CV_16U && ddepth == CV_64F )
2528 return Ptr<BaseRowFilter>(new RowFilter<ushort, double, RowNoVec>(kernel, anchor));
2529 if( sdepth == CV_16S && ddepth == CV_32F )
2530 return Ptr<BaseRowFilter>(new RowFilter<short, float, RowNoVec>(kernel, anchor));
2531 if( sdepth == CV_16S && ddepth == CV_64F )
2532 return Ptr<BaseRowFilter>(new RowFilter<short, double, RowNoVec>(kernel, anchor));
2533 if( sdepth == CV_32F && ddepth == CV_32F )
2534 return Ptr<BaseRowFilter>(new RowFilter<float, float, RowVec_32f>
2535 (kernel, anchor, RowVec_32f(kernel)));
2536 if( sdepth == CV_64F && ddepth == CV_64F )
2537 return Ptr<BaseRowFilter>(new RowFilter<double, double, RowNoVec>(kernel, anchor));
2539 CV_Error_( CV_StsNotImplemented,
2540 ("Unsupported combination of source format (=%d), and buffer format (=%d)",
2543 return Ptr<BaseRowFilter>(0);
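/*
  A hedged usage sketch of getLinearRowFilter (illustrative values; note
  the kernel type must equal the buffer depth, which in turn must be at
  least CV_32S per the assertion above):

      Mat kx = (Mat_<float>(1,3) << -1.f, 0.f, 1.f);  // derivative row
      Ptr<BaseRowFilter> rf = getLinearRowFilter(
          CV_8UC1, CV_32FC1, kx, 1, getKernelType(kx, Point(1,0)));

  FilterEngine normally constructs and drives these objects; they are
  rarely instantiated directly.
*/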
2547 Ptr<BaseColumnFilter> getLinearColumnFilter( int bufType, int dstType,
2548 const Mat& kernel, int anchor,
2549 int symmetryType, double delta,
2552 int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
2553 int cn = CV_MAT_CN(dstType);
2554 CV_Assert( cn == CV_MAT_CN(bufType) &&
2555 sdepth >= std::max(ddepth, CV_32S) &&
2556 kernel.type() == sdepth );
2558 if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
2560 if( ddepth == CV_8U && sdepth == CV_32S )
2561 return Ptr<BaseColumnFilter>(new ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec>
2562 (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits)));
2563 if( ddepth == CV_8U && sdepth == CV_32F )
2564 return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, uchar>, ColumnNoVec>(kernel, anchor, delta));
2565 if( ddepth == CV_8U && sdepth == CV_64F )
2566 return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, uchar>, ColumnNoVec>(kernel, anchor, delta));
2567 if( ddepth == CV_16U && sdepth == CV_32F )
2568 return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, ushort>, ColumnNoVec>(kernel, anchor, delta));
2569 if( ddepth == CV_16U && sdepth == CV_64F )
2570 return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, ushort>, ColumnNoVec>(kernel, anchor, delta));
2571 if( ddepth == CV_16S && sdepth == CV_32F )
2572 return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, short>, ColumnNoVec>(kernel, anchor, delta));
2573 if( ddepth == CV_16S && sdepth == CV_64F )
2574 return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, short>, ColumnNoVec>(kernel, anchor, delta));
2575 if( ddepth == CV_32F && sdepth == CV_32F )
2576 return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, float>, ColumnNoVec>(kernel, anchor, delta));
2577 if( ddepth == CV_64F && sdepth == CV_64F )
2578 return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, double>, ColumnNoVec>(kernel, anchor, delta));
2582 int ksize = kernel.rows + kernel.cols - 1;
2585 if( ddepth == CV_8U && sdepth == CV_32S )
2586 return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
2587 FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
2588 (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
2589 SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
2590 if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
2591 return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<Cast<int, short>,
2592 SymmColumnSmallVec_32s16s>(kernel, anchor, delta, symmetryType,
2593 Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)));
2594 if( ddepth == CV_32F && sdepth == CV_32F )
2595 return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
2596 Cast<float, float>,SymmColumnSmallVec_32f>
2597 (kernel, anchor, delta, symmetryType, Cast<float, float>(),
2598 SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)));
2600 if( ddepth == CV_8U && sdepth == CV_32S )
2601 return Ptr<BaseColumnFilter>(new SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
2602 (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
2603 SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
2604 if( ddepth == CV_8U && sdepth == CV_32F )
2605 return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, uchar>, ColumnNoVec>
2606 (kernel, anchor, delta, symmetryType));
2607 if( ddepth == CV_8U && sdepth == CV_64F )
2608 return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, uchar>, ColumnNoVec>
2609 (kernel, anchor, delta, symmetryType));
2610 if( ddepth == CV_16U && sdepth == CV_32F )
2611 return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, ushort>, ColumnNoVec>
2612 (kernel, anchor, delta, symmetryType));
2613 if( ddepth == CV_16U && sdepth == CV_64F )
2614 return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, ushort>, ColumnNoVec>
2615 (kernel, anchor, delta, symmetryType));
2616 if( ddepth == CV_16S && sdepth == CV_32S )
2617 return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<int, short>, ColumnNoVec>
2618 (kernel, anchor, delta, symmetryType));
2619 if( ddepth == CV_16S && sdepth == CV_32F )
2620 return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, ColumnNoVec>
2621 (kernel, anchor, delta, symmetryType));
2622 if( ddepth == CV_16S && sdepth == CV_64F )
2623 return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, short>, ColumnNoVec>
2624 (kernel, anchor, delta, symmetryType));
2625 if( ddepth == CV_32F && sdepth == CV_32F )
2626 return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f>
2627 (kernel, anchor, delta, symmetryType, Cast<float, float>(),
2628 SymmColumnVec_32f(kernel, symmetryType, 0, delta)));
2629 if( ddepth == CV_64F && sdepth == CV_64F )
2630 return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, double>, ColumnNoVec>
2631 (kernel, anchor, delta, symmetryType));
2634 CV_Error_( CV_StsNotImplemented,
2635 ("Unsupported combination of buffer format (=%d), and destination format (=%d)",
2638 return Ptr<BaseColumnFilter>(0);
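/*
  Note that the `bits` argument only affects the CV_32S buffer paths above,
  where it tells FixedPtCastEx how far to shift the fixed-point sums back
  down; the floating-point paths use a plain saturating Cast and ignore it.
*/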
2642 Ptr<FilterEngine> createSeparableLinearFilter(
2643 int _srcType, int _dstType,
2644 const Mat& _rowKernel, const Mat& _columnKernel,
2645 Point _anchor, double _delta,
2646 int _rowBorderType, int _columnBorderType,
2647 const Scalar& _borderValue )
2649 _srcType = CV_MAT_TYPE(_srcType);
2650 _dstType = CV_MAT_TYPE(_dstType);
2651 int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
2652 int cn = CV_MAT_CN(_srcType);
2653 CV_Assert( cn == CV_MAT_CN(_dstType) );
2654 int rsize = _rowKernel.rows + _rowKernel.cols - 1;
2655 int csize = _columnKernel.rows + _columnKernel.cols - 1;
2657 _anchor.x = rsize/2;
2659 _anchor.y = csize/2;
2660 int rtype = getKernelType(_rowKernel,
2661 _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
2662 int ctype = getKernelType(_columnKernel,
2663 _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
2664 Mat rowKernel, columnKernel;
2666 int bdepth = std::max(CV_32F,std::max(sdepth, ddepth));
2669 if( sdepth == CV_8U &&
2670 ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
2671 ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
2673 ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
2674 (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
2675 (rtype & ctype & KERNEL_INTEGER) &&
2676 ddepth == CV_16S)) )
2679 bits = ddepth == CV_8U ? 8 : 0;
2680 _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
2681 _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
2683 _delta *= (1 << bits);
2687 if( _rowKernel.type() != bdepth )
2688 _rowKernel.convertTo( rowKernel, bdepth );
2690 rowKernel = _rowKernel;
2691 if( _columnKernel.type() != bdepth )
2692 _columnKernel.convertTo( columnKernel, bdepth );
2694 columnKernel = _columnKernel;
2697 int _bufType = CV_MAKETYPE(bdepth, cn);
2698 Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
2699 _srcType, _bufType, rowKernel, _anchor.x, rtype);
2700 Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
2701 _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );
2703 return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(0), _rowFilter, _columnFilter,
2704 _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
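/*
  Typical use of the factory above (a hedged sketch using the public
  OpenCV 2.x API):

      Mat k = getGaussianKernel(5, -1, CV_32F);
      Ptr<FilterEngine> f = createSeparableLinearFilter(
          CV_8UC3, CV_8UC3, k, k, Point(-1,-1), 0,
          BORDER_DEFAULT, -1, Scalar());
      f->apply(src, dst);    // src, dst: CV_8UC3 images

  For CV_8U sources with smooth symmetric kernels this takes the CV_32S
  fixed-point branch above (bits == 8), trading a little precision for
  integer arithmetic throughout.
*/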
2708 /****************************************************************************************\
2709 * Non-separable linear filter *
2710 \****************************************************************************************/
2712 void preprocess2DKernel( const Mat& kernel, vector<Point>& coords, vector<uchar>& coeffs )
2714 int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
2717 CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
2719 coeffs.resize(nz*getElemSize(ktype));
2720 uchar* _coeffs = &coeffs[0];
2722 for( i = k = 0; i < kernel.rows; i++ )
2724 const uchar* krow = kernel.data + kernel.step*i;
2725 for( j = 0; j < kernel.cols; j++ )
2727 if( ktype == CV_8U )
2729 uchar val = krow[j];
2732 coords[k] = Point(j,i);
2735 else if( ktype == CV_32S )
2737 int val = ((const int*)krow)[j];
2740 coords[k] = Point(j,i);
2741 ((int*)_coeffs)[k++] = val;
2743 else if( ktype == CV_32F )
2745 float val = ((const float*)krow)[j];
2748 coords[k] = Point(j,i);
2749 ((float*)_coeffs)[k++] = val;
2753 double val = ((const double*)krow)[j];
2756 coords[k] = Point(j,i);
2757 ((double*)_coeffs)[k++] = val;
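/*
  preprocess2DKernel flattens a 2-D kernel into sparse form: coords[k]
  holds the (x,y) offset of the k-th nonzero tap and coeffs stores its raw
  value (element size depends on ktype). For the 3x3 cross-shaped kernel

      0 1 0
      1 1 1    ->  nz == 5 taps at (1,0), (0,1), (1,1), (2,1), (1,2)
      0 1 0

  Filter2D below then touches only 5 source pixels per output instead of 9.
*/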
2764 template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
2766 typedef typename CastOp::type1 KT;
2767 typedef typename CastOp::rtype DT;
2769 Filter2D( const Mat& _kernel, Point _anchor,
2770 double _delta, const CastOp& _castOp=CastOp(),
2771 const VecOp& _vecOp=VecOp() )
2774 ksize = _kernel.size();
2775 delta = saturate_cast<KT>(_delta);
2778 CV_Assert( _kernel.type() == DataType<KT>::type );
2779 preprocess2DKernel( _kernel, coords, coeffs );
2780 ptrs.resize( coords.size() );
2783 void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
2786 const Point* pt = &coords[0];
2787 const KT* kf = (const KT*)&coeffs[0];
2788 const ST** kp = (const ST**)&ptrs[0];
2789 int i, k, nz = (int)coords.size();
2790 CastOp castOp = castOp0;
2793 for( ; count > 0; count--, dst += dststep, src++ )
2797 for( k = 0; k < nz; k++ )
2798 kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;
2800 i = vecOp((const uchar**)kp, dst, width);
2802 for( ; i <= width - 4; i += 4 )
2804 KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
2806 for( k = 0; k < nz; k++ )
2808 const ST* sptr = kp[k] + i;
2816 D[i] = castOp(s0); D[i+1] = castOp(s1);
2817 D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2820 for( ; i < width; i++ )
2823 for( k = 0; k < nz; k++ )
2824 s0 += kf[k]*kp[k][i];
2830 vector<Point> coords;
2831 vector<uchar> coeffs;
2832 vector<uchar*> ptrs;
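/*
  For each output row, Filter2D first resolves every nonzero tap to a
  concrete source pointer, kp[k] = src[pt[k].y] + pt[k].x*cn, so the hot
  loops are simple indexed loads whatever the kernel's shape; `src` itself
  is an array of border-expanded row pointers supplied by FilterEngine.
*/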
2839 Ptr<BaseFilter> getLinearFilter(int srcType, int dstType,
2840 const Mat& _kernel, Point anchor,
2841 double delta, int bits)
2843 int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
2844 int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
2845 CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );
2847 anchor = normalizeAnchor(anchor, _kernel.size());
2849 if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
2850 return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u>
2851 (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
2852 FilterVec_8u(_kernel, bits, delta)));
2853 if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
2854 return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s>
2855 (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
2856 FilterVec_8u16s(_kernel, bits, delta)));
2858 kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
2860 if( _kernel.type() == kdepth )
2863 _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);
2865 if( sdepth == CV_8U && ddepth == CV_8U )
2866 return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, uchar>, FilterVec_8u>
2867 (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta)));
2868 if( sdepth == CV_8U && ddepth == CV_16U )
2869 return Ptr<BaseFilter>(new Filter2D<uchar,
2870 Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
2871 if( sdepth == CV_8U && ddepth == CV_16S )
2872 return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, short>, FilterVec_8u16s>
2873 (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta)));
2874 if( sdepth == CV_8U && ddepth == CV_32F )
2875 return Ptr<BaseFilter>(new Filter2D<uchar,
2876 Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
2877 if( sdepth == CV_8U && ddepth == CV_64F )
2878 return Ptr<BaseFilter>(new Filter2D<uchar,
2879 Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2881 if( sdepth == CV_16U && ddepth == CV_16U )
2882 return Ptr<BaseFilter>(new Filter2D<ushort,
2883 Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
2884 if( sdepth == CV_16U && ddepth == CV_32F )
2885 return Ptr<BaseFilter>(new Filter2D<ushort,
2886 Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
2887 if( sdepth == CV_16U && ddepth == CV_64F )
2888 return Ptr<BaseFilter>(new Filter2D<ushort,
2889 Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2891 if( sdepth == CV_16S && ddepth == CV_16S )
2892 return Ptr<BaseFilter>(new Filter2D<short,
2893 Cast<float, short>, FilterNoVec>(kernel, anchor, delta));
2894 if( sdepth == CV_16S && ddepth == CV_32F )
2895 return Ptr<BaseFilter>(new Filter2D<short,
2896 Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
2897 if( sdepth == CV_16S && ddepth == CV_64F )
2898 return Ptr<BaseFilter>(new Filter2D<short,
2899 Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2901 if( sdepth == CV_32F && ddepth == CV_32F )
2902 return Ptr<BaseFilter>(new Filter2D<float, Cast<float, float>, FilterVec_32f>
2903 (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta)));
2904 if( sdepth == CV_64F && ddepth == CV_64F )
2905 return Ptr<BaseFilter>(new Filter2D<double,
2906 Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2908 CV_Error_( CV_StsNotImplemented,
2909 ("Unsupported combination of source format (=%d), and destination format (=%d)",
2912 return Ptr<BaseFilter>(0);
2916 Ptr<FilterEngine> createLinearFilter( int _srcType, int _dstType, const Mat& _kernel,
2917 Point _anchor, double _delta,
2918 int _rowBorderType, int _columnBorderType,
2919 const Scalar& _borderValue )
2921 _srcType = CV_MAT_TYPE(_srcType);
2922 _dstType = CV_MAT_TYPE(_dstType);
2923 int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
2924 int cn = CV_MAT_CN(_srcType);
2925 CV_Assert( cn == CV_MAT_CN(_dstType) );
2927 Mat kernel = _kernel;
2928 int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
2931 if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
2932 _kernel.rows*_kernel.cols <= (1 << 10) )
2934 bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
2935 _kernel.convertTo(kernel, CV_32S, 1 << bits);
2938 Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
2939 kernel, _anchor, _delta, bits);
2941 return Ptr<FilterEngine>(new FilterEngine(_filter2D, Ptr<BaseRowFilter>(0),
2942 Ptr<BaseColumnFilter>(0), _srcType, _dstType, _srcType,
2943 _rowBorderType, _columnBorderType, _borderValue ));
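/*
  For CV_8U inputs with at most 2^10 taps, createLinearFilter converts a
  floating-point kernel to Q11 fixed point (bits == 11), i.e. each
  coefficient becomes roughly c*2048 as a CV_32S value; FixedPtCastEx then
  shifts the accumulated sums back down with rounding.
*/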
2947 void filter2D( const Mat& src, Mat& dst, int ddepth,
2948 const Mat& kernel, Point anchor,
2949 double delta, int borderType )
2952 ddepth = src.depth();
2955 int dft_filter_size = (src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
2956 (src.depth() == CV_32F && ddepth == CV_32F) ? 130 : 50;
2958 int dft_filter_size = 50;
2961 dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
2962 anchor = normalizeAnchor(anchor, kernel.size());
2964 if( kernel.cols*kernel.rows >= dft_filter_size &&
2965 kernel.cols <= src.cols && kernel.rows <= src.rows )
2968 if( src.data != dst.data )
2972 crossCorr( src, kernel, temp, anchor, delta, borderType );
2976 Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
2977 anchor, delta, borderType );
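/*
  A hedged usage sketch of cv::filter2D (3x3 box blur, illustrative
  values only):

      Mat box = Mat::ones(3, 3, CV_32F) / 9.f;
      filter2D(img, out, -1, box);   // out inherits img's depth

  Kernels whose area reaches dft_filter_size (and that fit inside the
  image) are routed through crossCorr, which evaluates the correlation in
  the frequency domain; smaller kernels take the direct FilterEngine path.
*/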
2982 void sepFilter2D( const Mat& src, Mat& dst, int ddepth,
2983 const Mat& kernelX, const Mat& kernelY, Point anchor,
2984 double delta, int borderType )
2987 ddepth = src.depth();
2989 dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
2991 Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
2992 dst.type(), kernelX, kernelY, anchor, delta, borderType );
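/*
  sepFilter2D is the separable counterpart: the two 1-D kernels run as a
  row pass followed by a column pass, so a KxK filter costs O(K) per pixel
  instead of O(K^2). A hedged sketch:

      Mat g = getGaussianKernel(7, 1.5, CV_32F);
      sepFilter2D(img, out, -1, g, g);
*/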
3000 cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
3002 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3003 cv::Mat kernel = cv::cvarrToMat(_kernel);
3005 CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );
3007 cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );