X-Git-Url: https://vcs.maemo.org/git/?a=blobdiff_plain;f=src%2Fcv%2Fcvfilter.cpp;fp=src%2Fcv%2Fcvfilter.cpp;h=6184dcd6e33abb43638018b0a92cef13a46db6cb;hb=e4c14cdbdf2fe805e79cd96ded236f57e7b89060;hp=0000000000000000000000000000000000000000;hpb=454138ff8a20f6edb9b65a910101403d8b520643;p=opencv diff --git a/src/cv/cvfilter.cpp b/src/cv/cvfilter.cpp new file mode 100644 index 0000000..6184dcd --- /dev/null +++ b/src/cv/cvfilter.cpp @@ -0,0 +1,3010 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+//
+//M*/
+
+#include "_cv.h"
+
+/****************************************************************************************\
+                                    Base Image Filter
+\****************************************************************************************/
+
+namespace cv
+{
+
+BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
+BaseRowFilter::~BaseRowFilter() {}
+
+BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
+BaseColumnFilter::~BaseColumnFilter() {}
+void BaseColumnFilter::reset() {}
+
+BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
+BaseFilter::~BaseFilter() {}
+void BaseFilter::reset() {}
+
+/*
+ Various border types, image boundaries are denoted with '|'
+
+ * BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
+ * BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
+ * BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
+ * BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
+ * BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii  with some specified 'i'
+*/
+int borderInterpolate( int p, int len, int borderType )
+{
+    if( (unsigned)p < (unsigned)len )
+        ;
+    else if( borderType == BORDER_REPLICATE )
+        p = p < 0 ? 0 : len - 1;
+    else if( borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101 )
+    {
+        int delta = borderType == BORDER_REFLECT_101;
+        if( len == 1 )
+            return 0;
+        do
+        {
+            if( p < 0 )
+                p = -p - 1 + delta;
+            else
+                p = len - 1 - (p - len) - delta;
+        }
+        while( (unsigned)p >= (unsigned)len );
+    }
+    else if( borderType == BORDER_WRAP )
+    {
+        if( p < 0 )
+            p -= ((p-len+1)/len)*len;
+        if( p >= len )
+            p %= len;
+    }
+    else if( borderType == BORDER_CONSTANT )
+        p = -1;
+    else
+        CV_Error( CV_StsBadArg, "Unknown/unsupported border type" );
+    return p;
+}
+
+
+FilterEngine::FilterEngine()
+{
+    srcType = dstType = bufType = -1;
+    rowBorderType = columnBorderType = BORDER_REPLICATE;
+    bufStep = startY = startY0 = endY = rowCount = dstY = 0;
+    maxWidth = 0;
+
+    wholeSize = Size(-1,-1);
+}
+
+
+FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
+                            const Ptr<BaseRowFilter>& _rowFilter,
+                            const Ptr<BaseColumnFilter>& _columnFilter,
+                            int _srcType, int _dstType, int _bufType,
+                            int _rowBorderType, int _columnBorderType,
+                            const Scalar& _borderValue )
+{
+    init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
+         _rowBorderType, _columnBorderType, _borderValue);
+}
+
+FilterEngine::~FilterEngine()
+{
+}
+
+
+void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
+                         const Ptr<BaseRowFilter>& _rowFilter,
+                         const Ptr<BaseColumnFilter>& _columnFilter,
+                         int _srcType, int _dstType, int _bufType,
+                         int _rowBorderType, int _columnBorderType,
+                         const Scalar& _borderValue )
+{
+    _srcType = CV_MAT_TYPE(_srcType);
+    _bufType = CV_MAT_TYPE(_bufType);
+    _dstType = CV_MAT_TYPE(_dstType);
+
+    srcType = _srcType;
+    int srcElemSize = (int)getElemSize(srcType);
+    dstType = _dstType;
+    bufType = _bufType;
+
+    filter2D = _filter2D;
+    rowFilter = _rowFilter;
+    columnFilter = _columnFilter;
+
+    if( _columnBorderType < 0 )
+        _columnBorderType = _rowBorderType;
+
+    rowBorderType = _rowBorderType;
+    columnBorderType = _columnBorderType;
+
+    CV_Assert( columnBorderType != BORDER_WRAP );
+
+    if( isSeparable() )
+    {
+        CV_Assert( !rowFilter.empty() && !columnFilter.empty() );
+        ksize = Size(rowFilter->ksize, columnFilter->ksize);
+        anchor = Point(rowFilter->anchor, columnFilter->anchor);
+    }
+    else
+    {
+        CV_Assert( bufType == srcType );
+        ksize = filter2D->ksize;
+        anchor = filter2D->anchor;
+    }
+
+    CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
+               0 <= anchor.y && anchor.y < ksize.height );
+
+    borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ?
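+    /* For 32-bit and larger element types the border table built below
+       stores offsets in sizeof(int) units rather than bytes, so the border
+       replication loops in proceed() can copy whole ints at a time.
+       As a quick illustration of the mapping borderInterpolate() above
+       performs (values derived directly from that function; len == 8,
+       p == -3):
+
+           borderInterpolate(-3, 8, BORDER_REPLICATE)   -> 0
+           borderInterpolate(-3, 8, BORDER_REFLECT_101) -> 3
+           borderInterpolate(-3, 8, BORDER_WRAP)        -> 5
+           borderInterpolate(-3, 8, BORDER_CONSTANT)    -> -1  (use border value)
+    */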
sizeof(int) : 1); + borderTab.resize( std::max(ksize.width - 1, 1)*borderElemSize); + + maxWidth = bufStep = 0; + constBorderRow.clear(); + + if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT ) + { + constBorderValue.resize(srcElemSize*(ksize.width - 1)); + scalarToRawData(_borderValue, &constBorderValue[0], srcType, + (ksize.width-1)*CV_MAT_CN(srcType)); + } + + wholeSize = Size(-1,-1); +} + +static const int VEC_ALIGN = CV_MALLOC_ALIGN; + +int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows) +{ + int i, j; + + wholeSize = _wholeSize; + roi = _roi; + CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 && + roi.x + roi.width <= wholeSize.width && + roi.y + roi.height <= wholeSize.height ); + + int esz = (int)getElemSize(srcType); + int bufElemSize = (int)getElemSize(bufType); + const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0; + + if( _maxBufRows < 0 ) + _maxBufRows = ksize.height + 3; + _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1); + + if( maxWidth < roi.width || _maxBufRows != (int)rows.size() ) + { + rows.resize(_maxBufRows); + maxWidth = std::max(maxWidth, roi.width); + int cn = CV_MAT_CN(srcType); + srcRow.resize(esz*(maxWidth + ksize.width - 1)); + if( columnBorderType == BORDER_CONSTANT ) + { + constBorderRow.resize(getElemSize(bufType)*(maxWidth+VEC_ALIGN)); + uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst; + int n = (int)constBorderValue.size(), N; + if( isSeparable() ) + { + tdst = &srcRow[0]; + N = (maxWidth + ksize.width - 1)*esz; + } + else + { + tdst = dst; + N = maxWidth*esz; + } + + for( i = 0; i < N; i += n ) + { + n = std::min( n, N - i ); + for(j = 0; j < n; j++) + tdst[i+j] = constVal[j]; + } + + if( isSeparable() ) + (*rowFilter)(&srcRow[0], dst, maxWidth, cn); + } + + int maxBufStep = bufElemSize*(int)alignSize(maxWidth + + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); + ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN); + } + + // adjust bufstep so that the used part of the ring buffer stays compact in memory + bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16); + + dx1 = std::max(anchor.x - roi.x, 0); + dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0); + + // recompute border tables + if( dx1 > 0 || dx2 > 0 ) + { + if( rowBorderType == BORDER_CONSTANT ) + { + int nr = isSeparable() ? 1 : (int)rows.size(); + for( i = 0; i < nr; i++ ) + { + uchar* dst = isSeparable() ? 
&srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i; + memcpy( dst, constVal, dx1*esz ); + memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz ); + } + } + else + { + int btab_esz = borderElemSize, wholeWidth = wholeSize.width; + int* btab = (int*)&borderTab[0]; + + for( i = 0; i < dx1; i++ ) + { + int p0 = borderInterpolate(i-dx1, wholeWidth, rowBorderType)*btab_esz; + for( j = 0; j < btab_esz; j++ ) + btab[i*btab_esz + j] = p0 + j; + } + + for( i = 0; i < dx2; i++ ) + { + int p0 = borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType)*btab_esz; + for( j = 0; j < btab_esz; j++ ) + btab[(i + dx1)*btab_esz + j] = p0 + j; + } + } + } + + rowCount = dstY = 0; + startY = startY0 = std::max(roi.y - anchor.y, 0); + endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height); + if( !columnFilter.empty() ) + columnFilter->reset(); + if( !filter2D.empty() ) + filter2D->reset(); + + return startY; +} + + +int FilterEngine::start(const Mat& src, const Rect& _srcRoi, + bool isolated, int maxBufRows) +{ + Rect srcRoi = _srcRoi; + + if( srcRoi == Rect(0,0,-1,-1) ) + srcRoi = Rect(0,0,src.cols,src.rows); + + CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 && + srcRoi.width >= 0 && srcRoi.height >= 0 && + srcRoi.x + srcRoi.width <= src.cols && + srcRoi.y + srcRoi.height <= src.rows ); + + Point ofs; + Size wholeSize(src.cols, src.rows); + if( !isolated ) + src.locateROI( wholeSize, ofs ); + start( wholeSize, srcRoi + ofs, maxBufRows ); + + return startY - ofs.y; +} + + +int FilterEngine::remainingInputRows() const +{ + return endY - startY - rowCount; +} + +int FilterEngine::remainingOutputRows() const +{ + return roi.height - dstY; +} + +int FilterEngine::proceed( const uchar* src, int srcstep, int count, + uchar* dst, int dststep ) +{ + CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 ); + + const int *btab = &borderTab[0]; + int esz = (int)getElemSize(srcType), btab_esz = borderElemSize; + uchar** brows = &rows[0]; + int bufRows = (int)rows.size(); + int cn = CV_MAT_CN(bufType); + int width = roi.width, kwidth = ksize.width; + int kheight = ksize.height, ay = anchor.y; + int _dx1 = dx1, _dx2 = dx2; + int width1 = roi.width + kwidth - 1; + int xofs1 = std::min(roi.x, anchor.x); + bool isSep = isSeparable(); + bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT; + int dy = 0, i = 0; + + src -= xofs1*esz; + count = std::min(count, remainingInputRows()); + + CV_Assert( src && dst && count > 0 ); + + for(;; dst += dststep*i, dy += i) + { + int dcount = bufRows - ay - startY - rowCount + roi.y; + dcount = dcount > 0 ? dcount : bufRows - kheight + 1; + dcount = std::min(dcount, count); + count -= dcount; + for( ; dcount-- > 0; src += srcstep ) + { + int bi = (startY - startY0 + rowCount) % bufRows; + uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; + uchar* row = isSep ? 
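+            /* Ring-buffer bookkeeping (a scalar sketch of the invariant the
+               code around this point maintains): the bordered, row-filtered
+               line with absolute source index srcY is kept at
+
+                   alignPtr(&ringBuf[0], VEC_ALIGN) + ((srcY - startY0) % bufRows)*bufStep
+
+               For separable filters the input line is first copied into
+               srcRow (with the horizontal border filled in), passed through
+               the row filter, and only the filtered result lands in the ring
+               buffer; non-separable filters buffer the bordered input line
+               as-is. */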
&srcRow[0] : brow;
+
+            if( ++rowCount > bufRows )
+            {
+                --rowCount;
+                ++startY;
+            }
+
+            memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );
+
+            if( makeBorder )
+            {
+                if( btab_esz*(int)sizeof(int) == esz )
+                {
+                    const int* isrc = (const int*)src;
+                    int* irow = (int*)row;
+
+                    for( i = 0; i < _dx1*btab_esz; i++ )
+                        irow[i] = isrc[btab[i]];
+                    for( i = 0; i < _dx2*btab_esz; i++ )
+                        irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
+                }
+                else
+                {
+                    for( i = 0; i < _dx1*esz; i++ )
+                        row[i] = src[btab[i]];
+                    for( i = 0; i < _dx2*esz; i++ )
+                        row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
+                }
+            }
+
+            if( isSep )
+                (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
+        }
+
+        int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
+        for( i = 0; i < max_i; i++ )
+        {
+            int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
+                            wholeSize.height, columnBorderType);
+            if( srcY < 0 ) // can happen only with constant border type
+                brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
+            else
+            {
+                CV_Assert( srcY >= startY );
+                if( srcY >= startY + rowCount )
+                    break;
+                int bi = (srcY - startY0) % bufRows;
+                brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
+            }
+        }
+        if( i < kheight )
+            break;
+        i -= kheight - 1;
+        if( isSeparable() )
+            (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
+        else
+            (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
+    }
+
+    dstY += dy;
+    CV_Assert( dstY <= roi.height );
+    return dy;
+}
+
+
+void FilterEngine::apply(const Mat& src, Mat& dst,
+    const Rect& _srcRoi, Point dstOfs, bool isolated)
+{
+    CV_Assert( src.type() == srcType && dst.type() == dstType );
+
+    Rect srcRoi = _srcRoi;
+    if( srcRoi == Rect(0,0,-1,-1) )
+        srcRoi = Rect(0,0,src.cols,src.rows);
+
+    CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
+        dstOfs.x + srcRoi.width <= dst.cols &&
+        dstOfs.y + srcRoi.height <= dst.rows );
+
+    int y = start(src, srcRoi, isolated);
+    proceed( src.data + y*src.step, (int)src.step, endY - startY,
+             dst.data + dstOfs.y*dst.step + dstOfs.x*dst.elemSize(), (int)dst.step );
+}
+
+
+/****************************************************************************************\
+*                                Separable linear filter                                 *
+\****************************************************************************************/
+
+int getKernelType(const Mat& _kernel, Point anchor)
+{
+    CV_Assert( _kernel.channels() == 1 );
+    int i, sz = _kernel.rows*_kernel.cols;
+
+    Mat kernel;
+    _kernel.convertTo(kernel, CV_64F);
+
+    const double* coeffs = (double*)kernel.data;
+    double sum = 0;
+    int type = KERNEL_SMOOTH + KERNEL_INTEGER;
+    if( (_kernel.rows == 1 || _kernel.cols == 1) &&
+        anchor.x*2 + 1 == _kernel.cols &&
+        anchor.y*2 + 1 == _kernel.rows )
+        type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);
+
+    for( i = 0; i < sz; i++ )
+    {
+        double a = coeffs[i], b = coeffs[sz - i - 1];
+        if( a != b )
+            type &= ~KERNEL_SYMMETRICAL;
+        if( a != -b )
+            type &= ~KERNEL_ASYMMETRICAL;
+        if( a < 0 )
+            type &= ~KERNEL_SMOOTH;
+        if( a != saturate_cast<int>(a) )
+            type &= ~KERNEL_INTEGER;
+        sum += a;
+    }
+
+    if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
+        type &= ~KERNEL_SMOOTH;
+    return type;
+}
+
+
+struct RowNoVec
+{
+    RowNoVec() {}
+    RowNoVec(const Mat&) {}
+    int operator()(const uchar*, uchar*, int, int) const { return 0; }
+};
+
+struct ColumnNoVec
+{
+    ColumnNoVec() {}
+    ColumnNoVec(const Mat&, int, int, double) {}
+    int operator()(const uchar**, uchar*, int) const { return 0; }
+};
+
+struct SymmRowSmallNoVec
+{
+    SymmRowSmallNoVec() {}
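+    /* Convention shared by these *NoVec stubs and the SSE functors below:
+       operator() returns how many elements of the line it already processed
+       with vector code, and the caller's scalar loop picks up from that
+       index; returning 0 routes the whole line to the scalar path.  A
+       minimal sketch of the call site (cf. RowFilter::operator() further
+       down):
+
+           int i = vecOp(src, dst, width, cn);  // vectorized prefix
+           width *= cn;
+           for( ; i < width; i++ )              // scalar remainder
+               ...
+    */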
SymmRowSmallNoVec(const Mat&, int) {} + int operator()(const uchar*, uchar*, int, int) const { return 0; } +}; + +struct SymmColumnSmallNoVec +{ + SymmColumnSmallNoVec() {} + SymmColumnSmallNoVec(const Mat&, int, int, double) {} + int operator()(const uchar**, uchar*, int) const { return 0; } +}; + +struct FilterNoVec +{ + FilterNoVec() {} + FilterNoVec(const Mat&, int, double) {} + int operator()(const uchar**, uchar*, int) const { return 0; } +}; + + +#if CV_SSE2 + +///////////////////////////////////// 8u-16s & 8u-8u ////////////////////////////////// + +struct RowVec_8u32s +{ + RowVec_8u32s() { smallValues = false; } + RowVec_8u32s( const Mat& _kernel ) + { + kernel = _kernel; + smallValues = true; + int k, ksize = kernel.rows + kernel.cols - 1; + for( k = 0; k < ksize; k++ ) + { + int v = ((const int*)kernel.data)[k]; + if( v < SHRT_MIN || v > SHRT_MAX ) + { + smallValues = false; + break; + } + } + } + + int operator()(const uchar* _src, uchar* _dst, int width, int cn) const + { + int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; + int* dst = (int*)_dst; + const int* _kx = (const int*)kernel.data; + width *= cn; + + if( smallValues ) + { + for( ; i <= width - 16; i += 16 ) + { + const uchar* src = _src + i; + __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z; + __m128i x0, x1, x2, x3; + + for( k = 0; k < _ksize; k++, src += cn ) + { + f = _mm_cvtsi32_si128(_kx[k]); + f = _mm_shuffle_epi32(f, 0); + f = _mm_packs_epi32(f, f); + + x0 = _mm_loadu_si128((const __m128i*)src); + x2 = _mm_unpackhi_epi8(x0, z); + x0 = _mm_unpacklo_epi8(x0, z); + x1 = _mm_mulhi_epi16(x0, f); + x3 = _mm_mulhi_epi16(x2, f); + x0 = _mm_mullo_epi16(x0, f); + x2 = _mm_mullo_epi16(x2, f); + + s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1)); + s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1)); + s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3)); + s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3)); + } + + _mm_store_si128((__m128i*)(dst + i), s0); + _mm_store_si128((__m128i*)(dst + i + 4), s1); + _mm_store_si128((__m128i*)(dst + i + 8), s2); + _mm_store_si128((__m128i*)(dst + i + 12), s3); + } + + for( ; i <= width - 4; i += 4 ) + { + const uchar* src = _src + i; + __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1; + + for( k = 0; k < _ksize; k++, src += cn ) + { + f = _mm_cvtsi32_si128(_kx[k]); + f = _mm_shuffle_epi32(f, 0); + f = _mm_packs_epi32(f, f); + + x0 = _mm_cvtsi32_si128(*(const int*)src); + x0 = _mm_unpacklo_epi8(x0, z); + x1 = _mm_mulhi_epi16(x0, f); + x0 = _mm_mullo_epi16(x0, f); + s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1)); + } + _mm_store_si128((__m128i*)(dst + i), s0); + } + } + return i; + } + + Mat kernel; + bool smallValues; +}; + + +struct SymmRowSmallVec_8u32s +{ + SymmRowSmallVec_8u32s() { smallValues = false; } + SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType ) + { + kernel = _kernel; + symmetryType = _symmetryType; + smallValues = true; + int k, ksize = kernel.rows + kernel.cols - 1; + for( k = 0; k < ksize; k++ ) + { + int v = ((const int*)kernel.data)[k]; + if( v < SHRT_MIN || v > SHRT_MAX ) + { + smallValues = false; + break; + } + } + } + + int operator()(const uchar* src, uchar* _dst, int width, int cn) const + { + int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1; + int* dst = (int*)_dst; + bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; + const int* kx = (const int*)kernel.data + _ksize/2; + if( !smallValues ) + return 0; + + src += (_ksize/2)*cn; + width *= cn; + + __m128i z = _mm_setzero_si128(); + if( symmetrical ) + 
{ + if( _ksize == 1 ) + return 0; + if( _ksize == 3 ) + { + if( kx[0] == 2 && kx[1] == 1 ) + for( ; i <= width - 16; i += 16, src += 16 ) + { + __m128i x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_si128((__m128i*)(src - cn)); + x1 = _mm_loadu_si128((__m128i*)src); + x2 = _mm_loadu_si128((__m128i*)(src + cn)); + y0 = _mm_unpackhi_epi8(x0, z); + x0 = _mm_unpacklo_epi8(x0, z); + y1 = _mm_unpackhi_epi8(x1, z); + x1 = _mm_unpacklo_epi8(x1, z); + y2 = _mm_unpackhi_epi8(x2, z); + x2 = _mm_unpacklo_epi8(x2, z); + x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2)); + y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2)); + _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z)); + _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z)); + _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z)); + _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z)); + } + else if( kx[0] == -2 && kx[1] == 1 ) + for( ; i <= width - 16; i += 16, src += 16 ) + { + __m128i x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_si128((__m128i*)(src - cn)); + x1 = _mm_loadu_si128((__m128i*)src); + x2 = _mm_loadu_si128((__m128i*)(src + cn)); + y0 = _mm_unpackhi_epi8(x0, z); + x0 = _mm_unpacklo_epi8(x0, z); + y1 = _mm_unpackhi_epi8(x1, z); + x1 = _mm_unpacklo_epi8(x1, z); + y2 = _mm_unpackhi_epi8(x2, z); + x2 = _mm_unpacklo_epi8(x2, z); + x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1))); + y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1))); + _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16)); + _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16)); + _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16)); + _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16)); + } + else + { + __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0), + k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0); + k0 = _mm_packs_epi32(k0, k0); + k1 = _mm_packs_epi32(k1, k1); + + for( ; i <= width - 16; i += 16, src += 16 ) + { + __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3; + x0 = _mm_loadu_si128((__m128i*)(src - cn)); + x1 = _mm_loadu_si128((__m128i*)src); + x2 = _mm_loadu_si128((__m128i*)(src + cn)); + y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z)); + x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z)); + y1 = _mm_unpackhi_epi8(x1, z); + x1 = _mm_unpacklo_epi8(x1, z); + + t1 = _mm_mulhi_epi16(x1, k0); + t0 = _mm_mullo_epi16(x1, k0); + x2 = _mm_mulhi_epi16(x0, k1); + x0 = _mm_mullo_epi16(x0, k1); + z0 = _mm_unpacklo_epi16(t0, t1); + z1 = _mm_unpackhi_epi16(t0, t1); + z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2)); + z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2)); + + t1 = _mm_mulhi_epi16(y1, k0); + t0 = _mm_mullo_epi16(y1, k0); + y1 = _mm_mulhi_epi16(y0, k1); + y0 = _mm_mullo_epi16(y0, k1); + z2 = _mm_unpacklo_epi16(t0, t1); + z3 = _mm_unpackhi_epi16(t0, t1); + z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1)); + z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1)); + _mm_store_si128((__m128i*)(dst + i), z0); + _mm_store_si128((__m128i*)(dst + i + 4), z1); + _mm_store_si128((__m128i*)(dst + i + 8), z2); + _mm_store_si128((__m128i*)(dst + i + 12), z3); + } + } + } + else if( _ksize == 5 ) + { + if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) + for( ; i <= width - 16; i += 16, src += 16 ) + { + __m128i x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_si128((__m128i*)(src - cn*2)); + 
x1 = _mm_loadu_si128((__m128i*)src); + x2 = _mm_loadu_si128((__m128i*)(src + cn*2)); + y0 = _mm_unpackhi_epi8(x0, z); + x0 = _mm_unpacklo_epi8(x0, z); + y1 = _mm_unpackhi_epi8(x1, z); + x1 = _mm_unpacklo_epi8(x1, z); + y2 = _mm_unpackhi_epi8(x2, z); + x2 = _mm_unpacklo_epi8(x2, z); + x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1))); + y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1))); + _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16)); + _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16)); + _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16)); + _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16)); + } + else + { + __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0), + k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0), + k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0); + k0 = _mm_packs_epi32(k0, k0); + k1 = _mm_packs_epi32(k1, k1); + k2 = _mm_packs_epi32(k2, k2); + + for( ; i <= width - 16; i += 16, src += 16 ) + { + __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3; + x0 = _mm_loadu_si128((__m128i*)(src - cn)); + x1 = _mm_loadu_si128((__m128i*)src); + x2 = _mm_loadu_si128((__m128i*)(src + cn)); + y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z)); + x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z)); + y1 = _mm_unpackhi_epi8(x1, z); + x1 = _mm_unpacklo_epi8(x1, z); + + t1 = _mm_mulhi_epi16(x1, k0); + t0 = _mm_mullo_epi16(x1, k0); + x2 = _mm_mulhi_epi16(x0, k1); + x0 = _mm_mullo_epi16(x0, k1); + z0 = _mm_unpacklo_epi16(t0, t1); + z1 = _mm_unpackhi_epi16(t0, t1); + z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2)); + z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2)); + + t1 = _mm_mulhi_epi16(y1, k0); + t0 = _mm_mullo_epi16(y1, k0); + y1 = _mm_mulhi_epi16(y0, k1); + y0 = _mm_mullo_epi16(y0, k1); + z2 = _mm_unpacklo_epi16(t0, t1); + z3 = _mm_unpackhi_epi16(t0, t1); + z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1)); + z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1)); + + x0 = _mm_loadu_si128((__m128i*)(src - cn*2)); + x1 = _mm_loadu_si128((__m128i*)(src + cn*2)); + y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z)); + y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z)); + + t1 = _mm_mulhi_epi16(y0, k2); + t0 = _mm_mullo_epi16(y0, k2); + y0 = _mm_mullo_epi16(y1, k2); + y1 = _mm_mulhi_epi16(y1, k2); + z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1)); + z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1)); + z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1)); + z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1)); + + _mm_store_si128((__m128i*)(dst + i), z0); + _mm_store_si128((__m128i*)(dst + i + 4), z1); + _mm_store_si128((__m128i*)(dst + i + 8), z2); + _mm_store_si128((__m128i*)(dst + i + 12), z3); + } + } + } + } + else + { + if( _ksize == 3 ) + { + if( kx[0] == 0 && kx[1] == 1 ) + for( ; i <= width - 16; i += 16, src += 16 ) + { + __m128i x0, x1, y0; + x0 = _mm_loadu_si128((__m128i*)(src + cn)); + x1 = _mm_loadu_si128((__m128i*)(src - cn)); + y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z)); + x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z)); + _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16)); + _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16)); + _mm_store_si128((__m128i*)(dst + i + 8), 
_mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16)); + _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16)); + } + else + { + __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0); + k1 = _mm_packs_epi32(k1, k1); + + for( ; i <= width - 16; i += 16, src += 16 ) + { + __m128i x0, x1, y0, y1, z0, z1, z2, z3; + x0 = _mm_loadu_si128((__m128i*)(src + cn)); + x1 = _mm_loadu_si128((__m128i*)(src - cn)); + y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z)); + x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z)); + + x1 = _mm_mulhi_epi16(x0, k1); + x0 = _mm_mullo_epi16(x0, k1); + z0 = _mm_unpacklo_epi16(x0, x1); + z1 = _mm_unpackhi_epi16(x0, x1); + + y1 = _mm_mulhi_epi16(y0, k1); + y0 = _mm_mullo_epi16(y0, k1); + z2 = _mm_unpacklo_epi16(y0, y1); + z3 = _mm_unpackhi_epi16(y0, y1); + _mm_store_si128((__m128i*)(dst + i), z0); + _mm_store_si128((__m128i*)(dst + i + 4), z1); + _mm_store_si128((__m128i*)(dst + i + 8), z2); + _mm_store_si128((__m128i*)(dst + i + 12), z3); + } + } + } + else if( _ksize == 5 ) + { + __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0), + k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0), + k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0); + k0 = _mm_packs_epi32(k0, k0); + k1 = _mm_packs_epi32(k1, k1); + k2 = _mm_packs_epi32(k2, k2); + + for( ; i <= width - 16; i += 16, src += 16 ) + { + __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3; + x0 = _mm_loadu_si128((__m128i*)(src + cn)); + x2 = _mm_loadu_si128((__m128i*)(src - cn)); + y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z)); + x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z)); + + x2 = _mm_mulhi_epi16(x0, k1); + x0 = _mm_mullo_epi16(x0, k1); + z0 = _mm_unpacklo_epi16(x0, x2); + z1 = _mm_unpackhi_epi16(x0, x2); + y1 = _mm_mulhi_epi16(y0, k1); + y0 = _mm_mullo_epi16(y0, k1); + z2 = _mm_unpacklo_epi16(y0, y1); + z3 = _mm_unpackhi_epi16(y0, y1); + + x0 = _mm_loadu_si128((__m128i*)(src + cn*2)); + x1 = _mm_loadu_si128((__m128i*)(src - cn*2)); + y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z)); + y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z)); + + t1 = _mm_mulhi_epi16(y0, k2); + t0 = _mm_mullo_epi16(y0, k2); + y0 = _mm_mullo_epi16(y1, k2); + y1 = _mm_mulhi_epi16(y1, k2); + z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1)); + z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1)); + z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1)); + z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1)); + + _mm_store_si128((__m128i*)(dst + i), z0); + _mm_store_si128((__m128i*)(dst + i + 4), z1); + _mm_store_si128((__m128i*)(dst + i + 8), z2); + _mm_store_si128((__m128i*)(dst + i + 12), z3); + } + } + } + + src -= (_ksize/2)*cn; + kx -= _ksize/2; + for( ; i <= width - 4; i += 4, src += 4 ) + { + __m128i f, s0 = z, x0, x1; + + for( k = j = 0; k < _ksize; k++, j += cn ) + { + f = _mm_cvtsi32_si128(kx[k]); + f = _mm_shuffle_epi32(f, 0); + f = _mm_packs_epi32(f, f); + + x0 = _mm_cvtsi32_si128(*(const int*)(src + j)); + x0 = _mm_unpacklo_epi8(x0, z); + x1 = _mm_mulhi_epi16(x0, f); + x0 = _mm_mullo_epi16(x0, f); + s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1)); + } + _mm_store_si128((__m128i*)(dst + i), s0); + } + + return i; + } + + Mat kernel; + int symmetryType; + bool smallValues; +}; + + +struct SymmColumnVec_32s8u +{ + SymmColumnVec_32s8u() { symmetryType=0; } + SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta) + { + symmetryType = 
_symmetryType; + _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); + delta = (float)(_delta/(1 << _bits)); + CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); + } + + int operator()(const uchar** _src, uchar* dst, int width) const + { + int ksize2 = (kernel.rows + kernel.cols - 1)/2; + const float* ky = (const float*)kernel.data + ksize2; + int i = 0, k; + bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; + const int** src = (const int**)_src; + const __m128i *S, *S2; + __m128 d4 = _mm_set1_ps(delta); + + if( symmetrical ) + { + for( ; i <= width - 16; i += 16 ) + { + __m128 f = _mm_load_ss(ky); + f = _mm_shuffle_ps(f, f, 0); + __m128 s0, s1, s2, s3; + __m128i x0, x1; + S = (const __m128i*)(src[0] + i); + s0 = _mm_cvtepi32_ps(_mm_load_si128(S)); + s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1)); + s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); + s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4); + s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2)); + s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3)); + s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4); + s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4); + + for( k = 1; k <= ksize2; k++ ) + { + S = (const __m128i*)(src[k] + i); + S2 = (const __m128i*)(src[-k] + i); + f = _mm_load_ss(ky+k); + f = _mm_shuffle_ps(f, f, 0); + x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2)); + x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1)); + s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); + s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); + x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2)); + x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3)); + s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); + s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); + } + + x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); + x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3)); + x0 = _mm_packus_epi16(x0, x1); + _mm_storeu_si128((__m128i*)(dst + i), x0); + } + + for( ; i <= width - 4; i += 4 ) + { + __m128 f = _mm_load_ss(ky); + f = _mm_shuffle_ps(f, f, 0); + __m128i x0; + __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i))); + s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); + + for( k = 1; k <= ksize2; k++ ) + { + S = (const __m128i*)(src[k] + i); + S2 = (const __m128i*)(src[-k] + i); + f = _mm_load_ss(ky+k); + f = _mm_shuffle_ps(f, f, 0); + x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2)); + s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); + } + + x0 = _mm_cvtps_epi32(s0); + x0 = _mm_packs_epi32(x0, x0); + x0 = _mm_packus_epi16(x0, x0); + *(int*)(dst + i) = _mm_cvtsi128_si32(x0); + } + } + else + { + for( ; i <= width - 16; i += 16 ) + { + __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4; + __m128i x0, x1; + + for( k = 1; k <= ksize2; k++ ) + { + S = (const __m128i*)(src[k] + i); + S2 = (const __m128i*)(src[-k] + i); + f = _mm_load_ss(ky+k); + f = _mm_shuffle_ps(f, f, 0); + x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2)); + x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1)); + s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); + s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); + x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2)); + x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3)); + s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); + s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); + } + + x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); + x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), 
_mm_cvtps_epi32(s3)); + x0 = _mm_packus_epi16(x0, x1); + _mm_storeu_si128((__m128i*)(dst + i), x0); + } + + for( ; i <= width - 4; i += 4 ) + { + __m128 f, s0 = d4; + __m128i x0; + + for( k = 1; k <= ksize2; k++ ) + { + S = (const __m128i*)(src[k] + i); + S2 = (const __m128i*)(src[-k] + i); + f = _mm_load_ss(ky+k); + f = _mm_shuffle_ps(f, f, 0); + x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2)); + s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); + } + + x0 = _mm_cvtps_epi32(s0); + x0 = _mm_packs_epi32(x0, x0); + x0 = _mm_packus_epi16(x0, x0); + *(int*)(dst + i) = _mm_cvtsi128_si32(x0); + } + } + + return i; + } + + int symmetryType; + float delta; + Mat kernel; +}; + + +struct SymmColumnSmallVec_32s16s +{ + SymmColumnSmallVec_32s16s() { symmetryType=0; } + SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta) + { + symmetryType = _symmetryType; + _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); + delta = (float)(_delta/(1 << _bits)); + CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); + } + + int operator()(const uchar** _src, uchar* _dst, int width) const + { + int ksize2 = (kernel.rows + kernel.cols - 1)/2; + const float* ky = (const float*)kernel.data + ksize2; + int i = 0; + bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; + const int** src = (const int**)_src; + const int *S0 = src[-1], *S1 = src[0], *S2 = src[1]; + short* dst = (short*)_dst; + __m128 df4 = _mm_set1_ps(delta); + __m128i d4 = _mm_cvtps_epi32(df4); + + if( symmetrical ) + { + if( ky[0] == 2 && ky[1] == 1 ) + { + for( ; i <= width - 8; i += 8 ) + { + __m128i s0, s1, s2, s3, s4, s5; + s0 = _mm_load_si128((__m128i*)(S0 + i)); + s1 = _mm_load_si128((__m128i*)(S0 + i + 4)); + s2 = _mm_load_si128((__m128i*)(S1 + i)); + s3 = _mm_load_si128((__m128i*)(S1 + i + 4)); + s4 = _mm_load_si128((__m128i*)(S2 + i)); + s5 = _mm_load_si128((__m128i*)(S2 + i + 4)); + s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2))); + s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3))); + s0 = _mm_add_epi32(s0, d4); + s1 = _mm_add_epi32(s1, d4); + _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1)); + } + } + else if( ky[0] == -2 && ky[1] == 1 ) + { + for( ; i <= width - 8; i += 8 ) + { + __m128i s0, s1, s2, s3, s4, s5; + s0 = _mm_load_si128((__m128i*)(S0 + i)); + s1 = _mm_load_si128((__m128i*)(S0 + i + 4)); + s2 = _mm_load_si128((__m128i*)(S1 + i)); + s3 = _mm_load_si128((__m128i*)(S1 + i + 4)); + s4 = _mm_load_si128((__m128i*)(S2 + i)); + s5 = _mm_load_si128((__m128i*)(S2 + i + 4)); + s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2))); + s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3))); + s0 = _mm_add_epi32(s0, d4); + s1 = _mm_add_epi32(s1, d4); + _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1)); + } + } + else + { + __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]); + for( ; i <= width - 8; i += 8 ) + { + __m128 s0, s1; + s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i))); + s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4))); + s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4); + s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4); + __m128i x0, x1; + x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)), + _mm_load_si128((__m128i*)(S2 + i))); + x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)), + _mm_load_si128((__m128i*)(S2 + i + 4))); + s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1)); + s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1)); + x0 = 
_mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); + _mm_storeu_si128((__m128i*)(dst + i), x0); + } + } + } + else + { + if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) + { + if( ky[1] < 0 ) + std::swap(S0, S2); + for( ; i <= width - 8; i += 8 ) + { + __m128i s0, s1, s2, s3; + s0 = _mm_load_si128((__m128i*)(S2 + i)); + s1 = _mm_load_si128((__m128i*)(S2 + i + 4)); + s2 = _mm_load_si128((__m128i*)(S0 + i)); + s3 = _mm_load_si128((__m128i*)(S0 + i + 4)); + s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4); + s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4); + _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1)); + } + } + else + { + __m128 k1 = _mm_set1_ps(ky[1]); + for( ; i <= width - 8; i += 8 ) + { + __m128 s0 = df4, s1 = df4; + __m128i x0, x1; + x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i)), + _mm_load_si128((__m128i*)(S2 + i))); + x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)), + _mm_load_si128((__m128i*)(S2 + i + 4))); + s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1)); + s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1)); + x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); + _mm_storeu_si128((__m128i*)(dst + i), x0); + } + } + } + + return i; + } + + int symmetryType; + float delta; + Mat kernel; +}; + + +/////////////////////////////////////// 32f ////////////////////////////////// + +struct RowVec_32f +{ + RowVec_32f() {} + RowVec_32f( const Mat& _kernel ) + { + kernel = _kernel; + } + + int operator()(const uchar* _src, uchar* _dst, int width, int cn) const + { + int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; + float* dst = (float*)_dst; + const float* _kx = (const float*)kernel.data; + width *= cn; + + for( ; i <= width - 8; i += 8 ) + { + const float* src = (const float*)_src + i; + __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1; + for( k = 0; k < _ksize; k++, src += cn ) + { + f = _mm_load_ss(_kx+k); + f = _mm_shuffle_ps(f, f, 0); + + x0 = _mm_loadu_ps(src); + x1 = _mm_loadu_ps(src + 4); + s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); + s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); + } + _mm_store_ps(dst + i, s0); + _mm_store_ps(dst + i + 4, s1); + } + return i; + } + + Mat kernel; +}; + + +struct SymmRowSmallVec_32f +{ + SymmRowSmallVec_32f() {} + SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType ) + { + kernel = _kernel; + symmetryType = _symmetryType; + } + + int operator()(const uchar* _src, uchar* _dst, int width, int cn) const + { + int i = 0, _ksize = kernel.rows + kernel.cols - 1; + float* dst = (float*)_dst; + const float* src = (const float*)_src + (_ksize/2)*cn; + bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; + const float* kx = (const float*)kernel.data + _ksize/2; + width *= cn; + + if( symmetrical ) + { + if( _ksize == 1 ) + return 0; + if( _ksize == 3 ) + { + if( kx[0] == 2 && kx[1] == 1 ) + for( ; i <= width - 8; i += 8, src += 8 ) + { + __m128 x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_ps(src - cn); + x1 = _mm_loadu_ps(src); + x2 = _mm_loadu_ps(src + cn); + y0 = _mm_loadu_ps(src - cn + 4); + y1 = _mm_loadu_ps(src + 4); + y2 = _mm_loadu_ps(src + cn + 4); + x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2)); + y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2)); + _mm_store_ps(dst + i, x0); + _mm_store_ps(dst + i + 4, y0); + } + else if( kx[0] == -2 && kx[1] == 1 ) + for( ; i <= width - 8; i += 8, src += 8 ) + { + __m128 x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_ps(src - cn); + x1 = _mm_loadu_ps(src); + x2 = _mm_loadu_ps(src + cn); + y0 = _mm_loadu_ps(src - cn + 4); + y1 = 
_mm_loadu_ps(src + 4); + y2 = _mm_loadu_ps(src + cn + 4); + x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1))); + y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1))); + _mm_store_ps(dst + i, x0); + _mm_store_ps(dst + i + 4, y0); + } + else + { + __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]); + for( ; i <= width - 8; i += 8, src += 8 ) + { + __m128 x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_ps(src - cn); + x1 = _mm_loadu_ps(src); + x2 = _mm_loadu_ps(src + cn); + y0 = _mm_loadu_ps(src - cn + 4); + y1 = _mm_loadu_ps(src + 4); + y2 = _mm_loadu_ps(src + cn + 4); + + x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1); + y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1); + x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0)); + y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0)); + _mm_store_ps(dst + i, x0); + _mm_store_ps(dst + i + 4, y0); + } + } + } + else if( _ksize == 5 ) + { + if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) + for( ; i <= width - 8; i += 8, src += 8 ) + { + __m128 x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_ps(src - cn*2); + x1 = _mm_loadu_ps(src); + x2 = _mm_loadu_ps(src + cn*2); + y0 = _mm_loadu_ps(src - cn*2 + 4); + y1 = _mm_loadu_ps(src + 4); + y2 = _mm_loadu_ps(src + cn*2 + 4); + x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1))); + y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1))); + _mm_store_ps(dst + i, x0); + _mm_store_ps(dst + i + 4, y0); + } + else + { + __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]); + for( ; i <= width - 8; i += 8, src += 8 ) + { + __m128 x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_ps(src - cn); + x1 = _mm_loadu_ps(src); + x2 = _mm_loadu_ps(src + cn); + y0 = _mm_loadu_ps(src - cn + 4); + y1 = _mm_loadu_ps(src + 4); + y2 = _mm_loadu_ps(src + cn + 4); + + x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1); + y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1); + x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0)); + y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0)); + + x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2)); + y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4)); + x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2)); + y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2)); + + _mm_store_ps(dst + i, x0); + _mm_store_ps(dst + i + 4, y0); + } + } + } + } + else + { + if( _ksize == 3 ) + { + if( kx[0] == 0 && kx[1] == 1 ) + for( ; i <= width - 8; i += 8, src += 8 ) + { + __m128 x0, x2, y0, y2; + x0 = _mm_loadu_ps(src + cn); + x2 = _mm_loadu_ps(src - cn); + y0 = _mm_loadu_ps(src + cn + 4); + y2 = _mm_loadu_ps(src - cn + 4); + x0 = _mm_sub_ps(x0, x2); + y0 = _mm_sub_ps(y0, y2); + _mm_store_ps(dst + i, x0); + _mm_store_ps(dst + i + 4, y0); + } + else + { + __m128 k1 = _mm_set1_ps(kx[1]); + for( ; i <= width - 8; i += 8, src += 8 ) + { + __m128 x0, x2, y0, y2; + x0 = _mm_loadu_ps(src + cn); + x2 = _mm_loadu_ps(src - cn); + y0 = _mm_loadu_ps(src + cn + 4); + y2 = _mm_loadu_ps(src - cn + 4); + + x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1); + y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1); + _mm_store_ps(dst + i, x0); + _mm_store_ps(dst + i + 4, y0); + } + } + } + else if( _ksize == 5 ) + { + __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]); + for( ; i <= width - 8; i += 8, src += 8 ) + { + __m128 x0, x2, y0, y2; + x0 = _mm_loadu_ps(src + cn); + x2 = _mm_loadu_ps(src - cn); + y0 = _mm_loadu_ps(src + cn + 4); + y2 = _mm_loadu_ps(src - cn + 4); + + x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1); + y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1); + + x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2)); + y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 
+ 4)); + x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2)); + y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2)); + + _mm_store_ps(dst + i, x0); + _mm_store_ps(dst + i + 4, y0); + } + } + } + + return i; + } + + Mat kernel; + int symmetryType; +}; + + +struct SymmColumnVec_32f +{ + SymmColumnVec_32f() { symmetryType=0; } + SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) + { + symmetryType = _symmetryType; + kernel = _kernel; + delta = (float)_delta; + CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); + } + + int operator()(const uchar** _src, uchar* _dst, int width) const + { + int ksize2 = (kernel.rows + kernel.cols - 1)/2; + const float* ky = (const float*)kernel.data + ksize2; + int i = 0, k; + bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; + const float** src = (const float**)_src; + const float *S, *S2; + float* dst = (float*)_dst; + __m128 d4 = _mm_set1_ps(delta); + + if( symmetrical ) + { + for( ; i <= width - 16; i += 16 ) + { + __m128 f = _mm_load_ss(ky); + f = _mm_shuffle_ps(f, f, 0); + __m128 s0, s1, s2, s3; + __m128 x0, x1; + S = src[0] + i; + s0 = _mm_load_ps(S); + s1 = _mm_load_ps(S+4); + s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); + s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4); + s2 = _mm_load_ps(S+8); + s3 = _mm_load_ps(S+12); + s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4); + s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4); + + for( k = 1; k <= ksize2; k++ ) + { + S = src[k] + i; + S2 = src[-k] + i; + f = _mm_load_ss(ky+k); + f = _mm_shuffle_ps(f, f, 0); + x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2)); + x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); + s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); + s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); + x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); + x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); + s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); + s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); + } + + _mm_storeu_ps(dst + i, s0); + _mm_storeu_ps(dst + i + 4, s1); + _mm_storeu_ps(dst + i + 8, s2); + _mm_storeu_ps(dst + i + 12, s3); + } + + for( ; i <= width - 4; i += 4 ) + { + __m128 f = _mm_load_ss(ky); + f = _mm_shuffle_ps(f, f, 0); + __m128 x0, s0 = _mm_load_ps(src[0] + i); + s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); + + for( k = 1; k <= ksize2; k++ ) + { + f = _mm_load_ss(ky+k); + f = _mm_shuffle_ps(f, f, 0); + S = src[k] + i; + S2 = src[-k] + i; + x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); + s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); + } + + _mm_storeu_ps(dst + i, s0); + } + } + else + { + for( ; i <= width - 16; i += 16 ) + { + __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4; + __m128 x0, x1; + S = src[0] + i; + + for( k = 1; k <= ksize2; k++ ) + { + S = src[k] + i; + S2 = src[-k] + i; + f = _mm_load_ss(ky+k); + f = _mm_shuffle_ps(f, f, 0); + x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2)); + x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); + s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); + s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); + x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); + x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); + s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); + s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); + } + + _mm_storeu_ps(dst + i, s0); + _mm_storeu_ps(dst + i + 4, s1); + _mm_storeu_ps(dst + i + 8, s2); + _mm_storeu_ps(dst + i + 12, s3); + } + + for( ; i <= width - 4; i += 4 ) + { + __m128 f, x0, s0 = d4; + + for( k = 1; k <= ksize2; k++ ) + { + f = _mm_load_ss(ky+k); + f = _mm_shuffle_ps(f, f, 0); + S = src[k] + i; + S2 = src[-k] + i; + x0 = 
_mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); + s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); + } + + _mm_storeu_ps(dst + i, s0); + } + } + + return i; + } + + int symmetryType; + float delta; + Mat kernel; +}; + + +struct SymmColumnSmallVec_32f +{ + SymmColumnSmallVec_32f() { symmetryType=0; } + SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) + { + symmetryType = _symmetryType; + kernel = _kernel; + delta = (float)_delta; + CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); + } + + int operator()(const uchar** _src, uchar* _dst, int width) const + { + int ksize2 = (kernel.rows + kernel.cols - 1)/2; + const float* ky = (const float*)kernel.data + ksize2; + int i = 0; + bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; + const float** src = (const float**)_src; + const float *S0 = src[-1], *S1 = src[0], *S2 = src[1]; + float* dst = (float*)_dst; + __m128 d4 = _mm_set1_ps(delta); + + if( symmetrical ) + { + if( ky[0] == 2 && ky[1] == 1 ) + { + for( ; i <= width - 8; i += 8 ) + { + __m128 s0, s1, s2, s3, s4, s5; + s0 = _mm_load_ps(S0 + i); + s1 = _mm_load_ps(S0 + i + 4); + s2 = _mm_load_ps(S1 + i); + s3 = _mm_load_ps(S1 + i + 4); + s4 = _mm_load_ps(S2 + i); + s5 = _mm_load_ps(S2 + i + 4); + s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2))); + s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3))); + s0 = _mm_add_ps(s0, d4); + s1 = _mm_add_ps(s1, d4); + _mm_storeu_ps(dst + i, s0); + _mm_storeu_ps(dst + i + 4, s1); + } + } + else if( ky[0] == -2 && ky[1] == 1 ) + { + for( ; i <= width - 8; i += 8 ) + { + __m128 s0, s1, s2, s3, s4, s5; + s0 = _mm_load_ps(S0 + i); + s1 = _mm_load_ps(S0 + i + 4); + s2 = _mm_load_ps(S1 + i); + s3 = _mm_load_ps(S1 + i + 4); + s4 = _mm_load_ps(S2 + i); + s5 = _mm_load_ps(S2 + i + 4); + s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2))); + s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3))); + s0 = _mm_add_ps(s0, d4); + s1 = _mm_add_ps(s1, d4); + _mm_storeu_ps(dst + i, s0); + _mm_storeu_ps(dst + i + 4, s1); + } + } + else + { + __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]); + for( ; i <= width - 8; i += 8 ) + { + __m128 s0, s1, x0, x1; + s0 = _mm_load_ps(S1 + i); + s1 = _mm_load_ps(S1 + i + 4); + s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4); + s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4); + x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i)); + x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4)); + s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1)); + s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1)); + _mm_storeu_ps(dst + i, s0); + _mm_storeu_ps(dst + i + 4, s1); + } + } + } + else + { + if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) + { + if( ky[1] < 0 ) + std::swap(S0, S2); + for( ; i <= width - 8; i += 8 ) + { + __m128 s0, s1, s2, s3; + s0 = _mm_load_ps(S2 + i); + s1 = _mm_load_ps(S2 + i + 4); + s2 = _mm_load_ps(S0 + i); + s3 = _mm_load_ps(S0 + i + 4); + s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4); + s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4); + _mm_storeu_ps(dst + i, s0); + _mm_storeu_ps(dst + i + 4, s1); + } + } + else + { + __m128 k1 = _mm_set1_ps(ky[1]); + for( ; i <= width - 8; i += 8 ) + { + __m128 s0 = d4, s1 = d4, x0, x1; + x0 = _mm_sub_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i)); + x1 = _mm_sub_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4)); + s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1)); + s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1)); + _mm_storeu_ps(dst + i, s0); + _mm_storeu_ps(dst + i + 4, s1); + } + } + } + + return i; + } + + int symmetryType; + float delta; + Mat kernel; 
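+    /* Note on the src[-k]..src[k] indexing used by the column filters above:
+       for a symmetrical (or asymmetrical) kernel the rows at +k and -k share
+       one coefficient, so their sum (or difference) is formed first and
+       multiplied once, halving the number of multiplications.  A scalar
+       sketch of the symmetrical case:
+
+           for( i = 0; i < width; i++ )
+           {
+               float s = ky[0]*src[0][i] + delta;
+               for( k = 1; k <= ksize2; k++ )
+                   s += ky[k]*(src[k][i] + src[-k][i]);
+               dst[i] = s;
+           }
+    */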
+};
+
+
+/////////////////////////////// non-separable filters ///////////////////////////////
+
+///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
+
+struct FilterVec_8u
+{
+    FilterVec_8u() {}
+    FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
+    {
+        Mat kernel;
+        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
+        delta = (float)(_delta/(1 << _bits));
+        vector<Point> coords;
+        preprocess2DKernel(kernel, coords, coeffs);
+        _nz = (int)coords.size();
+    }
+
+    int operator()(const uchar** src, uchar* dst, int width) const
+    {
+        const float* kf = (const float*)&coeffs[0];
+        int i = 0, k, nz = _nz;
+        __m128 d4 = _mm_set1_ps(delta);
+
+        for( ; i <= width - 16; i += 16 )
+        {
+            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+            __m128i x0, x1, z = _mm_setzero_si128();
+
+            for( k = 0; k < nz; k++ )
+            {
+                __m128 f = _mm_load_ss(kf+k), t0, t1;
+                f = _mm_shuffle_ps(f, f, 0);
+
+                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
+                x1 = _mm_unpackhi_epi8(x0, z);
+                x0 = _mm_unpacklo_epi8(x0, z);
+
+                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
+                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
+                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
+                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
+
+                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
+                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
+                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
+                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
+            }
+
+            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
+            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
+            x0 = _mm_packus_epi16(x0, x1);
+            _mm_storeu_si128((__m128i*)(dst + i), x0);
+        }
+
+        for( ; i <= width - 4; i += 4 )
+        {
+            __m128 s0 = d4;
+            __m128i x0, z = _mm_setzero_si128();
+
+            for( k = 0; k < nz; k++ )
+            {
+                __m128 f = _mm_load_ss(kf+k), t0;
+                f = _mm_shuffle_ps(f, f, 0);
+
+                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
+                x0 = _mm_unpacklo_epi8(x0, z);
+                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
+                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
+            }
+
+            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
+            x0 = _mm_packus_epi16(x0, x0);
+            *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
+        }
+
+        return i;
+    }
+
+    int _nz;
+    vector<uchar> coeffs;
+    float delta;
+};
+
+
+struct FilterVec_8u16s
+{
+    FilterVec_8u16s() {}
+    FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
+    {
+        Mat kernel;
+        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
+        delta = (float)(_delta/(1 << _bits));
+        vector<Point> coords;
+        preprocess2DKernel(kernel, coords, coeffs);
+        _nz = (int)coords.size();
+    }
+
+    int operator()(const uchar** src, uchar* _dst, int width) const
+    {
+        const float* kf = (const float*)&coeffs[0];
+        short* dst = (short*)_dst;
+        int i = 0, k, nz = _nz;
+        __m128 d4 = _mm_set1_ps(delta);
+
+        for( ; i <= width - 16; i += 16 )
+        {
+            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+            __m128i x0, x1, z = _mm_setzero_si128();
+
+            for( k = 0; k < nz; k++ )
+            {
+                __m128 f = _mm_load_ss(kf+k), t0, t1;
+                f = _mm_shuffle_ps(f, f, 0);
+
+                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
+                x1 = _mm_unpackhi_epi8(x0, z);
+                x0 = _mm_unpacklo_epi8(x0, z);
+
+                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
+                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
+                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
+                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
+
+                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
+                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
+                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
+                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
+            }
+
+            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
+            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
+            _mm_storeu_si128((__m128i*)(dst + i), x0);
+            _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
+        }
+
+        for( ; i <= width - 4; i += 4 )
+        {
+            __m128 s0 = d4;
+            __m128i x0, z = _mm_setzero_si128();
+
+            for( k = 0; k < nz; k++ )
+            {
+                __m128 f = _mm_load_ss(kf+k), t0;
+                f = _mm_shuffle_ps(f, f, 0);
+
+                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
+                x0 = _mm_unpacklo_epi8(x0, z);
+                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
+                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
+            }
+
+            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
+            _mm_storel_epi64((__m128i*)(dst + i), x0);
+        }
+
+        return i;
+    }
+
+    int _nz;
+    vector<uchar> coeffs;
+    float delta;
+};
+
+
+struct FilterVec_32f
+{
+    FilterVec_32f() {}
+    FilterVec_32f(const Mat& _kernel, int, double _delta)
+    {
+        delta = (float)_delta;
+        vector<Point> coords;
+        preprocess2DKernel(_kernel, coords, coeffs);
+        _nz = (int)coords.size();
+    }
+
+    int operator()(const uchar** _src, uchar* _dst, int width) const
+    {
+        const float* kf = (const float*)&coeffs[0];
+        const float** src = (const float**)_src;
+        float* dst = (float*)_dst;
+        int i = 0, k, nz = _nz;
+        __m128 d4 = _mm_set1_ps(delta);
+
+        for( ; i <= width - 16; i += 16 )
+        {
+            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+
+            for( k = 0; k < nz; k++ )
+            {
+                __m128 f = _mm_load_ss(kf+k), t0, t1;
+                f = _mm_shuffle_ps(f, f, 0);
+                const float* S = src[k] + i;
+
+                t0 = _mm_loadu_ps(S);
+                t1 = _mm_loadu_ps(S + 4);
+                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
+                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
+
+                t0 = _mm_loadu_ps(S + 8);
+                t1 = _mm_loadu_ps(S + 12);
+                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
+                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
+            }
+
+            _mm_storeu_ps(dst + i, s0);
+            _mm_storeu_ps(dst + i + 4, s1);
+            _mm_storeu_ps(dst + i + 8, s2);
+            _mm_storeu_ps(dst + i + 12, s3);
+        }
+
+        for( ; i <= width - 4; i += 4 )
+        {
+            __m128 s0 = d4;
+
+            for( k = 0; k < nz; k++ )
+            {
+                __m128 f = _mm_load_ss(kf+k), t0;
+                f = _mm_shuffle_ps(f, f, 0);
+                t0 = _mm_loadu_ps(src[k] + i);
+                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
+            }
+            _mm_storeu_ps(dst + i, s0);
+        }
+
+        return i;
+    }
+
+    int _nz;
+    vector<uchar> coeffs;
+    float delta;
+};
+
+
+#else
+
+typedef RowNoVec RowVec_8u32s;
+typedef RowNoVec RowVec_32f;
+typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
+typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
+typedef ColumnNoVec SymmColumnVec_32s8u;
+typedef ColumnNoVec SymmColumnVec_32f;
+typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
+typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
+typedef FilterNoVec FilterVec_8u;
+typedef FilterNoVec FilterVec_8u16s;
+typedef FilterNoVec FilterVec_32f;
+
+#endif
+
+
+template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
+{
+    RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
+    {
+        if( _kernel.isContinuous() )
+            kernel = _kernel;
+        else
+            _kernel.copyTo(kernel);
+        anchor = _anchor;
+        ksize = kernel.rows + kernel.cols - 1;
+        CV_Assert( kernel.type() == DataType<DT>
+
+
+template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
+{
+    RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
+    {
+        if( _kernel.isContinuous() )
+            kernel = _kernel;
+        else
+            _kernel.copyTo(kernel);
+        anchor = _anchor;
+        ksize = kernel.rows + kernel.cols - 1;
+        CV_Assert( kernel.type() == DataType<DT>::type &&
+                   (kernel.rows == 1 || kernel.cols == 1));
+        vecOp = _vecOp;
+    }
+
+    void operator()(const uchar* src, uchar* dst, int width, int cn)
+    {
+        int _ksize = ksize;
+        const DT* kx = (const DT*)kernel.data;
+        const ST* S;
+        DT* D = (DT*)dst;
+        int i, k;
+
+        i = vecOp(src, dst, width, cn);
+        width *= cn;
+
+        for( ; i <= width - 4; i += 4 )
+        {
+            S = (const ST*)src + i;
+            DT f = kx[0];
+            DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];
+
+            for( k = 1; k < _ksize; k++ )
+            {
+                S += cn;
+                f = kx[k];
+                s0 += f*S[0]; s1 += f*S[1];
+                s2 += f*S[2]; s3 += f*S[3];
+            }
+
+            D[i] = s0; D[i+1] = s1;
+            D[i+2] = s2; D[i+3] = s3;
+        }
+
+        for( ; i < width; i++ )
+        {
+            S = (const ST*)src + i;
+            DT s0 = kx[0]*S[0];
+            for( k = 1; k < _ksize; k++ )
+            {
+                S += cn;
+                s0 += kx[k]*S[0];
+            }
+            D[i] = s0;
+        }
+    }
+
+    Mat kernel;
+    VecOp vecOp;
+};
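+
+/*
+  RowFilter computes a plain 1D correlation along a row that FilterEngine has
+  already padded according to the border mode, one output per source pixel.
+  With cn == 1 and the kernel kx = {1, 2, 1}, for example, each output is
+
+      D[i] = 1*S[i] + 2*S[i+1] + 1*S[i+2]
+
+  where S points at the padded row, so the result stays centered on the
+  source pixel once the anchor offset is taken into account.
+*/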
+
+
+template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
+    public RowFilter<ST, DT, VecOp>
+{
+    SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
+                        const VecOp& _vecOp = VecOp())
+        : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
+    {
+        symmetryType = _symmetryType;
+        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
+    }
+
+    void operator()(const uchar* src, uchar* dst, int width, int cn)
+    {
+        int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
+        const DT* kx = (const DT*)this->kernel.data + ksize2;
+        bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
+        DT* D = (DT*)dst;
+        int i = this->vecOp(src, dst, width, cn), j, k;
+        const ST* S = (const ST*)src + i + ksize2n;
+        width *= cn;
+
+        if( symmetrical )
+        {
+            if( this->ksize == 1 && kx[0] == 1 )
+            {
+                for( ; i <= width - 2; i += 2 )
+                {
+                    DT s0 = S[i], s1 = S[i+1];
+                    D[i] = s0; D[i+1] = s1;
+                }
+                S += i;
+            }
+            else if( this->ksize == 3 )
+            {
+                if( kx[0] == 2 && kx[1] == 1 )
+                    for( ; i <= width - 2; i += 2, S += 2 )
+                    {
+                        DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
+                        D[i] = s0; D[i+1] = s1;
+                    }
+                else if( kx[0] == -2 && kx[1] == 1 )
+                    for( ; i <= width - 2; i += 2, S += 2 )
+                    {
+                        DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
+                        D[i] = s0; D[i+1] = s1;
+                    }
+                else
+                {
+                    DT k0 = kx[0], k1 = kx[1];
+                    for( ; i <= width - 2; i += 2, S += 2 )
+                    {
+                        DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
+                        D[i] = s0; D[i+1] = s1;
+                    }
+                }
+            }
+            else if( this->ksize == 5 )
+            {
+                DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
+                if( k0 == -2 && k1 == 0 && k2 == 1 )
+                    for( ; i <= width - 2; i += 2, S += 2 )
+                    {
+                        DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
+                        DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
+                        D[i] = s0; D[i+1] = s1;
+                    }
+                else
+                    for( ; i <= width - 2; i += 2, S += 2 )
+                    {
+                        DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
+                        DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
+                        D[i] = s0; D[i+1] = s1;
+                    }
+            }
+
+            for( ; i < width; i++, S++ )
+            {
+                DT s0 = kx[0]*S[0];
+                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
+                    s0 += kx[k]*(S[j] + S[-j]);
+                D[i] = s0;
+            }
+        }
+        else
+        {
+            if( this->ksize == 3 )
+            {
+                if( kx[0] == 0 && kx[1] == 1 )
+                    for( ; i <= width - 2; i += 2, S += 2 )
+                    {
+                        DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
+                        D[i] = s0; D[i+1] = s1;
+                    }
+                else
+                {
+                    DT k1 = kx[1];
+                    for( ; i <= width - 2; i += 2, S += 2 )
+                    {
+                        DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
+                        D[i] = s0; D[i+1] = s1;
+                    }
+                }
+            }
+            else if( this->ksize == 5 )
+            {
+                DT k1 = kx[1], k2 = kx[2];
+                for( ; i <= width - 2; i += 2, S += 2 )
+                {
+                    DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
+                    DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
+                    D[i] = s0; D[i+1] = s1;
+                }
+            }
+
+            for( ; i < width; i++, S++ )
+            {
+                DT s0 = kx[0]*S[0];
+                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
+                    s0 += kx[k]*(S[j] - S[-j]);
+                D[i] = s0;
+            }
+        }
+    }
+
+    int symmetryType;
+};
+
+
+template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
+{
+    typedef typename CastOp::type1 ST;
+    typedef typename CastOp::rtype DT;
+
+    ColumnFilter( const Mat& _kernel, int _anchor,
+                  double _delta, const CastOp& _castOp=CastOp(),
+                  const VecOp& _vecOp=VecOp() )
+    {
+        if( _kernel.isContinuous() )
+            kernel = _kernel;
+        else
+            _kernel.copyTo(kernel);
+        anchor = _anchor;
+        ksize = kernel.rows + kernel.cols - 1;
+        delta = saturate_cast<ST>(_delta);
+        castOp0 = _castOp;
+        vecOp = _vecOp;
+        CV_Assert( kernel.type() == DataType<ST>::type &&
+                   (kernel.rows == 1 || kernel.cols == 1));
+    }
+
+    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    {
+        const ST* ky = (const ST*)kernel.data;
+        ST _delta = delta;
+        int _ksize = ksize;
+        int i, k;
+        CastOp castOp = castOp0;
+
+        for( ; count--; dst += dststep, src++ )
+        {
+            DT* D = (DT*)dst;
+            i = vecOp(src, dst, width);
+            for( ; i <= width - 4; i += 4 )
+            {
+                ST f = ky[0];
+                const ST* S = (const ST*)src[0] + i;
+                ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
+                   s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
+
+                for( k = 1; k < _ksize; k++ )
+                {
+                    S = (const ST*)src[k] + i; f = ky[k];
+                    s0 += f*S[0]; s1 += f*S[1];
+                    s2 += f*S[2]; s3 += f*S[3];
+                }
+
+                D[i] = castOp(s0); D[i+1] = castOp(s1);
+                D[i+2] = castOp(s2); D[i+3] = castOp(s3);
+            }
+
+            for( ; i < width; i++ )
+            {
+                ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
+                for( k = 1; k < _ksize; k++ )
+                    s0 += ky[k]*((const ST*)src[k])[i];
+                D[i] = castOp(s0);
+            }
+        }
+    }
+
+    Mat kernel;
+    CastOp castOp0;
+    VecOp vecOp;
+    ST delta;
+};
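+
+/*
+  ColumnFilter is the second pass of a separable filter: src[] holds pointers
+  to `ksize` consecutive rows of the intermediate buffer filled by a row
+  filter, and each destination row is their kernel-weighted sum plus `delta`,
+  pushed through CastOp. A separable 3x3 box sum, for instance, reduces to
+
+      tmp[x] = S[x-1] + S[x] + S[x+1]                      // row pass
+      dst[x] = castOp(tmp_prev[x] + tmp_cur[x] + tmp_next[x] + delta)
+*/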
+
+
+template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
+{
+    typedef typename CastOp::type1 ST;
+    typedef typename CastOp::rtype DT;
+
+    SymmColumnFilter( const Mat& _kernel, int _anchor,
+                      double _delta, int _symmetryType,
+                      const CastOp& _castOp=CastOp(),
+                      const VecOp& _vecOp=VecOp())
+        : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
+    {
+        symmetryType = _symmetryType;
+        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
+    }
+
+    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    {
+        int ksize2 = this->ksize/2;
+        const ST* ky = (const ST*)this->kernel.data + ksize2;
+        int i, k;
+        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
+        ST _delta = this->delta;
+        CastOp castOp = this->castOp0;
+        src += ksize2;
+
+        if( symmetrical )
+        {
+            for( ; count--; dst += dststep, src++ )
+            {
+                DT* D = (DT*)dst;
+                i = (this->vecOp)(src, dst, width);
+
+                for( ; i <= width - 4; i += 4 )
+                {
+                    ST f = ky[0];
+                    const ST* S = (const ST*)src[0] + i, *S2;
+                    ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
+                       s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
+
+                    for( k = 1; k <= ksize2; k++ )
+                    {
+                        S = (const ST*)src[k] + i;
+                        S2 = (const ST*)src[-k] + i;
+                        f = ky[k];
+                        s0 += f*(S[0] + S2[0]);
+                        s1 += f*(S[1] + S2[1]);
+                        s2 += f*(S[2] + S2[2]);
+                        s3 += f*(S[3] + S2[3]);
+                    }
+
+                    D[i] = castOp(s0); D[i+1] = castOp(s1);
+                    D[i+2] = castOp(s2); D[i+3] = castOp(s3);
+                }
+
+                for( ; i < width; i++ )
+                {
+                    ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
+                    for( k = 1; k <= ksize2; k++ )
+                        s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
+                    D[i] = castOp(s0);
+                }
+            }
+        }
+        else
+        {
+            for( ; count--; dst += dststep, src++ )
+            {
+                DT* D = (DT*)dst;
+                i = this->vecOp(src, dst, width);
+
+                for( ; i <= width - 4; i += 4 )
+                {
+                    ST f = ky[0];
+                    const ST *S, *S2;
+                    ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
+
+                    for( k = 1; k <= ksize2; k++ )
+                    {
+                        S = (const ST*)src[k] + i;
+                        S2 = (const ST*)src[-k] + i;
+                        f = ky[k];
+                        s0 += f*(S[0] - S2[0]);
+                        s1 += f*(S[1] - S2[1]);
+                        s2 += f*(S[2] - S2[2]);
+                        s3 += f*(S[3] - S2[3]);
+                    }
+
+                    D[i] = castOp(s0); D[i+1] = castOp(s1);
+                    D[i+2] = castOp(s2); D[i+3] = castOp(s3);
+                }
+
+                for( ; i < width; i++ )
+                {
+                    ST s0 = _delta;
+                    for( k = 1; k <= ksize2; k++ )
+                        s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
+                    D[i] = castOp(s0);
+                }
+            }
+        }
+    }
+
+    int symmetryType;
+};
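+
+/*
+  SymmColumnFilter exploits kernel symmetry to halve the multiplications of
+  the generic ColumnFilter: for k[-j] == k[j] the sum folds to
+
+      D[i] = k[0]*S0[i] + sum_j k[j]*(S+j[i] + S-j[i])
+
+  and for an antisymmetric kernel (k[-j] == -k[j], which forces k[0] == 0) to
+
+      D[i] = sum_j k[j]*(S+j[i] - S-j[i])
+*/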
+
+
+template<class CastOp, class VecOp>
+struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
+{
+    typedef typename CastOp::type1 ST;
+    typedef typename CastOp::rtype DT;
+
+    SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
+                           double _delta, int _symmetryType,
+                           const CastOp& _castOp=CastOp(),
+                           const VecOp& _vecOp=VecOp())
+        : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
+    {
+        CV_Assert( this->ksize == 3 );
+    }
+
+    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    {
+        int ksize2 = this->ksize/2;
+        const ST* ky = (const ST*)this->kernel.data + ksize2;
+        int i;
+        bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
+        // ky[0] is the center tap, ky[1] the outer tap; the checks must match
+        // the unrolled loops below, which weight the center row by ky[0].
+        bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;
+        bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;
+        bool is_m1_0_1 = ky[1] == 1 || ky[1] == -1;
+        ST f0 = ky[0], f1 = ky[1];
+        ST _delta = this->delta;
+        CastOp castOp = this->castOp0;
+        src += ksize2;
+
+        for( ; count--; dst += dststep, src++ )
+        {
+            DT* D = (DT*)dst;
+            i = (this->vecOp)(src, dst, width);
+            const ST* S0 = (const ST*)src[-1];
+            const ST* S1 = (const ST*)src[0];
+            const ST* S2 = (const ST*)src[1];
+
+            if( symmetrical )
+            {
+                if( is_1_2_1 )
+                {
+                    for( ; i <= width - 4; i += 4 )
+                    {
+                        ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
+                        ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
+                        D[i] = castOp(s0);
+                        D[i+1] = castOp(s1);
+
+                        s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
+                        s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
+                        D[i+2] = castOp(s0);
+                        D[i+3] = castOp(s1);
+                    }
+                }
+                else if( is_1_m2_1 )
+                {
+                    for( ; i <= width - 4; i += 4 )
+                    {
+                        ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
+                        ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
+                        D[i] = castOp(s0);
+                        D[i+1] = castOp(s1);
+
+                        s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
+                        s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
+                        D[i+2] = castOp(s0);
+                        D[i+3] = castOp(s1);
+                    }
+                }
+                else
+                {
+                    for( ; i <= width - 4; i += 4 )
+                    {
+                        ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
+                        ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
+                        D[i] = castOp(s0);
+                        D[i+1] = castOp(s1);
+
+                        s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
+                        s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
+                        D[i+2] = castOp(s0);
+                        D[i+3] = castOp(s1);
+                    }
+                }
+
+                for( ; i < width; i++ )
+                    D[i] = castOp((S0[i] + S2[i])*f1 + S1[i]*f0 + _delta);
+            }
+            else
+            {
+                if( is_m1_0_1 )
+                {
+                    if( f1 < 0 )
+                        std::swap(S0, S2);
+
+                    for( ; i <= width - 4; i += 4 )
+                    {
+                        ST s0 = S2[i] - S0[i] + _delta;
+                        ST s1 = S2[i+1] - S0[i+1] + _delta;
+                        D[i] = castOp(s0);
+                        D[i+1] = castOp(s1);
+
+                        s0 = S2[i+2] - S0[i+2] + _delta;
+                        s1 = S2[i+3] - S0[i+3] + _delta;
+                        D[i+2] = castOp(s0);
+                        D[i+3] = castOp(s1);
+                    }
+
+                    if( f1 < 0 )
+                        std::swap(S0, S2);
+                }
+                else
+                {
+                    for( ; i <= width - 4; i += 4 )
+                    {
+                        ST s0 = (S2[i] - S0[i])*f1 + _delta;
+                        ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
+                        D[i] = castOp(s0);
+                        D[i+1] = castOp(s1);
+
+                        s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
+                        s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
+                        D[i+2] = castOp(s0);
+                        D[i+3] = castOp(s1);
+                    }
+                }
+
+                for( ; i < width; i++ )
+                    D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
+            }
+        }
+    }
+};
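+
+/*
+  The hard-coded 3-tap cases above cover the most common separable kernels:
+  [1 2 1] (smoothing), [1 -2 1] (second derivative) and [-1 0 1] (first
+  derivative). E.g. the 3x3 Sobel x-derivative factors into the row kernel
+  [-1 0 1] and the column kernel [1 2 1]^T, so it runs entirely through
+  these specializations.
+*/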
+
+template<typename ST, typename DT> struct Cast
+{
+    typedef ST type1;
+    typedef DT rtype;
+
+    DT operator()(ST val) const { return saturate_cast<DT>(val); }
+};
+
+template<typename ST, typename DT, int bits> struct FixedPtCast
+{
+    typedef ST type1;
+    typedef DT rtype;
+    enum { SHIFT = bits, DELTA = 1 << (bits-1) };
+
+    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
+};
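+
+/*
+  FixedPtCast performs round-to-nearest on fixed-point accumulators. With
+  bits == 8 the kernel was pre-scaled by 256, so e.g. a raw sum of 0x1234
+  becomes (0x1234 + 0x80) >> 8 = 0x12, saturated to the destination range;
+  DELTA = 1 << (bits-1) is the rounding constant.
+*/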
+
+template<typename ST, typename DT> struct FixedPtCastEx
+{
+    typedef ST type1;
+    typedef DT rtype;
+
+    FixedPtCastEx() : SHIFT(0), DELTA(0) {}
+    FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
+    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
+    int SHIFT, DELTA;
+};
+
+Ptr<BaseRowFilter> getLinearRowFilter( int srcType, int bufType,
+                                       const Mat& kernel, int anchor,
+                                       int symmetryType )
+{
+    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
+    int cn = CV_MAT_CN(srcType);
+    CV_Assert( cn == CV_MAT_CN(bufType) &&
+        ddepth >= std::max(sdepth, CV_32S) &&
+        kernel.type() == ddepth );
+    int ksize = kernel.rows + kernel.cols - 1;
+
+    if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
+    {
+        if( sdepth == CV_8U && ddepth == CV_32S )
+            return Ptr<BaseRowFilter>(new SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s>
+                (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)));
+        if( sdepth == CV_32F && ddepth == CV_32F )
+            return Ptr<BaseRowFilter>(new SymmRowSmallFilter<float, float, SymmRowSmallVec_32f>
+                (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)));
+    }
+
+    if( sdepth == CV_8U && ddepth == CV_32S )
+        return Ptr<BaseRowFilter>(new RowFilter<uchar, int, RowVec_8u32s>
+            (kernel, anchor, RowVec_8u32s(kernel)));
+    if( sdepth == CV_8U && ddepth == CV_32F )
+        return Ptr<BaseRowFilter>(new RowFilter<uchar, float, RowNoVec>(kernel, anchor));
+    if( sdepth == CV_8U && ddepth == CV_64F )
+        return Ptr<BaseRowFilter>(new RowFilter<uchar, double, RowNoVec>(kernel, anchor));
+    if( sdepth == CV_16U && ddepth == CV_32F )
+        return Ptr<BaseRowFilter>(new RowFilter<ushort, float, RowNoVec>(kernel, anchor));
+    if( sdepth == CV_16U && ddepth == CV_64F )
+        return Ptr<BaseRowFilter>(new RowFilter<ushort, double, RowNoVec>(kernel, anchor));
+    if( sdepth == CV_16S && ddepth == CV_32F )
+        return Ptr<BaseRowFilter>(new RowFilter<short, float, RowNoVec>(kernel, anchor));
+    if( sdepth == CV_16S && ddepth == CV_64F )
+        return Ptr<BaseRowFilter>(new RowFilter<short, double, RowNoVec>(kernel, anchor));
+    if( sdepth == CV_32F && ddepth == CV_32F )
+        return Ptr<BaseRowFilter>(new RowFilter<float, float, RowVec_32f>
+            (kernel, anchor, RowVec_32f(kernel)));
+    if( sdepth == CV_64F && ddepth == CV_64F )
+        return Ptr<BaseRowFilter>(new RowFilter<double, double, RowNoVec>(kernel, anchor));
+
+    CV_Error_( CV_StsNotImplemented,
+        ("Unsupported combination of source format (=%d), and buffer format (=%d)",
+        srcType, bufType));
+
+    return Ptr<BaseRowFilter>(0);
+}
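+
+/*
+  A minimal usage sketch (assuming a CV_8UC1 source and the CV_32S row buffer
+  that createSeparableLinearFilter would select for it):
+
+      Mat kx = (Mat_<int>(1,3) << 1, 2, 1);      // integer [1 2 1] row kernel
+      Ptr<BaseRowFilter> rf = getLinearRowFilter(CV_8UC1, CV_32SC1, kx, 1,
+                                      getKernelType(kx, Point(1,0)));
+
+  In practice the returned filter is driven by FilterEngine rather than
+  called directly.
+*/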
+
+
+Ptr<BaseColumnFilter> getLinearColumnFilter( int bufType, int dstType,
+                                             const Mat& kernel, int anchor,
+                                             int symmetryType, double delta,
+                                             int bits )
+{
+    int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
+    int cn = CV_MAT_CN(dstType);
+    CV_Assert( cn == CV_MAT_CN(bufType) &&
+        sdepth >= std::max(ddepth, CV_32S) &&
+        kernel.type() == sdepth );
+
+    if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
+    {
+        if( ddepth == CV_8U && sdepth == CV_32S )
+            return Ptr<BaseColumnFilter>(new ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec>
+                (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits)));
+        if( ddepth == CV_8U && sdepth == CV_32F )
+            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, uchar>, ColumnNoVec>(kernel, anchor, delta));
+        if( ddepth == CV_8U && sdepth == CV_64F )
+            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, uchar>, ColumnNoVec>(kernel, anchor, delta));
+        if( ddepth == CV_16U && sdepth == CV_32F )
+            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, ushort>, ColumnNoVec>(kernel, anchor, delta));
+        if( ddepth == CV_16U && sdepth == CV_64F )
+            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, ushort>, ColumnNoVec>(kernel, anchor, delta));
+        if( ddepth == CV_16S && sdepth == CV_32F )
+            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, short>, ColumnNoVec>(kernel, anchor, delta));
+        if( ddepth == CV_16S && sdepth == CV_64F )
+            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, short>, ColumnNoVec>(kernel, anchor, delta));
+        if( ddepth == CV_32F && sdepth == CV_32F )
+            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, float>, ColumnNoVec>(kernel, anchor, delta));
+        if( ddepth == CV_64F && sdepth == CV_64F )
+            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, double>, ColumnNoVec>(kernel, anchor, delta));
+    }
+    else
+    {
+        int ksize = kernel.rows + kernel.cols - 1;
+        if( ksize == 3 )
+        {
+            if( ddepth == CV_8U && sdepth == CV_32S )
+                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
+                    FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
+                    (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
+                    SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
+            if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
+                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<Cast<int, short>,
+                    SymmColumnSmallVec_32s16s>(kernel, anchor, delta, symmetryType,
+                    Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)));
+            if( ddepth == CV_32F && sdepth == CV_32F )
+                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
+                    Cast<float, float>, SymmColumnSmallVec_32f>
+                    (kernel, anchor, delta, symmetryType, Cast<float, float>(),
+                    SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)));
+        }
+        if( ddepth == CV_8U && sdepth == CV_32S )
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
+                (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
+                SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
+        if( ddepth == CV_8U && sdepth == CV_32F )
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, uchar>, ColumnNoVec>
+                (kernel, anchor, delta, symmetryType));
+        if( ddepth == CV_8U && sdepth == CV_64F )
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, uchar>, ColumnNoVec>
+                (kernel, anchor, delta, symmetryType));
+        if( ddepth == CV_16U && sdepth == CV_32F )
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, ushort>, ColumnNoVec>
+                (kernel, anchor, delta, symmetryType));
+        if( ddepth == CV_16U && sdepth == CV_64F )
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, ushort>, ColumnNoVec>
+                (kernel, anchor, delta, symmetryType));
+        if( ddepth == CV_16S && sdepth == CV_32S )
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<int, short>, ColumnNoVec>
+                (kernel, anchor, delta, symmetryType));
+        if( ddepth == CV_16S && sdepth == CV_32F )
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, ColumnNoVec>
+                (kernel, anchor, delta, symmetryType));
+        if( ddepth == CV_16S && sdepth == CV_64F )
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, short>, ColumnNoVec>
+                (kernel, anchor, delta, symmetryType));
+        if( ddepth == CV_32F && sdepth == CV_32F )
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f>
+                (kernel, anchor, delta, symmetryType, Cast<float, float>(),
+                SymmColumnVec_32f(kernel, symmetryType, 0, delta)));
+        if( ddepth == CV_64F && sdepth == CV_64F )
+            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, double>, ColumnNoVec>
+                (kernel, anchor, delta, symmetryType));
+    }
+
+    CV_Error_( CV_StsNotImplemented,
+        ("Unsupported combination of buffer format (=%d), and destination format (=%d)",
+        bufType, dstType));
+
+    return Ptr<BaseColumnFilter>(0);
+}
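+
+/*
+  `bits` is nonzero only on the fixed-point CV_8U path: createSeparableLinearFilter
+  (below) scales both 1D kernels by 2^8, so the column pass has to shift the
+  accumulated sum back down by 16 bits, which is exactly what the
+  FixedPtCastEx(bits) instances constructed here do.
+*/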
+
+
+Ptr<FilterEngine> createSeparableLinearFilter(
+    int _srcType, int _dstType,
+    const Mat& _rowKernel, const Mat& _columnKernel,
+    Point _anchor, double _delta,
+    int _rowBorderType, int _columnBorderType,
+    const Scalar& _borderValue )
+{
+    _srcType = CV_MAT_TYPE(_srcType);
+    _dstType = CV_MAT_TYPE(_dstType);
+    int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
+    int cn = CV_MAT_CN(_srcType);
+    CV_Assert( cn == CV_MAT_CN(_dstType) );
+    int rsize = _rowKernel.rows + _rowKernel.cols - 1;
+    int csize = _columnKernel.rows + _columnKernel.cols - 1;
+    if( _anchor.x < 0 )
+        _anchor.x = rsize/2;
+    if( _anchor.y < 0 )
+        _anchor.y = csize/2;
+    int rtype = getKernelType(_rowKernel,
+        _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
+    int ctype = getKernelType(_columnKernel,
+        _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
+    Mat rowKernel, columnKernel;
+
+    int bdepth = std::max(CV_32F,std::max(sdepth, ddepth));
+    int bits = 0;
+
+    if( sdepth == CV_8U &&
+        ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
+          ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
+          ddepth == CV_8U) ||
+         ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
+          (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
+          (rtype & ctype & KERNEL_INTEGER) &&
+          ddepth == CV_16S)) )
+    {
+        bdepth = CV_32S;
+        bits = ddepth == CV_8U ? 8 : 0;
+        _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
+        _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
+        bits *= 2;
+        _delta *= (1 << bits);
+    }
+    else
+    {
+        if( _rowKernel.type() != bdepth )
+            _rowKernel.convertTo( rowKernel, bdepth );
+        else
+            rowKernel = _rowKernel;
+        if( _columnKernel.type() != bdepth )
+            _columnKernel.convertTo( columnKernel, bdepth );
+        else
+            columnKernel = _columnKernel;
+    }
+
+    int _bufType = CV_MAKETYPE(bdepth, cn);
+    Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
+        _srcType, _bufType, rowKernel, _anchor.x, rtype);
+    Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
+        _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );
+
+    return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(0), _rowFilter, _columnFilter,
+        _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
+}
+
+
+/****************************************************************************************\
+*                               Non-separable linear filter                              *
+\****************************************************************************************/
+
+void preprocess2DKernel( const Mat& kernel, vector<Point>& coords, vector<uchar>& coeffs )
+{
+    int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
+    if(nz == 0)
+        nz = 1;
+    CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
+    coords.resize(nz);
+    coeffs.resize(nz*getElemSize(ktype));
+    uchar* _coeffs = &coeffs[0];
+
+    for( i = k = 0; i < kernel.rows; i++ )
+    {
+        const uchar* krow = kernel.data + kernel.step*i;
+        for( j = 0; j < kernel.cols; j++ )
+        {
+            if( ktype == CV_8U )
+            {
+                uchar val = krow[j];
+                if( val == 0 )
+                    continue;
+                coords[k] = Point(j,i);
+                _coeffs[k++] = val;
+            }
+            else if( ktype == CV_32S )
+            {
+                int val = ((const int*)krow)[j];
+                if( val == 0 )
+                    continue;
+                coords[k] = Point(j,i);
+                ((int*)_coeffs)[k++] = val;
+            }
+            else if( ktype == CV_32F )
+            {
+                float val = ((const float*)krow)[j];
+                if( val == 0 )
+                    continue;
+                coords[k] = Point(j,i);
+                ((float*)_coeffs)[k++] = val;
+            }
+            else
+            {
+                double val = ((const double*)krow)[j];
+                if( val == 0 )
+                    continue;
+                coords[k] = Point(j,i);
+                ((double*)_coeffs)[k++] = val;
+            }
+        }
+    }
+}
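+
+/*
+  preprocess2DKernel flattens a dense kernel into (coordinate, coefficient)
+  pairs so that Filter2D below touches only the nonzero taps. For the cross
+
+      0 1 0
+      1 1 1    ->   coords = {(1,0), (0,1), (1,1), (2,1), (1,2)}
+      0 1 0
+
+  this costs 5 multiply-adds per pixel instead of 9.
+*/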
+
+
+template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
+{
+    typedef typename CastOp::type1 KT;
+    typedef typename CastOp::rtype DT;
+
+    Filter2D( const Mat& _kernel, Point _anchor,
+              double _delta, const CastOp& _castOp=CastOp(),
+              const VecOp& _vecOp=VecOp() )
+    {
+        anchor = _anchor;
+        ksize = _kernel.size();
+        delta = saturate_cast<KT>(_delta);
+        castOp0 = _castOp;
+        vecOp = _vecOp;
+        CV_Assert( _kernel.type() == DataType<KT>::type );
+        preprocess2DKernel( _kernel, coords, coeffs );
+        ptrs.resize( coords.size() );
+    }
+
+    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
+    {
+        KT _delta = delta;
+        const Point* pt = &coords[0];
+        const KT* kf = (const KT*)&coeffs[0];
+        const ST** kp = (const ST**)&ptrs[0];
+        int i, k, nz = (int)coords.size();
+        CastOp castOp = castOp0;
+
+        width *= cn;
+        for( ; count > 0; count--, dst += dststep, src++ )
+        {
+            DT* D = (DT*)dst;
+
+            for( k = 0; k < nz; k++ )
+                kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;
+
+            i = vecOp((const uchar**)kp, dst, width);
+
+            for( ; i <= width - 4; i += 4 )
+            {
+                KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
+
+                for( k = 0; k < nz; k++ )
+                {
+                    const ST* sptr = kp[k] + i;
+                    KT f = kf[k];
+                    s0 += f*sptr[0];
+                    s1 += f*sptr[1];
+                    s2 += f*sptr[2];
+                    s3 += f*sptr[3];
+                }
+
+                D[i] = castOp(s0); D[i+1] = castOp(s1);
+                D[i+2] = castOp(s2); D[i+3] = castOp(s3);
+            }
+
+            for( ; i < width; i++ )
+            {
+                KT s0 = _delta;
+                for( k = 0; k < nz; k++ )
+                    s0 += kf[k]*kp[k][i];
+                D[i] = castOp(s0);
+            }
+        }
+    }
+
+    vector<Point> coords;
+    vector<uchar> coeffs;
+    vector<uchar*> ptrs;
+    KT delta;
+    CastOp castOp0;
+    VecOp vecOp;
+};
+
+
+Ptr<BaseFilter> getLinearFilter(int srcType, int dstType,
+                                const Mat& _kernel, Point anchor,
+                                double delta, int bits)
+{
+    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
+    int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
+    CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );
+
+    anchor = normalizeAnchor(anchor, _kernel.size());
+
+    if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
+        return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u>
+            (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
+            FilterVec_8u(_kernel, bits, delta)));
+    if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
+        return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s>
+            (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
+            FilterVec_8u16s(_kernel, bits, delta)));
+
+    kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
+    Mat kernel;
+    if( _kernel.type() == kdepth )
+        kernel = _kernel;
+    else
+        _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);
+
+    if( sdepth == CV_8U && ddepth == CV_8U )
+        return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, uchar>, FilterVec_8u>
+            (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta)));
+    if( sdepth == CV_8U && ddepth == CV_16U )
+        return Ptr<BaseFilter>(new Filter2D<uchar,
+            Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
+    if( sdepth == CV_8U && ddepth == CV_16S )
+        return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, short>, FilterVec_8u16s>
+            (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta)));
+    if( sdepth == CV_8U && ddepth == CV_32F )
+        return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
+    if( sdepth == CV_8U && ddepth == CV_64F )
+        return Ptr<BaseFilter>(new Filter2D<uchar, Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
+
+    if( sdepth == CV_16U && ddepth == CV_16U )
+        return Ptr<BaseFilter>(new Filter2D<ushort, Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
+    if( sdepth == CV_16U && ddepth == CV_32F )
+        return Ptr<BaseFilter>(new Filter2D<ushort, Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
+    if( sdepth == CV_16U && ddepth == CV_64F )
+        return Ptr<BaseFilter>(new Filter2D<ushort, Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
+
+    if( sdepth == CV_16S && ddepth == CV_16S )
+        return Ptr<BaseFilter>(new Filter2D<short, Cast<float, short>, FilterNoVec>(kernel, anchor, delta));
+    if( sdepth == CV_16S && ddepth == CV_32F )
+        return Ptr<BaseFilter>(new Filter2D<short, Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
+    if( sdepth == CV_16S && ddepth == CV_64F )
+        return Ptr<BaseFilter>(new Filter2D<short, Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
+
+    if( sdepth == CV_32F && ddepth == CV_32F )
+        return Ptr<BaseFilter>(new Filter2D<float, Cast<float, float>, FilterVec_32f>
+            (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta)));
+    if( sdepth == CV_64F && ddepth == CV_64F )
+        return Ptr<BaseFilter>(new Filter2D<double, Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
+
+    CV_Error_( CV_StsNotImplemented,
+        ("Unsupported combination of source format (=%d), and destination format (=%d)",
+        srcType, dstType));
+
+    return Ptr<BaseFilter>(0);
+}
+
+
+Ptr<FilterEngine> createLinearFilter( int _srcType, int _dstType, const Mat& _kernel,
+                                      Point _anchor, double _delta,
+                                      int _rowBorderType, int _columnBorderType,
+                                      const Scalar& _borderValue )
+{
+    _srcType = CV_MAT_TYPE(_srcType);
+    _dstType = CV_MAT_TYPE(_dstType);
+    int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
+    int cn = CV_MAT_CN(_srcType);
+    CV_Assert( cn == CV_MAT_CN(_dstType) );
+
+    Mat kernel = _kernel;
+    int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
+    int bits = 0;
+
+    if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
+        _kernel.rows*_kernel.cols <= (1 << 10) )
+    {
+        bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
+        _kernel.convertTo(kernel, CV_32S, 1 << bits);
+    }
+
+    Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
+        kernel, _anchor, _delta, bits);
+
+    return Ptr<FilterEngine>(new FilterEngine(_filter2D, Ptr<BaseRowFilter>(0),
+        Ptr<BaseColumnFilter>(0), _srcType, _dstType, _srcType,
+        _rowBorderType, _columnBorderType, _borderValue ));
+}
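+
+/*
+  A minimal end-to-end sketch of the non-separable path (assuming CV_8UC3
+  images src and dst of equal size; filter2D below wraps the same calls and
+  adds the DFT-based fallback for large kernels):
+
+      Mat kernel = (Mat_<float>(3,3) << 0,-1,0, -1,5,-1, 0,-1,0);  // sharpen
+      Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
+                                Point(-1,-1), 0, BORDER_REPLICATE,
+                                BORDER_REPLICATE, Scalar());
+      f->apply(src, dst);
+*/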
+
+
+void filter2D( const Mat& src, Mat& dst, int ddepth,
+               const Mat& kernel, Point anchor,
+               double delta, int borderType )
+{
+    if( ddepth < 0 )
+        ddepth = src.depth();
+
+#if CV_SSE2
+    int dft_filter_size = (src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
+        (src.depth() == CV_32F && ddepth == CV_32F) ? 130 : 50;
+#else
+    int dft_filter_size = 50;
+#endif
+
+    dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
+    anchor = normalizeAnchor(anchor, kernel.size());
+
+    if( kernel.cols*kernel.rows >= dft_filter_size &&
+        kernel.cols <= src.cols && kernel.rows <= src.rows )
+    {
+        Mat temp;
+        if( src.data != dst.data )
+            temp = src;
+        else
+            src.copyTo(temp);
+        crossCorr( temp, kernel, dst, anchor, delta, borderType );
+        return;
+    }
+
+    Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
+                                             anchor, delta, borderType );
+    f->apply(src, dst);
+}
+
+
+void sepFilter2D( const Mat& src, Mat& dst, int ddepth,
+                  const Mat& kernelX, const Mat& kernelY, Point anchor,
+                  double delta, int borderType )
+{
+    if( ddepth < 0 )
+        ddepth = src.depth();
+
+    dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
+
+    Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
+        dst.type(), kernelX, kernelY, anchor, delta, borderType );
+    f->apply(src, dst);
+}
+
+}
+
+
+CV_IMPL void
+cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
+{
+    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
+    cv::Mat kernel = cv::cvarrToMat(_kernel);
+
+    CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );
+
+    cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
+}
+
+/* End of file. */