Update to 2.0.0 tree from current Fremantle build
[opencv] src/cv/cvfilter.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "_cv.h"

/****************************************************************************************\
                                    Base Image Filter
\****************************************************************************************/

namespace cv
{

BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
BaseRowFilter::~BaseRowFilter() {}

BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
BaseColumnFilter::~BaseColumnFilter() {}
void BaseColumnFilter::reset() {}

BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
BaseFilter::~BaseFilter() {}
void BaseFilter::reset() {}

/*
 Various border types, image boundaries are denoted with '|'

    * BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
    * BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
    * BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
    * BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
    * BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii  with some specified 'i'
*/
int borderInterpolate( int p, int len, int borderType )
{
    if( (unsigned)p < (unsigned)len )
        ;
    else if( borderType == BORDER_REPLICATE )
        p = p < 0 ? 0 : len - 1;
    else if( borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101 )
    {
        int delta = borderType == BORDER_REFLECT_101;
        if( len == 1 )
            return 0;
        do
        {
            if( p < 0 )
                p = -p - 1 + delta;
            else
                p = len - 1 - (p - len) - delta;
        }
        while( (unsigned)p >= (unsigned)len );
    }
    else if( borderType == BORDER_WRAP )
    {
        if( p < 0 )
            p -= ((p-len+1)/len)*len;
        if( p >= len )
            p %= len;
    }
    else if( borderType == BORDER_CONSTANT )
        p = -1;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported border type" );
    return p;
}
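
/*
   Example (illustrative, not part of the original source): with len = 10,

       borderInterpolate(-2, 10, BORDER_REPLICATE)   == 0
       borderInterpolate(-2, 10, BORDER_REFLECT_101) == 2
       borderInterpolate(11, 10, BORDER_WRAP)        == 1
       borderInterpolate(-5, 10, BORDER_CONSTANT)    == -1  // caller substitutes the border value
*/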


FilterEngine::FilterEngine()
{
    srcType = dstType = bufType = -1;
    rowBorderType = columnBorderType = BORDER_REPLICATE;
    bufStep = startY = startY0 = endY = rowCount = dstY = 0;
    maxWidth = 0;

    wholeSize = Size(-1,-1);
}


FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
                            const Ptr<BaseRowFilter>& _rowFilter,
                            const Ptr<BaseColumnFilter>& _columnFilter,
                            int _srcType, int _dstType, int _bufType,
                            int _rowBorderType, int _columnBorderType,
                            const Scalar& _borderValue )
{
    init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
         _rowBorderType, _columnBorderType, _borderValue);
}

FilterEngine::~FilterEngine()
{
}


void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
                         const Ptr<BaseRowFilter>& _rowFilter,
                         const Ptr<BaseColumnFilter>& _columnFilter,
                         int _srcType, int _dstType, int _bufType,
                         int _rowBorderType, int _columnBorderType,
                         const Scalar& _borderValue )
{
    _srcType = CV_MAT_TYPE(_srcType);
    _bufType = CV_MAT_TYPE(_bufType);
    _dstType = CV_MAT_TYPE(_dstType);

    srcType = _srcType;
    int srcElemSize = (int)getElemSize(srcType);
    dstType = _dstType;
    bufType = _bufType;

    filter2D = _filter2D;
    rowFilter = _rowFilter;
    columnFilter = _columnFilter;

    if( _columnBorderType < 0 )
        _columnBorderType = _rowBorderType;

    rowBorderType = _rowBorderType;
    columnBorderType = _columnBorderType;

    CV_Assert( columnBorderType != BORDER_WRAP );

    if( isSeparable() )
    {
        CV_Assert( !rowFilter.empty() && !columnFilter.empty() );
        ksize = Size(rowFilter->ksize, columnFilter->ksize);
        anchor = Point(rowFilter->anchor, columnFilter->anchor);
    }
    else
    {
        CV_Assert( bufType == srcType );
        ksize = filter2D->ksize;
        anchor = filter2D->anchor;
    }

    CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
               0 <= anchor.y && anchor.y < ksize.height );

    borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
    borderTab.resize( std::max(ksize.width - 1, 1)*borderElemSize);

    maxWidth = bufStep = 0;
    constBorderRow.clear();

    if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT )
    {
        constBorderValue.resize(srcElemSize*(ksize.width - 1));
        scalarToRawData(_borderValue, &constBorderValue[0], srcType,
                        (ksize.width-1)*CV_MAT_CN(srcType));
    }

    wholeSize = Size(-1,-1);
}

static const int VEC_ALIGN = CV_MALLOC_ALIGN;

int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows)
{
    int i, j;

    wholeSize = _wholeSize;
    roi = _roi;
    CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
        roi.x + roi.width <= wholeSize.width &&
        roi.y + roi.height <= wholeSize.height );

    int esz = (int)getElemSize(srcType);
    int bufElemSize = (int)getElemSize(bufType);
    const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;

    if( _maxBufRows < 0 )
        _maxBufRows = ksize.height + 3;
    _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1);

    if( maxWidth < roi.width || _maxBufRows != (int)rows.size() )
    {
        rows.resize(_maxBufRows);
        maxWidth = std::max(maxWidth, roi.width);
        int cn = CV_MAT_CN(srcType);
        srcRow.resize(esz*(maxWidth + ksize.width - 1));
        if( columnBorderType == BORDER_CONSTANT )
        {
            constBorderRow.resize(getElemSize(bufType)*(maxWidth+VEC_ALIGN));
            uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
            int n = (int)constBorderValue.size(), N;
            if( isSeparable() )
            {
                tdst = &srcRow[0];
                N = (maxWidth + ksize.width - 1)*esz;
            }
            else
            {
                tdst = dst;
                N = maxWidth*esz;
            }

            for( i = 0; i < N; i += n )
            {
                n = std::min( n, N - i );
                for(j = 0; j < n; j++)
                    tdst[i+j] = constVal[j];
            }

            if( isSeparable() )
                (*rowFilter)(&srcRow[0], dst, maxWidth, cn);
        }

        int maxBufStep = bufElemSize*(int)alignSize(maxWidth +
            (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
        ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN);
    }

    // adjust bufstep so that the used part of the ring buffer stays compact in memory
    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);

    dx1 = std::max(anchor.x - roi.x, 0);
    dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);

    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_CONSTANT )
        {
            int nr = isSeparable() ? 1 : (int)rows.size();
            for( i = 0; i < nr; i++ )
            {
                uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i;
                memcpy( dst, constVal, dx1*esz );
                memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz );
            }
        }
        else
        {
            int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
            int* btab = (int*)&borderTab[0];

            for( i = 0; i < dx1; i++ )
            {
                int p0 = borderInterpolate(i-dx1, wholeWidth, rowBorderType)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[i*btab_esz + j] = p0 + j;
            }

            for( i = 0; i < dx2; i++ )
            {
                int p0 = borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[(i + dx1)*btab_esz + j] = p0 + j;
            }
        }
    }

    rowCount = dstY = 0;
    startY = startY0 = std::max(roi.y - anchor.y, 0);
    endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
    if( !columnFilter.empty() )
        columnFilter->reset();
    if( !filter2D.empty() )
        filter2D->reset();

    return startY;
}
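
/*
   Note (explanatory, not in the original source): start() returns startY, the
   index of the first source row the caller must feed to proceed(); rows
   [startY, endY) are needed in total. Each incoming row is padded horizontally
   (dx1/dx2 columns), passed through the row filter when the filter is
   separable, and cached in ringBuf, which holds only about ksize.height + 3
   rows; as the kernel window slides down, the oldest cached row is
   overwritten, so a whole image is filtered with a small, fixed amount of
   intermediate memory.
*/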


int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
                        bool isolated, int maxBufRows)
{
    Rect srcRoi = _srcRoi;

    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
        srcRoi.width >= 0 && srcRoi.height >= 0 &&
        srcRoi.x + srcRoi.width <= src.cols &&
        srcRoi.y + srcRoi.height <= src.rows );

    Point ofs;
    Size wholeSize(src.cols, src.rows);
    if( !isolated )
        src.locateROI( wholeSize, ofs );
    start( wholeSize, srcRoi + ofs, maxBufRows );

    return startY - ofs.y;
}


int FilterEngine::remainingInputRows() const
{
    return endY - startY - rowCount;
}

int FilterEngine::remainingOutputRows() const
{
    return roi.height - dstY;
}

int FilterEngine::proceed( const uchar* src, int srcstep, int count,
                           uchar* dst, int dststep )
{
    CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );

    const int *btab = &borderTab[0];
    int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
    uchar** brows = &rows[0];
    int bufRows = (int)rows.size();
    int cn = CV_MAT_CN(bufType);
    int width = roi.width, kwidth = ksize.width;
    int kheight = ksize.height, ay = anchor.y;
    int _dx1 = dx1, _dx2 = dx2;
    int width1 = roi.width + kwidth - 1;
    int xofs1 = std::min(roi.x, anchor.x);
    bool isSep = isSeparable();
    bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
    int dy = 0, i = 0;

    src -= xofs1*esz;
    count = std::min(count, remainingInputRows());

    CV_Assert( src && dst && count > 0 );

    for(;; dst += dststep*i, dy += i)
    {
        int dcount = bufRows - ay - startY - rowCount + roi.y;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcstep )
        {
            int bi = (startY - startY0 + rowCount) % bufRows;
            uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            uchar* row = isSep ? &srcRow[0] : brow;

            if( ++rowCount > bufRows )
            {
                --rowCount;
                ++startY;
            }

            memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );

            if( makeBorder )
            {
                if( btab_esz*(int)sizeof(int) == esz )
                {
                    const int* isrc = (const int*)src;
                    int* irow = (int*)row;

                    for( i = 0; i < _dx1*btab_esz; i++ )
                        irow[i] = isrc[btab[i]];
                    for( i = 0; i < _dx2*btab_esz; i++ )
                        irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
                }
                else
                {
                    for( i = 0; i < _dx1*esz; i++ )
                        row[i] = src[btab[i]];
                    for( i = 0; i < _dx2*esz; i++ )
                        row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
                }
            }

            if( isSep )
                (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
        }

        int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
                            wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
            else
            {
                CV_Assert( srcY >= startY );
                if( srcY >= startY + rowCount )
                    break;
                int bi = (srcY - startY0) % bufRows;
                brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            }
        }
        if( i < kheight )
            break;
        i -= kheight - 1;
        if( isSeparable() )
            (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
        else
            (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
    }

    dstY += dy;
    CV_Assert( dstY <= roi.height );
    return dy;
}


void FilterEngine::apply(const Mat& src, Mat& dst,
    const Rect& _srcRoi, Point dstOfs, bool isolated)
{
    CV_Assert( src.type() == srcType && dst.type() == dstType );

    Rect srcRoi = _srcRoi;
    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
        dstOfs.x + srcRoi.width <= dst.cols &&
        dstOfs.y + srcRoi.height <= dst.rows );

    int y = start(src, srcRoi, isolated);
    proceed( src.data + y*src.step, (int)src.step, endY - startY,
             dst.data + dstOfs.y*dst.step + dstOfs.x*dst.elemSize(), (int)dst.step );
}
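
/*
   Usage sketch (illustrative only, not part of the library). A FilterEngine
   is normally obtained from one of the factory functions defined further down
   in this file, e.g. createSeparableLinearFilter; apply() then drives start()
   and proceed() internally:

       Mat src(480, 640, CV_8UC1), dst(src.size(), src.type());
       Mat k = getGaussianKernel(5, -1, CV_32F);
       Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(), dst.type(), k, k);
       f->apply(src, dst);
*/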


/****************************************************************************************\
*                                 Separable linear filter                                *
\****************************************************************************************/

int getKernelType(const Mat& _kernel, Point anchor)
{
    CV_Assert( _kernel.channels() == 1 );
    int i, sz = _kernel.rows*_kernel.cols;

    Mat kernel;
    _kernel.convertTo(kernel, CV_64F);

    const double* coeffs = (double*)kernel.data;
    double sum = 0;
    int type = KERNEL_SMOOTH + KERNEL_INTEGER;
    if( (_kernel.rows == 1 || _kernel.cols == 1) &&
        anchor.x*2 + 1 == _kernel.cols &&
        anchor.y*2 + 1 == _kernel.rows )
        type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);

    for( i = 0; i < sz; i++ )
    {
        double a = coeffs[i], b = coeffs[sz - i - 1];
        if( a != b )
            type &= ~KERNEL_SYMMETRICAL;
        if( a != -b )
            type &= ~KERNEL_ASYMMETRICAL;
        if( a < 0 )
            type &= ~KERNEL_SMOOTH;
        if( a != saturate_cast<int>(a) )
            type &= ~KERNEL_INTEGER;
        sum += a;
    }

    if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
        type &= ~KERNEL_SMOOTH;
    return type;
}
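
/*
   Example (illustrative, not in the original source): for the 1x3 kernel
   (0.25, 0.5, 0.25) with anchor (1, 0), every pair of mirrored coefficients
   satisfies a == b, all coefficients are non-negative and they sum to 1, but
   0.25 is not an integer, so the function returns
   KERNEL_SMOOTH + KERNEL_SYMMETRICAL.
*/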
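
// The *NoVec functors below are the portable fallbacks for the vectorized
// row/column operations: they process zero elements and return 0, so the
// scalar loops in the filter templates do all the work. On SSE2 builds they
// are replaced by the intrinsics-based versions inside #if CV_SSE2 below.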
struct RowNoVec
{
    RowNoVec() {}
    RowNoVec(const Mat&) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct ColumnNoVec
{
    ColumnNoVec() {}
    ColumnNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct SymmRowSmallNoVec
{
    SymmRowSmallNoVec() {}
    SymmRowSmallNoVec(const Mat&, int) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct SymmColumnSmallNoVec
{
    SymmColumnSmallNoVec() {}
    SymmColumnSmallNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct FilterNoVec
{
    FilterNoVec() {}
    FilterNoVec(const Mat&, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};


#if CV_SSE2

///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////

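// Note (added comment): RowVec_8u32s convolves 8-bit pixels with an integer
// kernel and produces 32-bit sums, 16 outputs per iteration. Pixels are
// zero-extended to 16 bits and multiplied with _mm_mullo_epi16/_mm_mulhi_epi16;
// interleaving the low and high product halves reconstructs full 32-bit terms
// for the accumulators. Kernel values outside [SHRT_MIN, SHRT_MAX] would not
// fit the 16-bit multiplies, hence the smallValues check and scalar fallback.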
struct RowVec_8u32s
{
    RowVec_8u32s() { smallValues = false; }
    RowVec_8u32s( const Mat& _kernel )
    {
        kernel = _kernel;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = ((const int*)kernel.data)[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        const int* _kx = (const int*)kernel.data;
        width *= cn;

        if( smallValues )
        {
            for( ; i <= width - 16; i += 16 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
                __m128i x0, x1, x2, x3;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_loadu_si128((const __m128i*)src);
                    x2 = _mm_unpackhi_epi8(x0, z);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x3 = _mm_mulhi_epi16(x2, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    x2 = _mm_mullo_epi16(x2, f);

                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                    s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
                    s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
                    s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
                }

                _mm_store_si128((__m128i*)(dst + i), s0);
                _mm_store_si128((__m128i*)(dst + i + 4), s1);
                _mm_store_si128((__m128i*)(dst + i + 8), s2);
                _mm_store_si128((__m128i*)(dst + i + 12), s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_cvtsi32_si128(*(const int*)src);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                }
                _mm_store_si128((__m128i*)(dst + i), s0);
            }
        }
        return i;
    }

    Mat kernel;
    bool smallValues;
};


struct SymmRowSmallVec_8u32s
{
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = ((const int*)kernel.data)[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
    {
        int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = (const int*)kernel.data + _ksize/2;
        if( !smallValues )
            return 0;

        src += (_ksize/2)*cn;
        width *= cn;

        __m128i z = _mm_setzero_si128();
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
                        y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
                        _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                            k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);
                    k2 = _mm_packs_epi32(k2, k2);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        t1 = _mm_mulhi_epi16(y0, k2);
                        t0 = _mm_mullo_epi16(y0, k2);
                        y0 = _mm_mullo_epi16(y1, k2);
                        y1 = _mm_mulhi_epi16(y1, k2);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0, y1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        x1 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(x0, x1);
                        z1 = _mm_unpackhi_epi16(x0, x1);

                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(y0, y1);
                        z3 = _mm_unpackhi_epi16(y0, y1);
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                        k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                        k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                k0 = _mm_packs_epi32(k0, k0);
                k1 = _mm_packs_epi32(k1, k1);
                k2 = _mm_packs_epi32(k2, k2);

                for( ; i <= width - 16; i += 16, src += 16 )
                {
                    __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                    x0 = _mm_loadu_si128((__m128i*)(src + cn));
                    x2 = _mm_loadu_si128((__m128i*)(src - cn));
                    y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                    x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));

                    x2 = _mm_mulhi_epi16(x0, k1);
                    x0 = _mm_mullo_epi16(x0, k1);
                    z0 = _mm_unpacklo_epi16(x0, x2);
                    z1 = _mm_unpackhi_epi16(x0, x2);
                    y1 = _mm_mulhi_epi16(y0, k1);
                    y0 = _mm_mullo_epi16(y0, k1);
                    z2 = _mm_unpacklo_epi16(y0, y1);
                    z3 = _mm_unpackhi_epi16(y0, y1);

                    x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
                    x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
                    y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                    y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                    t1 = _mm_mulhi_epi16(y0, k2);
                    t0 = _mm_mullo_epi16(y0, k2);
                    y0 = _mm_mullo_epi16(y1, k2);
                    y1 = _mm_mulhi_epi16(y1, k2);
                    z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                    z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                    z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                    z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                    _mm_store_si128((__m128i*)(dst + i), z0);
                    _mm_store_si128((__m128i*)(dst + i + 4), z1);
                    _mm_store_si128((__m128i*)(dst + i + 8), z2);
                    _mm_store_si128((__m128i*)(dst + i + 12), z3);
                }
            }
        }

        src -= (_ksize/2)*cn;
        kx -= _ksize/2;
        for( ; i <= width - 4; i += 4, src += 4 )
        {
            __m128i f, s0 = z, x0, x1;

            for( k = j = 0; k < _ksize; k++, j += cn )
            {
                f = _mm_cvtsi32_si128(kx[k]);
                f = _mm_shuffle_epi32(f, 0);
                f = _mm_packs_epi32(f, f);

                x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
                x0 = _mm_unpacklo_epi8(x0, z);
                x1 = _mm_mulhi_epi16(x0, f);
                x0 = _mm_mullo_epi16(x0, f);
                s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
            }
            _mm_store_si128((__m128i*)(dst + i), s0);
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
    bool smallValues;
};

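// Note (added comment): the column filters below consume the 32-bit
// intermediate rows produced by the integer row filters above. The constructor
// undoes the fixed-point scaling of the kernel (a 2^_bits factor, applied
// where the kernel is prepared) by converting it to float with a
// 1/(1 << _bits) factor and scaling delta the same way; sums are accumulated
// in single-precision float, then rounded, packed and saturated down to the
// destination type.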
struct SymmColumnVec_32s8u
{
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const __m128i *S, *S2;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128i x0, x1;
                S = (const __m128i*)(src[0] + i);
                s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
                s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
                s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128i x0;
                __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128i x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, s0 = d4;
                __m128i x0;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};


struct SymmColumnSmallVec_32s16s
{
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        __m128 df4 = _mm_set1_ps(delta);
        __m128i d4 = _mm_cvtps_epi32(df4);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1;
                    s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
                    s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
                    __m128i x0, x1;
                    x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3;
                    s0 = _mm_load_si128((__m128i*)(S2 + i));
                    s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S0 + i));
                    s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
                    s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = df4, s1 = df4;
                    __m128i x0, x1;
                    x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};


/////////////////////////////////////// 32f //////////////////////////////////

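// Note (added comment): the 32f path below needs no widening or fixed-point
// tricks; samples and kernel values are multiplied and accumulated directly
// in __m128 registers, four floats at a time.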
1227 struct RowVec_32f
1228 {
1229     RowVec_32f() {}
1230     RowVec_32f( const Mat& _kernel )
1231     {
1232         kernel = _kernel;
1233     }
1234
1235     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
1236     {
1237         int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
1238         float* dst = (float*)_dst;
1239         const float* _kx = (const float*)kernel.data;
1240         width *= cn;
1241
1242         for( ; i <= width - 8; i += 8 )
1243         {
1244             const float* src = (const float*)_src + i;
1245             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
1246             for( k = 0; k < _ksize; k++, src += cn )
1247             {
1248                 f = _mm_load_ss(_kx+k);
1249                 f = _mm_shuffle_ps(f, f, 0);
1250
1251                 x0 = _mm_loadu_ps(src);
1252                 x1 = _mm_loadu_ps(src + 4);
1253                 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1254                 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1255             }
1256             _mm_store_ps(dst + i, s0);
1257             _mm_store_ps(dst + i + 4, s1);
1258         }
1259         return i;
1260     }
1261
1262     Mat kernel;
1263 };
1264
1265
struct SymmRowSmallVec_32f
{
    SymmRowSmallVec_32f() {}
    SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* src = (const float*)_src + (_ksize/2)*cn;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float* kx = (const float*)kernel.data + _ksize/2;
        width *= cn;

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
                        y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn*2);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn*2);
                        y0 = _mm_loadu_ps(src - cn*2 + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn*2 + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));

                        x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                        y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);
                        x0 = _mm_sub_ps(x0, x2);
                        y0 = _mm_sub_ps(y0, y2);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);

                        x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                for( ; i <= width - 8; i += 8, src += 8 )
                {
                    __m128 x0, x2, y0, y2;
                    x0 = _mm_loadu_ps(src + cn);
                    x2 = _mm_loadu_ps(src - cn);
                    y0 = _mm_loadu_ps(src + cn + 4);
                    y2 = _mm_loadu_ps(src - cn + 4);

                    x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                    y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);

                    x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                    y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                    x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                    y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            }
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
};

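// Vertical (column) convolution for float data with a symmetric or
// antisymmetric kernel. 'src' is an array of row pointers centered on the
// anchor row src[0]; symmetric taps are folded as src[k] + src[-k],
// antisymmetric ones as src[k] - src[-k], and 'delta' is added once per
// output element.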
struct SymmColumnVec_32f
{
    SymmColumnVec_32f() { symmetryType=0; }
    SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                _mm_storeu_ps(dst + i, s0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128 x0, x1;
                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, x0, s0 = d4;

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                _mm_storeu_ps(dst + i, s0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

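// 3-tap specialization of the column pass above, again with fast paths for
// the (1,2,1), (1,-2,1) and (-1,0,1) coefficient patterns.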
struct SymmColumnSmallVec_32f
{
    SymmColumnSmallVec_32f() { symmetryType=0; }
    SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = (const float*)kernel.data + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, x0, x1;
                    s0 = _mm_load_ps(S1 + i);
                    s1 = _mm_load_ps(S1 + i + 4);
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
                    x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
                    x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3;
                    s0 = _mm_load_ps(S2 + i);
                    s1 = _mm_load_ps(S2 + i + 4);
                    s2 = _mm_load_ps(S0 + i);
                    s3 = _mm_load_ps(S0 + i + 4);
                    s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
                    s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = d4, s1 = d4, x0, x1;
                    x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
                    x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};


/////////////////////////////// non-separable filters ///////////////////////////////

///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////

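// Non-separable 2D filtering of 8-bit data. The constructor rescales the
// fixed-point kernel back to floating point (dividing by 2^_bits) and
// gathers its nonzero taps via preprocess2DKernel(); operator() expands
// pixels to float, accumulates, and packs the result back to 8 bits with
// saturation.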
struct FilterVec_8u
{
    FilterVec_8u() {}
    FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* dst, int width) const
    {
        const float* kf = (const float*)&coeffs[0];
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            x0 = _mm_packus_epi16(x0, x1);
            _mm_storeu_si128((__m128i*)(dst + i), x0);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
        }

        return i;
    }

    int _nz;
    vector<uchar> coeffs;
    float delta;
};

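// Same scheme as FilterVec_8u, but the accumulated result is stored as
// signed 16-bit values, so only one packing step is needed.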
struct FilterVec_8u16s
{
    FilterVec_8u16s() {}
    FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* _dst, int width) const
    {
        const float* kf = (const float*)&coeffs[0];
        short* dst = (short*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            _mm_storeu_si128((__m128i*)(dst + i), x0);
            _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            _mm_storel_epi64((__m128i*)(dst + i), x0);
        }

        return i;
    }

    int _nz;
    vector<uchar> coeffs;
    float delta;
};

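// Non-separable 2D filtering of float data: a plain multiply-accumulate over
// the nonzero kernel taps, 16 (then 4) pixels at a time.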
struct FilterVec_32f
{
    FilterVec_32f() {}
    FilterVec_32f(const Mat& _kernel, int, double _delta)
    {
        delta = (float)_delta;
        vector<Point> coords;
        preprocess2DKernel(_kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        const float* kf = (const float*)&coeffs[0];
        const float** src = (const float**)_src;
        float* dst = (float*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);
                const float* S = src[k] + i;

                t0 = _mm_loadu_ps(S);
                t1 = _mm_loadu_ps(S + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_loadu_ps(S + 8);
                t1 = _mm_loadu_ps(S + 12);
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);
                t0 = _mm_loadu_ps(src[k] + i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }
            _mm_storeu_ps(dst + i, s0);
        }

        return i;
    }

    int _nz;
    vector<uchar> coeffs;
    float delta;
};


#else

typedef RowNoVec RowVec_8u32s;
typedef RowNoVec RowVec_32f;
typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
typedef ColumnNoVec SymmColumnVec_32s8u;
typedef ColumnNoVec SymmColumnVec_32f;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
typedef FilterNoVec FilterVec_8u;
typedef FilterNoVec FilterVec_8u16s;
typedef FilterNoVec FilterVec_32f;

#endif

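// Generic row filter. vecOp (one of the SIMD helpers above, or RowNoVec)
// handles the vectorizable prefix of each row; the 4-way unrolled scalar
// loop and the final per-element loop below finish whatever it left over.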
template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
{
    RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
    {
        if( _kernel.isContinuous() )
            kernel = _kernel;
        else
            _kernel.copyTo(kernel);
        anchor = _anchor;
        ksize = kernel.rows + kernel.cols - 1;
        CV_Assert( kernel.type() == DataType<DT>::type &&
                   (kernel.rows == 1 || kernel.cols == 1));
        vecOp = _vecOp;
    }

    void operator()(const uchar* src, uchar* dst, int width, int cn)
    {
        int _ksize = ksize;
        const DT* kx = (const DT*)kernel.data;
        const ST* S;
        DT* D = (DT*)dst;
        int i, k;

        i = vecOp(src, dst, width, cn);
        width *= cn;

        for( ; i <= width - 4; i += 4 )
        {
            S = (const ST*)src + i;
            DT f = kx[0];
            DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];

            for( k = 1; k < _ksize; k++ )
            {
                S += cn;
                f = kx[k];
                s0 += f*S[0]; s1 += f*S[1];
                s2 += f*S[2]; s3 += f*S[3];
            }

            D[i] = s0; D[i+1] = s1;
            D[i+2] = s2; D[i+3] = s3;
        }

        for( ; i < width; i++ )
        {
            S = (const ST*)src + i;
            DT s0 = kx[0]*S[0];
            for( k = 1; k < _ksize; k++ )
            {
                S += cn;
                s0 += kx[k]*S[0];
            }
            D[i] = s0;
        }
    }

    Mat kernel;
    VecOp vecOp;
};

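// Scalar counterpart of the SymmRowSmallVec_* helpers: small (ksize <= 5)
// symmetric/antisymmetric row kernels with the same hard-wired special cases.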
template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
    public RowFilter<ST, DT, VecOp>
{
    SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
                        const VecOp& _vecOp = VecOp())
        : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
    {
        symmetryType = _symmetryType;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
    }

    void operator()(const uchar* src, uchar* dst, int width, int cn)
    {
        int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
        const DT* kx = (const DT*)this->kernel.data + ksize2;
        bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
        DT* D = (DT*)dst;
        int i = this->vecOp(src, dst, width, cn), j, k;
        const ST* S = (const ST*)src + i + ksize2n;
        width *= cn;

        if( symmetrical )
        {
            if( this->ksize == 1 && kx[0] == 1 )
            {
                for( ; i <= width - 2; i += 2 )
                {
                    DT s0 = S[i], s1 = S[i+1];
                    D[i] = s0; D[i+1] = s1;
                }
                S += i;
            }
            else if( this->ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
                        D[i] = s0; D[i+1] = s1;
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
                        D[i] = s0; D[i+1] = s1;
                    }
                else
                {
                    DT k0 = kx[0], k1 = kx[1];
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
                        D[i] = s0; D[i+1] = s1;
                    }
                }
            }
            else if( this->ksize == 5 )
            {
                DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
                if( k0 == -2 && k1 == 0 && k2 == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
                        DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
                        D[i] = s0; D[i+1] = s1;
                    }
                else
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
                        DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
                        D[i] = s0; D[i+1] = s1;
                    }
            }

            for( ; i < width; i++, S++ )
            {
                DT s0 = kx[0]*S[0];
                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
                    s0 += kx[k]*(S[j] + S[-j]);
                D[i] = s0;
            }
        }
        else
        {
            if( this->ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
                        D[i] = s0; D[i+1] = s1;
                    }
                else
                {
                    DT k1 = kx[1];
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
                        D[i] = s0; D[i+1] = s1;
                    }
                }
            }
            else if( this->ksize == 5 )
            {
                DT k1 = kx[1], k2 = kx[2];
                for( ; i <= width - 2; i += 2, S += 2 )
                {
                    DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
                    DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
                    D[i] = s0; D[i+1] = s1;
                }
            }

            for( ; i < width; i++, S++ )
            {
                DT s0 = kx[0]*S[0];
                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
                    s0 += kx[k]*(S[j] - S[-j]);
                D[i] = s0;
            }
        }
    }

    int symmetryType;
};

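// Generic column filter. CastOp converts the accumulator type back to the
// destination type (with saturation and, for fixed-point kernels, the
// rounding shift); VecOp again handles the vectorizable prefix of each row.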
template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
{
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    ColumnFilter( const Mat& _kernel, int _anchor,
        double _delta, const CastOp& _castOp=CastOp(),
        const VecOp& _vecOp=VecOp() )
    {
        if( _kernel.isContinuous() )
            kernel = _kernel;
        else
            _kernel.copyTo(kernel);
        anchor = _anchor;
        ksize = kernel.rows + kernel.cols - 1;
        delta = saturate_cast<ST>(_delta);
        castOp0 = _castOp;
        vecOp = _vecOp;
        CV_Assert( kernel.type() == DataType<ST>::type &&
                   (kernel.rows == 1 || kernel.cols == 1));
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        const ST* ky = (const ST*)kernel.data;
        ST _delta = delta;
        int _ksize = ksize;
        int i, k;
        CastOp castOp = castOp0;

        for( ; count--; dst += dststep, src++ )
        {
            DT* D = (DT*)dst;
            i = vecOp(src, dst, width);
            for( ; i <= width - 4; i += 4 )
            {
                ST f = ky[0];
                const ST* S = (const ST*)src[0] + i;
                ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
                    s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;

                for( k = 1; k < _ksize; k++ )
                {
                    S = (const ST*)src[k] + i; f = ky[k];
                    s0 += f*S[0]; s1 += f*S[1];
                    s2 += f*S[2]; s3 += f*S[3];
                }

                D[i] = castOp(s0); D[i+1] = castOp(s1);
                D[i+2] = castOp(s2); D[i+3] = castOp(s3);
            }

            for( ; i < width; i++ )
            {
                ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
                for( k = 1; k < _ksize; k++ )
                    s0 += ky[k]*((const ST*)src[k])[i];
                D[i] = castOp(s0);
            }
        }
    }

    Mat kernel;
    CastOp castOp0;
    VecOp vecOp;
    ST delta;
};

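// Column filter exploiting kernel symmetry: rows equidistant from the anchor
// are added (symmetric) or subtracted (antisymmetric) before the single
// multiplication, roughly halving the multiply count.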
template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
{
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    SymmColumnFilter( const Mat& _kernel, int _anchor,
        double _delta, int _symmetryType,
        const CastOp& _castOp=CastOp(),
        const VecOp& _vecOp=VecOp())
        : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
    {
        symmetryType = _symmetryType;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int ksize2 = this->ksize/2;
        const ST* ky = (const ST*)this->kernel.data + ksize2;
        int i, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        ST _delta = this->delta;
        CastOp castOp = this->castOp0;
        src += ksize2;

        if( symmetrical )
        {
            for( ; count--; dst += dststep, src++ )
            {
                DT* D = (DT*)dst;
                i = (this->vecOp)(src, dst, width);

                for( ; i <= width - 4; i += 4 )
                {
                    ST f = ky[0];
                    const ST* S = (const ST*)src[0] + i, *S2;
                    ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
                        s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;

                    for( k = 1; k <= ksize2; k++ )
                    {
                        S = (const ST*)src[k] + i;
                        S2 = (const ST*)src[-k] + i;
                        f = ky[k];
                        s0 += f*(S[0] + S2[0]);
                        s1 += f*(S[1] + S2[1]);
                        s2 += f*(S[2] + S2[2]);
                        s3 += f*(S[3] + S2[3]);
                    }

                    D[i] = castOp(s0); D[i+1] = castOp(s1);
                    D[i+2] = castOp(s2); D[i+3] = castOp(s3);
                }

                for( ; i < width; i++ )
                {
                    ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
                    for( k = 1; k <= ksize2; k++ )
                        s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
                    D[i] = castOp(s0);
                }
            }
        }
        else
        {
            for( ; count--; dst += dststep, src++ )
            {
                DT* D = (DT*)dst;
                i = this->vecOp(src, dst, width);

                for( ; i <= width - 4; i += 4 )
                {
                    ST f = ky[0];
                    const ST *S, *S2;
                    ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

                    for( k = 1; k <= ksize2; k++ )
                    {
                        S = (const ST*)src[k] + i;
                        S2 = (const ST*)src[-k] + i;
                        f = ky[k];
                        s0 += f*(S[0] - S2[0]);
                        s1 += f*(S[1] - S2[1]);
                        s2 += f*(S[2] - S2[2]);
                        s3 += f*(S[3] - S2[3]);
                    }

                    D[i] = castOp(s0); D[i+1] = castOp(s1);
                    D[i+2] = castOp(s2); D[i+3] = castOp(s3);
                }

                for( ; i < width; i++ )
                {
                    ST s0 = _delta;
                    for( k = 1; k <= ksize2; k++ )
                        s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
                    D[i] = castOp(s0);
                }
            }
        }
    }

    int symmetryType;
};

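// 3-tap specialization of SymmColumnFilter with the usual (1,2,1), (1,-2,1)
// and (-1,0,1) fast paths.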
template<class CastOp, class VecOp>
struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
{
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
                           double _delta, int _symmetryType,
                           const CastOp& _castOp=CastOp(),
                           const VecOp& _vecOp=VecOp())
        : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
    {
        CV_Assert( this->ksize == 3 );
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int ksize2 = this->ksize/2;
        const ST* ky = (const ST*)this->kernel.data + ksize2;
        int i;
        bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
        bool is_1_2_1 = ky[0] == 1 && ky[1] == 2;
        bool is_1_m2_1 = ky[0] == 1 && ky[1] == -2;
        bool is_m1_0_1 = ky[1] == 1 || ky[1] == -1;
        ST f0 = ky[0], f1 = ky[1];
        ST _delta = this->delta;
        CastOp castOp = this->castOp0;
        src += ksize2;

        for( ; count--; dst += dststep, src++ )
        {
            DT* D = (DT*)dst;
            i = (this->vecOp)(src, dst, width);
            const ST* S0 = (const ST*)src[-1];
            const ST* S1 = (const ST*)src[0];
            const ST* S2 = (const ST*)src[1];

            if( symmetrical )
            {
                if( is_1_2_1 )
                {
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
                        ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
                        s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
                }
                else if( is_1_m2_1 )
                {
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
                        ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
                        s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
                }
                else
                {
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
                        ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
                        s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
                }

                for( ; i < width; i++ )
                    D[i] = castOp((S0[i] + S2[i])*f1 + S1[i]*f0 + _delta);
            }
            else
            {
                if( is_m1_0_1 )
                {
                    if( f1 < 0 )
                        std::swap(S0, S2);

                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = S2[i] - S0[i] + _delta;
                        ST s1 = S2[i+1] - S0[i+1] + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = S2[i+2] - S0[i+2] + _delta;
                        s1 = S2[i+3] - S0[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }

                    if( f1 < 0 )
                        std::swap(S0, S2);
                }
                else
                {
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = (S2[i] - S0[i])*f1 + _delta;
                        ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
                        s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
                }

                for( ; i < width; i++ )
                    D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
            }
        }
    }
};

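// Conversion functors used by the column filters: Cast is a plain
// saturate_cast, while the FixedPtCast variants add the rounding constant
// and shift fixed-point sums back down. For example, FixedPtCast<int, uchar, 8>
// maps an accumulator val to saturate_cast<uchar>((val + 128) >> 8).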
template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};

template<typename ST, typename DT> struct FixedPtCastEx
{
    typedef ST type1;
    typedef DT rtype;

    FixedPtCastEx() : SHIFT(0), DELTA(0) {}
    FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
    int SHIFT, DELTA;
};

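// Factory for the row pass: picks a specialized small-kernel filter when the
// kernel is short and (anti)symmetric, otherwise a generic RowFilter
// instantiated for the supported source/buffer depth combination.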
Ptr<BaseRowFilter> getLinearRowFilter( int srcType, int bufType,
                                          const Mat& kernel, int anchor,
                                          int symmetryType )
{
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
    int cn = CV_MAT_CN(srcType);
    CV_Assert( cn == CV_MAT_CN(bufType) &&
        ddepth >= std::max(sdepth, CV_32S) &&
        kernel.type() == ddepth );
    int ksize = kernel.rows + kernel.cols - 1;

    if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
    {
        if( sdepth == CV_8U && ddepth == CV_32S )
            return Ptr<BaseRowFilter>(new SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s>
                (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)));
        if( sdepth == CV_32F && ddepth == CV_32F )
            return Ptr<BaseRowFilter>(new SymmRowSmallFilter<float, float, SymmRowSmallVec_32f>
                (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)));
    }

    if( sdepth == CV_8U && ddepth == CV_32S )
        return Ptr<BaseRowFilter>(new RowFilter<uchar, int, RowVec_8u32s>
            (kernel, anchor, RowVec_8u32s(kernel)));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<uchar, float, RowNoVec>(kernel, anchor));
    if( sdepth == CV_8U && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<uchar, double, RowNoVec>(kernel, anchor));
    if( sdepth == CV_16U && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<ushort, float, RowNoVec>(kernel, anchor));
    if( sdepth == CV_16U && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<ushort, double, RowNoVec>(kernel, anchor));
    if( sdepth == CV_16S && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<short, float, RowNoVec>(kernel, anchor));
    if( sdepth == CV_16S && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<short, double, RowNoVec>(kernel, anchor));
    if( sdepth == CV_32F && ddepth == CV_32F )
        return Ptr<BaseRowFilter>(new RowFilter<float, float, RowVec_32f>
            (kernel, anchor, RowVec_32f(kernel)));
    if( sdepth == CV_64F && ddepth == CV_64F )
        return Ptr<BaseRowFilter>(new RowFilter<double, double, RowNoVec>(kernel, anchor));

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d), and buffer format (=%d)",
        srcType, bufType));

    return Ptr<BaseRowFilter>(0);
}

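// Factory for the column pass, mirroring getLinearRowFilter: the 'bits'
// parameter tells the fixed-point casts how far to shift the 32-bit integer
// sums back down to the destination depth.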
Ptr<BaseColumnFilter> getLinearColumnFilter( int bufType, int dstType,
                                             const Mat& kernel, int anchor,
                                             int symmetryType, double delta,
                                             int bits )
{
    int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(dstType);
    CV_Assert( cn == CV_MAT_CN(bufType) &&
        sdepth >= std::max(ddepth, CV_32S) &&
        kernel.type() == sdepth );

    if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
    {
        if( ddepth == CV_8U && sdepth == CV_32S )
            return Ptr<BaseColumnFilter>(new ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec>
            (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits)));
        if( ddepth == CV_8U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, uchar>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_8U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, uchar>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, ushort>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, ushort>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16S && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, short>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_16S && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, short>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_32F && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, float>, ColumnNoVec>(kernel, anchor, delta));
        if( ddepth == CV_64F && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, double>, ColumnNoVec>(kernel, anchor, delta));
    }
    else
    {
        int ksize = kernel.rows + kernel.cols - 1;
        if( ksize == 3 )
        {
            if( ddepth == CV_8U && sdepth == CV_32S )
                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
                    FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
                    (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
                    SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
            if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<Cast<int, short>,
                    SymmColumnSmallVec_32s16s>(kernel, anchor, delta, symmetryType,
                        Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)));
            if( ddepth == CV_32F && sdepth == CV_32F )
                return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
                    Cast<float, float>, SymmColumnSmallVec_32f>
                    (kernel, anchor, delta, symmetryType, Cast<float, float>(),
                    SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)));
        }
        if( ddepth == CV_8U && sdepth == CV_32S )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
                (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
                SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
        if( ddepth == CV_8U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, uchar>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_8U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, uchar>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16U && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, ushort>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16U && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, ushort>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16S && sdepth == CV_32S )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<int, short>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16S && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_16S && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, short>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
        if( ddepth == CV_32F && sdepth == CV_32F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f>
                (kernel, anchor, delta, symmetryType, Cast<float, float>(),
                SymmColumnVec_32f(kernel, symmetryType, 0, delta)));
        if( ddepth == CV_64F && sdepth == CV_64F )
            return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, double>, ColumnNoVec>
                (kernel, anchor, delta, symmetryType));
    }

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of buffer format (=%d), and destination format (=%d)",
        bufType, dstType));

    return Ptr<BaseColumnFilter>(0);
}

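// Builds a complete separable-filter engine. 8-bit inputs with suitable
// integer kernels are routed through a 32-bit fixed-point pipeline (each
// kernel is scaled by 2^8, hence the doubled shift afterwards); everything
// else goes through a float or double intermediate buffer.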
2642 Ptr<FilterEngine> createSeparableLinearFilter(
2643     int _srcType, int _dstType,
2644     const Mat& _rowKernel, const Mat& _columnKernel,
2645     Point _anchor, double _delta,
2646     int _rowBorderType, int _columnBorderType,
2647     const Scalar& _borderValue )
2648 {
2649     _srcType = CV_MAT_TYPE(_srcType);
2650     _dstType = CV_MAT_TYPE(_dstType);
2651     int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
2652     int cn = CV_MAT_CN(_srcType);
2653     CV_Assert( cn == CV_MAT_CN(_dstType) );
2654     int rsize = _rowKernel.rows + _rowKernel.cols - 1;
2655     int csize = _columnKernel.rows + _columnKernel.cols - 1;
2656     if( _anchor.x < 0 )
2657         _anchor.x = rsize/2;
2658     if( _anchor.y < 0 )
2659         _anchor.y = csize/2;
2660     int rtype = getKernelType(_rowKernel,
2661         _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
2662     int ctype = getKernelType(_columnKernel,
2663         _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
2664     Mat rowKernel, columnKernel;
2665
2666     int bdepth = std::max(CV_32F,std::max(sdepth, ddepth));
2667     int bits = 0;
2668
2669     if( sdepth == CV_8U &&
2670         ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
2671           ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
2672           ddepth == CV_8U) ||
2673          ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
2674           (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
2675           (rtype & ctype & KERNEL_INTEGER) &&
2676           ddepth == CV_16S)) )
2677     {
2678         bdepth = CV_32S;
2679         bits = ddepth == CV_8U ? 8 : 0;
2680         _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
2681         _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
2682         bits *= 2;
2683         _delta *= (1 << bits);
2684     }
2685     else
2686     {
2687         if( _rowKernel.type() != bdepth )
2688             _rowKernel.convertTo( rowKernel, bdepth );
2689         else
2690             rowKernel = _rowKernel;
2691         if( _columnKernel.type() != bdepth )
2692             _columnKernel.convertTo( columnKernel, bdepth );
2693         else
2694             columnKernel = _columnKernel;
2695     }
2696
2697     int _bufType = CV_MAKETYPE(bdepth, cn);
2698     Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
2699         _srcType, _bufType, rowKernel, _anchor.x, rtype);
2700     Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
2701         _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );
2702
2703     return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(0), _rowFilter, _columnFilter,
2704         _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
2705 }
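
/*
 Usage sketch (illustrative only; the image dimensions below are assumptions
 for the example): smooth an 8-bit BGR image with a normalized 5-tap box
 kernel applied along both axes. The engine selects the CV_32S fixed-point
 buffer path automatically, because both kernels are smoothing and
 symmetrical and the output depth is CV_8U.

     Mat src(480, 640, CV_8UC3), dst;
     Mat k = Mat::ones(5, 1, CV_32F) * (1.f/5.f);   // 1-D box kernel
     Ptr<FilterEngine> f = createSeparableLinearFilter(
         src.type(), src.type(), k, k,
         Point(-1,-1), 0, BORDER_DEFAULT );
     f->apply(src, dst);                            // dst = 5x5 box blur
*/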


/****************************************************************************************\
*                               Non-separable linear filter                              *
\****************************************************************************************/

// Converts a dense 2D kernel into a sparse list of taps: coords[k] is the
// (x,y) offset of the k-th non-zero coefficient, and coeffs stores its value
// in the kernel's native element type.
void preprocess2DKernel( const Mat& kernel, vector<Point>& coords, vector<uchar>& coeffs )
{
    int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
    if(nz == 0)
        nz = 1;
    CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
    coords.resize(nz);
    coeffs.resize(nz*getElemSize(ktype));
    uchar* _coeffs = &coeffs[0];

    for( i = k = 0; i < kernel.rows; i++ )
    {
        const uchar* krow = kernel.data + kernel.step*i;
        for( j = 0; j < kernel.cols; j++ )
        {
            if( ktype == CV_8U )
            {
                uchar val = krow[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                _coeffs[k++] = val;
            }
            else if( ktype == CV_32S )
            {
                int val = ((const int*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((int*)_coeffs)[k++] = val;
            }
            else if( ktype == CV_32F )
            {
                float val = ((const float*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((float*)_coeffs)[k++] = val;
            }
            else
            {
                double val = ((const double*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((double*)_coeffs)[k++] = val;
            }
        }
    }
}
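
/*
 For example (values chosen purely for illustration), a 3x3 cross-shaped
 CV_32F kernel

         0 1 0
         1 4 1
         0 1 0

 is flattened to nz = 5 taps: coords = {(1,0), (0,1), (1,1), (2,1), (1,2)}
 and coeffs = {1, 1, 4, 1, 1} stored as raw floats, so the filtering loop
 only visits the non-zero positions.
*/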


template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
{
    typedef typename CastOp::type1 KT;
    typedef typename CastOp::rtype DT;

    Filter2D( const Mat& _kernel, Point _anchor,
        double _delta, const CastOp& _castOp=CastOp(),
        const VecOp& _vecOp=VecOp() )
    {
        anchor = _anchor;
        ksize = _kernel.size();
        delta = saturate_cast<KT>(_delta);
        castOp0 = _castOp;
        vecOp = _vecOp;
        CV_Assert( _kernel.type() == DataType<KT>::type );
        preprocess2DKernel( _kernel, coords, coeffs );
        ptrs.resize( coords.size() );
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
    {
        KT _delta = delta;
        const Point* pt = &coords[0];
        const KT* kf = (const KT*)&coeffs[0];
        const ST** kp = (const ST**)&ptrs[0];
        int i, k, nz = (int)coords.size();
        CastOp castOp = castOp0;

        width *= cn;
        for( ; count > 0; count--, dst += dststep, src++ )
        {
            DT* D = (DT*)dst;

            // position the tap pointers for this output row
            for( k = 0; k < nz; k++ )
                kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;

            // let the SIMD path handle as much of the row as it can
            i = vecOp((const uchar**)kp, dst, width);

            // scalar path, unrolled to 4 outputs per iteration
            for( ; i <= width - 4; i += 4 )
            {
                KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

                for( k = 0; k < nz; k++ )
                {
                    const ST* sptr = kp[k] + i;
                    KT f = kf[k];
                    s0 += f*sptr[0];
                    s1 += f*sptr[1];
                    s2 += f*sptr[2];
                    s3 += f*sptr[3];
                }

                D[i] = castOp(s0); D[i+1] = castOp(s1);
                D[i+2] = castOp(s2); D[i+3] = castOp(s3);
            }

            // tail of the row
            for( ; i < width; i++ )
            {
                KT s0 = _delta;
                for( k = 0; k < nz; k++ )
                    s0 += kf[k]*kp[k][i];
                D[i] = castOp(s0);
            }
        }
    }

    vector<Point> coords;
    vector<uchar> coeffs;
    vector<uchar*> ptrs;
    KT delta;
    CastOp castOp0;
    VecOp vecOp;
};
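
/*
 The operator() above evaluates, for every output pixel, the plain
 correlation sum

     D(x, y) = delta + sum_k kf[k] * S(x + pt[k].x, y + pt[k].y)

 over the nz non-zero taps produced by preprocess2DKernel, with vecOp
 covering whatever prefix of the row the SIMD implementation supports
 and the unrolled scalar loop finishing the rest.
*/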


Ptr<BaseFilter> getLinearFilter(int srcType, int dstType,
                                const Mat& _kernel, Point anchor,
                                double delta, int bits)
{
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
    CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );

    anchor = normalizeAnchor(anchor, _kernel.size());

    if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
        return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u>
            (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
            FilterVec_8u(_kernel, bits, delta)));
    if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
        return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s>
            (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
            FilterVec_8u16s(_kernel, bits, delta)));

    // floating-point fallback: undo any fixed-point scaling the caller applied
    kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
    Mat kernel;
    if( _kernel.type() == kdepth )
        kernel = _kernel;
    else
        _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);

    if( sdepth == CV_8U && ddepth == CV_8U )
        return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, uchar>, FilterVec_8u>
            (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta)));
    if( sdepth == CV_8U && ddepth == CV_16U )
        return Ptr<BaseFilter>(new Filter2D<uchar,
            Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_8U && ddepth == CV_16S )
        return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, short>, FilterVec_8u16s>
            (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta)));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<uchar,
            Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_8U && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<uchar,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    if( sdepth == CV_16U && ddepth == CV_16U )
        return Ptr<BaseFilter>(new Filter2D<ushort,
            Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16U && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<ushort,
            Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16U && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<ushort,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    if( sdepth == CV_16S && ddepth == CV_16S )
        return Ptr<BaseFilter>(new Filter2D<short,
            Cast<float, short>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16S && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<short,
            Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
    if( sdepth == CV_16S && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<short,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    if( sdepth == CV_32F && ddepth == CV_32F )
        return Ptr<BaseFilter>(new Filter2D<float, Cast<float, float>, FilterVec_32f>
            (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta)));
    if( sdepth == CV_64F && ddepth == CV_64F )
        return Ptr<BaseFilter>(new Filter2D<double,
            Cast<double, double>, FilterNoVec>(kernel, anchor, delta));

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d) and destination format (=%d)",
        srcType, dstType));

    return Ptr<BaseFilter>(0);
}


Ptr<FilterEngine> createLinearFilter( int _srcType, int _dstType, const Mat& _kernel,
                         Point _anchor, double _delta,
                         int _rowBorderType, int _columnBorderType,
                         const Scalar& _borderValue )
{
    _srcType = CV_MAT_TYPE(_srcType);
    _dstType = CV_MAT_TYPE(_dstType);
    int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
    int cn = CV_MAT_CN(_srcType);
    CV_Assert( cn == CV_MAT_CN(_dstType) );

    Mat kernel = _kernel;
    int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
    int bits = 0;

    // small kernels (at most 1024 taps) on 8-bit input use the integer
    // accumulation path; floating-point kernels are quantized to Q11 fixed
    // point, integer kernels are taken as-is
    if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
        _kernel.rows*_kernel.cols <= (1 << 10) )
    {
        bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
        _kernel.convertTo(kernel, CV_32S, 1 << bits);
    }

    Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
        kernel, _anchor, _delta, bits);

    return Ptr<FilterEngine>(new FilterEngine(_filter2D, Ptr<BaseRowFilter>(0),
        Ptr<BaseColumnFilter>(0), _srcType, _dstType, _srcType,
        _rowBorderType, _columnBorderType, _borderValue ));
}
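
/*
 Worked example of the fixed-point branch (the numbers are illustrative):
 for a CV_8U -> CV_8U filter with a floating-point kernel, bits = 11, so
 every coefficient v is stored as the integer round(v * 2048); a coefficient
 of 0.0625 becomes 128. The per-pixel sums are then accumulated in CV_32S,
 and FixedPtCastEx shifts the result back down by 11 bits when writing the
 uchar output.
*/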


void filter2D( const Mat& src, Mat& dst, int ddepth,
               const Mat& kernel, Point anchor,
               double delta, int borderType )
{
    if( ddepth < 0 )
        ddepth = src.depth();

    // crossover point between the direct and the DFT-based implementation;
    // the SSE2 build vectorizes more cases, so direct filtering stays
    // competitive up to larger kernels there
#if CV_SSE2
    int dft_filter_size = (src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
        (src.depth() == CV_32F && ddepth == CV_32F) ? 130 : 50;
#else
    int dft_filter_size = 50;
#endif

    dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    anchor = normalizeAnchor(anchor, kernel.size());

    if( kernel.cols*kernel.rows >= dft_filter_size &&
        kernel.cols <= src.cols && kernel.rows <= src.rows )
    {
        // the DFT path cannot run in place, so copy the source when it aliases dst
        Mat temp;
        if( src.data != dst.data )
            temp = src;
        else
            src.copyTo(temp);
        crossCorr( temp, kernel, dst, anchor, delta, borderType );
        return;
    }

    Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
                                             anchor, delta, borderType );
    f->apply(src, dst);
}
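
/*
 Usage sketch (illustrative; the input matrix is an assumption for the
 example): apply a 3x3 sharpening kernel. The kernel is far below the DFT
 size threshold, so this goes through the direct FilterEngine path.

     float kdata[] = { 0, -1,  0,
                      -1,  5, -1,
                       0, -1,  0 };
     Mat src(480, 640, CV_8UC1), dst;
     Mat kernel(3, 3, CV_32F, kdata);
     filter2D( src, dst, -1, kernel );      // ddepth = -1 keeps src depth
*/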


void sepFilter2D( const Mat& src, Mat& dst, int ddepth,
                  const Mat& kernelX, const Mat& kernelY, Point anchor,
                  double delta, int borderType )
{
    if( ddepth < 0 )
        ddepth = src.depth();

    dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );

    Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
        dst.type(), kernelX, kernelY, anchor, delta, borderType );
    f->apply(src, dst);
}
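
/*
 Usage sketch (illustrative): a separable Gaussian smoothing pass built from
 two 1-D kernels, much as the higher-level smoothing functions do internally.

     Mat src(480, 640, CV_8UC3), dst;
     Mat g = getGaussianKernel( 7, 1.5, CV_32F );
     sepFilter2D( src, dst, -1, g, g );     // same kernel for rows & columns
*/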

}


CV_IMPL void
cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    cv::Mat kernel = cv::cvarrToMat(_kernel);

    CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );

    cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
}
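
/*
 Usage sketch for the C API wrapper (illustrative; src_arr and dst_arr stand
 for existing IplImage* or CvMat* arguments of matching size and channel
 count). Note the wrapper always replicates the border and keeps the
 destination depth.

     CvMat* kernel = cvCreateMat( 3, 3, CV_32FC1 );
     cvSet( kernel, cvScalar(1./9) );                   // 3x3 box filter
     cvFilter2D( src_arr, dst_arr, kernel, cvPoint(-1,-1) );
     cvReleaseMat( &kernel );
*/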

/* End of file. */