--- /dev/null
+From 04c686998b70b616d6c3a13cc694cf0811ba8650 Mon Sep 17 00:00:00 2001
+From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Tue, 14 Jul 2009 21:50:18 +0300
+Subject: [PATCH] ARM NEON optimized version of uyvy422_to_yuv420p function
+
+---
+ gst/ffmpegcolorspace/imgconvert.c | 153 +++++++++++++++++++++++++++++++++++++
+ 1 files changed, 153 insertions(+), 0 deletions(-)
+
+diff --git a/gst/ffmpegcolorspace/imgconvert.c b/gst/ffmpegcolorspace/imgconvert.c
+index 2436a20..67cb19d 100644
+--- a/gst/ffmpegcolorspace/imgconvert.c
++++ b/gst/ffmpegcolorspace/imgconvert.c
+@@ -956,6 +956,158 @@ uyvy422_to_rgb24_neon (AVPicture * dst, const AVPicture * src,
+ }
+ }
+
++/* ARM NEON optimized implementation of the UYVY -> planar YUV 4:2:0 converter.
++ * Luma is copied for every source row; Cb/Cr are stored only from the first
++ * row of each row pair (the second row's chroma is discarded, not averaged). */
++static void
++uyvy422_to_yuv420p_neon (AVPicture * dst, const AVPicture * src,
++ int width, int height) /* width: pixels per row, height: number of rows */
++{
++ asm volatile /* only defines an assembler macro; no instructions are emitted here */
++ (".macro convert_macroblock_uyvy422_to_yuv420p size, store_c, src, lum, cb, cr\n" /* size: pixels per invocation; store_c: non-zero to also write chroma */
++ /* load up to 16 source pixels in UYVY format (2 bytes per pixel; a lone last pixel reads 3 bytes: U, Y, V) */
++ ".if \\size == 16\n" " vld1.8 {d0, d1, d2, d3}, [\\src]!\n" " pld [\\src, #256]\n" ".elseif \\size == 8\n" " vld1.8 {d0, d1}, [\\src]!\n" ".elseif \\size == 4\n" " vld1.8 {d0}, [\\src]!\n" ".elseif \\size == 2\n" " vld1.8 {d0[0]}, [\\src]!\n" " vld1.8 {d0[1]}, [\\src]!\n" " vld1.8 {d0[2]}, [\\src]!\n" " vld1.8 {d0[3]}, [\\src]!\n" ".elseif \\size == 1\n" " vld1.8 {d0[0]}, [\\src]!\n" " vld1.8 {d0[1]}, [\\src]!\n" " vld1.8 {d0[2]}, [\\src]!\n" ".else\n" " .error \"unsupported macroblock size\"\n" ".endif\n" " vuzp.8 d0, d1\n" /* d1 - separated Y (first 8 bytes) */
++ " vuzp.8 d2, d3\n" /* d3 - separated Y (next 8 bytes); d2/d3 are only loaded for size 16 */
++ " vuzp.8 d0, d2\n" /* d0 - separated U, d2 - separated V */
++ " vswp d1, d2\n" /* exchange d1 and d2: now d1 - V, d2 - Y (first 8 bytes) */
++ ".if \\size == 16\n" /* stores 16 Y, 8 U, 8 V */
++ " vst1.8 {d2, d3}, [\\lum]!\n"
++ ".if \\store_c\n" /* chroma stores are assembled only when store_c is non-zero */
++ " vst1.8 {d0}, [\\cb]!\n"
++ " vst1.8 {d1}, [\\cr]!\n"
++ ".endif\n"
++ ".elseif \\size == 8\n" /* stores 8 Y, 4 U, 4 V */
++ " vst1.8 {d2}, [\\lum]!\n"
++ ".if \\store_c\n"
++ " vst1.8 {d0[0]}, [\\cb]!\n"
++ " vst1.8 {d0[1]}, [\\cb]!\n"
++ " vst1.8 {d0[2]}, [\\cb]!\n"
++ " vst1.8 {d0[3]}, [\\cb]!\n"
++ " vst1.8 {d1[0]}, [\\cr]!\n"
++ " vst1.8 {d1[1]}, [\\cr]!\n"
++ " vst1.8 {d1[2]}, [\\cr]!\n"
++ " vst1.8 {d1[3]}, [\\cr]!\n"
++ ".endif\n"
++ ".elseif \\size == 4\n" /* stores 4 Y, 2 U, 2 V */
++ " vst1.8 {d2[0]}, [\\lum]!\n"
++ " vst1.8 {d2[1]}, [\\lum]!\n"
++ " vst1.8 {d2[2]}, [\\lum]!\n"
++ " vst1.8 {d2[3]}, [\\lum]!\n"
++ ".if \\store_c\n"
++ " vst1.8 {d0[0]}, [\\cb]!\n"
++ " vst1.8 {d0[1]}, [\\cb]!\n"
++ " vst1.8 {d1[0]}, [\\cr]!\n"
++ " vst1.8 {d1[1]}, [\\cr]!\n"
++ ".endif\n"
++ ".elseif \\size == 2\n" /* stores 2 Y, 1 U, 1 V */
++ " vst1.8 {d2[0]}, [\\lum]!\n"
++ " vst1.8 {d2[1]}, [\\lum]!\n"
++ ".if \\store_c\n"
++ " vst1.8 {d0[0]}, [\\cb]!\n"
++ " vst1.8 {d1[0]}, [\\cr]!\n"
++ ".endif\n"
++ ".elseif \\size == 1\n" /* odd trailing pixel: stores 1 Y, 1 U, 1 V */
++ " vst1.8 {d2[0]}, [\\lum]!\n"
++ ".if \\store_c\n"
++ " vst1.8 {d0[0]}, [\\cb]!\n"
++ " vst1.8 {d1[0]}, [\\cr]!\n"
++ ".endif\n"
++ ".else\n"
++ " .error \"unsupported macroblock size\"\n" ".endif\n" ".endm\n");
++
++ const uint8_t *p, *p1; /* p1: start of the current source row; p: read cursor within it */
++ uint8_t *lum, *cr, *cb, *lum1, *cr1, *cb1; /* *1: row starts in the destination planes; the rest: write cursors */
++ int w; /* pixels left in the current row (biased by -16 inside the asm) */
++
++ p1 = src->data[0];
++
++ lum1 = dst->data[0]; /* Y plane */
++ cb1 = dst->data[1]; /* Cb (U) plane */
++ cr1 = dst->data[2]; /* Cr (V) plane */
++
++ for (; height >= 1; height -= 2) { /* one pair of source rows per iteration */
++ p = p1;
++ lum = lum1;
++ cb = cb1;
++ cr = cr1;
++ w = width;
++#if 1 /* NEON path; the #else branch holds the plain C version */
++ asm volatile (" subs %[w], %[w], #16\n" /* first row of the pair: Y, Cb and Cr */
++ " blt 2f\n" /* fewer than 16 pixels: go straight to the tail */
++ "1:\n"
++ " convert_macroblock_uyvy422_to_yuv420p 16, 1, %[p], %[lum], %[cb], %[cr]\n"
++ " subs %[w], %[w], #16\n"
++ " bge 1b\n"
++ "2:\n"
++ " .irp size, 8, 4, 2, 1\n" /* tail: w is (remaining - 16) here, whose low 4 bits equal those of the remaining count */
++ " tst %[w], #\\size\n"
++ " beq 3f\n"
++ " convert_macroblock_uyvy422_to_yuv420p \\size, 1, %[p], %[lum], %[cb], %[cr]\n"
++ "3:\n"
++ " .endr\n":[w] "+&r" (w),[p] "+&r" (p),[lum] "+&r" (lum),
++ [cb] "+&r" (cb),[cr] "+&r" (cr)
++ ::"cc", "memory", "d0", "d1", "d2", "d3");
++#else
++ for (; w >= 2; w -= 2) {
++ lum[0] = p[1];
++ cb[0] = p[0];
++ lum[1] = p[3];
++ cr[0] = p[2];
++ p += 4;
++ lum += 2;
++ cb++;
++ cr++;
++ }
++ if (w) {
++ lum[0] = p[1];
++ cb[0] = p[0];
++ cr[0] = p[2];
++ cb++;
++ cr++;
++ }
++#endif
++ p1 += src->linesize[0];
++ lum1 += dst->linesize[0];
++ if (height > 1) { /* second row of the pair, if present: luma only */
++ p = p1;
++ lum = lum1;
++ w = width;
++#if 1
++ asm volatile (" subs %[w], %[w], #16\n"
++ " blt 2f\n"
++ "1:\n"
++ " convert_macroblock_uyvy422_to_yuv420p 16, 0, %[p], %[lum], %[cb], %[cr]\n" /* store_c = 0: cb/cr are passed but neither written nor advanced */
++ " subs %[w], %[w], #16\n"
++ " bge 1b\n"
++ "2:\n"
++ " .irp size, 8, 4, 2, 1\n"
++ " tst %[w], #\\size\n"
++ " beq 3f\n"
++ " convert_macroblock_uyvy422_to_yuv420p \\size, 0, %[p], %[lum], %[cb], %[cr]\n"
++ "3:\n"
++ " .endr\n":[w] "+&r" (w),[p] "+&r" (p),[lum] "+&r" (lum),
++ [cb] "+&r" (cb),[cr] "+&r" (cr)
++ ::"cc", "memory", "d0", "d1", "d2", "d3");
++#else
++ for (w = width; w >= 2; w -= 2) {
++ lum[0] = p[1];
++ lum[1] = p[3];
++ p += 4;
++ lum += 2;
++ }
++ if (w) {
++ lum[0] = p[1];
++ }
++#endif
++ p1 += src->linesize[0];
++ lum1 += dst->linesize[0];
++ }
++ cb1 += dst->linesize[1]; /* chroma planes advance one row per pair of source rows */
++ cr1 += dst->linesize[2];
++ }
++ asm volatile (".purgem convert_macroblock_uyvy422_to_yuv420p\n"); /* undefine the assembler macro */
++}
++
+ #endif
+
+ /* XXX: totally non optimized */
+@@ -3311,6 +3463,7 @@ static ConvertEntry convert_table[] = {
+
+ #ifdef __ARM_NEON__
+ {PIX_FMT_UYVY422, PIX_FMT_RGB24, uyvy422_to_rgb24_neon},
++ {PIX_FMT_UYVY422, PIX_FMT_YUV420P, uyvy422_to_yuv420p_neon},
+ #endif
+ {PIX_FMT_UYVY422, PIX_FMT_YUV420P, uyvy422_to_yuv420p},
+ {PIX_FMT_UYVY422, PIX_FMT_YUV422P, uyvy422_to_yuv422p},