From db7d45237ab6fc7fe90ec861cb756b2a109504a4 Mon Sep 17 00:00:00 2001
From: Aneesh Dogra
Date: Tue, 7 Feb 2012 01:39:22 +0530
Subject: bytestream: Add bytestream2 writing API.

Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>
---
 libavcodec/bytestream.h | 112 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/libavcodec/bytestream.h b/libavcodec/bytestream.h
index 503598a4bc..49d7fa42a8 100644
--- a/libavcodec/bytestream.h
+++ b/libavcodec/bytestream.h
@@ -1,6 +1,7 @@
 /*
  * Bytestream functions
  * copyright (c) 2006 Baptiste Coudurier <baptiste.coudurier@free.fr>
+ * Copyright (c) 2012 Aneesh Dogra (lionaneesh) <lionaneesh@gmail.com>
  *
  * This file is part of Libav.
  *
@@ -30,6 +31,11 @@ typedef struct {
     const uint8_t *buffer, *buffer_end, *buffer_start;
 } GetByteContext;
 
+typedef struct {
+    uint8_t *buffer, *buffer_end, *buffer_start; /* write cursor, one past the last byte, first byte */
+    int eof; /* set to 1 by the checked put/skip/seek helpers when a request does not fit */
+} PutByteContext;
+
 #define DEF_T(type, name, bytes, read, write)                             \
 static av_always_inline type bytestream_get_ ## name(const uint8_t **b){\
     (*b) += bytes;\
@@ -39,6 +45,17 @@ static av_always_inline void bytestream_put_ ##name(uint8_t **b, const type valu
     write(*b, value);\
     (*b) += bytes;\
 }\
+static av_always_inline void bytestream2_put_ ## name ## u(PutByteContext *p, const type value)\
+{ /* unchecked variant: no bounds test, eof is neither read nor written */\
+    bytestream_put_ ## name(&p->buffer, value);\
+}\
+static av_always_inline void bytestream2_put_ ## name(PutByteContext *p, const type value){ /* checked variant */\
+    if (!p->eof && (p->buffer_end - p->buffer >= bytes)) {\
+        write(p->buffer, value);\
+        p->buffer += bytes;\
+    } else /* not enough room, or eof already set: the value is dropped */\
+        p->eof = 1;\
+}\
 static av_always_inline type bytestream2_get_ ## name ## u(GetByteContext *g)\
 {\
     return bytestream_get_ ## name(&g->buffer);\
@@ -119,22 +136,53 @@ static av_always_inline void bytestream2_init(GetByteContext *g,
     g->buffer_end = buf + buf_size;
 }
 
+static av_always_inline void bytestream2_init_writer(PutByteContext *p,
+                                                     uint8_t *buf, int buf_size)
+{ /* Point the writer at buf[0 .. buf_size) and clear the eof flag. */
+    p->buffer       = buf;
+    p->buffer_start = buf;
+    p->buffer_end   = buf + buf_size;
+    p->eof          = 0;
+}
+
 static av_always_inline unsigned int bytestream2_get_bytes_left(GetByteContext *g)
 {
     return g->buffer_end - g->buffer;
 }
 
+static av_always_inline unsigned int bytestream2_get_bytes_left_p(PutByteContext *p)
+{ /* Distance in bytes from the write cursor to the end of the buffer. */
+    return p->buffer_end - p->buffer;
+}
+
 static av_always_inline void bytestream2_skip(GetByteContext *g,
                                               unsigned int size)
 {
     g->buffer += FFMIN(g->buffer_end - g->buffer, size);
 }
 
+static av_always_inline void bytestream2_skip_p(PutByteContext *p,
+                                                unsigned int size)
+{ /* Advance the write cursor by up to size bytes without writing anything. */
+    int size2;
+    if (p->eof) /* eof already set: do nothing */
+        return;
+    size2 = FFMIN(p->buffer_end - p->buffer, size);
+    if (size2 != size) /* fewer than size bytes were left: flag it, then skip what remains */
+        p->eof = 1;
+    p->buffer += size2;
+}
+
 static av_always_inline int bytestream2_tell(GetByteContext *g)
 {
     return (int)(g->buffer - g->buffer_start);
 }
 
+static av_always_inline int bytestream2_tell_p(PutByteContext *p)
+{ /* Offset of the write cursor from the start of the buffer, cast to int. */
+    return (int)(p->buffer - p->buffer_start);
+}
+
 static av_always_inline int bytestream2_seek(GetByteContext *g, int offset,
                                              int whence)
 {
@@ -158,6 +206,36 @@ static av_always_inline int bytestream2_seek(GetByteContext *g, int offset,
     return bytestream2_tell(g);
 }
 
+static av_always_inline int bytestream2_seek_p(PutByteContext *p, int offset,
+                                               int whence)
+{ /* Move the write cursor; returns the new position, or AVERROR(EINVAL) for an unknown whence. */
+    p->eof = 0; /* cleared before whence is examined, so also on the AVERROR(EINVAL) path */
+    switch (whence) {
+    case SEEK_CUR:
+        if (p->buffer_end - p->buffer < offset) /* target lies past the end of the buffer */
+            p->eof = 1;
+        offset = av_clip(offset, -(p->buffer - p->buffer_start),
+                         p->buffer_end - p->buffer);
+        p->buffer += offset;
+        break;
+    case SEEK_END:
+        if (offset > 0) /* target lies past the end of the buffer */
+            p->eof = 1;
+        offset = av_clip(offset, -(p->buffer_end - p->buffer_start), 0);
+        p->buffer = p->buffer_end + offset;
+        break;
+    case SEEK_SET:
+        if (p->buffer_end - p->buffer_start < offset) /* target lies past the end of the buffer */
+            p->eof = 1;
+        offset = av_clip(offset, 0, p->buffer_end - p->buffer_start);
+        p->buffer = p->buffer_start + offset;
+        break;
+    default:
+        return AVERROR(EINVAL);
+    }
+    return bytestream2_tell_p(p);
+}
+
 static av_always_inline unsigned int bytestream2_get_buffer(GetByteContext *g,
                                                             uint8_t *dst,
                                                             unsigned int size)
@@ -168,6 +246,40 @@ static av_always_inline unsigned int bytestream2_get_buffer(GetByteContext *g,
     return size2;
 }
 
+static av_always_inline unsigned int bytestream2_put_buffer(PutByteContext *p,
+                                                            const uint8_t *src,
+                                                            unsigned int size)
+{ /* Copy up to size bytes from src to the write cursor; returns the number of bytes copied. */
+    int size2;
+    if (p->eof) /* eof already set: write nothing */
+        return 0;
+    size2 = FFMIN(p->buffer_end - p->buffer, size);
+    if (size2 != size) /* short write: set eof, but still copy the part that fits */
+        p->eof = 1;
+    memcpy(p->buffer, src, size2);
+    p->buffer += size2;
+    return size2;
+}
+
+static av_always_inline void bytestream2_set_buffer(PutByteContext *p,
+                                                    const uint8_t c,
+                                                    unsigned int size)
+{ /* Fill up to size bytes at the write cursor with the byte c and advance past them. */
+    int size2;
+    if (p->eof) /* eof already set: write nothing */
+        return;
+    size2 = FFMIN(p->buffer_end - p->buffer, size);
+    if (size2 != size) /* short fill: set eof, but still fill the part that fits */
+        p->eof = 1;
+    memset(p->buffer, c, size2);
+    p->buffer += size2;
+}
+
+static av_always_inline unsigned int bytestream2_get_eof(PutByteContext *p)
+{ /* Returns the writer's eof flag unchanged. */
+    return p->eof;
+}
+
 static av_always_inline unsigned int bytestream_get_buffer(const uint8_t **b, uint8_t *dst, unsigned int size)
 {
     memcpy(dst, *b, size);
-- 
cgit v1.2.3


From 2dd7a1c030e64cdd8fed18e2530fb5b95f41b990 Mon Sep 17 00:00:00 2001
From: Ronald S. Bultje
Date: Wed, 1 Feb 2012 07:38:55 -0800
Subject: swscale: Split C input functions into separate file.

Signed-off-by: Diego Biurrun <diego@biurrun.de>
---
 libswscale/Makefile           |   4 +-
 libswscale/input.c            | 761 ++++++++++++++++++++++++++++++++++++++++++
 libswscale/swscale.c          | 722 +--------------------------------------
 libswscale/swscale_internal.h |   1 +
 4 files changed, 765 insertions(+), 723 deletions(-)
 create mode 100644 libswscale/input.c

diff --git a/libswscale/Makefile b/libswscale/Makefile
index 0aee7e497b..f093516b5e 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -3,8 +3,8 @@ FFLIBS = avutil
 
 HEADERS = swscale.h
 
-OBJS = options.o rgb2rgb.o swscale.o utils.o yuv2rgb.o \
-       swscale_unscaled.o
+OBJS = input.o options.o rgb2rgb.o swscale.o utils.o \
+       swscale_unscaled.o yuv2rgb.o
 
 OBJS-$(ARCH_BFIN)          +=  bfin/internal_bfin.o     \
                                bfin/swscale_bfin.o      \
diff --git a/libswscale/input.c b/libswscale/input.c
new file mode 100644
index 0000000000..e636eac1f6
--- /dev/null
+++ b/libswscale/input.c
@@ -0,0 +1,761 @@
+/*
+ * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "libavutil/avutil.h"
+#include "libavutil/bswap.h"
+#include "libavutil/cpu.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/pixdesc.h"
+#include "config.h"
+#include "rgb2rgb.h"
+#include "swscale.h"
+#include "swscale_internal.h"
+
+#define RGB2YUV_SHIFT 15 /* number of fractional bits in the fixed-point coefficients below */
+#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5)) /* XY naming: weight of input channel X (R/G/B) in output Y, U or V */
+#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos)) /* 16-bit read; big- or little-endian chosen by 'origin' */
+
+#define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b) /* red: third sample for BGR48, first sample otherwise */
+#define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r) /* blue: first sample for BGR48, third sample otherwise */
+
+static av_always_inline void
+rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
+                    enum PixelFormat origin)
+{ /* Writes one 16-bit luma value per packed pixel of three 16-bit samples. */
+    int i;
+    for (i = 0; i < width; i++) {
+        unsigned int r_b = input_pixel(&src[i*3+0]); /* first sample; the r/b macros decide whether it is red or blue */
+        unsigned int   g = input_pixel(&src[i*3+1]);
+        unsigned int b_r = input_pixel(&src[i*3+2]);
+
+        dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; /* 0x2001<<(SHIFT-1) adds 0x1000 plus 0.5 for rounding */
+    }
+}
+
+static av_always_inline void
+rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
+                    const uint16_t *src1, const uint16_t *src2,
+                    int width, enum PixelFormat origin)
+{ /* Writes one 16-bit U and one 16-bit V value per packed pixel of three 16-bit samples. */
+    int i;
+    assert(src1==src2); /* only src1 is read below */
+    for (i = 0; i < width; i++) {
+        int r_b = input_pixel(&src1[i*3+0]); /* first sample; the r/b macros decide whether it is red or blue */
+        int   g = input_pixel(&src1[i*3+1]);
+        int b_r = input_pixel(&src1[i*3+2]);
+
+        dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; /* 0x10001<<(SHIFT-1) adds 0x8000 plus 0.5 for rounding */
+        dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static av_always_inline void
+rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
+                          const uint16_t *src1, const uint16_t *src2,
+                          int width, enum PixelFormat origin)
+{ /* Like rgb48ToUV_c_template, but each output is computed from two adjacent input pixels. */
+    int i;
+    assert(src1==src2); /* only src1 is read below */
+    for (i = 0; i < width; i++) {
+        int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1; /* rounded mean of pixels 2*i and 2*i+1 */
+        int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
+        int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
+
+        dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; /* 0x10001<<(SHIFT-1) adds 0x8000 plus 0.5 for rounding */
+        dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+#undef r
+#undef b
+#undef input_pixel
+
+#define rgb48funcs(pattern, BE_LE, origin) /* defines <pattern>48<BE_LE>ToY_c, ...ToUV_c and ...ToUV_half_c */ \
+static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
+                                    int width, uint32_t *unused) \
+{ /* byte-pointer wrapper around rgb48ToY_c_template with a fixed origin */ \
+    const uint16_t *src = (const uint16_t *) _src; \
+    uint16_t *dst = (uint16_t *) _dst; \
+    rgb48ToY_c_template(dst, src, width, origin); \
+} \
+ \
+static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
+                                    const uint8_t *_src1, const uint8_t *_src2, \
+                                    int width, uint32_t *unused) \
+{ /* byte-pointer wrapper around rgb48ToUV_c_template with a fixed origin */ \
+    const uint16_t *src1 = (const uint16_t *) _src1, \
+                   *src2 = (const uint16_t *) _src2; \
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+    rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
+} \
+ \
+static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
+                                    const uint8_t *_src1, const uint8_t *_src2, \
+                                    int width, uint32_t *unused) \
+{ /* byte-pointer wrapper around rgb48ToUV_half_c_template with a fixed origin */ \
+    const uint16_t *src1 = (const uint16_t *) _src1, \
+                   *src2 = (const uint16_t *) _src2; \
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
+}
+
+rgb48funcs(rgb, LE, PIX_FMT_RGB48LE)
+rgb48funcs(rgb, BE, PIX_FMT_RGB48BE)
+rgb48funcs(bgr, LE, PIX_FMT_BGR48LE)
+rgb48funcs(bgr, BE, PIX_FMT_BGR48BE)
+
+#define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
+                         origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
+                        (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2]))) /* pixel i of src: 4 bytes for the four 32-bit formats, otherwise 2 bytes with endianness chosen by origin */
+
+static av_always_inline void
+rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
+                       int width, enum PixelFormat origin,
+                       int shr,   int shg,   int shb, int shp, /* right shifts: per channel after masking; shp on the whole pixel first */
+                       int maskr, int maskg, int maskb, /* bit masks that isolate each channel */
+                       int rsh,   int gsh,   int bsh, int S) /* left shifts on the coefficients; S is the final right shift */
+{ /* Writes one 8-bit luma value per packed 16- or 32-bit pixel. */
+    const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh;
+    const unsigned rnd = 33u << (S - 1); /* after >> S this adds 16 plus 0.5 for rounding */
+    int i;
+
+    for (i = 0; i < width; i++) {
+        int px = input_pixel(i) >> shp;
+        int b = (px & maskb) >> shb;
+        int g = (px & maskg) >> shg;
+        int r = (px & maskr) >> shr;
+
+        dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
+    }
+}
+
+static av_always_inline void
+rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
+                        const uint8_t *src, int width,
+                        enum PixelFormat origin,
+                        int shr,   int shg,   int shb, int shp, /* right shifts: per channel after masking; shp on the whole pixel first */
+                        int maskr, int maskg, int maskb, /* bit masks that isolate each channel */
+                        int rsh,   int gsh,   int bsh, int S) /* left shifts on the coefficients; S is the final right shift */
+{ /* Writes one 8-bit U and one 8-bit V value per packed 16- or 32-bit pixel. */
+    const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
+              rv = RV << rsh, gv = GV << gsh, bv = BV << bsh;
+    const unsigned rnd = 257u << (S - 1); /* after >> S this adds 128 plus 0.5 for rounding */
+    int i;
+
+    for (i = 0; i < width; i++) {
+        int px = input_pixel(i) >> shp;
+        int b = (px & maskb) >> shb;
+        int g = (px & maskg) >> shg;
+        int r = (px & maskr) >> shr;
+
+        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
+        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
+    }
+}
+
+static av_always_inline void
+rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
+                             const uint8_t *src, int width,
+                             enum PixelFormat origin,
+                             int shr,   int shg,   int shb, int shp, /* right shifts: per channel after masking; shp on the whole pixel first */
+                             int maskr, int maskg, int maskb, /* bit masks that isolate each channel */
+                             int rsh,   int gsh,   int bsh, int S) /* left shifts on the coefficients; S+1 is the final right shift */
+{ /* Writes one 8-bit U/V pair per two adjacent input pixels, working on the sum of the two pixels. */
+    const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
+              rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
+              maskgx = ~(maskr | maskb); /* every bit that is neither red nor blue */
+    const unsigned rnd = 257u << S; /* after >> (S + 1) this adds 128 plus 0.5 for rounding */
+    int i;
+
+    maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1; /* widen each mask by one bit for the two-pixel sums */
+    for (i = 0; i < width; i++) {
+        int px0 = input_pixel(2 * i + 0) >> shp;
+        int px1 = input_pixel(2 * i + 1) >> shp;
+        int b, r, g = (px0 & maskgx) + (px1 & maskgx); /* sum of the non-red, non-blue bits of both pixels */
+        int rb = px0 + px1 - g; /* what is left: the summed red and blue fields */
+
+        b = (rb & maskb) >> shb;
+        if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
+            origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
+            g >>= shg; /* this path uses the green sum without applying maskg */
+        } else {
+            g = (g  & maskg) >> shg;
+        }
+        r = (rb & maskr) >> shr;
+
+        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
+        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
+    }
+}
+
+#undef input_pixel
+
+#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
+                         maskg, maskb, rsh, gsh, bsh, S) /* defines <name>ToY_c, <name>ToUV_c and <name>ToUV_half_c */ \
+static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
+                          int width, uint32_t *unused) \
+{ /* forwards to rgb16_32ToY_c_template with the macro arguments as constants */ \
+    rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
+                           maskr, maskg, maskb, rsh, gsh, bsh, S); \
+} \
+ \
+static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
+                           const uint8_t *src, const uint8_t *dummy, \
+                           int width, uint32_t *unused) \
+{ /* forwards to rgb16_32ToUV_c_template; 'dummy' is not used */ \
+    rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
+                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
+} \
+ \
+static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
+                                const uint8_t *src, const uint8_t *dummy, \
+                                int width, uint32_t *unused) \
+{ /* forwards to rgb16_32ToUV_half_c_template; 'dummy' is not used */ \
+    rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
+                                 maskr, maskg, maskb, rsh, gsh, bsh, S); \
+}
+
+rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8) /* columns: fmt, name, shr, shg, shb, shp, maskr, maskg, maskb, rsh, gsh, bsh, S */
+rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8)
+rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8)
+rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8)
+rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8)
+rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7)
+rgb16_32_wrapper(PIX_FMT_BGR444LE, bgr12le, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT+4)
+rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8)
+rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
+rgb16_32_wrapper(PIX_FMT_RGB444LE, rgb12le, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT+4)
+rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8)
+rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7)
+rgb16_32_wrapper(PIX_FMT_BGR444BE, bgr12be, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT+4)
+rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8)
+rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
+rgb16_32_wrapper(PIX_FMT_RGB444BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT+4)
+
+static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
+{ /* Copies byte 0 of every 4-byte pixel to dst. */
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i]= src[4*i];
+    }
+}
+
+static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
+{ /* Copies byte 3 of every 4-byte pixel to dst. */
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i]= src[4*i+3];
+    }
+}
+
+static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
+{ /* Looks each source byte up in pal and stores the low 8 bits of the entry. */
+    int i;
+    for (i=0; i<width; i++) {
+        int d= src[i];
+
+        dst[i]= pal[d] & 0xFF;
+    }
+}
+
+static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
+                      const uint8_t *src1, const uint8_t *src2,
+                      int width, uint32_t *pal)
+{ /* Looks each source byte up in pal; U and V come from bits 8-15 and 16-23 of the entry. */
+    int i;
+    assert(src1 == src2); /* only src1 is read below */
+    for (i=0; i<width; i++) {
+        int p= pal[src1[i]];
+
+        dstU[i]= p>>8; /* the uint8_t store keeps only the low 8 bits */
+        dstV[i]= p>>16;
+    }
+}
+
+static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
+                          int width, uint32_t *unused)
+{ /* Expands 1-bit pixels (most significant bit first) to bytes: 255 for a clear source bit, 0 for a set one. */
+    int i, j;
+    for (i=0; i<width/8; i++) { /* NOTE(review): whole bytes only; the last width%8 pixels are never written — confirm callers pad width */
+        int d= ~src[i];
+        for(j=0; j<8; j++)
+            dst[8*i+j]= ((d>>(7-j))&1)*255;
+    }
+}
+
+static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
+                          int width, uint32_t *unused)
+{ /* Expands 1-bit pixels (most significant bit first) to bytes: 255 for a set source bit, 0 for a clear one. */
+    int i, j;
+    for (i=0; i<width/8; i++) { /* NOTE(review): whole bytes only; the last width%8 pixels are never written — confirm callers pad width */
+        int d= src[i];
+        for(j=0; j<8; j++)
+            dst[8*i+j]= ((d>>(7-j))&1)*255;
+    }
+}
+
+static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
+                      uint32_t *unused)
+{ /* Copies every even-indexed source byte to dst. */
+    int i;
+    for (i=0; i<width; i++)
+        dst[i]= src[2*i];
+}
+
+static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+                       const uint8_t *src2, int width, uint32_t *unused)
+{ /* From every 4-byte group of src1, byte 1 goes to dstU and byte 3 to dstV. */
+    int i;
+    for (i=0; i<width; i++) {
+        dstU[i]= src1[4*i + 1];
+        dstV[i]= src1[4*i + 3];
+    }
+    assert(src1 == src2); /* only src1 is read above */
+}
+
+static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
+{ /* Copies width 16-bit samples from _src to _dst, passing each through av_bswap16(). */
+    int i;
+    const uint16_t *src = (const uint16_t *) _src;
+    uint16_t *dst = (uint16_t *) _dst;
+    for (i=0; i<width; i++) {
+        dst[i] = av_bswap16(src[i]);
+    }
+}
+
+static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
+                        const uint8_t *_src2, int width, uint32_t *unused)
+{ /* Copies width 16-bit samples through av_bswap16(): _src1 to _dstU and _src2 to _dstV. */
+    int i;
+    const uint16_t *src1 = (const uint16_t *) _src1,
+                   *src2 = (const uint16_t *) _src2;
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV;
+    for (i=0; i<width; i++) {
+        dstU[i] = av_bswap16(src1[i]);
+        dstV[i] = av_bswap16(src2[i]);
+    }
+}
+
+/* This is almost identical to yuy2ToY_c()/yuy2ToUV_c(), and exists only because
+ * yuy2ToY/UV(dst, src + 1, ...) would have 100% unaligned accesses. */
+static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
+                      uint32_t *unused)
+{ /* Copies every odd-indexed source byte to dst. */
+    int i;
+    for (i=0; i<width; i++)
+        dst[i]= src[2*i+1];
+}
+
+static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+                       const uint8_t *src2, int width, uint32_t *unused)
+{ /* From every 4-byte group of src1, byte 0 goes to dstU and byte 2 to dstV. */
+    int i;
+    for (i=0; i<width; i++) {
+        dstU[i]= src1[4*i + 0];
+        dstV[i]= src1[4*i + 2];
+    }
+    assert(src1 == src2); /* only src1 is read above */
+}
+
+static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
+                                        const uint8_t *src, int width)
+{ /* De-interleaves src: even-indexed bytes go to dst1, odd-indexed bytes to dst2. */
+    int i;
+    for (i = 0; i < width; i++) {
+        dst1[i] = src[2*i+0];
+        dst2[i] = src[2*i+1];
+    }
+}
+
+static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *src1, const uint8_t *src2,
+                       int width, uint32_t *unused)
+{ /* Even-indexed bytes of src1 go to dstU, odd-indexed to dstV; src2 is not read. */
+    nvXXtoUV_c(dstU, dstV, src1, width);
+}
+
+static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
+                       const uint8_t *src1, const uint8_t *src2,
+                       int width, uint32_t *unused)
+{ /* Even-indexed bytes of src1 go to dstV, odd-indexed to dstU; src2 is not read. */
+    nvXXtoUV_c(dstV, dstU, src1, width);
+}
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos)) /* NOTE(review): no use of this macro is visible in the functions that follow — possibly a leftover; confirm */
+
+static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
+                       int width, uint32_t *unused)
+{ /* One 8-bit luma value per 3-byte pixel stored in B, G, R order. */
+    int i;
+    for (i=0; i<width; i++) {
+        int b= src[i*3+0];
+        int g= src[i*3+1];
+        int r= src[i*3+2];
+
+        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); /* 33<<(SHIFT-1) adds 16 plus 0.5 for rounding */
+    }
+}
+
+static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+                        const uint8_t *src2, int width, uint32_t *unused)
+{ /* One 8-bit U/V pair per 3-byte pixel stored in B, G, R order. */
+    int i;
+    for (i=0; i<width; i++) {
+        int b= src1[3*i + 0];
+        int g= src1[3*i + 1];
+        int r= src1[3*i + 2];
+
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; /* 257<<(SHIFT-1) adds 128 plus 0.5 for rounding */
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
+    }
+    assert(src1 == src2); /* only src1 is read above */
+}
+
+static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+                             const uint8_t *src2, int width, uint32_t *unused)
+{ /* One 8-bit U/V pair per two adjacent 3-byte B, G, R pixels. */
+    int i;
+    for (i=0; i<width; i++) {
+        int b= src1[6*i + 0] + src1[6*i + 3]; /* channel sums over pixels 2*i and 2*i+1 */
+        int g= src1[6*i + 1] + src1[6*i + 4];
+        int r= src1[6*i + 2] + src1[6*i + 5];
+
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); /* the extra shift bit halves the two-pixel sum; bias is 128.5 */
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
+    }
+    assert(src1 == src2); /* only src1 is read above */
+}
+
+static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
+                       uint32_t *unused)
+{ /* One 8-bit luma value per 3-byte pixel stored in R, G, B order. */
+    int i;
+    for (i=0; i<width; i++) {
+        int r= src[i*3+0];
+        int g= src[i*3+1];
+        int b= src[i*3+2];
+
+        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); /* 33<<(SHIFT-1) adds 16 plus 0.5 for rounding */
+    }
+}
+
+static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+                        const uint8_t *src2, int width, uint32_t *unused)
+{ /* One 8-bit U/V pair per 3-byte pixel stored in R, G, B order. */
+    int i;
+    assert(src1==src2); /* only src1 is read below */
+    for (i=0; i<width; i++) {
+        int r= src1[3*i + 0];
+        int g= src1[3*i + 1];
+        int b= src1[3*i + 2];
+
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; /* 257<<(SHIFT-1) adds 128 plus 0.5 for rounding */
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
+    }
+}
+
+static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+                             const uint8_t *src2, int width, uint32_t *unused)
+{ /* One 8-bit U/V pair per two adjacent 3-byte R, G, B pixels. */
+    int i;
+    assert(src1==src2); /* only src1 is read below */
+    for (i=0; i<width; i++) {
+        int r= src1[6*i + 0] + src1[6*i + 3]; /* channel sums over pixels 2*i and 2*i+1 */
+        int g= src1[6*i + 1] + src1[6*i + 4];
+        int b= src1[6*i + 2] + src1[6*i + 5];
+
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1); /* the extra shift bit halves the two-pixel sum; bias is 128.5 */
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
+    }
+}
+
+static void planar_rgb_to_y(uint8_t *dst, const uint8_t *src[4], int width)
+{ /* One 8-bit luma value per pixel from three 8-bit planes read as src[0]=G, src[1]=B, src[2]=R. */
+    int i;
+    for (i = 0; i < width; i++) {
+        int g = src[0][i];
+        int b = src[1][i];
+        int r = src[2][i];
+
+        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); /* 33<<(SHIFT-1) adds 16 plus 0.5 for rounding */
+    }
+}
+
+static void planar_rgb16le_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
+{ /* One 16-bit luma value per pixel from three planes of AV_RL16-read samples (src[0]=G, src[1]=B, src[2]=R). */
+    int i;
+    const uint16_t **src = (const uint16_t **) _src;
+    uint16_t *dst = (uint16_t *) _dst;
+    for (i = 0; i < width; i++) {
+        int g = AV_RL16(src[0] + i);
+        int b = AV_RL16(src[1] + i);
+        int r = AV_RL16(src[2] + i);
+
+        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); /* NOTE(review): same +16.5 bias as the 8-bit planar_rgb_to_y — confirm it is intended for wider samples */
+    }
+}
+
+static void planar_rgb16be_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
+{ /* One 16-bit luma value per pixel from three planes of AV_RB16-read samples (src[0]=G, src[1]=B, src[2]=R). */
+    int i;
+    const uint16_t **src = (const uint16_t **) _src;
+    uint16_t *dst = (uint16_t *) _dst;
+    for (i = 0; i < width; i++) {
+        int g = AV_RB16(src[0] + i);
+        int b = AV_RB16(src[1] + i);
+        int r = AV_RB16(src[2] + i);
+
+        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT); /* NOTE(review): same +16.5 bias as the 8-bit planar_rgb_to_y — confirm it is intended for wider samples */
+    }
+}
+
+static void planar_rgb_to_uv(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4], int width)
+{ /* One 8-bit U/V pair per pixel from three 8-bit planes read as src[0]=G, src[1]=B, src[2]=R. */
+    int i;
+    for (i = 0; i < width; i++) {
+        int g = src[0][i];
+        int b = src[1][i];
+        int r = src[2][i];
+
+        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1); /* NOTE(review): one pixel, yet shifted by SHIFT+1 like the two-pixel *_half functions — confirm the halved chroma gain is intended */
+        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
+    }
+}
+
+static void planar_rgb16le_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
+{ /* One 16-bit U/V pair per pixel from three planes of AV_RL16-read samples (src[0]=G, src[1]=B, src[2]=R). */
+    int i;
+    const uint16_t **src = (const uint16_t **) _src;
+    uint16_t *dstU = (uint16_t *) _dstU;
+    uint16_t *dstV = (uint16_t *) _dstV;
+    for (i = 0; i < width; i++) {
+        int g = AV_RL16(src[0] + i);
+        int b = AV_RL16(src[1] + i);
+        int r = AV_RL16(src[2] + i);
+
+        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1); /* NOTE(review): same bias and SHIFT+1 as the 8-bit planar_rgb_to_uv — confirm for wider samples */
+        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
+    }
+}
+
+static void planar_rgb16be_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
+{ /* One 16-bit U/V pair per pixel from three planes of AV_RB16-read samples (src[0]=G, src[1]=B, src[2]=R). */
+    int i;
+    const uint16_t **src = (const uint16_t **) _src;
+    uint16_t *dstU = (uint16_t *) _dstU;
+    uint16_t *dstV = (uint16_t *) _dstV;
+    for (i = 0; i < width; i++) {
+        int g = AV_RB16(src[0] + i);
+        int b = AV_RB16(src[1] + i);
+        int r = AV_RB16(src[2] + i);
+
+        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1); /* NOTE(review): same bias and SHIFT+1 as the 8-bit planar_rgb_to_uv — confirm for wider samples */
+        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
+    }
+}
+
+av_cold void ff_sws_init_input_funcs(SwsContext *c)
+{
+    enum PixelFormat srcFormat = c->srcFormat;
+
+    c->chrToYV12 = NULL;
+    switch(srcFormat) {
+        case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
+        case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
+        case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
+        case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
+        case PIX_FMT_RGB8     :
+        case PIX_FMT_BGR8     :
+        case PIX_FMT_PAL8     :
+        case PIX_FMT_BGR4_BYTE:
+        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
+        case PIX_FMT_GBRP9LE:
+        case PIX_FMT_GBRP10LE:
+        case PIX_FMT_GBRP16LE:  c->readChrPlanar = planar_rgb16le_to_uv; break;
+        case PIX_FMT_GBRP9BE:
+        case PIX_FMT_GBRP10BE:
+        case PIX_FMT_GBRP16BE:  c->readChrPlanar = planar_rgb16be_to_uv; break;
+        case PIX_FMT_GBRP:      c->readChrPlanar = planar_rgb_to_uv; break;
+#if HAVE_BIGENDIAN
+        case PIX_FMT_YUV444P9LE:
+        case PIX_FMT_YUV422P9LE:
+        case PIX_FMT_YUV420P9LE:
+        case PIX_FMT_YUV422P10LE:
+        case PIX_FMT_YUV444P10LE:
+        case PIX_FMT_YUV420P10LE:
+        case PIX_FMT_YUV420P16LE:
+        case PIX_FMT_YUV422P16LE:
+        case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
+#else
+        case PIX_FMT_YUV444P9BE:
+        case PIX_FMT_YUV422P9BE:
+        case PIX_FMT_YUV420P9BE:
+        case PIX_FMT_YUV444P10BE:
+        case PIX_FMT_YUV422P10BE:
+        case PIX_FMT_YUV420P10BE:
+        case PIX_FMT_YUV420P16BE:
+        case PIX_FMT_YUV422P16BE:
+        case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
+#endif
+    }
+    if (c->chrSrcHSubSample) {
+        switch(srcFormat) {
+        case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
+        case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
+        case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
+        case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
+        case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
+        case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
+        case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
+        case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
+        case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
+        case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
+        case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
+        case PIX_FMT_BGR444LE: c->chrToYV12 = bgr12leToUV_half_c; break;
+        case PIX_FMT_BGR444BE: c->chrToYV12 = bgr12beToUV_half_c; break;
+        case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
+        case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
+        case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
+        case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
+        case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
+        case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
+        case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
+        case PIX_FMT_RGB444LE: c->chrToYV12 = rgb12leToUV_half_c; break;
+        case PIX_FMT_RGB444BE: c->chrToYV12 = rgb12beToUV_half_c; break;
+        }
+    } else {
+        switch(srcFormat) {
+        case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
+        case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
+        case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
+        case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
+        case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
+        case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
+        case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
+        case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
+        case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
+        case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
+        case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
+        case PIX_FMT_BGR444LE: c->chrToYV12 = bgr12leToUV_c; break;
+        case PIX_FMT_BGR444BE: c->chrToYV12 = bgr12beToUV_c; break;
+        case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
+        case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
+        case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
+        case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
+        case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
+        case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
+        case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
+        case PIX_FMT_RGB444LE: c->chrToYV12 = rgb12leToUV_c; break;
+        case PIX_FMT_RGB444BE: c->chrToYV12 = rgb12beToUV_c; break;
+        }
+    }
+
+    c->lumToYV12 = NULL;
+    c->alpToYV12 = NULL;
+    switch (srcFormat) {
+    case PIX_FMT_GBRP9LE:
+    case PIX_FMT_GBRP10LE:
+    case PIX_FMT_GBRP16LE: c->readLumPlanar = planar_rgb16le_to_y; break;
+    case PIX_FMT_GBRP9BE:
+    case PIX_FMT_GBRP10BE:
+    case PIX_FMT_GBRP16BE: c->readLumPlanar = planar_rgb16be_to_y; break;
+    case PIX_FMT_GBRP:     c->readLumPlanar = planar_rgb_to_y; break;
+#if HAVE_BIGENDIAN
+    case PIX_FMT_YUV444P9LE:
+    case PIX_FMT_YUV422P9LE:
+    case PIX_FMT_YUV420P9LE:
+    case PIX_FMT_YUV444P10LE:
+    case PIX_FMT_YUV422P10LE:
+    case PIX_FMT_YUV420P10LE:
+    case PIX_FMT_YUV420P16LE:
+    case PIX_FMT_YUV422P16LE:
+    case PIX_FMT_YUV444P16LE:
+    case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
+#else
+    case PIX_FMT_YUV444P9BE:
+    case PIX_FMT_YUV422P9BE:
+    case PIX_FMT_YUV420P9BE:
+    case PIX_FMT_YUV444P10BE:
+    case PIX_FMT_YUV422P10BE:
+    case PIX_FMT_YUV420P10BE:
+    case PIX_FMT_YUV420P16BE:
+    case PIX_FMT_YUV422P16BE:
+    case PIX_FMT_YUV444P16BE:
+    case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
+#endif
+    case PIX_FMT_YUYV422  :
+    case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
+    case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
+    case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
+    case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
+    case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
+    case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
+    case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
+    case PIX_FMT_BGR444LE : c->lumToYV12 = bgr12leToY_c; break;
+    case PIX_FMT_BGR444BE : c->lumToYV12 = bgr12beToY_c; break;
+    case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
+    case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
+    case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
+    case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
+    case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
+    case PIX_FMT_RGB444LE : c->lumToYV12 = rgb12leToY_c; break;
+    case PIX_FMT_RGB444BE : c->lumToYV12 = rgb12beToY_c; break;
+    case PIX_FMT_RGB8     :
+    case PIX_FMT_BGR8     :
+    case PIX_FMT_PAL8     :
+    case PIX_FMT_BGR4_BYTE:
+    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
+    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
+    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
+    case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
+    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
+    case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
+    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
+    case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
+    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
+    case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
+    case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
+    }
+    if (c->alpPixBuf) {
+        switch (srcFormat) {
+        case PIX_FMT_BGRA:
+        case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
+        case PIX_FMT_ABGR:
+        case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
+        case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
+        }
+    }
+}
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index a7057bb48e..b7f348b4ff 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -34,19 +34,6 @@
 #include "libavutil/bswap.h"
 #include "libavutil/pixdesc.h"
 
-#define DITHER1XBPP
-
-#define RGB2YUV_SHIFT 15
-#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
-#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
-#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
-#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
-#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
-#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
-#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
-#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
-#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
-
 /*
 NOTES
 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
@@ -1329,547 +1316,6 @@ static av_always_inline void fillPlane(uint8_t* plane, int stride,
     }
 }
 
-#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
-
-#define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
-#define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
-
-static av_always_inline void
-rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
-                    enum PixelFormat origin)
-{
-    int i;
-    for (i = 0; i < width; i++) {
-        unsigned int r_b = input_pixel(&src[i*3+0]);
-        unsigned int   g = input_pixel(&src[i*3+1]);
-        unsigned int b_r = input_pixel(&src[i*3+2]);
-
-        dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
-    }
-}
-
-static av_always_inline void
-rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
-                    const uint16_t *src1, const uint16_t *src2,
-                    int width, enum PixelFormat origin)
-{
-    int i;
-    assert(src1==src2);
-    for (i = 0; i < width; i++) {
-        int r_b = input_pixel(&src1[i*3+0]);
-        int   g = input_pixel(&src1[i*3+1]);
-        int b_r = input_pixel(&src1[i*3+2]);
-
-        dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
-        dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
-    }
-}
-
-static av_always_inline void
-rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
-                          const uint16_t *src1, const uint16_t *src2,
-                          int width, enum PixelFormat origin)
-{
-    int i;
-    assert(src1==src2);
-    for (i = 0; i < width; i++) {
-        int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
-        int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
-        int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
-
-        dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
-        dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
-    }
-}
-
-#undef r
-#undef b
-#undef input_pixel
-
-#define rgb48funcs(pattern, BE_LE, origin) \
-static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
-                                    int width, uint32_t *unused) \
-{ \
-    const uint16_t *src = (const uint16_t *) _src; \
-    uint16_t *dst = (uint16_t *) _dst; \
-    rgb48ToY_c_template(dst, src, width, origin); \
-} \
- \
-static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
-                                    const uint8_t *_src1, const uint8_t *_src2, \
-                                    int width, uint32_t *unused) \
-{ \
-    const uint16_t *src1 = (const uint16_t *) _src1, \
-                   *src2 = (const uint16_t *) _src2; \
-    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
-    rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
-} \
- \
-static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
-                                    const uint8_t *_src1, const uint8_t *_src2, \
-                                    int width, uint32_t *unused) \
-{ \
-    const uint16_t *src1 = (const uint16_t *) _src1, \
-                   *src2 = (const uint16_t *) _src2; \
-    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
-    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
-}
-
-rgb48funcs(rgb, LE, PIX_FMT_RGB48LE)
-rgb48funcs(rgb, BE, PIX_FMT_RGB48BE)
-rgb48funcs(bgr, LE, PIX_FMT_BGR48LE)
-rgb48funcs(bgr, BE, PIX_FMT_BGR48BE)
-
-#define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
-                         origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
-                        (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
-
-static av_always_inline void
-rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
-                       int width, enum PixelFormat origin,
-                       int shr,   int shg,   int shb, int shp,
-                       int maskr, int maskg, int maskb,
-                       int rsh,   int gsh,   int bsh, int S)
-{
-    const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh;
-    const unsigned rnd = 33u << (S - 1);
-    int i;
-
-    for (i = 0; i < width; i++) {
-        int px = input_pixel(i) >> shp;
-        int b = (px & maskb) >> shb;
-        int g = (px & maskg) >> shg;
-        int r = (px & maskr) >> shr;
-
-        dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
-    }
-}
-
-static av_always_inline void
-rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
-                        const uint8_t *src, int width,
-                        enum PixelFormat origin,
-                        int shr,   int shg,   int shb, int shp,
-                        int maskr, int maskg, int maskb,
-                        int rsh,   int gsh,   int bsh, int S)
-{
-    const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
-              rv = RV << rsh, gv = GV << gsh, bv = BV << bsh;
-    const unsigned rnd = 257u << (S - 1);
-    int i;
-
-    for (i = 0; i < width; i++) {
-        int px = input_pixel(i) >> shp;
-        int b = (px & maskb) >> shb;
-        int g = (px & maskg) >> shg;
-        int r = (px & maskr) >> shr;
-
-        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
-        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
-    }
-}
-
-static av_always_inline void
-rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
-                             const uint8_t *src, int width,
-                             enum PixelFormat origin,
-                             int shr,   int shg,   int shb, int shp,
-                             int maskr, int maskg, int maskb,
-                             int rsh,   int gsh,   int bsh, int S)
-{
-    const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
-              rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
-              maskgx = ~(maskr | maskb);
-    const unsigned rnd = 257u << S;
-    int i;
-
-    maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
-    for (i = 0; i < width; i++) {
-        int px0 = input_pixel(2 * i + 0) >> shp;
-        int px1 = input_pixel(2 * i + 1) >> shp;
-        int b, r, g = (px0 & maskgx) + (px1 & maskgx);
-        int rb = px0 + px1 - g;
-
-        b = (rb & maskb) >> shb;
-        if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
-            origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
-            g >>= shg;
-        } else {
-            g = (g  & maskg) >> shg;
-        }
-        r = (rb & maskr) >> shr;
-
-        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
-        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
-    }
-}
-
-#undef input_pixel
-
-#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
-                         maskg, maskb, rsh, gsh, bsh, S) \
-static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
-                          int width, uint32_t *unused) \
-{ \
-    rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
-                           maskr, maskg, maskb, rsh, gsh, bsh, S); \
-} \
- \
-static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
-                           const uint8_t *src, const uint8_t *dummy, \
-                           int width, uint32_t *unused) \
-{ \
-    rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
-                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
-} \
- \
-static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
-                                const uint8_t *src, const uint8_t *dummy, \
-                                int width, uint32_t *unused) \
-{ \
-    rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
-                                 maskr, maskg, maskb, rsh, gsh, bsh, S); \
-}
-
-rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8)
-rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8)
-rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8)
-rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8)
-rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8)
-rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7)
-rgb16_32_wrapper(PIX_FMT_BGR444LE, bgr12le, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT+4)
-rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8)
-rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
-rgb16_32_wrapper(PIX_FMT_RGB444LE, rgb12le, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT+4)
-rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8)
-rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7)
-rgb16_32_wrapper(PIX_FMT_BGR444BE, bgr12be, 0, 0,  0, 0,   0x000F, 0x00F0,   0x0F00,  8, 4,  0, RGB2YUV_SHIFT+4)
-rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8)
-rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
-rgb16_32_wrapper(PIX_FMT_RGB444BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT+4)
-
-static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        dst[i]= src[4*i];
-    }
-}
-
-static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        dst[i]= src[4*i+3];
-    }
-}
-
-static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        int d= src[i];
-
-        dst[i]= pal[d] & 0xFF;
-    }
-}
-
-static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
-                      const uint8_t *src1, const uint8_t *src2,
-                      int width, uint32_t *pal)
-{
-    int i;
-    assert(src1 == src2);
-    for (i=0; i<width; i++) {
-        int p= pal[src1[i]];
-
-        dstU[i]= p>>8;
-        dstV[i]= p>>16;
-    }
-}
-
-static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
-                          int width, uint32_t *unused)
-{
-    int i, j;
-    for (i=0; i<width/8; i++) {
-        int d= ~src[i];
-        for(j=0; j<8; j++)
-            dst[8*i+j]= ((d>>(7-j))&1)*255;
-    }
-}
-
-static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
-                          int width, uint32_t *unused)
-{
-    int i, j;
-    for (i=0; i<width/8; i++) {
-        int d= src[i];
-        for(j=0; j<8; j++)
-            dst[8*i+j]= ((d>>(7-j))&1)*255;
-    }
-}
-
-static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
-                      uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++)
-        dst[i]= src[2*i];
-}
-
-static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
-                       const uint8_t *src2, int width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        dstU[i]= src1[4*i + 1];
-        dstV[i]= src1[4*i + 3];
-    }
-    assert(src1 == src2);
-}
-
-static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
-{
-    int i;
-    const uint16_t *src = (const uint16_t *) _src;
-    uint16_t *dst = (uint16_t *) _dst;
-    for (i=0; i<width; i++) {
-        dst[i] = av_bswap16(src[i]);
-    }
-}
-
-static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
-                        const uint8_t *_src2, int width, uint32_t *unused)
-{
-    int i;
-    const uint16_t *src1 = (const uint16_t *) _src1,
-                   *src2 = (const uint16_t *) _src2;
-    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV;
-    for (i=0; i<width; i++) {
-        dstU[i] = av_bswap16(src1[i]);
-        dstV[i] = av_bswap16(src2[i]);
-    }
-}
-
-/* This is almost identical to the previous, end exists only because
- * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
-static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
-                      uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++)
-        dst[i]= src[2*i+1];
-}
-
-static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
-                       const uint8_t *src2, int width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        dstU[i]= src1[4*i + 0];
-        dstV[i]= src1[4*i + 2];
-    }
-    assert(src1 == src2);
-}
-
-static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
-                                        const uint8_t *src, int width)
-{
-    int i;
-    for (i = 0; i < width; i++) {
-        dst1[i] = src[2*i+0];
-        dst2[i] = src[2*i+1];
-    }
-}
-
-static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *src1, const uint8_t *src2,
-                       int width, uint32_t *unused)
-{
-    nvXXtoUV_c(dstU, dstV, src1, width);
-}
-
-static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *src1, const uint8_t *src2,
-                       int width, uint32_t *unused)
-{
-    nvXXtoUV_c(dstV, dstU, src1, width);
-}
-
-#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
-
-static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
-                       int width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        int b= src[i*3+0];
-        int g= src[i*3+1];
-        int r= src[i*3+2];
-
-        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
-    }
-}
-
-static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
-                        const uint8_t *src2, int width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        int b= src1[3*i + 0];
-        int g= src1[3*i + 1];
-        int r= src1[3*i + 2];
-
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
-    }
-    assert(src1 == src2);
-}
-
-static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
-                             const uint8_t *src2, int width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        int b= src1[6*i + 0] + src1[6*i + 3];
-        int g= src1[6*i + 1] + src1[6*i + 4];
-        int r= src1[6*i + 2] + src1[6*i + 5];
-
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-    }
-    assert(src1 == src2);
-}
-
-static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
-                       uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        int r= src[i*3+0];
-        int g= src[i*3+1];
-        int b= src[i*3+2];
-
-        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
-    }
-}
-
-static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
-                        const uint8_t *src2, int width, uint32_t *unused)
-{
-    int i;
-    assert(src1==src2);
-    for (i=0; i<width; i++) {
-        int r= src1[3*i + 0];
-        int g= src1[3*i + 1];
-        int b= src1[3*i + 2];
-
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
-    }
-}
-
-static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
-                             const uint8_t *src2, int width, uint32_t *unused)
-{
-    int i;
-    assert(src1==src2);
-    for (i=0; i<width; i++) {
-        int r= src1[6*i + 0] + src1[6*i + 3];
-        int g= src1[6*i + 1] + src1[6*i + 4];
-        int b= src1[6*i + 2] + src1[6*i + 5];
-
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-    }
-}
-
-static void planar_rgb_to_y(uint8_t *dst, const uint8_t *src[4], int width)
-{
-    int i;
-    for (i = 0; i < width; i++) {
-        int g = src[0][i];
-        int b = src[1][i];
-        int r = src[2][i];
-
-        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
-    }
-}
-
-static void planar_rgb16le_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
-{
-    int i;
-    const uint16_t **src = (const uint16_t **) _src;
-    uint16_t *dst = (uint16_t *) _dst;
-    for (i = 0; i < width; i++) {
-        int g = AV_RL16(src[0] + i);
-        int b = AV_RL16(src[1] + i);
-        int r = AV_RL16(src[2] + i);
-
-        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
-    }
-}
-
-static void planar_rgb16be_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
-{
-    int i;
-    const uint16_t **src = (const uint16_t **) _src;
-    uint16_t *dst = (uint16_t *) _dst;
-    for (i = 0; i < width; i++) {
-        int g = AV_RB16(src[0] + i);
-        int b = AV_RB16(src[1] + i);
-        int r = AV_RB16(src[2] + i);
-
-        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
-    }
-}
-
-static void planar_rgb_to_uv(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4], int width)
-{
-    int i;
-    for (i = 0; i < width; i++) {
-        int g = src[0][i];
-        int b = src[1][i];
-        int r = src[2][i];
-
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-    }
-}
-
-static void planar_rgb16le_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
-{
-    int i;
-    const uint16_t **src = (const uint16_t **) _src;
-    uint16_t *dstU = (uint16_t *) _dstU;
-    uint16_t *dstV = (uint16_t *) _dstV;
-    for (i = 0; i < width; i++) {
-        int g = AV_RL16(src[0] + i);
-        int b = AV_RL16(src[1] + i);
-        int r = AV_RL16(src[2] + i);
-
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-    }
-}
-
-static void planar_rgb16be_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
-{
-    int i;
-    const uint16_t **src = (const uint16_t **) _src;
-    uint16_t *dstU = (uint16_t *) _dstU;
-    uint16_t *dstV = (uint16_t *) _dstV;
-    for (i = 0; i < width; i++) {
-        int g = AV_RB16(src[0] + i);
-        int b = AV_RB16(src[1] + i);
-        int r = AV_RB16(src[2] + i);
-
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-    }
-}
-
 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
                            const int16_t *filter,
                            const int16_t *filterPos, int filterSize)
@@ -2697,173 +2143,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
                                    &c->yuv2nv12cX, &c->yuv2packed1, &c->yuv2packed2,
                                    &c->yuv2packedX);
 
-    c->chrToYV12 = NULL;
-    switch(srcFormat) {
-        case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
-        case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
-        case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
-        case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
-        case PIX_FMT_RGB8     :
-        case PIX_FMT_BGR8     :
-        case PIX_FMT_PAL8     :
-        case PIX_FMT_BGR4_BYTE:
-        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
-        case PIX_FMT_GBRP9LE:
-        case PIX_FMT_GBRP10LE:
-        case PIX_FMT_GBRP16LE:  c->readChrPlanar = planar_rgb16le_to_uv; break;
-        case PIX_FMT_GBRP9BE:
-        case PIX_FMT_GBRP10BE:
-        case PIX_FMT_GBRP16BE:  c->readChrPlanar = planar_rgb16be_to_uv; break;
-        case PIX_FMT_GBRP:      c->readChrPlanar = planar_rgb_to_uv; break;
-#if HAVE_BIGENDIAN
-        case PIX_FMT_YUV444P9LE:
-        case PIX_FMT_YUV422P9LE:
-        case PIX_FMT_YUV420P9LE:
-        case PIX_FMT_YUV422P10LE:
-        case PIX_FMT_YUV444P10LE:
-        case PIX_FMT_YUV420P10LE:
-        case PIX_FMT_YUV420P16LE:
-        case PIX_FMT_YUV422P16LE:
-        case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
-#else
-        case PIX_FMT_YUV444P9BE:
-        case PIX_FMT_YUV422P9BE:
-        case PIX_FMT_YUV420P9BE:
-        case PIX_FMT_YUV444P10BE:
-        case PIX_FMT_YUV422P10BE:
-        case PIX_FMT_YUV420P10BE:
-        case PIX_FMT_YUV420P16BE:
-        case PIX_FMT_YUV422P16BE:
-        case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
-#endif
-    }
-    if (c->chrSrcHSubSample) {
-        switch(srcFormat) {
-        case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
-        case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
-        case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
-        case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
-        case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
-        case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
-        case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
-        case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
-        case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
-        case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
-        case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
-        case PIX_FMT_BGR444LE: c->chrToYV12 = bgr12leToUV_half_c; break;
-        case PIX_FMT_BGR444BE: c->chrToYV12 = bgr12beToUV_half_c; break;
-        case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
-        case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
-        case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
-        case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
-        case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
-        case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
-        case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
-        case PIX_FMT_RGB444LE: c->chrToYV12 = rgb12leToUV_half_c; break;
-        case PIX_FMT_RGB444BE: c->chrToYV12 = rgb12beToUV_half_c; break;
-        }
-    } else {
-        switch(srcFormat) {
-        case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
-        case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
-        case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
-        case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
-        case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
-        case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
-        case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
-        case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
-        case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
-        case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
-        case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
-        case PIX_FMT_BGR444LE: c->chrToYV12 = bgr12leToUV_c; break;
-        case PIX_FMT_BGR444BE: c->chrToYV12 = bgr12beToUV_c; break;
-        case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
-        case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
-        case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
-        case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
-        case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
-        case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
-        case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
-        case PIX_FMT_RGB444LE: c->chrToYV12 = rgb12leToUV_c; break;
-        case PIX_FMT_RGB444BE: c->chrToYV12 = rgb12beToUV_c; break;
-        }
-    }
-
-    c->lumToYV12 = NULL;
-    c->alpToYV12 = NULL;
-    switch (srcFormat) {
-    case PIX_FMT_GBRP9LE:
-    case PIX_FMT_GBRP10LE:
-    case PIX_FMT_GBRP16LE: c->readLumPlanar = planar_rgb16le_to_y; break;
-    case PIX_FMT_GBRP9BE:
-    case PIX_FMT_GBRP10BE:
-    case PIX_FMT_GBRP16BE: c->readLumPlanar = planar_rgb16be_to_y; break;
-    case PIX_FMT_GBRP:     c->readLumPlanar = planar_rgb_to_y; break;
-#if HAVE_BIGENDIAN
-    case PIX_FMT_YUV444P9LE:
-    case PIX_FMT_YUV422P9LE:
-    case PIX_FMT_YUV420P9LE:
-    case PIX_FMT_YUV444P10LE:
-    case PIX_FMT_YUV422P10LE:
-    case PIX_FMT_YUV420P10LE:
-    case PIX_FMT_YUV420P16LE:
-    case PIX_FMT_YUV422P16LE:
-    case PIX_FMT_YUV444P16LE:
-    case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
-#else
-    case PIX_FMT_YUV444P9BE:
-    case PIX_FMT_YUV422P9BE:
-    case PIX_FMT_YUV420P9BE:
-    case PIX_FMT_YUV444P10BE:
-    case PIX_FMT_YUV422P10BE:
-    case PIX_FMT_YUV420P10BE:
-    case PIX_FMT_YUV420P16BE:
-    case PIX_FMT_YUV422P16BE:
-    case PIX_FMT_YUV444P16BE:
-    case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
-#endif
-    case PIX_FMT_YUYV422  :
-    case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
-    case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
-    case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
-    case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
-    case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
-    case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
-    case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
-    case PIX_FMT_BGR444LE : c->lumToYV12 = bgr12leToY_c; break;
-    case PIX_FMT_BGR444BE : c->lumToYV12 = bgr12beToY_c; break;
-    case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
-    case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
-    case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
-    case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
-    case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
-    case PIX_FMT_RGB444LE : c->lumToYV12 = rgb12leToY_c; break;
-    case PIX_FMT_RGB444BE : c->lumToYV12 = rgb12beToY_c; break;
-    case PIX_FMT_RGB8     :
-    case PIX_FMT_BGR8     :
-    case PIX_FMT_PAL8     :
-    case PIX_FMT_BGR4_BYTE:
-    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
-    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
-    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
-    case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
-    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
-    case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
-    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
-    case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
-    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
-    case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
-    case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
-    }
-    if (c->alpPixBuf) {
-        switch (srcFormat) {
-        case PIX_FMT_BGRA:
-        case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
-        case PIX_FMT_ABGR:
-        case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
-        case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
-        }
-    }
+    ff_sws_init_input_funcs(c);
 
     if (c->srcBpc == 8) {
         if (c->dstBpc <= 10) {
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 3436b92788..6d77608a59 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -649,6 +649,7 @@ void ff_swscale_get_unscaled_altivec(SwsContext *c);
  */
 SwsFunc ff_getSwsFunc(SwsContext *c);
 
+void ff_sws_init_input_funcs(SwsContext *c);
 void ff_sws_init_swScale_altivec(SwsContext *c);
 void ff_sws_init_swScale_mmx(SwsContext *c);
 
-- 
cgit v1.2.3


From 21449410693d40743c39d7e75580863f8ad266e3 Mon Sep 17 00:00:00 2001
From: Ronald S. Bultje
Date: Wed, 1 Feb 2012 07:38:56 -0800
Subject: swscale: split C output functions into separate file.

Signed-off-by: Diego Biurrun <diego@biurrun.de>
---
 libswscale/Makefile           |    4 +-
 libswscale/output.c           | 1524 +++++++++++++++++++++++++++++++++++++++++
 libswscale/swscale.c          | 1511 +---------------------------------------
 libswscale/swscale_internal.h |    7 +
 4 files changed, 1538 insertions(+), 1508 deletions(-)
 create mode 100644 libswscale/output.c

diff --git a/libswscale/Makefile b/libswscale/Makefile
index f093516b5e..75262f857b 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -3,8 +3,8 @@ FFLIBS = avutil
 
 HEADERS = swscale.h
 
-OBJS = input.o options.o rgb2rgb.o swscale.o utils.o \
-       swscale_unscaled.o yuv2rgb.o
+OBJS = input.o options.o output.o rgb2rgb.o swscale.o \
+       swscale_unscaled.o utils.o yuv2rgb.o
 
 OBJS-$(ARCH_BFIN)          +=  bfin/internal_bfin.o     \
                                bfin/swscale_bfin.o      \
diff --git a/libswscale/output.c b/libswscale/output.c
new file mode 100644
index 0000000000..93a6a03061
--- /dev/null
+++ b/libswscale/output.c
@@ -0,0 +1,1524 @@
+/*
+ * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "libavutil/avutil.h"
+#include "libavutil/bswap.h"
+#include "libavutil/cpu.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/pixdesc.h"
+#include "config.h"
+#include "rgb2rgb.h"
+#include "swscale.h"
+#include "swscale_internal.h"
+/* 2-row dither pattern with entries 0..3; each row repeats one pair across 8 columns. */
+DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
+{  1,   3,   1,   3,   1,   3,   1,   3, },
+{  2,   0,   2,   0,   2,   0,   2,   0, },
+};
+/* 2-row dither pattern with entries 0, 2, 4, 6; each row repeats one pair across 8 columns. */
+DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
+{  6,   2,   6,   2,   6,   2,   6,   2, },
+{  0,   4,   0,   4,   0,   4,   0,   4, },
+};
+/* 4-row dither matrix with entries 0..15; each row holds a 4-entry pattern written twice. Not static. */
+DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
+{  8,   4,  11,   7,   8,   4,  11,   7, },
+{  2,  14,   1,  13,   2,  14,   1,  13, },
+{ 10,   6,   9,   5,  10,   6,   9,   5, },
+{  0,  12,   3,  15,   0,  12,   3,  15, },
+};
+/* 8x8 dither matrix with entries in 0..31. Not static. */
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
+{ 17,   9,  23,  15,  16,   8,  22,  14, },
+{  5,  29,   3,  27,   4,  28,   2,  26, },
+{ 21,  13,  19,  11,  20,  12,  18,  10, },
+{  0,  24,   6,  30,   1,  25,   7,  31, },
+{ 16,   8,  22,  14,  17,   9,  23,  15, },
+{  4,  28,   2,  26,   5,  29,   3,  27, },
+{ 20,  12,  18,  10,  21,  13,  19,  11, },
+{  1,  25,   7,  31,   0,  24,   6,  30, },
+};
+/* 8x8 dither matrix with entries in 0..72. Not static. */
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
+{  0,  55,  14,  68,   3,  58,  17,  72, },
+{ 37,  18,  50,  32,  40,  22,  54,  35, },
+{  9,  64,   5,  59,  13,  67,   8,  63, },
+{ 46,  27,  41,  23,  49,  31,  44,  26, },
+{  2,  57,  16,  71,   1,  56,  15,  70, },
+{ 39,  21,  52,  34,  38,  19,  51,  33, },
+{ 11,  66,   7,  62,  10,  65,   6,  60, },
+{ 48,  30,  43,  25,  47,  29,  42,  24, },
+};
+/* dither_8x8_220: only the first table is compiled (#if 1); the gamma variants below are inactive alternatives. */
+#if 1
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
+{117,  62, 158, 103, 113,  58, 155, 100, },
+{ 34, 199,  21, 186,  31, 196,  17, 182, },
+{144,  89, 131,  76, 141,  86, 127,  72, },
+{  0, 165,  41, 206,  10, 175,  52, 217, },
+{110,  55, 151,  96, 120,  65, 162, 107, },
+{ 28, 193,  14, 179,  38, 203,  24, 189, },
+{138,  83, 124,  69, 148,  93, 134,  79, },
+{  7, 172,  48, 213,   3, 168,  45, 210, },
+};
+#elif 1
+// tries to correct a gamma of 1.5
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
+{  0, 143,  18, 200,   2, 156,  25, 215, },
+{ 78,  28, 125,  64,  89,  36, 138,  74, },
+{ 10, 180,   3, 161,  16, 195,   8, 175, },
+{109,  51,  93,  38, 121,  60, 105,  47, },
+{  1, 152,  23, 210,   0, 147,  20, 205, },
+{ 85,  33, 134,  71,  81,  30, 130,  67, },
+{ 14, 190,   6, 171,  12, 185,   5, 166, },
+{117,  57, 101,  44, 113,  54,  97,  41, },
+};
+#elif 1
+// tries to correct a gamma of 2.0
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
+{  0, 124,   8, 193,   0, 140,  12, 213, },
+{ 55,  14, 104,  42,  66,  19, 119,  52, },
+{  3, 168,   1, 145,   6, 187,   3, 162, },
+{ 86,  31,  70,  21,  99,  39,  82,  28, },
+{  0, 134,  11, 206,   0, 129,   9, 200, },
+{ 62,  17, 114,  48,  58,  16, 109,  45, },
+{  5, 181,   2, 157,   4, 175,   1, 151, },
+{ 95,  36,  78,  26,  90,  34,  74,  24, },
+};
+#else
+// tries to correct a gamma of 2.5
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
+{  0, 107,   3, 187,   0, 125,   6, 212, },
+{ 39,   7,  86,  28,  49,  11, 102,  36, },
+{  1, 158,   0, 131,   3, 180,   1, 151, },
+{ 68,  19,  52,  12,  81,  25,  64,  17, },
+{  0, 119,   5, 203,   0, 113,   4, 195, },
+{ 45,   9,  96,  33,  42,   8,  91,  30, },
+{  2, 172,   1, 144,   2, 165,   0, 137, },
+{ 77,  23,  60,  15,  72,  21,  56,  14, },
+};
+#endif
+/* Store bias + av_clip_<signedness>16(val >> shift) at pos, big- or little-endian; uses the caller's `shift` and `big_endian`. */
+#define output_pixel(pos, val, bias, signedness) \
+    if (big_endian) { \
+        AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+    } else { \
+        AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+    }
+/* Convert dstW 32-bit samples from src to 16-bit words in dest: round, shift right by (19 - output_bits), clip, store. */
+static av_always_inline void
+yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
+                         int big_endian, int output_bits)
+{
+    int i;
+    int shift = 19 - output_bits;
+
+    for (i = 0; i < dstW; i++) {
+        int val = src[i] + (1 << (shift - 1)); /* add half the divisor so the shift rounds */
+        output_pixel(&dest[i], val, 0, uint);
+    }
+}
+/* Apply a filterSize-tap vertical filter to 32-bit source lines and store 16-bit words in dest. */
+static av_always_inline void
+yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
+                         const int32_t **src, uint16_t *dest, int dstW,
+                         int big_endian, int output_bits)
+{
+    int i;
+    int shift = 15 + 16 - output_bits;
+
+    for (i = 0; i < dstW; i++) {
+        int val = 1 << (30-output_bits); /* rounding term: half of (1 << shift) */
+        int j;
+
+        /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
+         * filters (or anything with negative coeffs), the range can be slightly
+         * wider in both directions. To account for this overflow, we subtract
+         * a constant so it always fits in the signed range (assuming a
+         * reasonable filterSize), and re-add that at the end. */
+        val -= 0x40000000;
+        for (j = 0; j < filterSize; j++)
+            val += src[j][i] * filter[j];
+
+        output_pixel(&dest[i], val, 0x8000, int); /* 0x8000 == 0x40000000 >> 15, i.e. the removed constant when output_bits is 16 */
+    }
+}
+
+#undef output_pixel
+/* Store av_clip_uintp2(val >> shift, output_bits) at pos, big- or little-endian; uses the caller's `shift`, `big_endian`, `output_bits`. */
+#define output_pixel(pos, val) \
+    if (big_endian) { \
+        AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
+    } else { \
+        AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
+    }
+/* Convert dstW 16-bit samples from src to output_bits-wide values in 16-bit words: round, shift by (15 - output_bits), clip. */
+static av_always_inline void
+yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
+                         int big_endian, int output_bits)
+{
+    int i;
+    int shift = 15 - output_bits;
+
+    for (i = 0; i < dstW; i++) {
+        int val = src[i] + (1 << (shift - 1)); /* add half the divisor so the shift rounds */
+        output_pixel(&dest[i], val);
+    }
+}
+/* Apply a filterSize-tap vertical filter to 16-bit source lines and store output_bits-wide values in 16-bit words. */
+static av_always_inline void
+yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
+                         const int16_t **src, uint16_t *dest, int dstW,
+                         int big_endian, int output_bits)
+{
+    int i;
+    int shift = 11 + 16 - output_bits;
+
+    for (i = 0; i < dstW; i++) {
+        int val = 1 << (26-output_bits); /* rounding term: half of (1 << shift) */
+        int j;
+
+        for (j = 0; j < filterSize; j++)
+            val += src[j][i] * filter[j];
+
+        output_pixel(&dest[i], val);
+    }
+}
+
+#undef output_pixel
+/* Generate yuv2plane1_<bits><BE_LE>_c / yuv2planeX_<bits><BE_LE>_c: cast the arguments and call the templates; dither and offset are unused. */
+#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
+static void yuv2plane1_ ## bits ## BE_LE ## _c(const int16_t *src, \
+                              uint8_t *dest, int dstW, \
+                              const uint8_t *dither, int offset)\
+{ \
+    yuv2plane1_ ## template_size ## _c_template((const typeX_t *) src, \
+                         (uint16_t *) dest, dstW, is_be, bits); \
+}\
+static void yuv2planeX_ ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
+                              const int16_t **src, uint8_t *dest, int dstW, \
+                              const uint8_t *dither, int offset)\
+{ \
+    yuv2planeX_## template_size ## _c_template(filter, \
+                         filterSize, (const typeX_t **) src, \
+                         (uint16_t *) dest, dstW, is_be, bits); \
+}
+yuv2NBPS( 9, BE, 1, 10, int16_t) /* 9- and 10-bit outputs share the "10" templates with int16_t input */
+yuv2NBPS( 9, LE, 0, 10, int16_t)
+yuv2NBPS(10, BE, 1, 10, int16_t)
+yuv2NBPS(10, LE, 0, 10, int16_t)
+yuv2NBPS(16, BE, 1, 16, int32_t) /* 16-bit output uses the "16" templates with int32_t input */
+yuv2NBPS(16, LE, 0, 16, int32_t)
+/* Vertical filter to 8-bit output: dest[i] = clip((dither << 12 + sum_j src[j][i] * filter[j]) >> 19). */
+static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    int i;
+    for (i=0; i<dstW; i++) {
+        int val = dither[(i + offset) & 7] << 12; /* dither has 8 entries, indexed modulo 8 */
+        int j;
+        for (j=0; j<filterSize; j++)
+            val += src[j][i] * filter[j];
+
+        dest[i]= av_clip_uint8(val>>19);
+    }
+}
+/* Unfiltered 8-bit output: dest[i] = clip((src[i] + dither[(i + offset) & 7]) >> 7). */
+static void yuv2plane1_8_c(const int16_t *src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    int i;
+    for (i=0; i<dstW; i++) {
+        int val = (src[i] + dither[(i + offset) & 7]) >> 7;
+        dest[i]= av_clip_uint8(val);
+    }
+}
+/* Filter the U and V lines and interleave them into dest: U first for PIX_FMT_NV12, V first for any other dstFormat. */
+static void yuv2nv12cX_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
+                        const int16_t **chrUSrc, const int16_t **chrVSrc,
+                        uint8_t *dest, int chrDstW)
+{
+    enum PixelFormat dstFormat = c->dstFormat;
+    const uint8_t *chrDither = c->chrDither8;
+    int i;
+
+    if (dstFormat == PIX_FMT_NV12)
+        for (i=0; i<chrDstW; i++) {
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12; /* V reads the dither table 3 entries ahead of U */
+            int j;
+            for (j=0; j<chrFilterSize; j++) {
+                u += chrUSrc[j][i] * chrFilter[j];
+                v += chrVSrc[j][i] * chrFilter[j];
+            }
+
+            dest[2*i]= av_clip_uint8(u>>19);
+            dest[2*i+1]= av_clip_uint8(v>>19);
+        }
+    else
+        for (i=0; i<chrDstW; i++) {
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12;
+            int j;
+            for (j=0; j<chrFilterSize; j++) {
+                u += chrUSrc[j][i] * chrFilter[j];
+                v += chrVSrc[j][i] * chrFilter[j];
+            }
+
+            dest[2*i]= av_clip_uint8(v>>19); /* same computation as above, bytes swapped */
+            dest[2*i+1]= av_clip_uint8(u>>19);
+        }
+}
+/* Store val as a 16-bit word at pos: big-endian when `target` is PIX_FMT_GRAY16BE, little-endian otherwise. */
+#define output_pixel(pos, val) \
+        if (target == PIX_FMT_GRAY16BE) { \
+            AV_WB16(pos, val); \
+        } else { \
+            AV_WL16(pos, val); \
+        }
+/* Vertically filter 32-bit luma lines into 16-bit gray; handles dstW >> 1 pixel pairs; chroma and alpha arguments are unused. */
+static av_always_inline void
+yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
+                        const int32_t **lumSrc, int lumFilterSize,
+                        const int16_t *chrFilter, const int32_t **chrUSrc,
+                        const int32_t **chrVSrc, int chrFilterSize,
+                        const int32_t **alpSrc, uint16_t *dest, int dstW,
+                        int y, enum PixelFormat target)
+{
+    int i;
+
+    for (i = 0; i < (dstW >> 1); i++) {
+        int j;
+        int Y1 = (1 << 14) - 0x40000000; /* rounding term for the >> 15, minus a constant re-added below as 0x8000 */
+        int Y2 = (1 << 14) - 0x40000000;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            Y1 += lumSrc[j][i * 2]     * lumFilter[j];
+            Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
+        }
+        Y1 >>= 15;
+        Y2 >>= 15;
+        Y1 = av_clip_int16(Y1);
+        Y2 = av_clip_int16(Y2);
+        output_pixel(&dest[i * 2 + 0], 0x8000 + Y1); /* 0x8000 == 0x40000000 >> 15 */
+        output_pixel(&dest[i * 2 + 1], 0x8000 + Y2);
+    }
+}
+/* Blend two luma lines with weights (4095 - yalpha) and yalpha, shift by 15, store as 16-bit gray; chroma and alpha unused. */
+static av_always_inline void
+yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
+                        const int32_t *ubuf[2], const int32_t *vbuf[2],
+                        const int32_t *abuf[2], uint16_t *dest, int dstW,
+                        int yalpha, int uvalpha, int y,
+                        enum PixelFormat target)
+{
+    int  yalpha1 = 4095 - yalpha;
+    int i;
+    const int32_t *buf0 = buf[0], *buf1 = buf[1];
+
+    for (i = 0; i < (dstW >> 1); i++) { /* two pixels per iteration */
+        int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
+        int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
+
+        output_pixel(&dest[i * 2 + 0], Y1);
+        output_pixel(&dest[i * 2 + 1], Y2);
+    }
+}
+/* Write one luma line as 16-bit gray: each buf0 sample is shifted left by 1 and stored; chroma and alpha unused. */
+static av_always_inline void
+yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
+                        const int32_t *ubuf[2], const int32_t *vbuf[2],
+                        const int32_t *abuf0, uint16_t *dest, int dstW,
+                        int uvalpha, int y, enum PixelFormat target)
+{
+    int i;
+
+    for (i = 0; i < (dstW >> 1); i++) { /* two pixels per iteration */
+        int Y1 = buf0[i * 2    ] << 1;
+        int Y2 = buf0[i * 2 + 1] << 1;
+
+        output_pixel(&dest[i * 2 + 0], Y1);
+        output_pixel(&dest[i * 2 + 1], Y2);
+    }
+}
+
+#undef output_pixel
+/* Generate name##ext##_X_c/_2_c/_1_c: cast int16_t buffers to int32_t and dest to uint16_t, then call name##base##_*_c_template with fmt. */
+#define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
+static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
+                        const int16_t **_lumSrc, int lumFilterSize, \
+                        const int16_t *chrFilter, const int16_t **_chrUSrc, \
+                        const int16_t **_chrVSrc, int chrFilterSize, \
+                        const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
+                        int y) \
+{ \
+    const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
+                  **chrUSrc = (const int32_t **) _chrUSrc, \
+                  **chrVSrc = (const int32_t **) _chrVSrc, \
+                  **alpSrc  = (const int32_t **) _alpSrc; \
+    uint16_t *dest = (uint16_t *) _dest; \
+    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
+                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+                          alpSrc, dest, dstW, y, fmt); \
+} \
+ \
+static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
+                        const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
+                        const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
+                        int yalpha, int uvalpha, int y) \
+{ \
+    const int32_t **buf  = (const int32_t **) _buf, \
+                  **ubuf = (const int32_t **) _ubuf, \
+                  **vbuf = (const int32_t **) _vbuf, \
+                  **abuf = (const int32_t **) _abuf; \
+    uint16_t *dest = (uint16_t *) _dest; \
+    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
+                          dest, dstW, yalpha, uvalpha, y, fmt); \
+} \
+ \
+static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
+                        const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
+                        const int16_t *_abuf0, uint8_t *_dest, int dstW, \
+                        int uvalpha, int y) \
+{ \
+    const int32_t *buf0  = (const int32_t *)  _buf0, \
+                 **ubuf  = (const int32_t **) _ubuf, \
+                 **vbuf  = (const int32_t **) _vbuf, \
+                  *abuf0 = (const int32_t *)  _abuf0; \
+    uint16_t *dest = (uint16_t *) _dest; \
+    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
+                                  dstW, uvalpha, y, fmt); \
+}
+/* Instantiate yuv2gray16LE_{X,2,1}_c and yuv2gray16BE_{X,2,1}_c (empty `base` argument). */
+YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE)
+YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE)
+/* Assign acc to pos when `target` is PIX_FMT_MONOBLACK, otherwise assign its bitwise complement. */
+#define output_pixel(pos, acc) \
+    if (target == PIX_FMT_MONOBLACK) { \
+        pos = acc; \
+    } else { \
+        pos = ~acc; \
+    }
+/* Vertically filter luma, dither with row y & 7 of dither_8x8_220 and pack one bit per pixel into dest; chroma and alpha unused. */
+static av_always_inline void
+yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
+                      const int16_t **lumSrc, int lumFilterSize,
+                      const int16_t *chrFilter, const int16_t **chrUSrc,
+                      const int16_t **chrVSrc, int chrFilterSize,
+                      const int16_t **alpSrc, uint8_t *dest, int dstW,
+                      int y, enum PixelFormat target)
+{
+    const uint8_t * const d128=dither_8x8_220[y&7];
+    uint8_t *g = c->table_gU[128] + c->table_gV[128]; /* lookup table indexed by luma + dither value */
+    int i;
+    unsigned acc = 0;
+
+    for (i = 0; i < dstW - 1; i += 2) {
+        int j;
+        int Y1 = 1 << 18; /* rounding term for the >> 19 */
+        int Y2 = 1 << 18;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            Y1 += lumSrc[j][i]   * lumFilter[j];
+            Y2 += lumSrc[j][i+1] * lumFilter[j];
+        }
+        Y1 >>= 19;
+        Y2 >>= 19;
+        if ((Y1 | Y2) & 0x100) { /* clip only when bit 8 of either value is set */
+            Y1 = av_clip_uint8(Y1);
+            Y2 = av_clip_uint8(Y2);
+        }
+        acc += acc + g[Y1 + d128[(i + 0) & 7]]; /* acc = 2 * acc + table value */
+        acc += acc + g[Y2 + d128[(i + 1) & 7]];
+        if ((i & 7) == 6) { /* pixels i and i + 1 complete a group of 8 */
+            output_pixel(*dest++, acc);
+        }
+    }
+}
+/* Blend two luma lines, dither and pack 8 pixels per output byte; only complete groups of 8 are written; chroma and alpha unused. */
+static av_always_inline void
+yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
+                      const int16_t *ubuf[2], const int16_t *vbuf[2],
+                      const int16_t *abuf[2], uint8_t *dest, int dstW,
+                      int yalpha, int uvalpha, int y,
+                      enum PixelFormat target)
+{
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1];
+    const uint8_t * const d128 = dither_8x8_220[y & 7];
+    uint8_t *g = c->table_gU[128] + c->table_gV[128]; /* lookup table indexed by luma + dither value */
+    int  yalpha1 = 4095 - yalpha;
+    int i;
+
+    for (i = 0; i < dstW - 7; i += 8) {
+        int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
+        acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]]; /* acc = 2 * acc + table value */
+        acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
+        acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
+        acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
+        acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
+        acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
+        acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
+        output_pixel(*dest++, acc);
+    }
+}
+/* Dither one luma line (samples >> 7) and pack 8 pixels per output byte; only complete groups of 8 are written; chroma and alpha unused. */
+static av_always_inline void
+yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
+                      const int16_t *ubuf[2], const int16_t *vbuf[2],
+                      const int16_t *abuf0, uint8_t *dest, int dstW,
+                      int uvalpha, int y, enum PixelFormat target)
+{
+    const uint8_t * const d128 = dither_8x8_220[y & 7];
+    uint8_t *g = c->table_gU[128] + c->table_gV[128]; /* lookup table indexed by luma + dither value */
+    int i;
+
+    for (i = 0; i < dstW - 7; i += 8) {
+        int acc =    g[(buf0[i    ] >> 7) + d128[0]];
+        acc += acc + g[(buf0[i + 1] >> 7) + d128[1]]; /* acc = 2 * acc + table value */
+        acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
+        acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
+        acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
+        acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
+        acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
+        acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
+        output_pixel(*dest++, acc);
+    }
+}
+
+#undef output_pixel
+/* Generate name##ext##_X_c/_2_c/_1_c that forward their arguments unchanged to name##base##_*_c_template with fmt appended. */
+#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
+static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
+                                const int16_t **lumSrc, int lumFilterSize, \
+                                const int16_t *chrFilter, const int16_t **chrUSrc, \
+                                const int16_t **chrVSrc, int chrFilterSize, \
+                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
+                                int y) \
+{ \
+    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
+                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+                                  alpSrc, dest, dstW, y, fmt); \
+} \
+ \
+static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
+                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
+                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
+                                int yalpha, int uvalpha, int y) \
+{ \
+    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
+                                  dest, dstW, yalpha, uvalpha, y, fmt); \
+} \
+ \
+static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
+                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
+                                const int16_t *abuf0, uint8_t *dest, int dstW, \
+                                int uvalpha, int y) \
+{ \
+    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
+                                  abuf0, dest, dstW, uvalpha, \
+                                  y, fmt); \
+}
+/* Instantiate yuv2monowhite_{X,2,1}_c and yuv2monoblack_{X,2,1}_c (empty `base` argument). */
+YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE)
+YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK)
+/* Write 4 bytes at dest[pos]: Y1 U Y2 V when `target` is PIX_FMT_YUYV422, U Y1 V Y2 otherwise. */
+#define output_pixels(pos, Y1, U, Y2, V) \
+    if (target == PIX_FMT_YUYV422) { \
+        dest[pos + 0] = Y1; \
+        dest[pos + 1] = U;  \
+        dest[pos + 2] = Y2; \
+        dest[pos + 3] = V;  \
+    } else { \
+        dest[pos + 0] = U;  \
+        dest[pos + 1] = Y1; \
+        dest[pos + 2] = V;  \
+        dest[pos + 3] = Y2; \
+    }
+/* Vertically filter luma and chroma and write packed 4:2:2 output, two pixels (4 bytes) per iteration; alpSrc and y are unused. */
+static av_always_inline void
+yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
+                     const int16_t **lumSrc, int lumFilterSize,
+                     const int16_t *chrFilter, const int16_t **chrUSrc,
+                     const int16_t **chrVSrc, int chrFilterSize,
+                     const int16_t **alpSrc, uint8_t *dest, int dstW,
+                     int y, enum PixelFormat target)
+{
+    int i;
+
+    for (i = 0; i < (dstW >> 1); i++) {
+        int j;
+        int Y1 = 1 << 18; /* rounding term for the >> 19 */
+        int Y2 = 1 << 18;
+        int U  = 1 << 18;
+        int V  = 1 << 18;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            Y1 += lumSrc[j][i * 2]     * lumFilter[j];
+            Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            U += chrUSrc[j][i] * chrFilter[j];
+            V += chrVSrc[j][i] * chrFilter[j];
+        }
+        Y1 >>= 19;
+        Y2 >>= 19;
+        U  >>= 19;
+        V  >>= 19;
+        if ((Y1 | Y2 | U | V) & 0x100) { /* clip only when bit 8 of any value is set */
+            Y1 = av_clip_uint8(Y1);
+            Y2 = av_clip_uint8(Y2);
+            U  = av_clip_uint8(U);
+            V  = av_clip_uint8(V);
+        }
+        output_pixels(4*i, Y1, U, Y2, V);
+    }
+}
+/* Blend two luma and two chroma lines with weights (4095 - alpha) and alpha, shift by 19, write packed 4:2:2; no clipping is done. */
+static av_always_inline void
+yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
+                     const int16_t *ubuf[2], const int16_t *vbuf[2],
+                     const int16_t *abuf[2], uint8_t *dest, int dstW,
+                     int yalpha, int uvalpha, int y,
+                     enum PixelFormat target)
+{
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+    int  yalpha1 = 4095 - yalpha;
+    int uvalpha1 = 4095 - uvalpha;
+    int i;
+
+    for (i = 0; i < (dstW >> 1); i++) { /* two pixels (4 bytes) per iteration */
+        int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
+        int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
+        int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
+        int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
+
+        output_pixels(i * 4, Y1, U, Y2, V);
+    }
+}
+/* Write packed 4:2:2 from one luma line; chroma comes from ubuf1/vbuf1 when uvalpha < 2048, else from the average of both lines. */
+static av_always_inline void
+yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
+                     const int16_t *ubuf[2], const int16_t *vbuf[2],
+                     const int16_t *abuf0, uint8_t *dest, int dstW,
+                     int uvalpha, int y, enum PixelFormat target)
+{
+    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+    int i;
+
+    if (uvalpha < 2048) {
+        for (i = 0; i < (dstW >> 1); i++) {
+            int Y1 = buf0[i * 2]     >> 7;
+            int Y2 = buf0[i * 2 + 1] >> 7;
+            int U  = ubuf1[i]        >> 7; /* NOTE(review): this branch reads line 1 (ubuf1/vbuf1), not line 0 - confirm intended */
+            int V  = vbuf1[i]        >> 7;
+
+            output_pixels(i * 4, Y1, U, Y2, V);
+        }
+    } else {
+        for (i = 0; i < (dstW >> 1); i++) {
+            int Y1 =  buf0[i * 2]          >> 7;
+            int Y2 =  buf0[i * 2 + 1]      >> 7;
+            int U  = (ubuf0[i] + ubuf1[i]) >> 8; /* sum of two lines, so one extra bit of shift */
+            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
+
+            output_pixels(i * 4, Y1, U, Y2, V);
+        }
+    }
+}
+
+#undef output_pixels
+/* Instantiate yuv2yuyv422_{X,2,1}_c and yuv2uyvy422_{X,2,1}_c on top of the yuv2422_*_c_template functions. */
+YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422)
+YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422)
+/* R_B / B_R pick R or B by `target` so one template serves RGB48 and BGR48; output_pixel stores 16 bits with isBE(target) endianness. */
+#define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
+#define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
+#define output_pixel(pos, val) \
+    if (isBE(target)) { \
+        AV_WB16(pos, val); \
+    } else { \
+        AV_WL16(pos, val); \
+    }
+/* Vertically filter 32-bit Y/U/V lines and write two 16-bit-per-component RGB48/BGR48 pixels per iteration; alpSrc and y are unused. */
+static av_always_inline void
+yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
+                       const int32_t **lumSrc, int lumFilterSize,
+                       const int16_t *chrFilter, const int32_t **chrUSrc,
+                       const int32_t **chrVSrc, int chrFilterSize,
+                       const int32_t **alpSrc, uint16_t *dest, int dstW,
+                       int y, enum PixelFormat target)
+{
+    int i;
+
+    for (i = 0; i < (dstW >> 1); i++) {
+        int j;
+        int Y1 = -0x40000000; /* bias removed here is restored below as 0x10000 after the >> 14 */
+        int Y2 = -0x40000000;
+        int U  = -(128 << 23); /* negate after shifting: left-shifting a negative value is undefined */
+        int V  = -(128 << 23);
+        int R, G, B;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            Y1 += lumSrc[j][i * 2]     * lumFilter[j];
+            Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            U += chrUSrc[j][i] * chrFilter[j];
+            V += chrVSrc[j][i] * chrFilter[j];
+        }
+
+        // 8bit: 12+15=27; 16-bit: 12+19=31
+        Y1 >>= 14; // 10
+        Y1 += 0x10000; /* 0x10000 == 0x40000000 >> 14 */
+        Y2 >>= 14;
+        Y2 += 0x10000;
+        U  >>= 14;
+        V  >>= 14;
+
+        // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
+        Y1 -= c->yuv2rgb_y_offset;
+        Y2 -= c->yuv2rgb_y_offset;
+        Y1 *= c->yuv2rgb_y_coeff;
+        Y2 *= c->yuv2rgb_y_coeff;
+        Y1 += 1 << 13; /* rounding term for the final >> 14 */
+        Y2 += 1 << 13;
+        // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
+
+        R = V * c->yuv2rgb_v2r_coeff;
+        G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
+        B =                            U * c->yuv2rgb_u2b_coeff;
+
+        // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
+        output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+        output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
+        output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
+        output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+        output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
+        output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
+        dest += 6;
+    }
+}
+/* Blend two 32-bit Y/U/V lines with weights (4095 - alpha) and alpha and write two RGB48/BGR48 pixels per iteration; abuf and y unused. */
+static av_always_inline void
+yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
+                       const int32_t *ubuf[2], const int32_t *vbuf[2],
+                       const int32_t *abuf[2], uint16_t *dest, int dstW,
+                       int yalpha, int uvalpha, int y,
+                       enum PixelFormat target)
+{
+    const int32_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+    int  yalpha1 = 4095 - yalpha;
+    int uvalpha1 = 4095 - uvalpha;
+    int i;
+
+    for (i = 0; i < (dstW >> 1); i++) {
+        int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
+        int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
+        int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha - (128 << 23)) >> 14; /* subtract the positive shift: left-shifting -128 is undefined */
+        int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha - (128 << 23)) >> 14;
+        int R, G, B;
+
+        Y1 -= c->yuv2rgb_y_offset;
+        Y2 -= c->yuv2rgb_y_offset;
+        Y1 *= c->yuv2rgb_y_coeff;
+        Y2 *= c->yuv2rgb_y_coeff;
+        Y1 += 1 << 13; /* rounding term for the final >> 14 */
+        Y2 += 1 << 13;
+
+        R = V * c->yuv2rgb_v2r_coeff;
+        G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
+        B =                            U * c->yuv2rgb_u2b_coeff;
+
+        output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+        output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
+        output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
+        output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+        output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
+        output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
+        dest += 6;
+    }
+}
+/* RGB48/BGR48 output from one luma line; chroma comes from ubuf0/vbuf0 when uvalpha < 2048, else from the average of both chroma lines. */
+static av_always_inline void
+yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
+                       const int32_t *ubuf[2], const int32_t *vbuf[2],
+                       const int32_t *abuf0, uint16_t *dest, int dstW,
+                       int uvalpha, int y, enum PixelFormat target)
+{
+    const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+    int i;
+
+    if (uvalpha < 2048) {
+        for (i = 0; i < (dstW >> 1); i++) {
+            int Y1 = (buf0[i * 2]    ) >> 2;
+            int Y2 = (buf0[i * 2 + 1]) >> 2;
+            int U  = (ubuf0[i] - (128 << 11)) >> 2; /* subtract the positive shift: left-shifting -128 is undefined */
+            int V  = (vbuf0[i] - (128 << 11)) >> 2;
+            int R, G, B;
+
+            Y1 -= c->yuv2rgb_y_offset;
+            Y2 -= c->yuv2rgb_y_offset;
+            Y1 *= c->yuv2rgb_y_coeff;
+            Y2 *= c->yuv2rgb_y_coeff;
+            Y1 += 1 << 13; /* rounding term for the final >> 14 */
+            Y2 += 1 << 13;
+
+            R = V * c->yuv2rgb_v2r_coeff;
+            G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
+            B =                            U * c->yuv2rgb_u2b_coeff;
+
+            output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+            output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
+            output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
+            output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+            output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
+            output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
+            dest += 6;
+        }
+    } else {
+        for (i = 0; i < (dstW >> 1); i++) {
+            int Y1 = (buf0[i * 2]    ) >> 2;
+            int Y2 = (buf0[i * 2 + 1]) >> 2;
+            int U  = (ubuf0[i] + ubuf1[i] - (128 << 12)) >> 3; /* two summed lines carry twice the 128 << 11 offset */
+            int V  = (vbuf0[i] + vbuf1[i] - (128 << 12)) >> 3;
+            int R, G, B;
+
+            Y1 -= c->yuv2rgb_y_offset;
+            Y2 -= c->yuv2rgb_y_offset;
+            Y1 *= c->yuv2rgb_y_coeff;
+            Y2 *= c->yuv2rgb_y_coeff;
+            Y1 += 1 << 13; /* rounding term for the final >> 14 */
+            Y2 += 1 << 13;
+
+            R = V * c->yuv2rgb_v2r_coeff;
+            G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
+            B =                            U * c->yuv2rgb_u2b_coeff;
+
+            output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+            output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
+            output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
+            output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+            output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
+            output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
+            dest += 6;
+        }
+    }
+}
+
+#undef output_pixel
+#undef r_b /* NOTE(review): the code above expands R_B / B_R (uppercase); confirm these names match the macros defined earlier */
+#undef b_r
+
+YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE)
+YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE)
+YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE)
+YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE)
+
+/*
+ * Write out 2 RGB pixels in the target pixel format. This function takes a
+ * R/G/B LUT as generated by ff_yuv2rgb_c_init_tables(), which takes care of
+ * things like endianness conversion and shifting. The caller takes care of
+ * setting the correct offset in these tables from the chroma (U/V) values.
+ * This function then uses the luminance (Y1/Y2) values to write out the
+ * correct RGB values into the destination buffer.
+ */
+static av_always_inline void
+yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
+              unsigned A1, unsigned A2,
+              const void *_r, const void *_g, const void *_b, int y,
+              enum PixelFormat target, int hasAlpha)
+{ /* i is the pixel-pair index; y is only used to pick the dither row */
+    if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
+        target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) { /* one uint32_t per pixel */
+        uint32_t *dest = (uint32_t *) _dest; /* NOTE(review): branch tests ARGB/RGBA/ABGR/BGRA, sh below tests RGB32_1/BGR32_1 - presumably aliases, confirm */
+        const uint32_t *r = (const uint32_t *) _r;
+        const uint32_t *g = (const uint32_t *) _g;
+        const uint32_t *b = (const uint32_t *) _b;
+
+#if CONFIG_SMALL /* hasAlpha is tested inline instead of duplicating the stores */
+        int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
+
+        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
+        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
+#else
+        if (hasAlpha) { /* alpha is added at bit 0 for RGB32_1/BGR32_1, at bit 24 otherwise */
+            int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
+
+            dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
+            dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
+        } else {
+            dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+            dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+        }
+#endif
+    } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) { /* 3 bytes per pixel, 6 per pair */
+        uint8_t *dest = (uint8_t *) _dest;
+        const uint8_t *r = (const uint8_t *) _r;
+        const uint8_t *g = (const uint8_t *) _g;
+        const uint8_t *b = (const uint8_t *) _b;
+
+#define r_b ((target == PIX_FMT_RGB24) ? r : b)
+#define b_r ((target == PIX_FMT_RGB24) ? b : r)
+        dest[i * 6 + 0] = r_b[Y1];
+        dest[i * 6 + 1] =   g[Y1];
+        dest[i * 6 + 2] = b_r[Y1];
+        dest[i * 6 + 3] = r_b[Y2];
+        dest[i * 6 + 4] =   g[Y2];
+        dest[i * 6 + 5] = b_r[Y2];
+#undef r_b
+#undef b_r
+    } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
+               target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
+               target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) { /* one uint16_t per pixel */
+        uint16_t *dest = (uint16_t *) _dest;
+        const uint16_t *r = (const uint16_t *) _r;
+        const uint16_t *g = (const uint16_t *) _g;
+        const uint16_t *b = (const uint16_t *) _b;
+        int dr1, dg1, db1, dr2, dg2, db2; /* dither offsets added to the LUT index of pixel 1 / pixel 2 */
+
+        if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) { /* green uses dither_2x2_4, red/blue dither_2x2_8 */
+            dr1 = dither_2x2_8[ y & 1     ][0];
+            dg1 = dither_2x2_4[ y & 1     ][0];
+            db1 = dither_2x2_8[(y & 1) ^ 1][0];
+            dr2 = dither_2x2_8[ y & 1     ][1];
+            dg2 = dither_2x2_4[ y & 1     ][1];
+            db2 = dither_2x2_8[(y & 1) ^ 1][1];
+        } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) { /* all channels use dither_2x2_8 */
+            dr1 = dither_2x2_8[ y & 1     ][0];
+            dg1 = dither_2x2_8[ y & 1     ][1];
+            db1 = dither_2x2_8[(y & 1) ^ 1][0];
+            dr2 = dither_2x2_8[ y & 1     ][1];
+            dg2 = dither_2x2_8[ y & 1     ][0];
+            db2 = dither_2x2_8[(y & 1) ^ 1][1];
+        } else { /* RGB444 / BGR444: 4-row dither table */
+            dr1 = dither_4x4_16[ y & 3     ][0];
+            dg1 = dither_4x4_16[ y & 3     ][1];
+            db1 = dither_4x4_16[(y & 3) ^ 3][0];
+            dr2 = dither_4x4_16[ y & 3     ][1];
+            dg2 = dither_4x4_16[ y & 3     ][0];
+            db2 = dither_4x4_16[(y & 3) ^ 3][1];
+        }
+
+        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+    } else /* 8/4-bit */ {
+        uint8_t *dest = (uint8_t *) _dest;
+        const uint8_t *r = (const uint8_t *) _r;
+        const uint8_t *g = (const uint8_t *) _g;
+        const uint8_t *b = (const uint8_t *) _b;
+        int dr1, dg1, db1, dr2, dg2, db2; /* dither offsets added to the LUT index of pixel 1 / pixel 2 */
+
+        if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) { /* dither row from y, column from the pixel position */
+            const uint8_t * const d64 = dither_8x8_73[y & 7];
+            const uint8_t * const d32 = dither_8x8_32[y & 7];
+            dr1 = dg1 = d32[(i * 2 + 0) & 7];
+            db1 =       d64[(i * 2 + 0) & 7];
+            dr2 = dg2 = d32[(i * 2 + 1) & 7];
+            db2 =       d64[(i * 2 + 1) & 7];
+        } else {
+            const uint8_t * const d64  = dither_8x8_73 [y & 7];
+            const uint8_t * const d128 = dither_8x8_220[y & 7];
+            dr1 = db1 = d128[(i * 2 + 0) & 7];
+            dg1 =        d64[(i * 2 + 0) & 7];
+            dr2 = db2 = d128[(i * 2 + 1) & 7];
+            dg2 =        d64[(i * 2 + 1) & 7];
+        }
+
+        if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) { /* two pixels share one byte, second pixel in the high nibble */
+            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
+                    ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
+        } else {
+            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+        }
+    }
+}
+
+static av_always_inline void
+yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
+                     const int16_t **lumSrc, int lumFilterSize,
+                     const int16_t *chrFilter, const int16_t **chrUSrc,
+                     const int16_t **chrVSrc, int chrFilterSize,
+                     const int16_t **alpSrc, uint8_t *dest, int dstW,
+                     int y, enum PixelFormat target, int hasAlpha)
+{ /* multi-tap vertical filter, then LUT-based RGB write; two pixels share one U/V sample */
+    int i;
+
+    for (i = 0; i < (dstW >> 1); i++) {
+        int j;
+        int Y1 = 1 << 18; /* rounding term for the >> 19 below */
+        int Y2 = 1 << 18;
+        int U  = 1 << 18;
+        int V  = 1 << 18;
+        int av_unused A1, A2; /* only assigned and read when hasAlpha is set */
+        const void *r, *g, *b;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            Y1 += lumSrc[j][i * 2]     * lumFilter[j];
+            Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            U += chrUSrc[j][i] * chrFilter[j];
+            V += chrVSrc[j][i] * chrFilter[j];
+        }
+        Y1 >>= 19;
+        Y2 >>= 19;
+        U  >>= 19;
+        V  >>= 19;
+        if ((Y1 | Y2 | U | V) & 0x100) { /* bit 8 set in any value: clip all four */
+            Y1 = av_clip_uint8(Y1);
+            Y2 = av_clip_uint8(Y2);
+            U  = av_clip_uint8(U);
+            V  = av_clip_uint8(V);
+        }
+        if (hasAlpha) { /* alpha is filtered with the luma coefficients */
+            A1 = 1 << 18;
+            A2 = 1 << 18;
+            for (j = 0; j < lumFilterSize; j++) {
+                A1 += alpSrc[j][i * 2    ] * lumFilter[j];
+                A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
+            }
+            A1 >>= 19;
+            A2 >>= 19;
+            if ((A1 | A2) & 0x100) {
+                A1 = av_clip_uint8(A1);
+                A2 = av_clip_uint8(A2);
+            }
+        }
+
+        /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
+        r =  c->table_rV[V];
+        g = (c->table_gU[U] + c->table_gV[V]);
+        b =  c->table_bU[U];
+
+        yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
+                      r, g, b, y, target, hasAlpha);
+    }
+}
+
+static av_always_inline void
+yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
+                     const int16_t *ubuf[2], const int16_t *vbuf[2],
+                     const int16_t *abuf[2], uint8_t *dest, int dstW,
+                     int yalpha, int uvalpha, int y,
+                     enum PixelFormat target, int hasAlpha)
+{ /* blends two input lines per plane, then writes two RGB pixels per iteration via the LUTs */
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+                  *abuf0 = hasAlpha ? abuf[0] : NULL,
+                  *abuf1 = hasAlpha ? abuf[1] : NULL;
+    int  yalpha1 = 4095 - yalpha; /* weight of line 0; yalpha is the weight of line 1 */
+    int uvalpha1 = 4095 - uvalpha;
+    int i;
+
+    for (i = 0; i < (dstW >> 1); i++) {
+        int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
+        int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
+        int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
+        int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
+        int A1, A2; /* left unset without alpha; the call below passes 0 in that case */
+        const void *r =  c->table_rV[V],
+                   *g = (c->table_gU[U] + c->table_gV[V]),
+                   *b =  c->table_bU[U];
+
+        if (hasAlpha) { /* alpha uses the luma blend weights */
+            A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
+            A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
+        }
+
+        yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
+                      r, g, b, y, target, hasAlpha);
+    }
+}
+
+static av_always_inline void
+yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
+                     const int16_t *ubuf[2], const int16_t *vbuf[2],
+                     const int16_t *abuf0, uint8_t *dest, int dstW,
+                     int uvalpha, int y, enum PixelFormat target,
+                     int hasAlpha)
+{ /* single luma line in, two RGB pixels written per iteration via the LUTs */
+    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+    int i;
+
+    if (uvalpha < 2048) { /* one chroma line only */
+        for (i = 0; i < (dstW >> 1); i++) {
+            int Y1 = buf0[i * 2]     >> 7;
+            int Y2 = buf0[i * 2 + 1] >> 7;
+            int U  = ubuf1[i]        >> 7; /* NOTE(review): reads ubuf1/vbuf1, while yuv2rgb48_1_c_template reads ubuf0/vbuf0 in this case - confirm intended */
+            int V  = vbuf1[i]        >> 7;
+            int A1, A2; /* left unset without alpha; the call below passes 0 in that case */
+            const void *r =  c->table_rV[V],
+                       *g = (c->table_gU[U] + c->table_gV[V]),
+                       *b =  c->table_bU[U];
+
+            if (hasAlpha) {
+                A1 = abuf0[i * 2    ] >> 7;
+                A2 = abuf0[i * 2 + 1] >> 7;
+            }
+
+            yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
+                          r, g, b, y, target, hasAlpha);
+        }
+    } else { /* average both chroma lines: sum, then shift by one extra bit */
+        for (i = 0; i < (dstW >> 1); i++) {
+            int Y1 =  buf0[i * 2]          >> 7;
+            int Y2 =  buf0[i * 2 + 1]      >> 7;
+            int U  = (ubuf0[i] + ubuf1[i]) >> 8;
+            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
+            int A1, A2; /* left unset without alpha; the call below passes 0 in that case */
+            const void *r =  c->table_rV[V],
+                       *g = (c->table_gU[U] + c->table_gV[V]),
+                       *b =  c->table_bU[U];
+
+            if (hasAlpha) {
+                A1 = abuf0[i * 2    ] >> 7;
+                A2 = abuf0[i * 2 + 1] >> 7;
+            }
+
+            yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
+                          r, g, b, y, target, hasAlpha);
+        }
+    }
+}
+
+#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
+                                const int16_t **lumSrc, int lumFilterSize, \
+                                const int16_t *chrFilter, const int16_t **chrUSrc, \
+                                const int16_t **chrVSrc, int chrFilterSize, \
+                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
+                                int y) \
+{ \
+    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
+                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+                                  alpSrc, dest, dstW, y, fmt, hasAlpha); \
+} /* YUV2RGBWRAPPERX: defines name##ext##_X_c, forwarding to name##base##_X_c_template with fmt and hasAlpha fixed */
+#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
+YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
+                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
+                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
+                                int yalpha, int uvalpha, int y) \
+{ \
+    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
+                                  dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
+} \
+ \
+static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
+                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
+                                const int16_t *abuf0, uint8_t *dest, int dstW, \
+                                int uvalpha, int y) \
+{ \
+    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
+                                  dstW, uvalpha, y, fmt, hasAlpha); \
+} /* YUV2RGBWRAPPER: the _X_c wrapper above plus matching _2_c and _1_c wrappers */
+
+#if CONFIG_SMALL /* one function set per format; hasAlpha is a run-time expression on c->alpPixBuf */
+YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+#else /* separate a32* (hasAlpha = 1) and x32* (hasAlpha = 0) function sets */
+#if CONFIG_SWSCALE_ALPHA
+YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1)
+YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1)
+#endif
+YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0)
+YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0)
+YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0)
+YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0)
+YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0)
+YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0)
+YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0)
+YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0)
+YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0)
+
+static av_always_inline void
+yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
+                          const int16_t **lumSrc, int lumFilterSize,
+                          const int16_t *chrFilter, const int16_t **chrUSrc,
+                          const int16_t **chrVSrc, int chrFilterSize,
+                          const int16_t **alpSrc, uint8_t *dest,
+                          int dstW, int y, enum PixelFormat target, int hasAlpha)
+{ /* packed 24/32-bit RGB output, one pixel per iteration with chroma indexed per pixel */
+    int i;
+    int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4; /* bytes per output pixel */
+
+    for (i = 0; i < dstW; i++) {
+        int j;
+        int Y = 0;
+        int U = -128 << 19;
+        int V = -128 << 19;
+        int av_unused A; /* only assigned and read when hasAlpha is set */
+        int R, G, B;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            Y += lumSrc[j][i] * lumFilter[j];
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            U += chrUSrc[j][i] * chrFilter[j];
+            V += chrVSrc[j][i] * chrFilter[j];
+        }
+        Y >>= 10;
+        U >>= 10;
+        V >>= 10;
+        if (hasAlpha) { /* alpha is filtered with the luma coefficients */
+            A = 1 << 18; /* rounding term for the >> 19 below */
+            for (j = 0; j < lumFilterSize; j++) {
+                A += alpSrc[j][i] * lumFilter[j];
+            }
+            A >>= 19;
+            if (A & 0x100)
+                A = av_clip_uint8(A);
+        }
+        Y -= c->yuv2rgb_y_offset;
+        Y *= c->yuv2rgb_y_coeff;
+        Y += 1 << 21; /* rounding term for the >> 22 applied on output */
+        R = Y + V*c->yuv2rgb_v2r_coeff;
+        G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
+        B = Y +                          U*c->yuv2rgb_u2b_coeff;
+        if ((R | G | B) & 0xC0000000) { /* any of the top two bits set: clip to 30 bits */
+            R = av_clip_uintp2(R, 30);
+            G = av_clip_uintp2(G, 30);
+            B = av_clip_uintp2(B, 30);
+        }
+
+        switch(target) {
+        case PIX_FMT_ARGB:
+            dest[0] = hasAlpha ? A : 255;
+            dest[1] = R >> 22;
+            dest[2] = G >> 22;
+            dest[3] = B >> 22;
+            break;
+        case PIX_FMT_RGB24:
+            dest[0] = R >> 22;
+            dest[1] = G >> 22;
+            dest[2] = B >> 22;
+            break;
+        case PIX_FMT_RGBA:
+            dest[0] = R >> 22;
+            dest[1] = G >> 22;
+            dest[2] = B >> 22;
+            dest[3] = hasAlpha ? A : 255;
+            break;
+        case PIX_FMT_ABGR:
+            dest[0] = hasAlpha ? A : 255;
+            dest[1] = B >> 22;
+            dest[2] = G >> 22;
+            dest[3] = R >> 22;
+            /* no advance here: dest += step below moves to the next pixel */
+            break;
+        case PIX_FMT_BGR24:
+            dest[0] = B >> 22;
+            dest[1] = G >> 22;
+            dest[2] = R >> 22;
+            break;
+        case PIX_FMT_BGRA:
+            dest[0] = B >> 22;
+            dest[1] = G >> 22;
+            dest[2] = R >> 22;
+            dest[3] = hasAlpha ? A : 255;
+            break;
+        }
+        dest += step;
+    }
+}
+
+#if CONFIG_SMALL /* one function per format; hasAlpha is a run-time expression on c->alpPixBuf */
+YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+#else /* separate alpha (hasAlpha = 1) and "x" (hasAlpha = 0) functions */
+#if CONFIG_SWSCALE_ALPHA
+YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1)
+YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1)
+YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1)
+YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1)
+#endif
+YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0)
+#endif
+YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0)
+
+void ff_sws_init_output_funcs(SwsContext *c,
+                              yuv2planar1_fn *yuv2plane1,
+                              yuv2planarX_fn *yuv2planeX,
+                              yuv2interleavedX_fn *yuv2nv12cX,
+                              yuv2packed1_fn *yuv2packed1,
+                              yuv2packed2_fn *yuv2packed2,
+                              yuv2packedX_fn *yuv2packedX)
+{ /* pick the C output writers for c->dstFormat; each choice is stored through the matching pointer argument */
+    enum PixelFormat dstFormat = c->dstFormat;
+
+    if (is16BPS(dstFormat)) { /* planar writers: chosen by bit depth and endianness */
+        *yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_c  : yuv2planeX_16LE_c;
+        *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_c  : yuv2plane1_16LE_c;
+    } else if (is9_OR_10BPS(dstFormat)) {
+        if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) { /* 9-bit */
+            *yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c  : yuv2planeX_9LE_c;
+            *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c  : yuv2plane1_9LE_c;
+        } else {
+            *yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c  : yuv2planeX_10LE_c;
+            *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c  : yuv2plane1_10LE_c;
+        }
+    } else {
+        *yuv2plane1 = yuv2plane1_8_c;
+        *yuv2planeX = yuv2planeX_8_c;
+        if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)
+            *yuv2nv12cX = yuv2nv12cX_c;
+    }
+
+    if(c->flags & SWS_FULL_CHR_H_INT) { /* full chroma interpolation: only yuv2packedX is set in this branch */
+        switch (dstFormat) {
+            case PIX_FMT_RGBA:
+#if CONFIG_SMALL
+                *yuv2packedX = yuv2rgba32_full_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+                if (c->alpPixBuf) {
+                    *yuv2packedX = yuv2rgba32_full_X_c;
+                } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+                {
+                    *yuv2packedX = yuv2rgbx32_full_X_c;
+                }
+#endif /* !CONFIG_SMALL */
+                break;
+            case PIX_FMT_ARGB:
+#if CONFIG_SMALL
+                *yuv2packedX = yuv2argb32_full_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+                if (c->alpPixBuf) {
+                    *yuv2packedX = yuv2argb32_full_X_c;
+                } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+                {
+                    *yuv2packedX = yuv2xrgb32_full_X_c;
+                }
+#endif /* !CONFIG_SMALL */
+                break;
+            case PIX_FMT_BGRA:
+#if CONFIG_SMALL
+                *yuv2packedX = yuv2bgra32_full_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+                if (c->alpPixBuf) {
+                    *yuv2packedX = yuv2bgra32_full_X_c;
+                } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+                {
+                    *yuv2packedX = yuv2bgrx32_full_X_c;
+                }
+#endif /* !CONFIG_SMALL */
+                break;
+            case PIX_FMT_ABGR:
+#if CONFIG_SMALL
+                *yuv2packedX = yuv2abgr32_full_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+                if (c->alpPixBuf) {
+                    *yuv2packedX = yuv2abgr32_full_X_c;
+                } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+                {
+                    *yuv2packedX = yuv2xbgr32_full_X_c;
+                }
+#endif /* !CONFIG_SMALL */
+                break;
+            case PIX_FMT_RGB24:
+            *yuv2packedX = yuv2rgb24_full_X_c;
+            break;
+        case PIX_FMT_BGR24:
+            *yuv2packedX = yuv2bgr24_full_X_c;
+            break;
+        }
+    } else { /* otherwise set the 1-line, 2-line and X-tap packed RGB writers together */
+        switch (dstFormat) {
+        case PIX_FMT_RGB48LE:
+            *yuv2packed1 = yuv2rgb48le_1_c;
+            *yuv2packed2 = yuv2rgb48le_2_c;
+            *yuv2packedX = yuv2rgb48le_X_c;
+            break;
+        case PIX_FMT_RGB48BE:
+            *yuv2packed1 = yuv2rgb48be_1_c;
+            *yuv2packed2 = yuv2rgb48be_2_c;
+            *yuv2packedX = yuv2rgb48be_X_c;
+            break;
+        case PIX_FMT_BGR48LE:
+            *yuv2packed1 = yuv2bgr48le_1_c;
+            *yuv2packed2 = yuv2bgr48le_2_c;
+            *yuv2packedX = yuv2bgr48le_X_c;
+            break;
+        case PIX_FMT_BGR48BE:
+            *yuv2packed1 = yuv2bgr48be_1_c;
+            *yuv2packed2 = yuv2bgr48be_2_c;
+            *yuv2packedX = yuv2bgr48be_X_c;
+            break;
+        case PIX_FMT_RGB32:
+        case PIX_FMT_BGR32:
+#if CONFIG_SMALL
+            *yuv2packed1 = yuv2rgb32_1_c;
+            *yuv2packed2 = yuv2rgb32_2_c;
+            *yuv2packedX = yuv2rgb32_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+                if (c->alpPixBuf) {
+                    *yuv2packed1 = yuv2rgba32_1_c;
+                    *yuv2packed2 = yuv2rgba32_2_c;
+                    *yuv2packedX = yuv2rgba32_X_c;
+                } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+                {
+                    *yuv2packed1 = yuv2rgbx32_1_c;
+                    *yuv2packed2 = yuv2rgbx32_2_c;
+                    *yuv2packedX = yuv2rgbx32_X_c;
+                }
+#endif /* !CONFIG_SMALL */
+            break;
+        case PIX_FMT_RGB32_1:
+        case PIX_FMT_BGR32_1:
+#if CONFIG_SMALL
+                *yuv2packed1 = yuv2rgb32_1_1_c;
+                *yuv2packed2 = yuv2rgb32_1_2_c;
+                *yuv2packedX = yuv2rgb32_1_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+                if (c->alpPixBuf) {
+                    *yuv2packed1 = yuv2rgba32_1_1_c;
+                    *yuv2packed2 = yuv2rgba32_1_2_c;
+                    *yuv2packedX = yuv2rgba32_1_X_c;
+                } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+                {
+                    *yuv2packed1 = yuv2rgbx32_1_1_c;
+                    *yuv2packed2 = yuv2rgbx32_1_2_c;
+                    *yuv2packedX = yuv2rgbx32_1_X_c;
+                }
+#endif /* !CONFIG_SMALL */
+                break;
+        case PIX_FMT_RGB24:
+            *yuv2packed1 = yuv2rgb24_1_c;
+            *yuv2packed2 = yuv2rgb24_2_c;
+            *yuv2packedX = yuv2rgb24_X_c;
+            break;
+        case PIX_FMT_BGR24:
+            *yuv2packed1 = yuv2bgr24_1_c;
+            *yuv2packed2 = yuv2bgr24_2_c;
+            *yuv2packedX = yuv2bgr24_X_c;
+            break;
+        case PIX_FMT_RGB565LE:
+        case PIX_FMT_RGB565BE:
+        case PIX_FMT_BGR565LE:
+        case PIX_FMT_BGR565BE:
+            *yuv2packed1 = yuv2rgb16_1_c;
+            *yuv2packed2 = yuv2rgb16_2_c;
+            *yuv2packedX = yuv2rgb16_X_c;
+            break;
+        case PIX_FMT_RGB555LE:
+        case PIX_FMT_RGB555BE:
+        case PIX_FMT_BGR555LE:
+        case PIX_FMT_BGR555BE:
+            *yuv2packed1 = yuv2rgb15_1_c;
+            *yuv2packed2 = yuv2rgb15_2_c;
+            *yuv2packedX = yuv2rgb15_X_c;
+            break;
+        case PIX_FMT_RGB444LE:
+        case PIX_FMT_RGB444BE:
+        case PIX_FMT_BGR444LE:
+        case PIX_FMT_BGR444BE:
+            *yuv2packed1 = yuv2rgb12_1_c;
+            *yuv2packed2 = yuv2rgb12_2_c;
+            *yuv2packedX = yuv2rgb12_X_c;
+            break;
+        case PIX_FMT_RGB8:
+        case PIX_FMT_BGR8:
+            *yuv2packed1 = yuv2rgb8_1_c;
+            *yuv2packed2 = yuv2rgb8_2_c;
+            *yuv2packedX = yuv2rgb8_X_c;
+            break;
+        case PIX_FMT_RGB4:
+        case PIX_FMT_BGR4:
+            *yuv2packed1 = yuv2rgb4_1_c;
+            *yuv2packed2 = yuv2rgb4_2_c;
+            *yuv2packedX = yuv2rgb4_X_c;
+            break;
+        case PIX_FMT_RGB4_BYTE:
+        case PIX_FMT_BGR4_BYTE:
+            *yuv2packed1 = yuv2rgb4b_1_c;
+            *yuv2packed2 = yuv2rgb4b_2_c;
+            *yuv2packedX = yuv2rgb4b_X_c;
+            break;
+        }
+    }
+    switch (dstFormat) { /* gray16, mono and packed YUV targets: set for either flag setting */
+    case PIX_FMT_GRAY16BE:
+        *yuv2packed1 = yuv2gray16BE_1_c;
+        *yuv2packed2 = yuv2gray16BE_2_c;
+        *yuv2packedX = yuv2gray16BE_X_c;
+        break;
+    case PIX_FMT_GRAY16LE:
+        *yuv2packed1 = yuv2gray16LE_1_c;
+        *yuv2packed2 = yuv2gray16LE_2_c;
+        *yuv2packedX = yuv2gray16LE_X_c;
+        break;
+    case PIX_FMT_MONOWHITE:
+        *yuv2packed1 = yuv2monowhite_1_c;
+        *yuv2packed2 = yuv2monowhite_2_c;
+        *yuv2packedX = yuv2monowhite_X_c;
+        break;
+    case PIX_FMT_MONOBLACK:
+        *yuv2packed1 = yuv2monoblack_1_c;
+        *yuv2packed2 = yuv2monoblack_2_c;
+        *yuv2packedX = yuv2monoblack_X_c;
+        break;
+    case PIX_FMT_YUYV422:
+        *yuv2packed1 = yuv2yuyv422_1_c;
+        *yuv2packed2 = yuv2yuyv422_2_c;
+        *yuv2packedX = yuv2yuyv422_X_c;
+        break;
+    case PIX_FMT_UYVY422:
+        *yuv2packed1 = yuv2uyvy422_1_c;
+        *yuv2packed2 = yuv2uyvy422_2_c;
+        *yuv2packedX = yuv2uyvy422_X_c;
+        break;
+    }
+}
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index b7f348b4ff..ddfafbe530 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -34,108 +34,6 @@
 #include "libavutil/bswap.h"
 #include "libavutil/pixdesc.h"
 
-/*
-NOTES
-Special versions: fast Y 1:1 scaling (no interpolation in y direction)
-
-TODO
-more intelligent misalignment avoidance for the horizontal scaler
-write special vertical cubic upscale version
-optimize C code (YV12 / minmax)
-add support for packed pixel YUV input & output
-add support for Y8 output
-optimize BGR24 & BGR32
-add BGR4 output support
-write special BGR->BGR scaler
-*/
-
-DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
-{  1,   3,   1,   3,   1,   3,   1,   3, },
-{  2,   0,   2,   0,   2,   0,   2,   0, },
-};
-
-DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
-{  6,   2,   6,   2,   6,   2,   6,   2, },
-{  0,   4,   0,   4,   0,   4,   0,   4, },
-};
-
-DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
-{  8,   4,  11,   7,   8,   4,  11,   7, },
-{  2,  14,   1,  13,   2,  14,   1,  13, },
-{ 10,   6,   9,   5,  10,   6,   9,   5, },
-{  0,  12,   3,  15,   0,  12,   3,  15, },
-};
-
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
-{ 17,   9,  23,  15,  16,   8,  22,  14, },
-{  5,  29,   3,  27,   4,  28,   2,  26, },
-{ 21,  13,  19,  11,  20,  12,  18,  10, },
-{  0,  24,   6,  30,   1,  25,   7,  31, },
-{ 16,   8,  22,  14,  17,   9,  23,  15, },
-{  4,  28,   2,  26,   5,  29,   3,  27, },
-{ 20,  12,  18,  10,  21,  13,  19,  11, },
-{  1,  25,   7,  31,   0,  24,   6,  30, },
-};
-
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
-{  0,  55,  14,  68,   3,  58,  17,  72, },
-{ 37,  18,  50,  32,  40,  22,  54,  35, },
-{  9,  64,   5,  59,  13,  67,   8,  63, },
-{ 46,  27,  41,  23,  49,  31,  44,  26, },
-{  2,  57,  16,  71,   1,  56,  15,  70, },
-{ 39,  21,  52,  34,  38,  19,  51,  33, },
-{ 11,  66,   7,  62,  10,  65,   6,  60, },
-{ 48,  30,  43,  25,  47,  29,  42,  24, },
-};
-
-#if 1
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
-{117,  62, 158, 103, 113,  58, 155, 100, },
-{ 34, 199,  21, 186,  31, 196,  17, 182, },
-{144,  89, 131,  76, 141,  86, 127,  72, },
-{  0, 165,  41, 206,  10, 175,  52, 217, },
-{110,  55, 151,  96, 120,  65, 162, 107, },
-{ 28, 193,  14, 179,  38, 203,  24, 189, },
-{138,  83, 124,  69, 148,  93, 134,  79, },
-{  7, 172,  48, 213,   3, 168,  45, 210, },
-};
-#elif 1
-// tries to correct a gamma of 1.5
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
-{  0, 143,  18, 200,   2, 156,  25, 215, },
-{ 78,  28, 125,  64,  89,  36, 138,  74, },
-{ 10, 180,   3, 161,  16, 195,   8, 175, },
-{109,  51,  93,  38, 121,  60, 105,  47, },
-{  1, 152,  23, 210,   0, 147,  20, 205, },
-{ 85,  33, 134,  71,  81,  30, 130,  67, },
-{ 14, 190,   6, 171,  12, 185,   5, 166, },
-{117,  57, 101,  44, 113,  54,  97,  41, },
-};
-#elif 1
-// tries to correct a gamma of 2.0
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
-{  0, 124,   8, 193,   0, 140,  12, 213, },
-{ 55,  14, 104,  42,  66,  19, 119,  52, },
-{  3, 168,   1, 145,   6, 187,   3, 162, },
-{ 86,  31,  70,  21,  99,  39,  82,  28, },
-{  0, 134,  11, 206,   0, 129,   9, 200, },
-{ 62,  17, 114,  48,  58,  16, 109,  45, },
-{  5, 181,   2, 157,   4, 175,   1, 151, },
-{ 95,  36,  78,  26,  90,  34,  74,  24, },
-};
-#else
-// tries to correct a gamma of 2.5
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
-{  0, 107,   3, 187,   0, 125,   6, 212, },
-{ 39,   7,  86,  28,  49,  11, 102,  36, },
-{  1, 158,   0, 131,   3, 180,   1, 151, },
-{ 68,  19,  52,  12,  81,  25,  64,  17, },
-{  0, 119,   5, 203,   0, 113,   4, 195, },
-{ 45,   9,  96,  33,  42,   8,  91,  30, },
-{  2, 172,   1, 144,   2, 165,   0, 137, },
-{ 77,  23,  60,  15,  72,  21,  56,  14, },
-};
-#endif
 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 {  36, 68, 60, 92, 34, 66, 58, 90,},
 { 100,  4,124, 28, 98,  2,122, 26,},
@@ -149,1161 +47,6 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 {  64, 64, 64, 64, 64, 64, 64, 64 };
 
-#define output_pixel(pos, val, bias, signedness) \
-    if (big_endian) { \
-        AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
-    } else { \
-        AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
-    }
-
-static av_always_inline void
-yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
-                         int big_endian, int output_bits)
-{
-    int i;
-    int shift = 19 - output_bits;
-
-    for (i = 0; i < dstW; i++) {
-        int val = src[i] + (1 << (shift - 1));
-        output_pixel(&dest[i], val, 0, uint);
-    }
-}
-
-static av_always_inline void
-yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
-                         const int32_t **src, uint16_t *dest, int dstW,
-                         int big_endian, int output_bits)
-{
-    int i;
-    int shift = 15 + 16 - output_bits;
-
-    for (i = 0; i < dstW; i++) {
-        int val = 1 << (30-output_bits);
-        int j;
-
-        /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
-         * filters (or anything with negative coeffs, the range can be slightly
-         * wider in both directions. To account for this overflow, we subtract
-         * a constant so it always fits in the signed range (assuming a
-         * reasonable filterSize), and re-add that at the end. */
-        val -= 0x40000000;
-        for (j = 0; j < filterSize; j++)
-            val += src[j][i] * filter[j];
-
-        output_pixel(&dest[i], val, 0x8000, int);
-    }
-}
-
-#undef output_pixel
-
-#define output_pixel(pos, val) \
-    if (big_endian) { \
-        AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
-    } else { \
-        AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
-    }
-
-static av_always_inline void
-yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
-                         int big_endian, int output_bits)
-{
-    int i;
-    int shift = 15 - output_bits;
-
-    for (i = 0; i < dstW; i++) {
-        int val = src[i] + (1 << (shift - 1));
-        output_pixel(&dest[i], val);
-    }
-}
-
-static av_always_inline void
-yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
-                         const int16_t **src, uint16_t *dest, int dstW,
-                         int big_endian, int output_bits)
-{
-    int i;
-    int shift = 11 + 16 - output_bits;
-
-    for (i = 0; i < dstW; i++) {
-        int val = 1 << (26-output_bits);
-        int j;
-
-        for (j = 0; j < filterSize; j++)
-            val += src[j][i] * filter[j];
-
-        output_pixel(&dest[i], val);
-    }
-}
-
-#undef output_pixel
-
-#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
-static void yuv2plane1_ ## bits ## BE_LE ## _c(const int16_t *src, \
-                              uint8_t *dest, int dstW, \
-                              const uint8_t *dither, int offset)\
-{ \
-    yuv2plane1_ ## template_size ## _c_template((const typeX_t *) src, \
-                         (uint16_t *) dest, dstW, is_be, bits); \
-}\
-static void yuv2planeX_ ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
-                              const int16_t **src, uint8_t *dest, int dstW, \
-                              const uint8_t *dither, int offset)\
-{ \
-    yuv2planeX_## template_size ## _c_template(filter, \
-                         filterSize, (const typeX_t **) src, \
-                         (uint16_t *) dest, dstW, is_be, bits); \
-}
-yuv2NBPS( 9, BE, 1, 10, int16_t)
-yuv2NBPS( 9, LE, 0, 10, int16_t)
-yuv2NBPS(10, BE, 1, 10, int16_t)
-yuv2NBPS(10, LE, 0, 10, int16_t)
-yuv2NBPS(16, BE, 1, 16, int32_t)
-yuv2NBPS(16, LE, 0, 16, int32_t)
-
-static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
-                           const int16_t **src, uint8_t *dest, int dstW,
-                           const uint8_t *dither, int offset)
-{
-    int i;
-    for (i=0; i<dstW; i++) {
-        int val = dither[(i + offset) & 7] << 12;
-        int j;
-        for (j=0; j<filterSize; j++)
-            val += src[j][i] * filter[j];
-
-        dest[i]= av_clip_uint8(val>>19);
-    }
-}
-
-static void yuv2plane1_8_c(const int16_t *src, uint8_t *dest, int dstW,
-                           const uint8_t *dither, int offset)
-{
-    int i;
-    for (i=0; i<dstW; i++) {
-        int val = (src[i] + dither[(i + offset) & 7]) >> 7;
-        dest[i]= av_clip_uint8(val);
-    }
-}
-
-static void yuv2nv12cX_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
-                        const int16_t **chrUSrc, const int16_t **chrVSrc,
-                        uint8_t *dest, int chrDstW)
-{
-    enum PixelFormat dstFormat = c->dstFormat;
-    const uint8_t *chrDither = c->chrDither8;
-    int i;
-
-    if (dstFormat == PIX_FMT_NV12)
-        for (i=0; i<chrDstW; i++) {
-            int u = chrDither[i & 7] << 12;
-            int v = chrDither[(i + 3) & 7] << 12;
-            int j;
-            for (j=0; j<chrFilterSize; j++) {
-                u += chrUSrc[j][i] * chrFilter[j];
-                v += chrVSrc[j][i] * chrFilter[j];
-            }
-
-            dest[2*i]= av_clip_uint8(u>>19);
-            dest[2*i+1]= av_clip_uint8(v>>19);
-        }
-    else
-        for (i=0; i<chrDstW; i++) {
-            int u = chrDither[i & 7] << 12;
-            int v = chrDither[(i + 3) & 7] << 12;
-            int j;
-            for (j=0; j<chrFilterSize; j++) {
-                u += chrUSrc[j][i] * chrFilter[j];
-                v += chrVSrc[j][i] * chrFilter[j];
-            }
-
-            dest[2*i]= av_clip_uint8(v>>19);
-            dest[2*i+1]= av_clip_uint8(u>>19);
-        }
-}
-
-#define output_pixel(pos, val) \
-        if (target == PIX_FMT_GRAY16BE) { \
-            AV_WB16(pos, val); \
-        } else { \
-            AV_WL16(pos, val); \
-        }
-
-static av_always_inline void
-yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
-                        const int32_t **lumSrc, int lumFilterSize,
-                        const int16_t *chrFilter, const int32_t **chrUSrc,
-                        const int32_t **chrVSrc, int chrFilterSize,
-                        const int32_t **alpSrc, uint16_t *dest, int dstW,
-                        int y, enum PixelFormat target)
-{
-    int i;
-
-    for (i = 0; i < (dstW >> 1); i++) {
-        int j;
-        int Y1 = (1 << 14) - 0x40000000;
-        int Y2 = (1 << 14) - 0x40000000;
-
-        for (j = 0; j < lumFilterSize; j++) {
-            Y1 += lumSrc[j][i * 2]     * lumFilter[j];
-            Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
-        }
-        Y1 >>= 15;
-        Y2 >>= 15;
-        Y1 = av_clip_int16(Y1);
-        Y2 = av_clip_int16(Y2);
-        output_pixel(&dest[i * 2 + 0], 0x8000 + Y1);
-        output_pixel(&dest[i * 2 + 1], 0x8000 + Y2);
-    }
-}
-
-static av_always_inline void
-yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
-                        const int32_t *ubuf[2], const int32_t *vbuf[2],
-                        const int32_t *abuf[2], uint16_t *dest, int dstW,
-                        int yalpha, int uvalpha, int y,
-                        enum PixelFormat target)
-{
-    int  yalpha1 = 4095 - yalpha;
-    int i;
-    const int32_t *buf0 = buf[0], *buf1 = buf[1];
-
-    for (i = 0; i < (dstW >> 1); i++) {
-        int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
-        int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
-
-        output_pixel(&dest[i * 2 + 0], Y1);
-        output_pixel(&dest[i * 2 + 1], Y2);
-    }
-}
-
-static av_always_inline void
-yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
-                        const int32_t *ubuf[2], const int32_t *vbuf[2],
-                        const int32_t *abuf0, uint16_t *dest, int dstW,
-                        int uvalpha, int y, enum PixelFormat target)
-{
-    int i;
-
-    for (i = 0; i < (dstW >> 1); i++) {
-        int Y1 = buf0[i * 2    ] << 1;
-        int Y2 = buf0[i * 2 + 1] << 1;
-
-        output_pixel(&dest[i * 2 + 0], Y1);
-        output_pixel(&dest[i * 2 + 1], Y2);
-    }
-}
-
-#undef output_pixel
-
-#define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
-static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
-                        const int16_t **_lumSrc, int lumFilterSize, \
-                        const int16_t *chrFilter, const int16_t **_chrUSrc, \
-                        const int16_t **_chrVSrc, int chrFilterSize, \
-                        const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
-                        int y) \
-{ \
-    const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
-                  **chrUSrc = (const int32_t **) _chrUSrc, \
-                  **chrVSrc = (const int32_t **) _chrVSrc, \
-                  **alpSrc  = (const int32_t **) _alpSrc; \
-    uint16_t *dest = (uint16_t *) _dest; \
-    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
-                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
-                          alpSrc, dest, dstW, y, fmt); \
-} \
- \
-static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
-                        const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
-                        const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
-                        int yalpha, int uvalpha, int y) \
-{ \
-    const int32_t **buf  = (const int32_t **) _buf, \
-                  **ubuf = (const int32_t **) _ubuf, \
-                  **vbuf = (const int32_t **) _vbuf, \
-                  **abuf = (const int32_t **) _abuf; \
-    uint16_t *dest = (uint16_t *) _dest; \
-    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
-                          dest, dstW, yalpha, uvalpha, y, fmt); \
-} \
- \
-static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
-                        const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
-                        const int16_t *_abuf0, uint8_t *_dest, int dstW, \
-                        int uvalpha, int y) \
-{ \
-    const int32_t *buf0  = (const int32_t *)  _buf0, \
-                 **ubuf  = (const int32_t **) _ubuf, \
-                 **vbuf  = (const int32_t **) _vbuf, \
-                  *abuf0 = (const int32_t *)  _abuf0; \
-    uint16_t *dest = (uint16_t *) _dest; \
-    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
-                                  dstW, uvalpha, y, fmt); \
-}
-
-YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE)
-YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE)
-
-#define output_pixel(pos, acc) \
-    if (target == PIX_FMT_MONOBLACK) { \
-        pos = acc; \
-    } else { \
-        pos = ~acc; \
-    }
-
-static av_always_inline void
-yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
-                      const int16_t **lumSrc, int lumFilterSize,
-                      const int16_t *chrFilter, const int16_t **chrUSrc,
-                      const int16_t **chrVSrc, int chrFilterSize,
-                      const int16_t **alpSrc, uint8_t *dest, int dstW,
-                      int y, enum PixelFormat target)
-{
-    const uint8_t * const d128=dither_8x8_220[y&7];
-    uint8_t *g = c->table_gU[128] + c->table_gV[128];
-    int i;
-    unsigned acc = 0;
-
-    for (i = 0; i < dstW - 1; i += 2) {
-        int j;
-        int Y1 = 1 << 18;
-        int Y2 = 1 << 18;
-
-        for (j = 0; j < lumFilterSize; j++) {
-            Y1 += lumSrc[j][i]   * lumFilter[j];
-            Y2 += lumSrc[j][i+1] * lumFilter[j];
-        }
-        Y1 >>= 19;
-        Y2 >>= 19;
-        if ((Y1 | Y2) & 0x100) {
-            Y1 = av_clip_uint8(Y1);
-            Y2 = av_clip_uint8(Y2);
-        }
-        acc += acc + g[Y1 + d128[(i + 0) & 7]];
-        acc += acc + g[Y2 + d128[(i + 1) & 7]];
-        if ((i & 7) == 6) {
-            output_pixel(*dest++, acc);
-        }
-    }
-}
-
-static av_always_inline void
-yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
-                      const int16_t *ubuf[2], const int16_t *vbuf[2],
-                      const int16_t *abuf[2], uint8_t *dest, int dstW,
-                      int yalpha, int uvalpha, int y,
-                      enum PixelFormat target)
-{
-    const int16_t *buf0  = buf[0],  *buf1  = buf[1];
-    const uint8_t * const d128 = dither_8x8_220[y & 7];
-    uint8_t *g = c->table_gU[128] + c->table_gV[128];
-    int  yalpha1 = 4095 - yalpha;
-    int i;
-
-    for (i = 0; i < dstW - 7; i += 8) {
-        int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
-        acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
-        acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
-        acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
-        acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
-        acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
-        acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
-        acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
-        output_pixel(*dest++, acc);
-    }
-}
-
-static av_always_inline void
-yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
-                      const int16_t *ubuf[2], const int16_t *vbuf[2],
-                      const int16_t *abuf0, uint8_t *dest, int dstW,
-                      int uvalpha, int y, enum PixelFormat target)
-{
-    const uint8_t * const d128 = dither_8x8_220[y & 7];
-    uint8_t *g = c->table_gU[128] + c->table_gV[128];
-    int i;
-
-    for (i = 0; i < dstW - 7; i += 8) {
-        int acc =    g[(buf0[i    ] >> 7) + d128[0]];
-        acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
-        acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
-        acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
-        acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
-        acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
-        acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
-        acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
-        output_pixel(*dest++, acc);
-    }
-}
-
-#undef output_pixel
-
-#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
-static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
-                                const int16_t **lumSrc, int lumFilterSize, \
-                                const int16_t *chrFilter, const int16_t **chrUSrc, \
-                                const int16_t **chrVSrc, int chrFilterSize, \
-                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
-                                int y) \
-{ \
-    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
-                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
-                                  alpSrc, dest, dstW, y, fmt); \
-} \
- \
-static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
-                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
-                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
-                                int yalpha, int uvalpha, int y) \
-{ \
-    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
-                                  dest, dstW, yalpha, uvalpha, y, fmt); \
-} \
- \
-static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
-                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
-                                const int16_t *abuf0, uint8_t *dest, int dstW, \
-                                int uvalpha, int y) \
-{ \
-    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
-                                  abuf0, dest, dstW, uvalpha, \
-                                  y, fmt); \
-}
-
-YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE)
-YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK)
-
-#define output_pixels(pos, Y1, U, Y2, V) \
-    if (target == PIX_FMT_YUYV422) { \
-        dest[pos + 0] = Y1; \
-        dest[pos + 1] = U;  \
-        dest[pos + 2] = Y2; \
-        dest[pos + 3] = V;  \
-    } else { \
-        dest[pos + 0] = U;  \
-        dest[pos + 1] = Y1; \
-        dest[pos + 2] = V;  \
-        dest[pos + 3] = Y2; \
-    }
-
-static av_always_inline void
-yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
-                     const int16_t **lumSrc, int lumFilterSize,
-                     const int16_t *chrFilter, const int16_t **chrUSrc,
-                     const int16_t **chrVSrc, int chrFilterSize,
-                     const int16_t **alpSrc, uint8_t *dest, int dstW,
-                     int y, enum PixelFormat target)
-{
-    int i;
-
-    for (i = 0; i < (dstW >> 1); i++) {
-        int j;
-        int Y1 = 1 << 18;
-        int Y2 = 1 << 18;
-        int U  = 1 << 18;
-        int V  = 1 << 18;
-
-        for (j = 0; j < lumFilterSize; j++) {
-            Y1 += lumSrc[j][i * 2]     * lumFilter[j];
-            Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
-        }
-        for (j = 0; j < chrFilterSize; j++) {
-            U += chrUSrc[j][i] * chrFilter[j];
-            V += chrVSrc[j][i] * chrFilter[j];
-        }
-        Y1 >>= 19;
-        Y2 >>= 19;
-        U  >>= 19;
-        V  >>= 19;
-        if ((Y1 | Y2 | U | V) & 0x100) {
-            Y1 = av_clip_uint8(Y1);
-            Y2 = av_clip_uint8(Y2);
-            U  = av_clip_uint8(U);
-            V  = av_clip_uint8(V);
-        }
-        output_pixels(4*i, Y1, U, Y2, V);
-    }
-}
-
-static av_always_inline void
-yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
-                     const int16_t *ubuf[2], const int16_t *vbuf[2],
-                     const int16_t *abuf[2], uint8_t *dest, int dstW,
-                     int yalpha, int uvalpha, int y,
-                     enum PixelFormat target)
-{
-    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
-                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
-                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
-    int  yalpha1 = 4095 - yalpha;
-    int uvalpha1 = 4095 - uvalpha;
-    int i;
-
-    for (i = 0; i < (dstW >> 1); i++) {
-        int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
-        int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
-        int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
-        int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
-
-        output_pixels(i * 4, Y1, U, Y2, V);
-    }
-}
-
-static av_always_inline void
-yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
-                     const int16_t *ubuf[2], const int16_t *vbuf[2],
-                     const int16_t *abuf0, uint8_t *dest, int dstW,
-                     int uvalpha, int y, enum PixelFormat target)
-{
-    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
-                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
-    int i;
-
-    if (uvalpha < 2048) {
-        for (i = 0; i < (dstW >> 1); i++) {
-            int Y1 = buf0[i * 2]     >> 7;
-            int Y2 = buf0[i * 2 + 1] >> 7;
-            int U  = ubuf1[i]        >> 7;
-            int V  = vbuf1[i]        >> 7;
-
-            output_pixels(i * 4, Y1, U, Y2, V);
-        }
-    } else {
-        for (i = 0; i < (dstW >> 1); i++) {
-            int Y1 =  buf0[i * 2]          >> 7;
-            int Y2 =  buf0[i * 2 + 1]      >> 7;
-            int U  = (ubuf0[i] + ubuf1[i]) >> 8;
-            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
-
-            output_pixels(i * 4, Y1, U, Y2, V);
-        }
-    }
-}
-
-#undef output_pixels
-
-YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422)
-YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422)
-
-#define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
-#define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
-#define output_pixel(pos, val) \
-    if (isBE(target)) { \
-        AV_WB16(pos, val); \
-    } else { \
-        AV_WL16(pos, val); \
-    }
-
-static av_always_inline void
-yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
-                       const int32_t **lumSrc, int lumFilterSize,
-                       const int16_t *chrFilter, const int32_t **chrUSrc,
-                       const int32_t **chrVSrc, int chrFilterSize,
-                       const int32_t **alpSrc, uint16_t *dest, int dstW,
-                       int y, enum PixelFormat target)
-{
-    int i;
-
-    for (i = 0; i < (dstW >> 1); i++) {
-        int j;
-        int Y1 = -0x40000000;
-        int Y2 = -0x40000000;
-        int U  = -128 << 23; // 19
-        int V  = -128 << 23;
-        int R, G, B;
-
-        for (j = 0; j < lumFilterSize; j++) {
-            Y1 += lumSrc[j][i * 2]     * lumFilter[j];
-            Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
-        }
-        for (j = 0; j < chrFilterSize; j++) {
-            U += chrUSrc[j][i] * chrFilter[j];
-            V += chrVSrc[j][i] * chrFilter[j];
-        }
-
-        // 8bit: 12+15=27; 16-bit: 12+19=31
-        Y1 >>= 14; // 10
-        Y1 += 0x10000;
-        Y2 >>= 14;
-        Y2 += 0x10000;
-        U  >>= 14;
-        V  >>= 14;
-
-        // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
-        Y1 -= c->yuv2rgb_y_offset;
-        Y2 -= c->yuv2rgb_y_offset;
-        Y1 *= c->yuv2rgb_y_coeff;
-        Y2 *= c->yuv2rgb_y_coeff;
-        Y1 += 1 << 13; // 21
-        Y2 += 1 << 13;
-        // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
-
-        R = V * c->yuv2rgb_v2r_coeff;
-        G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
-        B =                            U * c->yuv2rgb_u2b_coeff;
-
-        // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
-        output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
-        output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
-        output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
-        output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
-        output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
-        output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
-        dest += 6;
-    }
-}
-
-static av_always_inline void
-yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
-                       const int32_t *ubuf[2], const int32_t *vbuf[2],
-                       const int32_t *abuf[2], uint16_t *dest, int dstW,
-                       int yalpha, int uvalpha, int y,
-                       enum PixelFormat target)
-{
-    const int32_t *buf0  = buf[0],  *buf1  = buf[1],
-                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
-                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
-    int  yalpha1 = 4095 - yalpha;
-    int uvalpha1 = 4095 - uvalpha;
-    int i;
-
-    for (i = 0; i < (dstW >> 1); i++) {
-        int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
-        int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
-        int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
-        int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
-        int R, G, B;
-
-        Y1 -= c->yuv2rgb_y_offset;
-        Y2 -= c->yuv2rgb_y_offset;
-        Y1 *= c->yuv2rgb_y_coeff;
-        Y2 *= c->yuv2rgb_y_coeff;
-        Y1 += 1 << 13;
-        Y2 += 1 << 13;
-
-        R = V * c->yuv2rgb_v2r_coeff;
-        G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
-        B =                            U * c->yuv2rgb_u2b_coeff;
-
-        output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
-        output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
-        output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
-        output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
-        output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
-        output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
-        dest += 6;
-    }
-}
-
-static av_always_inline void
-yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
-                       const int32_t *ubuf[2], const int32_t *vbuf[2],
-                       const int32_t *abuf0, uint16_t *dest, int dstW,
-                       int uvalpha, int y, enum PixelFormat target)
-{
-    const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
-                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
-    int i;
-
-    if (uvalpha < 2048) {
-        for (i = 0; i < (dstW >> 1); i++) {
-            int Y1 = (buf0[i * 2]    ) >> 2;
-            int Y2 = (buf0[i * 2 + 1]) >> 2;
-            int U  = (ubuf0[i] + (-128 << 11)) >> 2;
-            int V  = (vbuf0[i] + (-128 << 11)) >> 2;
-            int R, G, B;
-
-            Y1 -= c->yuv2rgb_y_offset;
-            Y2 -= c->yuv2rgb_y_offset;
-            Y1 *= c->yuv2rgb_y_coeff;
-            Y2 *= c->yuv2rgb_y_coeff;
-            Y1 += 1 << 13;
-            Y2 += 1 << 13;
-
-            R = V * c->yuv2rgb_v2r_coeff;
-            G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
-            B =                            U * c->yuv2rgb_u2b_coeff;
-
-            output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
-            output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
-            output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
-            output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
-            output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
-            output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
-            dest += 6;
-        }
-    } else {
-        for (i = 0; i < (dstW >> 1); i++) {
-            int Y1 = (buf0[i * 2]    ) >> 2;
-            int Y2 = (buf0[i * 2 + 1]) >> 2;
-            int U  = (ubuf0[i] + ubuf1[i] + (-128 << 11)) >> 3;
-            int V  = (vbuf0[i] + vbuf1[i] + (-128 << 11)) >> 3;
-            int R, G, B;
-
-            Y1 -= c->yuv2rgb_y_offset;
-            Y2 -= c->yuv2rgb_y_offset;
-            Y1 *= c->yuv2rgb_y_coeff;
-            Y2 *= c->yuv2rgb_y_coeff;
-            Y1 += 1 << 13;
-            Y2 += 1 << 13;
-
-            R = V * c->yuv2rgb_v2r_coeff;
-            G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
-            B =                            U * c->yuv2rgb_u2b_coeff;
-
-            output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
-            output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
-            output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
-            output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
-            output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
-            output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
-            dest += 6;
-        }
-    }
-}
-
-#undef output_pixel
-#undef r_b
-#undef b_r
-
-YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE)
-YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE)
-YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE)
-YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE)
-
-/*
- * Write out 2 RGB pixels in the target pixel format. This function takes a
- * R/G/B LUT as generated by ff_yuv2rgb_c_init_tables(), which takes care of
- * things like endianness conversion and shifting. The caller takes care of
- * setting the correct offset in these tables from the chroma (U/V) values.
- * This function then uses the luminance (Y1/Y2) values to write out the
- * correct RGB values into the destination buffer.
- */
-static av_always_inline void
-yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
-              unsigned A1, unsigned A2,
-              const void *_r, const void *_g, const void *_b, int y,
-              enum PixelFormat target, int hasAlpha)
-{
-    if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
-        target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
-        uint32_t *dest = (uint32_t *) _dest;
-        const uint32_t *r = (const uint32_t *) _r;
-        const uint32_t *g = (const uint32_t *) _g;
-        const uint32_t *b = (const uint32_t *) _b;
-
-#if CONFIG_SMALL
-        int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
-
-        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
-        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
-#else
-        if (hasAlpha) {
-            int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
-
-            dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
-            dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
-        } else {
-            dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
-            dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
-        }
-#endif
-    } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
-        uint8_t *dest = (uint8_t *) _dest;
-        const uint8_t *r = (const uint8_t *) _r;
-        const uint8_t *g = (const uint8_t *) _g;
-        const uint8_t *b = (const uint8_t *) _b;
-
-#define r_b ((target == PIX_FMT_RGB24) ? r : b)
-#define b_r ((target == PIX_FMT_RGB24) ? b : r)
-        dest[i * 6 + 0] = r_b[Y1];
-        dest[i * 6 + 1] =   g[Y1];
-        dest[i * 6 + 2] = b_r[Y1];
-        dest[i * 6 + 3] = r_b[Y2];
-        dest[i * 6 + 4] =   g[Y2];
-        dest[i * 6 + 5] = b_r[Y2];
-#undef r_b
-#undef b_r
-    } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
-               target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
-               target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
-        uint16_t *dest = (uint16_t *) _dest;
-        const uint16_t *r = (const uint16_t *) _r;
-        const uint16_t *g = (const uint16_t *) _g;
-        const uint16_t *b = (const uint16_t *) _b;
-        int dr1, dg1, db1, dr2, dg2, db2;
-
-        if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
-            dr1 = dither_2x2_8[ y & 1     ][0];
-            dg1 = dither_2x2_4[ y & 1     ][0];
-            db1 = dither_2x2_8[(y & 1) ^ 1][0];
-            dr2 = dither_2x2_8[ y & 1     ][1];
-            dg2 = dither_2x2_4[ y & 1     ][1];
-            db2 = dither_2x2_8[(y & 1) ^ 1][1];
-        } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
-            dr1 = dither_2x2_8[ y & 1     ][0];
-            dg1 = dither_2x2_8[ y & 1     ][1];
-            db1 = dither_2x2_8[(y & 1) ^ 1][0];
-            dr2 = dither_2x2_8[ y & 1     ][1];
-            dg2 = dither_2x2_8[ y & 1     ][0];
-            db2 = dither_2x2_8[(y & 1) ^ 1][1];
-        } else {
-            dr1 = dither_4x4_16[ y & 3     ][0];
-            dg1 = dither_4x4_16[ y & 3     ][1];
-            db1 = dither_4x4_16[(y & 3) ^ 3][0];
-            dr2 = dither_4x4_16[ y & 3     ][1];
-            dg2 = dither_4x4_16[ y & 3     ][0];
-            db2 = dither_4x4_16[(y & 3) ^ 3][1];
-        }
-
-        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
-        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
-    } else /* 8/4-bit */ {
-        uint8_t *dest = (uint8_t *) _dest;
-        const uint8_t *r = (const uint8_t *) _r;
-        const uint8_t *g = (const uint8_t *) _g;
-        const uint8_t *b = (const uint8_t *) _b;
-        int dr1, dg1, db1, dr2, dg2, db2;
-
-        if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
-            const uint8_t * const d64 = dither_8x8_73[y & 7];
-            const uint8_t * const d32 = dither_8x8_32[y & 7];
-            dr1 = dg1 = d32[(i * 2 + 0) & 7];
-            db1 =       d64[(i * 2 + 0) & 7];
-            dr2 = dg2 = d32[(i * 2 + 1) & 7];
-            db2 =       d64[(i * 2 + 1) & 7];
-        } else {
-            const uint8_t * const d64  = dither_8x8_73 [y & 7];
-            const uint8_t * const d128 = dither_8x8_220[y & 7];
-            dr1 = db1 = d128[(i * 2 + 0) & 7];
-            dg1 =        d64[(i * 2 + 0) & 7];
-            dr2 = db2 = d128[(i * 2 + 1) & 7];
-            dg2 =        d64[(i * 2 + 1) & 7];
-        }
-
-        if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
-            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
-                    ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
-        } else {
-            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
-            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
-        }
-    }
-}
-
-static av_always_inline void
-yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
-                     const int16_t **lumSrc, int lumFilterSize,
-                     const int16_t *chrFilter, const int16_t **chrUSrc,
-                     const int16_t **chrVSrc, int chrFilterSize,
-                     const int16_t **alpSrc, uint8_t *dest, int dstW,
-                     int y, enum PixelFormat target, int hasAlpha)
-{
-    int i;
-
-    for (i = 0; i < (dstW >> 1); i++) {
-        int j;
-        int Y1 = 1 << 18;
-        int Y2 = 1 << 18;
-        int U  = 1 << 18;
-        int V  = 1 << 18;
-        int av_unused A1, A2;
-        const void *r, *g, *b;
-
-        for (j = 0; j < lumFilterSize; j++) {
-            Y1 += lumSrc[j][i * 2]     * lumFilter[j];
-            Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
-        }
-        for (j = 0; j < chrFilterSize; j++) {
-            U += chrUSrc[j][i] * chrFilter[j];
-            V += chrVSrc[j][i] * chrFilter[j];
-        }
-        Y1 >>= 19;
-        Y2 >>= 19;
-        U  >>= 19;
-        V  >>= 19;
-        if ((Y1 | Y2 | U | V) & 0x100) {
-            Y1 = av_clip_uint8(Y1);
-            Y2 = av_clip_uint8(Y2);
-            U  = av_clip_uint8(U);
-            V  = av_clip_uint8(V);
-        }
-        if (hasAlpha) {
-            A1 = 1 << 18;
-            A2 = 1 << 18;
-            for (j = 0; j < lumFilterSize; j++) {
-                A1 += alpSrc[j][i * 2    ] * lumFilter[j];
-                A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
-            }
-            A1 >>= 19;
-            A2 >>= 19;
-            if ((A1 | A2) & 0x100) {
-                A1 = av_clip_uint8(A1);
-                A2 = av_clip_uint8(A2);
-            }
-        }
-
-        /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
-        r =  c->table_rV[V];
-        g = (c->table_gU[U] + c->table_gV[V]);
-        b =  c->table_bU[U];
-
-        yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
-                      r, g, b, y, target, hasAlpha);
-    }
-}
-
-static av_always_inline void
-yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
-                     const int16_t *ubuf[2], const int16_t *vbuf[2],
-                     const int16_t *abuf[2], uint8_t *dest, int dstW,
-                     int yalpha, int uvalpha, int y,
-                     enum PixelFormat target, int hasAlpha)
-{
-    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
-                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
-                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
-                  *abuf0 = hasAlpha ? abuf[0] : NULL,
-                  *abuf1 = hasAlpha ? abuf[1] : NULL;
-    int  yalpha1 = 4095 - yalpha;
-    int uvalpha1 = 4095 - uvalpha;
-    int i;
-
-    for (i = 0; i < (dstW >> 1); i++) {
-        int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
-        int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
-        int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
-        int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
-        int A1, A2;
-        const void *r =  c->table_rV[V],
-                   *g = (c->table_gU[U] + c->table_gV[V]),
-                   *b =  c->table_bU[U];
-
-        if (hasAlpha) {
-            A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
-            A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
-        }
-
-        yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
-                      r, g, b, y, target, hasAlpha);
-    }
-}
-
-static av_always_inline void
-yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
-                     const int16_t *ubuf[2], const int16_t *vbuf[2],
-                     const int16_t *abuf0, uint8_t *dest, int dstW,
-                     int uvalpha, int y, enum PixelFormat target,
-                     int hasAlpha)
-{
-    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
-                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
-    int i;
-
-    if (uvalpha < 2048) {
-        for (i = 0; i < (dstW >> 1); i++) {
-            int Y1 = buf0[i * 2]     >> 7;
-            int Y2 = buf0[i * 2 + 1] >> 7;
-            int U  = ubuf1[i]        >> 7;
-            int V  = vbuf1[i]        >> 7;
-            int A1, A2;
-            const void *r =  c->table_rV[V],
-                       *g = (c->table_gU[U] + c->table_gV[V]),
-                       *b =  c->table_bU[U];
-
-            if (hasAlpha) {
-                A1 = abuf0[i * 2    ] >> 7;
-                A2 = abuf0[i * 2 + 1] >> 7;
-            }
-
-            yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
-                          r, g, b, y, target, hasAlpha);
-        }
-    } else {
-        for (i = 0; i < (dstW >> 1); i++) {
-            int Y1 =  buf0[i * 2]          >> 7;
-            int Y2 =  buf0[i * 2 + 1]      >> 7;
-            int U  = (ubuf0[i] + ubuf1[i]) >> 8;
-            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
-            int A1, A2;
-            const void *r =  c->table_rV[V],
-                       *g = (c->table_gU[U] + c->table_gV[V]),
-                       *b =  c->table_bU[U];
-
-            if (hasAlpha) {
-                A1 = abuf0[i * 2    ] >> 7;
-                A2 = abuf0[i * 2 + 1] >> 7;
-            }
-
-            yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
-                          r, g, b, y, target, hasAlpha);
-        }
-    }
-}
-
-#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
-static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
-                                const int16_t **lumSrc, int lumFilterSize, \
-                                const int16_t *chrFilter, const int16_t **chrUSrc, \
-                                const int16_t **chrVSrc, int chrFilterSize, \
-                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
-                                int y) \
-{ \
-    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
-                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
-                                  alpSrc, dest, dstW, y, fmt, hasAlpha); \
-}
-#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
-YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
-static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
-                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
-                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
-                                int yalpha, int uvalpha, int y) \
-{ \
-    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
-                                  dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
-} \
- \
-static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
-                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
-                                const int16_t *abuf0, uint8_t *dest, int dstW, \
-                                int uvalpha, int y) \
-{ \
-    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
-                                  dstW, uvalpha, y, fmt, hasAlpha); \
-}
-
-#if CONFIG_SMALL
-YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
-YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
-#else
-#if CONFIG_SWSCALE_ALPHA
-YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1)
-YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1)
-#endif
-YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0)
-YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0)
-#endif
-YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0)
-YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0)
-YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0)
-YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0)
-YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0)
-YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0)
-YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0)
-YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0)
-
-static av_always_inline void
-yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
-                          const int16_t **lumSrc, int lumFilterSize,
-                          const int16_t *chrFilter, const int16_t **chrUSrc,
-                          const int16_t **chrVSrc, int chrFilterSize,
-                          const int16_t **alpSrc, uint8_t *dest,
-                          int dstW, int y, enum PixelFormat target, int hasAlpha)
-{
-    int i;
-    int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
-
-    for (i = 0; i < dstW; i++) {
-        int j;
-        int Y = 0;
-        int U = -128 << 19;
-        int V = -128 << 19;
-        int av_unused A;
-        int R, G, B;
-
-        for (j = 0; j < lumFilterSize; j++) {
-            Y += lumSrc[j][i] * lumFilter[j];
-        }
-        for (j = 0; j < chrFilterSize; j++) {
-            U += chrUSrc[j][i] * chrFilter[j];
-            V += chrVSrc[j][i] * chrFilter[j];
-        }
-        Y >>= 10;
-        U >>= 10;
-        V >>= 10;
-        if (hasAlpha) {
-            A = 1 << 21;
-            for (j = 0; j < lumFilterSize; j++) {
-                A += alpSrc[j][i] * lumFilter[j];
-            }
-            A >>= 19;
-            if (A & 0x100)
-                A = av_clip_uint8(A);
-        }
-        Y -= c->yuv2rgb_y_offset;
-        Y *= c->yuv2rgb_y_coeff;
-        Y += 1 << 21;
-        R = Y + V*c->yuv2rgb_v2r_coeff;
-        G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
-        B = Y +                          U*c->yuv2rgb_u2b_coeff;
-        if ((R | G | B) & 0xC0000000) {
-            R = av_clip_uintp2(R, 30);
-            G = av_clip_uintp2(G, 30);
-            B = av_clip_uintp2(B, 30);
-        }
-
-        switch(target) {
-        case PIX_FMT_ARGB:
-            dest[0] = hasAlpha ? A : 255;
-            dest[1] = R >> 22;
-            dest[2] = G >> 22;
-            dest[3] = B >> 22;
-            break;
-        case PIX_FMT_RGB24:
-            dest[0] = R >> 22;
-            dest[1] = G >> 22;
-            dest[2] = B >> 22;
-            break;
-        case PIX_FMT_RGBA:
-            dest[0] = R >> 22;
-            dest[1] = G >> 22;
-            dest[2] = B >> 22;
-            dest[3] = hasAlpha ? A : 255;
-            break;
-        case PIX_FMT_ABGR:
-            dest[0] = hasAlpha ? A : 255;
-            dest[1] = B >> 22;
-            dest[2] = G >> 22;
-            dest[3] = R >> 22;
-            dest += 4;
-            break;
-        case PIX_FMT_BGR24:
-            dest[0] = B >> 22;
-            dest[1] = G >> 22;
-            dest[2] = R >> 22;
-            break;
-        case PIX_FMT_BGRA:
-            dest[0] = B >> 22;
-            dest[1] = G >> 22;
-            dest[2] = R >> 22;
-            dest[3] = hasAlpha ? A : 255;
-            break;
-        }
-        dest += step;
-    }
-}
-
-#if CONFIG_SMALL
-YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
-YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
-YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
-YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
-#else
-#if CONFIG_SWSCALE_ALPHA
-YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1)
-YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1)
-YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1)
-YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1)
-#endif
-YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0)
-YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0)
-YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0)
-YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0)
-#endif
-YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0)
-YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0)
-
 static av_always_inline void fillPlane(uint8_t* plane, int stride,
                                        int width, int height,
                                        int y, uint8_t val)
@@ -1552,250 +295,6 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2
         c->chrConvertRange(dst1, dst2, dstWidth);
 }
 
-static av_always_inline void
-find_c_packed_planar_out_funcs(SwsContext *c,
-                               yuv2planar1_fn *yuv2plane1, yuv2planarX_fn *yuv2planeX,
-                               yuv2interleavedX_fn *yuv2nv12cX,
-                               yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
-                               yuv2packedX_fn *yuv2packedX)
-{
-    enum PixelFormat dstFormat = c->dstFormat;
-
-    if (is16BPS(dstFormat)) {
-        *yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_c  : yuv2planeX_16LE_c;
-        *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_c  : yuv2plane1_16LE_c;
-    } else if (is9_OR_10BPS(dstFormat)) {
-        if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
-            *yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c  : yuv2planeX_9LE_c;
-            *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c  : yuv2plane1_9LE_c;
-        } else {
-            *yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c  : yuv2planeX_10LE_c;
-            *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c  : yuv2plane1_10LE_c;
-        }
-    } else {
-        *yuv2plane1 = yuv2plane1_8_c;
-        *yuv2planeX = yuv2planeX_8_c;
-        if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)
-            *yuv2nv12cX = yuv2nv12cX_c;
-    }
-
-    if(c->flags & SWS_FULL_CHR_H_INT) {
-        switch (dstFormat) {
-            case PIX_FMT_RGBA:
-#if CONFIG_SMALL
-                *yuv2packedX = yuv2rgba32_full_X_c;
-#else
-#if CONFIG_SWSCALE_ALPHA
-                if (c->alpPixBuf) {
-                    *yuv2packedX = yuv2rgba32_full_X_c;
-                } else
-#endif /* CONFIG_SWSCALE_ALPHA */
-                {
-                    *yuv2packedX = yuv2rgbx32_full_X_c;
-                }
-#endif /* !CONFIG_SMALL */
-                break;
-            case PIX_FMT_ARGB:
-#if CONFIG_SMALL
-                *yuv2packedX = yuv2argb32_full_X_c;
-#else
-#if CONFIG_SWSCALE_ALPHA
-                if (c->alpPixBuf) {
-                    *yuv2packedX = yuv2argb32_full_X_c;
-                } else
-#endif /* CONFIG_SWSCALE_ALPHA */
-                {
-                    *yuv2packedX = yuv2xrgb32_full_X_c;
-                }
-#endif /* !CONFIG_SMALL */
-                break;
-            case PIX_FMT_BGRA:
-#if CONFIG_SMALL
-                *yuv2packedX = yuv2bgra32_full_X_c;
-#else
-#if CONFIG_SWSCALE_ALPHA
-                if (c->alpPixBuf) {
-                    *yuv2packedX = yuv2bgra32_full_X_c;
-                } else
-#endif /* CONFIG_SWSCALE_ALPHA */
-                {
-                    *yuv2packedX = yuv2bgrx32_full_X_c;
-                }
-#endif /* !CONFIG_SMALL */
-                break;
-            case PIX_FMT_ABGR:
-#if CONFIG_SMALL
-                *yuv2packedX = yuv2abgr32_full_X_c;
-#else
-#if CONFIG_SWSCALE_ALPHA
-                if (c->alpPixBuf) {
-                    *yuv2packedX = yuv2abgr32_full_X_c;
-                } else
-#endif /* CONFIG_SWSCALE_ALPHA */
-                {
-                    *yuv2packedX = yuv2xbgr32_full_X_c;
-                }
-#endif /* !CONFIG_SMALL */
-                break;
-            case PIX_FMT_RGB24:
-            *yuv2packedX = yuv2rgb24_full_X_c;
-            break;
-        case PIX_FMT_BGR24:
-            *yuv2packedX = yuv2bgr24_full_X_c;
-            break;
-        }
-    } else {
-        switch (dstFormat) {
-        case PIX_FMT_RGB48LE:
-            *yuv2packed1 = yuv2rgb48le_1_c;
-            *yuv2packed2 = yuv2rgb48le_2_c;
-            *yuv2packedX = yuv2rgb48le_X_c;
-            break;
-        case PIX_FMT_RGB48BE:
-            *yuv2packed1 = yuv2rgb48be_1_c;
-            *yuv2packed2 = yuv2rgb48be_2_c;
-            *yuv2packedX = yuv2rgb48be_X_c;
-            break;
-        case PIX_FMT_BGR48LE:
-            *yuv2packed1 = yuv2bgr48le_1_c;
-            *yuv2packed2 = yuv2bgr48le_2_c;
-            *yuv2packedX = yuv2bgr48le_X_c;
-            break;
-        case PIX_FMT_BGR48BE:
-            *yuv2packed1 = yuv2bgr48be_1_c;
-            *yuv2packed2 = yuv2bgr48be_2_c;
-            *yuv2packedX = yuv2bgr48be_X_c;
-            break;
-        case PIX_FMT_RGB32:
-        case PIX_FMT_BGR32:
-#if CONFIG_SMALL
-            *yuv2packed1 = yuv2rgb32_1_c;
-            *yuv2packed2 = yuv2rgb32_2_c;
-            *yuv2packedX = yuv2rgb32_X_c;
-#else
-#if CONFIG_SWSCALE_ALPHA
-                if (c->alpPixBuf) {
-                    *yuv2packed1 = yuv2rgba32_1_c;
-                    *yuv2packed2 = yuv2rgba32_2_c;
-                    *yuv2packedX = yuv2rgba32_X_c;
-                } else
-#endif /* CONFIG_SWSCALE_ALPHA */
-                {
-                    *yuv2packed1 = yuv2rgbx32_1_c;
-                    *yuv2packed2 = yuv2rgbx32_2_c;
-                    *yuv2packedX = yuv2rgbx32_X_c;
-                }
-#endif /* !CONFIG_SMALL */
-            break;
-        case PIX_FMT_RGB32_1:
-        case PIX_FMT_BGR32_1:
-#if CONFIG_SMALL
-                *yuv2packed1 = yuv2rgb32_1_1_c;
-                *yuv2packed2 = yuv2rgb32_1_2_c;
-                *yuv2packedX = yuv2rgb32_1_X_c;
-#else
-#if CONFIG_SWSCALE_ALPHA
-                if (c->alpPixBuf) {
-                    *yuv2packed1 = yuv2rgba32_1_1_c;
-                    *yuv2packed2 = yuv2rgba32_1_2_c;
-                    *yuv2packedX = yuv2rgba32_1_X_c;
-                } else
-#endif /* CONFIG_SWSCALE_ALPHA */
-                {
-                    *yuv2packed1 = yuv2rgbx32_1_1_c;
-                    *yuv2packed2 = yuv2rgbx32_1_2_c;
-                    *yuv2packedX = yuv2rgbx32_1_X_c;
-                }
-#endif /* !CONFIG_SMALL */
-                break;
-        case PIX_FMT_RGB24:
-            *yuv2packed1 = yuv2rgb24_1_c;
-            *yuv2packed2 = yuv2rgb24_2_c;
-            *yuv2packedX = yuv2rgb24_X_c;
-            break;
-        case PIX_FMT_BGR24:
-            *yuv2packed1 = yuv2bgr24_1_c;
-            *yuv2packed2 = yuv2bgr24_2_c;
-            *yuv2packedX = yuv2bgr24_X_c;
-            break;
-        case PIX_FMT_RGB565LE:
-        case PIX_FMT_RGB565BE:
-        case PIX_FMT_BGR565LE:
-        case PIX_FMT_BGR565BE:
-            *yuv2packed1 = yuv2rgb16_1_c;
-            *yuv2packed2 = yuv2rgb16_2_c;
-            *yuv2packedX = yuv2rgb16_X_c;
-            break;
-        case PIX_FMT_RGB555LE:
-        case PIX_FMT_RGB555BE:
-        case PIX_FMT_BGR555LE:
-        case PIX_FMT_BGR555BE:
-            *yuv2packed1 = yuv2rgb15_1_c;
-            *yuv2packed2 = yuv2rgb15_2_c;
-            *yuv2packedX = yuv2rgb15_X_c;
-            break;
-        case PIX_FMT_RGB444LE:
-        case PIX_FMT_RGB444BE:
-        case PIX_FMT_BGR444LE:
-        case PIX_FMT_BGR444BE:
-            *yuv2packed1 = yuv2rgb12_1_c;
-            *yuv2packed2 = yuv2rgb12_2_c;
-            *yuv2packedX = yuv2rgb12_X_c;
-            break;
-        case PIX_FMT_RGB8:
-        case PIX_FMT_BGR8:
-            *yuv2packed1 = yuv2rgb8_1_c;
-            *yuv2packed2 = yuv2rgb8_2_c;
-            *yuv2packedX = yuv2rgb8_X_c;
-            break;
-        case PIX_FMT_RGB4:
-        case PIX_FMT_BGR4:
-            *yuv2packed1 = yuv2rgb4_1_c;
-            *yuv2packed2 = yuv2rgb4_2_c;
-            *yuv2packedX = yuv2rgb4_X_c;
-            break;
-        case PIX_FMT_RGB4_BYTE:
-        case PIX_FMT_BGR4_BYTE:
-            *yuv2packed1 = yuv2rgb4b_1_c;
-            *yuv2packed2 = yuv2rgb4b_2_c;
-            *yuv2packedX = yuv2rgb4b_X_c;
-            break;
-        }
-    }
-    switch (dstFormat) {
-    case PIX_FMT_GRAY16BE:
-        *yuv2packed1 = yuv2gray16BE_1_c;
-        *yuv2packed2 = yuv2gray16BE_2_c;
-        *yuv2packedX = yuv2gray16BE_X_c;
-        break;
-    case PIX_FMT_GRAY16LE:
-        *yuv2packed1 = yuv2gray16LE_1_c;
-        *yuv2packed2 = yuv2gray16LE_2_c;
-        *yuv2packedX = yuv2gray16LE_X_c;
-        break;
-    case PIX_FMT_MONOWHITE:
-        *yuv2packed1 = yuv2monowhite_1_c;
-        *yuv2packed2 = yuv2monowhite_2_c;
-        *yuv2packedX = yuv2monowhite_X_c;
-        break;
-    case PIX_FMT_MONOBLACK:
-        *yuv2packed1 = yuv2monoblack_1_c;
-        *yuv2packed2 = yuv2monoblack_2_c;
-        *yuv2packedX = yuv2monoblack_X_c;
-        break;
-    case PIX_FMT_YUYV422:
-        *yuv2packed1 = yuv2yuyv422_1_c;
-        *yuv2packed2 = yuv2yuyv422_2_c;
-        *yuv2packedX = yuv2yuyv422_X_c;
-        break;
-    case PIX_FMT_UYVY422:
-        *yuv2packed1 = yuv2uyvy422_1_c;
-        *yuv2packed2 = yuv2uyvy422_2_c;
-        *yuv2packedX = yuv2uyvy422_X_c;
-        break;
-    }
-}
-
 #define DEBUG_SWSCALE_BUFFERS 0
 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
 
@@ -2003,8 +502,8 @@ static int swScale(SwsContext *c, const uint8_t* src[],
         }
         if (dstY >= dstH-2) {
             // hmm looks like we can't use MMX here without overwriting this array's tail
-            find_c_packed_planar_out_funcs(c, &yuv2plane1, &yuv2planeX,  &yuv2nv12cX,
-                                           &yuv2packed1, &yuv2packed2, &yuv2packedX);
+            ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX,  &yuv2nv12cX,
+                                     &yuv2packed1, &yuv2packed2, &yuv2packedX);
         }
 
         {
@@ -2139,9 +638,9 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
 {
     enum PixelFormat srcFormat = c->srcFormat;
 
-    find_c_packed_planar_out_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
-                                   &c->yuv2nv12cX, &c->yuv2packed1, &c->yuv2packed2,
-                                   &c->yuv2packedX);
+    ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
+                             &c->yuv2nv12cX, &c->yuv2packed1,
+                             &c->yuv2packed2, &c->yuv2packedX);
 
     ff_sws_init_input_funcs(c);
 
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 6d77608a59..242a8508b4 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -650,6 +650,13 @@ void ff_swscale_get_unscaled_altivec(SwsContext *c);
 SwsFunc ff_getSwsFunc(SwsContext *c);
 
 void ff_sws_init_input_funcs(SwsContext *c);
+void ff_sws_init_output_funcs(SwsContext *c,
+                              yuv2planar1_fn *yuv2plane1,
+                              yuv2planarX_fn *yuv2planeX,
+                              yuv2interleavedX_fn *yuv2nv12cX,
+                              yuv2packed1_fn *yuv2packed1,
+                              yuv2packed2_fn *yuv2packed2,
+                              yuv2packedX_fn *yuv2packedX);
 void ff_sws_init_swScale_altivec(SwsContext *c);
 void ff_sws_init_swScale_mmx(SwsContext *c);
 
-- 
cgit v1.2.3


From 32c61400c02eb8e2ad91deea4daa11d4241fbbb2 Mon Sep 17 00:00:00 2001
From: Justin Ruggles
Date: Mon, 6 Feb 2012 13:36:42 -0500
Subject: apedec: use sizeof(field) instead of sizeof(type)

---
 libavcodec/apedec.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index 5aef878ede..870442c74c 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -474,8 +474,8 @@ static void entropy_decode(APEContext *ctx, int blockstodecode, int stereo)
 
     if (ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE) {
         /* We are pure silence, just memset the output buffer. */
-        memset(decoded0, 0, blockstodecode * sizeof(int32_t));
-        memset(decoded1, 0, blockstodecode * sizeof(int32_t));
+        memset(decoded0, 0, blockstodecode * sizeof(*decoded0));
+        memset(decoded1, 0, blockstodecode * sizeof(*decoded1));
     } else {
         while (blockstodecode--) {
             *decoded0++ = ape_decode_value(ctx, &ctx->riceY);
@@ -525,7 +525,7 @@ static void init_predictor_decoder(APEContext *ctx)
     APEPredictor *p = &ctx->predictor;
 
     /* Zero the history buffers */
-    memset(p->historybuffer, 0, PREDICTOR_SIZE * sizeof(int32_t));
+    memset(p->historybuffer, 0, PREDICTOR_SIZE * sizeof(*p->historybuffer));
     p->buf = p->historybuffer;
 
     /* Initialize and zero the coefficients */
@@ -610,7 +610,8 @@ static void predictor_decode_stereo(APEContext *ctx, int count)
 
         /* Have we filled the history buffer? */
         if (p->buf == p->historybuffer + HISTORY_SIZE) {
-            memmove(p->historybuffer, p->buf, PREDICTOR_SIZE * sizeof(int32_t));
+            memmove(p->historybuffer, p->buf,
+                    PREDICTOR_SIZE * sizeof(*p->historybuffer));
             p->buf = p->historybuffer;
         }
     }
@@ -650,7 +651,8 @@ static void predictor_decode_mono(APEContext *ctx, int count)
 
         /* Have we filled the history buffer? */
         if (p->buf == p->historybuffer + HISTORY_SIZE) {
-            memmove(p->historybuffer, p->buf, PREDICTOR_SIZE * sizeof(int32_t));
+            memmove(p->historybuffer, p->buf,
+                    PREDICTOR_SIZE * sizeof(*p->historybuffer));
             p->buf = p->historybuffer;
         }
 
@@ -668,8 +670,8 @@ static void do_init_filter(APEFilter *f, int16_t *buf, int order)
     f->delay       = f->historybuffer + order * 2;
     f->adaptcoeffs = f->historybuffer + order;
 
-    memset(f->historybuffer, 0, (order * 2) * sizeof(int16_t));
-    memset(f->coeffs, 0, order * sizeof(int16_t));
+    memset(f->historybuffer, 0, (order * 2) * sizeof(*f->historybuffer));
+    memset(f->coeffs, 0, order * sizeof(*f->coeffs));
     f->avg = 0;
 }
 
@@ -725,7 +727,7 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
         /* Have we filled the history buffer? */
         if (f->delay == f->historybuffer + HISTORY_SIZE + (order * 2)) {
             memmove(f->historybuffer, f->delay - (order * 2),
-                    (order * 2) * sizeof(int16_t));
+                    (order * 2) * sizeof(*f->historybuffer));
             f->delay = f->historybuffer + order * 2;
             f->adaptcoeffs = f->historybuffer + order;
         }
-- 
cgit v1.2.3


From 1d3c672d27d3beece88485e28f2286ec2df62e6e Mon Sep 17 00:00:00 2001
From: Justin Ruggles
Date: Mon, 6 Feb 2012 16:31:26 -0500
Subject: apedec: allocate a single flat buffer for decoded samples

This will allow the decoder to return samples for the full packet, and it also
makes the decoded buffer pointers aligned.
---
 libavcodec/apedec.c | 53 +++++++++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index 870442c74c..438d833a11 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -142,8 +142,9 @@ typedef struct APEContext {
     int frameflags;                          ///< frame flags
     APEPredictor predictor;                  ///< predictor used for final reconstruction
 
-    int32_t decoded0[BLOCKS_PER_LOOP];       ///< decoded data for the first channel
-    int32_t decoded1[BLOCKS_PER_LOOP];       ///< decoded data for the second channel
+    int32_t *decoded_buffer;
+    int decoded_size;
+    int32_t *decoded[MAX_CHANNELS];          ///< decoded data for each channel
 
     int16_t* filterbuf[APE_FILTER_LEVELS];   ///< filter memory
 
@@ -170,8 +171,9 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
     for (i = 0; i < APE_FILTER_LEVELS; i++)
         av_freep(&s->filterbuf[i]);
 
+    av_freep(&s->decoded_buffer);
     av_freep(&s->data);
-    s->data_size = 0;
+    s->decoded_size = s->data_size = 0;
 
     return 0;
 }
@@ -469,8 +471,8 @@ static inline int ape_decode_value(APEContext *ctx, APERice *rice)
 
 static void entropy_decode(APEContext *ctx, int blockstodecode, int stereo)
 {
-    int32_t *decoded0 = ctx->decoded0;
-    int32_t *decoded1 = ctx->decoded1;
+    int32_t *decoded0 = ctx->decoded[0];
+    int32_t *decoded1 = ctx->decoded[1];
 
     if (ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE) {
         /* We are pure silence, just memset the output buffer. */
@@ -593,8 +595,8 @@ static av_always_inline int predictor_update_filter(APEPredictor *p,
 static void predictor_decode_stereo(APEContext *ctx, int count)
 {
     APEPredictor *p = &ctx->predictor;
-    int32_t *decoded0 = ctx->decoded0;
-    int32_t *decoded1 = ctx->decoded1;
+    int32_t *decoded0 = ctx->decoded[0];
+    int32_t *decoded1 = ctx->decoded[1];
 
     while (count--) {
         /* Predictor Y */
@@ -620,7 +622,7 @@ static void predictor_decode_stereo(APEContext *ctx, int count)
 static void predictor_decode_mono(APEContext *ctx, int count)
 {
     APEPredictor *p = &ctx->predictor;
-    int32_t *decoded0 = ctx->decoded0;
+    int32_t *decoded0 = ctx->decoded[0];
     int32_t predictionA, currentA, A, sign;
 
     currentA = p->lastA[0];
@@ -775,9 +777,6 @@ static int init_frame_decoder(APEContext *ctx)
 
 static void ape_unpack_mono(APEContext *ctx, int count)
 {
-    int32_t *decoded0 = ctx->decoded0;
-    int32_t *decoded1 = ctx->decoded1;
-
     if (ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE) {
         entropy_decode(ctx, count, 0);
         /* We are pure silence, so we're done. */
@@ -786,22 +785,22 @@ static void ape_unpack_mono(APEContext *ctx, int count)
     }
 
     entropy_decode(ctx, count, 0);
-    ape_apply_filters(ctx, decoded0, NULL, count);
+    ape_apply_filters(ctx, ctx->decoded[0], NULL, count);
 
     /* Now apply the predictor decoding */
     predictor_decode_mono(ctx, count);
 
     /* Pseudo-stereo - just copy left channel to right channel */
     if (ctx->channels == 2) {
-        memcpy(decoded1, decoded0, count * sizeof(*decoded1));
+        memcpy(ctx->decoded[1], ctx->decoded[0], count * sizeof(*ctx->decoded[1]));
     }
 }
 
 static void ape_unpack_stereo(APEContext *ctx, int count)
 {
     int32_t left, right;
-    int32_t *decoded0 = ctx->decoded0;
-    int32_t *decoded1 = ctx->decoded1;
+    int32_t *decoded0 = ctx->decoded[0];
+    int32_t *decoded1 = ctx->decoded[1];
 
     if (ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE) {
         /* We are pure silence, so we're done. */
@@ -885,9 +884,6 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
         }
         s->samples = nblocks;
 
-        memset(s->decoded0,  0, sizeof(s->decoded0));
-        memset(s->decoded1,  0, sizeof(s->decoded1));
-
         /* Initialize the frame decoder */
         if (init_frame_decoder(s) < 0) {
             av_log(avctx, AV_LOG_ERROR, "Error reading frame header\n");
@@ -904,6 +900,15 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
 
     blockstodecode = FFMIN(BLOCKS_PER_LOOP, s->samples);
 
+    /* reallocate decoded sample buffer if needed */
+    av_fast_malloc(&s->decoded_buffer, &s->decoded_size,
+                   2 * FFALIGN(blockstodecode, 8) * sizeof(*s->decoded_buffer));
+    if (!s->decoded_buffer)
+        return AVERROR(ENOMEM);
+    memset(s->decoded_buffer, 0, s->decoded_size);
+    s->decoded[0] = s->decoded_buffer;
+    s->decoded[1] = s->decoded_buffer + FFALIGN(blockstodecode, 8);
+
     /* get output buffer */
     s->frame.nb_samples = blockstodecode;
     if ((ret = avctx->get_buffer(avctx, &s->frame)) < 0) {
@@ -929,25 +934,25 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
     case 8:
         sample8 = (uint8_t *)s->frame.data[0];
         for (i = 0; i < blockstodecode; i++) {
-            *sample8++ = (s->decoded0[i] + 0x80) & 0xff;
+            *sample8++ = (s->decoded[0][i] + 0x80) & 0xff;
             if (s->channels == 2)
-                *sample8++ = (s->decoded1[i] + 0x80) & 0xff;
+                *sample8++ = (s->decoded[1][i] + 0x80) & 0xff;
         }
         break;
     case 16:
         sample16 = (int16_t *)s->frame.data[0];
         for (i = 0; i < blockstodecode; i++) {
-            *sample16++ = s->decoded0[i];
+            *sample16++ = s->decoded[0][i];
             if (s->channels == 2)
-                *sample16++ = s->decoded1[i];
+                *sample16++ = s->decoded[1][i];
         }
         break;
     case 24:
         sample24 = (int32_t *)s->frame.data[0];
         for (i = 0; i < blockstodecode; i++) {
-            *sample24++ = s->decoded0[i] << 8;
+            *sample24++ = s->decoded[0][i] << 8;
             if (s->channels == 2)
-                *sample24++ = s->decoded1[i] << 8;
+                *sample24++ = s->decoded[1][i] << 8;
         }
         break;
     }
-- 
cgit v1.2.3


From 39575eead279662a27d9d585d14b96ec639be8ad Mon Sep 17 00:00:00 2001
From: Justin Ruggles
Date: Mon, 6 Feb 2012 16:34:50 -0500
Subject: apedec: do not unnecessarily zero output samples for mono frames

---
 libavcodec/apedec.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index 438d833a11..028c29c85e 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -474,16 +474,10 @@ static void entropy_decode(APEContext *ctx, int blockstodecode, int stereo)
     int32_t *decoded0 = ctx->decoded[0];
     int32_t *decoded1 = ctx->decoded[1];
 
-    if (ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE) {
-        /* We are pure silence, just memset the output buffer. */
-        memset(decoded0, 0, blockstodecode * sizeof(*decoded0));
-        memset(decoded1, 0, blockstodecode * sizeof(*decoded1));
-    } else {
-        while (blockstodecode--) {
-            *decoded0++ = ape_decode_value(ctx, &ctx->riceY);
-            if (stereo)
-                *decoded1++ = ape_decode_value(ctx, &ctx->riceX);
-        }
+    while (blockstodecode--) {
+        *decoded0++ = ape_decode_value(ctx, &ctx->riceY);
+        if (stereo)
+            *decoded1++ = ape_decode_value(ctx, &ctx->riceX);
     }
 }
 
@@ -778,7 +772,6 @@ static int init_frame_decoder(APEContext *ctx)
 static void ape_unpack_mono(APEContext *ctx, int count)
 {
     if (ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE) {
-        entropy_decode(ctx, count, 0);
         /* We are pure silence, so we're done. */
         av_log(ctx->avctx, AV_LOG_DEBUG, "pure silence mono\n");
         return;
-- 
cgit v1.2.3


From 37390d5cca3ec3556d9564ad9734643a9f525779 Mon Sep 17 00:00:00 2001
From: Justin Ruggles
Date: Mon, 6 Feb 2012 17:46:41 -0500
Subject: apedec: allow the user to set the maximum number of output samples
 per call

It makes sense in some cases to split up the output packet to save on memory
usage (ape frames can be very large), but the current/default size is
arbitrary. Allowing the user to configure this gives more flexibility and
requires minimal additional code.
---
 libavcodec/apedec.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index 028c29c85e..2b95874078 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -25,13 +25,13 @@
 #include "bytestream.h"
 #include "libavutil/audioconvert.h"
 #include "libavutil/avassert.h"
+#include "libavutil/opt.h"
 
 /**
  * @file
  * Monkey's Audio lossless audio decoder
  */
 
-#define BLOCKS_PER_LOOP     4608
 #define MAX_CHANNELS        2
 #define MAX_BYTESPERSAMPLE  3
 
@@ -126,6 +126,7 @@ typedef struct APEPredictor {
 
 /** Decoder context */
 typedef struct APEContext {
+    AVClass *class;                          ///< class for AVOptions
     AVCodecContext *avctx;
     AVFrame frame;
     DSPContext dsp;
@@ -145,6 +146,7 @@ typedef struct APEContext {
     int32_t *decoded_buffer;
     int decoded_size;
     int32_t *decoded[MAX_CHANNELS];          ///< decoded data for each channel
+    int blocks_per_loop;                     ///< maximum number of samples to decode for each call
 
     int16_t* filterbuf[APE_FILTER_LEVELS];   ///< filter memory
 
@@ -891,7 +893,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
         return avpkt->size;
     }
 
-    blockstodecode = FFMIN(BLOCKS_PER_LOOP, s->samples);
+    blockstodecode = FFMIN(s->blocks_per_loop, s->samples);
 
     /* reallocate decoded sample buffer if needed */
     av_fast_malloc(&s->decoded_buffer, &s->decoded_size,
@@ -964,6 +966,21 @@ static void ape_flush(AVCodecContext *avctx)
     s->samples= 0;
 }
 
+#define OFFSET(x) offsetof(APEContext, x)
+#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM)
+static const AVOption options[] = {
+    { "max_samples", "maximum number of samples decoded per call",             OFFSET(blocks_per_loop), AV_OPT_TYPE_INT,   { 4608 },    1,       INT_MAX, PAR, "max_samples" },
+    { "all",         "no maximum. decode all samples for each packet at once", 0,                       AV_OPT_TYPE_CONST, { INT_MAX }, INT_MIN, INT_MAX, PAR, "max_samples" },
+    { NULL},
+};
+
+static const AVClass ape_decoder_class = {
+    .class_name = "APE decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_ape_decoder = {
     .name           = "ape",
     .type           = AVMEDIA_TYPE_AUDIO,
@@ -975,4 +992,5 @@ AVCodec ff_ape_decoder = {
     .capabilities   = CODEC_CAP_SUBFRAMES | CODEC_CAP_DELAY | CODEC_CAP_DR1,
     .flush = ape_flush,
     .long_name = NULL_IF_CONFIG_SMALL("Monkey's Audio"),
+    .priv_class     = &ape_decoder_class,
 };
-- 
cgit v1.2.3


From 06b320ad78a95618855e0f94f6978aac044f8a8d Mon Sep 17 00:00:00 2001
From: Diego Biurrun
Date: Wed, 18 Jan 2012 18:33:16 +0100
Subject: swscale: K&R formatting cosmetics for SPARC code

---
 libswscale/sparc/yuv2rgb_vis.c | 251 ++++++++++++++++++++---------------------
 1 file changed, 125 insertions(+), 126 deletions(-)

diff --git a/libswscale/sparc/yuv2rgb_vis.c b/libswscale/sparc/yuv2rgb_vis.c
index 2111ea8f64..bcd2081aa8 100644
--- a/libswscale/sparc/yuv2rgb_vis.c
+++ b/libswscale/sparc/yuv2rgb_vis.c
@@ -25,149 +25,148 @@
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 
-#define YUV2RGB_INIT \
-    "wr %%g0, 0x10, %%gsr \n\t" \
-    "ldd [%5], %%f32      \n\t" \
-    "ldd [%5+8], %%f34    \n\t" \
-    "ldd [%5+16], %%f36   \n\t" \
-    "ldd [%5+24], %%f38   \n\t" \
-    "ldd [%5+32], %%f40   \n\t" \
-    "ldd [%5+40], %%f42   \n\t" \
-    "ldd [%5+48], %%f44   \n\t" \
-    "ldd [%5+56], %%f46   \n\t" \
-    "ldd [%5+64], %%f48   \n\t" \
-    "ldd [%5+72], %%f50   \n\t"
-
-#define YUV2RGB_KERNEL \
-    /* ^^^^ f0=Y f3=u f5=v */ \
-    "fmul8x16 %%f3, %%f48, %%f6   \n\t" \
-    "fmul8x16 %%f19, %%f48, %%f22 \n\t" \
-    "fmul8x16 %%f5, %%f44, %%f8   \n\t" \
-    "fmul8x16 %%f21, %%f44, %%f24 \n\t" \
-    "fmul8x16 %%f0, %%f42, %%f0   \n\t" \
-    "fmul8x16 %%f16, %%f42, %%f16 \n\t" \
-    "fmul8x16 %%f3, %%f50, %%f2   \n\t" \
-    "fmul8x16 %%f19, %%f50, %%f18 \n\t" \
-    "fmul8x16 %%f5, %%f46, %%f4   \n\t" \
-    "fmul8x16 %%f21, %%f46, %%f20 \n\t" \
-    \
-    "fpsub16 %%f6, %%f34, %%f6   \n\t" /* 1 */ \
-    "fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */ \
-    "fpsub16 %%f8, %%f38, %%f8   \n\t" /* 3 */ \
-    "fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */ \
-    "fpsub16 %%f0, %%f32, %%f0   \n\t" /* 0 */ \
-    "fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */ \
-    "fpsub16 %%f2, %%f36, %%f2   \n\t" /* 2 */ \
-    "fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */ \
-    "fpsub16 %%f4, %%f40, %%f4   \n\t" /* 4 */ \
-    "fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */ \
-    \
-    "fpadd16 %%f0, %%f8, %%f8    \n\t" /* Gt */ \
-    "fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */ \
-    "fpadd16 %%f0, %%f4, %%f4    \n\t" /* R */ \
-    "fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */ \
-    "fpadd16 %%f0, %%f6, %%f6    \n\t" /* B */ \
-    "fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */ \
-    "fpadd16 %%f8, %%f2, %%f2    \n\t" /* G */ \
-    "fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */ \
-    \
-    "fpack16 %%f4, %%f4    \n\t" \
-    "fpack16 %%f20, %%f20  \n\t" \
-    "fpack16 %%f6, %%f6    \n\t" \
-    "fpack16 %%f22, %%f22  \n\t" \
-    "fpack16 %%f2, %%f2    \n\t" \
-    "fpack16 %%f18, %%f18  \n\t"
-
-
+#define YUV2RGB_INIT                               \
+    "wr %%g0, 0x10, %%gsr \n\t"                    \
+    "ldd [%5],      %%f32 \n\t"                    \
+    "ldd [%5 +  8], %%f34 \n\t"                    \
+    "ldd [%5 + 16], %%f36 \n\t"                    \
+    "ldd [%5 + 24], %%f38 \n\t"                    \
+    "ldd [%5 + 32], %%f40 \n\t"                    \
+    "ldd [%5 + 40], %%f42 \n\t"                    \
+    "ldd [%5 + 48], %%f44 \n\t"                    \
+    "ldd [%5 + 56], %%f46 \n\t"                    \
+    "ldd [%5 + 64], %%f48 \n\t"                    \
+    "ldd [%5 + 72], %%f50 \n\t"
+
+#define YUV2RGB_KERNEL                             \
+    /* ^^^^ f0=Y f3=u f5=v */                      \
+    "fmul8x16 %%f3,  %%f48,  %%f6 \n\t"            \
+    "fmul8x16 %%f19, %%f48, %%f22 \n\t"            \
+    "fmul8x16 %%f5,  %%f44,  %%f8 \n\t"            \
+    "fmul8x16 %%f21, %%f44, %%f24 \n\t"            \
+    "fmul8x16 %%f0,  %%f42,  %%f0 \n\t"            \
+    "fmul8x16 %%f16, %%f42, %%f16 \n\t"            \
+    "fmul8x16 %%f3,  %%f50,  %%f2 \n\t"            \
+    "fmul8x16 %%f19, %%f50, %%f18 \n\t"            \
+    "fmul8x16 %%f5,  %%f46,  %%f4 \n\t"            \
+    "fmul8x16 %%f21, %%f46, %%f20 \n\t"            \
+                                                   \
+    "fpsub16 %%f6,  %%f34,  %%f6 \n\t" /* 1 */     \
+    "fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */     \
+    "fpsub16 %%f8,  %%f38,  %%f8 \n\t" /* 3 */     \
+    "fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */     \
+    "fpsub16 %%f0,  %%f32,  %%f0 \n\t" /* 0 */     \
+    "fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */     \
+    "fpsub16 %%f2,  %%f36,  %%f2 \n\t" /* 2 */     \
+    "fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */     \
+    "fpsub16 %%f4,  %%f40,  %%f4 \n\t" /* 4 */     \
+    "fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */     \
+                                                   \
+    "fpadd16 %%f0,  %%f8,  %%f8  \n\t" /* Gt */    \
+    "fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */    \
+    "fpadd16 %%f0,  %%f4,  %%f4  \n\t" /* R */     \
+    "fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */     \
+    "fpadd16 %%f0,  %%f6,  %%f6  \n\t" /* B */     \
+    "fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */     \
+    "fpadd16 %%f8,  %%f2,  %%f2  \n\t" /* G */     \
+    "fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */     \
+                                                   \
+    "fpack16 %%f4,  %%f4  \n\t"                    \
+    "fpack16 %%f20, %%f20 \n\t"                    \
+    "fpack16 %%f6,  %%f6  \n\t"                    \
+    "fpack16 %%f22, %%f22 \n\t"                    \
+    "fpack16 %%f2,  %%f2  \n\t"                    \
+    "fpack16 %%f18, %%f18 \n\t"
 
 // FIXME: must be changed to set alpha to 255 instead of 0
-static int vis_420P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                           int srcSliceH, uint8_t* dst[], int dstStride[])
+static int vis_420P_ARGB32(SwsContext *c, uint8_t *src[], int srcStride[],
+                           int srcSliceY, int srcSliceH,
+                           uint8_t *dst[], int dstStride[])
 {
     int y, out1, out2, out3, out4, out5, out6;
 
-    for(y=0;y < srcSliceH;++y) {
+    for (y = 0; y < srcSliceH; ++y)
         __asm__ volatile (
             YUV2RGB_INIT
-            "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
+            "wr %%g0, 0xd2, %%asi        \n\t"  /* ASI_FL16_P */
             "1:                          \n\t"
-            "ldda [%1] %%asi, %%f2       \n\t"
-            "ldda [%1+2] %%asi, %%f18    \n\t"
-            "ldda [%2] %%asi, %%f4       \n\t"
-            "ldda [%2+2] %%asi, %%f20    \n\t"
+            "ldda [%1]     %%asi, %%f2   \n\t"
+            "ldda [%1 + 2] %%asi, %%f18  \n\t"
+            "ldda [%2]     %%asi, %%f4   \n\t"
+            "ldda [%2 + 2] %%asi, %%f20  \n\t"
             "ld [%0], %%f0               \n\t"
             "ld [%0+4], %%f16            \n\t"
-            "fpmerge %%f3, %%f3, %%f2    \n\t"
+            "fpmerge %%f3,  %%f3,  %%f2  \n\t"
             "fpmerge %%f19, %%f19, %%f18 \n\t"
-            "fpmerge %%f5, %%f5, %%f4    \n\t"
+            "fpmerge %%f5,  %%f5,  %%f4  \n\t"
             "fpmerge %%f21, %%f21, %%f20 \n\t"
             YUV2RGB_KERNEL
             "fzero %%f0                  \n\t"
-            "fpmerge %%f4, %%f6, %%f8    \n\t"  // r,b,t1
-            "fpmerge %%f20, %%f22, %%f24 \n\t"  // r,b,t1
-            "fpmerge %%f0, %%f2, %%f10   \n\t"  // 0,g,t2
-            "fpmerge %%f0, %%f18, %%f26  \n\t"  // 0,g,t2
-            "fpmerge %%f10, %%f8, %%f4   \n\t"  // t2,t1,msb
-            "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2,t1,msb
-            "fpmerge %%f11, %%f9, %%f6   \n\t"  // t2,t1,lsb
-            "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2,t1,lsb
-            "std %%f4, [%3]              \n\t"
-            "std %%f20, [%3+16]          \n\t"
-            "std %%f6, [%3+8]            \n\t"
-            "std %%f22, [%3+24]          \n\t"
+            "fpmerge %%f4,  %%f6,  %%f8  \n\t"  // r, b, t1
+            "fpmerge %%f20, %%f22, %%f24 \n\t"  // r, b, t1
+            "fpmerge %%f0,  %%f2,  %%f10 \n\t"  // 0, g, t2
+            "fpmerge %%f0,  %%f18, %%f26 \n\t"  // 0, g, t2
+            "fpmerge %%f10, %%f8,  %%f4  \n\t"  // t2, t1, msb
+            "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2, t1, msb
+            "fpmerge %%f11, %%f9,  %%f6  \n\t"  // t2, t1, lsb
+            "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2, t1, lsb
+            "std %%f4,  [%3]             \n\t"
+            "std %%f20, [%3 + 16]        \n\t"
+            "std %%f6,  [%3 +  8]        \n\t"
+            "std %%f22, [%3 + 24]        \n\t"
 
             "add %0, 8, %0   \n\t"
             "add %1, 4, %1   \n\t"
             "add %2, 4, %2   \n\t"
             "subcc %4, 8, %4 \n\t"
             "bne 1b          \n\t"
-            "add %3, 32, %3  \n\t" //delay slot
+            "add %3, 32, %3  \n\t"              // delay slot
             : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
-            : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+((y+srcSliceY)>>1)*srcStride[1]),
-                "2" (src[2]+((y+srcSliceY)>>1)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]),
-                "4" (c->dstW),
-                "5" (c->sparc_coeffs)
-        );
-    }
+            : "0" (src[0] + (y + srcSliceY) * srcStride[0]), "1" (src[1] + ((y + srcSliceY) >> 1) * srcStride[1]),
+            "2" (src[2] + ((y + srcSliceY) >> 1) * srcStride[2]), "3" (dst[0] + (y + srcSliceY) * dstStride[0]),
+            "4" (c->dstW),
+            "5" (c->sparc_coeffs)
+            );
 
     return srcSliceH;
 }
 
 // FIXME: must be changed to set alpha to 255 instead of 0
-static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                           int srcSliceH, uint8_t* dst[], int dstStride[])
+static int vis_422P_ARGB32(SwsContext *c, uint8_t *src[], int srcStride[],
+                           int srcSliceY, int srcSliceH,
+                           uint8_t *dst[], int dstStride[])
 {
     int y, out1, out2, out3, out4, out5, out6;
 
-    for(y=0;y < srcSliceH;++y) {
+    for (y = 0; y < srcSliceH; ++y)
         __asm__ volatile (
             YUV2RGB_INIT
             "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
             "1:                          \n\t"
-            "ldda [%1] %%asi, %%f2       \n\t"
-            "ldda [%1+2] %%asi, %%f18    \n\t"
-            "ldda [%2] %%asi, %%f4       \n\t"
-            "ldda [%2+2] %%asi, %%f20    \n\t"
-            "ld [%0], %%f0               \n\t"
-            "ld [%0+4], %%f16            \n\t"
-            "fpmerge %%f3, %%f3, %%f2    \n\t"
+            "ldda [%1]     %%asi, %%f2   \n\t"
+            "ldda [%1 + 2] %%asi, %%f18  \n\t"
+            "ldda [%2]     %%asi, %%f4   \n\t"
+            "ldda [%2 + 2] %%asi, %%f20  \n\t"
+            "ld [%0],     %%f0           \n\t"
+            "ld [%0 + 4], %%f16          \n\t"
+            "fpmerge %%f3,  %%f3,  %%f2  \n\t"
             "fpmerge %%f19, %%f19, %%f18 \n\t"
-            "fpmerge %%f5, %%f5, %%f4    \n\t"
+            "fpmerge %%f5,  %%f5,  %%f4  \n\t"
             "fpmerge %%f21, %%f21, %%f20 \n\t"
             YUV2RGB_KERNEL
             "fzero %%f0 \n\t"
-            "fpmerge %%f4, %%f6, %%f8    \n\t"  // r,b,t1
+            "fpmerge %%f4,  %%f6,  %%f8  \n\t"  // r,b,t1
             "fpmerge %%f20, %%f22, %%f24 \n\t"  // r,b,t1
-            "fpmerge %%f0, %%f2, %%f10   \n\t"  // 0,g,t2
-            "fpmerge %%f0, %%f18, %%f26  \n\t"  // 0,g,t2
-            "fpmerge %%f10, %%f8, %%f4   \n\t"  // t2,t1,msb
+            "fpmerge %%f0,  %%f2,  %%f10 \n\t"  // 0,g,t2
+            "fpmerge %%f0,  %%f18, %%f26 \n\t"  // 0,g,t2
+            "fpmerge %%f10, %%f8,  %%f4  \n\t"  // t2,t1,msb
             "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2,t1,msb
-            "fpmerge %%f11, %%f9, %%f6   \n\t"  // t2,t1,lsb
+            "fpmerge %%f11, %%f9,  %%f6  \n\t"  // t2,t1,lsb
             "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2,t1,lsb
-            "std %%f4, [%3]              \n\t"
-            "std %%f20, [%3+16]          \n\t"
-            "std %%f6, [%3+8]            \n\t"
-            "std %%f22, [%3+24]          \n\t"
+            "std %%f4,  [%3]             \n\t"
+            "std %%f20, [%3 + 16]        \n\t"
+            "std %%f6,  [%3 + 8]         \n\t"
+            "std %%f22, [%3 + 24]        \n\t"
 
             "add %0, 8, %0   \n\t"
             "add %1, 4, %1   \n\t"
@@ -176,36 +175,36 @@ static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int s
             "bne 1b          \n\t"
             "add %3, 32, %3  \n\t" //delay slot
             : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
-            : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+(y+srcSliceY)*srcStride[1]),
-                "2" (src[2]+(y+srcSliceY)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]),
-                "4" (c->dstW),
-                "5" (c->sparc_coeffs)
-        );
-    }
+            : "0" (src[0] + (y + srcSliceY) * srcStride[0]), "1" (src[1] + (y + srcSliceY) * srcStride[1]),
+            "2" (src[2] + (y + srcSliceY) * srcStride[2]), "3" (dst[0] + (y + srcSliceY) * dstStride[0]),
+            "4" (c->dstW),
+            "5" (c->sparc_coeffs)
+            );
 
     return srcSliceH;
 }
 
 SwsFunc ff_yuv2rgb_init_vis(SwsContext *c)
 {
-    c->sparc_coeffs[5]=c->yCoeff;
-    c->sparc_coeffs[6]=c->vgCoeff;
-    c->sparc_coeffs[7]=c->vrCoeff;
-    c->sparc_coeffs[8]=c->ubCoeff;
-    c->sparc_coeffs[9]=c->ugCoeff;
-
-    c->sparc_coeffs[0]=(((int16_t)c->yOffset*(int16_t)c->yCoeff >>11) & 0xffff) * 0x0001000100010001ULL;
-    c->sparc_coeffs[1]=(((int16_t)c->uOffset*(int16_t)c->ubCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
-    c->sparc_coeffs[2]=(((int16_t)c->uOffset*(int16_t)c->ugCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
-    c->sparc_coeffs[3]=(((int16_t)c->vOffset*(int16_t)c->vgCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
-    c->sparc_coeffs[4]=(((int16_t)c->vOffset*(int16_t)c->vrCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
-
-    if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV422P && (c->dstW & 7)==0) {
-        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV422P -> RGB32 (WARNING: alpha value is wrong)\n");
+    c->sparc_coeffs[5] = c->yCoeff;
+    c->sparc_coeffs[6] = c->vgCoeff;
+    c->sparc_coeffs[7] = c->vrCoeff;
+    c->sparc_coeffs[8] = c->ubCoeff;
+    c->sparc_coeffs[9] = c->ugCoeff;
+
+    c->sparc_coeffs[0] = (((int16_t)c->yOffset * (int16_t)c->yCoeff  >> 11) & 0xffff) * 0x0001000100010001ULL;
+    c->sparc_coeffs[1] = (((int16_t)c->uOffset * (int16_t)c->ubCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
+    c->sparc_coeffs[2] = (((int16_t)c->uOffset * (int16_t)c->ugCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
+    c->sparc_coeffs[3] = (((int16_t)c->vOffset * (int16_t)c->vgCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
+    c->sparc_coeffs[4] = (((int16_t)c->vOffset * (int16_t)c->vrCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
+
+    if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV422P && (c->dstW & 7) == 0) {
+        av_log(c, AV_LOG_INFO,
+               "SPARC VIS accelerated YUV422P -> RGB32 (WARNING: alpha value is wrong)\n");
         return vis_422P_ARGB32;
-    }
-    else if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV420P && (c->dstW & 7)==0) {
-        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV420P -> RGB32 (WARNING: alpha value is wrong)\n");
+    } else if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV420P && (c->dstW & 7) == 0) {
+        av_log(c, AV_LOG_INFO,
+               "SPARC VIS accelerated YUV420P -> RGB32 (WARNING: alpha value is wrong)\n");
         return vis_420P_ARGB32;
     }
     return NULL;
-- 
cgit v1.2.3


From 4be1d7dc20e3f191917adab2bf7cd405f1d25b54 Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Sun, 5 Feb 2012 12:18:14 +0100
Subject: pva-demux test: add -vn

The output is obviously not supposed to contain video (since only
-acodec copy is specified), but video is currently left out only as a
side effect of the way -t handling is implemented.
---
 tests/fate/demux.mak     |  2 +-
 tests/ref/fate/pva-demux | 53 ++++++++++++++++++++++++------------------------
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/tests/fate/demux.mak b/tests/fate/demux.mak
index 15899861f2..0de3ddbd2b 100644
--- a/tests/fate/demux.mak
+++ b/tests/fate/demux.mak
@@ -59,7 +59,7 @@ FATE_TESTS += fate-psx-str-v3-mdec
 fate-psx-str-v3-mdec: CMD = framecrc -i $(SAMPLES)/psx-str/abc000_cut.str -an
 
 FATE_TESTS += fate-pva-demux
-fate-pva-demux: CMD = framecrc -idct simple -i $(SAMPLES)/pva/PVA_test-partial.pva -t 0.6 -acodec copy
+fate-pva-demux: CMD = framecrc -idct simple -i $(SAMPLES)/pva/PVA_test-partial.pva -t 0.6 -acodec copy -vn
 
 FATE_TESTS += fate-qcp-demux
 fate-qcp-demux: CMD = crc -i $(SAMPLES)/qcp/0036580847.QCP -acodec copy
diff --git a/tests/ref/fate/pva-demux b/tests/ref/fate/pva-demux
index 5227b5b851..8eb6fc9382 100644
--- a/tests/ref/fate/pva-demux
+++ b/tests/ref/fate/pva-demux
@@ -1,27 +1,26 @@
-#tb 0: 1/25
-#tb 1: 1/90000
-1,          0,          0,     2160,      384, 0x071abcc8
-1,          0,          0,     2160,      384, 0x31c9aee0
-1,       2160,       2160,     2160,      384, 0xa50eaa94
-1,       4320,       4320,     2160,      384, 0x9e86ba0e
-1,       8640,       8640,     2160,      384, 0x2321b800
-1,      10800,      10800,     2160,      384, 0x2347afa8
-1,      12960,      12960,     2160,      384, 0x0831b8d3
-1,      15120,      15120,     2160,      384, 0xd5acafa1
-1,      17280,      17280,     2160,      384, 0xc975b9d2
-1,      19440,      19440,     2160,      384, 0x2e10b02a
-1,      21600,      21600,     2160,      384, 0x501eadd0
-1,      23760,      23760,     2160,      384, 0x153fc171
-1,      25920,      25920,     2160,      384, 0xc5f0b3c2
-1,      28080,      28080,     2160,      384, 0xf731b200
-1,      30240,      30240,     2160,      384, 0x2e16b713
-1,      32400,      32400,     2160,      384, 0x61f6bba9
-1,      34560,      34560,     2160,      384, 0x1b9eb0ff
-1,      36720,      36720,     2160,      384, 0x2ab4b7bd
-1,      38880,      38880,     2160,      384, 0xd66eb45c
-1,      41040,      41040,     2160,      384, 0x145ab426
-1,      43200,      43200,     2160,      384, 0x297cb370
-1,      45360,      45360,     2160,      384, 0x287bb6b7
-1,      47520,      47520,     2160,      384, 0xfddbb7df
-1,      49680,      49680,     2160,      384, 0xbbb2af0c
-1,      51840,      51840,     2160,      384, 0x8f03b5fc
+#tb 0: 1/90000
+0,          0,          0,     2160,      384, 0x071abcc8
+0,          0,          0,     2160,      384, 0x31c9aee0
+0,       2160,       2160,     2160,      384, 0xa50eaa94
+0,       4320,       4320,     2160,      384, 0x9e86ba0e
+0,       8640,       8640,     2160,      384, 0x2321b800
+0,      10800,      10800,     2160,      384, 0x2347afa8
+0,      12960,      12960,     2160,      384, 0x0831b8d3
+0,      15120,      15120,     2160,      384, 0xd5acafa1
+0,      17280,      17280,     2160,      384, 0xc975b9d2
+0,      19440,      19440,     2160,      384, 0x2e10b02a
+0,      21600,      21600,     2160,      384, 0x501eadd0
+0,      23760,      23760,     2160,      384, 0x153fc171
+0,      25920,      25920,     2160,      384, 0xc5f0b3c2
+0,      28080,      28080,     2160,      384, 0xf731b200
+0,      30240,      30240,     2160,      384, 0x2e16b713
+0,      32400,      32400,     2160,      384, 0x61f6bba9
+0,      34560,      34560,     2160,      384, 0x1b9eb0ff
+0,      36720,      36720,     2160,      384, 0x2ab4b7bd
+0,      38880,      38880,     2160,      384, 0xd66eb45c
+0,      41040,      41040,     2160,      384, 0x145ab426
+0,      43200,      43200,     2160,      384, 0x297cb370
+0,      45360,      45360,     2160,      384, 0x287bb6b7
+0,      47520,      47520,     2160,      384, 0xfddbb7df
+0,      49680,      49680,     2160,      384, 0xbbb2af0c
+0,      51840,      51840,     2160,      384, 0x8f03b5fc
-- 
cgit v1.2.3


From c9af8326238c37d1a1f0029a158c7ede33836aa3 Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Sun, 5 Feb 2012 14:28:19 +0100
Subject: avconv: set encoder timebase for subtitles.

The actual number (1/1000) will probably require some
discussion/tweaking in the future, but should be good enough for now,
since the timestamps in AVSubtitle are in this timebase by definition.
---
 avconv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/avconv.c b/avconv.c
index 509b1dc249..876ad1996c 100644
--- a/avconv.c
+++ b/avconv.c
@@ -2434,6 +2434,7 @@ static int transcode_init(OutputFile *output_files,
 #endif
                 break;
             case AVMEDIA_TYPE_SUBTITLE:
+                codec->time_base = (AVRational){1, 1000};
                 break;
             default:
                 abort();
-- 
cgit v1.2.3


From 1270e12e49c9039fa5fe18cfb45137558d127f92 Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Sun, 5 Feb 2012 14:32:10 +0100
Subject: avconv: rework -t handling for encoding.

The current code compares the desired recording time with
InputStream.pts, whose meaning is very unclear. Change the code to use
the actual timestamps of the frames passed to the encoder.

In several tests, one less frame is encoded, which is more correct.

In the idroq test one more frame is encoded, which is again more
correct.

Behavior with stream copy should be unchanged.
---
 avconv.c                          | 47 ++++++++++++++++++++++++++++++---------
 tests/ref/fate/idroq-video-encode |  2 +-
 tests/ref/fate/real-rv40          |  1 -
 tests/ref/lavf/asf                |  4 ++--
 tests/ref/lavf/avi                |  6 ++---
 tests/ref/lavf/ffm                |  4 ++--
 tests/ref/lavf/mkv                |  6 ++---
 tests/ref/lavf/mpg                |  4 ++--
 tests/ref/lavf/nut                |  6 ++---
 tests/ref/lavf/ogg                |  6 ++---
 tests/ref/lavf/rm                 |  4 ++--
 tests/ref/lavf/ts                 |  6 ++---
 tests/ref/seek/lavf_avi           |  6 ++---
 tests/ref/seek/lavf_mkv           |  6 ++---
 tests/ref/seek/lavf_mpg           |  6 ++---
 tests/ref/seek/lavf_rm            | 18 ++++++++++-----
 16 files changed, 82 insertions(+), 50 deletions(-)

diff --git a/avconv.c b/avconv.c
index 876ad1996c..30782e1826 100644
--- a/avconv.c
+++ b/avconv.c
@@ -204,6 +204,9 @@ typedef struct OutputStream {
     // double sync_ipts;        /* dts from the AVPacket of the demuxer in second units */
     struct InputStream *sync_ist; /* input stream to sync against */
     int64_t sync_opts;       /* output frame counter, could be changed to some true timestamp */ // FIXME look at frame_number
+    /* pts of the first frame encoded for this stream, used for limiting
+     * recording time */
+    int64_t first_pts;
     AVBitStreamFilterContext *bitstream_filters;
     AVCodec *enc;
     int64_t max_frames;
@@ -918,6 +921,19 @@ static void write_frame(AVFormatContext *s, AVPacket *pkt, OutputStream *ost)
     }
 }
 
+static int check_recording_time(OutputStream *ost)
+{
+    OutputFile *of = &output_files[ost->file_index];
+
+    if (of->recording_time != INT64_MAX &&
+        av_compare_ts(ost->sync_opts - ost->first_pts, ost->st->codec->time_base, of->recording_time,
+                      AV_TIME_BASE_Q) >= 0) {
+        ost->is_past_recording_time = 1;
+        return 0;
+    }
+    return 1;
+}
+
 static void generate_silence(uint8_t* buf, enum AVSampleFormat sample_fmt, size_t size)
 {
     int fill_char = 0x00;
@@ -958,6 +974,11 @@ static int encode_audio_frame(AVFormatContext *s, OutputStream *ost,
             av_log(NULL, AV_LOG_FATAL, "Audio encoding failed\n");
             exit_program(1);
         }
+
+        if (!check_recording_time(ost))
+            return 0;
+
+        ost->sync_opts += frame->nb_samples;
     }
 
     got_packet = 0;
@@ -977,9 +998,6 @@ static int encode_audio_frame(AVFormatContext *s, OutputStream *ost,
         audio_size += pkt.size;
     }
 
-    if (frame)
-        ost->sync_opts += frame->nb_samples;
-
     return pkt.size;
 }
 
@@ -1241,6 +1259,10 @@ static void do_subtitle_out(AVFormatContext *s,
         nb = 1;
 
     for (i = 0; i < nb; i++) {
+        ost->sync_opts = av_rescale_q(pts, ist->st->time_base, enc->time_base);
+        if (!check_recording_time(ost))
+            return;
+
         sub->pts = av_rescale_q(pts, ist->st->time_base, AV_TIME_BASE_Q);
         // start_display_time is required to be 0
         sub->pts               += av_rescale_q(sub->start_display_time, (AVRational){ 1, 1000 }, AV_TIME_BASE_Q);
@@ -1382,11 +1404,17 @@ static void do_video_out(AVFormatContext *s,
     final_picture = in_picture;
 #endif
 
+    if (!ost->frame_number)
+        ost->first_pts = ost->sync_opts;
+
     /* duplicates frame if needed */
     for (i = 0; i < nb_frames; i++) {
         AVPacket pkt;
         av_init_packet(&pkt);
 
+        if (!check_recording_time(ost))
+            return;
+
         if (s->oformat->flags & AVFMT_RAWPICTURE &&
             enc->codec->id == CODEC_ID_RAWVIDEO) {
             /* raw pictures are written as AVPicture structure to
@@ -1723,13 +1751,6 @@ static int check_output_constraints(InputStream *ist, OutputStream *ost)
     if (of->start_time && ist->pts < of->start_time)
         return 0;
 
-    if (of->recording_time != INT64_MAX &&
-        av_compare_ts(ist->pts, AV_TIME_BASE_Q, of->recording_time + of->start_time,
-                      (AVRational){ 1, 1000000 }) >= 0) {
-        ost->is_past_recording_time = 1;
-        return 0;
-    }
-
     return 1;
 }
 
@@ -1745,6 +1766,12 @@ static void do_streamcopy(InputStream *ist, OutputStream *ost, const AVPacket *p
         !ost->copy_initial_nonkeyframes)
         return;
 
+    if (of->recording_time != INT64_MAX &&
+        ist->pts >= of->recording_time + of->start_time) {
+        ost->is_past_recording_time = 1;
+        return;
+    }
+
     /* force the input stream PTS */
     if (ost->st->codec->codec_type == AVMEDIA_TYPE_AUDIO)
         audio_size += pkt->size;
diff --git a/tests/ref/fate/idroq-video-encode b/tests/ref/fate/idroq-video-encode
index 4c89de8b63..f243a0d43b 100644
--- a/tests/ref/fate/idroq-video-encode
+++ b/tests/ref/fate/idroq-video-encode
@@ -1 +1 @@
-75df9ce475c1b185fc3dbc219596edd3
+72e5b060ff0ab8855da22f33a6e04bff
diff --git a/tests/ref/fate/real-rv40 b/tests/ref/fate/real-rv40
index 6159859cad..8307817021 100644
--- a/tests/ref/fate/real-rv40
+++ b/tests/ref/fate/real-rv40
@@ -238,4 +238,3 @@
 0,        236,        236,        1,   276480, 0x8f316c66
 0,        237,        237,        1,   276480, 0x6348ecf5
 0,        238,        238,        1,   276480, 0x34b5b78a
-0,        239,        239,        1,   276480, 0xcbf66922
diff --git a/tests/ref/lavf/asf b/tests/ref/lavf/asf
index 533a3c3591..2d4788aee8 100644
--- a/tests/ref/lavf/asf
+++ b/tests/ref/lavf/asf
@@ -1,3 +1,3 @@
-c544bb40c2f4c09d44318db5228ee499 *./tests/data/lavf/lavf.asf
+98ffddfa94926558ecf20cc6f47236a6 *./tests/data/lavf/lavf.asf
 333375 ./tests/data/lavf/lavf.asf
-./tests/data/lavf/lavf.asf CRC=0x9f5ab3e6
+./tests/data/lavf/lavf.asf CRC=0x51485213
diff --git a/tests/ref/lavf/avi b/tests/ref/lavf/avi
index cf47755ab7..ae85efc50b 100644
--- a/tests/ref/lavf/avi
+++ b/tests/ref/lavf/avi
@@ -1,3 +1,3 @@
-7e5e4db8c04f0acd16cff6b30e60d0e5 *./tests/data/lavf/lavf.avi
-331032 ./tests/data/lavf/lavf.avi
-./tests/data/lavf/lavf.avi CRC=0x2a83e6b0
+e056e1164236b22fafc8325de8221a58 *./tests/data/lavf/lavf.avi
+330798 ./tests/data/lavf/lavf.avi
+./tests/data/lavf/lavf.avi CRC=0xa79b84dd
diff --git a/tests/ref/lavf/ffm b/tests/ref/lavf/ffm
index 73e94ddfd6..0ae40f59d9 100644
--- a/tests/ref/lavf/ffm
+++ b/tests/ref/lavf/ffm
@@ -1,3 +1,3 @@
-4ef091d638bb20b8eaef5b3a0d6f97b7 *./tests/data/lavf/lavf.ffm
+c963591a7f9a08d48e0f988640795690 *./tests/data/lavf/lavf.ffm
 376832 ./tests/data/lavf/lavf.ffm
-./tests/data/lavf/lavf.ffm CRC=0xf361ed74
+./tests/data/lavf/lavf.ffm CRC=0x88f58ba1
diff --git a/tests/ref/lavf/mkv b/tests/ref/lavf/mkv
index 05eb67dd1c..8185d15a61 100644
--- a/tests/ref/lavf/mkv
+++ b/tests/ref/lavf/mkv
@@ -1,3 +1,3 @@
-a36c2d9378b9870880556ced1cb89ecf *./tests/data/lavf/lavf.mkv
-  320478 ./tests/data/lavf/lavf.mkv
-./tests/data/lavf/lavf.mkv CRC=0x2a83e6b0
+ed1f083a7d2169c51e5894dabed061df *./tests/data/lavf/lavf.mkv
+320262 ./tests/data/lavf/lavf.mkv
+./tests/data/lavf/lavf.mkv CRC=0xa79b84dd
diff --git a/tests/ref/lavf/mpg b/tests/ref/lavf/mpg
index b81cb8a8de..1b63d753fb 100644
--- a/tests/ref/lavf/mpg
+++ b/tests/ref/lavf/mpg
@@ -1,3 +1,3 @@
-dd60652c2193670abffb8c2a123a820e *./tests/data/lavf/lavf.mpg
+f0b995c0c10b08133f5138069c3e9786 *./tests/data/lavf/lavf.mpg
 372736 ./tests/data/lavf/lavf.mpg
-./tests/data/lavf/lavf.mpg CRC=0xf361ed74
+./tests/data/lavf/lavf.mpg CRC=0x88f58ba1
diff --git a/tests/ref/lavf/nut b/tests/ref/lavf/nut
index 461b18587c..e658434849 100644
--- a/tests/ref/lavf/nut
+++ b/tests/ref/lavf/nut
@@ -1,3 +1,3 @@
-16b9d2cf8effb7dae316c6b9248a49b7 *./tests/data/lavf/lavf.nut
-319888 ./tests/data/lavf/lavf.nut
-./tests/data/lavf/lavf.nut CRC=0x2a83e6b0
+7e44a8ed5ff2fe5442f758d48fe1b496 *./tests/data/lavf/lavf.nut
+319680 ./tests/data/lavf/lavf.nut
+./tests/data/lavf/lavf.nut CRC=0xa79b84dd
diff --git a/tests/ref/lavf/ogg b/tests/ref/lavf/ogg
index b90bb70276..7bc66a3657 100644
--- a/tests/ref/lavf/ogg
+++ b/tests/ref/lavf/ogg
@@ -1,3 +1,3 @@
-b55661ae1a65f99af249d8efc7619a03 *./tests/data/lavf/lavf.ogg
-13819 ./tests/data/lavf/lavf.ogg
-./tests/data/lavf/lavf.ogg CRC=0xf1ae5536
+37147a98d9a484208389efa6a1f8796f *./tests/data/lavf/lavf.ogg
+13966 ./tests/data/lavf/lavf.ogg
+./tests/data/lavf/lavf.ogg CRC=0x37a143ea
diff --git a/tests/ref/lavf/rm b/tests/ref/lavf/rm
index eae422ae4d..27054969c6 100644
--- a/tests/ref/lavf/rm
+++ b/tests/ref/lavf/rm
@@ -1,2 +1,2 @@
-2e3d6b1944c6cd2cf14e13055aecf82a *./tests/data/lavf/lavf.rm
-346706 ./tests/data/lavf/lavf.rm
+f3ce1f1850655ae43f6184ae436acb70 *./tests/data/lavf/lavf.rm
+346414 ./tests/data/lavf/lavf.rm
diff --git a/tests/ref/lavf/ts b/tests/ref/lavf/ts
index ebe6a77566..94937669e1 100644
--- a/tests/ref/lavf/ts
+++ b/tests/ref/lavf/ts
@@ -1,3 +1,3 @@
-293142d7286db15e5f4d7d1ca0d9c97c *./tests/data/lavf/lavf.ts
-406644 ./tests/data/lavf/lavf.ts
-./tests/data/lavf/lavf.ts CRC=0x133216c1
+9b59f71822c6dde18fb38b084c150285 *./tests/data/lavf/lavf.ts
+406456 ./tests/data/lavf/lavf.ts
+./tests/data/lavf/lavf.ts CRC=0x64fab4df
diff --git a/tests/ref/seek/lavf_avi b/tests/ref/seek/lavf_avi
index 964f0b8b6f..2e09ad10cc 100644
--- a/tests/ref/seek/lavf_avi
+++ b/tests/ref/seek/lavf_avi
@@ -8,7 +8,7 @@ ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 301466 size: 27864
 ret:-1         st: 0 flags:1  ts:-0.320000
 ret:-1         st: 1 flags:0  ts: 2.586122
 ret: 0         st: 1 flags:1  ts: 1.462857
-ret: 0         st: 1 flags:1 dts: 1.018776 pts: 1.018776 pos: 329774 size:   209
+ret: 0         st: 1 flags:1 dts: 0.992653 pts: 0.992653 pos: 329556 size:   209
 ret: 0         st:-1 flags:0  ts: 0.365002
 ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 156166 size: 27955
 ret:-1         st:-1 flags:1  ts:-0.740831
@@ -18,7 +18,7 @@ ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 301466 size: 27864
 ret: 0         st: 1 flags:0  ts:-0.052245
 ret: 0         st: 1 flags:1 dts: 0.000000 pts: 0.000000 pos:  37784 size:   208
 ret: 0         st: 1 flags:1  ts: 2.847347
-ret: 0         st: 1 flags:1 dts: 1.018776 pts: 1.018776 pos: 329774 size:   209
+ret: 0         st: 1 flags:1 dts: 0.992653 pts: 0.992653 pos: 329556 size:   209
 ret:-1         st:-1 flags:0  ts: 1.730004
 ret: 0         st:-1 flags:1  ts: 0.624171
 ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 156166 size: 27955
@@ -38,7 +38,7 @@ ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 301466 size: 27864
 ret:-1         st: 0 flags:1  ts:-0.240000
 ret:-1         st: 1 flags:0  ts: 2.664490
 ret: 0         st: 1 flags:1  ts: 1.567347
-ret: 0         st: 1 flags:1 dts: 1.018776 pts: 1.018776 pos: 329774 size:   209
+ret: 0         st: 1 flags:1 dts: 0.992653 pts: 0.992653 pos: 329556 size:   209
 ret: 0         st:-1 flags:0  ts: 0.460008
 ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 156166 size: 27955
 ret:-1         st:-1 flags:1  ts:-0.645825
diff --git a/tests/ref/seek/lavf_mkv b/tests/ref/seek/lavf_mkv
index 9f447b3257..f2590211df 100644
--- a/tests/ref/seek/lavf_mkv
+++ b/tests/ref/seek/lavf_mkv
@@ -10,7 +10,7 @@ ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    513 size: 27837
 ret: 0         st: 1 flags:0  ts: 2.577000
 ret:-EOF
 ret: 0         st: 1 flags:1  ts: 1.471000
-ret: 0         st: 1 flags:1 dts: 1.019000 pts: 1.019000 pos: 320207 size:   209
+ret: 0         st: 1 flags:1 dts: 0.993000 pts: 0.993000 pos: 319991 size:   209
 ret: 0         st:-1 flags:0  ts: 0.365002
 ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 146703 size: 27925
 ret: 0         st:-1 flags:1  ts:-0.740831
@@ -22,7 +22,7 @@ ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 291934 size: 27834
 ret: 0         st: 1 flags:0  ts:-0.058000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    513 size: 27837
 ret: 0         st: 1 flags:1  ts: 2.836000
-ret: 0         st: 1 flags:1 dts: 1.019000 pts: 1.019000 pos: 320207 size:   209
+ret: 0         st: 1 flags:1 dts: 0.993000 pts: 0.993000 pos: 319991 size:   209
 ret: 0         st:-1 flags:0  ts: 1.730004
 ret:-EOF
 ret: 0         st:-1 flags:1  ts: 0.624171
@@ -46,7 +46,7 @@ ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    513 size: 27837
 ret: 0         st: 1 flags:0  ts: 2.672000
 ret:-EOF
 ret: 0         st: 1 flags:1  ts: 1.566000
-ret: 0         st: 1 flags:1 dts: 1.019000 pts: 1.019000 pos: 320207 size:   209
+ret: 0         st: 1 flags:1 dts: 0.993000 pts: 0.993000 pos: 319991 size:   209
 ret: 0         st:-1 flags:0  ts: 0.460008
 ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 146703 size: 27925
 ret: 0         st:-1 flags:1  ts:-0.645825
diff --git a/tests/ref/seek/lavf_mpg b/tests/ref/seek/lavf_mpg
index dc51195293..aad61dcae2 100644
--- a/tests/ref/seek/lavf_mpg
+++ b/tests/ref/seek/lavf_mpg
@@ -8,7 +8,7 @@ ret: 0         st: 1 flags:1 dts: 1.000000 pts: 1.000000 pos:   2048 size:   208
 ret: 0         st: 0 flags:1  ts:-0.317500
 ret: 0         st: 1 flags:1 dts: 1.000000 pts: 1.000000 pos:   2048 size:   208
 ret: 0         st: 1 flags:0  ts: 2.576667
-ret: 0         st: 1 flags:1 dts: 2.018778 pts: 2.018778 pos: 370700 size:   235
+ret: 0         st: 1 flags:1 dts: 1.783678 pts: 1.783678 pos: 368652 size:   379
 ret: 0         st: 1 flags:1  ts: 1.470833
 ret: 0         st: 1 flags:1 dts: 1.261222 pts: 1.261222 pos: 145408 size:   261
 ret: 0         st:-1 flags:0  ts: 0.365002
@@ -22,7 +22,7 @@ ret: 0         st: 0 flags:0 dts: 1.040000 pts: 1.080000 pos:  40960 size: 16073
 ret: 0         st: 1 flags:0  ts:-0.058333
 ret: 0         st: 1 flags:1 dts: 1.000000 pts: 1.000000 pos:   2048 size:   208
 ret: 0         st: 1 flags:1  ts: 2.835833
-ret: 0         st: 1 flags:1 dts: 2.018778 pts: 2.018778 pos: 370700 size:   235
+ret: 0         st: 1 flags:1 dts: 1.783678 pts: 1.783678 pos: 368652 size:   379
 ret: 0         st:-1 flags:0  ts: 1.730004
 ret: 0         st: 0 flags:0 dts: 1.760000 pts: 1.800000 pos: 292864 size: 13170
 ret: 0         st:-1 flags:1  ts: 0.624171
@@ -44,7 +44,7 @@ ret: 0         st: 1 flags:1 dts: 1.000000 pts: 1.000000 pos:   2048 size:   208
 ret: 0         st: 0 flags:1  ts:-0.222489
 ret: 0         st: 1 flags:1 dts: 1.000000 pts: 1.000000 pos:   2048 size:   208
 ret: 0         st: 1 flags:0  ts: 2.671678
-ret: 0         st: 1 flags:1 dts: 2.018778 pts: 2.018778 pos: 370700 size:   235
+ret: 0         st: 1 flags:1 dts: 1.783678 pts: 1.783678 pos: 368652 size:   379
 ret: 0         st: 1 flags:1  ts: 1.565844
 ret: 0         st: 1 flags:1 dts: 1.522444 pts: 1.522444 pos: 342028 size:   314
 ret: 0         st:-1 flags:0  ts: 0.460008
diff --git a/tests/ref/seek/lavf_rm b/tests/ref/seek/lavf_rm
index 188367bc90..394928086b 100644
--- a/tests/ref/seek/lavf_rm
+++ b/tests/ref/seek/lavf_rm
@@ -7,8 +7,10 @@ ret: 0         st: 0 flags:0  ts: 0.788000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 314982 size: 31143
 ret: 0         st: 0 flags:1  ts:-0.317000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    398 size: 31082
-ret:-1         st: 1 flags:0  ts: 2.577000
-ret:-1         st: 1 flags:1  ts: 1.471000
+ret: 0         st: 1 flags:0  ts: 2.577000
+ret: 0         st: 1 flags:1 dts: 0.975000 pts: 0.975000 pos: 346128 size:   278
+ret: 0         st: 1 flags:1  ts: 1.471000
+ret: 0         st: 1 flags:1 dts: 0.975000 pts: 0.975000 pos: 346128 size:   278
 ret: 0         st:-1 flags:0  ts: 0.365002
 ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 158515 size: 31134
 ret: 0         st:-1 flags:1  ts:-0.740831
@@ -19,7 +21,8 @@ ret: 0         st: 0 flags:1  ts: 1.048000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 314982 size: 31143
 ret: 0         st: 1 flags:0  ts:-0.058000
 ret: 0         st: 1 flags:1 dts: 0.000000 pts: 0.000000 pos:  31483 size:   278
-ret:-1         st: 1 flags:1  ts: 2.836000
+ret: 0         st: 1 flags:1  ts: 2.836000
+ret: 0         st: 1 flags:1 dts: 0.975000 pts: 0.975000 pos: 346128 size:   278
 ret: 0         st:-1 flags:0  ts: 1.730004
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 314982 size: 31143
 ret: 0         st:-1 flags:1  ts: 0.624171
@@ -28,7 +31,8 @@ ret: 0         st: 0 flags:0  ts:-0.482000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    398 size: 31082
 ret: 0         st: 0 flags:1  ts: 2.413000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 314982 size: 31143
-ret:-1         st: 1 flags:0  ts: 1.307000
+ret: 0         st: 1 flags:0  ts: 1.307000
+ret: 0         st: 1 flags:1 dts: 0.975000 pts: 0.975000 pos: 346128 size:   278
 ret: 0         st: 1 flags:1  ts: 0.201000
 ret: 0         st: 1 flags:1 dts: 0.174000 pts: 0.174000 pos:  78969 size:   278
 ret: 0         st:-1 flags:0  ts:-0.904994
@@ -39,8 +43,10 @@ ret: 0         st: 0 flags:0  ts: 0.883000
 ret: 0         st: 0 flags:1 dts: 0.960000 pts: 0.960000 pos: 314982 size: 31143
 ret: 0         st: 0 flags:1  ts:-0.222000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    398 size: 31082
-ret:-1         st: 1 flags:0  ts: 2.672000
-ret:-1         st: 1 flags:1  ts: 1.566000
+ret: 0         st: 1 flags:0  ts: 2.672000
+ret: 0         st: 1 flags:1 dts: 0.975000 pts: 0.975000 pos: 346128 size:   278
+ret: 0         st: 1 flags:1  ts: 1.566000
+ret: 0         st: 1 flags:1 dts: 0.975000 pts: 0.975000 pos: 346128 size:   278
 ret: 0         st:-1 flags:0  ts: 0.460008
 ret: 0         st: 0 flags:1 dts: 0.480000 pts: 0.480000 pos: 158515 size: 31134
 ret: 0         st:-1 flags:1  ts:-0.645825
-- 
cgit v1.2.3


From 3101bb6669ea4ccb33232b353ef0efcc178c65e7 Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Fri, 3 Feb 2012 15:32:51 +0100
Subject: avconv: rename InputStream.next_pts to next_dts.

It's used to predict dts, not pts.
---
 avconv.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/avconv.c b/avconv.c
index 30782e1826..8ad6fcd412 100644
--- a/avconv.c
+++ b/avconv.c
@@ -168,8 +168,9 @@ typedef struct InputStream {
     AVFrame *filtered_frame;
 
     int64_t       start;     /* time when read started */
-    int64_t       next_pts;  /* synthetic pts for cases where pkt.pts
-                                is not defined */
+    /* predicted dts of the next packet read for this stream or (when there are
+     * several frames in a packet) of the next frame in current packet */
+    int64_t       next_dts;
     int64_t       pts;       /* current pts */
     PtsCorrectionContext pts_ctx;
     double ts_scale;
@@ -1847,11 +1848,11 @@ static int transcode_audio(InputStream *ist, AVPacket *pkt, int *got_output)
     /* if the decoder provides a pts, use it instead of the last packet pts.
        the decoder could be delaying output by a packet or more. */
     if (decoded_frame->pts != AV_NOPTS_VALUE)
-        ist->next_pts = decoded_frame->pts;
+        ist->next_dts = decoded_frame->pts;
 
-    /* increment next_pts to use for the case where the input stream does not
+    /* increment next_dts to use for the case where the input stream does not
        have timestamps or there are multiple frames in the packet */
-    ist->next_pts += ((int64_t)AV_TIME_BASE * decoded_frame->nb_samples) /
+    ist->next_dts += ((int64_t)AV_TIME_BASE * decoded_frame->nb_samples) /
                      avctx->sample_rate;
 
     // preprocess audio (volume)
@@ -1954,14 +1955,14 @@ static int transcode_video(InputStream *ist, AVPacket *pkt, int *got_output, int
         /* no picture yet */
         return ret;
     }
-    ist->next_pts = ist->pts = guess_correct_pts(&ist->pts_ctx, decoded_frame->pkt_pts,
+    ist->next_dts = ist->pts = guess_correct_pts(&ist->pts_ctx, decoded_frame->pkt_pts,
                                                  decoded_frame->pkt_dts);
     if (pkt->duration)
-        ist->next_pts += av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
+        ist->next_dts += av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
     else if (ist->st->codec->time_base.num != 0) {
         int ticks      = ist->st->parser ? ist->st->parser->repeat_pict + 1 :
                                            ist->st->codec->ticks_per_frame;
-        ist->next_pts += ((int64_t)AV_TIME_BASE *
+        ist->next_dts += ((int64_t)AV_TIME_BASE *
                           ist->st->codec->time_base.num * ticks) /
                           ist->st->codec->time_base.den;
     }
@@ -2091,8 +2092,8 @@ static int output_packet(InputStream *ist,
     int64_t pkt_pts = AV_NOPTS_VALUE;
     AVPacket avpkt;
 
-    if (ist->next_pts == AV_NOPTS_VALUE)
-        ist->next_pts = ist->pts;
+    if (ist->next_dts == AV_NOPTS_VALUE)
+        ist->next_dts = ist->pts;
 
     if (pkt == NULL) {
         /* EOF handling */
@@ -2105,7 +2106,7 @@ static int output_packet(InputStream *ist,
     }
 
     if (pkt->dts != AV_NOPTS_VALUE)
-        ist->next_pts = ist->pts = av_rescale_q(pkt->dts, ist->st->time_base, AV_TIME_BASE_Q);
+        ist->next_dts = ist->pts = av_rescale_q(pkt->dts, ist->st->time_base, AV_TIME_BASE_Q);
     if (pkt->pts != AV_NOPTS_VALUE)
         pkt_pts = av_rescale_q(pkt->pts, ist->st->time_base, AV_TIME_BASE_Q);
 
@@ -2114,7 +2115,7 @@ static int output_packet(InputStream *ist,
         int ret = 0;
     handle_eof:
 
-        ist->pts = ist->next_pts;
+        ist->pts = ist->next_dts;
 
         if (avpkt.size && avpkt.size != pkt->size) {
             av_log(NULL, ist->showed_multi_packet_warning ? AV_LOG_VERBOSE : AV_LOG_WARNING,
@@ -2151,16 +2152,16 @@ static int output_packet(InputStream *ist,
     /* handle stream copy */
     if (!ist->decoding_needed) {
         rate_emu_sleep(ist);
-        ist->pts = ist->next_pts;
+        ist->pts = ist->next_dts;
         switch (ist->st->codec->codec_type) {
         case AVMEDIA_TYPE_AUDIO:
-            ist->next_pts += ((int64_t)AV_TIME_BASE * ist->st->codec->frame_size) /
+            ist->next_dts += ((int64_t)AV_TIME_BASE * ist->st->codec->frame_size) /
                              ist->st->codec->sample_rate;
             break;
         case AVMEDIA_TYPE_VIDEO:
             if (ist->st->codec->time_base.num != 0) {
                 int ticks = ist->st->parser ? ist->st->parser->repeat_pict + 1 : ist->st->codec->ticks_per_frame;
-                ist->next_pts += ((int64_t)AV_TIME_BASE *
+                ist->next_dts += ((int64_t)AV_TIME_BASE *
                                   ist->st->codec->time_base.num * ticks) /
                                   ist->st->codec->time_base.den;
             }
@@ -2237,7 +2238,7 @@ static int init_input_stream(int ist_index, OutputStream *output_streams, int nb
     }
 
     ist->pts = ist->st->avg_frame_rate.num ? - ist->st->codec->has_b_frames * AV_TIME_BASE / av_q2d(ist->st->avg_frame_rate) : 0;
-    ist->next_pts = AV_NOPTS_VALUE;
+    ist->next_dts = AV_NOPTS_VALUE;
     init_pts_correction(&ist->pts_ctx);
     ist->is_start = 1;
 
@@ -2759,13 +2760,13 @@ static int transcode(OutputFile *output_files,
             pkt.dts *= ist->ts_scale;
 
         //fprintf(stderr, "next:%"PRId64" dts:%"PRId64" off:%"PRId64" %d\n",
-        //        ist->next_pts,
+        //        ist->next_dts,
         //        pkt.dts, input_files[ist->file_index].ts_offset,
         //        ist->st->codec->codec_type);
-        if (pkt.dts != AV_NOPTS_VALUE && ist->next_pts != AV_NOPTS_VALUE
+        if (pkt.dts != AV_NOPTS_VALUE && ist->next_dts != AV_NOPTS_VALUE
             && (is->iformat->flags & AVFMT_TS_DISCONT)) {
             int64_t pkt_dts = av_rescale_q(pkt.dts, ist->st->time_base, AV_TIME_BASE_Q);
-            int64_t delta   = pkt_dts - ist->next_pts;
+            int64_t delta   = pkt_dts - ist->next_dts;
             if ((FFABS(delta) > 1LL * dts_delta_threshold * AV_TIME_BASE || pkt_dts + 1 < ist->pts) && !copy_ts) {
                 input_files[ist->file_index].ts_offset -= delta;
                 av_log(NULL, AV_LOG_DEBUG,
-- 
cgit v1.2.3


From 6e983902589fadfe6de799e08f0cfb2c377a4acd Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Sun, 5 Feb 2012 20:56:24 +0100
Subject: avconv: reduce overloading for InputStream.pts.

It currently has different meanings at different times (dts of the last
read packet/pts of the last decoded frame). Reduce obfuscation by
storing pts of the decoded frame in the frame itself.
---
 avconv.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/avconv.c b/avconv.c
index 8ad6fcd412..ab4fb7c980 100644
--- a/avconv.c
+++ b/avconv.c
@@ -866,11 +866,10 @@ static void choose_pixel_fmt(AVStream *st, AVCodec *codec)
 }
 
 static double
-get_sync_ipts(const OutputStream *ost)
+get_sync_ipts(const OutputStream *ost, int64_t pts)
 {
-    const InputStream *ist = ost->sync_ist;
     OutputFile *of = &output_files[ost->file_index];
-    return (double)(ist->pts - of->start_time) / AV_TIME_BASE;
+    return (double)(pts - of->start_time) / AV_TIME_BASE;
 }
 
 static void write_frame(AVFormatContext *s, AVPacket *pkt, OutputStream *ost)
@@ -1094,7 +1093,7 @@ need_realloc:
     }
 
     if (audio_sync_method) {
-        double delta = get_sync_ipts(ost) * enc->sample_rate - ost->sync_opts -
+        double delta = get_sync_ipts(ost, ist->pts) * enc->sample_rate - ost->sync_opts -
                        av_fifo_size(ost->fifo) / (enc->channels * osize);
         int idelta = delta * dec->sample_rate / enc->sample_rate;
         int byte_delta = idelta * isize * dec->channels;
@@ -1137,7 +1136,7 @@ need_realloc:
             }
         }
     } else
-        ost->sync_opts = lrintf(get_sync_ipts(ost) * enc->sample_rate) -
+        ost->sync_opts = lrintf(get_sync_ipts(ost, ist->pts) * enc->sample_rate) -
                                 av_fifo_size(ost->fifo) / (enc->channels * osize); // FIXME wrong
 
     if (ost->audio_resample) {
@@ -1361,7 +1360,7 @@ static void do_video_out(AVFormatContext *s,
 
     enc = ost->st->codec;
 
-    sync_ipts = get_sync_ipts(ost) / av_q2d(enc->time_base);
+    sync_ipts = get_sync_ipts(ost, in_picture->pts) / av_q2d(enc->time_base);
 
     /* by default, we output a single frame */
     nb_frames = 1;
@@ -1955,8 +1954,8 @@ static int transcode_video(InputStream *ist, AVPacket *pkt, int *got_output, int
         /* no picture yet */
         return ret;
     }
-    ist->next_dts = ist->pts = guess_correct_pts(&ist->pts_ctx, decoded_frame->pkt_pts,
-                                                 decoded_frame->pkt_dts);
+    ist->next_dts = decoded_frame->pts = guess_correct_pts(&ist->pts_ctx, decoded_frame->pkt_pts,
+                                                           decoded_frame->pkt_dts);
     if (pkt->duration)
         ist->next_dts += av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
     else if (ist->st->codec->time_base.num != 0) {
@@ -2011,7 +2010,6 @@ static int transcode_video(InputStream *ist, AVPacket *pkt, int *got_output, int
                                         ist->st->codec->pix_fmt);
 
             avfilter_copy_frame_props(fb, decoded_frame);
-            fb->pts                 = ist->pts;
             fb->buf->priv           = buf;
             fb->buf->free           = filter_release_buffer;
 
@@ -2019,7 +2017,7 @@ static int transcode_video(InputStream *ist, AVPacket *pkt, int *got_output, int
             av_buffersrc_buffer(ost->input_video_filter, fb);
         } else
             av_vsrc_buffer_add_frame(ost->input_video_filter, decoded_frame,
-                                     ist->pts, decoded_frame->sample_aspect_ratio);
+                                     decoded_frame->pts, decoded_frame->sample_aspect_ratio);
 
         if (!ist->filtered_frame && !(ist->filtered_frame = avcodec_alloc_frame())) {
             av_free(buffer_to_free);
@@ -2034,7 +2032,7 @@ static int transcode_video(InputStream *ist, AVPacket *pkt, int *got_output, int
             if (ost->output_video_filter)
                 get_filtered_video_frame(ost->output_video_filter, filtered_frame, &ost->picref, &ist_pts_tb);
             if (ost->picref)
-                ist->pts = av_rescale_q(ost->picref->pts, ist_pts_tb, AV_TIME_BASE_Q);
+                filtered_frame->pts = av_rescale_q(ost->picref->pts, ist_pts_tb, AV_TIME_BASE_Q);
             if (ost->picref->video && !ost->frame_aspect_ratio)
                 ost->st->codec->sample_aspect_ratio = ost->picref->video->pixel_aspect;
 #else
-- 
cgit v1.2.3


From 23576b3fbba3f5438fbd6aa6e5e73a39682b942b Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Sun, 5 Feb 2012 21:12:43 +0100
Subject: avconv: rename InputStream.pts to last_dts.

It more accurately describes what this variable stores.
---
 avconv.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/avconv.c b/avconv.c
index ab4fb7c980..a4f67ff6fd 100644
--- a/avconv.c
+++ b/avconv.c
@@ -171,7 +171,8 @@ typedef struct InputStream {
     /* predicted dts of the next packet read for this stream or (when there are
      * several frames in a packet) of the next frame in current packet */
     int64_t       next_dts;
-    int64_t       pts;       /* current pts */
+    /* dts of the last packet read for this stream */
+    int64_t       last_dts;
     PtsCorrectionContext pts_ctx;
     double ts_scale;
     int is_start;            /* is 1 at the start and after a discontinuity */
@@ -1093,7 +1094,7 @@ need_realloc:
     }
 
     if (audio_sync_method) {
-        double delta = get_sync_ipts(ost, ist->pts) * enc->sample_rate - ost->sync_opts -
+        double delta = get_sync_ipts(ost, ist->last_dts) * enc->sample_rate - ost->sync_opts -
                        av_fifo_size(ost->fifo) / (enc->channels * osize);
         int idelta = delta * dec->sample_rate / enc->sample_rate;
         int byte_delta = idelta * isize * dec->channels;
@@ -1136,7 +1137,7 @@ need_realloc:
             }
         }
     } else
-        ost->sync_opts = lrintf(get_sync_ipts(ost, ist->pts) * enc->sample_rate) -
+        ost->sync_opts = lrintf(get_sync_ipts(ost, ist->last_dts) * enc->sample_rate) -
                                 av_fifo_size(ost->fifo) / (enc->channels * osize); // FIXME wrong
 
     if (ost->audio_resample) {
@@ -1748,7 +1749,7 @@ static int check_output_constraints(InputStream *ist, OutputStream *ost)
     if (ost->source_index != ist_index)
         return 0;
 
-    if (of->start_time && ist->pts < of->start_time)
+    if (of->start_time && ist->last_dts < of->start_time)
         return 0;
 
     return 1;
@@ -1767,7 +1768,7 @@ static void do_streamcopy(InputStream *ist, OutputStream *ost, const AVPacket *p
         return;
 
     if (of->recording_time != INT64_MAX &&
-        ist->pts >= of->recording_time + of->start_time) {
+        ist->last_dts >= of->recording_time + of->start_time) {
         ost->is_past_recording_time = 1;
         return;
     }
@@ -1786,7 +1787,7 @@ static void do_streamcopy(InputStream *ist, OutputStream *ost, const AVPacket *p
         opkt.pts = AV_NOPTS_VALUE;
 
     if (pkt->dts == AV_NOPTS_VALUE)
-        opkt.dts = av_rescale_q(ist->pts, AV_TIME_BASE_Q, ost->st->time_base);
+        opkt.dts = av_rescale_q(ist->last_dts, AV_TIME_BASE_Q, ost->st->time_base);
     else
         opkt.dts = av_rescale_q(pkt->dts, ist->st->time_base, ost->st->time_base);
     opkt.dts -= ost_tb_start_time;
@@ -1814,7 +1815,7 @@ static void do_streamcopy(InputStream *ist, OutputStream *ost, const AVPacket *p
 static void rate_emu_sleep(InputStream *ist)
 {
     if (input_files[ist->file_index].rate_emu) {
-        int64_t pts = av_rescale(ist->pts, 1000000, AV_TIME_BASE);
+        int64_t pts = av_rescale(ist->last_dts, 1000000, AV_TIME_BASE);
         int64_t now = av_gettime() - ist->start;
         if (pts > now)
             usleep(pts - now);
@@ -1941,7 +1942,7 @@ static int transcode_video(InputStream *ist, AVPacket *pkt, int *got_output, int
         avcodec_get_frame_defaults(ist->decoded_frame);
     decoded_frame = ist->decoded_frame;
     pkt->pts  = *pkt_pts;
-    pkt->dts  = ist->pts;
+    pkt->dts  = ist->last_dts;
     *pkt_pts  = AV_NOPTS_VALUE;
 
     ret = avcodec_decode_video2(ist->st->codec,
@@ -2091,7 +2092,7 @@ static int output_packet(InputStream *ist,
     AVPacket avpkt;
 
     if (ist->next_dts == AV_NOPTS_VALUE)
-        ist->next_dts = ist->pts;
+        ist->next_dts = ist->last_dts;
 
     if (pkt == NULL) {
         /* EOF handling */
@@ -2104,7 +2105,7 @@ static int output_packet(InputStream *ist,
     }
 
     if (pkt->dts != AV_NOPTS_VALUE)
-        ist->next_dts = ist->pts = av_rescale_q(pkt->dts, ist->st->time_base, AV_TIME_BASE_Q);
+        ist->next_dts = ist->last_dts = av_rescale_q(pkt->dts, ist->st->time_base, AV_TIME_BASE_Q);
     if (pkt->pts != AV_NOPTS_VALUE)
         pkt_pts = av_rescale_q(pkt->pts, ist->st->time_base, AV_TIME_BASE_Q);
 
@@ -2113,7 +2114,7 @@ static int output_packet(InputStream *ist,
         int ret = 0;
     handle_eof:
 
-        ist->pts = ist->next_dts;
+        ist->last_dts = ist->next_dts;
 
         if (avpkt.size && avpkt.size != pkt->size) {
             av_log(NULL, ist->showed_multi_packet_warning ? AV_LOG_VERBOSE : AV_LOG_WARNING,
@@ -2150,7 +2151,7 @@ static int output_packet(InputStream *ist,
     /* handle stream copy */
     if (!ist->decoding_needed) {
         rate_emu_sleep(ist);
-        ist->pts = ist->next_dts;
+        ist->last_dts = ist->next_dts;
         switch (ist->st->codec->codec_type) {
         case AVMEDIA_TYPE_AUDIO:
             ist->next_dts += ((int64_t)AV_TIME_BASE * ist->st->codec->frame_size) /
@@ -2235,7 +2236,7 @@ static int init_input_stream(int ist_index, OutputStream *output_streams, int nb
         assert_avoptions(ist->opts);
     }
 
-    ist->pts = ist->st->avg_frame_rate.num ? - ist->st->codec->has_b_frames * AV_TIME_BASE / av_q2d(ist->st->avg_frame_rate) : 0;
+    ist->last_dts = ist->st->avg_frame_rate.num ? - ist->st->codec->has_b_frames * AV_TIME_BASE / av_q2d(ist->st->avg_frame_rate) : 0;
     ist->next_dts = AV_NOPTS_VALUE;
     init_pts_correction(&ist->pts_ctx);
     ist->is_start = 1;
@@ -2685,7 +2686,7 @@ static int transcode(OutputFile *output_files,
                 (os->pb && avio_tell(os->pb) >= of->limit_filesize))
                 continue;
             opts = ost->st->pts.val * av_q2d(ost->st->time_base);
-            ipts = ist->pts;
+            ipts = ist->last_dts;
             if (!input_files[ist->file_index].eof_reached) {
                 if (ipts < ipts_min) {
                     ipts_min = ipts;
@@ -2765,7 +2766,7 @@ static int transcode(OutputFile *output_files,
             && (is->iformat->flags & AVFMT_TS_DISCONT)) {
             int64_t pkt_dts = av_rescale_q(pkt.dts, ist->st->time_base, AV_TIME_BASE_Q);
             int64_t delta   = pkt_dts - ist->next_dts;
-            if ((FFABS(delta) > 1LL * dts_delta_threshold * AV_TIME_BASE || pkt_dts + 1 < ist->pts) && !copy_ts) {
+            if ((FFABS(delta) > 1LL * dts_delta_threshold * AV_TIME_BASE || pkt_dts + 1 < ist->last_dts) && !copy_ts) {
                 input_files[ist->file_index].ts_offset -= delta;
                 av_log(NULL, AV_LOG_DEBUG,
                        "timestamp discontinuity %"PRId64", new offset= %"PRId64"\n",
-- 
cgit v1.2.3


From b34856a116f7ea4748ee7c89bfe1f1bc135d079d Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Thu, 2 Feb 2012 12:21:37 +0100
Subject: avconv: better next_dts usage.

next_dts is used for estimating the dts of the next packet if it's
missing. Therefore, it makes no sense to set it from the pts of the last
decoded frame. Also, it should always be estimated from the current
packet's duration/ticks_per_frame, not only when a frame was
successfully decoded.
---
 avconv.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/avconv.c b/avconv.c
index a4f67ff6fd..92af92430c 100644
--- a/avconv.c
+++ b/avconv.c
@@ -1955,17 +1955,8 @@ static int transcode_video(InputStream *ist, AVPacket *pkt, int *got_output, int
         /* no picture yet */
         return ret;
     }
-    ist->next_dts = decoded_frame->pts = guess_correct_pts(&ist->pts_ctx, decoded_frame->pkt_pts,
-                                                           decoded_frame->pkt_dts);
-    if (pkt->duration)
-        ist->next_dts += av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
-    else if (ist->st->codec->time_base.num != 0) {
-        int ticks      = ist->st->parser ? ist->st->parser->repeat_pict + 1 :
-                                           ist->st->codec->ticks_per_frame;
-        ist->next_dts += ((int64_t)AV_TIME_BASE *
-                          ist->st->codec->time_base.num * ticks) /
-                          ist->st->codec->time_base.den;
-    }
+    decoded_frame->pts = guess_correct_pts(&ist->pts_ctx, decoded_frame->pkt_pts,
+                                           decoded_frame->pkt_dts);
     pkt->size = 0;
     pre_process_video_frame(ist, (AVPicture *)decoded_frame, &buffer_to_free);
 
@@ -2128,6 +2119,13 @@ static int output_packet(InputStream *ist,
             break;
         case AVMEDIA_TYPE_VIDEO:
             ret = transcode_video    (ist, &avpkt, &got_output, &pkt_pts);
+            if (avpkt.duration)
+                ist->next_dts += av_rescale_q(avpkt.duration, ist->st->time_base, AV_TIME_BASE_Q);
+            else if (ist->st->codec->time_base.num != 0) {
+                int ticks      = ist->st->parser ? ist->st->parser->repeat_pict + 1 :
+                                                   ist->st->codec->ticks_per_frame;
+                ist->next_dts += av_rescale_q(ticks, ist->st->codec->time_base, AV_TIME_BASE_Q);
+            }
             break;
         case AVMEDIA_TYPE_SUBTITLE:
             ret = transcode_subtitles(ist, &avpkt, &got_output);
-- 
cgit v1.2.3


From 722410ade56f67b474e4d0e5684e66d4942f5973 Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Thu, 2 Feb 2012 13:18:49 +0100
Subject: avconv: estimate next_dts from framerate if it is set.

---
 avconv.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/avconv.c b/avconv.c
index 92af92430c..0c3d384571 100644
--- a/avconv.c
+++ b/avconv.c
@@ -2121,6 +2121,10 @@ static int output_packet(InputStream *ist,
             ret = transcode_video    (ist, &avpkt, &got_output, &pkt_pts);
             if (avpkt.duration)
                 ist->next_dts += av_rescale_q(avpkt.duration, ist->st->time_base, AV_TIME_BASE_Q);
+            else if (ist->st->r_frame_rate.num)
+                ist->next_dts += av_rescale_q(1, (AVRational){ist->st->r_frame_rate.den,
+                                                              ist->st->r_frame_rate.num},
+                                              AV_TIME_BASE_Q);
             else if (ist->st->codec->time_base.num != 0) {
                 int ticks      = ist->st->parser ? ist->st->parser->repeat_pict + 1 :
                                                    ist->st->codec->ticks_per_frame;
-- 
cgit v1.2.3


From fb90785e98ac405198c0ca9fec133227f6d82826 Mon Sep 17 00:00:00 2001
From: Ronald S. Bultje
Date: Tue, 31 Jan 2012 15:17:59 -0800
Subject: vp8: always update next_framep[] before returning from
 decode_frame().

Also slightly move code around to not allocate a new frame if we won't
decode it. This prevents us from putting undecoded frames in frame
pointers, which (in mt decoding) other threads will use and wait on
as references, causing a deadlock (if we skipped decoding) or a crash
(if we didn't initialize next_framep[] at all).

Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
---
 libavcodec/vp8.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 7669af1ed1..7cf18c07cd 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1571,7 +1571,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     release_queued_segmaps(s, 0);
 
     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
-        return ret;
+        goto err;
 
     prev_frame = s->framep[VP56_FRAME_CURRENT];
 
@@ -1583,6 +1583,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
 
     if (avctx->skip_frame >= skip_thresh) {
         s->invisible = 1;
+        memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
         goto skip_decode;
     }
     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
@@ -1612,12 +1613,23 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     if (curframe->data[0])
         vp8_release_frame(s, curframe, 1, 0);
 
+    // Given that arithmetic probabilities are updated every frame, it's quite likely
+    // that the values we have on a random interframe are complete junk if we didn't
+    // start decode on a keyframe. So just don't display anything rather than junk.
+    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
+                         !s->framep[VP56_FRAME_GOLDEN] ||
+                         !s->framep[VP56_FRAME_GOLDEN2])) {
+        av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
     curframe->key_frame = s->keyframe;
     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
     curframe->reference = referenced ? 3 : 0;
     if ((ret = vp8_alloc_frame(s, curframe))) {
         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
-        return ret;
+        goto err;
     }
 
     // check if golden and altref are swapped
@@ -1640,16 +1652,6 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
 
     ff_thread_finish_setup(avctx);
 
-    // Given that arithmetic probabilities are updated every frame, it's quite likely
-    // that the values we have on a random interframe are complete junk if we didn't
-    // start decode on a keyframe. So just don't display anything rather than junk.
-    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
-                         !s->framep[VP56_FRAME_GOLDEN] ||
-                         !s->framep[VP56_FRAME_GOLDEN2])) {
-        av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
-        return AVERROR_INVALIDDATA;
-    }
-
     s->linesize   = curframe->linesize[0];
     s->uvlinesize = curframe->linesize[1];
 
@@ -1759,20 +1761,23 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     }
 
     ff_thread_report_progress(curframe, INT_MAX, 0);
+    memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
+
 skip_decode:
     // if future frames don't use the updated probabilities,
     // reset them to the values we saved
     if (!s->update_probabilities)
         s->prob[0] = s->prob[1];
 
-    memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
-
     if (!s->invisible) {
         *(AVFrame*)data = *curframe;
         *data_size = sizeof(AVFrame);
     }
 
     return avpkt->size;
+err:
+    memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
+    return ret;
 }
 
 static av_cold int vp8_decode_init(AVCodecContext *avctx)
-- 
cgit v1.2.3


From b7542dd3d71d1ee873277020b6a8eab2674bb167 Mon Sep 17 00:00:00 2001
From: Ronald S. Bultje
Date: Tue, 7 Feb 2012 11:33:20 -0800
Subject: swscale: fix V plane memory location in bilinear/unscaled RGB/YUYV
 case.

Fixes bug 221.

CC: libav-stable@libav.org
---
 libswscale/x86/swscale_template.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index ea2e83750e..c6f90f69f1 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -688,10 +688,10 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
     "1:                                 \n\t"\
     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
-    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
-    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
@@ -919,10 +919,10 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
     "1:                                 \n\t"\
     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
-    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
-    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
@@ -974,9 +974,9 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
     ".p2align              4            \n\t"\
     "1:                                 \n\t"\
     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
-    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
-    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
@@ -1027,10 +1027,10 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
     "1:                                 \n\t"\
     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
-    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
-    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
@@ -1294,9 +1294,9 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
     ".p2align              4            \n\t"\
     "1:                                 \n\t"\
     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
-    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
-    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "psraw                $7, %%mm3     \n\t" \
     "psraw                $7, %%mm4     \n\t" \
     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
@@ -1312,10 +1312,10 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
     "1:                                 \n\t"\
     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
-    "add   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
-    "sub   "UV_OFF_PX"("#c"), "#index"  \n\t" \
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
     "psrlw                $8, %%mm3     \n\t" \
-- 
cgit v1.2.3


From 3206cccc0ee2166e994bb0c698178ee55b8b4c59 Mon Sep 17 00:00:00 2001
From: Michael Kostylev
Date: Sat, 4 Feb 2012 00:16:35 -0800
Subject: h264: mark h264_idct_add8_10 with number of XMM registers.

This fixes XMM register clobber problems on Win64.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
---
 libavcodec/x86/h264_idct_10bit.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 62a528b340..501c2a4da1 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -315,7 +315,7 @@ IDCT_ADD16INTRA_10 avx
 ; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
 ;-----------------------------------------------------------------------------
 %macro IDCT_ADD8 1
-cglobal h264_idct_add8_10_%1,5,7
+cglobal h264_idct_add8_10_%1,5,7,7
 %if ARCH_X86_64
     mov r10, r0
 %endif
-- 
cgit v1.2.3


From ef1c785f11c168384e42d147648c8fdf5317739b Mon Sep 17 00:00:00 2001
From: Ronald S. Bultje
Date: Sat, 4 Feb 2012 10:00:27 +0100
Subject: swscale: make yuv2yuv1 use named registers.

---
 libswscale/x86/output.asm | 50 +++++++++++++++++++++++------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index a288f08867..8a72d55533 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -273,17 +273,17 @@ yuv2planeX_fn 10,  7, 5
 %macro yuv2plane1_mainloop 2
 .loop_%2:
 %if %1 == 8
-    paddsw          m0, m2, [r0+r2*2+mmsize*0]
-    paddsw          m1, m3, [r0+r2*2+mmsize*1]
+    paddsw          m0, m2, [srcq+dstwq*2+mmsize*0]
+    paddsw          m1, m3, [srcq+dstwq*2+mmsize*1]
     psraw           m0, 7
     psraw           m1, 7
     packuswb        m0, m1
     mov%2      [r1+r2], m0
 %elif %1 == 16
-    paddd           m0, m4, [r0+r2*4+mmsize*0]
-    paddd           m1, m4, [r0+r2*4+mmsize*1]
-    paddd           m2, m4, [r0+r2*4+mmsize*2]
-    paddd           m3, m4, [r0+r2*4+mmsize*3]
+    paddd           m0, m4, [srcq+dstwq*4+mmsize*0]
+    paddd           m1, m4, [srcq+dstwq*4+mmsize*1]
+    paddd           m2, m4, [srcq+dstwq*4+mmsize*2]
+    paddd           m3, m4, [srcq+dstwq*4+mmsize*3]
     psrad           m0, 3
     psrad           m1, 3
     psrad           m2, 3
@@ -297,46 +297,46 @@ yuv2planeX_fn 10,  7, 5
     paddw           m0, m5
     paddw           m2, m5
 %endif ; mmx/sse2/sse4/avx
-    mov%2    [r1+r2*2], m0
-    mov%2    [r1+r2*2+mmsize], m2
-%else
-    paddsw          m0, m2, [r0+r2*2+mmsize*0]
-    paddsw          m1, m2, [r0+r2*2+mmsize*1]
+    mov%2    [dstq+dstwq*2+mmsize*0], m0
+    mov%2    [dstq+dstwq*2+mmsize*1], m2
+%else ; %1 == 9/10
+    paddsw          m0, m2, [srcq+dstwq*2+mmsize*0]
+    paddsw          m1, m2, [srcq+dstwq*2+mmsize*1]
     psraw           m0, 15 - %1
     psraw           m1, 15 - %1
     pmaxsw          m0, m4
     pmaxsw          m1, m4
     pminsw          m0, m3
     pminsw          m1, m3
-    mov%2    [r1+r2*2], m0
-    mov%2    [r1+r2*2+mmsize], m1
+    mov%2    [dstq+dstwq*2+mmsize*0], m0
+    mov%2    [dstq+dstwq*2+mmsize*1], m1
 %endif
-    add             r2, mmsize
+    add          dstwq, mmsize
     jl .loop_%2
 %endmacro
 
 %macro yuv2plane1_fn 3
-cglobal yuv2plane1_%1, %3, %3, %2
-    add             r2, mmsize - 1
-    and             r2, ~(mmsize - 1)
+cglobal yuv2plane1_%1, %3, %3, %2, src, dst, dstw, dither, offset
+    add          dstwq, mmsize - 1
+    and          dstwq, ~(mmsize - 1)
 %if %1 == 8
-    add             r1, r2
+    add           dstq, dstwq
 %else ; %1 != 8
-    lea             r1, [r1+r2*2]
+    lea           dstq, [dstq+dstwq*2]
 %endif ; %1 == 8
 %if %1 == 16
-    lea             r0, [r0+r2*4]
+    lea           srcq, [srcq+dstwq*4]
 %else ; %1 != 16
-    lea             r0, [r0+r2*2]
+    lea           srcq, [srcq+dstwq*2]
 %endif ; %1 == 16
-    neg             r2
+    neg          dstwq
 
 %if %1 == 8
     pxor            m4, m4               ; zero
 
     ; create registers holding dither
-    movq            m3, [r3]             ; dither
-    test           r4d, r4d
+    movq            m3, [ditherq]        ; dither
+    test       offsetd, offsetd
     jz              .no_rot
 %if mmsize == 16
     punpcklqdq      m3, m3
@@ -372,7 +372,7 @@ cglobal yuv2plane1_%1, %3, %3, %2
 %if mmsize == 8
     yuv2plane1_mainloop %1, a
 %else ; mmsize == 16
-    test            r1, 15
+    test          dstq, 15
     jnz .unaligned
     yuv2plane1_mainloop %1, a
     REP_RET
-- 
cgit v1.2.3