-rw-r--r--   libavcodec/arm/Makefile                           5
-rw-r--r--   libavcodec/arm/vp9dsp_init.h                     29
-rw-r--r--   libavcodec/arm/vp9dsp_init_10bpp_arm.c           23
-rw-r--r--   libavcodec/arm/vp9dsp_init_12bpp_arm.c           23
-rw-r--r--   libavcodec/arm/vp9dsp_init_16bpp_arm_template.c 147
-rw-r--r--   libavcodec/arm/vp9dsp_init_arm.c                  9
-rw-r--r--   libavcodec/arm/vp9mc_16bpp_neon.S               615
7 files changed, 849 insertions, 2 deletions
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 7f18daa7de..fb35d253b9 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -44,7 +44,9 @@ OBJS-$(CONFIG_MLP_DECODER)             += arm/mlpdsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER)            += arm/rv40dsp_init_arm.o
OBJS-$(CONFIG_VORBIS_DECODER)          += arm/vorbisdsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER)             += arm/vp6dsp_init_arm.o
-OBJS-$(CONFIG_VP9_DECODER)             += arm/vp9dsp_init_arm.o
+OBJS-$(CONFIG_VP9_DECODER)             += arm/vp9dsp_init_10bpp_arm.o   \
+                                          arm/vp9dsp_init_12bpp_arm.o   \
+                                          arm/vp9dsp_init_arm.o

# ARMv5 optimizations
@@ -142,4 +144,5 @@ NEON-OBJS-$(CONFIG_VORBIS_DECODER)     += arm/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp6dsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER)        += arm/vp9itxfm_neon.o           \
                                          arm/vp9lpf_neon.o             \
+                                         arm/vp9mc_16bpp_neon.o        \
                                          arm/vp9mc_neon.o
diff --git a/libavcodec/arm/vp9dsp_init.h b/libavcodec/arm/vp9dsp_init.h
new file mode 100644
index 0000000000..0dc1c2dc20
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VP9DSP_INIT_H
+#define AVCODEC_ARM_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+void ff_vp9dsp_init_10bpp_arm(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12bpp_arm(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_ARM_VP9DSP_INIT_H */
diff --git a/libavcodec/arm/vp9dsp_init_10bpp_arm.c b/libavcodec/arm/vp9dsp_init_10bpp_arm.c
new file mode 100644
index 0000000000..b8cb293b20
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_10bpp_arm.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_arm
+#include "vp9dsp_init_16bpp_arm_template.c"
diff --git a/libavcodec/arm/vp9dsp_init_12bpp_arm.c b/libavcodec/arm/vp9dsp_init_12bpp_arm.c
new file mode 100644
index 0000000000..fa65eb260b
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_12bpp_arm.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_arm
+#include "vp9dsp_init_16bpp_arm_template.c"
diff --git a/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c b/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c
new file mode 100644
index 0000000000..05efd2942f
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz, suffix) \
+void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define decl_mc_func(op, filter, dir, sz, bpp) \
+void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz, bpp) \
+static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \
+ /* We only need h + 7 lines, but the horizontal filter assumes an \
+ * even number of rows, so filter h + 8 lines here. */ \
+ ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \
+ src - 3 * src_stride, src_stride, \
+ h + 8, mx, 0); \
+ ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \
+ temp + 3 * 2 * sz, 2 * sz, \
+ h, 0, my); \
+}
+
+#define decl_filter_funcs(op, dir, sz, bpp) \
+ decl_mc_func(op, regular, dir, sz, bpp); \
+ decl_mc_func(op, sharp, dir, sz, bpp); \
+ decl_mc_func(op, smooth, dir, sz, bpp)
+
+#define decl_mc_funcs(sz, bpp) \
+ decl_filter_funcs(put, h, sz, bpp); \
+ decl_filter_funcs(avg, h, sz, bpp); \
+ decl_filter_funcs(put, v, sz, bpp); \
+ decl_filter_funcs(avg, v, sz, bpp); \
+ decl_filter_funcs(put, hv, sz, bpp); \
+ decl_filter_funcs(avg, hv, sz, bpp)
+
+declare_fpel(copy, 128, );
+declare_fpel(copy, 64, );
+declare_fpel(copy, 32, );
+declare_fpel(copy, 16, );
+declare_fpel(copy, 8, );
+declare_fpel(avg, 64, _16);
+declare_fpel(avg, 32, _16);
+declare_fpel(avg, 16, _16);
+declare_fpel(avg, 8, _16);
+declare_fpel(avg, 4, _16);
+
+decl_mc_funcs(64, BPP);
+decl_mc_funcs(32, BPP);
+decl_mc_funcs(16, BPP);
+decl_mc_funcs(8, BPP);
+decl_mc_funcs(4, BPP);
+
+#define define_8tap_2d_funcs(sz, bpp) \
+ define_8tap_2d_fn(put, regular, sz, bpp) \
+ define_8tap_2d_fn(put, sharp, sz, bpp) \
+ define_8tap_2d_fn(put, smooth, sz, bpp) \
+ define_8tap_2d_fn(avg, regular, sz, bpp) \
+ define_8tap_2d_fn(avg, sharp, sz, bpp) \
+ define_8tap_2d_fn(avg, smooth, sz, bpp)
+
+define_8tap_2d_funcs(64, BPP)
+define_8tap_2d_funcs(32, BPP)
+define_8tap_2d_funcs(16, BPP)
+define_8tap_2d_funcs(8, BPP)
+define_8tap_2d_funcs(4, BPP)
+
+
+static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_fpel(idx1, idx2, sz, type, suffix) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix##_neon
+
+#define init_copy_avg(idx, sz1, sz2) \
+ init_fpel(idx, 0, sz2, copy, ); \
+ init_fpel(idx, 1, sz1, avg, _16)
+
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
+ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \
+ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp)
+
+#define init_mc_funcs_dirs(idx, sz, bpp) \
+ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \
+ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \
+ init_mc_funcs(idx, hv, 1, 1, sz, , bpp)
+
+ init_copy_avg(0, 64, 128);
+ init_copy_avg(1, 32, 64);
+ init_copy_avg(2, 16, 32);
+ init_copy_avg(3, 8, 16);
+ init_copy_avg(4, 4, 8);
+
+ init_mc_funcs_dirs(0, 64, BPP);
+ init_mc_funcs_dirs(1, 32, BPP);
+ init_mc_funcs_dirs(2, 16, BPP);
+ init_mc_funcs_dirs(3, 8, BPP);
+ init_mc_funcs_dirs(4, 4, BPP);
+ }
+}
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp)
+{
+ vp9dsp_mc_init_arm(dsp);
+}
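Reading aid (editorial, not part of the patch): expanding define_8tap_2d_fn(put, regular, 8, BPP) for BPP == 10 shows how each 2D wrapper chains the two NEON passes. Only h + 7 source rows are strictly needed, but the horizontal kernel wants an even row count, so it filters h + 8 rows (starting 3 rows above src) into an aligned temporary with a 2*sz-byte row stride; the vertical kernel then reads from the temporary 3 rows in. Sketch, assuming the template's includes:

static void put_regular8_hv_10_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                    const uint8_t *src,
                                    ptrdiff_t src_stride,
                                    int h, int mx, int my)
{
    /* ((1 + (8 < 64)) * 8 + 8) * 8 * 2 = 384 bytes: 24 rows of 8 16-bit pixels */
    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (8 < 64)) * 8 + 8) * 8 * 2]);
    /* horizontal pass into the temporary, h + 8 rows */
    ff_vp9_put_regular8_h_10_neon(temp, 2 * 8,
                                  src - 3 * src_stride, src_stride,
                                  h + 8, mx, 0);
    /* vertical pass from the temporary, skipping the 3 leading context rows */
    ff_vp9_put_regular8_v_10_neon(dst, dst_stride,
                                  temp + 3 * 2 * 8, 2 * 8,
                                  h, 0, my);
}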
diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
index 0b76eb1638..f7b539e7e2 100644
--- a/libavcodec/arm/vp9dsp_init_arm.c
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -23,6 +23,7 @@
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_init.h"
#define declare_fpel(type, sz) \
void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
@@ -240,7 +241,13 @@ static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp)
{
-    if (bpp != 8)
+    if (bpp == 10) {
+        ff_vp9dsp_init_10bpp_arm(dsp);
+        return;
+    } else if (bpp == 12) {
+        ff_vp9dsp_init_12bpp_arm(dsp);
+        return;
+    } else if (bpp != 8)
        return;

    vp9dsp_mc_init_arm(dsp);
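For orientation (hypothetical usage sketch, not code from the patch; example_call, dst, src and the stride/height/subpel parameters are made up for illustration): with this dispatch in place, a 10 or 12 bpp ARM build reaches the new NEON kernels through the regular dsp->mc table. The index meanings follow the init_* macros in the template: mc[size_idx][filter][avg][mx != 0][my != 0], with size_idx 0 meaning 64 pixels wide.

#include <stddef.h>
#include <stdint.h>

#include "libavcodec/vp9dsp.h"   /* VP9DSPContext, vp9_mc_func, FILTER_8TAP_SHARP and
                                  * ff_vp9dsp_init_arm() are assumed to come from here */

static void example_call(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride,
                         int h, int mx, int my)
{
    VP9DSPContext dsp;

    ff_vp9dsp_init_arm(&dsp, 10);   /* now routes to ff_vp9dsp_init_10bpp_arm() */

    /* put, 64 wide, 8-tap sharp, subpel offset in both directions */
    vp9_mc_func mc = dsp.mc[0][FILTER_8TAP_SHARP][0][1][1];

    /* On a NEON-capable CPU this slot holds put_sharp64_hv_10_neon() from the
     * template, which chains ff_vp9_put_sharp64_h_10_neon() and
     * ff_vp9_put_sharp64_v_10_neon() from vp9mc_16bpp_neon.S below. */
    mc(dst, dst_stride, src, src_stride, h, mx, my);
}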
diff --git a/libavcodec/arm/vp9mc_16bpp_neon.S b/libavcodec/arm/vp9mc_16bpp_neon.S
new file mode 100644
index 0000000000..f6ec0375f2
--- /dev/null
+++ b/libavcodec/arm/vp9mc_16bpp_neon.S
@@ -0,0 +1,615 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ All public functions in this file have the following signature:
+@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+@ const uint8_t *ref, ptrdiff_t ref_stride,
+@ int h, int mx, int my);
+
+function ff_vp9_copy128_neon, export=1
+ ldr r12, [sp]
+ sub r1, r1, #96
+ sub r3, r3, #96
+1:
+ subs r12, r12, #1
+ vld1.16 {q0, q1}, [r2]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q2, q3}, [r2]!
+ vst1.16 {q2, q3}, [r0, :128]!
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2], r3
+ vst1.16 {q10, q11}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg64_16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ sub r1, r1, #96
+ sub r3, r3, #96
+ mov lr, r0
+1:
+ subs r12, r12, #1
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vrhadd.u16 q0, q0, q8
+ vld1.16 {q2, q3}, [r0, :128]!
+ vrhadd.u16 q1, q1, q9
+ vld1.16 {q12, q13}, [r2]!
+ vrhadd.u16 q2, q2, q10
+ vst1.16 {q0, q1}, [lr, :128]!
+ vrhadd.u16 q3, q3, q11
+ vld1.16 {q8, q9}, [r0, :128]!
+ vst1.16 {q2, q3}, [lr, :128]!
+ vrhadd.u16 q8, q8, q12
+ vld1.16 {q14, q15}, [r2], r3
+ vrhadd.u16 q9, q9, q13
+ vld1.16 {q10, q11}, [r0, :128], r1
+ vrhadd.u16 q10, q10, q14
+ vst1.16 {q8, q9}, [lr, :128]!
+ vrhadd.u16 q11, q11, q15
+ vst1.16 {q10, q11}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_avg32_16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ sub r1, r1, #32
+ sub r3, r3, #32
+ mov lr, r0
+1:
+ subs r12, r12, #1
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2], r3
+ vrhadd.u16 q0, q0, q8
+ vld1.16 {q2, q3}, [r0, :128], r1
+ vrhadd.u16 q1, q1, q9
+ vrhadd.u16 q2, q2, q10
+ vst1.16 {q0, q1}, [lr, :128]!
+ vrhadd.u16 q3, q3, q11
+ vst1.16 {q2, q3}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_avg16_16_neon, export=1
+ ldr r12, [sp]
+1:
+ subs r12, r12, #1
+ vld1.16 {q2, q3}, [r2], r3
+ vld1.16 {q0, q1}, [r0, :128]
+ vrhadd.u16 q0, q0, q2
+ vrhadd.u16 q1, q1, q3
+ vst1.16 {q0, q1}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg8_16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ mov lr, r0
+1:
+ subs r12, r12, #2
+ vld1.16 {q2}, [r2], r3
+ vld1.16 {q0}, [r0, :128], r1
+ vld1.16 {q3}, [r2], r3
+ vrhadd.u16 q0, q0, q2
+ vld1.16 {q1}, [r0, :128], r1
+ vrhadd.u16 q1, q1, q3
+ vst1.16 {q0}, [lr, :128], r1
+ vst1.16 {q1}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_avg4_16_neon, export=1
+ ldr r12, [sp]
+1:
+ subs r12, r12, #2
+ vld1.16 {d2}, [r2], r3
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d3}, [r2], r3
+ vrhadd.u16 d0, d0, d2
+ vld1.16 {d1}, [r0, :64]
+ sub r0, r0, r1
+ vrhadd.u16 d1, d1, d3
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r0, :64], r1
+ bne 1b
+ bx lr
+endfunc
+
+@ Helper macros for vmull/vmlal with a constant from either d0 or d1 depending on index
+.macro vmull_lane dst, src, idx
+.if \idx < 4
+ vmull.s16 \dst, \src, d0[\idx]
+.else
+ vmull.s16 \dst, \src, d1[\idx - 4]
+.endif
+.endm
+.macro vmlal_lane dst, src, idx
+.if \idx < 4
+ vmlal.s16 \dst, \src, d0[\idx]
+.else
+ vmlal.s16 \dst, \src, d1[\idx - 4]
+.endif
+.endm
+
+@ Extract a vector from src1-src2 and src3-src4, and multiply-accumulate
+@ into dst1 and dst3 (or dst1-dst2 and dst3-dst4 for size >= 8)
+.macro extmlal dst1, dst2, dst3, dst4, src1, src2, src3, src4, offset, size
+ vext.8 q14, \src1, \src2, #(2*\offset)
+ vext.8 q15, \src3, \src4, #(2*\offset)
+ vmlal_lane \dst1, d28, \offset
+ vmlal_lane \dst3, d30, \offset
+.if \size >= 8
+ vmlal_lane \dst2, d29, \offset
+ vmlal_lane \dst4, d31, \offset
+.endif
+.endm
+
+
+@ Instantiate a horizontal filter function for the given size.
+@ This can work on 4 or 8 pixels in parallel; for larger
+@ widths it will do 8 pixels at a time and loop horizontally.
+@ The actual width (in bytes) is passed in r5, the height in r4 and
+@ the filter coefficients in r12.
+.macro do_8tap_h type, size
+function \type\()_8tap_\size\()h
+ sub r2, r2, #6
+ add r6, r0, r1
+ add r7, r2, r3
+ add r1, r1, r1
+ add r3, r3, r3
+ @ Only size >= 8 loops horizontally and needs
+ @ reduced dst stride
+.if \size >= 8
+ sub r1, r1, r5
+.endif
+ @ size >= 8 loads two qwords and increments r2,
+ @ while size 4 only needs three dwords and no
+ @ postincrement
+.if \size >= 8
+ sub r3, r3, r5
+ sub r3, r3, #16
+.endif
+ @ Load the filter vector
+ vld1.16 {q0}, [r12,:128]
+1:
+.if \size >= 8
+ mov r12, r5
+.endif
+ @ Load src
+.if \size >= 8
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q10, q11}, [r7]!
+.else
+ vld1.16 {d16, d17, d18}, [r2]
+ vld1.16 {d20, d21, d22}, [r7]
+.endif
+2:
+
+ vmull.s16 q1, d16, d0[0]
+ vmull.s16 q12, d20, d0[0]
+.if \size >= 8
+ vmull.s16 q2, d17, d0[0]
+ vmull.s16 q13, d21, d0[0]
+.endif
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 1, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 2, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 3, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 4, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 5, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 6, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 7, \size
+
+ @ Round, shift and saturate.
+ @ The vqrshrun takes care of clamping negative values to zero, but
+ @ we still need a manual vmin with the max pixel value.
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d24, q12, #7
+.if \size >= 8
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d25, q13, #7
+ vmin.u16 q1, q1, q3
+ vmin.u16 q12, q12, q3
+.else
+ vmin.u16 d2, d2, d6
+ vmin.u16 d24, d24, d6
+.endif
+ @ Average
+.ifc \type,avg
+.if \size >= 8
+ vld1.16 {q14}, [r0,:128]
+ vld1.16 {q15}, [r6,:128]
+ vrhadd.u16 q1, q1, q14
+ vrhadd.u16 q12, q12, q15
+.else
+ vld1.16 {d28}, [r0,:64]
+ vld1.16 {d30}, [r6,:64]
+ vrhadd.u16 d2, d2, d28
+ vrhadd.u16 d24, d24, d30
+.endif
+.endif
+ @ Store and loop horizontally (for size >= 8)
+.if \size >= 8
+ subs r12, r12, #16
+ vst1.16 {q1}, [r0,:128]!
+ vst1.16 {q12}, [r6,:128]!
+ beq 3f
+ vmov q8, q9
+ vmov q10, q11
+ vld1.16 {q9}, [r2]!
+ vld1.16 {q11}, [r7]!
+ b 2b
+.else @ \size == 4
+ vst1.16 {d2}, [r0,:64]
+ vst1.16 {d24}, [r6,:64]
+.endif
+3:
+ @ Loop vertically
+ add r0, r0, r1
+ add r6, r6, r1
+ add r2, r2, r3
+ add r7, r7, r3
+ subs r4, r4, #2
+ bne 1b
+ pop {r4-r7}
+ bx lr
+endfunc
+.endm
+
+.macro do_8tap_h_size size
+do_8tap_h put, \size
+do_8tap_h avg, \size
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+
+.macro do_8tap_h_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
+ push {r4-r7}
+ ldr r4, [sp, #16]
+ ldr r5, [sp, #20]
+ vmvn.u16 q3, #((0xffff << \bpp) & 0xffff)
+ movrelx r12, X(ff_vp9_subpel_filters), r6
+ add r12, r12, 256*\offset
+ add r12, r12, r5, lsl #4
+ mov r5, #2*\size
+.if \size >= 8
+ b \type\()_8tap_8h
+.else
+ b \type\()_8tap_4h
+.endif
+endfunc
+.endm
+
+.macro do_8tap_h_filters size, bpp
+do_8tap_h_func put, regular, 1, \size, \bpp
+do_8tap_h_func avg, regular, 1, \size, \bpp
+do_8tap_h_func put, sharp, 2, \size, \bpp
+do_8tap_h_func avg, sharp, 2, \size, \bpp
+do_8tap_h_func put, smooth, 0, \size, \bpp
+do_8tap_h_func avg, smooth, 0, \size, \bpp
+.endm
+
+.macro do_8tap_h_filters_bpp bpp
+do_8tap_h_filters 64, \bpp
+do_8tap_h_filters 32, \bpp
+do_8tap_h_filters 16, \bpp
+do_8tap_h_filters 8, \bpp
+do_8tap_h_filters 4, \bpp
+.endm
+
+do_8tap_h_filters_bpp 10
+do_8tap_h_filters_bpp 12
+
+.ltorg
+
+@ Vertical filters
+
+@ Round, shift, saturate and store qreg1-4
+.macro do_store4 qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, minreg, type
+ vqrshrun.s32 \dreg1, \qreg1, #7
+ vqrshrun.s32 \dreg2, \qreg2, #7
+ vqrshrun.s32 \dreg3, \qreg3, #7
+ vqrshrun.s32 \dreg4, \qreg4, #7
+.ifc \type,avg
+ vld1.16 {\tmp1}, [r6,:64], r1
+ vld1.16 {\tmp2}, [r6,:64], r1
+ vld1.16 {\tmp3}, [r6,:64], r1
+ vld1.16 {\tmp4}, [r6,:64], r1
+.endif
+ vmin.u16 \dreg1, \dreg1, \minreg
+ vmin.u16 \dreg2, \dreg2, \minreg
+ vmin.u16 \dreg3, \dreg3, \minreg
+ vmin.u16 \dreg4, \dreg4, \minreg
+.ifc \type,avg
+ vrhadd.u16 \dreg1, \dreg1, \tmp1
+ vrhadd.u16 \dreg2, \dreg2, \tmp2
+ vrhadd.u16 \dreg3, \dreg3, \tmp3
+ vrhadd.u16 \dreg4, \dreg4, \tmp4
+.endif
+ vst1.16 {\dreg1}, [r0,:64], r1
+ vst1.16 {\dreg2}, [r0,:64], r1
+ vst1.16 {\dreg3}, [r0,:64], r1
+ vst1.16 {\dreg4}, [r0,:64], r1
+.endm
+
+@ Round, shift, saturate and store qreg1-4
+@ qreg1-2 belong to one line and qreg3-4 to the second line.
+@ dreg1-2 == qreg1, dreg3-4 == qreg2.
+.macro do_store8 qreg1, qreg2, qreg3, qreg4, dreg1, dreg2, dreg3, dreg4, minreg, type
+ vqrshrun.s32 \dreg1, \qreg1, #7
+ vqrshrun.s32 \dreg2, \qreg2, #7
+ vqrshrun.s32 \dreg3, \qreg3, #7
+ vqrshrun.s32 \dreg4, \qreg4, #7
+.ifc \type,avg
+ vld1.16 {\qreg3}, [r6,:128], r1
+ vld1.16 {\qreg4}, [r6,:128], r1
+.endif
+ vmin.u16 \qreg1, \qreg1, \minreg
+ vmin.u16 \qreg2, \qreg2, \minreg
+.ifc \type,avg
+ vrhadd.u16 \qreg1, \qreg1, \qreg3
+ vrhadd.u16 \qreg2, \qreg2, \qreg4
+.endif
+ vst1.16 {\qreg1}, [r0,:128], r1
+ vst1.16 {\qreg2}, [r0,:128], r1
+.endm
+
+@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+@ (src1-src8 into dst1, src2-src9 into dst2).
+.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
+ vmull.s16 \dst1, \src1, d0[0]
+ vmull.s16 \dst2, \src2, d0[0]
+ vmull.s16 \tmp1, \src2, d0[1]
+ vmull.s16 \tmp2, \src3, d0[1]
+ vmlal.s16 \dst1, \src3, d0[2]
+ vmlal.s16 \dst2, \src4, d0[2]
+ vmlal.s16 \tmp1, \src4, d0[3]
+ vmlal.s16 \tmp2, \src5, d0[3]
+ vmlal.s16 \dst1, \src5, d1[0]
+ vmlal.s16 \dst2, \src6, d1[0]
+ vmlal.s16 \tmp1, \src6, d1[1]
+ vmlal.s16 \tmp2, \src7, d1[1]
+ vmlal.s16 \dst1, \src7, d1[2]
+ vmlal.s16 \dst2, \src8, d1[2]
+ vmlal.s16 \tmp1, \src8, d1[3]
+ vmlal.s16 \tmp2, \src9, d1[3]
+ vadd.s32 \dst1, \dst1, \tmp1
+ vadd.s32 \dst2, \dst2, \tmp2
+.endm
+
+@ Evaluate the filter twice in parallel. This does the same as convolve4 above,
+@ but with double width (two input/output registers per row).
+.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15, src16, src17, src18
+ vmull.s16 \dst1, \src1, d0[0]
+ vmull.s16 \dst2, \src2, d0[0]
+ vmull.s16 \dst3, \src3, d0[0]
+ vmull.s16 \dst4, \src4, d0[0]
+ vmlal.s16 \dst1, \src3, d0[1]
+ vmlal.s16 \dst2, \src4, d0[1]
+ vmlal.s16 \dst3, \src5, d0[1]
+ vmlal.s16 \dst4, \src6, d0[1]
+ vmlal.s16 \dst1, \src5, d0[2]
+ vmlal.s16 \dst2, \src6, d0[2]
+ vmlal.s16 \dst3, \src7, d0[2]
+ vmlal.s16 \dst4, \src8, d0[2]
+ vmlal.s16 \dst1, \src7, d0[3]
+ vmlal.s16 \dst2, \src8, d0[3]
+ vmlal.s16 \dst3, \src9, d0[3]
+ vmlal.s16 \dst4, \src10, d0[3]
+ vmlal.s16 \dst1, \src9, d1[0]
+ vmlal.s16 \dst2, \src10, d1[0]
+ vmlal.s16 \dst3, \src11, d1[0]
+ vmlal.s16 \dst4, \src12, d1[0]
+ vmlal.s16 \dst1, \src11, d1[1]
+ vmlal.s16 \dst2, \src12, d1[1]
+ vmlal.s16 \dst3, \src13, d1[1]
+ vmlal.s16 \dst4, \src14, d1[1]
+ vmlal.s16 \dst1, \src13, d1[2]
+ vmlal.s16 \dst2, \src14, d1[2]
+ vmlal.s16 \dst3, \src15, d1[2]
+ vmlal.s16 \dst4, \src16, d1[2]
+ vmlal.s16 \dst1, \src15, d1[3]
+ vmlal.s16 \dst2, \src16, d1[3]
+ vmlal.s16 \dst3, \src17, d1[3]
+ vmlal.s16 \dst4, \src18, d1[3]
+.endm
+
+@ Instantiate a vertical filter function for filtering 8 pixels at a time.
+@ The height is passed in r4, the width in r5 and the filter coefficients
+@ in r12.
+.macro do_8tap_8v type
+function \type\()_8tap_8v
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+1:
+.ifc \type,avg
+ mov r6, r0
+.endif
+ mov r12, r4
+
+ vld1.16 {q5}, [r2], r3
+ vld1.16 {q6}, [r2], r3
+ vld1.16 {q7}, [r2], r3
+ vld1.16 {q8}, [r2], r3
+ vld1.16 {q9}, [r2], r3
+ vld1.16 {q10}, [r2], r3
+ vld1.16 {q11}, [r2], r3
+2:
+ vld1.16 {q12}, [r2], r3
+ vld1.16 {q13}, [r2], r3
+ vld1.16 {q14}, [r2], r3
+ vld1.16 {q15}, [r2], r3
+ convolve8 q2, q3, q4, q5, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27
+ do_store8 q2, q3, q4, q5, d4, d5, d6, d7, q1, \type
+ convolve8 q2, q3, q4, q5, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ do_store8 q2, q3, q4, q5, d4, d5, d6, d7, q1, \type
+
+ subs r12, r12, #4
+ beq 8f
+
+ vld1.16 {q4}, [r2], r3
+ vld1.16 {q5}, [r2], r3
+ vld1.16 {q6}, [r2], r3
+ vld1.16 {q7}, [r2], r3
+ convolve8 q2, q3, q8, q9, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11
+ do_store8 q2, q3, q8, q9, d4, d5, d6, d7, q1, \type
+ convolve8 q2, q3, q8, q9, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15
+ do_store8 q2, q3, q8, q9, d4, d5, d6, d7, q1, \type
+
+ subs r12, r12, #4
+ beq 8f
+
+ vld1.16 {q8}, [r2], r3
+ vld1.16 {q9}, [r2], r3
+ vld1.16 {q10}, [r2], r3
+ vld1.16 {q11}, [r2], r3
+ convolve8 q2, q3, q12, q13, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19
+ do_store8 q2, q3, q12, q13, d4, d5, d6, d7, q1, \type
+ convolve8 q2, q3, q12, q13, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23
+ do_store8 q2, q3, q12, q13, d4, d5, d6, d7, q1, \type
+
+ subs r12, r12, #4
+ bne 2b
+
+8:
+ subs r5, r5, #8
+ beq 9f
+ @ r0 -= h * dst_stride
+ mls r0, r1, r4, r0
+ @ r2 -= h * src_stride
+ mls r2, r3, r4, r2
+ @ r2 -= 8 * src_stride
+ sub r2, r2, r3, lsl #3
+ @ r2 += 1 * src_stride
+ add r2, r2, r3
+ add r2, r2, #16
+ add r0, r0, #16
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r6}
+ bx lr
+endfunc
+.endm
+
+do_8tap_8v put
+do_8tap_8v avg
+
+@ Instantiate a vertical filter function for filtering a 4 pixel wide
+@ slice. This is only designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type
+function \type\()_8tap_4v
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+.ifc \type,avg
+ mov r6, r0
+.endif
+
+ vld1.16 {d16}, [r2], r3
+ vld1.16 {d17}, [r2], r3
+ vld1.16 {d18}, [r2], r3
+ vld1.16 {d19}, [r2], r3
+ vld1.16 {d20}, [r2], r3
+ vld1.16 {d21}, [r2], r3
+ vld1.16 {d22}, [r2], r3
+ vld1.16 {d23}, [r2], r3
+ vld1.16 {d24}, [r2], r3
+ vld1.16 {d25}, [r2], r3
+ vld1.16 {d26}, [r2], r3
+ convolve4 q2, q3, d16, d17, d18, d19, d20, d21, d22, d23, d24, q14, q15
+ convolve4 q14, q15, d18, d19, d20, d21, d22, d23, d24, d25, d26, q8, q9
+ do_store4 q2, d4, q3, d6, q14, d28, q15, d30, d5, d7, d29, d31, d2, \type
+
+ subs r4, r4, #4
+ beq 9f
+
+ vld1.16 {d27}, [r2], r3
+ vld1.16 {d28}, [r2], r3
+ vld1.16 {d29}, [r2], r3
+ vld1.16 {d30}, [r2], r3
+ convolve4 q2, q3, d20, d21, d22, d23, d24, d25, d26, d27, d28, q8, q9
+ convolve4 q8, q9, d22, d23, d24, d25, d26, d27, d28, d29, d30, q10, q11
+ do_store4 q2, d4, q3, d6, q8, d16, q9, d18, d5, d7, d17, d19, d2, \type
+
+9:
+ pop {r4-r6}
+ bx lr
+endfunc
+.endm
+
+do_8tap_4v put
+do_8tap_4v avg
+
+.macro do_8tap_v_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
+ push {r4-r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #20]
+.if \size >= 8
+ vpush {q4-q7}
+.endif
+ vmvn.u16 q1, #((0xffff << \bpp) & 0xffff)
+ movrelx r12, X(ff_vp9_subpel_filters), r6
+ add r12, r12, 256*\offset
+ add r12, r12, r5, lsl #4
+ mov r5, #\size
+.if \size >= 8
+ b \type\()_8tap_8v
+.else
+ b \type\()_8tap_4v
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size, bpp
+do_8tap_v_func put, regular, 1, \size, \bpp
+do_8tap_v_func avg, regular, 1, \size, \bpp
+do_8tap_v_func put, sharp, 2, \size, \bpp
+do_8tap_v_func avg, sharp, 2, \size, \bpp
+do_8tap_v_func put, smooth, 0, \size, \bpp
+do_8tap_v_func avg, smooth, 0, \size, \bpp
+.endm
+
+.macro do_8tap_v_filters_bpp bpp
+do_8tap_v_filters 64, \bpp
+do_8tap_v_filters 32, \bpp
+do_8tap_v_filters 16, \bpp
+do_8tap_v_filters 8, \bpp
+do_8tap_v_filters 4, \bpp
+.endm
+
+do_8tap_v_filters_bpp 10
+do_8tap_v_filters_bpp 12
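One arithmetic detail worth spelling out (editorial sketch, not part of the patch): both the horizontal and vertical entry points materialize the clamping constant with vmvn.u16 qN, #((0xffff << bpp) & 0xffff). Inverting that mask gives (1 << bpp) - 1, the maximum pixel value that the vmin following each vqrshrun clamps against (vqrshrun already clamps negative results to zero). A tiny C check of the identity:

#include <assert.h>
#include <stdint.h>

/* Mirrors the immediate used by the NEON code: NOT((0xffff << bpp) & 0xffff)
 * equals the largest representable pixel value for the given bit depth. */
static uint16_t pixel_max(int bpp)
{
    return (uint16_t)~((0xffff << bpp) & 0xffff);
}

int main(void)
{
    assert(pixel_max(10) == 0x03ff);   /* 1023 */
    assert(pixel_max(12) == 0x0fff);   /* 4095 */
    return 0;
}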