6 files changed, 64 insertions, 43 deletions
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index d49a1765f6..2ea12895de 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -24,16 +24,22 @@
 #include "libavutil/attributes.h"
 #include "libavcodec/dcadsp.h"
 
-void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
-                        int decifactor, float scale);
+void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs,
+                          float scale);
+void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs,
+                          float scale);
+
+void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs,
+                          float scale);
+void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs,
+                          float scale);
+
 void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
                                 SynthFilterContext *synth, FFTContext *imdct,
                                 float synth_buf_ptr[512],
                                 int *synth_buf_offset, float synth_buf2[32],
                                 const float window[512], float *samples_out,
                                 float raXin[32], float scale);
-void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
-                         int decifactor, float scale);
 
 void ff_synth_filter_float_vfp(FFTContext *imdct,
                                float *synth_buf_ptr, int *synth_buf_offset,
@@ -52,11 +58,14 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
     int cpu_flags = av_get_cpu_flags();
 
     if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
-        s->lfe_fir = ff_dca_lfe_fir_vfp;
+        s->lfe_fir[0]      = ff_dca_lfe_fir32_vfp;
+        s->lfe_fir[1]      = ff_dca_lfe_fir64_vfp;
         s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
     }
-    if (have_neon(cpu_flags))
-        s->lfe_fir = ff_dca_lfe_fir_neon;
+    if (have_neon(cpu_flags)) {
+        s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
+        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
+    }
 }
 
 av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
index fe3aae801a..c798fea7f7 100644
--- a/libavcodec/arm/dcadsp_neon.S
+++ b/libavcodec/arm/dcadsp_neon.S
@@ -20,17 +20,23 @@
 
 #include "libavutil/arm/asm.S"
 
-function ff_dca_lfe_fir_neon, export=1
+function ff_dca_lfe_fir0_neon, export=1
         push            {r4-r6,lr}
+NOVFP   vmov            s0,  r3                 @ scale
+        mov             r3,  #32                @ decifactor
+        mov             r6,  #256/32
+        b               dca_lfe_fir
+endfunc
 
+function ff_dca_lfe_fir1_neon, export=1
+        push            {r4-r6,lr}
+NOVFP   vmov            s0,  r3                 @ scale
+        mov             r3,  #64                @ decifactor
+        mov             r6,  #256/64
+dca_lfe_fir:
         add             r4,  r0,  r3,  lsl #2   @ out2
         add             r5,  r2,  #256*4-16     @ cf1
         sub             r1,  r1,  #12
-        cmp             r3,  #32
-        ite             eq
-        moveq           r6,  #256/32
-        movne           r6,  #256/64
-NOVFP   vldr            s0,  [sp, #16]          @ scale
         mov             lr,  #-16
 1:
         vmov.f32        q2,  #0.0               @ v0
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
index 5892a84342..edabc29e23 100644
--- a/libavcodec/arm/dcadsp_vfp.S
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -24,7 +24,6 @@
 POUT          .req    a1
 PIN           .req    a2
 PCOEF         .req    a3
-DECIFACTOR    .req    a4
 OLDFPSCR      .req    a4
 COUNTER       .req    ip
 
@@ -129,6 +128,15 @@ POST3         .req    s27
 .endm
 
 .macro dca_lfe_fir  decifactor
+function ff_dca_lfe_fir\decifactor\()_vfp, export=1
+NOVFP   vmov    s0, r3
+        fmrx    OLDFPSCR, FPSCR
+        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
+        fmxr    FPSCR, ip
+        vldr    IN0, [PIN, #-0*4]
+        vldr    IN1, [PIN, #-1*4]
+        vldr    IN2, [PIN, #-2*4]
+        vldr    IN3, [PIN, #-3*4]
  .if \decifactor == 32
   .set JMAX, 8
         vpush   {s16-s31}
@@ -165,32 +173,16 @@ POST3         .req    s27
  .endif
         fmxr    FPSCR, OLDFPSCR
         bx      lr
+endfunc
 .endm
 
-
-/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
- *                         int decifactor, float scale)
- */
-function ff_dca_lfe_fir_vfp, export=1
-        teq     DECIFACTOR, #32
-        fmrx    OLDFPSCR, FPSCR
-        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
-        fmxr    FPSCR, ip
-NOVFP   vldr    s0, [sp]
-        vldr    IN0, [PIN, #-0*4]
-        vldr    IN1, [PIN, #-1*4]
-        vldr    IN2, [PIN, #-2*4]
-        vldr    IN3, [PIN, #-3*4]
-        beq     32f
-64:     dca_lfe_fir  64
+        dca_lfe_fir  64
  .ltorg
-32:     dca_lfe_fir  32
-endfunc
+        dca_lfe_fir  32
 
         .unreq  POUT
         .unreq  PIN
         .unreq  PCOEF
-        .unreq  DECIFACTOR
         .unreq  OLDFPSCR
         .unreq  COUNTER
 
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index 6ffb040aaa..723ed191dc 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -957,23 +957,23 @@ static void lfe_interpolation_fir(DCAContext *s, int decimation_select,
      * samples_out: An array holding interpolated samples
      */
 
-    int decifactor;
+    int idx;
     const float *prCoeff;
     int deciindex;
 
     /* Select decimation filter */
     if (decimation_select == 1) {
-        decifactor = 64;
+        idx = 1;
         prCoeff = lfe_fir_128;
     } else {
-        decifactor = 32;
+        idx = 0;
         prCoeff = lfe_fir_64;
     }
     /* Interpolation */
     for (deciindex = 0; deciindex < num_deci_sample; deciindex++) {
-        s->dcadsp.lfe_fir(samples_out, samples_in, prCoeff, decifactor, scale);
+        s->dcadsp.lfe_fir[idx](samples_out, samples_in, prCoeff, scale);
         samples_in++;
-        samples_out += 2 * decifactor;
+        samples_out += 2 * 32 * (1 + idx);
     }
 }
 
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index 148f6dd607..8d242c5959 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -32,8 +32,9 @@ static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale)
         dst[i] = src[i] * fscale;
 }
 
-static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
-                          int decifactor, float scale)
+static inline void
+dca_lfe_fir(float *out, const float *in, const float *coefs,
+            int decifactor, float scale)
 {
     float *out2 = out + decifactor;
     const float *cf0 = coefs;
@@ -82,9 +83,22 @@ static void dca_qmf_32_subbands(float samples_in[32][8], int sb_act,
     }
 }
 
+static void dca_lfe_fir0_c(float *out, const float *in, const float *coefs,
+                           float scale)
+{
+    dca_lfe_fir(out, in, coefs, 32, scale);
+}
+
+static void dca_lfe_fir1_c(float *out, const float *in, const float *coefs,
+                           float scale)
+{
+    dca_lfe_fir(out, in, coefs, 64, scale);
+}
+
 av_cold void ff_dcadsp_init(DCADSPContext *s)
 {
-    s->lfe_fir = dca_lfe_fir_c;
+    s->lfe_fir[0] = dca_lfe_fir0_c;
+    s->lfe_fir[1] = dca_lfe_fir1_c;
     s->qmf_32_subbands = dca_qmf_32_subbands;
     s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
     if (ARCH_ARM) ff_dcadsp_init_arm(s);
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index e2ad09adf6..3e04426a80 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -23,8 +23,8 @@
 #include "synth_filter.h"
 
 typedef struct DCADSPContext {
-    void (*lfe_fir)(float *out, const float *in, const float *coefs,
-                    int decifactor, float scale);
+    void (*lfe_fir[2])(float *out, const float *in, const float *coefs,
+                       float scale);
     void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
                             SynthFilterContext *synth, FFTContext *imdct,
                             float synth_buf_ptr[512],