dcadsp: split lfe_fir cases

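The x86 implementation runs short on registers because several values, such as the decimation factor, are runtime parameters rather than constants. In addition, splitting lfe_fir into one function per decimation factor (32 and 64) allows more optimized code, at least on x86. ARM asm changes by Janne Grunau. Signed-off-by: Janne Grunau <janne-libav@jannau.net>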
author: Christophe Gisquet 2014-02-05 23:40:52 +0000
committer: Janne Grunau 2014-02-07 22:54:18 +0100
commit: 5fdbfcb5b793f5849c496214668094a8ec99fa07 (patch)
tree: 8f705b537443ec12285e367aa0747fd1fec1671b /libavcodec/arm
parent: 5b59a9fc6152169599561f04b4f66370edda5c9c (diff)
3 files changed, 40 insertions, 33 deletions
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index d49a1765f6..2ea12895de 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -24,16 +24,22 @@
 #include "libavutil/attributes.h"
 #include "libavcodec/dcadsp.h"
 
-void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
-                        int decifactor, float scale);
+void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs,
+                          float scale);
+void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs,
+                          float scale);
+
+void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs,
+                          float scale);
+void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs,
+                          float scale);
+
 void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
                                 SynthFilterContext *synth, FFTContext *imdct,
                                 float synth_buf_ptr[512],
                                 int *synth_buf_offset, float synth_buf2[32],
                                 const float window[512], float *samples_out,
                                 float raXin[32], float scale);
-void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
-                         int decifactor, float scale);
 
 void ff_synth_filter_float_vfp(FFTContext *imdct,
                                float *synth_buf_ptr, int *synth_buf_offset,
@@ -52,11 +58,14 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
     int cpu_flags = av_get_cpu_flags();
 
     if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
-        s->lfe_fir = ff_dca_lfe_fir_vfp;
+        s->lfe_fir[0]      = ff_dca_lfe_fir32_vfp;
+        s->lfe_fir[1]      = ff_dca_lfe_fir64_vfp;
         s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
     }
-    if (have_neon(cpu_flags))
-        s->lfe_fir = ff_dca_lfe_fir_neon;
+    if (have_neon(cpu_flags)) {
+        s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
+        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
+    }
 }
 
 av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
index fe3aae801a..c798fea7f7 100644
--- a/libavcodec/arm/dcadsp_neon.S
+++ b/libavcodec/arm/dcadsp_neon.S
@@ -20,17 +20,23 @@
 
 #include "libavutil/arm/asm.S"
 
-function ff_dca_lfe_fir_neon, export=1
+function ff_dca_lfe_fir0_neon, export=1
         push            {r4-r6,lr}
+NOVFP   vmov            s0,  r3                 @ scale
+        mov             r3,  #32                @ decifactor
+        mov             r6,  #256/32
+        b               dca_lfe_fir
+endfunc
 
+function ff_dca_lfe_fir1_neon, export=1
+        push            {r4-r6,lr}
+NOVFP   vmov            s0,  r3                 @ scale
+        mov             r3,  #64                @ decifactor
+        mov             r6,  #256/64
+dca_lfe_fir:
         add             r4,  r0,  r3,  lsl #2   @ out2
         add             r5,  r2,  #256*4-16     @ cf1
         sub             r1,  r1,  #12
-        cmp             r3,  #32
-        ite             eq
-        moveq           r6,  #256/32
-        movne           r6,  #256/64
-NOVFP   vldr            s0,  [sp, #16]          @ scale
         mov             lr,  #-16
 1:
         vmov.f32        q2,  #0.0               @ v0
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
index 5892a84342..edabc29e23 100644
--- a/libavcodec/arm/dcadsp_vfp.S
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -24,7 +24,6 @@
 POUT          .req    a1
 PIN           .req    a2
 PCOEF         .req    a3
-DECIFACTOR    .req    a4
 OLDFPSCR      .req    a4
 COUNTER       .req    ip
 
@@ -129,6 +128,15 @@ POST3         .req    s27
 .endm
 
 .macro dca_lfe_fir  decifactor
+function ff_dca_lfe_fir\decifactor\()_vfp, export=1
+NOVFP   vmov    s0, r3
+        fmrx    OLDFPSCR, FPSCR
+        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
+        fmxr    FPSCR, ip
+        vldr    IN0, [PIN, #-0*4]
+        vldr    IN1, [PIN, #-1*4]
+        vldr    IN2, [PIN, #-2*4]
+        vldr    IN3, [PIN, #-3*4]
  .if \decifactor == 32
   .set JMAX, 8
         vpush   {s16-s31}
@@ -165,32 +173,16 @@ POST3         .req    s27
  .endif
         fmxr    FPSCR, OLDFPSCR
         bx      lr
+endfunc
 .endm
 
-
-/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
- *                         int decifactor, float scale)
- */
-function ff_dca_lfe_fir_vfp, export=1
-        teq     DECIFACTOR, #32
-        fmrx    OLDFPSCR, FPSCR
-        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
-        fmxr    FPSCR, ip
-NOVFP   vldr    s0, [sp]
-        vldr    IN0, [PIN, #-0*4]
-        vldr    IN1, [PIN, #-1*4]
-        vldr    IN2, [PIN, #-2*4]
-        vldr    IN3, [PIN, #-3*4]
-        beq     32f
-64:     dca_lfe_fir  64
+        dca_lfe_fir  64
  .ltorg
-32:     dca_lfe_fir  32
-endfunc
+        dca_lfe_fir  32
 
         .unreq  POUT
         .unreq  PIN
         .unreq  PCOEF
-        .unreq  DECIFACTOR
         .unreq  OLDFPSCR
         .unreq  COUNTER
author	Christophe Gisquet	2014-02-05 23:40:52 +0000
committer	Janne Grunau	2014-02-07 22:54:18 +0100
commit	5fdbfcb5b793f5849c496214668094a8ec99fa07 (patch)
tree	8f705b537443ec12285e367aa0747fd1fec1671b /libavcodec/arm
parent	5b59a9fc6152169599561f04b4f66370edda5c9c (diff)