diff options
| field | value | date |
|---|---|---|
| author | Ben Avison | 2014-07-11 00:12:34 +0100 |
| committer | Martin Storsjö | 2014-07-18 01:34:38 +0300 |
| commit | 5a272190a04666f0fe41be767396b30712638c21 (patch) | |
| tree | 1bff50bd3d7926346d90e319e80bb526b128ee89 /libavutil/arm | |
| parent | 5edad2c4a1f46bcc56be755af86ab355c2f1b37f (diff) | |
armv6: Accelerate butterflies_float
I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in butterflies_float_c() / ff_butterflies_float_vfp() for the
same sample AAC stream:
                      Before            After
                      Mean    StdDev    Mean    StdDev   Confidence  Change
Audio decode          1542.8  43.7      1470.5  41.5     100.0%      +4.9%
butterflies_float     130.0   11.9      70.2    12.1     100.0%      +85.2%
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavutil/arm')
-rw-r--r-- | libavutil/arm/float_dsp_init_vfp.c | 4 | ||||
-rw-r--r-- | libavutil/arm/float_dsp_vfp.S | 116 |
2 files changed, 120 insertions, 0 deletions
diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c index f44020e0fd..61ff2ed38e 100644 --- a/libavutil/arm/float_dsp_init_vfp.c +++ b/libavutil/arm/float_dsp_init_vfp.c @@ -32,6 +32,8 @@ void ff_vector_fmul_window_vfp(float *dst, const float *src0, void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len); +void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len); + av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags) { if (!have_vfpv3(cpu_flags)) { @@ -39,4 +41,6 @@ av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags) fdsp->vector_fmul_window = ff_vector_fmul_window_vfp; } fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; + if (!have_vfpv3(cpu_flags)) + fdsp->butterflies_float = ff_butterflies_float_vfp; } diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S index c25588f978..9f920aae70 100644 --- a/libavutil/arm/float_dsp_vfp.S +++ b/libavutil/arm/float_dsp_vfp.S @@ -339,3 +339,119 @@ function ff_vector_fmul_reverse_vfp, export=1 vpop {d8-d15} bx lr endfunc + +/** + * ARM VFP implementation of 'butterflies_float_c' function + * Assume that len is a positive non-zero number + */ +@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len) +function ff_butterflies_float_vfp, export=1 +BASE1 .req a1 +BASE2 .req a2 +LEN .req a3 +OLDFPSCR .req a4 + + vpush {s16-s31} + fmrx OLDFPSCR, FPSCR + + tst LEN, #7 + beq 4f @ common case: len is a multiple of 8 + + ldr ip, =0x03000000 @ RunFast mode, scalar mode + fmxr FPSCR, ip + + tst LEN, #1 + beq 1f + vldmia BASE1!, {s0} + vldmia BASE2!, {s8} + vadd.f s16, s0, s8 + vsub.f s24, s0, s8 + vstr s16, [BASE1, #0-4*1] + vstr s24, [BASE2, #0-4*1] +1: + tst LEN, #2 + beq 2f + vldmia BASE1!, {s0-s1} + vldmia BASE2!, {s8-s9} + vadd.f s16, s0, s8 + vadd.f s17, s1, s9 + vsub.f s24, s0, s8 + vsub.f s25, s1, s9 + vstr d8, [BASE1, #0-8*1] @ s16,s17 + vstr d12, 
[BASE2, #0-8*1] @ s24,s25 +2: + tst LEN, #4 + beq 3f + vldmia BASE1!, {s0-s1} + vldmia BASE2!, {s8-s9} + vldmia BASE1!, {s2-s3} + vldmia BASE2!, {s10-s11} + vadd.f s16, s0, s8 + vadd.f s17, s1, s9 + vsub.f s24, s0, s8 + vsub.f s25, s1, s9 + vadd.f s18, s2, s10 + vadd.f s19, s3, s11 + vsub.f s26, s2, s10 + vsub.f s27, s3, s11 + vstr d8, [BASE1, #0-16*1] @ s16,s17 + vstr d12, [BASE2, #0-16*1] @ s24,s25 + vstr d9, [BASE1, #8-16*1] @ s18,s19 + vstr d13, [BASE2, #8-16*1] @ s26,s27 +3: + bics LEN, LEN, #7 + beq 7f +4: + ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 + fmxr FPSCR, ip + + vldmia BASE1!, {s0-s1} + vldmia BASE2!, {s8-s9} + vldmia BASE1!, {s2-s3} + vldmia BASE2!, {s10-s11} + vadd.f s16, s0, s8 + vldmia BASE1!, {s4-s5} + vldmia BASE2!, {s12-s13} + vldmia BASE1!, {s6-s7} + vldmia BASE2!, {s14-s15} + vsub.f s24, s0, s8 + vadd.f s20, s4, s12 + subs LEN, LEN, #8 + beq 6f +5: vldmia BASE1!, {s0-s3} + vldmia BASE2!, {s8-s11} + vsub.f s28, s4, s12 + vstr d8, [BASE1, #0-16*3] @ s16,s17 + vstr d9, [BASE1, #8-16*3] @ s18,s19 + vstr d12, [BASE2, #0-16*3] @ s24,s25 + vstr d13, [BASE2, #8-16*3] @ s26,s27 + vadd.f s16, s0, s8 + vldmia BASE1!, {s4-s7} + vldmia BASE2!, {s12-s15} + vsub.f s24, s0, s8 + vstr d10, [BASE1, #0-16*3] @ s20,s21 + vstr d11, [BASE1, #8-16*3] @ s22,s23 + vstr d14, [BASE2, #0-16*3] @ s28,s29 + vstr d15, [BASE2, #8-16*3] @ s30,s31 + vadd.f s20, s4, s12 + subs LEN, LEN, #8 + bne 5b +6: vsub.f s28, s4, s12 + vstr d8, [BASE1, #0-16*2] @ s16,s17 + vstr d9, [BASE1, #8-16*2] @ s18,s19 + vstr d12, [BASE2, #0-16*2] @ s24,s25 + vstr d13, [BASE2, #8-16*2] @ s26,s27 + vstr d10, [BASE1, #0-16*1] @ s20,s21 + vstr d11, [BASE1, #8-16*1] @ s22,s23 + vstr d14, [BASE2, #0-16*1] @ s28,s29 + vstr d15, [BASE2, #8-16*1] @ s30,s31 +7: + fmxr FPSCR, OLDFPSCR + vpop {s16-s31} + bx lr + + .unreq BASE1 + .unreq BASE2 + .unreq LEN + .unreq OLDFPSCR +endfunc |