author | Daniel Kang | 2013-01-16 02:41:39 -0500
---|---|---
committer | Luca Barbato | 2013-01-21 09:54:10 +0100
commit | 9f00b1cbababa08dd220dbc0c74286a4707be746 (patch)
tree | db8d5ab341c405cbec8162af49d2f9243ac95734 /libavcodec
parent | c7df1532e5d690cb445ae443c998bd564c906a30 (diff)
dsputilenc: x86: Convert pixel inline asm to yasm
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
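The conversion replaces GCC-specific `__asm__ volatile` blocks with standalone yasm functions built on the x86inc.asm macros (`cglobal`, `INIT_MMX`/`INIT_XMM`, `mova`), so the kernels also build with compilers that have no usable inline assembler (e.g. MSVC on x86_64) and get calling-convention and register-name abstraction for free. For orientation, a minimal scalar sketch of what `get_pixels` computes; the name `get_pixels_ref` is hypothetical and `int16_t` stands in for `DCTELEM`:

```c
#include <stdint.h>

/* Hypothetical scalar reference (not from the tree): copy an 8x8 block
 * of unsigned pixels into signed 16-bit DCT coefficients, stepping one
 * frame row (line_size bytes) at a time. */
static void get_pixels_ref(int16_t *block, const uint8_t *pixels,
                           int line_size)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++)
            block[8 * i + j] = pixels[j];   /* zero-extend 8 -> 16 bit */
        pixels += line_size;
    }
}
```

The MMX and SSE2 versions below perform the same widening with `punpcklbw`/`punpckhbw` against a zeroed register: interleaving each pixel byte with 0x00 yields exactly the unsigned 16-bit lanes this loop produces.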
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/x86/dsputilenc.asm | 152
-rw-r--r-- | libavcodec/x86/dsputilenc_mmx.c | 201
2 files changed, 172 insertions, 181 deletions
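Two of the converted kernels are whole-block reductions over a 16x16 block. Hedged scalar equivalents of their return values (the `_ref` names are illustrative, not from the tree):

```c
#include <stdint.h>

/* Sum of all 256 pixels. The maximum is 256 * 255 = 65280, which is why
 * the MMX version below can accumulate in packed 16-bit words and mask
 * the final scalar result with 0xffff. */
static int pix_sum16_ref(const uint8_t *pix, int line_size)
{
    int sum = 0;
    for (int i = 0; i < 16; i++, pix += line_size)
        for (int j = 0; j < 16; j++)
            sum += pix[j];
    return sum;
}

/* Sum of squared pixels. Squares can reach 255 * 255, so 32-bit
 * accumulation is required; the MMX version gets it from pmaddwd,
 * which dot-products 16-bit word pairs into 32-bit dwords. */
static int pix_norm1_ref(const uint8_t *pix, int line_size)
{
    int sum = 0;
    for (int i = 0; i < 16; i++, pix += line_size)
        for (int j = 0; j < 16; j++)
            sum += pix[j] * pix[j];
    return sum;
}
```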
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index a2cb7f9202..7b8763cf59 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -333,3 +333,155 @@ cglobal sse16, 5, 5, 8
     paddd   m7, m1
     movd   eax, m7       ; return value
     RET
+
+INIT_MMX mmx
+; get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
+cglobal get_pixels, 3,4
+    movsxdifnidn r2, r2d
+    add          r0, 128
+    mov          r3, -128
+    pxor         m7, m7
+.loop:
+    mova         m0, [r1]
+    mova         m2, [r1+r2]
+    mova         m1, m0
+    mova         m3, m2
+    punpcklbw    m0, m7
+    punpckhbw    m1, m7
+    punpcklbw    m2, m7
+    punpckhbw    m3, m7
+    mova [r0+r3+ 0], m0
+    mova [r0+r3+ 8], m1
+    mova [r0+r3+16], m2
+    mova [r0+r3+24], m3
+    lea          r1, [r1+r2*2]
+    add          r3, 32
+    js .loop
+    REP_RET
+
+INIT_XMM sse2
+cglobal get_pixels, 3, 4
+    movsxdifnidn r2, r2d
+    lea          r3, [r2*3]
+    pxor         m4, m4
+    movh         m0, [r1]
+    movh         m1, [r1+r2]
+    movh         m2, [r1+r2*2]
+    movh         m3, [r1+r3]
+    lea          r1, [r1+r2*4]
+    punpcklbw    m0, m4
+    punpcklbw    m1, m4
+    punpcklbw    m2, m4
+    punpcklbw    m3, m4
+    mova       [r0], m0
+    mova  [r0+0x10], m1
+    mova  [r0+0x20], m2
+    mova  [r0+0x30], m3
+    movh         m0, [r1]
+    movh         m1, [r1+r2*1]
+    movh         m2, [r1+r2*2]
+    movh         m3, [r1+r3]
+    punpcklbw    m0, m4
+    punpcklbw    m1, m4
+    punpcklbw    m2, m4
+    punpcklbw    m3, m4
+    mova  [r0+0x40], m0
+    mova  [r0+0x50], m1
+    mova  [r0+0x60], m2
+    mova  [r0+0x70], m3
+    RET
+
+INIT_MMX mmx
+; diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
+cglobal diff_pixels, 4,5
+    movsxdifnidn r3, r3d
+    pxor         m7, m7
+    add          r0, 128
+    mov          r4, -128
+.loop:
+    mova         m0, [r1]
+    mova         m2, [r2]
+    mova         m1, m0
+    mova         m3, m2
+    punpcklbw    m0, m7
+    punpckhbw    m1, m7
+    punpcklbw    m2, m7
+    punpckhbw    m3, m7
+    psubw        m0, m2
+    psubw        m1, m3
+    mova  [r0+r4+0], m0
+    mova  [r0+r4+8], m1
+    add          r1, r3
+    add          r2, r3
+    add          r4, 16
+    jne .loop
+    REP_RET
+
+INIT_MMX mmx
+; pix_sum16_mmx(uint8_t * pix, int line_size)
+cglobal pix_sum16, 2, 3
+    movsxdifnidn r1, r1d
+    mov          r2, r1
+    neg          r2
+    shl          r2, 4
+    sub          r0, r2
+    pxor         m7, m7
+    pxor         m6, m6
+.loop:
+    mova         m0, [r0+r2+0]
+    mova         m1, [r0+r2+0]
+    mova         m2, [r0+r2+8]
+    mova         m3, [r0+r2+8]
+    punpcklbw    m0, m7
+    punpckhbw    m1, m7
+    punpcklbw    m2, m7
+    punpckhbw    m3, m7
+    paddw        m1, m0
+    paddw        m3, m2
+    paddw        m3, m1
+    paddw        m6, m3
+    add          r2, r1
+    js .loop
+    mova         m5, m6
+    psrlq        m6, 32
+    paddw        m6, m5
+    mova         m5, m6
+    psrlq        m6, 16
+    paddw        m6, m5
+    movd        eax, m6
+    and         eax, 0xffff
+    RET
+
+INIT_MMX mmx
+; pix_norm1_mmx(uint8_t *pix, int line_size)
+cglobal pix_norm1, 2, 4
+    movsxdifnidn r1, r1d
+    mov          r2, 16
+    pxor         m0, m0
+    pxor         m7, m7
+.loop:
+    mova         m2, [r0+0]
+    mova         m3, [r0+8]
+    mova         m1, m2
+    punpckhbw    m1, m0
+    punpcklbw    m2, m0
+    mova         m4, m3
+    punpckhbw    m3, m0
+    punpcklbw    m4, m0
+    pmaddwd      m1, m1
+    pmaddwd      m2, m2
+    pmaddwd      m3, m3
+    pmaddwd      m4, m4
+    paddd        m2, m1
+    paddd        m4, m3
+    paddd        m7, m2
+    add          r0, r1
+    paddd        m7, m4
+    dec r2
+    jne .loop
+    mova         m1, m7
+    psrlq        m7, 32
+    paddd        m1, m7
+    movd        eax, m1
+    RET
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index e5d2473e3b..fa126d68fd 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -30,181 +30,14 @@
 #include "libavcodec/mathops.h"
 #include "dsputil_mmx.h"
 
+void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size);
+void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size);
+void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
+                        int stride);
+int ff_pix_sum16_mmx(uint8_t * pix, int line_size);
+int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+
 #if HAVE_INLINE_ASM
 
-static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
-{
-    __asm__ volatile(
-        "mov $-128, %%"REG_a"           \n\t"
-        "pxor %%mm7, %%mm7              \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%0), %%mm0               \n\t"
-        "movq (%0, %2), %%mm2           \n\t"
-        "movq %%mm0, %%mm1              \n\t"
-        "movq %%mm2, %%mm3              \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "movq %%mm0, (%1, %%"REG_a")    \n\t"
-        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
-        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
-        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
-        "add %3, %0                     \n\t"
-        "add $32, %%"REG_a"             \n\t"
-        "js 1b                          \n\t"
-        : "+r" (pixels)
-        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
-        : "%"REG_a
-    );
-}
-
-static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
-{
-    __asm__ volatile(
-        "pxor %%xmm4, %%xmm4            \n\t"
-        "movq (%0), %%xmm0              \n\t"
-        "movq (%0, %2), %%xmm1          \n\t"
-        "movq (%0, %2,2), %%xmm2        \n\t"
-        "movq (%0, %3), %%xmm3          \n\t"
-        "lea (%0,%2,4), %0              \n\t"
-        "punpcklbw %%xmm4, %%xmm0       \n\t"
-        "punpcklbw %%xmm4, %%xmm1       \n\t"
-        "punpcklbw %%xmm4, %%xmm2       \n\t"
-        "punpcklbw %%xmm4, %%xmm3       \n\t"
-        "movdqa %%xmm0, (%1)            \n\t"
-        "movdqa %%xmm1, 16(%1)          \n\t"
-        "movdqa %%xmm2, 32(%1)          \n\t"
-        "movdqa %%xmm3, 48(%1)          \n\t"
-        "movq (%0), %%xmm0              \n\t"
-        "movq (%0, %2), %%xmm1          \n\t"
-        "movq (%0, %2,2), %%xmm2        \n\t"
-        "movq (%0, %3), %%xmm3          \n\t"
-        "punpcklbw %%xmm4, %%xmm0       \n\t"
-        "punpcklbw %%xmm4, %%xmm1       \n\t"
-        "punpcklbw %%xmm4, %%xmm2       \n\t"
-        "punpcklbw %%xmm4, %%xmm3       \n\t"
-        "movdqa %%xmm0, 64(%1)          \n\t"
-        "movdqa %%xmm1, 80(%1)          \n\t"
-        "movdqa %%xmm2, 96(%1)          \n\t"
-        "movdqa %%xmm3, 112(%1)         \n\t"
-        : "+r" (pixels)
-        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
-    );
-}
-
-static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
-{
-    __asm__ volatile(
-        "pxor %%mm7, %%mm7              \n\t"
-        "mov $-128, %%"REG_a"           \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%0), %%mm0               \n\t"
-        "movq (%1), %%mm2               \n\t"
-        "movq %%mm0, %%mm1              \n\t"
-        "movq %%mm2, %%mm3              \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "psubw %%mm2, %%mm0             \n\t"
-        "psubw %%mm3, %%mm1             \n\t"
-        "movq %%mm0, (%2, %%"REG_a")    \n\t"
-        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
-        "add %3, %0                     \n\t"
-        "add %3, %1                     \n\t"
-        "add $16, %%"REG_a"             \n\t"
-        "jnz 1b                         \n\t"
-        : "+r" (s1), "+r" (s2)
-        : "r" (block+64), "r" ((x86_reg)stride)
-        : "%"REG_a
-    );
-}
-
-static int pix_sum16_mmx(uint8_t * pix, int line_size){
-    const int h=16;
-    int sum;
-    x86_reg index= -line_size*h;
-
-    __asm__ volatile(
-        "pxor %%mm7, %%mm7              \n\t"
-        "pxor %%mm6, %%mm6              \n\t"
-        "1:                             \n\t"
-        "movq (%2, %1), %%mm0           \n\t"
-        "movq (%2, %1), %%mm1           \n\t"
-        "movq 8(%2, %1), %%mm2          \n\t"
-        "movq 8(%2, %1), %%mm3          \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "paddw %%mm0, %%mm1             \n\t"
-        "paddw %%mm2, %%mm3             \n\t"
-        "paddw %%mm1, %%mm3             \n\t"
-        "paddw %%mm3, %%mm6             \n\t"
-        "add %3, %1                     \n\t"
-        " js 1b                         \n\t"
-        "movq %%mm6, %%mm5              \n\t"
-        "psrlq $32, %%mm6               \n\t"
-        "paddw %%mm5, %%mm6             \n\t"
-        "movq %%mm6, %%mm5              \n\t"
-        "psrlq $16, %%mm6               \n\t"
-        "paddw %%mm5, %%mm6             \n\t"
-        "movd %%mm6, %0                 \n\t"
-        "andl $0xFFFF, %0               \n\t"
-        : "=&r" (sum), "+r" (index)
-        : "r" (pix - index), "r" ((x86_reg)line_size)
-    );
-
-    return sum;
-}
-
-static int pix_norm1_mmx(uint8_t *pix, int line_size) {
-    int tmp;
-    __asm__ volatile (
-        "movl $16,%%ecx\n"
-        "pxor %%mm0,%%mm0\n"
-        "pxor %%mm7,%%mm7\n"
-        "1:\n"
-        "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
-        "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */
-
-        "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */
-
-        "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
-        "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
-
-        "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
-        "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
-        "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
-
-        "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
-        "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
-
-        "pmaddwd %%mm3,%%mm3\n"
-        "pmaddwd %%mm4,%%mm4\n"
-
-        "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
-                                            pix2^2+pix3^2+pix6^2+pix7^2) */
-        "paddd %%mm3,%%mm4\n"
-        "paddd %%mm2,%%mm7\n"
-
-        "add %2, %0\n"
-        "paddd %%mm4,%%mm7\n"
-        "dec %%ecx\n"
-        "jnz 1b\n"
-
-        "movq %%mm7,%%mm1\n"
-        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
-        "paddd %%mm7,%%mm1\n"
-        "movd %%mm1,%1\n"
-        : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
-    return tmp;
-}
-
 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
     int tmp;
     __asm__ volatile (
@@ -1111,10 +944,23 @@ hadamard_func(ssse3)
 void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
-
-#if HAVE_INLINE_ASM
     int bit_depth = avctx->bits_per_raw_sample;
 
+#if HAVE_YASM
+    if (EXTERNAL_MMX(mm_flags)) {
+        if (bit_depth <= 8)
+            c->get_pixels = ff_get_pixels_mmx;
+        c->diff_pixels = ff_diff_pixels_mmx;
+        c->pix_sum     = ff_pix_sum16_mmx;
+
+        c->pix_norm1   = ff_pix_norm1_mmx;
+    }
+    if (EXTERNAL_SSE2(mm_flags))
+        if (bit_depth <= 8)
+            c->get_pixels = ff_get_pixels_sse2;
+#endif /* HAVE_YASM */
+
+#if HAVE_INLINE_ASM
     if (mm_flags & AV_CPU_FLAG_MMX) {
         const int dct_algo = avctx->dct_algo;
         if (avctx->bits_per_raw_sample <= 8 &&
@@ -1128,15 +974,10 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
             }
         }
 
-        if (bit_depth <= 8)
-            c->get_pixels = get_pixels_mmx;
-        c->diff_pixels = diff_pixels_mmx;
-        c->pix_sum = pix_sum16_mmx;
 
         c->diff_bytes= diff_bytes_mmx;
         c->sum_abs_dctelem= sum_abs_dctelem_mmx;
 
-        c->pix_norm1 = pix_norm1_mmx;
         c->sse[0] = sse16_mmx;
         c->sse[1] = sse8_mmx;
         c->vsad[4]= vsad_intra16_mmx;
@@ -1166,8 +1007,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
     }
 
     if(mm_flags & AV_CPU_FLAG_SSE2){
-        if (bit_depth <= 8)
-            c->get_pixels = get_pixels_sse2;
         c->sum_abs_dctelem= sum_abs_dctelem_sse2;
     }