author     Daniel Kang    2013-01-16 02:41:39 -0500
committer  Luca Barbato   2013-01-21 09:54:10 +0100
commit     9f00b1cbababa08dd220dbc0c74286a4707be746 (patch)
tree       db8d5ab341c405cbec8162af49d2f9243ac95734 /libavcodec
parent     c7df1532e5d690cb445ae443c998bd564c906a30 (diff)
dsputilenc: x86: Convert pixel inline asm to yasm
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
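
The change moves four pixel routines (get_pixels, diff_pixels, pix_sum, pix_norm1) out of GCC inline asm in dsputilenc_mmx.c and into yasm code in dsputilenc.asm: the assembly gains cglobal entry points, the C file keeps only ff_-prefixed prototypes, and ff_dsputilenc_init_mmx() assigns them to the DSPContext function pointers under HAVE_YASM/EXTERNAL_MMX instead of HAVE_INLINE_ASM. The following is a minimal, self-contained C sketch of that pattern, not the actual FFmpeg code: get_pixels_c_ref, MyDSPContext and my_dsp_init are hypothetical names that only mirror the structure of the init function changed below.

/* Sketch only (hypothetical names), illustrating what get_pixels computes and
 * how the init function selects an implementation through function pointers. */
#include <stdint.h>
#include <stdio.h>

typedef int16_t DCTELEM;

/* Reference behaviour of get_pixels: widen an 8x8 block of 8-bit pixels to
 * 16-bit DCTELEMs. The MMX/SSE2 code in the patch does the same widening with
 * punpcklbw/punpckhbw against a zeroed register. */
static void get_pixels_c_ref(DCTELEM *block, const uint8_t *pixels,
                             int line_size)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++)
            block[8 * i + j] = pixels[j];
        pixels += line_size;
    }
}

/* Simplified dispatch mirroring ff_dsputilenc_init_mmx(): the yasm versions
 * would be declared as ff_get_pixels_mmx()/_sse2() and assigned here when the
 * corresponding CPU flag is set and bit_depth <= 8. */
typedef struct MyDSPContext {
    void (*get_pixels)(DCTELEM *block, const uint8_t *pixels, int line_size);
} MyDSPContext;

static void my_dsp_init(MyDSPContext *c, int bit_depth)
{
    c->get_pixels = get_pixels_c_ref;   /* plain-C fallback */
    /* if (EXTERNAL_MMX(flags)  && bit_depth <= 8) c->get_pixels = ff_get_pixels_mmx;  */
    /* if (EXTERNAL_SSE2(flags) && bit_depth <= 8) c->get_pixels = ff_get_pixels_sse2; */
    (void)bit_depth;
}

int main(void)
{
    uint8_t src[8 * 8];
    DCTELEM dst[64];
    MyDSPContext c;

    for (int i = 0; i < 64; i++)
        src[i] = (uint8_t)i;

    my_dsp_init(&c, 8);
    c.get_pixels(dst, src, 8);          /* line_size == 8 for this test */
    printf("dst[0]=%d dst[63]=%d\n", dst[0], dst[63]);  /* prints 0 and 63 */
    return 0;
}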
Diffstat (limited to 'libavcodec')
-rw-r--r--  libavcodec/x86/dsputilenc.asm    152
-rw-r--r--  libavcodec/x86/dsputilenc_mmx.c  201
2 files changed, 172 insertions, 181 deletions
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index a2cb7f9202..7b8763cf59 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -333,3 +333,155 @@ cglobal sse16, 5, 5, 8
paddd m7, m1
movd eax, m7 ; return value
RET
+
+INIT_MMX mmx
+; get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
+cglobal get_pixels, 3,4
+ movsxdifnidn r2, r2d
+ add r0, 128
+ mov r3, -128
+ pxor m7, m7
+.loop:
+ mova m0, [r1]
+ mova m2, [r1+r2]
+ mova m1, m0
+ mova m3, m2
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ mova [r0+r3+ 0], m0
+ mova [r0+r3+ 8], m1
+ mova [r0+r3+16], m2
+ mova [r0+r3+24], m3
+ lea r1, [r1+r2*2]
+ add r3, 32
+ js .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal get_pixels, 3, 4
+ movsxdifnidn r2, r2d
+ lea r3, [r2*3]
+ pxor m4, m4
+ movh m0, [r1]
+ movh m1, [r1+r2]
+ movh m2, [r1+r2*2]
+ movh m3, [r1+r3]
+ lea r1, [r1+r2*4]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ mova [r0], m0
+ mova [r0+0x10], m1
+ mova [r0+0x20], m2
+ mova [r0+0x30], m3
+ movh m0, [r1]
+ movh m1, [r1+r2*1]
+ movh m2, [r1+r2*2]
+ movh m3, [r1+r3]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ mova [r0+0x40], m0
+ mova [r0+0x50], m1
+ mova [r0+0x60], m2
+ mova [r0+0x70], m3
+ RET
+
+INIT_MMX mmx
+; diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
+cglobal diff_pixels, 4,5
+ movsxdifnidn r3, r3d
+ pxor m7, m7
+ add r0, 128
+ mov r4, -128
+.loop:
+ mova m0, [r1]
+ mova m2, [r2]
+ mova m1, m0
+ mova m3, m2
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ psubw m0, m2
+ psubw m1, m3
+ mova [r0+r4+0], m0
+ mova [r0+r4+8], m1
+ add r1, r3
+ add r2, r3
+ add r4, 16
+ jne .loop
+ REP_RET
+
+INIT_MMX mmx
+; pix_sum16_mmx(uint8_t * pix, int line_size)
+cglobal pix_sum16, 2, 3
+ movsxdifnidn r1, r1d
+ mov r2, r1
+ neg r2
+ shl r2, 4
+ sub r0, r2
+ pxor m7, m7
+ pxor m6, m6
+.loop:
+ mova m0, [r0+r2+0]
+ mova m1, [r0+r2+0]
+ mova m2, [r0+r2+8]
+ mova m3, [r0+r2+8]
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ paddw m1, m0
+ paddw m3, m2
+ paddw m3, m1
+ paddw m6, m3
+ add r2, r1
+ js .loop
+ mova m5, m6
+ psrlq m6, 32
+ paddw m6, m5
+ mova m5, m6
+ psrlq m6, 16
+ paddw m6, m5
+ movd eax, m6
+ and eax, 0xffff
+ RET
+
+INIT_MMX mmx
+; pix_norm1_mmx(uint8_t *pix, int line_size)
+cglobal pix_norm1, 2, 4
+ movsxdifnidn r1, r1d
+ mov r2, 16
+ pxor m0, m0
+ pxor m7, m7
+.loop:
+ mova m2, [r0+0]
+ mova m3, [r0+8]
+ mova m1, m2
+ punpckhbw m1, m0
+ punpcklbw m2, m0
+ mova m4, m3
+ punpckhbw m3, m0
+ punpcklbw m4, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m2, m1
+ paddd m4, m3
+ paddd m7, m2
+ add r0, r1
+ paddd m7, m4
+ dec r2
+ jne .loop
+ mova m1, m7
+ psrlq m7, 32
+ paddd m1, m7
+ movd eax, m1
+ RET
+
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index e5d2473e3b..fa126d68fd 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -30,181 +30,14 @@
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"
+void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size);
+void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size);
+void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
+int ff_pix_sum16_mmx(uint8_t * pix, int line_size);
+int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
#if HAVE_INLINE_ASM
-static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
-{
- __asm__ volatile(
- "mov $-128, %%"REG_a" \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%0), %%mm0 \n\t"
- "movq (%0, %2), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "movq %%mm0, (%1, %%"REG_a") \n\t"
- "movq %%mm1, 8(%1, %%"REG_a") \n\t"
- "movq %%mm2, 16(%1, %%"REG_a") \n\t"
- "movq %%mm3, 24(%1, %%"REG_a") \n\t"
- "add %3, %0 \n\t"
- "add $32, %%"REG_a" \n\t"
- "js 1b \n\t"
- : "+r" (pixels)
- : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
- : "%"REG_a
- );
-}
-
-static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
-{
- __asm__ volatile(
- "pxor %%xmm4, %%xmm4 \n\t"
- "movq (%0), %%xmm0 \n\t"
- "movq (%0, %2), %%xmm1 \n\t"
- "movq (%0, %2,2), %%xmm2 \n\t"
- "movq (%0, %3), %%xmm3 \n\t"
- "lea (%0,%2,4), %0 \n\t"
- "punpcklbw %%xmm4, %%xmm0 \n\t"
- "punpcklbw %%xmm4, %%xmm1 \n\t"
- "punpcklbw %%xmm4, %%xmm2 \n\t"
- "punpcklbw %%xmm4, %%xmm3 \n\t"
- "movdqa %%xmm0, (%1) \n\t"
- "movdqa %%xmm1, 16(%1) \n\t"
- "movdqa %%xmm2, 32(%1) \n\t"
- "movdqa %%xmm3, 48(%1) \n\t"
- "movq (%0), %%xmm0 \n\t"
- "movq (%0, %2), %%xmm1 \n\t"
- "movq (%0, %2,2), %%xmm2 \n\t"
- "movq (%0, %3), %%xmm3 \n\t"
- "punpcklbw %%xmm4, %%xmm0 \n\t"
- "punpcklbw %%xmm4, %%xmm1 \n\t"
- "punpcklbw %%xmm4, %%xmm2 \n\t"
- "punpcklbw %%xmm4, %%xmm3 \n\t"
- "movdqa %%xmm0, 64(%1) \n\t"
- "movdqa %%xmm1, 80(%1) \n\t"
- "movdqa %%xmm2, 96(%1) \n\t"
- "movdqa %%xmm3, 112(%1) \n\t"
- : "+r" (pixels)
- : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
- );
-}
-
-static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
-{
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "mov $-128, %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%0), %%mm0 \n\t"
- "movq (%1), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "psubw %%mm2, %%mm0 \n\t"
- "psubw %%mm3, %%mm1 \n\t"
- "movq %%mm0, (%2, %%"REG_a") \n\t"
- "movq %%mm1, 8(%2, %%"REG_a") \n\t"
- "add %3, %0 \n\t"
- "add %3, %1 \n\t"
- "add $16, %%"REG_a" \n\t"
- "jnz 1b \n\t"
- : "+r" (s1), "+r" (s2)
- : "r" (block+64), "r" ((x86_reg)stride)
- : "%"REG_a
- );
-}
-
-static int pix_sum16_mmx(uint8_t * pix, int line_size){
- const int h=16;
- int sum;
- x86_reg index= -line_size*h;
-
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "pxor %%mm6, %%mm6 \n\t"
- "1: \n\t"
- "movq (%2, %1), %%mm0 \n\t"
- "movq (%2, %1), %%mm1 \n\t"
- "movq 8(%2, %1), %%mm2 \n\t"
- "movq 8(%2, %1), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "paddw %%mm0, %%mm1 \n\t"
- "paddw %%mm2, %%mm3 \n\t"
- "paddw %%mm1, %%mm3 \n\t"
- "paddw %%mm3, %%mm6 \n\t"
- "add %3, %1 \n\t"
- " js 1b \n\t"
- "movq %%mm6, %%mm5 \n\t"
- "psrlq $32, %%mm6 \n\t"
- "paddw %%mm5, %%mm6 \n\t"
- "movq %%mm6, %%mm5 \n\t"
- "psrlq $16, %%mm6 \n\t"
- "paddw %%mm5, %%mm6 \n\t"
- "movd %%mm6, %0 \n\t"
- "andl $0xFFFF, %0 \n\t"
- : "=&r" (sum), "+r" (index)
- : "r" (pix - index), "r" ((x86_reg)line_size)
- );
-
- return sum;
-}
-
-static int pix_norm1_mmx(uint8_t *pix, int line_size) {
- int tmp;
- __asm__ volatile (
- "movl $16,%%ecx\n"
- "pxor %%mm0,%%mm0\n"
- "pxor %%mm7,%%mm7\n"
- "1:\n"
- "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
- "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
-
- "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
-
- "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
- "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
-
- "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
- "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
- "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
-
- "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
- "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
-
- "pmaddwd %%mm3,%%mm3\n"
- "pmaddwd %%mm4,%%mm4\n"
-
- "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
- pix2^2+pix3^2+pix6^2+pix7^2) */
- "paddd %%mm3,%%mm4\n"
- "paddd %%mm2,%%mm7\n"
-
- "add %2, %0\n"
- "paddd %%mm4,%%mm7\n"
- "dec %%ecx\n"
- "jnz 1b\n"
-
- "movq %%mm7,%%mm1\n"
- "psrlq $32, %%mm7\n" /* shift hi dword to lo */
- "paddd %%mm7,%%mm1\n"
- "movd %%mm1,%1\n"
- : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
- return tmp;
-}
-
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
int tmp;
__asm__ volatile (
@@ -1111,10 +944,23 @@ hadamard_func(ssse3)
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
-
-#if HAVE_INLINE_ASM
int bit_depth = avctx->bits_per_raw_sample;
+#if HAVE_YASM
+ if (EXTERNAL_MMX(mm_flags)) {
+ if (bit_depth <= 8)
+ c->get_pixels = ff_get_pixels_mmx;
+ c->diff_pixels = ff_diff_pixels_mmx;
+ c->pix_sum = ff_pix_sum16_mmx;
+
+ c->pix_norm1 = ff_pix_norm1_mmx;
+ }
+ if (EXTERNAL_SSE2(mm_flags))
+ if (bit_depth <= 8)
+ c->get_pixels = ff_get_pixels_sse2;
+#endif /* HAVE_YASM */
+
+#if HAVE_INLINE_ASM
if (mm_flags & AV_CPU_FLAG_MMX) {
const int dct_algo = avctx->dct_algo;
if (avctx->bits_per_raw_sample <= 8 &&
@@ -1128,15 +974,10 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
}
- if (bit_depth <= 8)
- c->get_pixels = get_pixels_mmx;
- c->diff_pixels = diff_pixels_mmx;
- c->pix_sum = pix_sum16_mmx;
c->diff_bytes= diff_bytes_mmx;
c->sum_abs_dctelem= sum_abs_dctelem_mmx;
- c->pix_norm1 = pix_norm1_mmx;
c->sse[0] = sse16_mmx;
c->sse[1] = sse8_mmx;
c->vsad[4]= vsad_intra16_mmx;
@@ -1166,8 +1007,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
if(mm_flags & AV_CPU_FLAG_SSE2){
- if (bit_depth <= 8)
- c->get_pixels = get_pixels_sse2;
c->sum_abs_dctelem= sum_abs_dctelem_sse2;
}