Diffstat (limited to 'libavcodec/x86')
163 files changed, 19749 insertions, 4446 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 09bb6a2fe1..08cee1c4fd 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -3,11 +3,13 @@ OBJS                                   += x86/constants.o            \
 # subsystems
 OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
 OBJS-$(CONFIG_AUDIODSP)                += x86/audiodsp_init.o
-OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp.o
+OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp_init.o
 OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
 OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
+OBJS-$(CONFIG_FLAC_DECODER)            += x86/flacdsp_init.o
+OBJS-$(CONFIG_FLAC_ENCODER)            += x86/flacdsp_init.o
 OBJS-$(CONFIG_FMTCONVERT)              += x86/fmtconvert_init.o
 OBJS-$(CONFIG_H263DSP)                 += x86/h263dsp_init.o
 OBJS-$(CONFIG_H264CHROMA)              += x86/h264chroma_init.o
@@ -15,6 +17,8 @@ OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
 OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
 OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
+OBJS-$(CONFIG_LLAUDDSP)                += x86/lossless_audiodsp_init.o
+OBJS-$(CONFIG_LLVIDDSP)                += x86/lossless_videodsp_init.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
@@ -33,20 +37,26 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 # decoders/encoders
 OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
-OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_DECODER)      += x86/g722dsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
+OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
-OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp.o
+OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
+OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
 OBJS-$(CONFIG_MPEG4_DECODER)           += x86/xvididct_init.o
 OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
 OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
 OBJS-$(CONFIG_RV30_DECODER)            += x86/rv34dsp_init.o
 OBJS-$(CONFIG_RV40_DECODER)            += x86/rv34dsp_init.o         \
                                           x86/rv40dsp_init.o
-OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc.o
-OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp.o
+OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
+OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
+OBJS-$(CONFIG_TTA_DECODER)             += x86/ttadsp_init.o
+OBJS-$(CONFIG_V210_DECODER)            += x86/v210-init.o
 OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
 OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
@@ -54,21 +64,18 @@ OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
 OBJS-$(CONFIG_VP7_DECODER)             += x86/vp8dsp_init.o
 OBJS-$(CONFIG_VP8_DECODER)             += x86/vp8dsp_init.o
 OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
+OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o

 # GCC inline assembly optimizations
 # subsystems
-MMX-OBJS-$(CONFIG_AUDIODSP)            += x86/audiodsp_mmx.o
-MMX-OBJS-$(CONFIG_HPELDSP)             += x86/fpel_mmx.o             \
-                                          x86/hpeldsp_mmx.o
+MMX-OBJS-$(CONFIG_DIRAC_DECODER)       += x86/dirac_dwt.o
 MMX-OBJS-$(CONFIG_FDCTDSP)             += x86/fdct.o
-MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/idctdsp_mmx.o          \
-                                          x86/simple_idct.o
-MMX-OBJS-$(CONFIG_QPELDSP)             += x86/fpel_mmx.o
+MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/simple_idct.o

 # decoders/encoders
-MMX-OBJS-$(CONFIG_MPEG4_DECODER)       += x86/xvididct_mmx.o         \
-                                          x86/xvididct_sse2.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER)        += x86/snowdsp.o
 MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o
@@ -78,10 +85,17 @@ YASM-OBJS                              += x86/deinterlace.o          \
 # subsystems
 YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
 YASM-OBJS-$(CONFIG_AUDIODSP)           += x86/audiodsp.o
+YASM-OBJS-$(CONFIG_BLOCKDSP)           += x86/blockdsp.o
 YASM-OBJS-$(CONFIG_BSWAPDSP)           += x86/bswapdsp.o
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
+YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\
+                                          x86/dwt_yasm.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
+YASM-OBJS-$(CONFIG_FLAC_DECODER)       += x86/flacdsp.o
+ifdef CONFIG_GPL
+YASM-OBJS-$(CONFIG_FLAC_ENCODER)       += x86/flac_dsp_gpl.o
+endif
 YASM-OBJS-$(CONFIG_FMTCONVERT)         += x86/fmtconvert.o
 YASM-OBJS-$(CONFIG_H263DSP)            += x86/h263_loopfilter.o
 YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o        \
@@ -101,6 +115,9 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o       \
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                 \
                                           x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
+YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/idctdsp.o
+YASM-OBJS-$(CONFIG_LLAUDDSP)           += x86/lossless_audiodsp.o
+YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
 YASM-OBJS-$(CONFIG_ME_CMP)             += x86/me_cmp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
@@ -108,20 +125,34 @@ YASM-OBJS-$(CONFIG_PIXBLOCKDSP)        += x86/pixblockdsp.o
 YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o              \
                                           x86/fpel.o                 \
                                           x86/qpel.o
-YASM-OBJS-$(CONFIG_V210_ENCODER)       += x86/v210enc.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o

 # decoders/encoders
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
-YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
+YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
+YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
+YASM-OBJS-$(CONFIG_APNG_DECODER)       += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o              \
+                                          x86/hevc_deblock.o         \
+                                          x86/hevc_idct.o            \
+                                          x86/hevc_res_add.o         \
+                                          x86/hevc_sao.o
+YASM-OBJS-$(CONFIG_JPEG2000_DECODER)   += x86/jpeg2000dsp.o
+YASM-OBJS-$(CONFIG_MLP_DECODER)        += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_MPEG4_DECODER)      += x86/xvididct.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
+YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV30_DECODER)       += x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o              \
                                           x86/rv40dsp.o
+YASM-OBJS-$(CONFIG_SVQ1_ENCODER)       += x86/svq1enc.o
+YASM-OBJS-$(CONFIG_TRUEHD_DECODER)     += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_TTA_DECODER)        += x86/ttadsp.o
+YASM-OBJS-$(CONFIG_V210_ENCODER)       += x86/v210enc.o
+YASM-OBJS-$(CONFIG_V210_DECODER)       += x86/v210.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
 YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
@@ -129,4 +160,8 @@ YASM-OBJS-$(CONFIG_VP7_DECODER)        += x86/vp8dsp.o               \
                                           x86/vp8dsp_loopfilter.o
 YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o               \
                                           x86/vp8dsp_loopfilter.o
-YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9dsp.o
+YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9intrapred.o         \
+                                          x86/vp9itxfm.o             \
+                                          x86/vp9lpf.o               \
+                                          x86/vp9mc.o
+YASM-OBJS-$(CONFIG_WEBP_DECODER)       += x86/vp8dsp.o
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 817d5a319c..675ade3101 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -2,20 +2,20 @@
 ;* x86-optimized AC-3 DSP functions
 ;* Copyright (c) 2011 Justin Ruggles
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
@@ -32,7 +32,7 @@ pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
 pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7

 ; used in ff_ac3_extract_exponents()
-pd_1:   times 4 dd 1
+cextern pd_1
 pd_151: times 4 dd 151

 ; used in ff_apply_window_int16()
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index cd638b9228..eea2736bfa 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -2,20 +2,20 @@
  * x86-optimized AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
@@ -64,6 +64,11 @@ void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);

+#if ARCH_X86_32 && defined(__INTEL_COMPILER)
+#       undef HAVE_7REGS
+#       define HAVE_7REGS 0
+#endif
+
 #if HAVE_SSE_INLINE && HAVE_7REGS

 #define IF1(x) x
@@ -160,7 +165,7 @@ static void ac3_downmix_sse(float **samples, float (*matrix)[2],
         matrix_cmp[3][0] == matrix_cmp[4][0]) {
         MIX5(IF1, IF0);
     } else {
-        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
+        LOCAL_ALIGNED(16, float, matrix_simd, [AC3_MAX_CHANNELS], [2][4]);
         float *samp[AC3_MAX_CHANNELS];

         for (j = 0; j < in_ch; j++)
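Note on the DECLARE_ALIGNED → LOCAL_ALIGNED conversion above: a compiler can only honor an alignment request on a stack variable up to the alignment the ABI guarantees for the stack itself, so LOCAL_ALIGNED over-allocates and aligns the pointer by hand where needed. A minimal sketch of the idea (not FFmpeg's exact macro; the name below is hypothetical):

```c
#include <stdint.h>

/* Sketch of what a LOCAL_ALIGNED-style macro must do on targets where
 * the stack is only 4- or 8-byte aligned: over-allocate a raw buffer
 * and round the pointer up to the next 16-byte boundary. */
#define LOCAL_ALIGNED_16_FLOATS(name, count)                              \
    uint8_t name##_raw[(count) * sizeof(float) + 15];                     \
    float *name = (float *)(((uintptr_t)name##_raw + 15) & ~(uintptr_t)15)
```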
diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm
index f2e831df17..273b9ef660 100644
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@@ -2,20 +2,20 @@
 ;* optimized audio functions
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
@@ -40,15 +40,11 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m1
     add     orderq, mmsize*2
     jl .loop
-%if mmsize == 16
-    movhlps m0, m2
-    paddd   m2, m0
-    pshuflw m0, m2, 0x4e
-%else
-    pshufw  m0, m2, 0x4e
-%endif
-    paddd   m2, m0
+    HADDD   m2, m0
     movd    eax, m2
+%if mmsize == 8
+    emms
+%endif
     RET
 %endmacro
@@ -80,17 +76,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
     SPLATD  m4
     SPLATD  m5
 .loop:
-%assign %%i 1
+%assign %%i 0
 %rep %2
-    mova      m0,  [srcq+mmsize*0*%%i]
-    mova      m1,  [srcq+mmsize*1*%%i]
-    mova      m2,  [srcq+mmsize*2*%%i]
-    mova      m3,  [srcq+mmsize*3*%%i]
+    mova      m0,  [srcq+mmsize*(0+%%i)]
+    mova      m1,  [srcq+mmsize*(1+%%i)]
+    mova      m2,  [srcq+mmsize*(2+%%i)]
+    mova      m3,  [srcq+mmsize*(3+%%i)]
 %if %3
-    mova      m7,  [srcq+mmsize*4*%%i]
-    mova      m8,  [srcq+mmsize*5*%%i]
-    mova      m9,  [srcq+mmsize*6*%%i]
-    mova      m10, [srcq+mmsize*7*%%i]
+    mova      m7,  [srcq+mmsize*(4+%%i)]
+    mova      m8,  [srcq+mmsize*(5+%%i)]
+    mova      m9,  [srcq+mmsize*(6+%%i)]
+    mova      m10, [srcq+mmsize*(7+%%i)]
 %endif
     CLIPD  m0,  m4, m5, m6
     CLIPD  m1,  m4, m5, m6
@@ -102,17 +98,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
     CLIPD  m9,  m4, m5, m6
     CLIPD  m10, m4, m5, m6
 %endif
-    mova  [dstq+mmsize*0*%%i], m0
-    mova  [dstq+mmsize*1*%%i], m1
-    mova  [dstq+mmsize*2*%%i], m2
-    mova  [dstq+mmsize*3*%%i], m3
+    mova  [dstq+mmsize*(0+%%i)], m0
+    mova  [dstq+mmsize*(1+%%i)], m1
+    mova  [dstq+mmsize*(2+%%i)], m2
+    mova  [dstq+mmsize*(3+%%i)], m3
 %if %3
-    mova  [dstq+mmsize*4*%%i], m7
-    mova  [dstq+mmsize*5*%%i], m8
-    mova  [dstq+mmsize*6*%%i], m9
-    mova  [dstq+mmsize*7*%%i], m10
+    mova  [dstq+mmsize*(4+%%i)], m7
+    mova  [dstq+mmsize*(5+%%i)], m8
+    mova  [dstq+mmsize*(6+%%i)], m9
+    mova  [dstq+mmsize*(7+%%i)], m10
 %endif
-%assign %%i %%i+1
+%assign %%i %%i+4*(%3+1)
 %endrep
     add srcq, mmsize*4*(%2+%3)
     add dstq, mmsize*4*(%2+%3)
@@ -135,3 +131,47 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
 %else
 VECTOR_CLIP_INT32 6, 1, 0, 0
 %endif
+
+;-----------------------------------------------------
+;void ff_vector_clipf(float *dst, const float *src,
+;                     float min, float max, int len)
+;-----------------------------------------------------
+INIT_XMM sse
+%if UNIX64
+cglobal vector_clipf, 3,3,6, dst, src, len
+%else
+cglobal vector_clipf, 5,5,6, dst, src, min, max, len
+%endif
+%if WIN64
+    SWAP 0, 2
+    SWAP 1, 3
+%elif ARCH_X86_32
+    movss   m0, minm
+    movss   m1, maxm
+%endif
+    SPLATD  m0
+    SPLATD  m1
+    shl     lend, 2
+    add     srcq, lenq
+    add     dstq, lenq
+    neg     lenq
+.loop:
+    mova    m2, [srcq+lenq+mmsize*0]
+    mova    m3, [srcq+lenq+mmsize*1]
+    mova    m4, [srcq+lenq+mmsize*2]
+    mova    m5, [srcq+lenq+mmsize*3]
+    maxps   m2, m0
+    maxps   m3, m0
+    maxps   m4, m0
+    maxps   m5, m0
+    minps   m2, m1
+    minps   m3, m1
+    minps   m4, m1
+    minps   m5, m1
+    mova    [dstq+lenq+mmsize*0], m2
+    mova    [dstq+lenq+mmsize*1], m3
+    mova    [dstq+lenq+mmsize*2], m4
+    mova    [dstq+lenq+mmsize*3], m5
+    add     lenq, mmsize*4
+    jl .loop
+    REP_RET
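For reference, scalar C sketches of the two functions touched above (what the SIMD code computes, not the exact FFmpeg C fallbacks):

```c
#include <stdint.h>

/* Dot product of two int16 vectors; the asm accumulates partial sums in
 * m2 and the new HADDD macro performs the final horizontal add. */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    int32_t res = 0;
    while (order--)
        res += *v1++ * *v2++;
    return res;
}

/* dst[i] = clip(src[i], min, max); the new SSE loop does the same with
 * maxps/minps on four floats per register, four registers per iteration. */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    for (int i = 0; i < len; i++)
        dst[i] = src[i] < min ? min : (src[i] > max ? max : src[i]);
}
```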
diff --git a/libavcodec/x86/audiodsp.h b/libavcodec/x86/audiodsp.h
deleted file mode 100644
index 321056b8b7..0000000000
--- a/libavcodec/x86/audiodsp.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_AUDIODSP_H
-#define AVCODEC_X86_AUDIODSP_H
-
-void ff_vector_clipf_sse(float *dst, const float *src,
-                         float min, float max, int len);
-
-#endif /* AVCODEC_X86_AUDIODSP_H */
diff --git a/libavcodec/x86/audiodsp_init.c b/libavcodec/x86/audiodsp_init.c
index 743f5a3699..a2ce231f32 100644
--- a/libavcodec/x86/audiodsp_init.c
+++ b/libavcodec/x86/audiodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
@@ -24,7 +24,6 @@
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/audiodsp.h"
-#include "audiodsp.h"

 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                       int order);
@@ -39,6 +38,8 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                    int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                                int32_t min, int32_t max, unsigned int len);
+void ff_vector_clipf_sse(float *dst, const float *src,
+                         float min, float max, int len);

 av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
 {
@@ -50,7 +51,7 @@ av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
     if (EXTERNAL_MMXEXT(cpu_flags))
         c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;

-    if (INLINE_SSE(cpu_flags))
+    if (EXTERNAL_SSE(cpu_flags))
         c->vector_clipf = ff_vector_clipf_sse;

     if (EXTERNAL_SSE2(cpu_flags)) {
diff --git a/libavcodec/x86/audiodsp_mmx.c b/libavcodec/x86/audiodsp_mmx.c
deleted file mode 100644
index cb550598f9..0000000000
--- a/libavcodec/x86/audiodsp_mmx.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/x86/asm.h"
-#include "audiodsp.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_vector_clipf_sse(float *dst, const float *src,
-                         float min, float max, int len)
-{
-    x86_reg i = (len - 16) * 4;
-    __asm__ volatile (
-        "movss %3, %%xmm4              \n\t"
-        "movss %4, %%xmm5              \n\t"
-        "shufps $0, %%xmm4, %%xmm4     \n\t"
-        "shufps $0, %%xmm5, %%xmm5     \n\t"
-        "1:                            \n\t"
-        "movaps   (%2, %0), %%xmm0     \n\t" // 3/1 on intel
-        "movaps 16(%2, %0), %%xmm1     \n\t"
-        "movaps 32(%2, %0), %%xmm2     \n\t"
-        "movaps 48(%2, %0), %%xmm3     \n\t"
-        "maxps %%xmm4, %%xmm0          \n\t"
-        "maxps %%xmm4, %%xmm1          \n\t"
-        "maxps %%xmm4, %%xmm2          \n\t"
-        "maxps %%xmm4, %%xmm3          \n\t"
-        "minps %%xmm5, %%xmm0          \n\t"
-        "minps %%xmm5, %%xmm1          \n\t"
-        "minps %%xmm5, %%xmm2          \n\t"
-        "minps %%xmm5, %%xmm3          \n\t"
-        "movaps %%xmm0,   (%1, %0)     \n\t"
-        "movaps %%xmm1, 16(%1, %0)     \n\t"
-        "movaps %%xmm2, 32(%1, %0)     \n\t"
-        "movaps %%xmm3, 48(%1, %0)     \n\t"
-        "sub $64, %0                   \n\t"
-        "jge 1b                        \n\t"
-        : "+&r" (i)
-        : "r" (dst), "r" (src), "m" (min), "m" (max)
-        : "memory");
-}
-
-#endif /* HAVE_INLINE_ASM */
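With the inline-asm file gone, the init code switches from INLINE_SSE() to EXTERNAL_SSE(), which additionally requires the yasm-assembled objects to be present in the build. A minimal sketch of the dispatch pattern these _init.c files follow (names taken from the diff above):

```c
#include "libavutil/x86/cpu.h"        /* av_get_cpu_flags(), EXTERNAL_SSE() */
#include "libavcodec/audiodsp.h"      /* AudioDSPContext */

/* Probe CPU features once at init time, then install the best available
 * implementation as a function pointer.  EXTERNAL_* macros also check
 * that standalone (yasm) asm was compiled in, unlike INLINE_*. */
static void audiodsp_dispatch_sketch(AudioDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(cpu_flags))
        c->vector_clipf = ff_vector_clipf_sse;   /* yasm implementation */
}
```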
diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm
new file mode 100644
index 0000000000..c793858861
--- /dev/null
+++ b/libavcodec/x86/blockdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD-optimized clear block functions
+;* Copyright (c) 2002 Michael Niedermayer
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2009 Fiona Glaser
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+;----------------------------------------
+; void ff_clear_block(int16_t *blocks);
+;----------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline store loops
+%macro CLEAR_BLOCK 2
+cglobal clear_block, 1, 1, %1, blocks
+    ZERO  m0, m0
+%assign %%i 0
+%rep %2
+    mova  [blocksq+mmsize*(0+%%i)], m0
+    mova  [blocksq+mmsize*(1+%%i)], m0
+    mova  [blocksq+mmsize*(2+%%i)], m0
+    mova  [blocksq+mmsize*(3+%%i)], m0
+    mova  [blocksq+mmsize*(4+%%i)], m0
+    mova  [blocksq+mmsize*(5+%%i)], m0
+    mova  [blocksq+mmsize*(6+%%i)], m0
+    mova  [blocksq+mmsize*(7+%%i)], m0
+%assign %%i %%i+8
+%endrep
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCK 0, 2
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCK 1, 1
+
+;-----------------------------------------
+; void ff_clear_blocks(int16_t *blocks);
+;-----------------------------------------
+; %1 = number of xmm registers used
+%macro CLEAR_BLOCKS 1
+cglobal clear_blocks, 1, 2, %1, blocks, len
+    add   blocksq, 768
+    mov   lenq, -768
+    ZERO  m0, m0
+.loop
+    mova  [blocksq+lenq+mmsize*0], m0
+    mova  [blocksq+lenq+mmsize*1], m0
+    mova  [blocksq+lenq+mmsize*2], m0
+    mova  [blocksq+lenq+mmsize*3], m0
+    mova  [blocksq+lenq+mmsize*4], m0
+    mova  [blocksq+lenq+mmsize*5], m0
+    mova  [blocksq+lenq+mmsize*6], m0
+    mova  [blocksq+lenq+mmsize*7], m0
+    add   lenq, mmsize*8
+    js .loop
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS 0
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS 1
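The contract of the two new yasm entry points is simple; in plain C they amount to the following (a sketch of the semantics, not FFmpeg's fallback code):

```c
#include <string.h>
#include <stdint.h>

/* Zero one 8x8 block of int16_t DCT coefficients (128 bytes). */
static void clear_block_c(int16_t *block)
{
    memset(block, 0, 64 * sizeof(int16_t));
}

/* Zero six such blocks, i.e. a full macroblock's worth of coefficients:
 * 6 * 64 * 2 = 768 bytes, which is the constant in the asm loop above. */
static void clear_blocks_c(int16_t *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(int16_t));
}
```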
diff --git a/libavcodec/x86/blockdsp.c b/libavcodec/x86/blockdsp.c
deleted file mode 100644
index b5294242ab..0000000000
--- a/libavcodec/x86/blockdsp.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/blockdsp.h"
-#include "libavcodec/version.h"
-
-#if HAVE_INLINE_ASM
-
-#define CLEAR_BLOCKS(name, n)                           \
-static void name(int16_t *blocks)                       \
-{                                                       \
-    __asm__ volatile (                                  \
-        "pxor %%mm7, %%mm7              \n\t"           \
-        "mov     %1, %%"REG_a"          \n\t"           \
-        "1:                             \n\t"           \
-        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
-        "add    $32, %%"REG_a"          \n\t"           \
-        "js      1b                     \n\t"           \
-        :: "r"(((uint8_t *) blocks) + 128 * n),         \
-           "i"(-128 * n)                                \
-        : "%"REG_a);                                    \
-}
-CLEAR_BLOCKS(clear_blocks_mmx, 6)
-CLEAR_BLOCKS(clear_block_mmx, 1)
-
-static void clear_block_sse(int16_t *block)
-{
-    __asm__ volatile (
-        "xorps  %%xmm0, %%xmm0          \n"
-        "movaps %%xmm0,    (%0)         \n"
-        "movaps %%xmm0,  16(%0)         \n"
-        "movaps %%xmm0,  32(%0)         \n"
-        "movaps %%xmm0,  48(%0)         \n"
-        "movaps %%xmm0,  64(%0)         \n"
-        "movaps %%xmm0,  80(%0)         \n"
-        "movaps %%xmm0,  96(%0)         \n"
-        "movaps %%xmm0, 112(%0)         \n"
-        :: "r" (block)
-        : "memory");
-}
-
-static void clear_blocks_sse(int16_t *blocks)
-{
-    __asm__ volatile (
-        "xorps  %%xmm0, %%xmm0              \n"
-        "mov        %1, %%"REG_a"           \n"
-        "1:                                 \n"
-        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
-        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
-        "add      $128, %%"REG_a"           \n"
-        "js         1b                      \n"
-        :: "r"(((uint8_t *) blocks) + 128 * 6), "i"(-128 * 6)
-        : "%"REG_a);
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-#if FF_API_XVMC
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth,
-                                  AVCodecContext *avctx)
-#else
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth)
-#endif /* FF_API_XVMC */
-{
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
-
-    if (!high_bit_depth) {
-        if (INLINE_MMX(cpu_flags)) {
-            c->clear_block  = clear_block_mmx;
-            c->clear_blocks = clear_blocks_mmx;
-        }
-
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-        /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
-        if (CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)
-            return;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
-        if (INLINE_SSE(cpu_flags)) {
-            c->clear_block  = clear_block_sse;
-            c->clear_blocks = clear_blocks_sse;
-        }
-    }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c
new file mode 100644
index 0000000000..7780184af6
--- /dev/null
+++ b/libavcodec/x86/blockdsp_init.c
@@ -0,0 +1,60 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/blockdsp.h"
+#include "libavcodec/version.h"
+
+void ff_clear_block_mmx(int16_t *block);
+void ff_clear_block_sse(int16_t *block);
+void ff_clear_blocks_mmx(int16_t *blocks);
+void ff_clear_blocks_sse(int16_t *blocks);
+
+#if FF_API_XVMC
+av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth,
+                                  AVCodecContext *avctx)
+#else
+av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth)
+#endif /* FF_API_XVMC */
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (!high_bit_depth) {
+        if (EXTERNAL_MMX(cpu_flags)) {
+            c->clear_block  = ff_clear_block_mmx;
+            c->clear_blocks = ff_clear_blocks_mmx;
+        }
+
+        /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
+        if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
+            return;
+
+        if (EXTERNAL_SSE(cpu_flags)) {
+            c->clear_block  = ff_clear_block_sse;
+            c->clear_blocks = ff_clear_blocks_sse;
+        }
+    }
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 17a6cb1be3..ec060c93b6 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -1,21 +1,23 @@
 ;******************************************************************************
 ;* optimized bswap buffer functions
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
@@ -24,6 +26,8 @@ SECTION_RODATA
 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

+cextern pb_80
+
 SECTION_TEXT

 ; %1 = aligned/unaligned
@@ -90,6 +94,7 @@ cglobal bswap32_buf, 3,4,3
 cglobal bswap32_buf, 3,4,5
     mov      r3, r1
 %endif
+    or       r3, r0
     and      r3, 15
     jz       .start_align
     BSWAP_LOOPS  u
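The added `or r3, r0` widens the alignment test in bswap32_buf so the aligned (movdqa) path is only taken when both pointers are 16-byte aligned; assuming r0/r1 hold the dst/src arguments, the C equivalent of the check is:

```c
#include <stdint.h>

/* OR-ing the two addresses before masking tests both alignments at once:
 * the low four bits are zero only if dst AND src are 16-byte aligned. */
static int both_16byte_aligned(const uint32_t *dst, const uint32_t *src)
{
    return (((uintptr_t)dst | (uintptr_t)src) & 15) == 0;
}
```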
diff --git a/libavcodec/x86/bswapdsp_init.c b/libavcodec/x86/bswapdsp_init.c
index ba40f2dbe1..c042e56371 100644
--- a/libavcodec/x86/bswapdsp_init.c
+++ b/libavcodec/x86/bswapdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index d1701bf071..3510336f95 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
@@ -27,8 +27,28 @@
 #include "libavutil/x86/asm.h"
 #include "config.h"

+#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+ || (   !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
+ || (defined(__INTEL_COMPILER) && defined(_MSC_VER))
+#       define BROKEN_COMPILER 1
+#else
+#       define BROKEN_COMPILER 0
+#endif
+
 #if HAVE_INLINE_ASM

+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
+#if UNCHECKED_BITSTREAM_READER
+#define END_CHECK(end) ""
+#else
+#define END_CHECK(end) \
+        "cmp    "end"       , %%"REG_c"                 \n\t"\
+        "jge    1f                                      \n\t"
+#endif
+
 #ifdef BROKEN_RELOCATIONS
 #define TABLES_ARG , "r"(tables)
@@ -73,8 +93,7 @@
         "test   "lowword"   , "lowword"                 \n\t"\
         "jnz    2f                                      \n\t"\
         "mov    "byte"      , %%"REG_c"                 \n\t"\
-        "cmp    "end"       , %%"REG_c"                 \n\t"\
-        "jge    1f                                      \n\t"\
+        END_CHECK(end)\
        "add"OPSIZE" $2     , "byte"                    \n\t"\
         "1:                                             \n\t"\
         "movzwl (%%"REG_c") , "tmp"                     \n\t"\
@@ -92,7 +111,8 @@
         "2:                                             \n\t"

 #else /* BROKEN_RELOCATIONS */
-#define TABLES_ARG
+#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
+#define RIP_ARG

 #if HAVE_FAST_CMOV
 #define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
@@ -134,8 +154,7 @@
         "test   "lowword"   , "lowword"                 \n\t"\
         " jnz   2f                                      \n\t"\
         "mov    "byte"      , %%"REG_c"                 \n\t"\
-        "cmp    "end"       , %%"REG_c"                 \n\t"\
-        "jge    1f                                      \n\t"\
+        END_CHECK(end)\
         "add"OPSIZE" $2     , "byte"                    \n\t"\
         "1:                                             \n\t"\
         "movzwl (%%"REG_c") , "tmp"                     \n\t"\
@@ -154,8 +173,7 @@

 #endif /* BROKEN_RELOCATIONS */

-
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
 #define get_cabac_inline get_cabac_inline_x86
 static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                                                  uint8_t *const state)
@@ -167,6 +185,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
     __asm__ volatile(
         "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
@@ -178,17 +197,19 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                                  AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                                  AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                                  "%8")
-        : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+        : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
         : "r"(state), "r"(c),
           "i"(offsetof(CABACContext, bytestream)),
           "i"(offsetof(CABACContext, bytestream_end))
           TABLES_ARG
+          ,"1"(c->low), "2"(c->range)
        : "%"REG_c, "memory"
    );
    return bit & 1;
 }
-#endif /* HAVE_7REGS */
+#endif /* HAVE_7REGS && !BROKEN_COMPILER */

+#if !BROKEN_COMPILER
 #define get_cabac_bypass_sign get_cabac_bypass_sign_x86
 static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
 {
@@ -199,7 +220,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "shl    $17, %k1                        \n\t"
         "add    %%eax, %%eax                    \n\t"
         "sub    %k1, %%eax                      \n\t"
-        "cltd                                   \n\t"
+        "cdq                                    \n\t"
         "and    %%edx, %k1                      \n\t"
         "add    %k1, %%eax                      \n\t"
         "xor    %%edx, %%ecx                    \n\t"
@@ -211,10 +232,16 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "movzwl (%1), %%edx                     \n\t"
         "bswap  %%edx                           \n\t"
         "shrl   $15, %%edx                      \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "add    $2, %1                          \n\t"
+        "addl   %%edx, %%eax                    \n\t"
+        "mov    %1, %c4(%2)                     \n\t"
+#else
         "addl   %%edx, %%eax                    \n\t"
         "cmp    %c5(%2), %1                     \n\t"
         "jge    1f                              \n\t"
         "add"OPSIZE"    $2, %c4(%2)             \n\t"
+#endif
         "1:                                     \n\t"
         "movl   %%eax, %c3(%2)                  \n\t"
@@ -240,7 +267,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
         "shl    $17, %k1                        \n\t"
         "add    %%eax, %%eax                    \n\t"
         "sub    %k1, %%eax                      \n\t"
-        "cltd                                   \n\t"
+        "cdq                                    \n\t"
         "and    %%edx, %k1                      \n\t"
         "add    %k1, %%eax                      \n\t"
         "inc    %%edx                           \n\t"
@@ -268,6 +295,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
     );
     return res;
 }
+#endif /* !BROKEN_COMPILER */

 #endif /* HAVE_INLINE_ASM */
 #endif /* AVCODEC_X86_CABAC_H */
"MANGLE(ADD)", %%mm6 \n\t"\ "psraw $3, %%mm6 \n\t"\ "packuswb %%mm6, %%mm6 \n\t"\ OP(%%mm6, (%1), A, d) \ "add %3, %1 \n\t" /* vertical filter [ 0 -7 42 96 -2 -1] */ -#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \ +#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \ "movd (%0), "#F" \n\t"\ "movq "#C", %%mm6 \n\t"\ "pmullw "MANGLE(MUL2)", %%mm6\n\t"\ "movq "#D", %%mm7 \n\t"\ - "pmullw %5, %%mm7 \n\t"\ + "pmullw "MANGLE(MUL1)", %%mm7\n\t"\ "psllw $3, "#B" \n\t"\ "psubw "#B", %%mm6 \n\t"\ "psraw $3, "#B" \n\t"\ @@ -269,7 +269,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) "psubw "#E", %%mm6 \n\t"\ "psraw $1, "#E" \n\t"\ "psubw "#F", %%mm6 \n\t"\ - "paddw %4, %%mm6 \n\t"\ + "paddw "MANGLE(ADD)", %%mm6 \n\t"\ "psraw $7, %%mm6 \n\t"\ "packuswb %%mm6, %%mm6 \n\t"\ OP(%%mm6, (%1), A, d) \ @@ -298,32 +298,34 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) "punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm4 \n\t"\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\ + VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\ + VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\ \ : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\ + : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\ + NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\ : "memory"\ );\ if(h==16){\ __asm__ volatile(\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\ + VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\ + VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\ \ : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\ + : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\ + NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\ : "memory"\ );\ }\ @@ -336,7 +338,7 @@ 
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int int h=8;\ __asm__ volatile(\ "pxor %%mm7, %%mm7 \n\t"\ - "movq %5, %%mm6 \n\t"\ + "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ "1: \n\t"\ "movq (%0), %%mm0 \n\t"\ "movq 1(%0), %%mm2 \n\t"\ @@ -362,7 +364,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int "paddw %%mm3, %%mm5 \n\t"\ "psubw %%mm2, %%mm0 \n\t"\ "psubw %%mm5, %%mm1 \n\t"\ - "movq %6, %%mm5 \n\t"\ + "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\ "paddw %%mm5, %%mm0 \n\t"\ "paddw %%mm5, %%mm1 \n\t"\ "psraw $3, %%mm0 \n\t"\ @@ -374,7 +376,8 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int "decl %2 \n\t"\ " jnz 1b \n\t"\ : "+a"(src), "+c"(dst), "+m"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ + NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\ : "memory"\ );\ }\ @@ -384,7 +387,7 @@ static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8 }\ \ static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\ - QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \ + QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42) \ }\ \ static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\ @@ -457,7 +460,7 @@ static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin #endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */ -#if HAVE_MMX_INLINE +#if HAVE_MMX_EXTERNAL static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) { @@ -470,6 +473,12 @@ static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src, ff_avg_pixels8_mmx(dst, src, stride, 8); } +static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels8_mmxext(dst, src, stride, 8); +} + static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) { @@ -482,18 +491,40 @@ static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src, ff_avg_pixels16_mmx(dst, src, stride, 16); } +static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels16_mmxext(dst, src, stride, 16); +} + +static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_put_pixels16_sse2(dst, src, stride, 16); +} + +static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels16_sse2(dst, src, stride, 16); +} +#endif + static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c, AVCodecContext *avctx) { +#if HAVE_MMX_EXTERNAL c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx; c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx; c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx; c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx; +#endif +#if HAVE_MMX_INLINE c->cavs_idct8_add = cavs_idct8_add_mmx; c->idct_perm = FF_IDCT_PERM_TRANSPOSE; -} #endif /* HAVE_MMX_INLINE */ +} #define DSPFUNC(PFX, IDX, NUM, EXT) \ c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \ @@ -509,15 +540,6 @@ CAVS_MC(put_, 8, mmxext) CAVS_MC(put_, 16, mmxext) CAVS_MC(avg_, 8, mmxext) CAVS_MC(avg_, 16, mmxext) - -static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c, - AVCodecContext *avctx) -{ - DSPFUNC(put, 0, 16, mmxext); - 
DSPFUNC(put, 1, 8, mmxext); - DSPFUNC(avg, 0, 16, mmxext); - DSPFUNC(avg, 1, 8, mmxext); -} #endif /* HAVE_MMXEXT_INLINE */ #if HAVE_AMD3DNOW_INLINE @@ -541,18 +563,31 @@ static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c, av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx) { -#if HAVE_MMX_INLINE int cpu_flags = av_get_cpu_flags(); - if (INLINE_MMX(cpu_flags)) - cavsdsp_init_mmx(c, avctx); -#endif /* HAVE_MMX_INLINE */ + cavsdsp_init_mmx(c, avctx); #if HAVE_AMD3DNOW_INLINE if (INLINE_AMD3DNOW(cpu_flags)) cavsdsp_init_3dnow(c, avctx); #endif /* HAVE_AMD3DNOW_INLINE */ #if HAVE_MMXEXT_INLINE - if (INLINE_MMXEXT(cpu_flags)) - cavsdsp_init_mmxext(c, avctx); -#endif /* HAVE_MMXEXT_INLINE */ + if (INLINE_MMXEXT(cpu_flags)) { + DSPFUNC(put, 0, 16, mmxext); + DSPFUNC(put, 1, 8, mmxext); + DSPFUNC(avg, 0, 16, mmxext); + DSPFUNC(avg, 1, 8, mmxext); + } +#endif +#if HAVE_MMX_EXTERNAL + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext; + c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext; + } +#endif +#if HAVE_SSE2_EXTERNAL + if (EXTERNAL_SSE2(cpu_flags)) { + c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2; + c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2; + } +#endif } diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index 5b8d1b224f..553dd49d4f 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -1,20 +1,20 @@ /* - * MMX/SSE constants used across x86 dsp optimizations. + * MMX/SSE/AVX constants used across x86 dsp optimizations. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
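The recurring change in this file replaces positional memory operands ("%4", "%5" with "m"(ADD), "m"(MUL1)) by MANGLE()-referenced symbols plus NAMED_CONSTRAINTS_ADD, so the constants are identified by name inside the asm template instead of by fragile operand indices. A generic illustration of named inline-asm operands in GCC syntax (not the FFmpeg macros themselves; the function and constant below are made up for the example):

```c
/* The operand is referenced as %[k] in the template, so inserting or
 * reordering other operands cannot silently break the reference the way
 * a positional %4 or %5 can. */
static unsigned add_const(unsigned x)
{
    static const unsigned k = 42;
    __asm__ ("addl %[k], %[x]"
             : [x] "+r" (x)
             : [k] "rm" (k));
    return x;
}
```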
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 5b8d1b224f..553dd49d4f 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -1,20 +1,20 @@
 /*
- * MMX/SSE constants used across x86 dsp optimizations.
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
@@ -22,10 +22,10 @@
 #include "libavutil/x86/asm.h" // for xmm_reg
 #include "constants.h"

-DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL,
+                                                    0x0001000100010001ULL, 0x0001000100010001ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL,
+                                                    0x0002000200020002ULL, 0x0002000200020002ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
@@ -35,19 +35,47 @@ DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   = 0x000F000F000F000FULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   = 0x0014001400140014ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_20)   = { 0x0014001400140014ULL, 0x0014001400140014ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   = 0x002A002A002A002AULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   = 0x0035003500350035ULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   = 0x0060006000600060ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  = 0x0080008000800080ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  = 0x00ff00ff00ff00ffULL;
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_255)  = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+                                                    0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_256)  = { 0x0100010001000100ULL, 0x0100010001000100ULL,
+                                                    0x0100010001000100ULL, 0x0100010001000100ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL,
+                                                    0x0200020002000200ULL, 0x0200020002000200ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
+                                                    0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
+                                                    0x0400040004000400ULL, 0x0400040004000400ULL};
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
+                                                    0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
+                                                    0x1000100010001000ULL, 0x1000100010001000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
+                                                    0x2000200020002000ULL, 0x2000200020002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_m1)   = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
+                                                    0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };

-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)  = { 0x0000000000000000ULL, 0x0000000000000000ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)  = { 0x0101010101010101ULL, 0x0101010101010101ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)  = { 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_0)  = { 0x0000000000000000ULL, 0x0000000000000000ULL,
+                                                  0x0000000000000000ULL, 0x0000000000000000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_1)  = { 0x0101010101010101ULL, 0x0101010101010101ULL,
+                                                  0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_2)  = { 0x0202020202020202ULL, 0x0202020202020202ULL,
+                                                  0x0202020202020202ULL, 0x0202020202020202ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_3)  = { 0x0303030303030303ULL, 0x0303030303030303ULL,
+                                                  0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const xmm_reg,  ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const xmm_reg,  ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL };
+
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_1)   = { 0x0000000100000001ULL, 0x0000000100000001ULL,
+                                                   0x0000000100000001ULL, 0x0000000100000001ULL };
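Widening these constants from xmm_reg to ymm_reg goes hand in hand with raising the declared alignment from 16 to 32 bytes: an aligned 256-bit load (vmovdqa) faults on an address that is only 16-byte aligned. In standard C the requirement looks like this (illustrative, using C11 alignas rather than the FFmpeg macro):

```c
#include <stdalign.h>
#include <stdint.h>

/* DECLARE_ALIGNED(32, ...) boils down to this: four 64-bit words making
 * up one 256-bit ymm constant, placed on a 32-byte boundary so that an
 * aligned AVX load of it cannot raise #GP at run time. */
alignas(32) static const uint64_t pw_1[4] = {
    0x0001000100010001ULL, 0x0001000100010001ULL,
    0x0001000100010001ULL, 0x0001000100010001ULL,
};
```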
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -25,27 +25,42 @@ #include "libavutil/x86/asm.h" -extern const uint64_t ff_wtwo; - +extern const ymm_reg ff_pw_1; +extern const ymm_reg ff_pw_2; extern const xmm_reg ff_pw_3; extern const xmm_reg ff_pw_4; extern const xmm_reg ff_pw_5; extern const xmm_reg ff_pw_8; +extern const xmm_reg ff_pw_9; extern const uint64_t ff_pw_15; extern const xmm_reg ff_pw_16; extern const xmm_reg ff_pw_18; -extern const uint64_t ff_pw_20; +extern const xmm_reg ff_pw_20; extern const xmm_reg ff_pw_32; extern const uint64_t ff_pw_42; extern const uint64_t ff_pw_53; extern const xmm_reg ff_pw_64; extern const uint64_t ff_pw_96; extern const uint64_t ff_pw_128; -extern const uint64_t ff_pw_255; +extern const ymm_reg ff_pw_255; +extern const ymm_reg ff_pw_512; +extern const ymm_reg ff_pw_1023; +extern const ymm_reg ff_pw_1024; +extern const ymm_reg ff_pw_2048; +extern const ymm_reg ff_pw_4096; +extern const ymm_reg ff_pw_8192; +extern const ymm_reg ff_pw_m1; -extern const xmm_reg ff_pb_1; -extern const xmm_reg ff_pb_3; -extern const xmm_reg ff_pb_F8; +extern const ymm_reg ff_pb_0; +extern const ymm_reg ff_pb_1; +extern const ymm_reg ff_pb_2; +extern const ymm_reg ff_pb_3; +extern const xmm_reg ff_pb_80; +extern const xmm_reg ff_pb_FE; extern const uint64_t ff_pb_FC; +extern const xmm_reg ff_ps_neg; + +extern const ymm_reg ff_pd_1; + #endif /* AVCODEC_X86_CONSTANTS_H */ diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h deleted file mode 100644 index 11d45ae61c..0000000000 --- a/libavcodec/x86/dca.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_DCA_H -#define AVCODEC_X86_DCA_H - -#include "config.h" - -#if ARCH_X86_64 && HAVE_SSE2_INLINE -# include "libavutil/x86/asm.h" -# include "libavutil/mem.h" -#include "libavcodec/dcadsp.h" - -# define int8x8_fmul_int32 int8x8_fmul_int32 -static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp, - float *dst, const int8_t *src, int scale) -{ - DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000; - __asm__ volatile ( - "cvtsi2ss %2, %%xmm0 \n\t" - "mulss %3, %%xmm0 \n\t" - "movq (%1), %%xmm1 \n\t" - "punpcklbw %%xmm1, %%xmm1 \n\t" - "movaps %%xmm1, %%xmm2 \n\t" - "punpcklwd %%xmm1, %%xmm1 \n\t" - "punpckhwd %%xmm2, %%xmm2 \n\t" - "psrad $24, %%xmm1 \n\t" - "psrad $24, %%xmm2 \n\t" - "shufps $0, %%xmm0, %%xmm0 \n\t" - "cvtdq2ps %%xmm1, %%xmm1 \n\t" - "cvtdq2ps %%xmm2, %%xmm2 \n\t" - "mulps %%xmm0, %%xmm1 \n\t" - "mulps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, 0(%0) \n\t" - "movaps %%xmm2, 16(%0) \n\t" - :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16) - XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2") - ); -} - -#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */ - -#endif /* AVCODEC_X86_DCA_H */ diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm index c42ee23faf..1ac237885a 100644 --- a/libavcodec/x86/dcadsp.asm +++ b/libavcodec/x86/dcadsp.asm @@ -2,20 +2,20 @@ ;* SSE-optimized functions for the DCA decoder ;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
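For reference, the int8x8_fmul_int32() inline assembly deleted above widens eight signed bytes to floats and scales them by scale/16; the 0x3D800000 constant is the IEEE-754 encoding of 1/16.0f. A scalar C sketch of the same computation (hypothetical helper name, not part of the tree):

#include <stdint.h>

static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale)
{
    float fscale = scale / 16.0f;   /* cvtsi2ss + mulss 0x3D800000 */
    for (int i = 0; i < 8; i++)
        dst[i] = src[i] * fscale;   /* punpck/psrad sign-extend, cvtdq2ps, mulps */
}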
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -132,11 +132,16 @@ DECODE_HF mulps va, %2 mulps vb, %2 %if %0 == 3 +%if cpuflag(fma3) + fmaddps va, m4, %3, va + fmaddps vb, m0, %3, vb +%else mulps m4, %3 mulps m0, %3 addps va, m4 addps vb, m0 %endif +%endif ; va = va1 va2 va3 va4 ; vb = vb1 vb2 vb3 vb4 %if %1 @@ -148,7 +153,7 @@ DECODE_HF addps m4, va ; va1+3 vb1+3 va2+4 vb2+4 movhlps vb, m4 ; va1+3 vb1+3 addps vb, m4 ; va0..4 vb0..4 - movh [outq + count], vb + movlps [outq + count], vb %if %1 sub cf0q, 8*NUM_COEF %endif @@ -198,6 +203,10 @@ cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0 INIT_XMM sse DCA_LFE_FIR 0 DCA_LFE_FIR 1 +%if HAVE_FMA3_EXTERNAL +INIT_XMM fma3 +DCA_LFE_FIR 0 +%endif %macro SETZERO 1 %if cpuflag(sse2) && notcpuflag(avx) diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c index 7c2bec1f9b..1a19f6b807 100644 --- a/libavcodec/x86/dcadsp_init.c +++ b/libavcodec/x86/dcadsp_init.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -34,6 +34,7 @@ void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end); void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs); void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs); +void ff_dca_lfe_fir0_fma3(float *out, const float *in, const float *coefs); av_cold void ff_dcadsp_init_x86(DCADSPContext *s) { @@ -54,6 +55,10 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s) if (EXTERNAL_SSE4(cpu_flags)) { s->decode_hf = ff_decode_hf_sse4; } + + if (EXTERNAL_FMA3(cpu_flags)) { + s->lfe_fir[0] = ff_dca_lfe_fir0_fma3; + } } diff --git a/libavcodec/x86/dct-test.c b/libavcodec/x86/dct-test.c index 9d4aaf5415..0414381e65 100644 --- a/libavcodec/x86/dct-test.c +++ b/libavcodec/x86/dct-test.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
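The fma3 branch added to dcadsp.asm above collapses each mulps/addps pair into a single fmaddps, so va = m4*coef + va is computed with one rounding step instead of two. In scalar C the same contraction looks like this (sketch):

#include <math.h>

/* Non-FMA path: multiply, then add -- two operations, two roundings. */
static inline float mac_mul_add(float acc, float a, float b)
{
    return acc + a * b;
}

/* FMA path: what fmaddps does per lane -- one fused op, one rounding. */
static inline float mac_fused(float acc, float a, float b)
{
    return fmaf(a, b, acc);
}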
* - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -22,6 +22,37 @@ #include "xvididct.h" #include "simple_idct.h" +#if (CONFIG_PRORES_DECODER || CONFIG_PRORES_LGPL_DECODER) && ARCH_X86_64 && HAVE_YASM +void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize, + int16_t *block, int16_t *qmat); + +#define PR_WRAP(INSN) \ +static void ff_prores_idct_put_10_##INSN##_wrap(int16_t *dst){ \ + LOCAL_ALIGNED(16, int16_t, qmat, [64]); \ + LOCAL_ALIGNED(16, int16_t, tmp, [64]); \ + int i; \ + \ + for(i=0; i<64; i++){ \ + qmat[i]=4; \ + tmp[i]= dst[i]; \ + } \ + ff_prores_idct_put_10_##INSN (dst, 16, tmp, qmat); \ + \ + for(i=0; i<64; i++) { \ + dst[i] -= 512; \ + } \ +} + +PR_WRAP(sse2) + +# if HAVE_AVX_EXTERNAL +void ff_prores_idct_put_10_avx(uint16_t *dst, int linesize, + int16_t *block, int16_t *qmat); +PR_WRAP(avx) +# endif + +#endif + static const struct algo fdct_tab_arch[] = { #if HAVE_MMX_INLINE { "MMX", ff_fdct_mmx, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX }, @@ -39,21 +70,25 @@ static const struct algo idct_tab_arch[] = { #if HAVE_MMX_INLINE { "SIMPLE-MMX", ff_simple_idct_mmx, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_MMX }, #endif -#if CONFIG_MPEG4_DECODER -#if HAVE_MMX_INLINE +#if CONFIG_MPEG4_DECODER && HAVE_YASM +#if ARCH_X86_32 { "XVID-MMX", ff_xvid_idct_mmx, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX, 1 }, -#endif -#if HAVE_MMXEXT_INLINE { "XVID-MMXEXT", ff_xvid_idct_mmxext, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT, 1 }, #endif -#if HAVE_SSE2_INLINE +#if HAVE_SSE2_EXTERNAL { "XVID-SSE2", ff_xvid_idct_sse2, FF_IDCT_PERM_SSE2, AV_CPU_FLAG_SSE2, 1 }, #endif -#endif /* CONFIG_MPEG4_DECODER */ +#endif /* CONFIG_MPEG4_DECODER && HAVE_YASM */ +#if (CONFIG_PRORES_DECODER || CONFIG_PRORES_LGPL_DECODER) && ARCH_X86_64 && HAVE_YASM + { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 }, +# if HAVE_AVX_EXTERNAL + { "PR-AVX", ff_prores_idct_put_10_avx_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX, 1 }, +# endif +#endif { 0 } }; -static short idct_simple_mmx_perm[64] = { +static const uint8_t idct_simple_mmx_perm[64] = { 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm index 9c147b9c00..c70f6c9c49 100644 --- a/libavcodec/x86/dct32.asm +++ b/libavcodec/x86/dct32.asm @@ -2,20 +2,20 @@ ;* 32 point SSE-optimized DCT transform ;* Copyright (c) 2010 Vitor Sessak ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. 
;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -192,6 +192,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 INIT_YMM avx SECTION_TEXT +%if HAVE_AVX_EXTERNAL ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in) cglobal dct32_float, 2,3,8, out, in, tmp ; pass 1 @@ -264,6 +265,7 @@ cglobal dct32_float, 2,3,8, out, in, tmp INIT_XMM PASS6_AND_PERMUTE RET +%endif %if ARCH_X86_64 %define SPILL SWAP @@ -482,7 +484,9 @@ cglobal dct32_float, 2, 3, 16, out, in, tmp %endif %endmacro +%if ARCH_X86_32 INIT_XMM sse DCT32_FUNC +%endif INIT_XMM sse2 DCT32_FUNC diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c index ca9fbc7a68..daf2bb4e5d 100644 --- a/libavcodec/x86/dct_init.c +++ b/libavcodec/x86/dct_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -30,7 +30,7 @@ av_cold void ff_dct_init_x86(DCTContext *s) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_SSE(cpu_flags)) + if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags)) s->dct32 = ff_dct32_float_sse; if (EXTERNAL_SSE2(cpu_flags)) s->dct32 = ff_dct32_float_sse2; diff --git a/libavcodec/x86/deinterlace.asm b/libavcodec/x86/deinterlace.asm index 70d000e0db..c421385fbb 100644 --- a/libavcodec/x86/deinterlace.asm +++ b/libavcodec/x86/deinterlace.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2010 Vitor Sessak ;* Copyright (c) 2002 Michael Niedermayer ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. 
;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -31,10 +31,10 @@ SECTION .text %macro DEINTERLACE 1 %ifidn %1, inplace ;void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size) -cglobal deinterlace_line_inplace_mmx, 6,6,7, lum_m4, lum_m3, lum_m2, lum_m1, lum, size +cglobal deinterlace_line_inplace, 6,6,7, lum_m4, lum_m3, lum_m2, lum_m1, lum, size %else ;void ff_deinterlace_line_mmx(uint8_t *dst, const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size) -cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size +cglobal deinterlace_line, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size %endif pxor mm7, mm7 movq mm6, [pw_4] @@ -77,6 +77,8 @@ cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1 REP_RET %endmacro +INIT_MMX mmx + DEINTERLACE "" DEINTERLACE inplace diff --git a/libavcodec/x86/dirac_dwt.c b/libavcodec/x86/dirac_dwt.c new file mode 100644 index 0000000000..3c51ea6ffa --- /dev/null +++ b/libavcodec/x86/dirac_dwt.c @@ -0,0 +1,202 @@ +/* + * MMX optimized discrete wavelet transform + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2010 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
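The DEINTERLACE macro above blends five source lines; pw_4 supplies the rounding bias, which suggests the usual (-1, 4, 2, 4, -1)/8 vertical filter. Assuming that filter (the full loop body is not shown here), a scalar sketch:

#include <stdint.h>

static void deinterlace_line_c(uint8_t *dst, const uint8_t *lum_m4,
                               const uint8_t *lum_m3, const uint8_t *lum_m2,
                               const uint8_t *lum_m1, const uint8_t *lum,
                               int size)
{
    for (int i = 0; i < size; i++) {
        int v = -lum_m4[i] + 4 * lum_m3[i] + 2 * lum_m2[i]
              +  4 * lum_m1[i] - lum[i] + 4;    /* pw_4 rounding bias */
        v >>= 3;
        dst[i] = v < 0 ? 0 : v > 255 ? 255 : v; /* clip to 8 bits */
    }
}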
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "dirac_dwt.h" + +#define COMPOSE_VERTICAL(ext, align) \ +void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \ +void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \ +void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ +void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ +void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \ +void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\ +void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\ +\ +static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \ +\ + ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \ +} \ +\ +static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \ +\ + ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \ +} \ +\ +static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \ + IDWTELEM *b3, IDWTELEM *b4, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ +\ + ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \ +} \ +\ +static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \ + IDWTELEM *b3, IDWTELEM *b4, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ +\ + ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \ +} \ +static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) { \ + b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \ + b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \ + } \ +\ + ff_vertical_compose_haar##ext(b0, b1, width_align); \ +} \ +static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\ +{\ + int w2= w>>1;\ + int x= w2 - (w2&(align-1));\ + ff_horizontal_compose_haar0i##ext(b, tmp, w);\ +\ + for (; x < w2; x++) {\ + b[2*x ] = tmp[x];\ + b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\ + }\ +}\ +static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\ +{\ + int w2= w>>1;\ + int x= w2 - (w2&(align-1));\ + ff_horizontal_compose_haar1i##ext(b, tmp, w);\ +\ + for (; x < w2; x++) {\ + b[2*x ] = (tmp[x] + 1)>>1;\ + b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\ + }\ +}\ +\ + +#if HAVE_YASM +#if !ARCH_X86_64 +COMPOSE_VERTICAL(_mmx, 4) +#endif +COMPOSE_VERTICAL(_sse2, 8) + + +void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w); + +static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, 
int w) +{ + int w2= w>>1; + int x= w2 - (w2&7); + ff_horizontal_compose_dd97i_ssse3(b, tmp, w); + + for (; x < w2; x++) { + b[2*x ] = (tmp[x] + 1)>>1; + b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1; + } +} +#endif + +void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type) +{ +#if HAVE_YASM + int mm_flags = av_get_cpu_flags(); + +#if !ARCH_X86_64 + if (!(mm_flags & AV_CPU_FLAG_MMX)) + return; + + switch (type) { + case DWT_DIRAC_DD9_7: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; + break; + case DWT_DIRAC_LEGALL5_3: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; + d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx; + break; + case DWT_DIRAC_DD13_7: + d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; + break; + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)vertical_compose_haar_mmx; + d->horizontal_compose = horizontal_compose_haar0i_mmx; + break; + case DWT_DIRAC_HAAR1: + d->vertical_compose = (void*)vertical_compose_haar_mmx; + d->horizontal_compose = horizontal_compose_haar1i_mmx; + break; + } +#endif + + if (!(mm_flags & AV_CPU_FLAG_SSE2)) + return; + + switch (type) { + case DWT_DIRAC_DD9_7: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; + break; + case DWT_DIRAC_LEGALL5_3: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; + d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2; + break; + case DWT_DIRAC_DD13_7: + d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; + break; + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)vertical_compose_haar_sse2; + d->horizontal_compose = horizontal_compose_haar0i_sse2; + break; + case DWT_DIRAC_HAAR1: + d->vertical_compose = (void*)vertical_compose_haar_sse2; + d->horizontal_compose = horizontal_compose_haar1i_sse2; + break; + } + + if (!(mm_flags & AV_CPU_FLAG_SSSE3)) + return; + + switch (type) { + case DWT_DIRAC_DD9_7: + d->horizontal_compose = horizontal_compose_dd97i_ssse3; + break; + } +#endif // HAVE_YASM +} diff --git a/libavcodec/x86/dirac_dwt.h b/libavcodec/x86/dirac_dwt.h new file mode 100644 index 0000000000..126b29029f --- /dev/null +++ b/libavcodec/x86/dirac_dwt.h @@ -0,0 +1,30 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
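Each wrapper above runs the SIMD routine on the alignment-friendly prefix (width & ~(align-1)) and finishes the last few elements with the scalar COMPOSE_* lifting formulas. As suggested by the constants the assembly loads (pw_2, pw_8 and the 9,-1 taps of pw_1991), those formulas have the following shape (a sketch; the canonical definitions live in libavcodec/dirac_dwt.h):

/* LeGall 5/3 low-pass lifting step: b1 -= (b0 + b2 + 2) >> 2 */
#define COMPOSE_53iL0(b0, b1, b2)      ((b1) - (((b0) + (b2) + 2) >> 2))

/* Dirac 5/3 high-pass lifting step: b1 += (b0 + b2 + 1) >> 1 */
#define COMPOSE_DIRAC53iH0(b0, b1, b2) ((b1) + (((b0) + (b2) + 1) >> 1))

/* Deslauriers-Dubuc 9/7 high-pass: the 9/-1 taps are pw_1991 in the asm. */
#define COMPOSE_DD97iH0(b0, b1, b2, b3, b4) \
    ((b2) + ((-(b0) + 9 * ((b1) + (b3)) - (b4) + 8) >> 4))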
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_DIRAC_DWT_H +#define AVCODEC_X86_DIRAC_DWT_H + +#include "libavcodec/dirac_dwt.h" + +void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); +void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); +void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); + +void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type); + +#endif diff --git a/libavcodec/x86/diracdsp_mmx.c b/libavcodec/x86/diracdsp_mmx.c new file mode 100644 index 0000000000..11df5e395e --- /dev/null +++ b/libavcodec/x86/diracdsp_mmx.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2010 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/cpu.h" +#include "diracdsp_mmx.h" +#include "fpel.h" + +void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); + +#define HPEL_FILTER(MMSIZE, EXT) \ + void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \ + void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \ + \ + static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \ + const uint8_t *src, int stride, int width, int height) \ + { \ + while( height-- ) \ + { \ + ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \ + ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \ + ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \ + \ + dsth += stride; \ + dstv += stride; \ + dstc += stride; \ + src += stride; \ + } \ + } + +#if !ARCH_X86_64 +HPEL_FILTER(8, mmx) +#endif +HPEL_FILTER(16, sse2) + +#define PIXFUNC(PFX, IDX, EXT) \ + /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \ + c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \ + c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT + +#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\ +void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ +{\ + if (h&3)\ + ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\ + else\ + OPNAME ## _pixels8_ ## EXT(dst, src[0], 
stride, h);\ +}\ +void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ +{\ + if (h&3)\ + ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\ + else\ + OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\ +}\ +void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ +{\ + if (h&3) {\ + ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\ + } else {\ + OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\ + OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\ + }\ +} + +DIRAC_PIXOP(put, ff_put, mmx) +DIRAC_PIXOP(avg, ff_avg, mmx) +DIRAC_PIXOP(avg, ff_avg, mmxext) + +void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + if (h&3) + ff_put_dirac_pixels16_c(dst, src, stride, h); + else + ff_put_pixels16_sse2(dst, src[0], stride, h); +} +void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + if (h&3) + ff_avg_dirac_pixels16_c(dst, src, stride, h); + else + ff_avg_pixels16_sse2(dst, src[0], stride, h); +} +void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + if (h&3) { + ff_put_dirac_pixels32_c(dst, src, stride, h); + } else { + ff_put_pixels16_sse2(dst , src[0] , stride, h); + ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h); + } +} +void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + if (h&3) { + ff_avg_dirac_pixels32_c(dst, src, stride, h); + } else { + ff_avg_pixels16_sse2(dst , src[0] , stride, h); + ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h); + } +} + +void ff_diracdsp_init_mmx(DiracDSPContext* c) +{ + int mm_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(mm_flags)) { + c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx; +#if !ARCH_X86_64 + c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx; + c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx; + c->dirac_hpel_filter = dirac_hpel_filter_mmx; + c->add_rect_clamped = ff_add_rect_clamped_mmx; + c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx; +#endif + PIXFUNC(put, 0, mmx); + PIXFUNC(avg, 0, mmx); + } + + if (EXTERNAL_MMXEXT(mm_flags)) { + PIXFUNC(avg, 0, mmxext); + } + + if (EXTERNAL_SSE2(mm_flags)) { + c->dirac_hpel_filter = dirac_hpel_filter_sse2; + c->add_rect_clamped = ff_add_rect_clamped_sse2; + c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2; + + c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2; + c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2; + + c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2; + c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2; + c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; + c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; + } +} diff --git a/libavcodec/x86/diracdsp_mmx.h b/libavcodec/x86/diracdsp_mmx.h new file mode 100644 index 0000000000..89858544f3 --- /dev/null +++ b/libavcodec/x86/diracdsp_mmx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2010 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_DIRACDSP_H +#define AVCODEC_X86_DIRACDSP_H + +#include "libavcodec/diracdsp.h" + +void ff_diracdsp_init_mmx(DiracDSPContext* c); + +DECL_DIRAC_PIXOP(put, mmx); +DECL_DIRAC_PIXOP(avg, mmx); +DECL_DIRAC_PIXOP(avg, mmxext); + +void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); +void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); +void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); +void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); + +void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); +void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); + +void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); +void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); +void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); + +void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); +void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); + +#endif diff --git a/libavcodec/x86/diracdsp_yasm.asm b/libavcodec/x86/diracdsp_yasm.asm new file mode 100644 index 0000000000..d3cf9f1971 --- /dev/null +++ b/libavcodec/x86/diracdsp_yasm.asm @@ -0,0 +1,265 @@ +;****************************************************************************** +;* Copyright (c) 2010 David Conrad +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_7: times 8 dw 7
+
+cextern pw_3
+cextern pw_16
+cextern pw_32
+cextern pb_80
+
+section .text
+
+%macro UNPACK_ADD 6
+    mov%5     %1, %3
+    mov%6     m5, %4
+    mova      m4, %1
+    mova      %2, m5
+    punpcklbw %1, m7
+    punpcklbw m5, m7
+    punpckhbw m4, m7
+    punpckhbw %2, m7
+    paddw     %1, m5
+    paddw     %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+    mov       src0q, srcq
+    lea       stridex3q, [3*strideq]
+    sub       src0q, stridex3q
+    pxor      m7, m7
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+    pmullw    m0, [pw_7]
+    pmullw    m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+    paddw     m0, m2
+    paddw     m1, m3
+    pmullw    m0, [pw_3]
+    pmullw    m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+    pmullw    m2, [pw_7]
+    pmullw    m3, [pw_7]
+    psubw     m0, m2
+    psubw     m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+    psubw     m0, m2
+    psubw     m1, m3
+
+    paddw     m0, [pw_16]
+    paddw     m1, [pw_16]
+    psraw     m0, 5
+    psraw     m1, 5
+    packuswb  m0, m1
+    mova      [dstq], m0
+    add       dstq, mmsize
+    add       srcq, mmsize
+    add       src0q, mmsize
+    sub       widthd, mmsize
+    jg        .loop
+    RET
+
+; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+    dec       widthd
+    pxor      m7, m7
+    and       widthd, ~(mmsize-1)
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+    pmullw    m0, [pw_7]
+    pmullw    m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+    paddw     m0, m2
+    paddw     m1, m3
+    pmullw    m0, [pw_3]
+    pmullw    m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+    pmullw    m2, [pw_7]
+    pmullw    m3, [pw_7]
+    psubw     m0, m2
+    psubw     m1, m3
+
+    ; ...
- (src[-3] + src[4]) + UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u + psubw m0, m2 + psubw m1, m3 + + paddw m0, [pw_16] + paddw m1, [pw_16] + psraw m0, 5 + psraw m1, 5 + packuswb m0, m1 + mova [dstq + widthq], m0 + sub widthd, mmsize + jge .loop + RET +%endmacro + +%macro PUT_RECT 1 +; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height) +cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2 + mova m0, [pb_80] + add wd, (mmsize-1) + and wd, ~(mmsize-1) + +%if ARCH_X86_64 + movsxd dst_strideq, dst_strided + movsxd src_strideq, src_strided + mov r7d, r5m + mov r8d, wd + %define wspill r8d + %define hd r7d +%else + mov r4m, wd + %define wspill r4m + %define hd r5mp +%endif + +.loopy + lea src2q, [srcq+src_strideq*2] + lea dst2q, [dstq+dst_strideq] +.loopx: + sub wd, mmsize + mova m1, [srcq +2*wq] + mova m2, [src2q+2*wq] + packsswb m1, [srcq +2*wq+mmsize] + packsswb m2, [src2q+2*wq+mmsize] + paddb m1, m0 + paddb m2, m0 + mova [dstq +wq], m1 + mova [dst2q+wq], m2 + jg .loopx + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*2] + sub hd, 2 + mov wd, wspill + jg .loopy + RET +%endm + +%macro ADD_RECT 1 +; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height) +cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h + mova m0, [pw_32] + add wd, (mmsize-1) + and wd, ~(mmsize-1) + +%if ARCH_X86_64 + movsxd strideq, strided + movsxd idwt_strideq, idwt_strided + mov r8d, wd + %define wspill r8d +%else + mov r5m, wd + %define wspill r5m +%endif + +.loop: + sub wd, mmsize + movu m1, [srcq +2*wq] ; FIXME: ensure alignment + paddw m1, m0 + psraw m1, 6 + movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment + paddw m2, m0 + psraw m2, 6 + paddw m1, [idwtq+2*wq] + paddw m2, [idwtq+2*wq+mmsize] + packuswb m1, m2 + mova [dstq +wq], m1 + jg .loop + + lea srcq, [srcq + 2*strideq] + add dstq, strideq + lea idwtq, [idwtq+ 2*idwt_strideq] + sub hd, 1 + mov wd, wspill + jg .loop + RET +%endm + +%macro ADD_OBMC 2 +; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen) +cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen + pxor m4, m4 +.loop: +%assign i 0 +%rep %1 / mmsize + mova m0, [srcq+i] + mova m1, m0 + punpcklbw m0, m4 + punpckhbw m1, m4 + mova m2, [obmcq+i] + mova m3, m2 + punpcklbw m2, m4 + punpckhbw m3, m4 + pmullw m0, m2 + pmullw m1, m3 + movu m2, [dstq+2*i] + movu m3, [dstq+2*i+mmsize] + paddw m0, m2 + paddw m1, m3 + movu [dstq+2*i], m0 + movu [dstq+2*i+mmsize], m1 +%assign i i+mmsize +%endrep + lea srcq, [srcq+strideq] + lea dstq, [dstq+2*strideq] + add obmcq, 32 + sub yblend, 1 + jg .loop + RET +%endm + +INIT_MMX +%if ARCH_X86_64 == 0 +PUT_RECT mmx +ADD_RECT mmx + +HPEL_FILTER mmx +ADD_OBMC 32, mmx +ADD_OBMC 16, mmx +%endif +ADD_OBMC 8, mmx + +INIT_XMM +PUT_RECT sse2 +ADD_RECT sse2 + +HPEL_FILTER sse2 +ADD_OBMC 32, sse2 +ADD_OBMC 16, sse2 diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm index d39b07b9f4..9dd6d51ee6 100644 --- a/libavcodec/x86/dnxhdenc.asm +++ b/libavcodec/x86/dnxhdenc.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com> ;* Copyright (c) 2014 Tiancheng "Timothy" Gu <timothygu99@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
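The HPEL_FILTER macro above evaluates an 8-tap half-pel interpolation with taps (-1, 3, -7, 21, 21, -7, 3, -1), rounds by 16, shifts by 5 and saturates to 8 bits via packuswb. A scalar sketch of the horizontal case:

#include <stdint.h>

static void dirac_hpel_filter_h_c(uint8_t *dst, const uint8_t *src, int width)
{
    for (int x = 0; x < width; x++) {
        int v = 21 * (src[x]     + src[x + 1])
              +  3 * (src[x - 2] + src[x + 3])
              -  7 * (src[x - 1] + src[x + 2])
              -      (src[x - 3] + src[x + 4]);
        v = (v + 16) >> 5;
        dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;  /* packuswb saturation */
    }
}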
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/dnxhdenc_init.c b/libavcodec/x86/dnxhdenc_init.c
index f1ff7bd986..fd6f15005a 100644
--- a/libavcodec/x86/dnxhdenc_init.c
+++ b/libavcodec/x86/dnxhdenc_init.c
@@ -4,20 +4,20 @@
*
* VC-3 encoder funded by the British Broadcasting Corporation
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/dwt_yasm.asm b/libavcodec/x86/dwt_yasm.asm
new file mode 100644
index 0000000000..658acc13fc
--- /dev/null
+++ b/libavcodec/x86/dwt_yasm.asm
@@ -0,0 +1,307 @@
+;******************************************************************************
+;* MMX optimized discrete wavelet transform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1991: times 4 dw 9,-1
+
+cextern pw_1
+cextern pw_2
+cextern pw_8
+cextern pw_16
+
+section .text
+
+; %1 -= (%2 + %3 + 2)>>2  %4 is pw_2
+%macro COMPOSE_53iL0 4
+    paddw   %2, %3
+    paddw   %2, %4
+    psraw   %2, 2
+    psubw   %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered  m3: pw_8  m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+    paddw   m0, %3
+    paddw   m1, %2
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+%if %0 > 3
+    movu    %1, %4
+%endif
+    psrad   m1, 4
+    psrad   m2, 4
+    packssdw m1, m2
+    paddw   m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                            int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+    mova    m2, [pw_2]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b0q+2*widthq]
+    mova    m0, [b1q+2*widthq]
+    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+    mova    m1, [pw_1]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    paddw   m0, [b2q+2*widthq]
+    paddw   m0, m1
+    psraw   m0, 1
+    paddw   m0, [b1q+2*widthq]
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                               IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+    mova    [b2q+2*widthq], m1
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_16]
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    mova    m5, [b2q+2*widthq]
+    paddw   m0, [b4q+2*widthq]
+    paddw   m1, [b3q+2*widthq]
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+    psrad   m1, 5
+    psrad   m2, 5
+    packssdw m1, m2
+    psubw   m5, m1
+    mova    [b2q+2*widthq], m5
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+    mova    m3, [pw_1]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b1q+2*widthq]
+    mova    m0, [b0q+2*widthq]
+    mova    m2, m1
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [b0q+2*widthq], m0
+    paddw   m2, m0
+    mova    [b1q+2*widthq], m2
+    jg      .loop
+    REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+    mov     %3, [tmpq]
+%assign %%i 1
+%rep %1
+    mov     [tmpq-2*%%i], %3
+    %assign %%i %%i+1
+%endrep + mov %3, [tmpq+2*w2q-2] +%assign %%i 0 +%rep %2 + mov [tmpq+2*w2q+2*%%i], %3 + %assign %%i %%i+1 +%endrep +%endmacro + + +%macro HAAR_HORIZONTAL 2 +; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width) +cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2 + mov w2d, wd + xor xq, xq + shr w2d, 1 + lea b_w2q, [bq+wq] + mova m3, [pw_1] +.lowpass_loop: + movu m1, [b_w2q + 2*xq] + mova m0, [bq + 2*xq] + paddw m1, m3 + psraw m1, 1 + psubw m0, m1 + mova [tmpq + 2*xq], m0 + add xq, mmsize/2 + cmp xq, w2q + jl .lowpass_loop + + xor xq, xq + and w2q, ~(mmsize/2 - 1) + cmp w2q, mmsize/2 + jl .end + +.highpass_loop: + movu m1, [b_w2q + 2*xq] + mova m0, [tmpq + 2*xq] + paddw m1, m0 + + ; shift and interleave +%if %2 == 1 + paddw m0, m3 + paddw m1, m3 + psraw m0, 1 + psraw m1, 1 +%endif + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + mova [bq+4*xq], m0 + mova [bq+4*xq+mmsize], m2 + + add xq, mmsize/2 + cmp xq, w2q + jl .highpass_loop +.end: + REP_RET +%endmacro + + +INIT_XMM +; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width) +cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2 + mov w2d, wd + xor xd, xd + shr w2d, 1 + lea b_w2q, [bq+wq] + movu m4, [bq+wq] + mova m7, [pw_2] + pslldq m4, 14 +.lowpass_loop: + movu m1, [b_w2q + 2*xq] + mova m0, [bq + 2*xq] + mova m2, m1 + palignr m1, m4, 14 + mova m4, m2 + COMPOSE_53iL0 m0, m1, m2, m7 + mova [tmpq + 2*xq], m0 + add xd, mmsize/2 + cmp xd, w2d + jl .lowpass_loop + + EDGE_EXTENSION 1, 2, xw + ; leave the last up to 7 (sse) or 3 (mmx) values for C + xor xd, xd + and w2d, ~(mmsize/2 - 1) + cmp w2d, mmsize/2 + jl .end + + mova m7, [tmpq-mmsize] + mova m0, [tmpq] + mova m5, [pw_1] + mova m3, [pw_8] + mova m4, [pw_1991] +.highpass_loop: + mova m6, m0 + palignr m0, m7, 14 + mova m7, [tmpq + 2*xq + 16] + mova m1, m7 + mova m2, m7 + palignr m1, m6, 2 + palignr m2, m6, 4 + COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq] + mova m0, m7 + mova m7, m6 + + ; shift and interleave + paddw m6, m5 + paddw m1, m5 + psraw m6, 1 + psraw m1, 1 + mova m2, m6 + punpcklwd m6, m1 + punpckhwd m2, m1 + mova [bq+4*xq], m6 + mova [bq+4*xq+mmsize], m2 + + add xd, mmsize/2 + cmp xd, w2d + jl .highpass_loop +.end: + REP_RET + + +%if ARCH_X86_64 == 0 +INIT_MMX +COMPOSE_VERTICAL mmx +HAAR_HORIZONTAL mmx, 0 +HAAR_HORIZONTAL mmx, 1 +%endif + +;;INIT_XMM +INIT_XMM +COMPOSE_VERTICAL sse2 +HAAR_HORIZONTAL sse2, 0 +HAAR_HORIZONTAL sse2, 1 diff --git a/libavcodec/x86/fdct.c b/libavcodec/x86/fdct.c index 6528b57361..112566ded0 100644 --- a/libavcodec/x86/fdct.c +++ b/libavcodec/x86/fdct.c @@ -13,20 +13,20 @@ * a page about fdct at http://www.geocities.com/ssavekar/dct.htm * Skal's fdct at http://skal.planet-d.net/coding/dct.html * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
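The Haar compose routine above performs the inverse lifting pair in place: the low band drops half of the rounded high band, then the high band is rebuilt from the corrected low band. A scalar sketch (using int16_t where the code uses IDWTELEM):

#include <stdint.h>

static void vertical_compose_haar_c(int16_t *b0, int16_t *b1, int width)
{
    for (int i = 0; i < width; i++) {
        b0[i] -= (b1[i] + 1) >> 1;  /* undo update:  pw_1 bias, psraw 1 */
        b1[i] += b0[i];             /* undo predict: paddw with new b0 */
    }
}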
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -70,7 +70,7 @@ DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; -static struct +static const struct { DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; } fdct_r_row_sse2 = @@ -153,7 +153,7 @@ DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct 29692, -12299, 26722, -31521, }; -static struct +static const struct { DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; } tab_frw_01234567_sse2 = diff --git a/libavcodec/x86/fdct.h b/libavcodec/x86/fdct.h index c94a977e8f..648cdc5350 100644 --- a/libavcodec/x86/fdct.h +++ b/libavcodec/x86/fdct.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/fdctdsp_init.c b/libavcodec/x86/fdctdsp_init.c index 4e8e4eb60d..0cb5fd625b 100644 --- a/libavcodec/x86/fdctdsp_init.c +++ b/libavcodec/x86/fdctdsp_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm index e4744a3b60..f233774f1d 100644 --- a/libavcodec/x86/fft.asm +++ b/libavcodec/x86/fft.asm @@ -6,20 +6,20 @@ ;* This algorithm (though not any of the implementation details) is ;* based on libdjbfft by D. J. Bernstein. ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -51,13 +51,12 @@ struc FFTContext .imdcthalf:pointer 1 endstruc -SECTION_RODATA +SECTION_RODATA 32 %define M_SQRT1_2 0.70710678118654752440 %define M_COS_PI_1_8 0.923879532511287 %define M_COS_PI_3_8 0.38268343236509 -align 32 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 @@ -69,9 +68,10 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 -ps_m1m1m1m1: times 4 dd 1<<31 ps_m1p1: dd 1<<31, 0 +cextern ps_neg + %assign i 16 %rep 13 cextern cos_ %+ i @@ -305,6 +305,7 @@ IF%1 mova Z(1), m5 INIT_YMM avx +%if HAVE_AVX_EXTERNAL align 16 fft8_avx: mova m0, Z(0) @@ -394,6 +395,8 @@ fft32_interleave_avx: jg .deint_loop ret +%endif + INIT_XMM sse align 16 @@ -537,6 +540,7 @@ DEFINE_ARGS zc, w, n, o1, o3 INIT_YMM avx +%if HAVE_AVX_EXTERNAL %macro INTERL_AVX 5 vunpckhps %3, %2, %1 vunpcklps %2, %2, %1 @@ -558,6 +562,7 @@ cglobal fft_calc, 2,5,8 FFT_DISPATCH _interleave %+ SUFFIX, r1 REP_RET +%endif INIT_XMM sse @@ -681,7 +686,7 @@ cglobal imdct_calc, 3,5,3 mov r2, r3 sub r3, mmsize neg r2 - mova m2, [ps_m1m1m1m1] + mova m2, [ps_neg] .loop: %if mmsize == 8 PSWAPD m0, [r1 + r3] @@ -776,9 +781,11 @@ align 8 dispatch_tab %+ fullsuffix: pointer list_of_fft %endmacro ; DECL_FFT +%if HAVE_AVX_EXTERNAL INIT_YMM avx DECL_FFT 6 DECL_FFT 6, _interleave +%endif INIT_XMM sse DECL_FFT 5 DECL_FFT 5, _interleave @@ -992,7 +999,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i sub r4, r3 %endif %if notcpuflag(3dnowext) && mmsize == 8 - movd m7, [ps_m1m1m1m1] + movd m7, [ps_neg] %endif .pre: %if ARCH_X86_64 == 0 @@ -1080,4 +1087,7 @@ DECL_IMDCT POSROTATESHUF_3DNOW %endif INIT_YMM avx + +%if HAVE_AVX_EXTERNAL DECL_IMDCT POSROTATESHUF_AVX +%endif diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h index a604956836..398091eb1f 100644 --- a/libavcodec/x86/fft.h +++ b/libavcodec/x86/fft.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. 
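The local ps_m1m1m1m1 table is dropped above in favour of the shared ps_neg constant; both are vectors of 1<<31, the IEEE-754 sign-bit mask, and xorps against them negates every float lane. The scalar equivalent (sketch):

#include <stdint.h>

static inline float negate_via_sign_bit(float x)
{
    union { float f; uint32_t u; } v = { .f = x };
    v.u ^= 0x80000000u;  /* flip the sign bit, as xorps with ps_neg does */
    return v.f;
}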
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index a604956836..398091eb1f 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
 *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c
index 5c0273de9d..5085f11380 100644
--- a/libavcodec/x86/fft_init.c
+++ b/libavcodec/x86/fft_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
 *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
new file mode 100644
index 0000000000..cedf0837a7
--- /dev/null
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -0,0 +1,101 @@
+;******************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+INIT_XMM sse4
+%if ARCH_X86_64
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 5, 6
+    %define length r2d
+
+    movsxd orderq, orderd
+%else
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 2, 5
+    %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32. This means that we only
+; need to copy a maximum of 32 samples. Therefore we let the preprocessor
+; unroll this loop and copy all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+    movu m0, [smpq+iter]
+    movu [resq+iter], m0
+    %assign iter iter+mmsize
+%endrep
+
+lea resq,   [resq+orderq*4]
+lea smpq,   [smpq+orderq*4]
+lea coefsq, [coefsq+orderq*4]
+sub length, orderd
+movd m3, r5m
+neg orderq
+
+%define posj t0q
+%define negj t1q
+
+.looplen:
+    pxor m0, m0
+    pxor m4, m4
+    pxor m6, m6
+    mov  posj, orderq
+    xor  negj, negj
+
+    .looporder:
+        movd   m2, [coefsq+posj*4] ; c = coefs[j]
+        SPLATD m2
+        movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4-4+mmsize]
+        movu   m7, [smpq+negj*4-4+mmsize*2]
+        pmulld m1, m2
+        pmulld m5, m2
+        pmulld m7, m2
+        paddd  m0, m1 ; p += c * s
+        paddd  m4, m5
+        paddd  m6, m7
+
+        dec    negj
+        inc    posj
+    jnz .looporder
+
+    psrad m0, m3 ; p >>= shift
+    psrad m4, m3
+    psrad m6, m3
+    movu  m1, [smpq]
+    movu  m5, [smpq+mmsize]
+    movu  m7, [smpq+mmsize*2]
+    psubd m1, m0 ; smp[i] - p
+    psubd m5, m4
+    psubd m7, m6
+    movu  [resq],          m1 ; res[i] = smp[i] - (p >> shift)
+    movu  [resq+mmsize],   m5
+    movu  [resq+mmsize*2], m7
+
+    add resq, 3*mmsize
+    add smpq, 3*mmsize
+    sub length, (3*mmsize)/4
+jg .looplen
+RET
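For reference, the residual that this GPL-licensed SSE4 kernel produces can be written as plain C. This is a hedged scalar sketch of the assumed semantics (function name hypothetical), matching the asm's 32-bit pmulld products and arithmetic shift:

    #include <stdint.h>

    static void flac_enc_lpc_16_ref(int32_t *res, const int32_t *smp,
                                    int len, int order,
                                    const int32_t *coefs, int shift)
    {
        /* warm-up samples are copied through, as in the unrolled %rep */
        for (int i = 0; i < order; i++)
            res[i] = smp[i];
        for (int i = order; i < len; i++) {
            int32_t p = 0;                  /* prediction accumulator */
            for (int j = 0; j < order; j++)
                p += coefs[j] * smp[i-j-1]; /* pmulld keeps the low 32 bits */
            res[i] = smp[i] - (p >> shift); /* psrad + psubd */
        }
    }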
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
new file mode 100644
index 0000000000..901c440ccd
--- /dev/null
+++ b/libavcodec/x86/flacdsp.asm
@@ -0,0 +1,304 @@
+;******************************************************************************
+;* FLAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 Loren Merritt
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro LPC_32 1
+INIT_XMM %1
+cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+    sub  lend, pred_orderd
+    jle .ret
+    lea  decodedq, [decodedq+pred_orderq*4-8]
+    lea  coeffsq, [coeffsq+pred_orderq*4]
+    neg  pred_orderq
+    movd m4, qlevelm
+ALIGN 16
+.loop_sample:
+    movd m0, [decodedq+pred_orderq*4+8]
+    add  decodedq, 8
+    movd m1, [coeffsq+pred_orderq*4]
+    pxor m2, m2
+    pxor m3, m3
+    lea  jq, [pred_orderq+1]
+    test jq, jq
+    jz .end_order
+.loop_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    movd m0, [decodedq+jq*4]
+    PMACSDQL m3, m1, m0, m3, m1
+    movd m1, [coeffsq+jq*4]
+    inc  jq
+    jl .loop_order
+.end_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    psrlq m2, m4
+    movd  m0, [decodedq]
+    paddd m0, m2
+    movd  [decodedq], m0
+    sub   lend, 2
+    jl .ret
+    PMACSDQL m3, m1, m0, m3, m1
+    psrlq m3, m4
+    movd  m1, [decodedq+4]
+    paddd m1, m3
+    movd  [decodedq+4], m1
+    jg .loop_sample
+.ret:
+    REP_RET
+%endmacro

+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
+;                                        int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_16 3-4
+cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+    mov  lend, lenm
+%endif
+    movd m3, r4m
+    shl  lend, 2
+    mov  in1q, [in0q + gprsize]
+    mov  in0q, [in0q]
+    mov  outq, [outq]
+    add  in1q, lenq
+    add  in0q, lenq
+    add  outq, lenq
+    neg  lenq
+
+align 16
+.loop:
+    mova m0, [in0q + lenq]
+    mova m1, [in1q + lenq]
+%ifidn %1, ms
+    psrad m2, m1, 1
+    psubd m0, m2
+%endif
+%ifnidn %1, indep2
+    p%4d m2, m0, m1
+%endif
+    packssdw  m%2, m%2
+    packssdw  m%3, m%3
+    punpcklwd m%2, m%3
+    psllw     m%2, m3
+    mova [outq + lenq], m%2
+    add  lenq, 16
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 ls, 0, 2, sub
+FLAC_DECORRELATE_16 rs, 2, 1, add
+FLAC_DECORRELATE_16 ms, 2, 0, add
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
+;                                        int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_32 5
+cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+    mov  lend, lenm
+%endif
+    movd m3, r4m
+    mov  in1q, [in0q + gprsize]
+    mov  in0q, [in0q]
+    mov  outq, [outq]
+    sub  in1q, in0q
+
+align 16
+.loop:
+    mova m0, [in0q]
+    mova m1, [in0q + in1q]
+%ifidn %1, ms
+    psrad m2, m1, 1
+    psubd m0, m2
+%endif
+    p%5d  m2, m0, m1
+    pslld m%2, m3
+    pslld m%3, m3
+
+    SBUTTERFLY dq, %2, %3, %4
+
+    mova [outq         ], m%2
+    mova [outq + mmsize], m%3
+
+    add  in0q, mmsize
+    add  outq, mmsize*2
+    sub  lend, mmsize/4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
+FLAC_DECORRELATE_32 rs, 2, 1, 0, add
+FLAC_DECORRELATE_32 ms, 2, 0, 1, add
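A hedged scalar model of the three stereo reconstructions these kernels implement (the 16-bit variants additionally pack to int16_t, and the real entry points take uint8_t **out / int32_t **in; names here are illustrative):

    #include <stdint.h>

    typedef enum { LEFT_SIDE, RIGHT_SIDE, MID_SIDE } StereoMode;

    static void flac_decorrelate_ref(int32_t *out, const int32_t *in0,
                                     const int32_t *in1, int len, int shift,
                                     StereoMode mode)
    {
        for (int i = 0; i < len; i++) {
            int32_t a = in0[i], b = in1[i];
            switch (mode) {
            case LEFT_SIDE:   /* ch0 = left, ch1 = side; the psubd path  */
                out[2*i]   = a       << shift;
                out[2*i+1] = (a - b) << shift;
                break;
            case RIGHT_SIDE:  /* ch0 = side, ch1 = right; the paddd path */
                out[2*i]   = (a + b) << shift;
                out[2*i+1] = b       << shift;
                break;
            case MID_SIDE:    /* psrad/psubd first, then paddd           */
                a -= b >> 1;
                out[2*i]   = (a + b) << shift;
                out[2*i+1] = a       << shift;
                break;
            }
        }
    }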
+;-----------------------------------------------------------------------------------------
+;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
+;                                               int len, int shift);
+;-----------------------------------------------------------------------------------------
+;%1 = bps
+;%2 = channels
+;%3 = last xmm reg used
+;%4 = word/dword (shift instruction)
+%macro FLAC_DECORRELATE_INDEP 4
+%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
+cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
+%if ARCH_X86_32
+%if %2 == 6
+    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
+    %define lend dword r3m
+%else
+    mov  lend, lenm
+%endif
+%endif
+    movd m%3, r4m
+
+%assign %%i 1
+%rep %2-1
+    mov  in %+ %%i %+ q, [in0q+%%i*gprsize]
+%assign %%i %%i+1
+%endrep
+
+    mov  in0q, [in0q]
+    mov  outq, [outq]
+
+%assign %%i 1
+%rep %2-1
+    sub  in %+ %%i %+ q, in0q
+%assign %%i %%i+1
+%endrep
+
+align 16
+.loop:
+    mova m0, [in0q]
+
+%assign %%i 1
+%rep REPCOUNT-1
+    mova m %+ %%i, [in0q + in %+ %%i %+ q]
+%assign %%i %%i+1
+%endrep
+
+%if %1 == 32
+
+%if %2 == 8
+    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
+%elif %2 == 6
+    SBUTTERFLY dq, 0, 1, 6
+    SBUTTERFLY dq, 2, 3, 6
+    SBUTTERFLY dq, 4, 5, 6
+
+    punpcklqdq m6, m0, m2
+    punpckhqdq m2, m4
+    shufps     m4, m0, 0xe4
+    punpcklqdq m0, m1, m3
+    punpckhqdq m3, m5
+    shufps     m5, m1, 0xe4
+    SWAP 0,6,1,4,5,3
+%elif %2 == 4
+    TRANSPOSE4x4D 0, 1, 2, 3, 4
+%else ; %2 == 2
+    SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%else ; %1 == 16
+
+%if %2 == 8
+    packssdw m0, [in0q + in4q]
+    packssdw m1, [in0q + in5q]
+    packssdw m2, [in0q + in6q]
+    packssdw m3, [in0q + in7q]
+    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+%elif %2 == 6
+    packssdw m0, [in0q + in3q]
+    packssdw m1, [in0q + in4q]
+    packssdw m2, [in0q + in5q]
+    pshufd    m3, m0, q1032
+    punpcklwd m0, m1
+    punpckhwd m1, m2
+    punpcklwd m2, m3
+
+    shufps m3, m0, m2, q2020
+    shufps m0, m1, q2031
+    shufps m2, m1, q3131
+    shufps m1, m2, m3, q3120
+    shufps m3, m0, q0220
+    shufps m0, m2, q3113
+    SWAP 2, 0, 3
+%else ; %2 == 4
+    packssdw m0, [in0q + in2q]
+    packssdw m1, [in0q + in3q]
+    SBUTTERFLY wd, 0, 1, 2
+    SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%endif
+
+%assign %%i 0
+%rep REPCOUNT
+    psll%4 m %+ %%i, m%3
+%assign %%i %%i+1
+%endrep
+
+%assign %%i 0
+%rep REPCOUNT
+    mova [outq + %%i*mmsize], m %+ %%i
+%assign %%i %%i+1
+%endrep
+
+    add  in0q, mmsize
+    add  outq, mmsize*REPCOUNT
+    sub  lend, mmsize/4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
+FLAC_DECORRELATE_INDEP 32, 2, 3, d
+FLAC_DECORRELATE_INDEP 16, 4, 3, w
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 16, 6, 4, w
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
+
+INIT_XMM avx
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
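Semantically the indep<N> kernels are just a planar-to-interleaved copy with a shift; the pack/transpose gymnastics above exist to emit N channels per load. A hedged scalar equivalent for the 32-bit case (name illustrative):

    #include <stdint.h>

    static void flac_decorrelate_indep_ref(int32_t *out, int32_t **in,
                                           int channels, int len, int shift)
    {
        for (int i = 0; i < len; i++)
            for (int ch = 0; ch < channels; ch++)
                out[i*channels + ch] = in[ch][i] << shift;
    }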
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
new file mode 100644
index 0000000000..e28c5c9322
--- /dev/null
+++ b/libavcodec/x86/flacdsp_init.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
+                         int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+                        int qlevel, int len);
+
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *, int);
+
+#define DECORRELATE_FUNCS(fmt, opt) \
+void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift)
+
+DECORRELATE_FUNCS(16, sse2);
+DECORRELATE_FUNCS(16, avx);
+DECORRELATE_FUNCS(32, sse2);
+DECORRELATE_FUNCS(32, avx);
+
+av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
+                                 int bps)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+#if CONFIG_FLAC_DECODER
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            if (channels == 2)
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
+            else if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
+            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            if (channels == 2)
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
+            else if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
+            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
+        }
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->lpc32 = ff_flac_lpc_32_sse4;
+    }
+    if (EXTERNAL_AVX(cpu_flags)) {
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
+        }
+    }
+    if (EXTERNAL_XOP(cpu_flags)) {
+        c->lpc32 = ff_flac_lpc_32_xop;
+    }
+#endif
+
+#if CONFIG_FLAC_ENCODER
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        if (CONFIG_GPL)
+            c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
+    }
+#endif
+#endif /* HAVE_YASM */
+}
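Note the ordering in the init function: each EXTERNAL_* test simply overwrites the pointer set by a previous one, so the last satisfied (most capable) flag wins, as with lpc32 going SSE4 then XOP. A self-contained toy version of that dispatch pattern (all names made up for illustration):

    #include <stdio.h>

    enum { FLAG_SSE4 = 1, FLAG_XOP = 2 };

    static void lpc_c(void)    { puts("C");    }
    static void lpc_sse4(void) { puts("SSE4"); }
    static void lpc_xop(void)  { puts("XOP");  }

    static void (*pick_lpc32(int cpu_flags))(void)
    {
        void (*fn)(void) = lpc_c;                /* safe default     */
        if (cpu_flags & FLAG_SSE4) fn = lpc_sse4;
        if (cpu_flags & FLAG_XOP)  fn = lpc_xop; /* later test wins  */
        return fn;
    }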
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index e82f14923a..f4fc0c20ef 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -2,20 +2,20 @@
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

@@ -68,3 +68,43 @@ INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3
+
+;------------------------------------------------------------------------------
+; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
+;                                    const float *mul, int len);
+;------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
+cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
+    shl  lend, 2
+    add  srcq, lenq
+    add  dstq, lenq
+    neg  lenq
+.loop:
+    movss    m0, [mulq]
+    SPLATD   m0
+%if cpuflag(sse2)
+    cvtdq2ps m1, [srcq+lenq   ]
+    cvtdq2ps m2, [srcq+lenq+16]
+%else
+    cvtpi2ps m1, [srcq+lenq   ]
+    cvtpi2ps m3, [srcq+lenq+ 8]
+    cvtpi2ps m2, [srcq+lenq+16]
+    cvtpi2ps m4, [srcq+lenq+24]
+    movlhps  m1, m3
+    movlhps  m2, m4
+%endif
+    mulps    m1, m0
+    mulps    m2, m0
+    mova [dstq+lenq   ], m1
+    mova [dstq+lenq+16], m2
+    add  mulq, 4
+    add  lenq, 32
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_ARRAY8
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_ARRAY8
+
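int32_to_float_fmul_array8 differs from the existing scalar-multiplier routine in that the scale factor changes every eight samples (mulq advances one float per 32 input bytes). A hedged C reference for the assumed contract, dropping the context argument the asm never touches and taking len as a multiple of 8:

    #include <stdint.h>

    static void int32_to_float_fmul_array8_ref(float *dst, const int32_t *src,
                                               const float *mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul[i / 8];  /* one scale per 8-sample block */
    }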
diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c
index 1871b477bb..e4cbadcce7 100644
--- a/libavcodec/x86/fmtconvert_init.c
+++ b/libavcodec/x86/fmtconvert_init.c
@@ -5,20 +5,20 @@
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
 *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

@@ -32,6 +32,10 @@ void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src,
                                        float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src,
                                        float mul, int len);
+void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
+void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);

#endif /* HAVE_YASM */

@@ -42,9 +46,11 @@ av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx

    if (EXTERNAL_SSE(cpu_flags)) {
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
    }
#endif /* HAVE_YASM */
}

diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index b581471296..0e3b444e2a 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

@@ -25,85 +25,83 @@

SECTION .text

-INIT_MMX mmxext
+%macro PAVGB_MMX 4
+    LOAD  %3, %1
+    por   %3, %2
+    pxor  %2, %1
+    pand  %2, %4
+    psrlq %2, 1
+    psubb %3, %2
+    SWAP  %2, %3
+%endmacro
+
; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
;                        ptrdiff_t line_size, int h)
-%macro PIXELS48 2
-%if %2 == 4
-%define OP movh
+%macro OP_PIXELS 2
+%if %2 == mmsize/2
+%define LOAD movh
+%define SAVE movh
+%define LEN  mmsize
%else
-%define OP mova
+%define LOAD movu
+%define SAVE mova
+%define LEN  %2
%endif
-cglobal %1_pixels%2, 4,5
+cglobal %1_pixels%2, 4,5,4
    movsxdifnidn r2, r2d
    lea          r4, [r2*3]
+%ifidn %1, avg
+%if notcpuflag(mmxext)
+    pcmpeqd      m6, m6
+    paddb        m6, m6
+%endif
+%endif
.loop:
-    OP           m0, [r1]
-    OP           m1, [r1+r2]
-    OP           m2, [r1+r2*2]
-    OP           m3, [r1+r4]
-    lea          r1, [r1+r2*4]
+%assign %%i 0
+%rep LEN/mmsize
+    LOAD         m0, [r1 + %%i]
+    LOAD         m1, [r1+r2 + %%i]
+    LOAD         m2, [r1+r2*2 + %%i]
+    LOAD         m3, [r1+r4 + %%i]
%ifidn %1, avg
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
+%if notcpuflag(mmxext)
+    PAVGB_MMX    [r0 + %%i],      m0, m4, m6
+    PAVGB_MMX    [r0+r2 + %%i],   m1, m5, m6
+    PAVGB_MMX    [r0+r2*2 + %%i], m2, m4, m6
+    PAVGB_MMX    [r0+r4 + %%i],   m3, m5, m6
+%else
+    pavgb        m0, [r0 + %%i]
+    pavgb        m1, [r0+r2 + %%i]
+    pavgb        m2, [r0+r2*2 + %%i]
+    pavgb        m3, [r0+r4 + %%i]
+%endif
%endif
-    OP           [r0], m0
-    OP           [r0+r2], m1
-    OP           [r0+r2*2], m2
-    OP           [r0+r4], m3
+    SAVE         [r0 + %%i],      m0
+    SAVE         [r0+r2 + %%i],   m1
+    SAVE         [r0+r2*2 + %%i], m2
+    SAVE         [r0+r4 + %%i],   m3
+%assign %%i %%i+mmsize
+%endrep
    sub         r3d, 4
+    lea          r1, [r1+r2*4]
    lea          r0, [r0+r2*4]
    jne .loop
    RET
%endmacro

-PIXELS48 put, 4
-PIXELS48 avg, 4
-PIXELS48 put, 8
-PIXELS48 avg, 8
+INIT_MMX mmx
+OP_PIXELS put, 4
+OP_PIXELS avg, 4
+OP_PIXELS put, 8
+OP_PIXELS avg, 8
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
+INIT_MMX mmxext
+OP_PIXELS avg, 4
+OP_PIXELS avg, 8
+OP_PIXELS avg, 16

INIT_XMM sse2
-; void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-;                           ptrdiff_t line_size, int h)
-cglobal put_pixels16, 4,5,4
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    mova         [r0], m0
-    mova         [r0+r2], m1
-    mova         [r0+r2*2], m2
-    mova         [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz .loop
-    REP_RET
-
-; void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-;                           ptrdiff_t line_size, int h)
-cglobal avg_pixels16, 4,5,4
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
-    mova         [r0], m0
-    mova         [r0+r2], m1
-    mova         [r0+r2*2], m2
-    mova         [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz .loop
-    REP_RET
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
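PAVGB_MMX recreates pavgb (rounding byte average) on plain MMX, where no such instruction exists, using only bitwise operations so no lane ever widens. The identity it relies on, in scalar C:

    #include <stdint.h>

    /* (a|b) - (((a^b) & 0xFE) >> 1)  ==  (a + b + 1) >> 1 for bytes.
     * The 0xFE mask (m6 above, built with pcmpeqd + paddb) keeps the
     * per-byte shift from borrowing a bit into the neighbouring lane. */
    static uint8_t avg_round_up(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a | b) - (((a ^ b) & 0xFE) >> 1));
    }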
diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h
index 88d1415ade..4d93959a96 100644
--- a/libavcodec/x86/fpel.h
+++ b/libavcodec/x86/fpel.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
 *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

@@ -28,6 +28,8 @@ void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,

diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c
deleted file mode 100644
index eef05ecc74..0000000000
--- a/libavcodec/x86/fpel_mmx.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "fpel.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-// in case more speed is needed - unrolling would certainly help
-void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %0, %%mm0           \n\t"
-            "movq  %1, %%mm1           \n\t"
-            PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-            "movq  %%mm2, %0           \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block  += line_size;
-    }
-    while (--h);
-}
-
-void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %0, %%mm0           \n\t"
-            "movq  %1, %%mm1           \n\t"
-            PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-            "movq  %%mm2, %0           \n\t"
-            "movq  8%0, %%mm0          \n\t"
-            "movq  8%1, %%mm1          \n\t"
-            PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-            "movq  %%mm2, 8%0          \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block  += line_size;
-    }
-    while (--h);
-}
-
-void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"  \n\t"
-        ".p2align     3             \n\t"
-        "1:                         \n\t"
-        "movq  (%1    ), %%mm0      \n\t"
-        "movq  (%1, %3), %%mm1      \n\t"
-        "movq  %%mm0,    (%2)       \n\t"
-        "movq  %%mm1,    (%2, %3)   \n\t"
-        "add   %%"REG_a", %1        \n\t"
-        "add   %%"REG_a", %2        \n\t"
-        "movq  (%1    ), %%mm0      \n\t"
-        "movq  (%1, %3), %%mm1      \n\t"
-        "movq  %%mm0,    (%2)       \n\t"
-        "movq  %%mm1,    (%2, %3)   \n\t"
-        "add   %%"REG_a", %1        \n\t"
-        "add   %%"REG_a", %2        \n\t"
-        "subl  $4, %0               \n\t"
-        "jnz   1b                   \n\t"
-        : "+g"(h), "+r"(pixels), "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
-void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"  \n\t"
-        ".p2align     3             \n\t"
-        "1:                         \n\t"
-        "movq  (%1    ), %%mm0      \n\t"
-        "movq 8(%1    ), %%mm4      \n\t"
-        "movq  (%1, %3), %%mm1      \n\t"
-        "movq 8(%1, %3), %%mm5      \n\t"
-        "movq  %%mm0,    (%2)       \n\t"
-        "movq  %%mm4,   8(%2)       \n\t"
-        "movq  %%mm1,    (%2, %3)   \n\t"
-        "movq  %%mm5,   8(%2, %3)   \n\t"
-        "add   %%"REG_a", %1        \n\t"
-        "add   %%"REG_a", %2        \n\t"
-        "movq  (%1    ), %%mm0      \n\t"
-        "movq 8(%1    ), %%mm4      \n\t"
-        "movq  (%1, %3), %%mm1      \n\t"
-        "movq 8(%1, %3), %%mm5      \n\t"
-        "movq  %%mm0,    (%2)       \n\t"
-        "movq  %%mm4,   8(%2)       \n\t"
-        "movq  %%mm1,    (%2, %3)   \n\t"
-        "movq  %%mm5,   8(%2, %3)   \n\t"
-        "add   %%"REG_a", %1        \n\t"
-        "add   %%"REG_a", %2        \n\t"
-        "subl  $4, %0               \n\t"
-        "jnz   1b                   \n\t"
-        : "+g"(h), "+r"(pixels), "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
-#endif /* HAVE_MMX_INLINE */

diff --git a/libavcodec/x86/g722dsp.asm b/libavcodec/x86/g722dsp.asm
new file mode 100644
index 0000000000..807a1bdd0a
--- /dev/null
+++ b/libavcodec/x86/g722dsp.asm
@@ -0,0 +1,54 @@
+;******************************************************************************
+;* SIMD optimized DSP functions for G722 coding
+;*
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_qmf_coeffs:  dw 3, -210, -11, -805, -11, 951, 53, 3876
+pw_qmf_coeffs2: dw 12, 3876, -156, 951, 32, -805, 362, -210
+pw_qmf_coeffs3: dw 362, 0, 32, 0, -156, 0, 12, 0
+pw_qmf_coeffs4: dw 53, 0, -11, 0, -11, 0, 3, 0
+
+SECTION_TEXT
+
+INIT_XMM sse2
+cglobal g722_apply_qmf, 2, 2, 5, prev, out
+    movu      m0, [prevq+mmsize*0]
+    movu      m1, [prevq+mmsize*1]
+    movu      m2, [prevq+mmsize*2]
+    punpcklwd m3, m0, m1
+    punpckhwd m0, m1
+    punpcklwd m4, m2, m2
+    punpckhwd m2, m2
+    pmaddwd   m3, [pw_qmf_coeffs ]
+    pmaddwd   m0, [pw_qmf_coeffs2]
+    pmaddwd   m4, [pw_qmf_coeffs3]
+    pmaddwd   m2, [pw_qmf_coeffs4]
+    paddd     m0, m3
+    paddd     m2, m4
+    paddd     m0, m2
+    pshufd    m2, m0, q0032
+    paddd     m0, m2
+    pshufd    m0, m0, q0001
+    movq      [outq], m0
+    RET

diff --git a/libavcodec/x86/g722dsp_init.c b/libavcodec/x86/g722dsp_init.c
new file mode 100644
index 0000000000..614695193b
--- /dev/null
+++ b/libavcodec/x86/g722dsp_init.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/g722dsp.h"
+
+void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);
+
+av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        dsp->apply_qmf = ff_g722_apply_qmf_sse2;
+}
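The SSE2 body evaluates both QMF outputs with four pmaddwd dot products over the 24 stored samples. A hedged scalar sketch of what it computes; the coefficient table and the even/odd phase assignment follow the C reference in g722dsp.c as assumed here:

    #include <stdint.h>

    static const int16_t qmf_coeffs[12] = {
        3, -11, 12, 32, -210, 951, 3876, -805, 362, -156, 53, -11,
    };

    static void g722_apply_qmf_ref(const int16_t *prev_samples, int xout[2])
    {
        xout[0] = xout[1] = 0;
        for (int i = 0; i < 12; i++) {
            xout[0] += qmf_coeffs[i]      * prev_samples[2*i+1]; /* odd taps  */
            xout[1] += qmf_coeffs[11 - i] * prev_samples[2*i];   /* even taps */
        }
    }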
diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm
index 673f795daa..2fcd1a26e5 100644
--- a/libavcodec/x86/h263_loopfilter.asm
+++ b/libavcodec/x86/h263_loopfilter.asm
@@ -1,20 +1,22 @@
;******************************************************************************
;* MMX-optimized H.263 loop filter
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c
index d4fab981bf..ab81063233 100644
--- a/libavcodec/x86/h263dsp_init.c
+++ b/libavcodec/x86/h263dsp_init.c
@@ -1,20 +1,20 @@
/*
 * Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
 *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
 *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index cc41f00461..107ae51cbc 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index 7b003515cc..c358482092 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

@@ -252,8 +252,10 @@ cglobal %1_h264_chroma_mc2_10, 6,7
%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
+%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put
@@ -261,8 +263,10 @@ CHROMA_MC2 put
%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
+%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index d2067c86e7..14c8205bab 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,20 +7,20 @@
;* Fiona Glaser <fiona@x264.com>
;* Oskar Arvidsson <oskar@irock.se>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

@@ -384,8 +384,10 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64

INIT_XMM sse2
DEBLOCK_LUMA
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
+%endif

%else

@@ -499,8 +501,10 @@ INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
+%endif

%endif ; ARCH

@@ -772,8 +776,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
+%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
@@ -836,7 +842,11 @@ cglobal deblock_h_chroma_8, 5,7
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
-    call ff_chroma_inter_body_mmxext
+    LOAD_MASK  r2d, r3d
+    movd       m6, [r4] ; tc0
+    punpcklbw  m6, m6
+    pand       m7, m6
+    DEBLOCK_P0_Q0
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4B_STORE  PASS8ROWS(t5, r0, r1, t6)
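Inlining the former ff_chroma_inter_body_mmxext call spells out the standard H.264 normal-filter update on the p0/q0 pixel pair. A hedged scalar sketch of that step (the asm additionally builds the filtering mask with LOAD_MASK and splats tc0 per pixel pair):

    #include "libavutil/common.h"   /* av_clip(), av_clip_uint8() */

    static void deblock_p0_q0(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc0)
    {
        int delta = av_clip((((*q0 - *p0) * 4) + (p1 - q1) + 4) >> 3, -tc0, tc0);
        *p0 = av_clip_uint8(*p0 + delta);
        *q0 = av_clip_uint8(*q0 - delta);
    }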
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index d049c62bf2..ebf8a3f109 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -7,34 +7,32 @@
;* Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4
+cextern pw_1023
+%define pw_pixel_max pw_1023

; out: %4 = |%1-%2|-%3
; clobbers: %5
@@ -418,9 +416,11 @@ cglobal deblock_h_luma_10, 5,7,15

INIT_XMM sse2
DEBLOCK_LUMA_64
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
+%endif

%macro SWAPMOVA 2
%ifid %1
@@ -715,8 +715,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
+%endif

%endif

@@ -802,10 +804,12 @@ DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
+%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
@@ -918,5 +922,7 @@ DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
+%endif

diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index bb881c35df..49ad0e0fa0 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -2,20 +2,20 @@
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
 *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

@@ -36,9 +36,15 @@

#if HAVE_INLINE_ASM

+#if ARCH_X86_64
+#define REG64 "r"
+#else
+#define REG64 "m"
+#endif
+
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
//as that would make optimization work hard)
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
#define decode_significance decode_significance_x86
static int decode_significance_x86(CABACContext *c, int max_coeff,
                                   uint8_t *significant_coeff_ctx_base,
@@ -55,6 +61,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
    __asm__ volatile(
        "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
        : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
    );
#endif

@@ -130,6 +137,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
    __asm__ volatile(
        "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
        : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
    );
#endif

@@ -138,7 +146,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
        "3:                             \n\t"

        "mov %10, %0                    \n\t"
-        "movzbl (%0, %6), %k6           \n\t"
+        "movzb (%0, %6), %6             \n\t"
        "add %9, %6                     \n\t"

        BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
@@ -149,14 +157,14 @@ static int decode_significance_8x8_x86(CABACContext *c,
                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                             "%15")

-        "mov %1, %k6                    \n\t"
+        "mov %1, %6                     \n\t"
        "test $1, %4                    \n\t"
        " jz 4f                         \n\t"

#ifdef BROKEN_RELOCATIONS
-        "movzbl %c14(%15, %q6), %k6\n\t"
+        "movzb %c14(%15, %q6), %6\n\t"
#else
-        "movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
+        "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
#endif

        "add %11, %6                    \n\t"
@@ -169,8 +177,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
                             "%15")

        "mov %2, %0                     \n\t"
-        "mov %1, %k6                    \n\t"
-        "movl %k6, (%0)                 \n\t"
+        "mov %1, %6                     \n\t"
+        "mov %k6, (%0)                  \n\t"

        "test $1, %4                    \n\t"
        " jnz 5f                        \n\t"

        "add"OPSIZE" $4, %2             \n\t"

        "4:                             \n\t"
-        "addl $1, %k6                   \n\t"
-        "mov %k6, %1                    \n\t"
-        "cmpl $63, %k6                  \n\t"
+        "add $1, %6                     \n\t"
+        "mov %6, %1                     \n\t"
+        "cmp $63, %6                    \n\t"
        " jb 3b                         \n\t"
        "mov %2, %0                     \n\t"
-        "movl %k6, (%0)                 \n\t"
+        "mov %k6, (%0)                  \n\t"
        "5:                             \n\t"
        "addl %8, %k0                   \n\t"
        "shr $2, %k0                    \n\t"
-        : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+        : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
          "=&r"(bit), "+&r"(c->range), "=&r"(state)
        : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
-          "m"(sig_off), "m"(last_coeff_ctx_base),
+          REG64(sig_off), REG64(last_coeff_ctx_base),
          "i"(offsetof(CABACContext, bytestream)),
          "i"(offsetof(CABACContext, bytestream_end)),
          "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
@@ -198,7 +206,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
    );
    return coeff_count;
}
-#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif /* HAVE_7REGS && !BROKEN_COMPILER */

#endif /* HAVE_INLINE_ASM */

#endif /* AVCODEC_X86_H264_I386_H */
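The REG64 switch exists because the 8x8 significance decoder uses almost every GPR: on x86-64 the last/index operands can be pinned in registers ("r"), while on x86-32 they must stay in memory ("m") or register allocation inside the asm block fails. A toy, non-FFmpeg illustration of pasting a constraint string this way:

    #if defined(__x86_64__)
    #define REG64 "r"
    #else
    #define REG64 "m"
    #endif

    static int increment(int v)
    {
        /* the constraint string "+r" or "+m" is assembled at compile time */
        __asm__("addl $1, %0" : "+"REG64(v));
        return v;
    }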
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 313791a5d9..7fafe195f1 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -9,20 +9,20 @@
;* Holger Lubitz <hal@duncan.ol.sub.de>
;* Min Chen <chenm001.163.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index b7d51051d3..cc115b0ff9 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

@@ -26,11 +26,13 @@

SECTION_RODATA

-pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32:        times 4 dd 32

SECTION .text

+cextern pw_1023
+%define pw_pixel_max pw_1023
+
;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
@@ -83,8 +85,10 @@ cglobal h264_idct_add_10, 3,3

INIT_XMM sse2
IDCT_ADD_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
+%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
@@ -117,9 +121,11 @@ add4x4_idct %+ SUFFIX:
INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
+%endif

%macro ADD16_OP 2
    cmp byte [r4+%2], 0
@@ -155,8 +161,10 @@ cglobal h264_idct_add16_10, 5,6

INIT_XMM sse2
IDCT_ADD16_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
+%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
@@ -220,8 +228,10 @@ cglobal h264_idct8_dc_add_10,3,4,7

INIT_XMM sse2
IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
+%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
@@ -293,8 +303,10 @@ cglobal h264_idct_add16intra_10,5,7,8

INIT_XMM sse2
IDCT_ADD16INTRA_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
+%endif

%assign last_block 36
;-----------------------------------------------------------------------------
@@ -330,8 +342,10 @@ cglobal h264_idct_add8_10,5,8,7

INIT_XMM sse2
IDCT_ADD8
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
+%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
@@ -537,8 +551,10 @@ h264_idct8_add1_10 %+ SUFFIX:

INIT_XMM sse2
IDCT8_ADD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
+%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
@@ -577,5 +593,7 @@ cglobal h264_idct8_add4_10, 0,7,16

INIT_XMM sse2
IDCT8_ADD4
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
+%endif

diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index df657a443c..c88d91b49e 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -5,20 +5,20 @@
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

@@ -2497,10 +2497,7 @@ cglobal pred4x4_tm_vp8_8, 3,3
    pshufb mm3, mm6
    pshufb mm4, mm6
    pshufb mm5, mm6
-    psubw  mm2, mm7
-    psubw  mm3, mm7
-    psubw  mm4, mm7
-    psubw  mm5, mm7
+    psubw  mm0, mm7
    paddw  mm2, mm0
    paddw  mm3, mm0
    paddw  mm4, mm0
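The pred4x4_tm_vp8 hunk is a pure algebra change: every output is left + top[x] - topleft, so the topleft subtraction can be hoisted out of the four per-row registers and applied once to the top row (one psubw instead of four). A hedged scalar model of the prediction and the refactoring:

    #include <stddef.h>
    #include "libavutil/common.h"   /* av_clip_uint8() */

    static void pred4x4_tm_ref(uint8_t *src, ptrdiff_t stride)
    {
        const uint8_t *top = src - stride;
        int topleft = top[-1];
        int t[4];
        for (int x = 0; x < 4; x++)
            t[x] = top[x] - topleft;          /* the single psubw mm0, mm7 */
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                src[y*stride + x] = av_clip_uint8(src[y*stride - 1] + t[x]);
    }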
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -26,6 +26,9 @@ SECTION_RODATA +cextern pw_1023 +%define pw_pixel_max pw_1023 +cextern pw_512 cextern pw_16 cextern pw_8 cextern pw_4 @@ -34,8 +37,6 @@ cextern pw_1 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 pw_m3: times 8 dw -3 -pw_pixel_max: times 8 dw ((1 << 10)-1) -pw_512: times 8 dw 512 pd_17: times 4 dd 17 pd_16: times 4 dd 16 @@ -82,8 +83,10 @@ INIT_XMM sse2 PRED4x4_DR INIT_XMM ssse3 PRED4x4_DR +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED4x4_DR +%endif ;------------------------------------------------------------------------------ ; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride) @@ -119,8 +122,10 @@ INIT_XMM sse2 PRED4x4_VR INIT_XMM ssse3 PRED4x4_VR +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED4x4_VR +%endif ;------------------------------------------------------------------------------- ; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride) @@ -159,28 +164,14 @@ INIT_XMM sse2 PRED4x4_HD INIT_XMM ssse3 PRED4x4_HD +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED4x4_HD +%endif ;----------------------------------------------------------------------------- ; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride) ;----------------------------------------------------------------------------- -%macro HADDD 2 ; sum junk -%if mmsize == 16 - movhlps %2, %1 - paddd %1, %2 - pshuflw %2, %1, 0xE - paddd %1, %2 -%else - pshufw %2, %1, 0xE - paddd %1, %2 -%endif -%endmacro - -%macro HADDW 2 - pmaddwd %1, [pw_1] - HADDD %1, %2 -%endmacro INIT_MMX mmxext cglobal pred4x4_dc_10, 3, 3 @@ -228,8 +219,10 @@ cglobal pred4x4_down_left_10, 3, 3 INIT_XMM sse2 PRED4x4_DL +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED4x4_DL +%endif ;----------------------------------------------------------------------------- ; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride) @@ -255,8 +248,10 @@ cglobal pred4x4_vertical_left_10, 3, 3 INIT_XMM sse2 PRED4x4_VL +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED4x4_VL +%endif ;----------------------------------------------------------------------------- ; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride) @@ -565,8 +560,10 @@ cglobal pred8x8l_top_dc_10, 4, 4, 6 INIT_XMM sse2 PRED8x8L_TOP_DC +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_TOP_DC +%endif ;------------------------------------------------------------------------------- ; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride) @@ -622,8 +619,10 @@ cglobal pred8x8l_dc_10, 4, 6, 6 INIT_XMM sse2 PRED8x8L_DC +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_DC +%endif ;----------------------------------------------------------------------------- ; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, @@ -656,8 +655,10 @@ cglobal pred8x8l_vertical_10, 4, 4, 6 INIT_XMM sse2 PRED8x8L_VERTICAL +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_VERTICAL +%endif ;----------------------------------------------------------------------------- ; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, @@ -711,8 +712,10 @@ INIT_XMM sse2 PRED8x8L_HORIZONTAL INIT_XMM ssse3 PRED8x8L_HORIZONTAL +%if HAVE_AVX_EXTERNAL INIT_XMM avx 
PRED8x8L_HORIZONTAL +%endif ;----------------------------------------------------------------------------- ; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, @@ -778,8 +781,10 @@ INIT_XMM sse2 PRED8x8L_DOWN_LEFT INIT_XMM ssse3 PRED8x8L_DOWN_LEFT +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_DOWN_LEFT +%endif ;----------------------------------------------------------------------------- ; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, @@ -851,8 +856,10 @@ INIT_XMM sse2 PRED8x8L_DOWN_RIGHT INIT_XMM ssse3 PRED8x8L_DOWN_RIGHT +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_DOWN_RIGHT +%endif ;----------------------------------------------------------------------------- ; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft, @@ -920,8 +927,10 @@ INIT_XMM sse2 PRED8x8L_VERTICAL_RIGHT INIT_XMM ssse3 PRED8x8L_VERTICAL_RIGHT +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_VERTICAL_RIGHT +%endif ;----------------------------------------------------------------------------- ; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft, @@ -980,8 +989,10 @@ INIT_XMM sse2 PRED8x8L_HORIZONTAL_UP INIT_XMM ssse3 PRED8x8L_HORIZONTAL_UP +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_HORIZONTAL_UP +%endif ;----------------------------------------------------------------------------- diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index 0e572b1226..528b92e497 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Fiona Glaser <fiona@x264.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 76d2ab05d5..d9cb5f264c 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -2,20 +2,20 @@ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt * Copyright (c) 2011 Daniel Kang * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -29,8 +29,8 @@ #include "fpel.h" #if HAVE_YASM -void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); +void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, @@ -49,9 +49,9 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t #define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext #define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext - -CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8) -CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8) +#define ff_put_pixels16_mmxext ff_put_pixels16_mmx +#define ff_put_pixels8_mmxext ff_put_pixels8_mmx +#define ff_put_pixels4_mmxext ff_put_pixels4_mmx #define DEF_QPEL(OPNAME)\ void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\ @@ -282,7 +282,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uin #define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ }\ @@ -294,7 +294,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uin \ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ }\ @@ -302,74 +302,74 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - 
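The assert() to av_assert2() change in these wrappers removes the check from release builds: av_assert2 only expands to a real assertion when the library is configured with a high assert level. A sketch of the mechanism, assuming a configure-style ASSERT_LEVEL define (my_assert2 is illustrative, not libavutil's exact macro):

#include <assert.h>

#ifndef ASSERT_LEVEL
#define ASSERT_LEVEL 0
#endif

#if ASSERT_LEVEL > 1
#define my_assert2(cond) assert(cond)    /* checked in debug builds   */
#else
#define my_assert2(cond) ((void)0)       /* free in release builds    */
#endif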
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ + LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ {\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ }\ diff --git a/libavcodec/x86/h264_qpel_10bit.asm 
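These 10-bit qpel kernels implement H.264's six-tap half-pel lowpass, with results clipped to the pw_1023 range the patch now imports from the shared constants. A hedged scalar reference, assuming the spec coefficients (1, -5, 20, 20, -5, 1):

#include <stdint.h>

/* six-tap half-pel lowpass; s points at x-2..x+3 */
static uint16_t lowpass6_10(const uint16_t *s)
{
    int v = s[0] - 5 * s[1] + 20 * s[2] + 20 * s[3] - 5 * s[4] + s[5];
    v = (v + 16) >> 5;                 /* round, then clip to pw_1023 */
    return v < 0 ? 0 : v > 1023 ? 1023 : v;
}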
b/libavcodec/x86/h264_qpel_10bit.asm index f92c4aab2b..757c425898 100644 --- a/libavcodec/x86/h264_qpel_10bit.asm +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -26,12 +26,12 @@ SECTION_RODATA 32 +cextern pw_1023 +%define pw_pixel_max pw_1023 cextern pw_16 cextern pw_1 cextern pb_0 -pw_pixel_max: times 8 dw ((1 << 10)-1) - pad10: times 8 dw 10*1023 pad20: times 8 dw 20*1023 pad30: times 8 dw 30*1023 diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index bc6c72541b..2d287ba443 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -6,20 +6,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm index 9ad26de832..897c616a81 100644 --- a/libavcodec/x86/h264_weight.asm +++ b/libavcodec/x86/h264_weight.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm index 961ec8ca45..f924e55854 100644 --- a/libavcodec/x86/h264_weight_10bit.asm +++ b/libavcodec/x86/h264_weight_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -26,11 +26,12 @@ SECTION_RODATA 32 -pw_pixel_max: times 8 dw ((1 << 10)-1) sq_1: dq 1 dq 0 cextern pw_1 +cextern pw_1023 +%define pw_pixel_max pw_1023 SECTION .text diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c index 8ec8a79aba..e08af2759e 100644 --- a/libavcodec/x86/h264chroma_init.c +++ b/libavcodec/x86/h264chroma_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
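The h264_weight kernels relicensed above implement H.264 explicit weighted prediction. A hedged scalar reference for the unidirectional 8-bit case (the function name is illustrative; the 10-bit file differs only in the clip bound, which is why it now aliases pw_pixel_max to the shared pw_1023):

#include <stdint.h>

/* scale, round by the denominator, add the offset, clip to pixel range */
static uint8_t weight_pixel_8(uint8_t px, int w, int offset, int log2_denom)
{
    int v = log2_denom > 0
          ? ((px * w + (1 << (log2_denom - 1))) >> log2_denom) + offset
          : px * w + offset;
    return v < 0 ? 0 : v > 255 ? 255 : v;
}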
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index 134d594ca9..35db20014a 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -210,6 +210,7 @@ H264_BIWEIGHT_10_SSE(4, 10) av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { +#if HAVE_YASM int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1) @@ -365,4 +366,5 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, #endif /* HAVE_ALIGNED_STACK */ } } +#endif } diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm index 1e895f0aa5..48a597530b 100644 --- a/libavcodec/x86/hevc_deblock.asm +++ b/libavcodec/x86/hevc_deblock.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
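The new HAVE_YASM guard lets ff_h264dsp_init_x86() still compile and link when no assembler output is built, with the whole cpu-flag dispatch reducing to a no-op. A minimal sketch of the pattern, with illustrative names (normally HAVE_YASM comes from configure):

#define HAVE_YASM 0

typedef void (*dsp_fn)(void);

static void dsp_init_x86_sketch(dsp_fn *fn)
{
#if HAVE_YASM
    int cpu_flags = query_cpu_flags();   /* hypothetical helper */
    if (cpu_flags & HAS_SSE2)
        *fn = some_sse2_kernel;
#else
    (void)fn;  /* no asm built: keep the C defaults already installed */
#endif
}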
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -26,13 +26,15 @@ SECTION_RODATA -pw_pixel_max: times 8 dw ((1 << 10)-1) -pw_m1: times 8 dw -1 -pw_m2: times 8 dw -2 -pd_1 : times 4 dd 1 +cextern pw_1023 +%define pw_pixel_max_10 pw_1023 +pw_pixel_max_12: times 8 dw ((1 << 12)-1) +pw_m2: times 8 dw -2 +pd_1 : times 4 dd 1 cextern pw_4 cextern pw_8 +cextern pw_m1 SECTION .text INIT_XMM sse2 @@ -57,10 +59,10 @@ INIT_XMM sse2 movd m4, %5 movd m6, %6 movd m5, %7 - movd m7, %8 + movd m3, %8 punpcklbw m4, m6 - punpcklbw m5, m7 + punpcklbw m5, m3 punpcklwd m4, m5 punpckhdq m2, m0, m4 @@ -76,16 +78,10 @@ INIT_XMM sse2 ; in: 4 rows of 8 words in m0..m3 ; out: 8 rows of 4 bytes in %1..%8 %macro TRANSPOSE8x4B_STORE 8 - packuswb m0, m0 - packuswb m1, m1 - packuswb m2, m2 - packuswb m3, m3 - - punpcklbw m0, m1 - punpcklbw m2, m3 - - punpckhwd m6, m0, m2 - punpcklwd m0, m2 + packuswb m0, m2 + packuswb m1, m3 + SBUTTERFLY bw, 0, 1, 2 + SBUTTERFLY wd, 0, 1, 2 movd %1, m0 pshufd m0, m0, 0x39 @@ -95,13 +91,13 @@ INIT_XMM sse2 pshufd m0, m0, 0x39 movd %4, m0 - movd %5, m6 - pshufd m6, m6, 0x39 - movd %6, m6 - pshufd m6, m6, 0x39 - movd %7, m6 - pshufd m6, m6, 0x39 - movd %8, m6 + movd %5, m1 + pshufd m1, m1, 0x39 + movd %6, m1 + pshufd m1, m1, 0x39 + movd %7, m1 + pshufd m1, m1, 0x39 + movd %8, m1 %endmacro ; in: 8 rows of 4 words in %4..%11 @@ -120,10 +116,10 @@ INIT_XMM sse2 movq m4, %5 movq m6, %6 movq m5, %7 - movq m7, %8 + movq m3, %8 punpcklwd m4, m6 - punpcklwd m5, m7 + punpcklwd m5, m3 punpckhdq m6, m4, m5 punpckldq m4, m5 @@ -136,32 +132,23 @@ INIT_XMM sse2 ; in: 4 rows of 8 words in m0..m3 ; out: 8 rows of 4 words in %1..%8 -%macro TRANSPOSE8x4W_STORE 8 - pxor m5, m5; zeros reg - CLIPW m0, m5, [pw_pixel_max] - CLIPW m1, m5, [pw_pixel_max] - CLIPW m2, m5, [pw_pixel_max] - CLIPW m3, m5, [pw_pixel_max] +%macro TRANSPOSE8x4W_STORE 9 + TRANSPOSE4x4W 0, 1, 2, 3, 4 - punpckhwd m4, m0, m1 - punpcklwd m0, m1 - punpckhwd m5, m2, m3 - punpcklwd m2, m3 - punpckhdq m6, m0, m2 - punpckldq m0, m2 + pxor m5, m5; zeros reg + CLIPW m0, m5, %9 + CLIPW m1, m5, %9 + CLIPW m2, m5, %9 + CLIPW m3, m5, %9 movq %1, m0 movhps %2, m0 - movq %3, m6 - movhps %4, m6 - - punpckhdq m6, m4, m5 - punpckldq m4, m5 - - movq %5, m4 - movhps %6, m4 - movq %7, m6 - movhps %8, m6 + movq %3, m1 + movhps %4, m1 + movq %5, m2 + movhps %6, m2 + movq %7, m3 + movhps %8, m3 %endmacro ; in: 8 rows of 8 bytes in %1..%8 @@ -212,40 +199,20 @@ INIT_XMM sse2 ; in: 8 rows of 8 words in m0..m8 ; out: 8 rows of 8 bytes in %1..%8 %macro TRANSPOSE8x8B_STORE 8 - packuswb m0, m0 - packuswb m1, m1 - packuswb m2, m2 - packuswb m3, m3 - packuswb m4, m4 - packuswb m5, m5 - packuswb m6, m6 - packuswb m7, m7 - - punpcklbw m0, m1 - punpcklbw m2, m3 - - punpckhwd m8, m0, m2 - punpcklwd m0, m2 - - punpcklbw m4, m5 - punpcklbw m6, m7 - - punpckhwd m9, m4, m6 - punpcklwd m4, m6 + packuswb m0, m4 + packuswb m1, m5 + packuswb m2, m6 + packuswb m3, m7 + TRANSPOSE2x4x4B 0, 1, 2, 3, 4 - punpckhdq m10, m0, m4; 2, 3 - punpckldq m0, m4; 0, 1 - - punpckldq m11, m8, m9; 4, 5 - punpckhdq m8, m9; 6, 7 movq %1, m0 movhps %2, m0 - movq %3, m10 - movhps %4, m10 - movq %5, m11 - movhps %6, m11 - movq %7, m8 - movhps %8, m8 + movq %3, m1 + movhps %4, m1 + movq %5, m2 + movhps 
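The reworked TRANSPOSE8x4B_STORE packs two rows per register and then applies two SBUTTERFLY interleave passes, which together effect a transpose so column-filtered samples can be stored back as rows. A scalar 4x4 equivalent of the end result:

#include <stdint.h>

static void transpose4x4(uint16_t m[4][4])
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = i + 1; j < 4; j++) {
            uint16_t t = m[i][j];
            m[i][j]    = m[j][i];
            m[j][i]    = t;
        }
}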
%6, m2 + movq %7, m3 + movhps %8, m3 %endmacro ; in: 8 rows of 8 words in %1..%8 @@ -264,18 +231,18 @@ INIT_XMM sse2 ; in: 8 rows of 8 words in m0..m8 ; out: 8 rows of 8 words in %1..%8 -%macro TRANSPOSE8x8W_STORE 8 +%macro TRANSPOSE8x8W_STORE 9 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 pxor m8, m8 - CLIPW m0, m8, [pw_pixel_max] - CLIPW m1, m8, [pw_pixel_max] - CLIPW m2, m8, [pw_pixel_max] - CLIPW m3, m8, [pw_pixel_max] - CLIPW m4, m8, [pw_pixel_max] - CLIPW m5, m8, [pw_pixel_max] - CLIPW m6, m8, [pw_pixel_max] - CLIPW m7, m8, [pw_pixel_max] + CLIPW m0, m8, %9 + CLIPW m1, m8, %9 + CLIPW m2, m8, %9 + CLIPW m3, m8, %9 + CLIPW m4, m8, %9 + CLIPW m5, m8, %9 + CLIPW m6, m8, %9 + CLIPW m7, m8, %9 movdqu %1, m0 movdqu %2, m1 @@ -318,13 +285,14 @@ ALIGN 16 paddw m5, m4; ;tc calculations - movd m6, [r2]; tc0 - add r2, 4; + movq m6, [tcq]; tc0 punpcklwd m6, m6 - movd m7, [r2]; tc1 - punpcklwd m7, m7 - shufps m6, m7, 0; tc0, tc1 + pshufd m6, m6, 0xA0; tc0, tc1 +%if cpuflag(ssse3) + psignw m4, m6, [pw_m1]; -tc0, -tc1 +%else pmullw m4, m6, [pw_m1]; -tc0, -tc1 +%endif ;end tc calculations paddw m5, [pw_4]; +4 @@ -362,11 +330,11 @@ ALIGN 16 paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3 - pshufhw m14, m9, q0033 ;0b00001111; 0d3 0d3 0d0 0d0 in high - pshuflw m14, m14, q0033 ;0b00001111; 1d3 1d3 1d0 1d0 in low + pshufhw m14, m9, 0x0f ;0b00001111; 0d3 0d3 0d0 0d0 in high + pshuflw m14, m14, 0x0f ;0b00001111; 1d3 1d3 1d0 1d0 in low - pshufhw m9, m9, q3300 ;0b11110000; 0d0 0d0 0d3 0d3 - pshuflw m9, m9, q3300 ;0b11110000; 1d0 1d0 1d3 1d3 + pshufhw m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3 + pshuflw m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3 paddw m14, m9; 0d0+0d3, 1d0+1d3 @@ -380,7 +348,7 @@ ALIGN 16 psraw m15, m13, 2; beta >> 2 psllw m8, m9, 1; pcmpgtw m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2 - movmskps r14, m15; + movmskps r6, m15; ;end weak / strong decision ; weak filter nd_p/q calculation @@ -388,19 +356,15 @@ ALIGN 16 psrld m8, 16 paddw m8, m10 movd r7d, m8 - and r7, 0xffff; 1dp0 + 1dp3 pshufd m8, m8, 0x4E movd r8d, m8 - and r8, 0xffff; 0dp0 + 0dp3 pshufd m8, m11, 0x31 psrld m8, 16 paddw m8, m11 movd r9d, m8 - and r9, 0xffff; 1dq0 + 1dq3 pshufd m8, m8, 0x4E movd r10d, m8 - and r10, 0xffff; 0dq0 + 0dq3 ; end calc for weak filter ; filtering mask @@ -422,14 +386,13 @@ ALIGN 16 shl r11, %1 - 8 %endif movd m8, r11d; tc0 - add tcq, 4; - mov r3d, [tcq]; + mov r3d, [tcq+4]; %if %1 > 8 shl r3, %1 - 8 %endif - movd m9, r3d; tc1 add r11d, r3d; tc0 + tc1 jz .bypassluma + movd m9, r3d; tc1 punpcklwd m8, m8 punpcklwd m9, m9 shufps m8, m9, 0; tc0, tc1 @@ -453,7 +416,7 @@ ALIGN 16 psraw m13, 3; beta >> 3 pcmpgtw m13, m12; movmskps r11, m13; - and r14, r11; strong mask , beta_2 and beta_3 comparisons + and r6, r11; strong mask , beta_2 and beta_3 comparisons ;----beta_3 comparison end----- ;----tc25 comparison--- psubw m12, m3, m4; p0 - q0 @@ -464,23 +427,23 @@ ALIGN 16 pcmpgtw m8, m12; tc25 comparisons movmskps r11, m8; - and r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons + and r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons ;----tc25 comparison end--- - mov r11, r14; + mov r11, r6; shr r11, 1; - and r14, r11; strong mask, bits 2 and 0 + and r6, r11; strong mask, bits 2 and 0 pmullw m14, m9, [pw_m2]; -tc * 2 paddw m9, m9 - and r14, 5; 0b101 - mov r11, r14; strong mask - shr r14, 2; - movd m12, r14d; store to xmm for mask generation - shl r14, 1 + and r6, 5; 0b101 + mov r11, r6; strong mask + shr r6, 2; + movd m12, r6d; store to xmm for mask generation + shl r6, 1 and r11, 1 movd m10, r11d; store to 
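The psignw change is a small SSSE3 win: with an all minus-one operand, psignw is a cheap per-lane negation, so the pmullw-by-(-1) multiply is only kept as the SSE2 fallback. A self-checking scalar model of the psignw semantics:

#include <assert.h>
#include <stdint.h>

/* psignw lane semantics: negate where b < 0, zero where b == 0 */
static int16_t psignw_lane(int16_t a, int16_t b)
{
    return b < 0 ? (int16_t)-a : b == 0 ? 0 : a;
}

int main(void)
{
    int a;
    for (a = -1024; a <= 1024; a++)   /* matches pmullw a, [pw_m1] */
        assert(psignw_lane((int16_t)a, -1) == (int16_t)(a * -1));
    return 0;
}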
xmm for mask generation - or r14, r11; final strong mask, bits 1 and 0 + or r6, r11; final strong mask, bits 1 and 0 jz .weakfilter shufps m10, m12, 0 @@ -565,16 +528,16 @@ ALIGN 16 MASKED_COPY m3, m12 .weakfilter: - not r14; strong mask -> weak mask - and r14, r13; final weak filtering mask, bits 0 and 1 + not r6; strong mask -> weak mask + and r6, r13; final weak filtering mask, bits 0 and 1 jz .store ; weak filtering mask - mov r11, r14 + mov r11, r6 shr r11, 1 movd m12, r11d - and r14, 1 - movd m11, r14d + and r6, 1 + movd m11, r6d shufps m11, m12, 0 pcmpeqd m11, [pd_1]; filtering mask @@ -609,7 +572,11 @@ ALIGN 16 pminsw m12, m9; av_clip(delta0, -tc, tc) psraw m9, 1; tc -> tc / 2 +%if cpuflag(ssse3) + psignw m14, m9, [pw_m1]; -tc / 2 +%else pmullw m14, m9, [pw_m1]; -tc / 2 +%endif pavgw m15, m1, m3; (p2 + p0 + 1) >> 1 psubw m15, m2; ((p2 + p0 + 1) >> 1) - p1 @@ -658,117 +625,161 @@ ALIGN 16 MASKED_COPY m4, m8 %endmacro -INIT_XMM sse2 ;----------------------------------------------------------------------------- -; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, +; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc, ; uint8_t *_no_p, uint8_t *_no_q); ;----------------------------------------------------------------------------- -cglobal hevc_v_loop_filter_chroma_8, 3, 6, 8 - sub r0, 2 - lea r5, [3 * r1] - mov r4, r0 - add r0, r5 - TRANSPOSE4x8B_LOAD PASS8ROWS(r4, r0, r1, r5) +%macro LOOP_FILTER_CHROMA 0 +cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride + sub pixq, 2 + lea r3strideq, [3*strideq] + mov pix0q, pixq + add pixq, r3strideq + TRANSPOSE4x8B_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq) CHROMA_DEBLOCK_BODY 8 - TRANSPOSE8x4B_STORE PASS8ROWS(r4, r0, r1, r5) + TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq) RET -cglobal hevc_v_loop_filter_chroma_10, 3, 6, 8 - sub r0, 4 - lea r5, [3 * r1] - mov r4, r0 - add r0, r5 - TRANSPOSE4x8W_LOAD PASS8ROWS(r4, r0, r1, r5) +cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride + sub pixq, 4 + lea r3strideq, [3*strideq] + mov pix0q, pixq + add pixq, r3strideq + TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq) CHROMA_DEBLOCK_BODY 10 - TRANSPOSE8x4W_STORE PASS8ROWS(r4, r0, r1, r5) + TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10] + RET + +cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride + sub pixq, 4 + lea r3strideq, [3*strideq] + mov pix0q, pixq + add pixq, r3strideq + TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq) + CHROMA_DEBLOCK_BODY 12 + TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12] RET ;----------------------------------------------------------------------------- -; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, +; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc, ; uint8_t *_no_p, uint8_t *_no_q); ;----------------------------------------------------------------------------- -cglobal hevc_h_loop_filter_chroma_8, 3, 6, 8 - mov r5, r0; pix - sub r5, r1 - sub r5, r1 - movh m0, [r5]; p1 - movh m1, [r5 + r1]; p0 - movh m2, [r0]; q0 - movh m3, [r0 + r1]; q1 +cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0 + mov pix0q, pixq + sub pix0q, strideq + sub pix0q, strideq + movq m0, [pix0q]; p1 + movq m1, [pix0q+strideq]; p0 + movq m2, [pixq]; q0 + movq m3, [pixq+strideq]; q1 pxor m5, m5; zeros reg punpcklbw m0, m5 
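CHROMA_DEBLOCK_BODY computes the single clipped delta of HEVC's chroma deblocking filter and applies it symmetrically across the edge. A scalar reference, assuming the standard HEVC formula the asm mirrors:

#include <stdint.h>

#define CLIP3(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))

/* p1 and q1 are read-only inputs; only p0/q0 are modified */
static void chroma_deblock_px(int p1, int16_t *p0, int16_t *q0, int q1,
                              int tc, int bit_depth)
{
    int delta = CLIP3((((*q0 - *p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
    int max   = (1 << bit_depth) - 1;   /* pw_pixel_max_10 / _12 */
    *p0 = CLIP3(*p0 + delta, 0, max);
    *q0 = CLIP3(*q0 - delta, 0, max);
}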
punpcklbw m1, m5 punpcklbw m2, m5 punpcklbw m3, m5 CHROMA_DEBLOCK_BODY 8 - packuswb m1, m2 - movh [r5 + r1], m1 - movhps [r0], m1 + packuswb m1, m2 + movh[pix0q+strideq], m1 + movhps [pixq], m1 RET -cglobal hevc_h_loop_filter_chroma_10, 3, 6, 8 - mov r5, r0; pix - sub r5, r1 - sub r5, r1 - movdqu m0, [r5]; p1 - movdqu m1, [r5+r1]; p0 - movdqu m2, [r0]; q0 - movdqu m3, [r0 + r1]; q1 +cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0 + mov pix0q, pixq + sub pix0q, strideq + sub pix0q, strideq + movu m0, [pix0q]; p1 + movu m1, [pix0q+strideq]; p0 + movu m2, [pixq]; q0 + movu m3, [pixq+strideq]; q1 CHROMA_DEBLOCK_BODY 10 pxor m5, m5; zeros reg - CLIPW m1, m5, [pw_pixel_max] - CLIPW m2, m5, [pw_pixel_max] - movdqu [r5 + r1], m1 - movdqu [r0], m2 + CLIPW m1, m5, [pw_pixel_max_10] + CLIPW m2, m5, [pw_pixel_max_10] + movu [pix0q+strideq], m1 + movu [pixq], m2 + RET + +cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0 + mov pix0q, pixq + sub pix0q, strideq + sub pix0q, strideq + movu m0, [pix0q]; p1 + movu m1, [pix0q+strideq]; p0 + movu m2, [pixq]; q0 + movu m3, [pixq+strideq]; q1 + CHROMA_DEBLOCK_BODY 12 + pxor m5, m5; zeros reg + CLIPW m1, m5, [pw_pixel_max_12] + CLIPW m2, m5, [pw_pixel_max_12] + movu [pix0q+strideq], m1 + movu [pixq], m2 RET +%endmacro + +INIT_XMM sse2 +LOOP_FILTER_CHROMA +INIT_XMM avx +LOOP_FILTER_CHROMA %if ARCH_X86_64 -INIT_XMM ssse3 +%macro LOOP_FILTER_LUMA 0 ;----------------------------------------------------------------------------- ; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta, -; int *_tc, uint8_t *_no_p, uint8_t *_no_q); +; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q); ;----------------------------------------------------------------------------- -cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc - sub r0, 4 - lea r5, [3 * r1] - mov r6, r0 - add r0, r5 - TRANSPOSE8x8B_LOAD PASS8ROWS(r6, r0, r1, r5) +cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride + sub pixq, 4 + lea pix0q, [3 * r1] + mov src3strideq, pixq + add pixq, pix0q + TRANSPOSE8x8B_LOAD PASS8ROWS(src3strideq, pixq, r1, pix0q) LUMA_DEBLOCK_BODY 8, v .store: - TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5) + TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q) .bypassluma: RET -cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc +cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride sub pixq, 8 - lea r5, [3 * strideq] - mov r6, pixq - add pixq, r5 - TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5) + lea pix0q, [3 * strideq] + mov src3strideq, pixq + add pixq, pix0q + TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q) LUMA_DEBLOCK_BODY 10, v .store: - TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5) + TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10] +.bypassluma: + RET + +cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride + sub pixq, 8 + lea pix0q, [3 * strideq] + mov src3strideq, pixq + add pixq, pix0q + TRANSPOSE8x8W_LOAD PASS8ROWS(src3strideq, pixq, strideq, pix0q) + LUMA_DEBLOCK_BODY 12, v +.store: + TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12] .bypassluma: RET ;----------------------------------------------------------------------------- ; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta, -; int *_tc, uint8_t *_no_p, uint8_t *_no_q); +; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q); 
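The new _12 entry points reuse the same bodies at another bit depth; only the clip constant and the tc scaling differ, since the spec's tc table is defined at 8 bits. A sketch of the scaling seen earlier as `shl r11, %1 - 8`:

static int scale_tc(int tc8, int bit_depth)
{
    return bit_depth > 8 ? tc8 << (bit_depth - 8) : tc8;
}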
;----------------------------------------------------------------------------- -cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride +cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride lea src3strideq, [3 * strideq] mov pix0q, pixq sub pix0q, src3strideq sub pix0q, strideq - movdqu m0, [pix0q]; p3 - movdqu m1, [pix0q + strideq]; p2 - movdqu m2, [pix0q + 2 * strideq]; p1 - movdqu m3, [pix0q + src3strideq]; p0 - movdqu m4, [pixq]; q0 - movdqu m5, [pixq + strideq]; q1 - movdqu m6, [pixq + 2 * strideq]; q2 - movdqu m7, [pixq + src3strideq]; q3 + movq m0, [pix0q]; p3 + movq m1, [pix0q + strideq]; p2 + movq m2, [pix0q + 2 * strideq]; p1 + movq m3, [pix0q + src3strideq]; p0 + movq m4, [pixq]; q0 + movq m5, [pixq + strideq]; q1 + movq m6, [pixq + 2 * strideq]; q2 + movq m7, [pixq + src3strideq]; q3 pxor m8, m8 punpcklbw m0, m8 punpcklbw m1, m8 @@ -783,16 +794,16 @@ cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0 packuswb m1, m2 packuswb m3, m4 packuswb m5, m6 - movh [r5 + r1], m1 - movhps [r5 + 2 * r1], m1 - movh [r5 + r6], m3 - movhps [r0 ], m3 - movh [r0 + r1], m5 - movhps [r0 + 2 * r1], m5 + movh [pix0q + strideq], m1 + movhps [pix0q + 2 * strideq], m1 + movh [pix0q + src3strideq], m3 + movhps [pixq ], m3 + movh [pixq + strideq], m5 + movhps [pixq + 2 * strideq], m5 .bypassluma: RET -cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride +cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride lea src3strideq, [3 * strideq] mov pix0q, pixq sub pix0q, src3strideq @@ -808,12 +819,43 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix LUMA_DEBLOCK_BODY 10, h .store: pxor m8, m8; zeros reg - CLIPW m1, m8, [pw_pixel_max] - CLIPW m2, m8, [pw_pixel_max] - CLIPW m3, m8, [pw_pixel_max] - CLIPW m4, m8, [pw_pixel_max] - CLIPW m5, m8, [pw_pixel_max] - CLIPW m6, m8, [pw_pixel_max] + CLIPW m1, m8, [pw_pixel_max_10] + CLIPW m2, m8, [pw_pixel_max_10] + CLIPW m3, m8, [pw_pixel_max_10] + CLIPW m4, m8, [pw_pixel_max_10] + CLIPW m5, m8, [pw_pixel_max_10] + CLIPW m6, m8, [pw_pixel_max_10] + movdqu [pix0q + strideq], m1; p2 + movdqu [pix0q + 2 * strideq], m2; p1 + movdqu [pix0q + src3strideq], m3; p0 + movdqu [pixq ], m4; q0 + movdqu [pixq + strideq], m5; q1 + movdqu [pixq + 2 * strideq], m6; q2 +.bypassluma: + RET + +cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride + lea src3strideq, [3 * strideq] + mov pix0q, pixq + sub pix0q, src3strideq + sub pix0q, strideq + movdqu m0, [pix0q]; p3 + movdqu m1, [pix0q + strideq]; p2 + movdqu m2, [pix0q + 2 * strideq]; p1 + movdqu m3, [pix0q + src3strideq]; p0 + movdqu m4, [pixq]; q0 + movdqu m5, [pixq + strideq]; q1 + movdqu m6, [pixq + 2 * strideq]; q2 + movdqu m7, [pixq + src3strideq]; q3 + LUMA_DEBLOCK_BODY 12, h +.store: + pxor m8, m8; zeros reg + CLIPW m1, m8, [pw_pixel_max_12] + CLIPW m2, m8, [pw_pixel_max_12] + CLIPW m3, m8, [pw_pixel_max_12] + CLIPW m4, m8, [pw_pixel_max_12] + CLIPW m5, m8, [pw_pixel_max_12] + CLIPW m6, m8, [pw_pixel_max_12] movdqu [pix0q + strideq], m1; p2 movdqu [pix0q + 2 * strideq], m2; p1 movdqu [pix0q + src3strideq], m3; p0 @@ -822,4 +864,13 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix movdqu [pixq + 2 * strideq], m6; q2 .bypassluma: RET + +%endmacro + +INIT_XMM sse2 +LOOP_FILTER_LUMA +INIT_XMM ssse3 +LOOP_FILTER_LUMA +INIT_XMM avx +LOOP_FILTER_LUMA %endif diff --git 
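Wrapping the bodies in LOOP_FILTER_CHROMA/LOOP_FILTER_LUMA and instantiating them under INIT_XMM sse2/ssse3/avx yields one source body and several ISA-specific symbols, chosen once at init time. A hedged sketch of such a dispatch (flag names and stubs are illustrative):

#include <stdio.h>

typedef void (*lf_fn)(void);
static void lf_sse2(void)  { puts("sse2");  }
static void lf_ssse3(void) { puts("ssse3"); }
static void lf_avx(void)   { puts("avx");   }

enum { FLAG_SSSE3 = 1, FLAG_AVX = 2 };

static lf_fn pick_loop_filter(int cpu_flags)
{
    if (cpu_flags & FLAG_AVX)   return lf_avx;
    if (cpu_flags & FLAG_SSSE3) return lf_ssse3;
    return lf_sse2;              /* baseline from INIT_XMM sse2 */
}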
a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm new file mode 100644 index 0000000000..481726d217 --- /dev/null +++ b/libavcodec/x86/hevc_idct.asm @@ -0,0 +1,122 @@ +; /* +; * SIMD optimized idct functions for HEVC decoding +; * Copyright (c) 2014 Pierre-Edouard LEPERE +; * Copyright (c) 2014 James Almer +; * +; * This file is part of FFmpeg. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. +; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +SECTION_TEXT 32 + +; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs) +; %1 = HxW +; %2 = number of loops +; %3 = bitdepth +%macro IDCT_DC 3 +cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp + movsx tmpq, word [coeffq] + add tmpw, ((1 << 14-%3) + 1) + sar tmpw, (15-%3) + movd xm0, tmpd + SPLATW m0, xm0 + DEFINE_ARGS coeff, cnt + mov cntd, %2 +.loop: + mova [coeffq+mmsize*0], m0 + mova [coeffq+mmsize*1], m0 + mova [coeffq+mmsize*2], m0 + mova [coeffq+mmsize*3], m0 + mova [coeffq+mmsize*4], m0 + mova [coeffq+mmsize*5], m0 + mova [coeffq+mmsize*6], m0 + mova [coeffq+mmsize*7], m0 + add coeffq, mmsize*8 + dec cntd + jg .loop + RET +%endmacro + +; %1 = HxW +; %2 = bitdepth +%macro IDCT_DC_NL 2 ; No loop +cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp + movsx tmpq, word [coeffq] + add tmpw, ((1 << 14-%2) + 1) + sar tmpw, (15-%2) + movd m0, tmpd + SPLATW m0, xm0 + mova [coeffq+mmsize*0], m0 + mova [coeffq+mmsize*1], m0 + mova [coeffq+mmsize*2], m0 + mova [coeffq+mmsize*3], m0 +%if mmsize == 16 + mova [coeffq+mmsize*4], m0 + mova [coeffq+mmsize*5], m0 + mova [coeffq+mmsize*6], m0 + mova [coeffq+mmsize*7], m0 +%endif + RET +%endmacro + +; 8-bit +INIT_MMX mmxext +IDCT_DC_NL 4, 8 +IDCT_DC 8, 2, 8 + +INIT_XMM sse2 +IDCT_DC_NL 8, 8 +IDCT_DC 16, 4, 8 +IDCT_DC 32, 16, 8 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +IDCT_DC 16, 2, 8 +IDCT_DC 32, 8, 8 +%endif ;HAVE_AVX2_EXTERNAL + +; 10-bit +INIT_MMX mmxext +IDCT_DC_NL 4, 10 +IDCT_DC 8, 2, 10 + +INIT_XMM sse2 +IDCT_DC_NL 8, 10 +IDCT_DC 16, 4, 10 +IDCT_DC 32, 16, 10 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +IDCT_DC 16, 2, 10 +IDCT_DC 32, 8, 10 +%endif ;HAVE_AVX2_EXTERNAL + +; 12-bit +INIT_MMX mmxext +IDCT_DC_NL 4, 12 +IDCT_DC 8, 2, 12 + +INIT_XMM sse2 +IDCT_DC_NL 8, 12 +IDCT_DC 16, 4, 12 +IDCT_DC 32, 16, 12 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +IDCT_DC 16, 2, 12 +IDCT_DC 32, 8, 12 +%endif ;HAVE_AVX2_EXTERNAL diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm new file mode 100644 index 0000000000..986493f20c --- /dev/null +++ b/libavcodec/x86/hevc_mc.asm @@ -0,0 +1,1671 @@ +; /* +; * Provide SSE luma and chroma mc functions for HEVC decoding +; * Copyright (c) 2013 Pierre-Edouard LEPERE +; * +; * This file is part of FFmpeg. 
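IDCT_DC rounds and scales the lone DC coefficient once, then broadcasts it across the whole residual block (the asm does this with eight mmsize-wide stores per loop iteration). A C mirror of the macro's arithmetic, keeping the 16-bit wraparound of the add as in the asm:

#include <stdint.h>

static void idct_dc(int16_t *coeffs, int size, int bit_depth)
{
    int16_t dc = (int16_t)(coeffs[0] + (1 << (14 - bit_depth)) + 1)
                 >> (15 - bit_depth);
    int i;
    for (i = 0; i < size * size; i++)
        coeffs[i] = dc;
}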
+; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. +; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 +cextern pw_255 +cextern pw_512 +cextern pw_2048 +cextern pw_8192 +cextern pw_1023 +cextern pw_1024 +cextern pw_4096 +%define pw_8 pw_512 +%define pw_10 pw_2048 +%define pw_12 pw_8192 +%define pw_bi_10 pw_1024 +%define pw_bi_12 pw_4096 +%define max_pixels_8 pw_255 +%define max_pixels_10 pw_1023 +pw_bi_8: times 16 dw (1 << 8) +max_pixels_12: times 16 dw ((1 << 12)-1) +cextern pd_1 +cextern pb_0 + +SECTION_TEXT 32 +%macro EPEL_TABLE 4 +hevc_epel_filters_%4_%1 times %2 d%3 -2, 58 + times %2 d%3 10, -2 + times %2 d%3 -4, 54 + times %2 d%3 16, -2 + times %2 d%3 -6, 46 + times %2 d%3 28, -4 + times %2 d%3 -4, 36 + times %2 d%3 36, -4 + times %2 d%3 -4, 28 + times %2 d%3 46, -6 + times %2 d%3 -2, 16 + times %2 d%3 54, -4 + times %2 d%3 -2, 10 + times %2 d%3 58, -2 +%endmacro + + +EPEL_TABLE 8,16, b, avx2 +EPEL_TABLE 10, 8, w, avx2 + +EPEL_TABLE 8, 8, b, sse4 +EPEL_TABLE 10, 4, w, sse4 +EPEL_TABLE 12, 4, w, sse4 + +%macro QPEL_TABLE 4 +hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4 + times %2 d%3 -10, 58 + times %2 d%3 17, -5 + times %2 d%3 1, 0 + times %2 d%3 -1, 4 + times %2 d%3 -11, 40 + times %2 d%3 40,-11 + times %2 d%3 4, -1 + times %2 d%3 0, 1 + times %2 d%3 -5, 17 + times %2 d%3 58,-10 + times %2 d%3 4, -1 +%endmacro + +QPEL_TABLE 8, 8, b, sse4 +QPEL_TABLE 10, 4, w, sse4 +QPEL_TABLE 12, 4, w, sse4 + +QPEL_TABLE 8,16, b, avx2 +QPEL_TABLE 10, 8, w, avx2 + +%define MAX_PB_SIZE 64 + +%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10 + +%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10 + +%if ARCH_X86_64 + +%macro SIMPLE_BILOAD 4 ;width, tab, r1, r2 +%if %1 <= 4 + movq %3, [%2] ; load data from source2 +%elif %1 <= 8 + movdqa %3, [%2] ; load data from source2 +%elif %1 <= 12 +%if cpuflag(avx2) + mova %3, [%2] +%else + movdqa %3, [%2] ; load data from source2 + movq %4, [%2+16] ; load data from source2 +%endif ;avx +%elif %1 <= 16 +%if cpuflag(avx2) + mova %3, [%2] +%else + movdqa %3, [%2] ; load data from source2 + movdqa %4, [%2+16] ; load data from source2 +%endif ; avx +%else ; %1 = 32 + mova %3, [%2] + mova %4, [%2+32] +%endif +%endmacro + +%macro SIMPLE_LOAD 4 ;width, bitd, tab, r1 +%if %1 == 2 || (%2 == 8 && %1 <= 4) + movd %4, [%3] ; load data from source +%elif %1 == 4 || (%2 == 8 && %1 <= 8) + movq %4, [%3] ; load data from source +%elif notcpuflag(avx) + movu %4, [%3] ; load data from source +%elif %1 <= 8 || (%2 == 8 && %1 <= 16) + movdqu %4, [%3] +%else + movu %4, [%3] +%endif +%endmacro + + +%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp +%if cpuflag(avx2) +%assign %%offset 32 +%ifdef PIC + lea %5q, [hevc_epel_filters_avx2_%1] + %define FILTER %5q +%else + %define FILTER hevc_epel_filters_avx2_%1 +%endif +%else +%assign %%offset 16 +%ifdef PIC + 
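EPEL_TABLE lays out HEVC's 4-tap chroma filters as (c0,c1)/(c2,c3) pairs so pmaddubsw/pmaddwd can form both partial sums in one instruction. A scalar reference built from the coefficient rows visible in the table:

#include <stdint.h>

static const int8_t epel_filters[7][4] = {     /* rows mirror EPEL_TABLE */
    { -2, 58, 10, -2 }, { -4, 54, 16, -2 }, { -6, 46, 28, -4 },
    { -4, 36, 36, -4 }, { -4, 28, 46, -6 }, { -2, 16, 54, -4 },
    { -2, 10, 58, -2 },
};

/* horizontal 4-tap at fractional position frac (1..7); src points at x-1 */
static int epel_h_one(const uint8_t *src, int frac)
{
    const int8_t *f = epel_filters[frac - 1];
    return f[0] * src[0] + f[1] * src[1] + f[2] * src[2] + f[3] * src[3];
}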
lea %5q, [hevc_epel_filters_sse4_%1] + %define FILTER %5q +%else + %define FILTER hevc_epel_filters_sse4_%1 +%endif +%endif ;cpuflag(avx2) + sub %2q, 1 +%if cpuflag(avx2) + shl %2q, 6 ; multiply by 64 + %else + shl %2q, 5 ; multiply by 32 +%endif + mova %3, [FILTER + %2q] ; get 2 first values of filters + mova %4, [FILTER + %2q+%%offset] ; get 2 last values of filters +%endmacro + +%macro EPEL_HV_FILTER 1 +%if cpuflag(avx2) +%assign %%offset 32 +%assign %%shift 6 +%define %%table hevc_epel_filters_avx2_%1 +%else +%assign %%offset 16 +%assign %%shift 5 +%define %%table hevc_epel_filters_sse4_%1 +%endif + +%ifdef PIC + lea r3srcq, [%%table] + %define FILTER r3srcq +%else + %define FILTER %%table +%endif + sub mxq, 1 + sub myq, 1 + shl mxq, %%shift ; multiply by 32 + shl myq, %%shift ; multiply by 32 + mova m14, [FILTER + mxq] ; get 2 first values of filters + mova m15, [FILTER + mxq+%%offset] ; get 2 last values of filters + +%if cpuflag(avx2) +%define %%table hevc_epel_filters_avx2_10 +%else +%define %%table hevc_epel_filters_sse4_10 +%endif +%ifdef PIC + lea r3srcq, [%%table] + %define FILTER r3srcq +%else + %define FILTER %%table +%endif + mova m12, [FILTER + myq] ; get 2 first values of filters + mova m13, [FILTER + myq+%%offset] ; get 2 last values of filters + lea r3srcq, [srcstrideq*3] +%endmacro + +%macro QPEL_FILTER 2 + +%if cpuflag(avx2) +%assign %%offset 32 +%assign %%shift 7 +%define %%table hevc_qpel_filters_avx2_%1 +%else +%assign %%offset 16 +%assign %%shift 6 +%define %%table hevc_qpel_filters_sse4_%1 +%endif + +%ifdef PIC + lea rfilterq, [%%table] +%else + %define rfilterq %%table +%endif + sub %2q, 1 + shl %2q, %%shift ; multiply by 32 + mova m12, [rfilterq + %2q] ; get 4 first values of filters + mova m13, [rfilterq + %2q + %%offset] ; get 4 first values of filters + mova m14, [rfilterq + %2q + 2*%%offset] ; get 4 first values of filters + mova m15, [rfilterq + %2q + 3*%%offset] ; get 4 first values of filters +%endmacro + +%macro EPEL_LOAD 4 +%if (%1 == 8 && %4 <= 4) +%define %%load movd +%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4) +%define %%load movq +%else +%define %%load movdqu +%endif + + %%load m0, [%2q ] +%ifnum %3 + %%load m1, [%2q+ %3] + %%load m2, [%2q+2*%3] + %%load m3, [%2q+3*%3] +%else + %%load m1, [%2q+ %3q] + %%load m2, [%2q+2*%3q] + %%load m3, [%2q+r3srcq] +%endif +%if %1 == 8 +%if %4 > 8 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 +%else + punpcklbw m0, m1 + punpcklbw m2, m3 +%endif +%else +%if %4 > 4 + SBUTTERFLY wd, 0, 1, 7 + SBUTTERFLY wd, 2, 3, 7 +%else + punpcklwd m0, m1 + punpcklwd m2, m3 +%endif +%endif +%endmacro + + +%macro QPEL_H_LOAD 4 +%assign %%stride (%1+7)/8 +%if %1 == 8 +%if %3 <= 4 +%define %%load movd +%elif %3 == 8 +%define %%load movq +%else +%define %%load movu +%endif +%else +%if %3 == 2 +%define %%load movd +%elif %3 == 4 +%define %%load movq +%else +%define %%load movu +%endif +%endif + %%load m0, [%2-3*%%stride] ;load data from source + %%load m1, [%2-2*%%stride] + %%load m2, [%2-%%stride ] + %%load m3, [%2 ] + %%load m4, [%2+%%stride ] + %%load m5, [%2+2*%%stride] + %%load m6, [%2+3*%%stride] + %%load m7, [%2+4*%%stride] + +%if %1 == 8 +%if %3 > 8 + SBUTTERFLY wd, 0, 1, %4 + SBUTTERFLY wd, 2, 3, %4 + SBUTTERFLY wd, 4, 5, %4 + SBUTTERFLY wd, 6, 7, %4 +%else + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 +%endif +%else +%if %3 > 4 + SBUTTERFLY dq, 0, 1, %4 + SBUTTERFLY dq, 2, 3, %4 + SBUTTERFLY dq, 4, 5, %4 + SBUTTERFLY dq, 6, 7, %4 +%else + punpcklwd m0, m1 + punpcklwd m2, m3 + punpcklwd 
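EPEL_FILTER turns the fractional position into a byte offset: each filter row occupies two register-wide halves, hence the shl by 5 (SSE4, 32 bytes) or 6 (AVX2, 64 bytes). A sketch of the indexing:

/* byte offset of a filter row: (mx - 1) rows of two register halves */
static const unsigned char *filter_row(const unsigned char *table,
                                       int mx, int reg_bytes)
{
    return table + (mx - 1) * 2 * reg_bytes;
}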
m4, m5 + punpcklwd m6, m7 +%endif +%endif +%endmacro + +%macro QPEL_V_LOAD 5 + lea %5q, [%2] + sub %5q, r3srcq + movu m0, [%5q ] ;load x- 3*srcstride + movu m1, [%5q+ %3q ] ;load x- 2*srcstride + movu m2, [%5q+ 2*%3q ] ;load x-srcstride + movu m3, [%2 ] ;load x + movu m4, [%2+ %3q] ;load x+stride + movu m5, [%2+ 2*%3q] ;load x+2*stride + movu m6, [%2+r3srcq] ;load x+3*stride + movu m7, [%2+ 4*%3q] ;load x+4*stride +%if %1 == 8 +%if %4 > 8 + SBUTTERFLY bw, 0, 1, 8 + SBUTTERFLY bw, 2, 3, 8 + SBUTTERFLY bw, 4, 5, 8 + SBUTTERFLY bw, 6, 7, 8 +%else + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 +%endif +%else +%if %4 > 4 + SBUTTERFLY wd, 0, 1, 8 + SBUTTERFLY wd, 2, 3, 8 + SBUTTERFLY wd, 4, 5, 8 + SBUTTERFLY wd, 6, 7, 8 +%else + punpcklwd m0, m1 + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 +%endif +%endif +%endmacro + +%macro PEL_12STORE2 3 + movd [%1], %2 +%endmacro +%macro PEL_12STORE4 3 + movq [%1], %2 +%endmacro +%macro PEL_12STORE6 3 + movq [%1], %2 + psrldq %2, 8 + movd [%1+8], %2 +%endmacro +%macro PEL_12STORE8 3 + movdqa [%1], %2 +%endmacro +%macro PEL_12STORE12 3 + movdqa [%1], %2 + movq [%1+16], %3 +%endmacro +%macro PEL_12STORE16 3 + PEL_12STORE8 %1, %2, %3 + movdqa [%1+16], %3 +%endmacro + +%macro PEL_10STORE2 3 + movd [%1], %2 +%endmacro +%macro PEL_10STORE4 3 + movq [%1], %2 +%endmacro +%macro PEL_10STORE6 3 + movq [%1], %2 + psrldq %2, 8 + movd [%1+8], %2 +%endmacro +%macro PEL_10STORE8 3 + movdqa [%1], %2 +%endmacro +%macro PEL_10STORE12 3 + movdqa [%1], %2 + movq [%1+16], %3 +%endmacro +%macro PEL_10STORE16 3 +%if cpuflag(avx2) + movu [%1], %2 +%else + PEL_10STORE8 %1, %2, %3 + movdqa [%1+16], %3 +%endif +%endmacro + +%macro PEL_10STORE32 3 + PEL_10STORE16 %1, %2, %3 + movu [%1+32], %3 +%endmacro + +%macro PEL_8STORE2 3 + pextrw [%1], %2, 0 +%endmacro +%macro PEL_8STORE4 3 + movd [%1], %2 +%endmacro +%macro PEL_8STORE6 3 + movd [%1], %2 + pextrw [%1+4], %2, 2 +%endmacro +%macro PEL_8STORE8 3 + movq [%1], %2 +%endmacro +%macro PEL_8STORE12 3 + movq [%1], %2 + psrldq %2, 8 + movd [%1+8], %2 +%endmacro +%macro PEL_8STORE16 3 +%if cpuflag(avx2) + movdqu [%1], %2 +%else + mova [%1], %2 +%endif ; avx +%endmacro +%macro PEL_8STORE32 3 + movu [%1], %2 +%endmacro + +%macro LOOP_END 3 + add %1q, 2*MAX_PB_SIZE ; dst += dststride + add %2q, %3q ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop +%endmacro + + +%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth +%if %2 == 8 +%if cpuflag(avx2) && %0 ==3 +%if %1 > 16 + vextracti128 xm1, m0, 1 + pmovzxbw m1, xm1 + psllw m1, 14-%2 +%endif + pmovzxbw m0, xm0 +%else ; not avx +%if %1 > 8 + punpckhbw m1, m0, m2 + psllw m1, 14-%2 +%endif + punpcklbw m0, m2 +%endif +%endif ;avx + psllw m0, 14-%2 +%endmacro + +%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3 +%if %0 == 8 +%define %%reg0 %5 +%define %%reg2 %6 +%define %%reg1 %7 +%define %%reg3 %8 +%else +%define %%reg0 m0 +%define %%reg2 m2 +%define %%reg1 m1 +%define %%reg3 m3 +%endif +%if %1 == 8 +%if cpuflag(avx2) && (%0 == 5) +%if %2 > 16 + vperm2i128 m10, m0, m1, q0301 +%endif + vinserti128 m0, m0, xm1, 1 + mova m1, m10 +%if %2 > 16 + vperm2i128 m10, m2, m3, q0301 +%endif + vinserti128 m2, m2, xm3, 1 + mova m3, m10 +%endif + pmaddubsw %%reg0, %3 ;x1*c1+x2*c2 + pmaddubsw %%reg2, %4 ;x3*c3+x4*c4 + paddw %%reg0, %%reg2 +%if %2 > 8 + pmaddubsw %%reg1, %3 + pmaddubsw %%reg3, %4 + paddw %%reg1, %%reg3 +%endif +%else + pmaddwd %%reg0, %3 + pmaddwd %%reg2, %4 + paddd %%reg0, %%reg2 +%if %2 > 4 + pmaddwd %%reg1, %3 + 
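MC_PIXEL_COMPUTE widens plain copies to the common 14-bit intermediate (the psllw by 14 - bitdepth), so the uni/bi output stages can share one rounding pipeline; intermediates use the 64-sample MAX_PB_SIZE stride defined above. A scalar reference for the 8-bit case:

#include <stddef.h>
#include <stdint.h>

#define MAX_PB_SIZE 64   /* int16 elements per intermediate row */

static void put_pel_pixels_8(int16_t *dst, const uint8_t *src,
                             ptrdiff_t srcstride, int width, int height)
{
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = src[x] << (14 - 8);   /* psllw m0, 14 - bitdepth */
        src += srcstride;
        dst += MAX_PB_SIZE;          /* add dstq, 2*MAX_PB_SIZE bytes */
    }
}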
pmaddwd %%reg3, %4 + paddd %%reg1, %%reg3 +%if %1 != 8 + psrad %%reg1, %1-8 +%endif +%endif +%if %1 != 8 + psrad %%reg0, %1-8 +%endif + packssdw %%reg0, %%reg1 +%endif +%endmacro + +%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx + +%if cpuflag(avx2) +%assign %%offset 32 +%define %%table hevc_qpel_filters_avx2_%2 +%else +%assign %%offset 16 +%define %%table hevc_qpel_filters_sse4_%2 +%endif + +%ifdef PIC + lea rfilterq, [%%table] +%else + %define rfilterq %%table +%endif + +%if %2 == 8 + pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2 + pmaddubsw m2, [rfilterq + %3q*8+%%offset] ;x3*c3+x4*c4 + pmaddubsw m4, [rfilterq + %3q*8+2*%%offset] ;x5*c5+x6*c6 + pmaddubsw m6, [rfilterq + %3q*8+3*%%offset] ;x7*c7+x8*c8 + paddw m0, m2 + paddw m4, m6 + paddw m0, m4 +%else + pmaddwd m0, [rfilterq + %3q*8 ] + pmaddwd m2, [rfilterq + %3q*8+%%offset] + pmaddwd m4, [rfilterq + %3q*8+2*%%offset] + pmaddwd m6, [rfilterq + %3q*8+3*%%offset] + paddd m0, m2 + paddd m4, m6 + paddd m0, m4 +%if %2 != 8 + psrad m0, %2-8 +%endif +%if %1 > 4 + pmaddwd m1, [rfilterq + %3q*8 ] + pmaddwd m3, [rfilterq + %3q*8+%%offset] + pmaddwd m5, [rfilterq + %3q*8+2*%%offset] + pmaddwd m7, [rfilterq + %3q*8+3*%%offset] + paddd m1, m3 + paddd m5, m7 + paddd m1, m5 +%if %2 != 8 + psrad m1, %2-8 +%endif +%endif + p%4 m0, m1 +%endif +%endmacro + +%macro QPEL_COMPUTE 2-3 ; width, bitdepth +%if %2 == 8 +%if cpuflag(avx2) && (%0 == 3) + + vperm2i128 m10, m0, m1, q0301 + vinserti128 m0, m0, xm1, 1 + SWAP 1, 10 + + vperm2i128 m10, m2, m3, q0301 + vinserti128 m2, m2, xm3, 1 + SWAP 3, 10 + + + vperm2i128 m10, m4, m5, q0301 + vinserti128 m4, m4, xm5, 1 + SWAP 5, 10 + + vperm2i128 m10, m6, m7, q0301 + vinserti128 m6, m6, xm7, 1 + SWAP 7, 10 +%endif + + pmaddubsw m0, m12 ;x1*c1+x2*c2 + pmaddubsw m2, m13 ;x3*c3+x4*c4 + pmaddubsw m4, m14 ;x5*c5+x6*c6 + pmaddubsw m6, m15 ;x7*c7+x8*c8 + paddw m0, m2 + paddw m4, m6 + paddw m0, m4 +%if %1 > 8 + pmaddubsw m1, m12 + pmaddubsw m3, m13 + pmaddubsw m5, m14 + pmaddubsw m7, m15 + paddw m1, m3 + paddw m5, m7 + paddw m1, m5 +%endif +%else + pmaddwd m0, m12 + pmaddwd m2, m13 + pmaddwd m4, m14 + pmaddwd m6, m15 + paddd m0, m2 + paddd m4, m6 + paddd m0, m4 +%if %2 != 8 + psrad m0, %2-8 +%endif +%if %1 > 4 + pmaddwd m1, m12 + pmaddwd m3, m13 + pmaddwd m5, m14 + pmaddwd m7, m15 + paddd m1, m3 + paddd m5, m7 + paddd m1, m5 +%if %2 != 8 + psrad m1, %2-8 +%endif +%endif +%endif +%endmacro + +%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, scr2l, scr2h, pw + paddsw %3, %5 +%if %1 > 8 + paddsw %4, %6 +%endif + UNI_COMPUTE %1, %2, %3, %4, %7 +%if %0 == 8 && cpuflag(avx2) && (%2 == 8) + vpermq %3, %3, 216 + vpermq %4, %4, 216 +%endif +%endmacro + +%macro UNI_COMPUTE 5 + pmulhrsw %3, %5 +%if %1 > 8 || (%2 > 8 && %1 > 4) + pmulhrsw %4, %5 +%endif +%if %2 == 8 + packuswb %3, %4 +%else + CLIPW %3, [pb_0], [max_pixels_%2] +%if (%1 > 8 && notcpuflag(avx)) || %1 > 16 + CLIPW %4, [pb_0], [max_pixels_%2] +%endif +%endif +%endmacro + + +; ****************************** +; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my) +; ****************************** + +%macro HEVC_PUT_HEVC_PEL_PIXELS 2 +HEVC_PEL_PIXELS %1, %2 +HEVC_UNI_PEL_PIXELS %1, %2 +HEVC_BI_PEL_PIXELS %1, %2 +%endmacro + +%macro HEVC_PEL_PIXELS 2 +cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height + pxor m2, m2 +.loop + SIMPLE_LOAD %1, %2, srcq, m0 + MC_PIXEL_COMPUTE %1, %2, 1 + PEL_10STORE%1 dstq, m0, m1 + LOOP_END dst, src, srcstride + RET + %endmacro + +%macro 
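UNI_COMPUTE's pmulhrsw trick: pmulhrsw computes (a*m + (1<<14)) >> 15, so m = 1 << (15 - s) turns it into the rounded shift (a + (1 << (s-1))) >> s. That is why pw_512 serves as the 8-bit uni constant (>>6), pw_2048 as the 10-bit one (>>4), and so on. A self-checking scalar model over the 14-bit intermediate range:

#include <assert.h>
#include <stdint.h>

static int16_t pmulhrsw_lane(int16_t a, int16_t m)
{
    return (int16_t)((a * m + (1 << 14)) >> 15);
}

int main(void)
{
    int a;
    for (a = 0; a < 1 << 14; a++)
        assert(pmulhrsw_lane((int16_t)a, 512) == (a + 32) >> 6);
    return 0;
}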
HEVC_UNI_PEL_PIXELS 2 +cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height +.loop + SIMPLE_LOAD %1, %2, srcq, m0 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET +%endmacro + +%macro HEVC_BI_PEL_PIXELS 2 +cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height + pxor m2, m2 + movdqa m5, [pw_bi_%2] +.loop + SIMPLE_LOAD %1, %2, srcq, m0 + SIMPLE_BILOAD %1, src2q, m3, m4 + MC_PIXEL_COMPUTE %1, %2, 1 + BI_COMPUTE %1, %2, m0, m1, m3, m4, m5, 1 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET +%endmacro + + +; ****************************** +; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my, int width); +; ****************************** + + +%macro HEVC_PUT_HEVC_EPEL 2 +%if cpuflag(avx2) +%define XMM_REGS 11 +%else +%define XMM_REGS 8 +%endif + +cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter +%assign %%stride ((%2 + 7)/8) + EPEL_FILTER %2, mx, m4, m5, rfilter +.loop + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m4, m5, 1 + PEL_10STORE%1 dstq, m0, m1 + LOOP_END dst, src, srcstride + RET + +cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter +%assign %%stride ((%2 + 7)/8) + movdqa m6, [pw_%2] + EPEL_FILTER %2, mx, m4, m5, rfilter +.loop + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m4, m5 + UNI_COMPUTE %1, %2, m0, m1, m6 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET + +cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter + movdqa m6, [pw_bi_%2] + EPEL_FILTER %2, mx, m4, m5, rfilter +.loop + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m4, m5, 1 + SIMPLE_BILOAD %1, src2q, m2, m3 + BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET + +; ****************************** +; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my, int width) +; ****************************** + +cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my + movifnidn myd, mym + sub srcq, srcstrideq + EPEL_FILTER %2, my, m4, m5, r3src + lea r3srcq, [srcstrideq*3] +.loop + EPEL_LOAD %2, srcq, srcstride, %1 + EPEL_COMPUTE %2, %1, m4, m5, 1 + PEL_10STORE%1 dstq, m0, m1 + LOOP_END dst, src, srcstride + RET + +cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my + movifnidn myd, mym + movdqa m6, [pw_%2] + sub srcq, srcstrideq + EPEL_FILTER %2, my, m4, m5, r3src + lea r3srcq, [srcstrideq*3] +.loop + EPEL_LOAD %2, srcq, srcstride, %1 + EPEL_COMPUTE %2, %1, m4, m5 + UNI_COMPUTE %1, %2, m0, m1, m6 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += 
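The bi_ variants sum two 14-bit intermediates with saturating paddsw, then round down with the pw_bi_* constant, i.e. one more shift than the uni path. A scalar reference for 8 bits:

#include <stdint.h>

static uint8_t bi_pixel_8(int16_t v1, int16_t v2)
{
    int sum = v1 + v2;                   /* paddsw in the asm           */
    int out = (sum + 64) >> 7;           /* pw_bi_8 = 1<<8 via pmulhrsw */
    return out < 0 ? 0 : out > 255 ? 255 : out;  /* packuswb saturates  */
}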
srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET + + +cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my + movifnidn myd, mym + movdqa m6, [pw_bi_%2] + sub srcq, srcstrideq + EPEL_FILTER %2, my, m4, m5, r3src + lea r3srcq, [srcstrideq*3] +.loop + EPEL_LOAD %2, srcq, srcstride, %1 + EPEL_COMPUTE %2, %1, m4, m5, 1 + SIMPLE_BILOAD %1, src2q, m2, m3 + BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET +%endmacro + + +; ****************************** +; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my, int width) +; ****************************** + +%macro HEVC_PUT_HEVC_EPEL_HV 2 +cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src +%assign %%stride ((%2 + 7)/8) + sub srcq, srcstrideq + EPEL_HV_FILTER %2 + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m8, m1 +%endif + SWAP m4, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m9, m1 +%endif + SWAP m5, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m10, m1 +%endif + SWAP m6, m0 + add srcq, srcstrideq +.loop + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m11, m1 +%endif + SWAP m7, m0 + punpcklwd m0, m4, m5 + punpcklwd m2, m6, m7 +%if %1 > 4 + punpckhwd m1, m4, m5 + punpckhwd m3, m6, m7 +%endif + EPEL_COMPUTE 14, %1, m12, m13 +%if (%1 > 8 && (%2 == 8)) + punpcklwd m4, m8, m9 + punpcklwd m2, m10, m11 + punpckhwd m8, m8, m9 + punpckhwd m3, m10, m11 + EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3 +%if cpuflag(avx2) + vinserti128 m2, m0, xm4, 1 + vperm2i128 m3, m0, m4, q0301 + PEL_10STORE%1 dstq, m2, m3 +%else + PEL_10STORE%1 dstq, m0, m4 +%endif +%else + PEL_10STORE%1 dstq, m0, m1 +%endif + movdqa m4, m5 + movdqa m5, m6 + movdqa m6, m7 +%if (%1 > 8 && (%2 == 8)) + mova m8, m9 + mova m9, m10 + mova m10, m11 +%endif + LOOP_END dst, src, srcstride + RET + +cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src +%assign %%stride ((%2 + 7)/8) + sub srcq, srcstrideq + EPEL_HV_FILTER %2 + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m8, m1 +%endif + SWAP m4, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m9, m1 +%endif + SWAP m5, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m10, m1 +%endif + SWAP m6, m0 + add srcq, srcstrideq +.loop + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m11, m1 +%endif + mova m7, m0 + punpcklwd m0, m4, m5 + punpcklwd m2, m6, m7 +%if %1 > 4 + punpckhwd m1, m4, m5 + punpckhwd m3, m6, m7 +%endif + EPEL_COMPUTE 14, %1, m12, m13 +%if (%1 > 8 && (%2 == 8)) + punpcklwd m4, m8, m9 + punpcklwd m2, m10, m11 + punpckhwd m8, m8, m9 + punpckhwd m3, m10, m11 + EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3 + 
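The hv kernels are separable: three rows are horizontally filtered before the loop, then each iteration filters one new row and runs the 4-tap vertical pass (at 14-bit intermediate depth, hence EPEL_COMPUTE 14) over a sliding window of four intermediates. A scalar sketch of the vertical step, with fv the my-indexed filter row:

#include <stdint.h>

/* vertical 4-tap over four horizontally prefiltered rows (m4..m7) */
static int epel_v_one(const int16_t mid[4][64], int x, const int8_t fv[4])
{
    return fv[0] * mid[0][x] + fv[1] * mid[1][x] +
           fv[2] * mid[2][x] + fv[3] * mid[3][x];
}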
UNI_COMPUTE %1, %2, m0, m4, [pw_%2] +%else + UNI_COMPUTE %1, %2, m0, m1, [pw_%2] +%endif + PEL_%2STORE%1 dstq, m0, m1 + mova m4, m5 + mova m5, m6 + mova m6, m7 +%if (%1 > 8 && (%2 == 8)) + mova m8, m9 + mova m9, m10 + mova m10, m11 +%endif + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET + +cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src +%assign %%stride ((%2 + 7)/8) + sub srcq, srcstrideq + EPEL_HV_FILTER %2 + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m8, m1 +%endif + SWAP m4, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m9, m1 +%endif + SWAP m5, m0 + add srcq, srcstrideq + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m10, m1 +%endif + SWAP m6, m0 + add srcq, srcstrideq +.loop + EPEL_LOAD %2, srcq-%%stride, %%stride, %1 + EPEL_COMPUTE %2, %1, m14, m15 +%if (%1 > 8 && (%2 == 8)) + SWAP m11, m1 +%endif + SWAP m7, m0 + punpcklwd m0, m4, m5 + punpcklwd m2, m6, m7 +%if %1 > 4 + punpckhwd m1, m4, m5 + punpckhwd m3, m6, m7 +%endif + EPEL_COMPUTE 14, %1, m12, m13 +%if (%1 > 8 && (%2 == 8)) + punpcklwd m4, m8, m9 + punpcklwd m2, m10, m11 + punpckhwd m8, m8, m9 + punpckhwd m3, m10, m11 + EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3 + SIMPLE_BILOAD %1, src2q, m8, m3 +%if cpuflag(avx2) + vinserti128 m1, m8, xm3, 1 + vperm2i128 m2, m8, m3, q0301 + BI_COMPUTE %1, %2, m0, m4, m1, m2, [pw_bi_%2] +%else + BI_COMPUTE %1, %2, m0, m4, m8, m3, [pw_bi_%2] +%endif +%else + SIMPLE_BILOAD %1, src2q, m8, m9 + BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2] +%endif + PEL_%2STORE%1 dstq, m0, m4 + mova m4, m5 + mova m5, m6 + mova m6, m7 +%if (%1 > 8 && (%2 == 8)) + mova m8, m9 + mova m9, m10 + mova m10, m11 +%endif + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET +%endmacro + +; ****************************** +; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my, int width) +; ****************************** + +%macro HEVC_PUT_HEVC_QPEL 2 +cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter + QPEL_FILTER %2, mx +.loop + QPEL_H_LOAD %2, srcq, %1, 10 + QPEL_COMPUTE %1, %2, 1 +%if %2 > 8 + packssdw m0, m1 +%endif + PEL_10STORE%1 dstq, m0, m1 + LOOP_END dst, src, srcstride + RET + +cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter + mova m9, [pw_%2] + QPEL_FILTER %2, mx +.loop + QPEL_H_LOAD %2, srcq, %1, 10 + QPEL_COMPUTE %1, %2 +%if %2 > 8 + packssdw m0, m1 +%endif + UNI_COMPUTE %1, %2, m0, m1, m9 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET + +cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter + movdqa m9, [pw_bi_%2] + QPEL_FILTER %2, mx +.loop + QPEL_H_LOAD %2, srcq, %1, 10 + QPEL_COMPUTE %1, %2, 1 +%if %2 > 8 + packssdw m0, m1 +%endif + SIMPLE_BILOAD %1, src2q, m10, m11 + BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst 
+= dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET + + +; ****************************** +; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my, int width) +; ****************************** + +cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter + movifnidn myd, mym + lea r3srcq, [srcstrideq*3] + QPEL_FILTER %2, my +.loop + QPEL_V_LOAD %2, srcq, srcstride, %1, r7 + QPEL_COMPUTE %1, %2, 1 +%if %2 > 8 + packssdw m0, m1 +%endif + PEL_10STORE%1 dstq, m0, m1 + LOOP_END dst, src, srcstride + RET + +cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter + movifnidn myd, mym + movdqa m9, [pw_%2] + lea r3srcq, [srcstrideq*3] + QPEL_FILTER %2, my +.loop + QPEL_V_LOAD %2, srcq, srcstride, %1, r8 + QPEL_COMPUTE %1, %2 +%if %2 > 8 + packssdw m0, m1 +%endif + UNI_COMPUTE %1, %2, m0, m1, m9 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET + +cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter + movifnidn myd, mym + movdqa m9, [pw_bi_%2] + lea r3srcq, [srcstrideq*3] + QPEL_FILTER %2, my +.loop + QPEL_V_LOAD %2, srcq, srcstride, %1, r9 + QPEL_COMPUTE %1, %2, 1 +%if %2 > 8 + packssdw m0, m1 +%endif + SIMPLE_BILOAD %1, src2q, m10, m11 + BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1 + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET +%endmacro + + +; ****************************** +; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride, +; uint8_t *_src, ptrdiff_t _srcstride, +; int height, int mx, int my) +; ****************************** +%macro HEVC_PUT_HEVC_QPEL_HV 2 +cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter +%if cpuflag(avx2) +%assign %%shift 4 +%else +%assign %%shift 3 +%endif + sub mxq, 1 + sub myq, 1 + shl mxq, %%shift ; multiply by 32 + shl myq, %%shift ; multiply by 32 + lea r3srcq, [srcstrideq*3] + sub srcq, r3srcq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m8, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m9, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m10, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m11, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m12, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m13, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m14, m0 + add srcq, srcstrideq +.loop + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m15, m0 + punpcklwd m0, m8, m9 + punpcklwd m2, m10, m11 + punpcklwd m4, m12, m13 + punpcklwd m6, m14, m15 +%if %1 > 4 + punpckhwd m1, m8, m9 + punpckhwd m3, m10, m11 + punpckhwd m5, m12, m13 + punpckhwd m7, m14, m15 +%endif + QPEL_HV_COMPUTE %1, 14, my, ackssdw + PEL_10STORE%1 dstq, m0, m1 +%if 
%1 <= 4 + movq m8, m9 + movq m9, m10 + movq m10, m11 + movq m11, m12 + movq m12, m13 + movq m13, m14 + movq m14, m15 +%else + movdqa m8, m9 + movdqa m9, m10 + movdqa m10, m11 + movdqa m11, m12 + movdqa m12, m13 + movdqa m13, m14 + movdqa m14, m15 +%endif + LOOP_END dst, src, srcstride + RET + +cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter +%if cpuflag(avx2) +%assign %%shift 4 +%else +%assign %%shift 3 +%endif + sub mxq, 1 + sub myq, 1 + shl mxq, %%shift ; multiply by 32 + shl myq, %%shift ; multiply by 32 + lea r3srcq, [srcstrideq*3] + sub srcq, r3srcq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m8, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m9, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m10, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m11, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m12, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m13, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m14, m0 + add srcq, srcstrideq +.loop + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m15, m0 + punpcklwd m0, m8, m9 + punpcklwd m2, m10, m11 + punpcklwd m4, m12, m13 + punpcklwd m6, m14, m15 +%if %1 > 4 + punpckhwd m1, m8, m9 + punpckhwd m3, m10, m11 + punpckhwd m5, m12, m13 + punpckhwd m7, m14, m15 +%endif + QPEL_HV_COMPUTE %1, 14, my, ackusdw + UNI_COMPUTE %1, %2, m0, m1, [pw_%2] + PEL_%2STORE%1 dstq, m0, m1 + +%if %1 <= 4 + movq m8, m9 + movq m9, m10 + movq m10, m11 + movq m11, m12 + movq m12, m13 + movq m13, m14 + movq m14, m15 +%else + mova m8, m9 + mova m9, m10 + mova m10, m11 + mova m11, m12 + mova m12, m13 + mova m13, m14 + mova m14, m15 +%endif + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET + +cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter +%if cpuflag(avx2) +%assign %%shift 4 +%else +%assign %%shift 3 +%endif + sub mxq, 1 + sub myq, 1 + shl mxq, %%shift ; multiply by 32 + shl myq, %%shift ; multiply by 32 + lea r3srcq, [srcstrideq*3] + sub srcq, r3srcq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m8, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m9, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m10, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m11, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m12, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m13, m0 + add srcq, srcstrideq + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m14, m0 + add srcq, srcstrideq +.loop + QPEL_H_LOAD %2, srcq, %1, 15 + QPEL_HV_COMPUTE %1, %2, mx, ackssdw + SWAP m15, m0 + punpcklwd m0, m8, m9 + punpcklwd m2, m10, m11 + punpcklwd m4, m12, m13 + punpcklwd m6, m14, m15 +%if %1 > 4 + punpckhwd m1, m8, m9 + punpckhwd m3, m10, m11 + punpckhwd m5, m12, m13 + punpckhwd m7, 
m14, m15 +%endif + QPEL_HV_COMPUTE %1, 14, my, ackssdw + SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case + BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2] + PEL_%2STORE%1 dstq, m0, m1 + +%if %1 <= 4 + movq m8, m9 + movq m9, m10 + movq m10, m11 + movq m11, m12 + movq m12, m13 + movq m13, m14 + movq m14, m15 +%else + movdqa m8, m9 + movdqa m9, m10 + movdqa m10, m11 + movdqa m11, m12 + movdqa m12, m13 + movdqa m13, m14 + movdqa m14, m15 +%endif + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET +%endmacro + +%macro WEIGHTING_FUNCS 2 +%if WIN64 || ARCH_X86_32 +cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox + mov r4d, denomm +%define SHIFT r4d +%else +cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox +%define SHIFT denomd +%endif + lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom +%if %1 <= 4 + pxor m1, m1 +%endif + movd m2, wxm ; WX + movd m4, SHIFT ; shift +%if %1 <= 4 + punpcklwd m2, m1 +%else + punpcklwd m2, m2 +%endif + dec SHIFT + movdqu m5, [pd_1] + movd m6, SHIFT + pshufd m2, m2, 0 + mov SHIFT, oxm + pslld m5, m6 +%if %2 != 8 + shl SHIFT, %2-8 ; ox << (bitd - 8) +%endif + movd m3, SHIFT ; OX + pshufd m3, m3, 0 +%if WIN64 || ARCH_X86_32 + mov SHIFT, heightm +%endif +.loop + SIMPLE_LOAD %1, 10, srcq, m0 +%if %1 <= 4 + punpcklwd m0, m1 + pmaddwd m0, m2 + paddd m0, m5 + psrad m0, m4 + paddd m0, m3 +%else + pmulhw m6, m0, m2 + pmullw m0, m2 + punpckhwd m1, m0, m6 + punpcklwd m0, m6 + paddd m0, m5 + paddd m1, m5 + psrad m0, m4 + psrad m1, m4 + paddd m0, m3 + paddd m1, m3 +%endif + packssdw m0, m1 +%if %2 == 8 + packuswb m0, m0 +%else + CLIPW m0, [pb_0], [max_pixels_%2] +%endif + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, 2*MAX_PB_SIZE ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + RET + +cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1 + movifnidn r5d, denomm +%if %1 <= 4 + pxor m1, m1 +%endif + movd m2, wx0m ; WX0 + lea r5d, [r5d+14-%2] ; shift = 14 - bitd + denom + movd m3, wx1m ; WX1 + movd m0, r5d ; shift +%if %1 <= 4 + punpcklwd m2, m1 + punpcklwd m3, m1 +%else + punpcklwd m2, m2 + punpcklwd m3, m3 +%endif + inc r5d + movd m5, r5d ; shift+1 + pshufd m2, m2, 0 + mov r5d, ox0m + pshufd m3, m3, 0 + add r5d, ox1m +%if %2 != 8 + shl r5d, %2-8 ; ox << (bitd - 8) +%endif + inc r5d + movd m4, r5d ; offset + pshufd m4, m4, 0 +%if UNIX64 +%define h heightd +%else + mov r5d, heightm +%define h r5d +%endif + pslld m4, m0 + +.loop + SIMPLE_LOAD %1, 10, srcq, m0 + SIMPLE_LOAD %1, 10, src2q, m8 +%if %1 <= 4 + punpcklwd m0, m1 + punpcklwd m8, m1 + pmaddwd m0, m3 + pmaddwd m8, m2 + paddd m0, m4 + paddd m0, m8 + psrad m0, m5 +%else + pmulhw m6, m0, m3 + pmullw m0, m3 + pmulhw m7, m8, m2 + pmullw m8, m2 + punpckhwd m1, m0, m6 + punpcklwd m0, m6 + punpckhwd m9, m8, m7 + punpcklwd m8, m7 + paddd m0, m8 + paddd m1, m9 + paddd m0, m4 + paddd m1, m4 + psrad m0, m5 + psrad m1, m5 +%endif + packssdw m0, m1 +%if %2 == 8 + packuswb m0, m0 +%else + CLIPW m0, [pb_0], [max_pixels_%2] +%endif + PEL_%2STORE%1 dstq, m0, m1 + add dstq, dststrideq ; dst += dststride + add srcq, 2*MAX_PB_SIZE ; src += srcstride + add src2q, 2*MAX_PB_SIZE ; src2 += srcstride + dec h ; cmp height + jnz .loop ; height loop + RET +%endmacro + +INIT_XMM sse4 ; adds ff_ and _sse4 to function name + 
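+; ------------------------------------------------------------------
+; Scalar sketch of what the weighting functions above compute, with
+; shift = denom + 14 - bitd (hypothetical C, following the asm flow):
+;
+;   /* uni_w: explicit weighted uni-prediction; ox added after shift */
+;   dst[x] = av_clip_pixel(((src[x] * wx + (1 << (shift - 1))) >> shift)
+;                          + (ox << (bitd - 8)));
+;
+;   /* bi_w: both sources weighted, one extra bit of right shift */
+;   offset = (((ox0 + ox1) << (bitd - 8)) + 1) << shift;
+;   dst[x] = av_clip_pixel((src[x] * wx1 + src2[x] * wx0 + offset)
+;                          >> (shift + 1));
+; ------------------------------------------------------------------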
+WEIGHTING_FUNCS 2, 8 +WEIGHTING_FUNCS 4, 8 +WEIGHTING_FUNCS 6, 8 +WEIGHTING_FUNCS 8, 8 + +WEIGHTING_FUNCS 2, 10 +WEIGHTING_FUNCS 4, 10 +WEIGHTING_FUNCS 6, 10 +WEIGHTING_FUNCS 8, 10 + +WEIGHTING_FUNCS 2, 12 +WEIGHTING_FUNCS 4, 12 +WEIGHTING_FUNCS 6, 12 +WEIGHTING_FUNCS 8, 12 + +HEVC_PUT_HEVC_PEL_PIXELS 2, 8 +HEVC_PUT_HEVC_PEL_PIXELS 4, 8 +HEVC_PUT_HEVC_PEL_PIXELS 6, 8 +HEVC_PUT_HEVC_PEL_PIXELS 8, 8 +HEVC_PUT_HEVC_PEL_PIXELS 12, 8 +HEVC_PUT_HEVC_PEL_PIXELS 16, 8 + +HEVC_PUT_HEVC_PEL_PIXELS 2, 10 +HEVC_PUT_HEVC_PEL_PIXELS 4, 10 +HEVC_PUT_HEVC_PEL_PIXELS 6, 10 +HEVC_PUT_HEVC_PEL_PIXELS 8, 10 + +HEVC_PUT_HEVC_PEL_PIXELS 2, 12 +HEVC_PUT_HEVC_PEL_PIXELS 4, 12 +HEVC_PUT_HEVC_PEL_PIXELS 6, 12 +HEVC_PUT_HEVC_PEL_PIXELS 8, 12 + +HEVC_PUT_HEVC_EPEL 2, 8 +HEVC_PUT_HEVC_EPEL 4, 8 +HEVC_PUT_HEVC_EPEL 6, 8 +HEVC_PUT_HEVC_EPEL 8, 8 +HEVC_PUT_HEVC_EPEL 12, 8 +HEVC_PUT_HEVC_EPEL 16, 8 + + +HEVC_PUT_HEVC_EPEL 2, 10 +HEVC_PUT_HEVC_EPEL 4, 10 +HEVC_PUT_HEVC_EPEL 6, 10 +HEVC_PUT_HEVC_EPEL 8, 10 + +HEVC_PUT_HEVC_EPEL 2, 12 +HEVC_PUT_HEVC_EPEL 4, 12 +HEVC_PUT_HEVC_EPEL 6, 12 +HEVC_PUT_HEVC_EPEL 8, 12 + +HEVC_PUT_HEVC_EPEL_HV 2, 8 +HEVC_PUT_HEVC_EPEL_HV 4, 8 +HEVC_PUT_HEVC_EPEL_HV 6, 8 +HEVC_PUT_HEVC_EPEL_HV 8, 8 +HEVC_PUT_HEVC_EPEL_HV 16, 8 + +HEVC_PUT_HEVC_EPEL_HV 2, 10 +HEVC_PUT_HEVC_EPEL_HV 4, 10 +HEVC_PUT_HEVC_EPEL_HV 6, 10 +HEVC_PUT_HEVC_EPEL_HV 8, 10 + +HEVC_PUT_HEVC_EPEL_HV 2, 12 +HEVC_PUT_HEVC_EPEL_HV 4, 12 +HEVC_PUT_HEVC_EPEL_HV 6, 12 +HEVC_PUT_HEVC_EPEL_HV 8, 12 + +HEVC_PUT_HEVC_QPEL 4, 8 +HEVC_PUT_HEVC_QPEL 8, 8 +HEVC_PUT_HEVC_QPEL 12, 8 +HEVC_PUT_HEVC_QPEL 16, 8 + +HEVC_PUT_HEVC_QPEL 4, 10 +HEVC_PUT_HEVC_QPEL 8, 10 + +HEVC_PUT_HEVC_QPEL 4, 12 +HEVC_PUT_HEVC_QPEL 8, 12 + +HEVC_PUT_HEVC_QPEL_HV 2, 8 +HEVC_PUT_HEVC_QPEL_HV 4, 8 +HEVC_PUT_HEVC_QPEL_HV 6, 8 +HEVC_PUT_HEVC_QPEL_HV 8, 8 + +HEVC_PUT_HEVC_QPEL_HV 2, 10 +HEVC_PUT_HEVC_QPEL_HV 4, 10 +HEVC_PUT_HEVC_QPEL_HV 6, 10 +HEVC_PUT_HEVC_QPEL_HV 8, 10 + +HEVC_PUT_HEVC_QPEL_HV 2, 12 +HEVC_PUT_HEVC_QPEL_HV 4, 12 +HEVC_PUT_HEVC_QPEL_HV 6, 12 +HEVC_PUT_HEVC_QPEL_HV 8, 12 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0 + +HEVC_PUT_HEVC_PEL_PIXELS 32, 8 +HEVC_PUT_HEVC_PEL_PIXELS 16, 10 + +HEVC_PUT_HEVC_EPEL 32, 8 +HEVC_PUT_HEVC_EPEL 16, 10 + +HEVC_PUT_HEVC_EPEL_HV 16, 10 +HEVC_PUT_HEVC_EPEL_HV 32, 8 + +HEVC_PUT_HEVC_QPEL 32, 8 + +HEVC_PUT_HEVC_QPEL 16, 10 + +HEVC_PUT_HEVC_QPEL_HV 16, 10 + +%endif ;AVX2 +%endif ; ARCH_X86_64 diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm new file mode 100644 index 0000000000..dc3e88a373 --- /dev/null +++ b/libavcodec/x86/hevc_res_add.asm @@ -0,0 +1,388 @@ +; /* +; * Provide SIMD optimizations for transform_add functions for HEVC decoding +; * Copyright (c) 2014 Pierre-Edouard LEPERE +; * +; * This file is part of FFmpeg. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. 
+; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +SECTION .text + +cextern pw_1023 +%define max_pixels_10 pw_1023 + + +;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file +%macro TR_ADD_MMX_4_8 0 + mova m2, [r1] + mova m4, [r1+8] + pxor m3, m3 + psubw m3, m2 + packuswb m2, m2 + packuswb m3, m3 + pxor m5, m5 + psubw m5, m4 + packuswb m4, m4 + packuswb m5, m5 + + movh m0, [r0 ] + movh m1, [r0+r2 ] + paddusb m0, m2 + paddusb m1, m4 + psubusb m0, m3 + psubusb m1, m5 + movh [r0 ], m0 + movh [r0+r2 ], m1 +%endmacro + + +INIT_MMX mmxext +; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +cglobal hevc_transform_add4_8, 3, 4, 6 + TR_ADD_MMX_4_8 + add r1, 16 + lea r0, [r0+r2*2] + TR_ADD_MMX_4_8 + RET + +%macro TR_ADD_SSE_8_8 0 + pxor m3, m3 + mova m4, [r1] + mova m6, [r1+16] + mova m0, [r1+32] + mova m2, [r1+48] + psubw m5, m3, m4 + psubw m7, m3, m6 + psubw m1, m3, m0 + packuswb m4, m0 + packuswb m5, m1 + psubw m3, m2 + packuswb m6, m2 + packuswb m7, m3 + + movq m0, [r0 ] + movq m1, [r0+r2 ] + movhps m0, [r0+r2*2] + movhps m1, [r0+r3 ] + paddusb m0, m4 + paddusb m1, m6 + psubusb m0, m5 + psubusb m1, m7 + movq [r0 ], m0 + movq [r0+r2 ], m1 + movhps [r0+2*r2], m0 + movhps [r0+r3 ], m1 +%endmacro + +%macro TR_ADD_SSE_16_32_8 3 + mova xm2, [r1+%1 ] + mova xm6, [r1+%1+16] +%if cpuflag(avx2) + vinserti128 m2, m2, [r1+%1+32], 1 + vinserti128 m6, m6, [r1+%1+48], 1 +%endif +%if cpuflag(avx) + psubw m1, m0, m2 + psubw m5, m0, m6 +%else + mova m1, m0 + mova m5, m0 + psubw m1, m2 + psubw m5, m6 +%endif + packuswb m2, m6 + packuswb m1, m5 + + mova xm4, [r1+%1+mmsize*2 ] + mova xm6, [r1+%1+mmsize*2+16] +%if cpuflag(avx2) + vinserti128 m4, m4, [r1+%1+96 ], 1 + vinserti128 m6, m6, [r1+%1+112], 1 +%endif +%if cpuflag(avx) + psubw m3, m0, m4 + psubw m5, m0, m6 +%else + mova m3, m0 + mova m5, m0 + psubw m3, m4 + psubw m5, m6 +%endif + packuswb m4, m6 + packuswb m3, m5 + + paddusb m2, [%2] + paddusb m4, [%3] + psubusb m2, m1 + psubusb m4, m3 + mova [%2], m2 + mova [%3], m4 +%endmacro + + +%macro TRANSFORM_ADD_8 0 +; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +cglobal hevc_transform_add8_8, 3, 4, 8 + lea r3, [r2*3] + TR_ADD_SSE_8_8 + add r1, 64 + lea r0, [r0+r2*4] + TR_ADD_SSE_8_8 + RET + +; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +cglobal hevc_transform_add16_8, 3, 4, 7 + pxor m0, m0 + lea r3, [r2*3] + TR_ADD_SSE_16_32_8 0, r0, r0+r2 + TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3 +%rep 3 + add r1, 128 + lea r0, [r0+r2*4] + TR_ADD_SSE_16_32_8 0, r0, r0+r2 + TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3 +%endrep + RET + +; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +cglobal hevc_transform_add32_8, 3, 4, 7 + pxor m0, m0 + TR_ADD_SSE_16_32_8 0, r0, r0+16 + TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16 +%rep 15 + add r1, 128 + lea r0, [r0+r2*2] + TR_ADD_SSE_16_32_8 0, r0, r0+16 + TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16 +%endrep + RET +%endmacro + +INIT_XMM sse2 +TRANSFORM_ADD_8 +INIT_XMM avx +TRANSFORM_ADD_8 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +cglobal hevc_transform_add32_8, 3, 4, 7 + pxor m0, m0 + lea r3, [r2*3] + 
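+; All TR_ADD_* variants rely on the same x264-inspired trick: the
+; signed residual r is split into r+ = max(r, 0) and r- = max(-r, 0)
+; before packing to bytes, because for unsigned-saturating byte ops
+;   psubusb(paddusb(dst, r+), r-) == av_clip_uint8(dst + r)
+; so the clipped add runs on a full register of packed pixels with
+; no widening back to 16 bits.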
TR_ADD_SSE_16_32_8 0, r0, r0+r2 + TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3 +%rep 7 + add r1, 256 + lea r0, [r0+r2*4] + TR_ADD_SSE_16_32_8 0, r0, r0+r2 + TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3 +%endrep + RET +%endif + +;----------------------------------------------------------------------------- +; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride) +;----------------------------------------------------------------------------- +%macro TR_ADD_SSE_8_10 4 + mova m0, [%4] + mova m1, [%4+16] + mova m2, [%4+32] + mova m3, [%4+48] + paddw m0, [%1+0 ] + paddw m1, [%1+%2 ] + paddw m2, [%1+%2*2] + paddw m3, [%1+%3 ] + CLIPW m0, m4, m5 + CLIPW m1, m4, m5 + CLIPW m2, m4, m5 + CLIPW m3, m4, m5 + mova [%1+0 ], m0 + mova [%1+%2 ], m1 + mova [%1+%2*2], m2 + mova [%1+%3 ], m3 +%endmacro + +%macro TR_ADD_MMX4_10 3 + mova m0, [%1+0 ] + mova m1, [%1+%2 ] + paddw m0, [%3] + paddw m1, [%3+8] + CLIPW m0, m2, m3 + CLIPW m1, m2, m3 + mova [%1+0 ], m0 + mova [%1+%2 ], m1 +%endmacro + +%macro TRANS_ADD_SSE_16_10 3 + mova m0, [%3] + mova m1, [%3+16] + mova m2, [%3+32] + mova m3, [%3+48] + paddw m0, [%1 ] + paddw m1, [%1+16 ] + paddw m2, [%1+%2 ] + paddw m3, [%1+%2+16] + CLIPW m0, m4, m5 + CLIPW m1, m4, m5 + CLIPW m2, m4, m5 + CLIPW m3, m4, m5 + mova [%1 ], m0 + mova [%1+16 ], m1 + mova [%1+%2 ], m2 + mova [%1+%2+16], m3 +%endmacro + +%macro TRANS_ADD_SSE_32_10 2 + mova m0, [%2] + mova m1, [%2+16] + mova m2, [%2+32] + mova m3, [%2+48] + + paddw m0, [%1 ] + paddw m1, [%1+16] + paddw m2, [%1+32] + paddw m3, [%1+48] + CLIPW m0, m4, m5 + CLIPW m1, m4, m5 + CLIPW m2, m4, m5 + CLIPW m3, m4, m5 + mova [%1 ], m0 + mova [%1+16], m1 + mova [%1+32], m2 + mova [%1+48], m3 +%endmacro + +%macro TRANS_ADD16_AVX2 4 + mova m0, [%4] + mova m1, [%4+32] + mova m2, [%4+64] + mova m3, [%4+96] + + paddw m0, [%1+0 ] + paddw m1, [%1+%2 ] + paddw m2, [%1+%2*2] + paddw m3, [%1+%3 ] + + CLIPW m0, m4, m5 + CLIPW m1, m4, m5 + CLIPW m2, m4, m5 + CLIPW m3, m4, m5 + mova [%1+0 ], m0 + mova [%1+%2 ], m1 + mova [%1+%2*2], m2 + mova [%1+%3 ], m3 +%endmacro + +%macro TRANS_ADD32_AVX2 3 + mova m0, [%3] + mova m1, [%3+32] + mova m2, [%3+64] + mova m3, [%3+96] + + paddw m0, [%1 ] + paddw m1, [%1+32 ] + paddw m2, [%1+%2 ] + paddw m3, [%1+%2+32] + + CLIPW m0, m4, m5 + CLIPW m1, m4, m5 + CLIPW m2, m4, m5 + CLIPW m3, m4, m5 + mova [%1 ], m0 + mova [%1+32 ], m1 + mova [%1+%2 ], m2 + mova [%1+%2+32], m3 +%endmacro + + +INIT_MMX mmxext +cglobal hevc_transform_add4_10,3,4, 6 + pxor m2, m2 + mova m3, [max_pixels_10] + TR_ADD_MMX4_10 r0, r2, r1 + add r1, 16 + lea r0, [r0+2*r2] + TR_ADD_MMX4_10 r0, r2, r1 + RET + +;----------------------------------------------------------------------------- +; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal hevc_transform_add8_10,3,4,6 + pxor m4, m4 + mova m5, [max_pixels_10] + lea r3, [r2*3] + + TR_ADD_SSE_8_10 r0, r2, r3, r1 + lea r0, [r0+r2*4] + add r1, 64 + TR_ADD_SSE_8_10 r0, r2, r3, r1 + RET + +cglobal hevc_transform_add16_10,3,4,6 + pxor m4, m4 + mova m5, [max_pixels_10] + + TRANS_ADD_SSE_16_10 r0, r2, r1 +%rep 7 + lea r0, [r0+r2*2] + add r1, 64 + TRANS_ADD_SSE_16_10 r0, r2, r1 +%endrep + RET + +cglobal hevc_transform_add32_10,3,4,6 + pxor m4, m4 + mova m5, [max_pixels_10] + + TRANS_ADD_SSE_32_10 r0, r1 +%rep 31 + lea r0, [r0+r2] + add r1, 64 + TRANS_ADD_SSE_32_10 r0, r1 +%endrep + RET + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 + +cglobal hevc_transform_add16_10,3,4,6 + pxor m4, m4 + mova 
m5, [max_pixels_10] + lea r3, [r2*3] + + TRANS_ADD16_AVX2 r0, r2, r3, r1 +%rep 3 + lea r0, [r0+r2*4] + add r1, 128 + TRANS_ADD16_AVX2 r0, r2, r3, r1 +%endrep + RET + +cglobal hevc_transform_add32_10,3,4,6 + pxor m4, m4 + mova m5, [max_pixels_10] + + TRANS_ADD32_AVX2 r0, r2, r1 +%rep 15 + lea r0, [r0+r2*2] + add r1, 128 + TRANS_ADD32_AVX2 r0, r2, r1 +%endrep + RET +%endif ;HAVE_AVX_EXTERNAL diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm new file mode 100644 index 0000000000..86ef847ba2 --- /dev/null +++ b/libavcodec/x86/hevc_sao.asm @@ -0,0 +1,624 @@ +;****************************************************************************** +;* SIMD optimized SAO functions for HEVC decoding +;* +;* Copyright (c) 2013 Pierre-Edouard LEPERE +;* Copyright (c) 2014 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pw_mask10: times 16 dw 0x03FF +pw_mask12: times 16 dw 0x0FFF +pw_m2: times 16 dw -2 +pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1 +cextern pw_m1 +cextern pw_1 +cextern pw_2 +cextern pb_1 +cextern pb_2 + +SECTION_TEXT + +%define MAX_PB_SIZE 64 +%define PADDING_SIZE 32 ; FF_INPUT_BUFFER_PADDING_SIZE + +;****************************************************************************** +;SAO Band Filter +;****************************************************************************** + +%macro HEVC_SAO_BAND_FILTER_INIT 1 + and leftq, 31 + movd xm0, leftd + add leftq, 1 + and leftq, 31 + movd xm1, leftd + add leftq, 1 + and leftq, 31 + movd xm2, leftd + add leftq, 1 + and leftq, 31 + movd xm3, leftd + + SPLATW m0, xm0 + SPLATW m1, xm1 + SPLATW m2, xm2 + SPLATW m3, xm3 +%if mmsize > 16 + SPLATW m4, [offsetq + 2] + SPLATW m5, [offsetq + 4] + SPLATW m6, [offsetq + 6] + SPLATW m7, [offsetq + 8] +%else + movq m7, [offsetq + 2] + SPLATW m4, m7, 0 + SPLATW m5, m7, 1 + SPLATW m6, m7, 2 + SPLATW m7, m7, 3 +%endif + +%if ARCH_X86_64 +%if %1 > 8 + mova m13, [pw_mask %+ %1] +%endif + pxor m14, m14 + +%else ; ARCH_X86_32 + mova [rsp+mmsize*0], m0 + mova [rsp+mmsize*1], m1 + mova [rsp+mmsize*2], m2 + mova [rsp+mmsize*3], m3 + mova [rsp+mmsize*4], m4 + mova [rsp+mmsize*5], m5 + mova [rsp+mmsize*6], m6 + pxor m0, m0 +%if %1 > 8 + mova m1, [pw_mask %+ %1] +%endif + %assign MMSIZE mmsize + %define m14 m0 + %define m13 m1 + %define m9 m2 + %define m8 m3 +%endif ; ARCH +DEFINE_ARGS dst, src, dststride, srcstride, offset, height + mov heightd, r7m +%endmacro + +%macro HEVC_SAO_BAND_FILTER_COMPUTE 3 + psraw %2, %3, %1-5 +%if ARCH_X86_64 + pcmpeqw m10, %2, m0 + pcmpeqw m11, %2, m1 + pcmpeqw m12, %2, m2 + pcmpeqw %2, m3 + pand m10, m4 + 
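+; band = sample >> (bitd - 5); each pcmpeqw/pand pair keeps offset k
+; only in lanes whose band equals (sao_left_class + k) & 31, and the
+; por/paddw below merge the (at most one) matching offset per lane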
pand m11, m5 + pand m12, m6 + pand %2, m7 + por m10, m11 + por m12, %2 + por m10, m12 + paddw %3, m10 +%else ; ARCH_X86_32 + pcmpeqw m4, %2, [rsp+MMSIZE*0] + pcmpeqw m5, %2, [rsp+MMSIZE*1] + pcmpeqw m6, %2, [rsp+MMSIZE*2] + pcmpeqw %2, [rsp+MMSIZE*3] + pand m4, [rsp+MMSIZE*4] + pand m5, [rsp+MMSIZE*5] + pand m6, [rsp+MMSIZE*6] + pand %2, m7 + por m4, m5 + por m6, %2 + por m4, m6 + paddw %3, m4 +%endif ; ARCH +%endmacro + +;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +; int16_t *sao_offset_val, int sao_left_class, int width, int height); +%macro HEVC_SAO_BAND_FILTER_8 2 +cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left + HEVC_SAO_BAND_FILTER_INIT 8 + +align 16 +.loop +%if %1 == 8 + movq m8, [srcq] + punpcklbw m8, m14 + HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m8 + packuswb m8, m14 + movq [dstq], m8 +%endif ; %1 == 8 + +%assign i 0 +%rep %2 + mova m13, [srcq + i] + punpcklbw m8, m13, m14 + HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m8 + punpckhbw m13, m14 + HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m13 + packuswb m8, m13 + mova [dstq + i], m8 +%assign i i+mmsize +%endrep + +%if %1 == 48 +INIT_XMM cpuname + + mova m13, [srcq + i] + punpcklbw m8, m13, m14 + HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m8 + punpckhbw m13, m14 + HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m13 + packuswb m8, m13 + mova [dstq + i], m8 +%if cpuflag(avx2) +INIT_YMM cpuname +%endif +%endif ; %1 == 48 + + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride + dec heightd ; cmp height + jnz .loop ; height loop + REP_RET +%endmacro + +;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +; int16_t *sao_offset_val, int sao_left_class, int width, int height); +%macro HEVC_SAO_BAND_FILTER_16 3 +cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left + HEVC_SAO_BAND_FILTER_INIT %1 + +align 16 +.loop +%if %2 == 8 + movu m8, [srcq] + HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8 + CLIPW m8, m14, m13 + movu [dstq], m8 +%endif + +%assign i 0 +%rep %3 + mova m8, [srcq + i] + HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8 + CLIPW m8, m14, m13 + mova [dstq + i], m8 + + mova m9, [srcq + i + mmsize] + HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9 + CLIPW m9, m14, m13 + mova [dstq + i + mmsize], m9 +%assign i i+mmsize*2 +%endrep + +%if %2 == 48 +INIT_XMM cpuname + mova m8, [srcq + i] + HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8 + CLIPW m8, m14, m13 + mova [dstq + i], m8 + + mova m9, [srcq + i + mmsize] + HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9 + CLIPW m9, m14, m13 + mova [dstq + i + mmsize], m9 +%if cpuflag(avx2) +INIT_YMM cpuname +%endif +%endif ; %1 == 48 + + add dstq, dststrideq + add srcq, srcstrideq + dec heightd + jg .loop + REP_RET +%endmacro + +%macro HEVC_SAO_BAND_FILTER_FUNCS 0 +HEVC_SAO_BAND_FILTER_8 8, 0 +HEVC_SAO_BAND_FILTER_8 16, 1 +HEVC_SAO_BAND_FILTER_8 32, 2 +HEVC_SAO_BAND_FILTER_8 48, 2 +HEVC_SAO_BAND_FILTER_8 64, 4 + +HEVC_SAO_BAND_FILTER_16 10, 8, 0 +HEVC_SAO_BAND_FILTER_16 10, 16, 1 +HEVC_SAO_BAND_FILTER_16 10, 32, 2 +HEVC_SAO_BAND_FILTER_16 10, 48, 2 +HEVC_SAO_BAND_FILTER_16 10, 64, 4 + +HEVC_SAO_BAND_FILTER_16 12, 8, 0 +HEVC_SAO_BAND_FILTER_16 12, 16, 1 +HEVC_SAO_BAND_FILTER_16 12, 32, 2 +HEVC_SAO_BAND_FILTER_16 12, 48, 2 +HEVC_SAO_BAND_FILTER_16 12, 64, 4 +%endmacro + +INIT_XMM sse2 +HEVC_SAO_BAND_FILTER_FUNCS +INIT_XMM avx +HEVC_SAO_BAND_FILTER_FUNCS + +%if 
HAVE_AVX2_EXTERNAL +INIT_XMM avx2 +HEVC_SAO_BAND_FILTER_8 8, 0 +HEVC_SAO_BAND_FILTER_8 16, 1 +INIT_YMM avx2 +HEVC_SAO_BAND_FILTER_8 32, 1 +HEVC_SAO_BAND_FILTER_8 48, 1 +HEVC_SAO_BAND_FILTER_8 64, 2 + +INIT_XMM avx2 +HEVC_SAO_BAND_FILTER_16 10, 8, 0 +HEVC_SAO_BAND_FILTER_16 10, 16, 1 +INIT_YMM avx2 +HEVC_SAO_BAND_FILTER_16 10, 32, 1 +HEVC_SAO_BAND_FILTER_16 10, 48, 1 +HEVC_SAO_BAND_FILTER_16 10, 64, 2 + +INIT_XMM avx2 +HEVC_SAO_BAND_FILTER_16 12, 8, 0 +HEVC_SAO_BAND_FILTER_16 12, 16, 1 +INIT_YMM avx2 +HEVC_SAO_BAND_FILTER_16 12, 32, 1 +HEVC_SAO_BAND_FILTER_16 12, 48, 1 +HEVC_SAO_BAND_FILTER_16 12, 64, 2 +%endif + +;****************************************************************************** +;SAO Edge Filter +;****************************************************************************** + +%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE + +%macro HEVC_SAO_EDGE_FILTER_INIT 1 +%if WIN64 + movsxd eoq, dword eom +%elif ARCH_X86_64 + movsxd eoq, eod +%else + mov eoq, r4m +%endif + lea tmp2q, [pb_eo] + movsx a_strideq, byte [tmp2q+eoq*4+1] + movsx b_strideq, byte [tmp2q+eoq*4+3] + imul a_strideq, EDGE_SRCSTRIDE>>%1 + imul b_strideq, EDGE_SRCSTRIDE>>%1 + movsx tmpq, byte [tmp2q+eoq*4] + add a_strideq, tmpq + movsx tmpq, byte [tmp2q+eoq*4+2] + add b_strideq, tmpq +%endmacro + +%macro HEVC_SAO_EDGE_FILTER_COMPUTE_8 1 + pminub m4, m1, m2 + pminub m5, m1, m3 + pcmpeqb m2, m4 + pcmpeqb m3, m5 + pcmpeqb m4, m1 + pcmpeqb m5, m1 + psubb m4, m2 + psubb m5, m3 + paddb m4, m6 + paddb m4, m5 + + pshufb m2, m0, m4 +%if %1 > 8 + punpckhbw m5, m7, m1 + punpckhbw m4, m2, m7 + punpcklbw m3, m7, m1 + punpcklbw m2, m7 + pmaddubsw m5, m4 + pmaddubsw m3, m2 + packuswb m3, m5 +%else + punpcklbw m3, m7, m1 + punpcklbw m2, m7 + pmaddubsw m3, m2 + packuswb m3, m3 +%endif +%endmacro + +;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, +; int eo, int width, int height); +%macro HEVC_SAO_EDGE_FILTER_8 2-3 +%if ARCH_X86_64 +cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp +%define tmp2q heightq + HEVC_SAO_EDGE_FILTER_INIT 0 + mov heightd, r6m + +%else ; ARCH_X86_32 +cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height +%define eoq srcq +%define tmpq heightq +%define tmp2q dststrideq +%define offsetq heightq + HEVC_SAO_EDGE_FILTER_INIT 0 + mov srcq, srcm + mov offsetq, r3m + mov dststrideq, dststridem +%endif ; ARCH + +%if mmsize > 16 + vbroadcasti128 m0, [offsetq] +%else + movu m0, [offsetq] +%endif + mova m1, [pb_edge_shuffle] + packsswb m0, m0 + mova m7, [pb_1] + pshufb m0, m1 + mova m6, [pb_2] +%if ARCH_X86_32 + mov heightd, r6m +%endif + +align 16 +.loop: + +%if %1 == 8 + movq m1, [srcq] + movq m2, [srcq + a_strideq] + movq m3, [srcq + b_strideq] + HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1 + movq [dstq], m3 +%endif + +%assign i 0 +%rep %2 + mova m1, [srcq + i] + movu m2, [srcq + a_strideq + i] + movu m3, [srcq + b_strideq + i] + HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1 + mov%3 [dstq + i], m3 +%assign i i+mmsize +%endrep + +%if %1 == 48 +INIT_XMM cpuname + + mova m1, [srcq + i] + movu m2, [srcq + a_strideq + i] + movu m3, [srcq + b_strideq + i] + HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1 + mova [dstq + i], m3 +%if cpuflag(avx2) +INIT_YMM cpuname +%endif +%endif + + add dstq, dststrideq + add srcq, EDGE_SRCSTRIDE + dec heightd + jg .loop + RET +%endmacro + +%macro PMINUW 4 +%if cpuflag(sse4) + pminuw %1, %2, %3 +%else + psubusw %4, %2, %3 + psubw %1, %2, %4 +%endif 
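+; psubusw yields max(a - b, 0), and a - max(a - b, 0) == min(a, b)
+; for unsigned words, so pre-SSE4.1 targets get pminuw in two ops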
+%endmacro + +%macro HEVC_SAO_EDGE_FILTER_COMPUTE_10 0 + PMINUW m4, m1, m2, m6 + PMINUW m5, m1, m3, m7 + pcmpeqw m2, m4 + pcmpeqw m3, m5 + pcmpeqw m4, m1 + pcmpeqw m5, m1 + psubw m4, m2 + psubw m5, m3 + + paddw m4, m5 + pcmpeqw m2, m4, [pw_m2] +%if ARCH_X86_64 + pcmpeqw m3, m4, m13 + pcmpeqw m5, m4, m0 + pcmpeqw m6, m4, m14 + pcmpeqw m7, m4, m15 + pand m2, m8 + pand m3, m9 + pand m5, m10 + pand m6, m11 + pand m7, m12 +%else + pcmpeqw m3, m4, [pw_m1] + pcmpeqw m5, m4, m0 + pcmpeqw m6, m4, [pw_1] + pcmpeqw m7, m4, [pw_2] + pand m2, [rsp+MMSIZE*0] + pand m3, [rsp+MMSIZE*1] + pand m5, [rsp+MMSIZE*2] + pand m6, [rsp+MMSIZE*3] + pand m7, [rsp+MMSIZE*4] +%endif + paddw m2, m3 + paddw m5, m6 + paddw m2, m7 + paddw m2, m1 + paddw m2, m5 +%endmacro + +;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, +; int eo, int width, int height); +%macro HEVC_SAO_EDGE_FILTER_16 3 +%if ARCH_X86_64 +cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp +%define tmp2q heightq + HEVC_SAO_EDGE_FILTER_INIT 1 + mov heightd, r6m + add a_strideq, a_strideq + add b_strideq, b_strideq + +%else ; ARCH_X86_32 +cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height +%assign MMSIZE mmsize +%define eoq srcq +%define tmpq heightq +%define tmp2q dststrideq +%define offsetq heightq +%define m8 m1 +%define m9 m2 +%define m10 m3 +%define m11 m4 +%define m12 m5 + HEVC_SAO_EDGE_FILTER_INIT 1 + mov srcq, srcm + mov offsetq, r3m + mov dststrideq, dststridem + add a_strideq, a_strideq + add b_strideq, b_strideq + +%endif ; ARCH + +%if cpuflag(avx2) + SPLATW m8, [offsetq+2] + SPLATW m9, [offsetq+4] + SPLATW m10, [offsetq+0] + SPLATW m11, [offsetq+6] + SPLATW m12, [offsetq+8] +%else + movq m10, [offsetq+0] + movd m12, [offsetq+6] + SPLATW m8, xm10, 1 + SPLATW m9, xm10, 2 + SPLATW m10, xm10, 0 + SPLATW m11, xm12, 0 + SPLATW m12, xm12, 1 +%endif + pxor m0, m0 +%if ARCH_X86_64 + mova m13, [pw_m1] + mova m14, [pw_1] + mova m15, [pw_2] +%else + mov heightd, r6m + mova [rsp+mmsize*0], m8 + mova [rsp+mmsize*1], m9 + mova [rsp+mmsize*2], m10 + mova [rsp+mmsize*3], m11 + mova [rsp+mmsize*4], m12 +%endif + +align 16 +.loop + +%if %2 == 8 + mova m1, [srcq] + movu m2, [srcq+a_strideq] + movu m3, [srcq+b_strideq] + + HEVC_SAO_EDGE_FILTER_COMPUTE_10 + CLIPW m2, m0, [pw_mask %+ %1] + movu [dstq], m2 +%endif + +%assign i 0 +%rep %3 + mova m1, [srcq + i] + movu m2, [srcq+a_strideq + i] + movu m3, [srcq+b_strideq + i] + HEVC_SAO_EDGE_FILTER_COMPUTE_10 + CLIPW m2, m0, [pw_mask %+ %1] + mova [dstq + i], m2 + + mova m1, [srcq + i + mmsize] + movu m2, [srcq+a_strideq + i + mmsize] + movu m3, [srcq+b_strideq + i + mmsize] + HEVC_SAO_EDGE_FILTER_COMPUTE_10 + CLIPW m2, m0, [pw_mask %+ %1] + mova [dstq + i + mmsize], m2 +%assign i i+mmsize*2 +%endrep + +%if %2 == 48 +INIT_XMM cpuname + mova m1, [srcq + i] + movu m2, [srcq+a_strideq + i] + movu m3, [srcq+b_strideq + i] + HEVC_SAO_EDGE_FILTER_COMPUTE_10 + CLIPW m2, m0, [pw_mask %+ %1] + mova [dstq + i], m2 + + mova m1, [srcq + i + mmsize] + movu m2, [srcq+a_strideq + i + mmsize] + movu m3, [srcq+b_strideq + i + mmsize] + HEVC_SAO_EDGE_FILTER_COMPUTE_10 + CLIPW m2, m0, [pw_mask %+ %1] + mova [dstq + i + mmsize], m2 +%if cpuflag(avx2) +INIT_YMM cpuname +%endif +%endif + + add dstq, dststrideq + add srcq, EDGE_SRCSTRIDE + dec heightd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +HEVC_SAO_EDGE_FILTER_8 8, 0 +HEVC_SAO_EDGE_FILTER_8 16, 1, 
a +HEVC_SAO_EDGE_FILTER_8 32, 2, a +HEVC_SAO_EDGE_FILTER_8 48, 2, a +HEVC_SAO_EDGE_FILTER_8 64, 4, a + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +HEVC_SAO_EDGE_FILTER_8 32, 1, a +HEVC_SAO_EDGE_FILTER_8 48, 1, u +HEVC_SAO_EDGE_FILTER_8 64, 2, a +%endif + +INIT_XMM sse2 +HEVC_SAO_EDGE_FILTER_16 10, 8, 0 +HEVC_SAO_EDGE_FILTER_16 10, 16, 1 +HEVC_SAO_EDGE_FILTER_16 10, 32, 2 +HEVC_SAO_EDGE_FILTER_16 10, 48, 2 +HEVC_SAO_EDGE_FILTER_16 10, 64, 4 + +HEVC_SAO_EDGE_FILTER_16 12, 8, 0 +HEVC_SAO_EDGE_FILTER_16 12, 16, 1 +HEVC_SAO_EDGE_FILTER_16 12, 32, 2 +HEVC_SAO_EDGE_FILTER_16 12, 48, 2 +HEVC_SAO_EDGE_FILTER_16 12, 64, 4 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +HEVC_SAO_EDGE_FILTER_16 10, 32, 1 +HEVC_SAO_EDGE_FILTER_16 10, 48, 1 +HEVC_SAO_EDGE_FILTER_16 10, 64, 2 + +HEVC_SAO_EDGE_FILTER_16 12, 32, 1 +HEVC_SAO_EDGE_FILTER_16 12, 48, 1 +HEVC_SAO_EDGE_FILTER_16 12, 64, 2 +%endif diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h new file mode 100644 index 0000000000..ad8168fb5b --- /dev/null +++ b/libavcodec/x86/hevcdsp.h @@ -0,0 +1,261 @@ +/* + * HEVC video decoder + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere + * + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_HEVCDSP_H +#define AVCODEC_X86_HEVCDSP_H + +#include <stddef.h> +#include <stdint.h> + + +#define idct_dc_proto(size, bitd, opt) \ + void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) + +#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \ +dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \ +dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \ +dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \ +dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \ +dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt + + +#define PEL_PROTOTYPE(name, D, opt) \ +void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \ +void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \ +void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \ +void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \ +void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t 
*_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width) + + +/////////////////////////////////////////////////////////////////////////////// +// MC functions +/////////////////////////////////////////////////////////////////////////////// + +#define EPEL_PROTOTYPES(fname, bitd, opt) \ + PEL_PROTOTYPE(fname##4, bitd, opt); \ + PEL_PROTOTYPE(fname##6, bitd, opt); \ + PEL_PROTOTYPE(fname##8, bitd, opt); \ + PEL_PROTOTYPE(fname##12, bitd, opt); \ + PEL_PROTOTYPE(fname##16, bitd, opt); \ + PEL_PROTOTYPE(fname##24, bitd, opt); \ + PEL_PROTOTYPE(fname##32, bitd, opt); \ + PEL_PROTOTYPE(fname##48, bitd, opt); \ + PEL_PROTOTYPE(fname##64, bitd, opt) + +#define QPEL_PROTOTYPES(fname, bitd, opt) \ + PEL_PROTOTYPE(fname##4, bitd, opt); \ + PEL_PROTOTYPE(fname##8, bitd, opt); \ + PEL_PROTOTYPE(fname##12, bitd, opt); \ + PEL_PROTOTYPE(fname##16, bitd, opt); \ + PEL_PROTOTYPE(fname##24, bitd, opt); \ + PEL_PROTOTYPE(fname##32, bitd, opt); \ + PEL_PROTOTYPE(fname##48, bitd, opt); \ + PEL_PROTOTYPE(fname##64, bitd, opt) + +#define WEIGHTING_PROTOTYPE(width, bitd, opt) \ +void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \ +void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1) + +#define WEIGHTING_PROTOTYPES(bitd, opt) \ + WEIGHTING_PROTOTYPE(2, bitd, opt); \ + WEIGHTING_PROTOTYPE(4, bitd, opt); \ + WEIGHTING_PROTOTYPE(6, bitd, opt); \ + WEIGHTING_PROTOTYPE(8, bitd, opt); \ + WEIGHTING_PROTOTYPE(12, bitd, opt); \ + WEIGHTING_PROTOTYPE(16, bitd, opt); \ + WEIGHTING_PROTOTYPE(24, bitd, opt); \ + WEIGHTING_PROTOTYPE(32, bitd, opt); \ + WEIGHTING_PROTOTYPE(48, bitd, opt); \ + WEIGHTING_PROTOTYPE(64, bitd, opt) + + +/////////////////////////////////////////////////////////////////////////////// +// QPEL_PIXELS EPEL_PIXELS +/////////////////////////////////////////////////////////////////////////////// +EPEL_PROTOTYPES(pel_pixels , 8, sse4); +EPEL_PROTOTYPES(pel_pixels , 10, sse4); +EPEL_PROTOTYPES(pel_pixels , 12, sse4); + +void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); +void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); +void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); +void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); +void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); + +void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); +void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); +void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); +void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t 
my,int width); +void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); + + + +void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); +void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); +void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); +void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit +void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit + + +void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); + +void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); + +/////////////////////////////////////////////////////////////////////////////// +// EPEL +/////////////////////////////////////////////////////////////////////////////// +EPEL_PROTOTYPES(epel_h , 8, sse4); +EPEL_PROTOTYPES(epel_h , 10, sse4); +EPEL_PROTOTYPES(epel_h , 12, sse4); + +EPEL_PROTOTYPES(epel_v , 8, sse4); +EPEL_PROTOTYPES(epel_v , 10, sse4); +EPEL_PROTOTYPES(epel_v , 12, sse4); + +EPEL_PROTOTYPES(epel_hv , 8, sse4); +EPEL_PROTOTYPES(epel_hv , 10, sse4); +EPEL_PROTOTYPES(epel_hv , 12, sse4); + +PEL_PROTOTYPE(epel_h16, 8, avx2); +PEL_PROTOTYPE(epel_h24, 8, avx2); +PEL_PROTOTYPE(epel_h32, 8, avx2); +PEL_PROTOTYPE(epel_h48, 8, avx2); 
+PEL_PROTOTYPE(epel_h64, 8, avx2); + +PEL_PROTOTYPE(epel_h16,10, avx2); +PEL_PROTOTYPE(epel_h24,10, avx2); +PEL_PROTOTYPE(epel_h32,10, avx2); +PEL_PROTOTYPE(epel_h48,10, avx2); +PEL_PROTOTYPE(epel_h64,10, avx2); + +PEL_PROTOTYPE(epel_v16, 8, avx2); +PEL_PROTOTYPE(epel_v24, 8, avx2); +PEL_PROTOTYPE(epel_v32, 8, avx2); +PEL_PROTOTYPE(epel_v48, 8, avx2); +PEL_PROTOTYPE(epel_v64, 8, avx2); + +PEL_PROTOTYPE(epel_v16,10, avx2); +PEL_PROTOTYPE(epel_v24,10, avx2); +PEL_PROTOTYPE(epel_v32,10, avx2); +PEL_PROTOTYPE(epel_v48,10, avx2); +PEL_PROTOTYPE(epel_v64,10, avx2); + +PEL_PROTOTYPE(epel_hv16, 8, avx2); +PEL_PROTOTYPE(epel_hv24, 8, avx2); +PEL_PROTOTYPE(epel_hv32, 8, avx2); +PEL_PROTOTYPE(epel_hv48, 8, avx2); +PEL_PROTOTYPE(epel_hv64, 8, avx2); + +PEL_PROTOTYPE(epel_hv16,10, avx2); +PEL_PROTOTYPE(epel_hv24,10, avx2); +PEL_PROTOTYPE(epel_hv32,10, avx2); +PEL_PROTOTYPE(epel_hv48,10, avx2); +PEL_PROTOTYPE(epel_hv64,10, avx2); + +/////////////////////////////////////////////////////////////////////////////// +// QPEL +/////////////////////////////////////////////////////////////////////////////// +QPEL_PROTOTYPES(qpel_h , 8, sse4); +QPEL_PROTOTYPES(qpel_h , 10, sse4); +QPEL_PROTOTYPES(qpel_h , 12, sse4); + +QPEL_PROTOTYPES(qpel_v, 8, sse4); +QPEL_PROTOTYPES(qpel_v, 10, sse4); +QPEL_PROTOTYPES(qpel_v, 12, sse4); + +QPEL_PROTOTYPES(qpel_hv, 8, sse4); +QPEL_PROTOTYPES(qpel_hv, 10, sse4); +QPEL_PROTOTYPES(qpel_hv, 12, sse4); + +PEL_PROTOTYPE(qpel_h16, 8, avx2); +PEL_PROTOTYPE(qpel_h24, 8, avx2); +PEL_PROTOTYPE(qpel_h32, 8, avx2); +PEL_PROTOTYPE(qpel_h48, 8, avx2); +PEL_PROTOTYPE(qpel_h64, 8, avx2); + +PEL_PROTOTYPE(qpel_h16,10, avx2); +PEL_PROTOTYPE(qpel_h24,10, avx2); +PEL_PROTOTYPE(qpel_h32,10, avx2); +PEL_PROTOTYPE(qpel_h48,10, avx2); +PEL_PROTOTYPE(qpel_h64,10, avx2); + +PEL_PROTOTYPE(qpel_v16, 8, avx2); +PEL_PROTOTYPE(qpel_v24, 8, avx2); +PEL_PROTOTYPE(qpel_v32, 8, avx2); +PEL_PROTOTYPE(qpel_v48, 8, avx2); +PEL_PROTOTYPE(qpel_v64, 8, avx2); + +PEL_PROTOTYPE(qpel_v16,10, avx2); +PEL_PROTOTYPE(qpel_v24,10, avx2); +PEL_PROTOTYPE(qpel_v32,10, avx2); +PEL_PROTOTYPE(qpel_v48,10, avx2); +PEL_PROTOTYPE(qpel_v64,10, avx2); + +PEL_PROTOTYPE(qpel_hv16, 8, avx2); +PEL_PROTOTYPE(qpel_hv24, 8, avx2); +PEL_PROTOTYPE(qpel_hv32, 8, avx2); +PEL_PROTOTYPE(qpel_hv48, 8, avx2); +PEL_PROTOTYPE(qpel_hv64, 8, avx2); + +PEL_PROTOTYPE(qpel_hv16,10, avx2); +PEL_PROTOTYPE(qpel_hv24,10, avx2); +PEL_PROTOTYPE(qpel_hv32,10, avx2); +PEL_PROTOTYPE(qpel_hv48,10, avx2); +PEL_PROTOTYPE(qpel_hv64,10, avx2); + +WEIGHTING_PROTOTYPES(8, sse4); +WEIGHTING_PROTOTYPES(10, sse4); +WEIGHTING_PROTOTYPES(12, sse4); + +/////////////////////////////////////////////////////////////////////////////// +// TRANSFORM_ADD +/////////////////////////////////////////////////////////////////////////////// +void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); + +void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); + +void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); + +void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t 
*coeffs, ptrdiff_t stride); +void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); + +void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); + +#endif // AVCODEC_X86_HEVCDSP_H diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index 04203c22a0..ddc876dfcf 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -2,29 +2,31 @@ * Copyright (c) 2013 Seppo Tomperi * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere * - * This file is part of Libav. * - * Libav is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "config.h" - #include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" - +#include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */ #include "libavcodec/hevcdsp.h" +#include "libavcodec/x86/hevcdsp.h" #define LFC_FUNC(DIR, DEPTH, OPT) \ void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q); @@ -32,40 +34,1081 @@ void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, #define LFL_FUNC(DIR, DEPTH, OPT) \ void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q); -#define LFC_FUNCS(type, depth) \ - LFC_FUNC(h, depth, sse2) \ - LFC_FUNC(v, depth, sse2) +#define LFC_FUNCS(type, depth, opt) \ + LFC_FUNC(h, depth, opt) \ + LFC_FUNC(v, depth, opt) + +#define LFL_FUNCS(type, depth, opt) \ + LFL_FUNC(h, depth, opt) \ + LFL_FUNC(v, depth, opt) + +LFC_FUNCS(uint8_t, 8, sse2) +LFC_FUNCS(uint8_t, 10, sse2) +LFC_FUNCS(uint8_t, 12, sse2) +LFC_FUNCS(uint8_t, 8, avx) +LFC_FUNCS(uint8_t, 10, avx) +LFC_FUNCS(uint8_t, 12, avx) +LFL_FUNCS(uint8_t, 8, sse2) +LFL_FUNCS(uint8_t, 10, sse2) +LFL_FUNCS(uint8_t, 12, sse2) +LFL_FUNCS(uint8_t, 8, ssse3) +LFL_FUNCS(uint8_t, 10, ssse3) +LFL_FUNCS(uint8_t, 12, ssse3) +LFL_FUNCS(uint8_t, 8, avx) +LFL_FUNCS(uint8_t, 10, avx) +LFL_FUNCS(uint8_t, 12, avx) + +#define IDCT_FUNCS(W, opt) \ +void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \ +void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs); \ +void ff_hevc_idct##W##_dc_12_##opt(int16_t *coeffs) + +IDCT_FUNCS(4x4, mmxext); +IDCT_FUNCS(8x8, mmxext); +IDCT_FUNCS(8x8, sse2); +IDCT_FUNCS(16x16, sse2); +IDCT_FUNCS(32x32, sse2); +IDCT_FUNCS(16x16, 
avx2); +IDCT_FUNCS(32x32, avx2); + +#define mc_rep_func(name, bitd, step, W, opt) \ +void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \ + uint8_t *_src, ptrdiff_t _srcstride, int height, \ + intptr_t mx, intptr_t my, int width) \ +{ \ + int i; \ + uint8_t *src; \ + int16_t *dst; \ + for (i = 0; i < W; i += step) { \ + src = _src + (i * ((bitd + 7) / 8)); \ + dst = _dst + i; \ + ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \ + } \ +} +#define mc_rep_uni_func(name, bitd, step, W, opt) \ +void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \ + uint8_t *_src, ptrdiff_t _srcstride, int height, \ + intptr_t mx, intptr_t my, int width) \ +{ \ + int i; \ + uint8_t *src; \ + uint8_t *dst; \ + for (i = 0; i < W; i += step) { \ + src = _src + (i * ((bitd + 7) / 8)); \ + dst = _dst + (i * ((bitd + 7) / 8)); \ + ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, \ + height, mx, my, width); \ + } \ +} +#define mc_rep_bi_func(name, bitd, step, W, opt) \ +void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src, \ + ptrdiff_t _srcstride, int16_t* _src2, \ + int height, intptr_t mx, intptr_t my, int width) \ +{ \ + int i; \ + uint8_t *src; \ + uint8_t *dst; \ + int16_t *src2; \ + for (i = 0; i < W ; i += step) { \ + src = _src + (i * ((bitd + 7) / 8)); \ + dst = _dst + (i * ((bitd + 7) / 8)); \ + src2 = _src2 + i; \ + ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, \ + height, mx, my, width); \ + } \ +} + +#define mc_rep_funcs(name, bitd, step, W, opt) \ + mc_rep_func(name, bitd, step, W, opt); \ + mc_rep_uni_func(name, bitd, step, W, opt); \ + mc_rep_bi_func(name, bitd, step, W, opt) + +#define mc_rep_func2(name, bitd, step1, step2, W, opt) \ +void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst, \ + uint8_t *src, ptrdiff_t _srcstride, int height, \ + intptr_t mx, intptr_t my, int width) \ +{ \ + ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \ + ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)), \ + _srcstride, height, mx, my, width); \ +} +#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \ +void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t _srcstride, int height, \ + intptr_t mx, intptr_t my, int width) \ +{ \ + ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\ + ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \ + src + (step1 * ((bitd + 7) / 8)), _srcstride, \ + height, mx, my, width); \ +} +#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \ +void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ + ptrdiff_t _srcstride, int16_t* src2, \ + int height, intptr_t mx, intptr_t my, int width) \ +{ \ + ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\ + ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \ + src + (step1 * ((bitd + 7) / 8)), _srcstride, \ + src2 + step1, height, mx, my, width); \ +} + +#define mc_rep_funcs(name, bitd, step, W, opt) \ + mc_rep_func(name, bitd, step, W, opt); \ + mc_rep_uni_func(name, bitd, step, W, opt); \ + 
mc_rep_bi_func(name, bitd, step, W, opt) + +#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \ + mc_rep_func2(name, bitd, step1, step2, W, opt); \ + mc_rep_uni_func2(name, bitd, step1, step2, W, opt); \ + mc_rep_bi_func2(name, bitd, step1, step2, W, opt) + +#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL + +#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \ +void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \ + int height, intptr_t mx, intptr_t my, int width) \ + \ +{ \ + ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width); \ + ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \ +} + +#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \ +void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ + ptrdiff_t _srcstride, int16_t *src2, \ + int height, intptr_t mx, intptr_t my, int width) \ +{ \ + ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \ + height, mx, my, width); \ + ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\ + height, mx, my, width); \ +} + +#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \ +void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t _srcstride, int height, \ + intptr_t mx, intptr_t my, int width) \ +{ \ + ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \ + height, mx, my, width); \ + ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, \ + height, mx, my, width); \ +} + +#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4) \ +mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4); \ +mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4); \ +mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) + +#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \ +void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \ + int height, intptr_t mx, intptr_t my, int width) \ + \ +{ \ + ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width); \ + ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width); \ +} + +#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \ +void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ + ptrdiff_t _srcstride, int16_t* src2, \ + int height, intptr_t mx, intptr_t my, int width) \ +{ \ + ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \ + src2, height, mx, my, width); \ + ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \ + src2+width2, height, mx, my, width); \ +} + +#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \ +void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t _srcstride, int height, \ + intptr_t mx, intptr_t my, int width) \ +{ \ + ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \ + height, mx, my, width); \ + ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \ + height, mx, my, width); 
\ +} + +#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2) \ +mc_rep_mix_8(name, width1, width2, width3, opt1, opt2); \ +mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2); \ +mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) + +#if HAVE_AVX2_EXTERNAL + +mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4); +mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4); +mc_rep_mixs_8(epel_h , 48, 32, 16, avx2, sse4); +mc_rep_mixs_8(epel_v , 48, 32, 16, avx2, sse4); + +mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32); +mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32); +mc_rep_mixs_10(epel_hv, 24, 16, 8, avx2, sse4, 32); +mc_rep_mixs_10(epel_h , 24, 16, 8, avx2, sse4, 32); +mc_rep_mixs_10(epel_v , 24, 16, 8, avx2, sse4, 32); + + +mc_rep_mixs_10(qpel_h , 24, 16, 8, avx2, sse4, 32); +mc_rep_mixs_10(qpel_v , 24, 16, 8, avx2, sse4, 32); +mc_rep_mixs_10(qpel_hv, 24, 16, 8, avx2, sse4, 32); + + +mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2);//used for 10bit +mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2); //used for 10bit + +mc_rep_funcs(pel_pixels, 8, 32, 64, avx2); + +mc_rep_func(pel_pixels, 10, 16, 32, avx2); +mc_rep_func(pel_pixels, 10, 16, 48, avx2); +mc_rep_func(pel_pixels, 10, 32, 64, avx2); + +mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2); +mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2); +mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2); + +mc_rep_funcs(epel_h, 8, 32, 64, avx2); + +mc_rep_funcs(epel_v, 8, 32, 64, avx2); + +mc_rep_funcs(epel_h, 10, 16, 32, avx2); +mc_rep_funcs(epel_h, 10, 16, 48, avx2); +mc_rep_funcs(epel_h, 10, 32, 64, avx2); + +mc_rep_funcs(epel_v, 10, 16, 32, avx2); +mc_rep_funcs(epel_v, 10, 16, 48, avx2); +mc_rep_funcs(epel_v, 10, 32, 64, avx2); + + +mc_rep_funcs(epel_hv, 8, 32, 64, avx2); + +mc_rep_funcs(epel_hv, 10, 16, 32, avx2); +mc_rep_funcs(epel_hv, 10, 16, 48, avx2); +mc_rep_funcs(epel_hv, 10, 32, 64, avx2); + +mc_rep_funcs(qpel_h, 8, 32, 64, avx2); +mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4); + +mc_rep_funcs(qpel_v, 8, 32, 64, avx2); +mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4); + +mc_rep_funcs(qpel_h, 10, 16, 32, avx2); +mc_rep_funcs(qpel_h, 10, 16, 48, avx2); +mc_rep_funcs(qpel_h, 10, 32, 64, avx2); + +mc_rep_funcs(qpel_v, 10, 16, 32, avx2); +mc_rep_funcs(qpel_v, 10, 16, 48, avx2); +mc_rep_funcs(qpel_v, 10, 32, 64, avx2); + +mc_rep_funcs(qpel_hv, 10, 16, 32, avx2); +mc_rep_funcs(qpel_hv, 10, 16, 48, avx2); +mc_rep_funcs(qpel_hv, 10, 32, 64, avx2); + +#endif //AVX2 + +mc_rep_funcs(pel_pixels, 8, 16, 64, sse4); +mc_rep_funcs(pel_pixels, 8, 16, 48, sse4); +mc_rep_funcs(pel_pixels, 8, 16, 32, sse4); +mc_rep_funcs(pel_pixels, 8, 8, 24, sse4); +mc_rep_funcs(pel_pixels,10, 8, 64, sse4); +mc_rep_funcs(pel_pixels,10, 8, 48, sse4); +mc_rep_funcs(pel_pixels,10, 8, 32, sse4); +mc_rep_funcs(pel_pixels,10, 8, 24, sse4); +mc_rep_funcs(pel_pixels,10, 8, 16, sse4); +mc_rep_funcs(pel_pixels,10, 4, 12, sse4); +mc_rep_funcs(pel_pixels,12, 8, 64, sse4); +mc_rep_funcs(pel_pixels,12, 8, 48, sse4); +mc_rep_funcs(pel_pixels,12, 8, 32, sse4); +mc_rep_funcs(pel_pixels,12, 8, 24, sse4); +mc_rep_funcs(pel_pixels,12, 8, 16, sse4); +mc_rep_funcs(pel_pixels,12, 4, 12, sse4); + +mc_rep_funcs(epel_h, 8, 16, 64, sse4); +mc_rep_funcs(epel_h, 8, 16, 48, sse4); +mc_rep_funcs(epel_h, 8, 16, 32, sse4); +mc_rep_funcs(epel_h, 8, 8, 24, sse4); +mc_rep_funcs(epel_h,10, 8, 64, sse4); +mc_rep_funcs(epel_h,10, 8, 48, sse4); +mc_rep_funcs(epel_h,10, 8, 32, sse4); +mc_rep_funcs(epel_h,10, 8, 24, sse4); +mc_rep_funcs(epel_h,10, 8, 16, sse4); +mc_rep_funcs(epel_h,10, 4, 12, sse4); 
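/*
 * A minimal sketch (illustrative only, not part of the patch) of what one
 * of these instantiations expands to, following the mc_rep_func definition
 * above: each W-wide function tiles the step-wide SIMD kernel across the
 * block, advancing the source by step * ((bitd + 7) / 8) bytes per pass.
 * mc_rep_funcs(epel_h, 10, 4, 12, sse4), for example, yields roughly:
 *
 *     void ff_hevc_put_hevc_epel_h12_10_sse4(int16_t *_dst, uint8_t *_src,
 *                                            ptrdiff_t _srcstride, int height,
 *                                            intptr_t mx, intptr_t my, int width)
 *     {
 *         int i;
 *         for (i = 0; i < 12; i += 4)                   // three 4-sample columns
 *             ff_hevc_put_hevc_epel_h4_10_sse4(_dst + i,
 *                                              _src + i * 2, // 2 bytes/sample at 10 bit
 *                                              _srcstride, height, mx, my, width);
 *     }
 *
 * plus the matching uni_ and bi_ wrappers from mc_rep_uni_func/mc_rep_bi_func.
 */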
+mc_rep_funcs(epel_h,12, 8, 64, sse4); +mc_rep_funcs(epel_h,12, 8, 48, sse4); +mc_rep_funcs(epel_h,12, 8, 32, sse4); +mc_rep_funcs(epel_h,12, 8, 24, sse4); +mc_rep_funcs(epel_h,12, 8, 16, sse4); +mc_rep_funcs(epel_h,12, 4, 12, sse4); +mc_rep_funcs(epel_v, 8, 16, 64, sse4); +mc_rep_funcs(epel_v, 8, 16, 48, sse4); +mc_rep_funcs(epel_v, 8, 16, 32, sse4); +mc_rep_funcs(epel_v, 8, 8, 24, sse4); +mc_rep_funcs(epel_v,10, 8, 64, sse4); +mc_rep_funcs(epel_v,10, 8, 48, sse4); +mc_rep_funcs(epel_v,10, 8, 32, sse4); +mc_rep_funcs(epel_v,10, 8, 24, sse4); +mc_rep_funcs(epel_v,10, 8, 16, sse4); +mc_rep_funcs(epel_v,10, 4, 12, sse4); +mc_rep_funcs(epel_v,12, 8, 64, sse4); +mc_rep_funcs(epel_v,12, 8, 48, sse4); +mc_rep_funcs(epel_v,12, 8, 32, sse4); +mc_rep_funcs(epel_v,12, 8, 24, sse4); +mc_rep_funcs(epel_v,12, 8, 16, sse4); +mc_rep_funcs(epel_v,12, 4, 12, sse4); +mc_rep_funcs(epel_hv, 8, 16, 64, sse4); +mc_rep_funcs(epel_hv, 8, 16, 48, sse4); +mc_rep_funcs(epel_hv, 8, 16, 32, sse4); +mc_rep_funcs(epel_hv, 8, 8, 24, sse4); +mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4); +mc_rep_funcs(epel_hv,10, 8, 64, sse4); +mc_rep_funcs(epel_hv,10, 8, 48, sse4); +mc_rep_funcs(epel_hv,10, 8, 32, sse4); +mc_rep_funcs(epel_hv,10, 8, 24, sse4); +mc_rep_funcs(epel_hv,10, 8, 16, sse4); +mc_rep_funcs(epel_hv,10, 4, 12, sse4); +mc_rep_funcs(epel_hv,12, 8, 64, sse4); +mc_rep_funcs(epel_hv,12, 8, 48, sse4); +mc_rep_funcs(epel_hv,12, 8, 32, sse4); +mc_rep_funcs(epel_hv,12, 8, 24, sse4); +mc_rep_funcs(epel_hv,12, 8, 16, sse4); +mc_rep_funcs(epel_hv,12, 4, 12, sse4); + +mc_rep_funcs(qpel_h, 8, 16, 64, sse4); +mc_rep_funcs(qpel_h, 8, 16, 48, sse4); +mc_rep_funcs(qpel_h, 8, 16, 32, sse4); +mc_rep_funcs(qpel_h, 8, 8, 24, sse4); +mc_rep_funcs(qpel_h,10, 8, 64, sse4); +mc_rep_funcs(qpel_h,10, 8, 48, sse4); +mc_rep_funcs(qpel_h,10, 8, 32, sse4); +mc_rep_funcs(qpel_h,10, 8, 24, sse4); +mc_rep_funcs(qpel_h,10, 8, 16, sse4); +mc_rep_funcs(qpel_h,10, 4, 12, sse4); +mc_rep_funcs(qpel_h,12, 8, 64, sse4); +mc_rep_funcs(qpel_h,12, 8, 48, sse4); +mc_rep_funcs(qpel_h,12, 8, 32, sse4); +mc_rep_funcs(qpel_h,12, 8, 24, sse4); +mc_rep_funcs(qpel_h,12, 8, 16, sse4); +mc_rep_funcs(qpel_h,12, 4, 12, sse4); +mc_rep_funcs(qpel_v, 8, 16, 64, sse4); +mc_rep_funcs(qpel_v, 8, 16, 48, sse4); +mc_rep_funcs(qpel_v, 8, 16, 32, sse4); +mc_rep_funcs(qpel_v, 8, 8, 24, sse4); +mc_rep_funcs(qpel_v,10, 8, 64, sse4); +mc_rep_funcs(qpel_v,10, 8, 48, sse4); +mc_rep_funcs(qpel_v,10, 8, 32, sse4); +mc_rep_funcs(qpel_v,10, 8, 24, sse4); +mc_rep_funcs(qpel_v,10, 8, 16, sse4); +mc_rep_funcs(qpel_v,10, 4, 12, sse4); +mc_rep_funcs(qpel_v,12, 8, 64, sse4); +mc_rep_funcs(qpel_v,12, 8, 48, sse4); +mc_rep_funcs(qpel_v,12, 8, 32, sse4); +mc_rep_funcs(qpel_v,12, 8, 24, sse4); +mc_rep_funcs(qpel_v,12, 8, 16, sse4); +mc_rep_funcs(qpel_v,12, 4, 12, sse4); +mc_rep_funcs(qpel_hv, 8, 8, 64, sse4); +mc_rep_funcs(qpel_hv, 8, 8, 48, sse4); +mc_rep_funcs(qpel_hv, 8, 8, 32, sse4); +mc_rep_funcs(qpel_hv, 8, 8, 24, sse4); +mc_rep_funcs(qpel_hv, 8, 8, 16, sse4); +mc_rep_funcs2(qpel_hv,8, 8, 4, 12, sse4); +mc_rep_funcs(qpel_hv,10, 8, 64, sse4); +mc_rep_funcs(qpel_hv,10, 8, 48, sse4); +mc_rep_funcs(qpel_hv,10, 8, 32, sse4); +mc_rep_funcs(qpel_hv,10, 8, 24, sse4); +mc_rep_funcs(qpel_hv,10, 8, 16, sse4); +mc_rep_funcs(qpel_hv,10, 4, 12, sse4); +mc_rep_funcs(qpel_hv,12, 8, 64, sse4); +mc_rep_funcs(qpel_hv,12, 8, 48, sse4); +mc_rep_funcs(qpel_hv,12, 8, 32, sse4); +mc_rep_funcs(qpel_hv,12, 8, 24, sse4); +mc_rep_funcs(qpel_hv,12, 8, 16, sse4); +mc_rep_funcs(qpel_hv,12, 4, 12, sse4); + +#define 
mc_rep_uni_w(bitd, step, W, opt) \ +void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \ + int height, int denom, int _wx, int _ox) \ +{ \ + int i; \ + int16_t *src; \ + uint8_t *dst; \ + for (i = 0; i < W; i += step) { \ + src= _src + i; \ + dst= _dst + (i * ((bitd + 7) / 8)); \ + ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, \ + height, denom, _wx, _ox); \ + } \ +} + +mc_rep_uni_w(8, 6, 12, sse4); +mc_rep_uni_w(8, 8, 16, sse4); +mc_rep_uni_w(8, 8, 24, sse4); +mc_rep_uni_w(8, 8, 32, sse4); +mc_rep_uni_w(8, 8, 48, sse4); +mc_rep_uni_w(8, 8, 64, sse4); + +mc_rep_uni_w(10, 6, 12, sse4); +mc_rep_uni_w(10, 8, 16, sse4); +mc_rep_uni_w(10, 8, 24, sse4); +mc_rep_uni_w(10, 8, 32, sse4); +mc_rep_uni_w(10, 8, 48, sse4); +mc_rep_uni_w(10, 8, 64, sse4); + +mc_rep_uni_w(12, 6, 12, sse4); +mc_rep_uni_w(12, 8, 16, sse4); +mc_rep_uni_w(12, 8, 24, sse4); +mc_rep_uni_w(12, 8, 32, sse4); +mc_rep_uni_w(12, 8, 48, sse4); +mc_rep_uni_w(12, 8, 64, sse4); + +#define mc_rep_bi_w(bitd, step, W, opt) \ +void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \ + int16_t *_src2, int height, \ + int denom, int _wx0, int _wx1, int _ox0, int _ox1) \ +{ \ + int i; \ + int16_t *src; \ + int16_t *src2; \ + uint8_t *dst; \ + for (i = 0; i < W; i += step) { \ + src = _src + i; \ + src2 = _src2 + i; \ + dst = _dst + (i * ((bitd + 7) / 8)); \ + ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2, \ + height, denom, _wx0, _wx1, _ox0, _ox1); \ + } \ +} + +mc_rep_bi_w(8, 6, 12, sse4); +mc_rep_bi_w(8, 8, 16, sse4); +mc_rep_bi_w(8, 8, 24, sse4); +mc_rep_bi_w(8, 8, 32, sse4); +mc_rep_bi_w(8, 8, 48, sse4); +mc_rep_bi_w(8, 8, 64, sse4); + +mc_rep_bi_w(10, 6, 12, sse4); +mc_rep_bi_w(10, 8, 16, sse4); +mc_rep_bi_w(10, 8, 24, sse4); +mc_rep_bi_w(10, 8, 32, sse4); +mc_rep_bi_w(10, 8, 48, sse4); +mc_rep_bi_w(10, 8, 64, sse4); + +mc_rep_bi_w(12, 6, 12, sse4); +mc_rep_bi_w(12, 8, 16, sse4); +mc_rep_bi_w(12, 8, 24, sse4); +mc_rep_bi_w(12, 8, 32, sse4); +mc_rep_bi_w(12, 8, 48, sse4); +mc_rep_bi_w(12, 8, 64, sse4); + +#define mc_uni_w_func(name, bitd, W, opt) \ +void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \ + uint8_t *_src, ptrdiff_t _srcstride, \ + int height, int denom, \ + int _wx, int _ox, \ + intptr_t mx, intptr_t my, int width) \ +{ \ + LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \ + ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \ + ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox);\ +} + +#define mc_uni_w_funcs(name, bitd, opt) \ + mc_uni_w_func(name, bitd, 4, opt); \ + mc_uni_w_func(name, bitd, 8, opt); \ + mc_uni_w_func(name, bitd, 12, opt); \ + mc_uni_w_func(name, bitd, 16, opt); \ + mc_uni_w_func(name, bitd, 24, opt); \ + mc_uni_w_func(name, bitd, 32, opt); \ + mc_uni_w_func(name, bitd, 48, opt); \ + mc_uni_w_func(name, bitd, 64, opt) + +mc_uni_w_funcs(pel_pixels, 8, sse4); +mc_uni_w_func(pel_pixels, 8, 6, sse4); +mc_uni_w_funcs(epel_h, 8, sse4); +mc_uni_w_func(epel_h, 8, 6, sse4); +mc_uni_w_funcs(epel_v, 8, sse4); +mc_uni_w_func(epel_v, 8, 6, sse4); +mc_uni_w_funcs(epel_hv, 8, sse4); +mc_uni_w_func(epel_hv, 8, 6, sse4); +mc_uni_w_funcs(qpel_h, 8, sse4); +mc_uni_w_funcs(qpel_v, 8, sse4); +mc_uni_w_funcs(qpel_hv, 8, sse4); + +mc_uni_w_funcs(pel_pixels, 10, sse4); +mc_uni_w_func(pel_pixels, 10, 6, sse4); +mc_uni_w_funcs(epel_h, 10, sse4); +mc_uni_w_func(epel_h, 10, 
6, sse4); +mc_uni_w_funcs(epel_v, 10, sse4); +mc_uni_w_func(epel_v, 10, 6, sse4); +mc_uni_w_funcs(epel_hv, 10, sse4); +mc_uni_w_func(epel_hv, 10, 6, sse4); +mc_uni_w_funcs(qpel_h, 10, sse4); +mc_uni_w_funcs(qpel_v, 10, sse4); +mc_uni_w_funcs(qpel_hv, 10, sse4); + +mc_uni_w_funcs(pel_pixels, 12, sse4); +mc_uni_w_func(pel_pixels, 12, 6, sse4); +mc_uni_w_funcs(epel_h, 12, sse4); +mc_uni_w_func(epel_h, 12, 6, sse4); +mc_uni_w_funcs(epel_v, 12, sse4); +mc_uni_w_func(epel_v, 12, 6, sse4); +mc_uni_w_funcs(epel_hv, 12, sse4); +mc_uni_w_func(epel_hv, 12, 6, sse4); +mc_uni_w_funcs(qpel_h, 12, sse4); +mc_uni_w_funcs(qpel_v, 12, sse4); +mc_uni_w_funcs(qpel_hv, 12, sse4); + +#define mc_bi_w_func(name, bitd, W, opt) \ +void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \ + uint8_t *_src, ptrdiff_t _srcstride, \ + int16_t *_src2, \ + int height, int denom, \ + int _wx0, int _wx1, int _ox0, int _ox1, \ + intptr_t mx, intptr_t my, int width) \ +{ \ + LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \ + ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \ + ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2, \ + height, denom, _wx0, _wx1, _ox0, _ox1); \ +} + +#define mc_bi_w_funcs(name, bitd, opt) \ + mc_bi_w_func(name, bitd, 4, opt); \ + mc_bi_w_func(name, bitd, 8, opt); \ + mc_bi_w_func(name, bitd, 12, opt); \ + mc_bi_w_func(name, bitd, 16, opt); \ + mc_bi_w_func(name, bitd, 24, opt); \ + mc_bi_w_func(name, bitd, 32, opt); \ + mc_bi_w_func(name, bitd, 48, opt); \ + mc_bi_w_func(name, bitd, 64, opt) + +mc_bi_w_funcs(pel_pixels, 8, sse4); +mc_bi_w_func(pel_pixels, 8, 6, sse4); +mc_bi_w_funcs(epel_h, 8, sse4); +mc_bi_w_func(epel_h, 8, 6, sse4); +mc_bi_w_funcs(epel_v, 8, sse4); +mc_bi_w_func(epel_v, 8, 6, sse4); +mc_bi_w_funcs(epel_hv, 8, sse4); +mc_bi_w_func(epel_hv, 8, 6, sse4); +mc_bi_w_funcs(qpel_h, 8, sse4); +mc_bi_w_funcs(qpel_v, 8, sse4); +mc_bi_w_funcs(qpel_hv, 8, sse4); + +mc_bi_w_funcs(pel_pixels, 10, sse4); +mc_bi_w_func(pel_pixels, 10, 6, sse4); +mc_bi_w_funcs(epel_h, 10, sse4); +mc_bi_w_func(epel_h, 10, 6, sse4); +mc_bi_w_funcs(epel_v, 10, sse4); +mc_bi_w_func(epel_v, 10, 6, sse4); +mc_bi_w_funcs(epel_hv, 10, sse4); +mc_bi_w_func(epel_hv, 10, 6, sse4); +mc_bi_w_funcs(qpel_h, 10, sse4); +mc_bi_w_funcs(qpel_v, 10, sse4); +mc_bi_w_funcs(qpel_hv, 10, sse4); + +mc_bi_w_funcs(pel_pixels, 12, sse4); +mc_bi_w_func(pel_pixels, 12, 6, sse4); +mc_bi_w_funcs(epel_h, 12, sse4); +mc_bi_w_func(epel_h, 12, 6, sse4); +mc_bi_w_funcs(epel_v, 12, sse4); +mc_bi_w_func(epel_v, 12, 6, sse4); +mc_bi_w_funcs(epel_hv, 12, sse4); +mc_bi_w_func(epel_hv, 12, 6, sse4); +mc_bi_w_funcs(qpel_h, 12, sse4); +mc_bi_w_funcs(qpel_v, 12, sse4); +mc_bi_w_funcs(qpel_hv, 12, sse4); +#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL -#define LFL_FUNCS(type, depth) \ - LFL_FUNC(h, depth, ssse3) \ - LFL_FUNC(v, depth, ssse3) +#define SAO_BAND_FILTER_FUNCS(bitd, opt) \ +void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \ + int16_t *sao_offset_val, int sao_left_class, int width, int height); \ +void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \ + int16_t *sao_offset_val, int sao_left_class, int width, int height); \ +void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \ + int16_t *sao_offset_val, int sao_left_class, int width, int 
height); \ +void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \ + int16_t *sao_offset_val, int sao_left_class, int width, int height); \ +void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \ + int16_t *sao_offset_val, int sao_left_class, int width, int height) -LFC_FUNCS(uint8_t, 8) -LFC_FUNCS(uint8_t, 10) -LFL_FUNCS(uint8_t, 8) -LFL_FUNCS(uint8_t, 10) +SAO_BAND_FILTER_FUNCS(8, sse2); +SAO_BAND_FILTER_FUNCS(10, sse2); +SAO_BAND_FILTER_FUNCS(12, sse2); +SAO_BAND_FILTER_FUNCS(8, avx); +SAO_BAND_FILTER_FUNCS(10, avx); +SAO_BAND_FILTER_FUNCS(12, avx); +SAO_BAND_FILTER_FUNCS(8, avx2); +SAO_BAND_FILTER_FUNCS(10, avx2); +SAO_BAND_FILTER_FUNCS(12, avx2); + +#define SAO_BAND_INIT(bitd, opt) do { \ + c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_##bitd##_##opt; \ + c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_##bitd##_##opt; \ + c->sao_band_filter[2] = ff_hevc_sao_band_filter_32_##bitd##_##opt; \ + c->sao_band_filter[3] = ff_hevc_sao_band_filter_48_##bitd##_##opt; \ + c->sao_band_filter[4] = ff_hevc_sao_band_filter_64_##bitd##_##opt; \ +} while (0) + +#define SAO_EDGE_FILTER_FUNCS(bitd, opt) \ +void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \ + int eo, int width, int height); \ +void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \ + int eo, int width, int height); \ +void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \ + int eo, int width, int height); \ +void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \ + int eo, int width, int height); \ +void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \ + int eo, int width, int height); \ + +SAO_EDGE_FILTER_FUNCS(8, ssse3); +SAO_EDGE_FILTER_FUNCS(8, avx2); +SAO_EDGE_FILTER_FUNCS(10, sse2); +SAO_EDGE_FILTER_FUNCS(10, avx2); +SAO_EDGE_FILTER_FUNCS(12, sse2); +SAO_EDGE_FILTER_FUNCS(12, avx2); + +#define SAO_EDGE_INIT(bitd, opt) do { \ + c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_##bitd##_##opt; \ + c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \ + c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \ + c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \ + c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \ +} while (0) + +#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \ + PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \ + PEL_LINK(pointer, 2, my , mx , fname##6 , bitd, opt ); \ + PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \ + PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \ + PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \ + PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \ + PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \ + PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \ + PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt ) +#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) \ + PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \ + PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \ + PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \ + PEL_LINK(pointer, 5, my , mx , fname##16, 
bitd, opt ); \ + PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \ + PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \ + PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \ + PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt ) void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) { int cpu_flags = av_get_cpu_flags(); if (bit_depth == 8) { + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext; + c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext; + c->transform_add[0] = ff_hevc_transform_add4_8_mmxext; + } if (EXTERNAL_SSE2(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2; + if (ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2; + + } + SAO_BAND_INIT(8, sse2); + + c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2; + c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2; + c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2; + + c->transform_add[1] = ff_hevc_transform_add8_8_sse2; + c->transform_add[2] = ff_hevc_transform_add16_8_sse2; + c->transform_add[3] = ff_hevc_transform_add32_8_sse2; } - if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { - c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3; - c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3; + if (EXTERNAL_SSSE3(cpu_flags)) { + if(ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3; + } + SAO_EDGE_INIT(8, ssse3); + } + if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { + + EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4); + EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4); + EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4); + EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4); + + QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4); + QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4); + } + if (EXTERNAL_AVX(cpu_flags)) { + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx; + if (ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx; + } + SAO_BAND_INIT(8, avx); + + c->transform_add[1] = ff_hevc_transform_add8_8_avx; + c->transform_add[2] = ff_hevc_transform_add16_8_avx; + c->transform_add[3] = ff_hevc_transform_add32_8_avx; + } + if (EXTERNAL_AVX2(cpu_flags)) { + c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2; + c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2; + if (ARCH_X86_64) { + c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2; + c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2; + c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2; + + c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2; + c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2; + c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2; + + c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2; + c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2; + c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2; + + c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2; + 
c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2; + c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2; + + c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2; + c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2; + c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2; + + c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2; + c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2; + c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2; + + c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2; + c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2; + c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2; + + c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2; + c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2; + c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2; + + c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2; + c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2; + c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2; + + c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2; + c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2; + c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2; + + c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2; + c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2; + c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2; + + c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2; + c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2; + c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2; + + c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2; + c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2; + c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2; + + c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2; + c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2; + c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2; + + c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2; + c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2; + c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2; + + c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2; + c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2; + c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2; + + c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2; + c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2; + c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2; + + c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2; + c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2; + c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2; + + c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2; + c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2; + c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2; + + c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2; + c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2; + c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2; + + c->put_hevc_qpel_bi[7][1][0] = 
ff_hevc_put_hevc_bi_qpel_v32_8_avx2; + c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2; + c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2; + } + SAO_BAND_INIT(8, avx2); + + c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2; + c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2; + c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2; + + c->transform_add[3] = ff_hevc_transform_add32_8_avx2; } } else if (bit_depth == 10) { + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->transform_add[0] = ff_hevc_transform_add4_10_mmxext; + c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext; + c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext; + } if (EXTERNAL_SSE2(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2; + if (ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2; + } + SAO_BAND_INIT(10, sse2); + SAO_EDGE_INIT(10, sse2); + + c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2; + c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2; + c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2; + + c->transform_add[1] = ff_hevc_transform_add8_10_sse2; + c->transform_add[2] = ff_hevc_transform_add16_10_sse2; + c->transform_add[3] = ff_hevc_transform_add32_10_sse2; } if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3; c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3; } + if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { + EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4); + EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4); + EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4); + EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4); + + QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4); + QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4); + } + if (EXTERNAL_AVX(cpu_flags)) { + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx; + if (ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx; + } + SAO_BAND_INIT(10, avx); + } + if (EXTERNAL_AVX2(cpu_flags)) { + + c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2; + c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2; + if (ARCH_X86_64) { + c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2; + c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2; + c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2; + c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2; + c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2; + + c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2; + c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2; + c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2; + c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2; + c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2; + + c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2; + c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2; + c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2; + c->put_hevc_epel_uni[8][0][0] = 
ff_hevc_put_hevc_uni_pel_pixels96_8_avx2; + c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2; + + c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2; + c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2; + c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2; + c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2; + c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2; + + c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2; + c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2; + c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2; + c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2; + c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2; + c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2; + c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2; + c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2; + c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2; + c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2; + + c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2; + c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2; + c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2; + c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2; + c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2; + + c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2; + c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2; + c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2; + c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2; + c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2; + + c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2; + c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2; + c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2; + c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2; + c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2; + + c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2; + c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2; + c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2; + c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2; + c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2; + + c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2; + c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2; + c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2; + c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2; + c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2; + + c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2; + c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2; + c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2; + c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2; + c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2; + + c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2; + c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2; + c->put_hevc_epel[7][1][1] = 
ff_hevc_put_hevc_epel_hv32_10_avx2; + c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2; + c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2; + + c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2; + c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2; + c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2; + c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2; + c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2; + + c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2; + c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2; + c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2; + c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2; + c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2; + + c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2; + c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2; + c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2; + c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2; + c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2; + + c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2; + c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2; + c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2; + c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2; + c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2; + + c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2; + c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2; + c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2; + c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2; + c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2; + + c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2; + c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2; + c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2; + c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2; + c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2; + + c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2; + c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2; + c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2; + c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2; + c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2; + + c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2; + c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2; + c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2; + c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2; + c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2; + + c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2; + c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2; + c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2; + c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2; + c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2; + + c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2; + c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2; + c->put_hevc_qpel_uni[7][1][1] = 
ff_hevc_put_hevc_uni_qpel_hv32_10_avx2; + c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2; + c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2; + + c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2; + c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2; + c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2; + c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2; + c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2; + } + SAO_BAND_INIT(10, avx2); + c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_10_avx2; + c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_10_avx2; + c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_10_avx2; + + c->transform_add[2] = ff_hevc_transform_add16_10_avx2; + c->transform_add[3] = ff_hevc_transform_add32_10_avx2; + + } + } else if (bit_depth == 12) { + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext; + c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext; + } + if (EXTERNAL_SSE2(cpu_flags)) { + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2; + if (ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2; + } + SAO_BAND_INIT(12, sse2); + SAO_EDGE_INIT(12, sse2); + + c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2; + c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2; + c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2; + } + if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3; + } + if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { + EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4); + EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4); + EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4); + EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4); + + QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4); + QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4); + } + if (EXTERNAL_AVX(cpu_flags)) { + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx; + if (ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx; + } + SAO_BAND_INIT(12, avx); + } + if (EXTERNAL_AVX2(cpu_flags)) { + c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2; + c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2; + + SAO_BAND_INIT(12, avx2); + c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_12_avx2; + c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_12_avx2; + c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_12_avx2; + } } } diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index 073f7f908e..2cef8e698c 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -1,20 +1,27 @@ ;****************************************************************************** +;* +;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org> +;* Copyright (c) Nick Kurshev <nickols_k@mail.ru> +;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> +;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz> +;* Copyright (c) 2013 
Daniel Kang +;* ;* SIMD-optimized halfpel functions ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -22,26 +29,49 @@ SECTION_RODATA cextern pb_1 +cextern pw_2 +pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7 + +cextern pw_8192 SECTION_TEXT ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) %macro PUT_PIXELS8_X2 0 +%if cpuflag(sse2) +cglobal put_pixels16_x2, 4,5,4 +%else cglobal put_pixels8_x2, 4,5 +%endif lea r4, [r2*2] .loop: - mova m0, [r1] - mova m1, [r1+r2] - PAVGB m0, [r1+1] - PAVGB m1, [r1+r2+1] + movu m0, [r1+1] + movu m1, [r1+r2+1] +%if cpuflag(sse2) + movu m2, [r1] + movu m3, [r1+r2] + pavgb m0, m2 + pavgb m1, m3 +%else + PAVGB m0, [r1] + PAVGB m1, [r1+r2] +%endif mova [r0], m0 mova [r0+r2], m1 add r1, r4 add r0, r4 - mova m0, [r1] - mova m1, [r1+r2] - PAVGB m0, [r1+1] - PAVGB m1, [r1+r2+1] + movu m0, [r1+1] + movu m1, [r1+r2+1] +%if cpuflag(sse2) + movu m2, [r1] + movu m3, [r1+r2] + pavgb m0, m2 + pavgb m1, m3 +%else + PAVGB m0, [r1] + PAVGB m1, [r1+r2] +%endif add r1, r4 mova [r0], m0 mova [r0+r2], m1 @@ -99,6 +129,9 @@ INIT_MMX mmxext PUT_PIXELS_16 INIT_MMX 3dnow PUT_PIXELS_16 +; The 8_X2 macro can easily be used here +INIT_XMM sse2 +PUT_PIXELS8_X2 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) @@ -191,20 +224,24 @@ PUT_NO_RND_PIXELS8_X2_EXACT ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) %macro PUT_PIXELS8_Y2 0 +%if cpuflag(sse2) +cglobal put_pixels16_y2, 4,5,3 +%else cglobal put_pixels8_y2, 4,5 +%endif lea r4, [r2*2] - mova m0, [r1] + movu m0, [r1] sub r0, r2 .loop: - mova m1, [r1+r2] - mova m2, [r1+r4] + movu m1, [r1+r2] + movu m2, [r1+r4] add r1, r4 PAVGB m0, m1 PAVGB m1, m2 mova [r0+r2], m0 mova [r0+r4], m1 - mova m1, [r1+r2] - mova m0, [r1+r4] + movu m1, [r1+r2] + movu m0, [r1+r4] add r0, r4 add r1, r4 PAVGB m2, m1 @@ -221,6 +258,9 @@ INIT_MMX mmxext PUT_PIXELS8_Y2 INIT_MMX 3dnow PUT_PIXELS8_Y2 +; actually, put_pixels16_y2_sse2 +INIT_XMM sse2 +PUT_PIXELS8_Y2 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) @@ -334,26 +374,48 @@ AVG_PIXELS8 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) %macro AVG_PIXELS8_X2 0 +%if cpuflag(sse2) +cglobal avg_pixels16_x2, 4,5,4 +%else cglobal avg_pixels8_x2, 4,5 +%endif lea r4, [r2*2] +%if notcpuflag(mmxext) + pcmpeqd m5, m5 + paddb m5, m5 +%endif 
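; A note on the two instructions above (explanatory, not part of the patch):
; on plain MMX, where pavgb does not exist, pcmpeqd sets m5 to all ones and
; paddb doubles every byte, leaving the 0xFE mask that the four-operand
; PAVGB fallback below expects. That fallback computes the rounded byte
; average without risking overflow in an 8-bit add:
;
;     avg(a, b) = (a + b + 1) >> 1 = (a | b) - (((a ^ b) & 0xFE) >> 1)
;
; where the & 0xFE keeps the full-register shift from leaking bits across
; byte lanes.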
.loop: - mova m0, [r1] - mova m2, [r1+r2] - PAVGB m0, [r1+1] - PAVGB m2, [r1+r2+1] - PAVGB m0, [r0] - PAVGB m2, [r0+r2] + movu m0, [r1] + movu m2, [r1+r2] +%if cpuflag(sse2) + movu m1, [r1+1] + movu m3, [r1+r2+1] + pavgb m0, m1 + pavgb m2, m3 +%else + PAVGB m0, [r1+1], m3, m5 + PAVGB m2, [r1+r2+1], m4, m5 +%endif + PAVGB m0, [r0], m3, m5 + PAVGB m2, [r0+r2], m4, m5 add r1, r4 mova [r0], m0 mova [r0+r2], m2 - mova m0, [r1] - mova m2, [r1+r2] - PAVGB m0, [r1+1] - PAVGB m2, [r1+r2+1] + movu m0, [r1] + movu m2, [r1+r2] +%if cpuflag(sse2) + movu m1, [r1+1] + movu m3, [r1+r2+1] + pavgb m0, m1 + pavgb m2, m3 +%else + PAVGB m0, [r1+1], m3, m5 + PAVGB m2, [r1+r2+1], m4, m5 +%endif add r0, r4 add r1, r4 - PAVGB m0, [r0] - PAVGB m2, [r0+r2] + PAVGB m0, [r0], m3, m5 + PAVGB m2, [r0+r2], m4, m5 mova [r0], m0 mova [r0+r2], m2 add r0, r4 @@ -362,40 +424,45 @@ cglobal avg_pixels8_x2, 4,5 REP_RET %endmacro +INIT_MMX mmx +AVG_PIXELS8_X2 INIT_MMX mmxext AVG_PIXELS8_X2 INIT_MMX 3dnow AVG_PIXELS8_X2 +; actually avg_pixels16_x2 +INIT_XMM sse2 +AVG_PIXELS8_X2 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) %macro AVG_PIXELS8_Y2 0 +%if cpuflag(sse2) +cglobal avg_pixels16_y2, 4,5,3 +%else cglobal avg_pixels8_y2, 4,5 +%endif lea r4, [r2*2] - mova m0, [r1] + movu m0, [r1] sub r0, r2 .loop: - mova m1, [r1+r2] - mova m2, [r1+r4] + movu m1, [r1+r2] + movu m2, [r1+r4] add r1, r4 PAVGB m0, m1 PAVGB m1, m2 - mova m3, [r0+r2] - mova m4, [r0+r4] - PAVGB m0, m3 - PAVGB m1, m4 + PAVGB m0, [r0+r2] + PAVGB m1, [r0+r4] mova [r0+r2], m0 mova [r0+r4], m1 - mova m1, [r1+r2] - mova m0, [r1+r4] + movu m1, [r1+r2] + movu m0, [r1+r4] PAVGB m2, m1 PAVGB m1, m0 add r0, r4 add r1, r4 - mova m3, [r0+r2] - mova m4, [r0+r4] - PAVGB m2, m3 - PAVGB m1, m4 + PAVGB m2, [r0+r2] + PAVGB m1, [r0+r4] mova [r0+r2], m2 mova [r0+r4], m1 add r0, r4 @@ -408,11 +475,16 @@ INIT_MMX mmxext AVG_PIXELS8_Y2 INIT_MMX 3dnow AVG_PIXELS8_Y2 +; actually avg_pixels16_y2 +INIT_XMM sse2 +AVG_PIXELS8_Y2 ; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro AVG_PIXELS8_XY2 0 -cglobal avg_pixels8_xy2, 4,5 +; Note this is not correctly rounded, and is therefore used for +; not-bitexact output +%macro AVG_APPROX_PIXELS8_XY2 0 +cglobal avg_approx_pixels8_xy2, 4,5 mova m6, [pb_1] lea r4, [r2*2] mova m0, [r1] @@ -449,6 +521,160 @@ cglobal avg_pixels8_xy2, 4,5 %endmacro INIT_MMX mmxext -AVG_PIXELS8_XY2 +AVG_APPROX_PIXELS8_XY2 INIT_MMX 3dnow -AVG_PIXELS8_XY2 +AVG_APPROX_PIXELS8_XY2 + + +; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +%macro SET_PIXELS_XY2 1 +%if cpuflag(sse2) +cglobal %1_pixels16_xy2, 4,5,8 +%else +cglobal %1_pixels8_xy2, 4,5 +%endif + pxor m7, m7 + mova m6, [pw_2] + movu m0, [r1] + movu m4, [r1+1] + mova m1, m0 + mova m5, m4 + punpcklbw m0, m7 + punpcklbw m4, m7 + punpckhbw m1, m7 + punpckhbw m5, m7 + paddusw m4, m0 + paddusw m5, m1 + xor r4, r4 + add r1, r2 +.loop: + movu m0, [r1+r4] + movu m2, [r1+r4+1] + mova m1, m0 + mova m3, m2 + punpcklbw m0, m7 + punpcklbw m2, m7 + punpckhbw m1, m7 + punpckhbw m3, m7 + paddusw m0, m2 + paddusw m1, m3 + paddusw m4, m6 + paddusw m5, m6 + paddusw m4, m0 + paddusw m5, m1 + psrlw m4, 2 + psrlw m5, 2 +%ifidn %1, avg + mova m3, [r0+r4] + packuswb m4, m5 + PAVGB m4, m3 +%else + packuswb m4, m5 +%endif + mova [r0+r4], m4 + add r4, r2 + + movu m2, [r1+r4] + movu m4, [r1+r4+1] + mova m3, m2 + mova m5, m4 + punpcklbw m2, m7 + punpcklbw m4, m7 + punpckhbw m3, m7 + punpckhbw m5, m7 + paddusw m4, m2 
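; Data-flow note for this point in the loop (explanatory, not part of the
; patch): m4 now holds the low-word half of A+B for the new source row and
; the next paddusw completes the high half in m5, while m0/m1 still carry
; the previous row's A+B sums. The paddusw/psrlw sequence that follows
; therefore produces the correctly rounded half-pel value
;
;     dst = (A + B + C + D + 2) >> 2
;
; with A,B the two horizontal neighbours on one row, C,D the pair on the
; next row, and pw_2 (held in m6) supplying the +2 rounding bias.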
+ paddusw m5, m3 + paddusw m0, m6 + paddusw m1, m6 + paddusw m0, m4 + paddusw m1, m5 + psrlw m0, 2 + psrlw m1, 2 +%ifidn %1, avg + mova m3, [r0+r4] + packuswb m0, m1 + PAVGB m0, m3 +%else + packuswb m0, m1 +%endif + mova [r0+r4], m0 + add r4, r2 + sub r3d, 2 + jnz .loop + REP_RET +%endmacro + +INIT_MMX mmxext +SET_PIXELS_XY2 avg +INIT_MMX 3dnow +SET_PIXELS_XY2 avg +INIT_XMM sse2 +SET_PIXELS_XY2 put +SET_PIXELS_XY2 avg + +%macro SSSE3_PIXELS_XY2 1-2 +%if %0 == 2 ; sse2 +cglobal %1_pixels16_xy2, 4,5,%2 + mova m4, [pb_interleave16] +%else +cglobal %1_pixels8_xy2, 4,5 + mova m4, [pb_interleave8] +%endif + mova m5, [pb_1] + movu m0, [r1] + movu m1, [r1+1] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + xor r4, r4 + add r1, r2 +.loop: + movu m2, [r1+r4] + movu m3, [r1+r4+1] + pmaddubsw m2, m5 + pmaddubsw m3, m5 + paddusw m0, m2 + paddusw m1, m3 + pmulhrsw m0, [pw_8192] + pmulhrsw m1, [pw_8192] +%ifidn %1, avg + mova m6, [r0+r4] + packuswb m0, m1 + pshufb m0, m4 + pavgb m0, m6 +%else + packuswb m0, m1 + pshufb m0, m4 +%endif + mova [r0+r4], m0 + add r4, r2 + + movu m0, [r1+r4] + movu m1, [r1+r4+1] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + paddusw m2, m0 + paddusw m3, m1 + pmulhrsw m2, [pw_8192] + pmulhrsw m3, [pw_8192] +%ifidn %1, avg + mova m6, [r0+r4] + packuswb m2, m3 + pshufb m2, m4 + pavgb m2, m6 +%else + packuswb m2, m3 + pshufb m2, m4 +%endif + mova [r0+r4], m2 + add r4, r2 + sub r3d, 2 + jnz .loop + REP_RET +%endmacro + +INIT_MMX ssse3 +SSSE3_PIXELS_XY2 put +SSSE3_PIXELS_XY2 avg +INIT_XMM ssse3 +SSSE3_PIXELS_XY2 put, 6 +SSSE3_PIXELS_XY2 avg, 7 diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h index 47b0b8b825..5fae990a4f 100644 --- a/libavcodec/x86/hpeldsp.h +++ b/libavcodec/x86/hpeldsp.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -27,12 +27,27 @@ void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); + void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); #endif /* AVCODEC_X86_HPELDSP_H */ diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index 1cc3bacd15..8c0a0e9ab3 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -3,20 +3,20 @@ * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> @@ -40,6 +40,14 @@ void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, @@ -74,10 +82,12 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); #define avg_pixels8_mmx ff_avg_pixels8_mmx #define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx @@ -156,32 +166,49 @@ CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8) CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \ - CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) + CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \ + CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8) HPELDSP_AVG_PIXELS16(_3dnow) HPELDSP_AVG_PIXELS16(_mmxext) #endif /* HAVE_YASM */ +#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \ + if (HAVE_MMX_EXTERNAL) \ + c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; + +#if HAVE_MMX_INLINE #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ do { \ - c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ + SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \ c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \ } while (0) +#else +#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ + do { \ + SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \ + } while (0) +#endif static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags) { -#if HAVE_MMX_INLINE SET_HPEL_FUNCS(put, [0], 16, mmx); SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx); SET_HPEL_FUNCS(avg, [0], 16, mmx); SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx); SET_HPEL_FUNCS(put, [1], 8, mmx); SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx); - SET_HPEL_FUNCS(avg, [1], 
8, mmx); -#endif /* HAVE_MMX_INLINE */ + if (HAVE_MMX_EXTERNAL) { + c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx; + c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx; + } +#if HAVE_MMX_INLINE + c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx; + c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx; +#endif } static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags) @@ -193,6 +220,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags) c->avg_pixels_tab[0][0] = avg_pixels16_mmxext; c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext; c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; @@ -200,6 +228,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags) c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; + c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; if (!(flags & CODEC_FLAG_BITEXACT)) { c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext; @@ -207,11 +236,11 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags) c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; - c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; + c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext; + c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext; } - if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { + if (CONFIG_VP3_DECODER && flags & CODEC_FLAG_BITEXACT) { c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; } @@ -227,6 +256,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags) c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow; c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow; @@ -234,6 +264,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags) c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow; c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow; c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow; + c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow; if (!(flags & CODEC_FLAG_BITEXACT)){ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; @@ -241,11 +272,11 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags) c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow; c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; - c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow; + c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow; + c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow; } - if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { + if (CONFIG_VP3_DECODER && flags & CODEC_FLAG_BITEXACT) { c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow; c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow; } @@ -259,11 +290,27 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags) // these functions are slower 
than mmx on AMD, but faster on Intel c->put_pixels_tab[0][0] = ff_put_pixels16_sse2; c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2; + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2; c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; + c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2; + c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2; + c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2; } #endif /* HAVE_SSE2_EXTERNAL */ } +static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags, int cpu_flags) +{ +#if HAVE_SSSE3_EXTERNAL + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3; + c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3; + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3; + c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3; +#endif +} + av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) { int cpu_flags = av_get_cpu_flags(); @@ -279,4 +326,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) if (EXTERNAL_SSE2(cpu_flags)) hpeldsp_init_sse2(c, flags, cpu_flags); + + if (EXTERNAL_SSSE3(cpu_flags)) + hpeldsp_init_ssse3(c, flags, cpu_flags); } diff --git a/libavcodec/x86/hpeldsp_mmx.c b/libavcodec/x86/hpeldsp_mmx.c deleted file mode 100644 index c93c78e40e..0000000000 --- a/libavcodec/x86/hpeldsp_mmx.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * MMX-optimized avg/put pixel routines - * - * Copyright (c) 2001 Fabrice Bellard - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stddef.h> -#include <stdint.h> - -#include "config.h" -#include "hpeldsp.h" -#include "inline_asm.h" - -#if HAVE_MMX_INLINE - -void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq 1%1, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } while (--h); -} - -#endif /* HAVE_MMX_INLINE */ diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c index d854e8a2fc..8cbc412e23 100644 --- a/libavcodec/x86/hpeldsp_rnd_template.c +++ b/libavcodec/x86/hpeldsp_rnd_template.c @@ -7,20 +7,20 @@ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> * and improved by Zdenek Kabelac <kabi@users.sf.net> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -138,27 +138,28 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_ static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) { MOVQ_BFE(mm6); - JUMPALIGN(); - do { __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq 1%1, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%2), %%mm3 \n\t" PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - "movq 8%1, %%mm0 \n\t" - "movq 9%1, %%mm1 \n\t" - "movq 8%0, %%mm3 \n\t" + "movq %%mm0, (%2) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%2), %%mm3 \n\t" PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) + "movq %%mm0, 8(%2) \n\t" + "add %3, %1 \n\t" + "add %3, %2 \n\t" + "subl $1, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) :"memory"); - pixels += line_size; - block += line_size; - } while (--h); } static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm index 436abc8b75..85ee56dff2 100644 --- a/libavcodec/x86/huffyuvdsp.asm +++ b/libavcodec/x86/huffyuvdsp.asm @@ -1,28 +1,29 @@ ;****************************************************************************** ;* SIMD-optimized HuffYUV functions ;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2014 Christophe Gisquet ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** %include "libavutil/x86/x86util.asm" SECTION_RODATA -pb_f: times 16 db 15 +cextern pb_15 pb_zzzzzzzz77777777: times 8 db -1 pb_7: times 8 db 7 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 @@ -33,64 +34,72 @@ SECTION_TEXT ; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top, ; const uint8_t *diff, int w, ; int *left, int *left_top) -INIT_MMX mmxext -cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top - movq mm0, [topq] - movq mm2, mm0 - movd mm4, [left_topq] - psllq mm2, 8 - movq mm1, mm0 - por mm4, mm2 - movd mm3, [leftq] - psubb mm0, mm4 ; t-tl +%macro HFYU_MEDIAN 0 +cglobal add_hfyu_median_pred, 6,6,8, dst, top, diff, w, left, left_top + movu m0, [topq] + mova m2, m0 + movd m4, [left_topq] + LSHIFT m2, 1 + mova m1, m0 + por m4, m2 + movd m3, [leftq] + psubb m0, m4 ; t-tl add dstq, wq add topq, wq add diffq, wq neg wq jmp .skip .loop: - movq mm4, [topq+wq] - movq mm0, mm4 - psllq mm4, 8 - por mm4, mm1 - movq mm1, mm0 ; t - psubb mm0, mm4 ; t-tl + movu m4, [topq+wq] + mova m0, m4 + LSHIFT m4, 1 + por m4, m1 + mova m1, m0 ; t + psubb m0, m4 ; t-tl .skip: - movq mm2, [diffq+wq] + movu m2, [diffq+wq] %assign i 0 -%rep 8 - movq mm4, mm0 - paddb mm4, mm3 ; t-tl+l - movq mm5, mm3 - pmaxub mm3, mm1 - pminub mm5, mm1 - pminub mm3, mm4 - pmaxub mm3, mm5 ; median - paddb mm3, mm2 ; +residual +%rep mmsize + mova m4, m0 + paddb m4, m3 ; t-tl+l + mova m5, m3 + pmaxub m3, m1 + pminub m5, m1 + pminub m3, m4 + pmaxub m3, m5 ; median + paddb m3, m2 ; +residual %if i==0 - movq mm7, mm3 - psllq mm7, 56 + mova m7, m3 + LSHIFT m7, mmsize-1 %else - movq mm6, mm3 - psrlq mm7, 8 - psllq mm6, 56 - por mm7, mm6 + mova m6, m3 + RSHIFT m7, 1 + LSHIFT m6, mmsize-1 + por m7, m6 %endif -%if i<7 - psrlq mm0, 8 - psrlq mm1, 8 - psrlq mm2, 8 +%if i<mmsize-1 + RSHIFT m0, 1 + RSHIFT m1, 1 + RSHIFT m2, 1 %endif %assign i i+1 %endrep - movq [dstq+wq], mm7 - add wq, 8 + movu [dstq+wq], m7 + add wq, mmsize jl .loop movzx r2d, byte [dstq-1] mov [leftq], r2d movzx r2d, byte [topq-1] mov [left_topq], r2d RET +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmxext +HFYU_MEDIAN +%endif +INIT_XMM sse2 +HFYU_MEDIAN %macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned @@ -148,7 +157,7 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left INIT_XMM sse4 cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left - mova m5, [pb_f] + mova m5, [pb_15] mova m6, [pb_zzzzzzzz77777777] mova m4, [pb_zzzz3333zzzzbbbb] mova m3, [pb_zz11zz55zz99zzdd] @@ -163,3 +172,82 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left ADD_HFYU_LEFT_LOOP 0, 1 .src_unaligned: ADD_HFYU_LEFT_LOOP 0, 0 + +%macro ADD_BYTES 0 +cglobal add_bytes, 3,4,2, dst, src, w, size + mov sizeq, wq + and sizeq, -2*mmsize + jz .2 + add dstq, sizeq + add srcq, sizeq + neg sizeq +.1: + mova m0, [srcq + sizeq] + mova m1, [srcq + sizeq + mmsize] + paddb m0, [dstq + sizeq] + paddb m1, [dstq + sizeq + mmsize] + mova [dstq + sizeq], m0 + mova [dstq + sizeq + mmsize], m1 + add sizeq, 2*mmsize + jl .1 +.2: + and wq, 2*mmsize-1 + jz .end + add dstq, wq + add srcq, wq + neg wq +.3 + mov sizeb, [srcq + wq] + add [dstq + wq], sizeb + inc wq + jl .3 +.end: + REP_RET +%endmacro 
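+
+; ADD_BYTES in scalar form (a hedged sketch of the contract implied by the
+; loop above, not FFmpeg's C fallback): the main loop adds 2*mmsize bytes
+; per iteration with wraparound byte adds (paddb), and the .3 tail loop
+; finishes the remaining w % (2*mmsize) bytes one at a time:
+;
+;     for (int i = 0; i < w; i++)
+;         dst[i] += src[i];   /* 8-bit wraparound add */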
+ +%if ARCH_X86_32 +INIT_MMX mmx +ADD_BYTES +%endif +INIT_XMM sse2 +ADD_BYTES + +; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src, +; intptr_t w, uint8_t *left) +%macro LEFT_BGR32 0 +cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left + shl wq, 2 + movd m0, [leftq] + lea dstq, [dstq + wq] + lea srcq, [srcq + wq] + LSHIFT m0, mmsize-4 + neg wq +.loop: + movu m1, [srcq+wq] + mova m2, m1 +%if mmsize == 8 + punpckhdq m0, m0 +%endif + LSHIFT m1, 4 + paddb m1, m2 +%if mmsize == 16 + pshufd m0, m0, q3333 + mova m2, m1 + LSHIFT m1, 8 + paddb m1, m2 +%endif + paddb m0, m1 + movu [dstq+wq], m0 + add wq, mmsize + jl .loop + movd m0, [dstq-4] + movd [leftq], m0 + REP_RET +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +LEFT_BGR32 +%endif +INIT_XMM sse2 +LEFT_BGR32 diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c index 75537d7a4c..3ced3c0a1c 100644 --- a/libavcodec/x86/huffyuvdsp_init.c +++ b/libavcodec/x86/huffyuvdsp_init.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -25,20 +25,29 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/huffyuvdsp.h" +void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w); +void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w); + void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top, - const uint8_t *diff, int w, + const uint8_t *diff, intptr_t w, int *left, int *left_top); +void ff_add_hfyu_median_pred_sse2(uint8_t *dst, const uint8_t *top, + const uint8_t *diff, intptr_t w, + int *left, int *left_top); int ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src, - int w, int left); + intptr_t w, int left); int ff_add_hfyu_left_pred_sse4(uint8_t *dst, const uint8_t *src, - int w, int left); + intptr_t w, int left); -#if HAVE_INLINE_ASM +void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src, + intptr_t w, uint8_t *left); +void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src, + intptr_t w, uint8_t *left); -#if HAVE_7REGS +#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32 static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top, - const uint8_t *diff, int w, + const uint8_t *diff, intptr_t w, int *left, int *left_top) { x86_reg w2 = -w; @@ -72,56 +81,34 @@ static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top, *left = l; *left_top = tl; } -#endif /* HAVE_7REGS */ - -static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) -{ - x86_reg i = 0; - - __asm__ volatile ( - "jmp 2f \n\t" - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 
(%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, (%2, %0) \n\t" - "movq 8(%1, %0), %%mm0 \n\t" - "movq 8(%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "2: \n\t" - "cmp %3, %0 \n\t" - "js 1b \n\t" - : "+r" (i) - : "r" (src), "r" (dst), "r" ((x86_reg) w - 15)); - - for (; i < w; i++) - dst[i + 0] += src[i + 0]; -} - -#endif /* HAVE_INLINE_ASM */ +#endif av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c) { int cpu_flags = av_get_cpu_flags(); -#if HAVE_INLINE_ASM -#if HAVE_7REGS +#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32 if (cpu_flags & AV_CPU_FLAG_CMOV) c->add_hfyu_median_pred = add_hfyu_median_pred_cmov; -#endif /* HAVE_7REGS */ +#endif - if (INLINE_MMX(cpu_flags)) - c->add_bytes = add_bytes_mmx; -#endif /* HAVE_INLINE_ASM */ + if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) { + c->add_bytes = ff_add_bytes_mmx; + c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx; + } - if (EXTERNAL_MMXEXT(cpu_flags)) { + if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) { /* slower than cmov version on AMD */ if (!(cpu_flags & AV_CPU_FLAG_3DNOW)) c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext; } + if (EXTERNAL_SSE2(cpu_flags)) { + c->add_bytes = ff_add_bytes_sse2; + c->add_hfyu_median_pred = ff_add_hfyu_median_pred_sse2; + c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2; + } + if (EXTERNAL_SSSE3(cpu_flags)) { c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3; if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c index 8ffaced37d..63d8e3cc73 100644 --- a/libavcodec/x86/huffyuvencdsp_mmx.c +++ b/libavcodec/x86/huffyuvencdsp_mmx.c @@ -5,20 +5,20 @@ * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -31,10 +31,11 @@ #if HAVE_INLINE_ASM -static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w) +static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w) { x86_reg i = 0; + if (w >= 16) __asm__ volatile ( "1: \n\t" "movq (%2, %0), %%mm0 \n\t" diff --git a/libavcodec/x86/idctdsp.asm b/libavcodec/x86/idctdsp.asm new file mode 100644 index 0000000000..0aa73459e2 --- /dev/null +++ b/libavcodec/x86/idctdsp.asm @@ -0,0 +1,183 @@ +;****************************************************************************** +;* SIMD-optimized IDCT-related routines +;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2003-2013 Michael Niedermayer +;* Copyright (c) 2013 Daniel Kang +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pb_80 + +SECTION_TEXT + +;-------------------------------------------------------------------------- +;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size) +;-------------------------------------------------------------------------- + +%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1 + mova m1, [blockq+mmsize*0+%1] + mova m2, [blockq+mmsize*2+%1] +%if mmsize == 8 + mova m3, [blockq+mmsize*4+%1] + mova m4, [blockq+mmsize*6+%1] +%endif + packsswb m1, [blockq+mmsize*1+%1] + packsswb m2, [blockq+mmsize*3+%1] +%if mmsize == 8 + packsswb m3, [blockq+mmsize*5+%1] + packsswb m4, [blockq+mmsize*7+%1] +%endif + paddb m1, m0 + paddb m2, m0 +%if mmsize == 8 + paddb m3, m0 + paddb m4, m0 + movq [pixelsq+lsizeq*0], m1 + movq [pixelsq+lsizeq*1], m2 + movq [pixelsq+lsizeq*2], m3 + movq [pixelsq+lsize3q ], m4 +%else + movq [pixelsq+lsizeq*0], m1 + movhps [pixelsq+lsizeq*1], m1 + movq [pixelsq+lsizeq*2], m2 + movhps [pixelsq+lsize3q ], m2 +%endif +%endmacro + +%macro PUT_SIGNED_PIXELS_CLAMPED 1 +cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3 + mova m0, [pb_80] + lea lsize3q, [lsizeq*3] + PUT_SIGNED_PIXELS_CLAMPED_HALF 0 + lea pixelsq, [pixelsq+lsizeq*4] + PUT_SIGNED_PIXELS_CLAMPED_HALF 64 + RET +%endmacro + +INIT_MMX mmx +PUT_SIGNED_PIXELS_CLAMPED 0 +INIT_XMM sse2 +PUT_SIGNED_PIXELS_CLAMPED 3 + +;-------------------------------------------------------------------------- +; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size); +;-------------------------------------------------------------------------- +; %1 = block 
offset +%macro PUT_PIXELS_CLAMPED_HALF 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*2+%1] +%if mmsize == 8 + mova m2, [blockq+mmsize*4+%1] + mova m3, [blockq+mmsize*6+%1] +%endif + packuswb m0, [blockq+mmsize*1+%1] + packuswb m1, [blockq+mmsize*3+%1] +%if mmsize == 8 + packuswb m2, [blockq+mmsize*5+%1] + packuswb m3, [blockq+mmsize*7+%1] + movq [pixelsq], m0 + movq [lsizeq+pixelsq], m1 + movq [2*lsizeq+pixelsq], m2 + movq [lsize3q+pixelsq], m3 +%else + movq [pixelsq], m0 + movhps [lsizeq+pixelsq], m0 + movq [2*lsizeq+pixelsq], m1 + movhps [lsize3q+pixelsq], m1 +%endif +%endmacro + +%macro PUT_PIXELS_CLAMPED 0 +cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3 + lea lsize3q, [lsizeq*3] + PUT_PIXELS_CLAMPED_HALF 0 + lea pixelsq, [pixelsq+lsizeq*4] + PUT_PIXELS_CLAMPED_HALF 64 + RET +%endmacro + +INIT_MMX mmx +PUT_PIXELS_CLAMPED +INIT_XMM sse2 +PUT_PIXELS_CLAMPED + +;-------------------------------------------------------------------------- +; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size); +;-------------------------------------------------------------------------- +; %1 = block offset +%macro ADD_PIXELS_CLAMPED 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*1+%1] +%if mmsize == 8 + mova m5, [blockq+mmsize*2+%1] + mova m6, [blockq+mmsize*3+%1] +%endif + movq m2, [pixelsq] + movq m3, [pixelsq+lsizeq] +%if mmsize == 8 + mova m7, m2 + punpcklbw m2, m4 + punpckhbw m7, m4 + paddsw m0, m2 + paddsw m1, m7 + mova m7, m3 + punpcklbw m3, m4 + punpckhbw m7, m4 + paddsw m5, m3 + paddsw m6, m7 +%else + punpcklbw m2, m4 + punpcklbw m3, m4 + paddsw m0, m2 + paddsw m1, m3 +%endif + packuswb m0, m1 +%if mmsize == 8 + packuswb m5, m6 + movq [pixelsq], m0 + movq [pixelsq+lsizeq], m5 +%else + movq [pixelsq], m0 + movhps [pixelsq+lsizeq], m0 +%endif +%endmacro + +%macro ADD_PIXELS_CLAMPED 0 +cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize + pxor m4, m4 + ADD_PIXELS_CLAMPED 0 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 32 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 64 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 96 + RET +%endmacro + +INIT_MMX mmx +ADD_PIXELS_CLAMPED +INIT_XMM sse2 +ADD_PIXELS_CLAMPED diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h index 22df3dd758..daa4e798ed 100644 --- a/libavcodec/x86/idctdsp.h +++ b/libavcodec/x86/idctdsp.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -20,12 +20,19 @@ #define AVCODEC_X86_IDCTDSP_H #include <stdint.h> +#include <stddef.h> void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size); + ptrdiff_t line_size); +void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size); + ptrdiff_t line_size); +void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size); + ptrdiff_t line_size); +void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); #endif /* AVCODEC_X86_IDCTDSP_H */ diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c index 853c6a3661..2c26a98850 100644 --- a/libavcodec/x86/idctdsp_init.c +++ b/libavcodec/x86/idctdsp_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -64,12 +64,10 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, int cpu_flags = av_get_cpu_flags(); if (INLINE_MMX(cpu_flags)) { - c->put_pixels_clamped = ff_put_pixels_clamped_mmx; - c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; - c->add_pixels_clamped = ff_add_pixels_clamped_mmx; - if (!high_bit_depth && + avctx->lowres == 0 && (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { c->idct_put = ff_simple_idct_put_mmx; c->idct_add = ff_simple_idct_add_mmx; @@ -77,4 +75,14 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, c->perm_type = FF_IDCT_PERM_SIMPLE; } } + if (EXTERNAL_MMX(cpu_flags)) { + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; + c->put_pixels_clamped = ff_put_pixels_clamped_mmx; + c->add_pixels_clamped = ff_add_pixels_clamped_mmx; + } + if (EXTERNAL_SSE2(cpu_flags)) { + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2; + c->put_pixels_clamped = ff_put_pixels_clamped_sse2; + c->add_pixels_clamped = ff_add_pixels_clamped_sse2; + } } diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c deleted file mode 100644 index 7285b1d357..0000000000 --- a/libavcodec/x86/idctdsp_mmx.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * SIMD-optimized IDCT-related routines - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "idctdsp.h" -#include "inline_asm.h" - -#if HAVE_INLINE_ASM - -void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size) -{ - const int16_t *p; - uint8_t *pix; - - /* read the pixels */ - p = block; - pix = pixels; - /* unrolled loop */ - __asm__ volatile ( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - "movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3), - "r" (p) - : "memory"); - pix += line_size * 4; - p += 32; - - // if here would be an exact copy of the code above - // compiler would generate some very strange code - // thus using "r" - __asm__ volatile ( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - "movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3), - "r" (p) - : "memory"); -} - -#define put_signed_pixels_clamped_mmx_half(off) \ - "movq "#off"(%2), %%mm1 \n\t" \ - "movq 16 + "#off"(%2), %%mm2 \n\t" \ - "movq 32 + "#off"(%2), %%mm3 \n\t" \ - "movq 48 + "#off"(%2), %%mm4 \n\t" \ - "packsswb 8 + "#off"(%2), %%mm1 \n\t" \ - "packsswb 24 + "#off"(%2), %%mm2 \n\t" \ - "packsswb 40 + "#off"(%2), %%mm3 \n\t" \ - "packsswb 56 + "#off"(%2), %%mm4 \n\t" \ - "paddb %%mm0, %%mm1 \n\t" \ - "paddb %%mm0, %%mm2 \n\t" \ - "paddb %%mm0, %%mm3 \n\t" \ - "paddb %%mm0, %%mm4 \n\t" \ - "movq %%mm1, (%0) \n\t" \ - "movq %%mm2, (%0, %3) \n\t" \ - "movq %%mm3, (%0, %3, 2) \n\t" \ - "movq %%mm4, (%0, %1) \n\t" - -void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size) -{ - x86_reg line_skip = line_size; - x86_reg line_skip3; - - __asm__ volatile ( - "movq "MANGLE(ff_pb_80)", %%mm0 \n\t" - "lea (%3, %3, 2), %1 \n\t" - put_signed_pixels_clamped_mmx_half(0) - "lea (%0, %3, 4), %0 \n\t" - put_signed_pixels_clamped_mmx_half(64) - : "+&r" (pixels), "=&r" (line_skip3) - : "r" (block), "r" (line_skip) - : "memory"); -} - -void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size) -{ - const int16_t *p; - uint8_t *pix; - int i; - - /* read the pixels */ - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - i = 4; - do { - __asm__ volatile ( - "movq (%2), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "movq %0, %%mm4 \n\t" - "movq %1, %%mm6 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm4, %%mm0 \n\t" - "paddsw %%mm5, %%mm1 \n\t" - "movq %%mm6, 
%%mm5 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm6, %%mm2 \n\t" - "paddsw %%mm5, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %0 \n\t" - "movq %%mm2, %1 \n\t" - : "+m" (*pix), "+m" (*(pix + line_size)) - : "r" (p) - : "memory"); - pix += line_size * 2; - p += 16; - } while (--i); -} - -#endif /* HAVE_INLINE_ASM */ diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm index 633fcd9d59..ce30b42103 100644 --- a/libavcodec/x86/imdct36.asm +++ b/libavcodec/x86/imdct36.asm @@ -2,20 +2,20 @@ ;* 36 point SSE-optimized IMDCT transform ;* Copyright (c) 2011 Vitor Sessak ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -50,7 +50,7 @@ ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461 dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349 dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896 dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991 - dd 1.0, 0.70710678118654752439, 0.0, 0.0 + dd 1.0, -0.70710678118654752439, 0.0, 0.0 costabs: times 4 dd 0.98480773 times 4 dd 0.93969262 @@ -129,6 +129,19 @@ SECTION_TEXT %endif %endmacro +%macro BUTTERF2 3 +%if cpuflag(sse3) + mulps %1, %1, [ps_cosh_sse3 + %3] + PSHUFD %2, %1, 0xe1 + addsubps %1, %1, %2 +%else + mulps %1, [ps_cosh + %3] + PSHUFD %2, %1, 0xe1 + xorps %1, [ps_p1m1p1m1] + addps %1, %2 +%endif +%endmacro + %macro STORE 4 movhlps %2, %1 movss [%3 ], %1 @@ -279,11 +292,7 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win BUTTERF m7, m2, 16 BUTTERF m3, m6, 32 BUTTERF m4, m1, 48 - - mulps m5, m5, [ps_cosh + 64] - PSHUFD m1, m5, 0xe1 - xorps m5, m5, [ps_p1m1p1m1] - addps m5, m5, m1 + BUTTERF2 m5, m1, 64 ; permutates: ; m0 0 1 2 3 => 2 6 10 14 m1 @@ -358,8 +367,10 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win RET %endmacro +%if ARCH_X86_32 INIT_XMM sse DEFINE_IMDCT +%endif INIT_XMM sse2 DEFINE_IMDCT @@ -370,8 +381,10 @@ DEFINE_IMDCT INIT_XMM ssse3 DEFINE_IMDCT +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEFINE_IMDCT +%endif INIT_XMM sse @@ -716,5 +729,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp INIT_XMM sse DEFINE_FOUR_IMDCT +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEFINE_FOUR_IMDCT +%endif diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h index e4affabc87..3e65a76973 100644 --- a/libavcodec/x86/inline_asm.h +++ b/libavcodec/x86/inline_asm.h @@ -1,20 +1,20 @@ /* * inline assembly helper macros * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -37,7 +37,7 @@ "paddb %%"#regd", %%"#regd" \n\t" ::) #ifndef PIC -#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo)) +#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2)) #else // for shared library it's better to use this way for accessing constants // pcmpeqd -> -1 diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm new file mode 100644 index 0000000000..712a298610 --- /dev/null +++ b/libavcodec/x86/jpeg2000dsp.asm @@ -0,0 +1,144 @@ +;****************************************************************************** +;* SIMD-optimized JPEG2000 DSP functions +;* Copyright (c) 2014 Nicolas Bertrand +;* Copyright (c) 2015 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pf_ict0: times 8 dd 1.402 +pf_ict1: times 8 dd 0.34413 +pf_ict2: times 8 dd 0.71414 +pf_ict3: times 8 dd 1.772 + +SECTION .text + +;*********************************************************************** +; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize) +;*********************************************************************** +%macro ICT_FLOAT 1 +cglobal ict_float, 4, 4, %1, src0, src1, src2, csize + shl csized, 2 + add src0q, csizeq + add src1q, csizeq + add src2q, csizeq + neg csizeq + movaps m6, [pf_ict0] + movaps m7, [pf_ict1] + %define ICT0 m6 + %define ICT1 m7 + +%if ARCH_X86_64 + movaps m8, [pf_ict2] + %define ICT2 m8 +%if cpuflag(avx) + movaps m3, [pf_ict3] + %define ICT3 m3 +%else + movaps m9, [pf_ict3] + %define ICT3 m9 +%endif + +%else ; ARCH_X86_32 + %define ICT2 [pf_ict2] +%if cpuflag(avx) + movaps m3, [pf_ict3] + %define ICT3 m3 +%else + %define ICT3 [pf_ict3] +%endif + +%endif ; ARCH + +align 16 +.loop + movaps m0, [src0q+csizeq] + movaps m1, [src1q+csizeq] + movaps m2, [src2q+csizeq] + +%if cpuflag(avx) + mulps m5, m1, ICT1 + mulps m4, m2, ICT0 + mulps m1, m1, ICT3 + mulps m2, m2, ICT2 + subps m5, m0, m5 +%else ; sse + movaps m3, m1 + movaps m4, m2 + movaps m5, m0 + mulps m3, ICT1 + mulps m4, ICT0 + mulps m1, ICT3 + mulps m2, ICT2 + subps m5, m3 +%endif + addps m4, m4, m0 + addps m0, m0, m1 + subps m5, m5, m2 + + movaps [src0q+csizeq], m4 + movaps [src2q+csizeq], m0 + movaps [src1q+csizeq], m5 + add csizeq, mmsize + jl .loop + REP_RET +%endmacro + +INIT_XMM sse +ICT_FLOAT 10 +INIT_YMM avx +ICT_FLOAT 9 + +;*************************************************************************** +; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize) +;*************************************************************************** +%macro RCT_INT 0 +cglobal rct_int, 4, 4, 4, src0, src1, src2, csize + shl csized, 2 + add src0q, csizeq + add src1q, csizeq + add src2q, csizeq + neg csizeq + +align 16 +.loop: + mova m1, [src1q+csizeq] + mova m2, [src2q+csizeq] + mova m0, [src0q+csizeq] + paddd m3, m1, m2 + psrad m3, 2 + psubd m0, m3 + paddd m1, m0 + paddd m2, m0 + mova [src1q+csizeq], m0 + mova [src2q+csizeq], m1 + mova [src0q+csizeq], m2 + add csizeq, mmsize + jl .loop + REP_RET +%endmacro + +INIT_XMM sse2 +RCT_INT +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +RCT_INT +%endif diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c new file mode 100644 index 0000000000..0dbd2db7f5 --- /dev/null +++ b/libavcodec/x86/jpeg2000dsp_init.c @@ -0,0 +1,50 @@ +/* + * SIMD optimized JPEG 2000 DSP functions + * Copyright (c) 2015 James Almer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/jpeg2000dsp.h" + +void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize); +void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize); +void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize); +void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize); + +av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + if (EXTERNAL_SSE(cpu_flags)) { + c->mct_decode[FF_DWT97] = ff_ict_float_sse; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + c->mct_decode[FF_DWT53] = ff_rct_int_sse2; + } + + if (EXTERNAL_AVX_FAST(cpu_flags)) { + c->mct_decode[FF_DWT97] = ff_ict_float_avx; + } + + if (EXTERNAL_AVX2(cpu_flags)) { + c->mct_decode[FF_DWT53] = ff_rct_int_avx2; + } +} diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/lossless_audiodsp.asm index d721ebda6b..64b769f7d4 100644 --- a/libavcodec/x86/apedsp.asm +++ b/libavcodec/x86/lossless_audiodsp.asm @@ -1,20 +1,20 @@ ;****************************************************************************** ;* Copyright (c) 2008 Loren Merritt ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -58,14 +58,7 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul mova [v1q + orderq + mmsize], m3 add orderq, mmsize*2 jl .loop -%if mmsize == 16 - movhlps m0, m6 - paddd m6, m0 - pshuflw m0, m6, 0x4e -%else - pshufw m0, m6, 0x4e -%endif - paddd m6, m0 + HADDD m6, m0 movd eax, m6 RET %endmacro @@ -159,9 +152,6 @@ SCALARPRODUCT_LOOP 4 SCALARPRODUCT_LOOP 2 SCALARPRODUCT_LOOP 0 .end: - movhlps m0, m6 - paddd m6, m0 - pshuflw m0, m6, 0x4e - paddd m6, m0 + HADDD m6, m0 movd eax, m6 RET diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/lossless_audiodsp_init.c index f692c2b9b6..197173caf4 100644 --- a/libavcodec/x86/apedsp_init.c +++ b/libavcodec/x86/lossless_audiodsp_init.c @@ -1,25 +1,25 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" -#include "libavcodec/apedsp.h" +#include "libavcodec/lossless_audiodsp.h" int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2, const int16_t *v3, @@ -31,8 +31,9 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); -av_cold void ff_apedsp_init_x86(APEDSPContext *c) +av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c) { +#if HAVE_YASM int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMXEXT(cpu_flags)) @@ -44,4 +45,5 @@ av_cold void ff_apedsp_init_x86(APEDSPContext *c) if (EXTERNAL_SSSE3(cpu_flags) && !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; +#endif } diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm new file mode 100644 index 0000000000..e6c23e7985 --- /dev/null +++ b/libavcodec/x86/lossless_videodsp.asm @@ -0,0 +1,294 @@ +;****************************************************************************** +;* SIMD lossless video DSP utils +;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2014 Michael Niedermayer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pb_ef: times 8 db 14,15 +pb_67: times 8 db 6, 7 +pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11 +pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7 + +SECTION_TEXT + +%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub + movd m4, maskd + SPLATW m4, m4 + add wd, wd + test wq, 2*mmsize - 1 + jz %%.tomainloop + push tmpq +%%.wordloop: + sub wq, 2 +%ifidn %2, add + mov tmpw, [srcq+wq] + add tmpw, [dstq+wq] +%else + mov tmpw, [src1q+wq] + sub tmpw, [src2q+wq] +%endif + and tmpw, maskw + mov [dstq+wq], tmpw + test wq, 2*mmsize - 1 + jnz %%.wordloop + pop tmpq +%%.tomainloop: +%ifidn %2, add + add srcq, wq +%else + add src1q, wq + add src2q, wq +%endif + add dstq, wq + neg wq + jz %%.end +%%.loop: +%ifidn %2, add + mov%1 m0, [srcq+wq] + mov%1 m1, [dstq+wq] + mov%1 m2, [srcq+wq+mmsize] + mov%1 m3, [dstq+wq+mmsize] +%else + mov%1 m0, [src1q+wq] + mov%1 m1, [src2q+wq] + mov%1 m2, [src1q+wq+mmsize] + mov%1 m3, [src2q+wq+mmsize] +%endif + p%2w m0, m1 + p%2w m2, m3 + pand m0, m4 + pand m2, m4 + mov%1 [dstq+wq] , m0 + mov%1 [dstq+wq+mmsize], m2 + add wq, 2*mmsize + jl %%.loop +%%.end: + RET +%endmacro + +INIT_MMX mmx +cglobal add_int16, 4,4,5, dst, src, mask, w, tmp + INT16_LOOP a, add + +INIT_XMM sse2 +cglobal add_int16, 4,4,5, dst, src, mask, w, tmp + test srcq, mmsize-1 + jnz .unaligned + test dstq, mmsize-1 + jnz .unaligned + INT16_LOOP a, add +.unaligned: + INT16_LOOP u, add + +INIT_MMX mmx +cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp + INT16_LOOP a, sub + +INIT_XMM sse2 +cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp + test src1q, mmsize-1 + jnz .unaligned + test src2q, mmsize-1 + jnz .unaligned + test dstq, mmsize-1 + jnz .unaligned + INT16_LOOP a, sub +.unaligned: + INT16_LOOP u, sub + + +%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u) + add wd, wd + add srcq, wq + add dstq, wq + neg wq +%%.loop: + mov%2 m1, [srcq+wq] + mova m2, m1 + pslld m1, 16 + paddw m1, m2 + mova m2, m1 + + pshufb m1, m3 + paddw m1, m2 + pshufb m0, m5 +%if mmsize == 16 + mova m2, m1 + pshufb m1, m4 + paddw m1, m2 +%endif + paddw m0, m1 + pand m0, m7 +%ifidn %1, a + mova [dstq+wq], m0 +%else + movq [dstq+wq], m0 + movhps [dstq+wq+8], m0 +%endif + add wq, mmsize + jl %%.loop + mov eax, mmsize-1 + sub eax, wd + mov wd, eax + shl wd, 8 + lea eax, [wd+eax-1] + movd m1, eax + pshufb m0, m1 + movd eax, m0 + RET +%endmacro + +; int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left) +INIT_MMX ssse3 +cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left +.skip_prologue: + mova m5, [pb_67] + mova m3, [pb_zzzz2323zzzzabab] + movd m0, leftm + psllq m0, 48 + movd m7, maskm + SPLATW m7 ,m7 + ADD_HFYU_LEFT_LOOP_INT16 a, a + +INIT_XMM sse4 +cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left + mova m5, [pb_ef] + mova m4, [pb_zzzzzzzz67676767] + mova m3, [pb_zzzz2323zzzzabab] + movd m0, leftm + pslldq m0, 14 + movd m7, maskm + SPLATW m7 ,m7 + test srcq, 15 + jnz .src_unaligned + test dstq, 15 + jnz .dst_unaligned + ADD_HFYU_LEFT_LOOP_INT16 a, a +.dst_unaligned: + ADD_HFYU_LEFT_LOOP_INT16 u, a 
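+
+; add_hfyu_left_pred_int16 in scalar form (a hedged sketch of the contract,
+; not FFmpeg's C fallback): each aligned/unaligned variant above computes a
+; running left-prediction sum over 16-bit samples, clipped to the sample
+; mask, and returns the final accumulator in eax:
+;
+;     unsigned acc = left;
+;     for (int i = 0; i < w; i++) {
+;         acc = (acc + src[i]) & mask;
+;         dst[i] = acc;
+;     }
+;     return acc;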
+.src_unaligned: + ADD_HFYU_LEFT_LOOP_INT16 u, u + +; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int mask, int w, int *left, int *left_top) +INIT_MMX mmxext +cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top + add wd, wd + movd mm6, maskd + SPLATW mm6, mm6 + movq mm0, [topq] + movq mm2, mm0 + movd mm4, [left_topq] + psllq mm2, 16 + movq mm1, mm0 + por mm4, mm2 + movd mm3, [leftq] + psubw mm0, mm4 ; t-tl + add dstq, wq + add topq, wq + add diffq, wq + neg wq + jmp .skip +.loop: + movq mm4, [topq+wq] + movq mm0, mm4 + psllq mm4, 16 + por mm4, mm1 + movq mm1, mm0 ; t + psubw mm0, mm4 ; t-tl +.skip: + movq mm2, [diffq+wq] +%assign i 0 +%rep 4 + movq mm4, mm0 + paddw mm4, mm3 ; t-tl+l + pand mm4, mm6 + movq mm5, mm3 + pmaxsw mm3, mm1 + pminsw mm5, mm1 + pminsw mm3, mm4 + pmaxsw mm3, mm5 ; median + paddw mm3, mm2 ; +residual + pand mm3, mm6 +%if i==0 + movq mm7, mm3 + psllq mm7, 48 +%else + movq mm4, mm3 + psrlq mm7, 16 + psllq mm4, 48 + por mm7, mm4 +%endif +%if i<3 + psrlq mm0, 16 + psrlq mm1, 16 + psrlq mm2, 16 +%endif +%assign i i+1 +%endrep + movq [dstq+wq], mm7 + add wq, 8 + jl .loop + movzx r2d, word [dstq-2] + mov [leftq], r2d + movzx r2d, word [topq-2] + mov [left_topq], r2d + RET + +cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top + add wd, wd + movd mm7, maskd + SPLATW mm7, mm7 + movq mm0, [src1q] + movq mm2, [src2q] + psllq mm0, 16 + psllq mm2, 16 + movd mm6, [left_topq] + por mm0, mm6 + movd mm6, [leftq] + por mm2, mm6 + xor maskq, maskq +.loop: + movq mm1, [src1q + maskq] + movq mm3, [src2q + maskq] + movq mm4, mm2 + psubw mm2, mm0 + paddw mm2, mm1 + pand mm2, mm7 + movq mm5, mm4 + pmaxsw mm4, mm1 + pminsw mm1, mm5 + pminsw mm4, mm2 + pmaxsw mm4, mm1 + psubw mm3, mm4 + pand mm3, mm7 + movq [dstq + maskq], mm3 + add maskq, 8 + movq mm0, [src1q + maskq - 2] + movq mm2, [src2q + maskq - 2] + cmp maskq, wq + jb .loop + movzx maskd, word [src1q + wq - 2] + mov [left_topq], maskd + movzx maskd, word [src2q + wq - 2] + mov [leftq], maskd + RET diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c new file mode 100644 index 0000000000..6589024a1a --- /dev/null +++ b/libavcodec/x86/lossless_videodsp_init.c @@ -0,0 +1,62 @@ +/* + * Lossless video DSP utils + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "../lossless_videodsp.h" +#include "libavutil/pixdesc.h" +#include "libavutil/x86/cpu.h" + +void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w); +void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w); +void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w); +void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w); +int ff_add_hfyu_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc); +int ff_add_hfyu_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc); +void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top); +void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); + + +void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx) +{ + int cpu_flags = av_get_cpu_flags(); + const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt); + + if (EXTERNAL_MMX(cpu_flags)) { + c->add_int16 = ff_add_int16_mmx; + c->diff_int16 = ff_diff_int16_mmx; + } + + if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc->comp[0].depth_minus1<15) { + c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext; + c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + c->add_int16 = ff_add_int16_sse2; + c->diff_int16 = ff_diff_int16_sse2; + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + c->add_hfyu_left_pred_int16 = ff_add_hfyu_left_pred_int16_ssse3; + } + + if (EXTERNAL_SSE4(cpu_flags)) { + c->add_hfyu_left_pred_int16 = ff_add_hfyu_left_pred_int16_sse4; + } +} diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c index ea5d2eab56..3a9493f728 100644 --- a/libavcodec/x86/lpc.c +++ b/libavcodec/x86/lpc.c @@ -2,26 +2,25 @@ * SIMD-optimized LPC functions * Copyright (c) 2007 Loren Merritt * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" @@ -73,6 +72,7 @@ static void lpc_apply_welch_window_sse2(const int32_t *data, int len, "3: \n\t" :"+&r"(i), "+&r"(j) :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len) + NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2) XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm5", "%xmm6", "%xmm7") ); @@ -117,6 +117,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, "movsd %%xmm2, 16(%1) \n\t" :"+&r"(i) :"r"(autoc+j), "r"(data+len), "r"(data+len-j) + NAMED_CONSTRAINTS_ARRAY_ADD(pd_1) :"memory" ); } else { @@ -140,6 +141,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, "movsd %%xmm1, %2 \n\t" :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]) :"r"(data+len), "r"(data+len-j) + NAMED_CONSTRAINTS_ARRAY_ADD(pd_1) ); } } @@ -152,7 +154,7 @@ av_cold void ff_lpc_init_x86(LPCContext *c) #if HAVE_SSE2_INLINE int cpu_flags = av_get_cpu_flags(); - if (INLINE_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { + if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) { c->lpc_apply_welch_window = lpc_apply_welch_window_sse2; c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; } diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h index 2c04d9d1bd..6298f5ed19 100644 --- a/libavcodec/x86/mathops.h +++ b/libavcodec/x86/mathops.h @@ -2,20 +2,20 @@ * simple math operations * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -105,7 +105,7 @@ __asm__ volatile(\ #endif /* HAVE_I686 */ #define MASK_ABS(mask, level) \ - __asm__ ("cltd \n\t" \ + __asm__ ("cdq \n\t" \ "xorl %1, %0 \n\t" \ "subl %1, %0 \n\t" \ : "+a"(level), "=&d"(mask)) diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm index 1a87f37b39..0160dc348f 100644 --- a/libavcodec/x86/me_cmp.asm +++ b/libavcodec/x86/me_cmp.asm @@ -4,25 +4,30 @@ ;* Copyright (c) 2000, 2001 Fabrice Bellard ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;***************************************************************************** %include "libavutil/x86/x86util.asm" +SECTION_RODATA + +cextern pb_1 +cextern pb_80 + SECTION .text %macro DIFF_PIXELS_1 4 @@ -210,7 +215,7 @@ hadamard8_16_wrapper %1, 3 %elif cpuflag(mmx) ALIGN 16 ; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, -; uint8_t *src2, int stride, int h) +; uint8_t *src2, ptrdiff_t stride, int h) ; r0 = void *s = unused, int h = unused (always 8) ; note how r1, r2 and r3 are not clobbered in this function, so 16x16 ; can simply call this 2x2x (and that's why we access rsp+gprsize @@ -274,19 +279,27 @@ INIT_XMM ssse3 %define ABS_SUM_8x8 ABS_SUM_8x8_64 HADAMARD8_DIFF 9 -INIT_XMM sse2 -; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, -; int line_size, int h); -cglobal sse16, 5, 5, 8 - shr r4d, 1 +; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, +; ptrdiff_t line_size, int h) + +%macro SUM_SQUARED_ERRORS 1 +cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h +%if %1 == mmsize + shr hd, 1 +%endif pxor m0, m0 ; mm0 = 0 pxor m7, m7 ; mm7 holds the sum .next2lines: ; FIXME why are these unaligned movs? 
pix1[] is aligned - movu m1, [r1 ] ; mm1 = pix1[0][0-15] - movu m2, [r2 ] ; mm2 = pix2[0][0-15] - movu m3, [r1+r3] ; mm3 = pix1[1][0-15] - movu m4, [r2+r3] ; mm4 = pix2[1][0-15] + movu m1, [pix1q] ; m1 = pix1[0][0-15], [0-7] for mmx + movu m2, [pix2q] ; m2 = pix2[0][0-15], [0-7] for mmx +%if %1 == mmsize + movu m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx + movu m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx +%else ; %1 / 2 == mmsize; mmx only + mova m3, [pix1q+8] ; m3 = pix1[0][8-15] + mova m4, [pix2q+8] ; m4 = pix2[0][8-15] +%endif ; todo: mm1-mm2, mm3-mm4 ; algo: subtract mm1 from mm2 with saturation and vice versa @@ -315,22 +328,607 @@ cglobal sse16, 5, 5, 8 pmaddwd m1, m1 pmaddwd m3, m3 - lea r1, [r1+r3*2] ; pix1 += 2*line_size - lea r2, [r2+r3*2] ; pix2 += 2*line_size - paddd m1, m2 paddd m3, m4 paddd m7, m1 paddd m7, m3 - dec r4 +%if %1 == mmsize + lea pix1q, [pix1q + 2*lsizeq] + lea pix2q, [pix2q + 2*lsizeq] +%else + add pix1q, lsizeq + add pix2q, lsizeq +%endif + dec hd jnz .next2lines - mova m1, m7 - psrldq m7, 8 ; shift hi qword to lo - paddd m7, m1 - mova m1, m7 - psrldq m7, 4 ; shift hi dword to lo - paddd m7, m1 + HADDD m7, m1 movd eax, m7 ; return value RET +%endmacro + +INIT_MMX mmx +SUM_SQUARED_ERRORS 8 + +INIT_MMX mmx +SUM_SQUARED_ERRORS 16 + +INIT_XMM sse2 +SUM_SQUARED_ERRORS 16 + +;----------------------------------------------- +;int ff_sum_abs_dctelem(int16_t *block) +;----------------------------------------------- +; %1 = number of xmm registers used +; %2 = number of inline loops + +%macro SUM_ABS_DCTELEM 2 +cglobal sum_abs_dctelem, 1, 1, %1, block + pxor m0, m0 + pxor m1, m1 +%assign %%i 0 +%rep %2 + mova m2, [blockq+mmsize*(0+%%i)] + mova m3, [blockq+mmsize*(1+%%i)] + mova m4, [blockq+mmsize*(2+%%i)] + mova m5, [blockq+mmsize*(3+%%i)] + ABS1_SUM m2, m6, m0 + ABS1_SUM m3, m6, m1 + ABS1_SUM m4, m6, m0 + ABS1_SUM m5, m6, m1 +%assign %%i %%i+4 +%endrep + paddusw m0, m1 + HSUM m0, m1, eax + and eax, 0xFFFF + RET +%endmacro + +INIT_MMX mmx +SUM_ABS_DCTELEM 0, 4 +INIT_MMX mmxext +SUM_ABS_DCTELEM 0, 4 +INIT_XMM sse2 +SUM_ABS_DCTELEM 7, 2 +INIT_XMM ssse3 +SUM_ABS_DCTELEM 6, 2 + +;------------------------------------------------------------------------------ +; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h) +;------------------------------------------------------------------------------ +; %1 = 8/16. 
%2-5=m# +%macro HF_NOISE_PART1 5 + mova m%2, [pix1q] +%if %1 == 8 + mova m%3, m%2 + psllq m%2, 8 + psrlq m%3, 8 + psrlq m%2, 8 +%else + mova m%3, [pix1q+1] +%endif + mova m%4, m%2 + mova m%5, m%3 + punpcklbw m%2, m7 + punpcklbw m%3, m7 + punpckhbw m%4, m7 + punpckhbw m%5, m7 + psubw m%2, m%3 + psubw m%4, m%5 +%endmacro + +; %1-2 = m# +%macro HF_NOISE_PART2 4 + psubw m%1, m%3 + psubw m%2, m%4 + pxor m3, m3 + pxor m1, m1 + pcmpgtw m3, m%1 + pcmpgtw m1, m%2 + pxor m%1, m3 + pxor m%2, m1 + psubw m%1, m3 + psubw m%2, m1 + paddw m%2, m%1 + paddw m6, m%2 +%endmacro + +; %1 = 8/16 +%macro HF_NOISE 1 +cglobal hf_noise%1, 3,3,0, pix1, lsize, h + sub hd, 2 + pxor m7, m7 + pxor m6, m6 + HF_NOISE_PART1 %1, 0, 1, 2, 3 + add pix1q, lsizeq + HF_NOISE_PART1 %1, 4, 1, 5, 3 + HF_NOISE_PART2 0, 2, 4, 5 + add pix1q, lsizeq +.loop: + HF_NOISE_PART1 %1, 0, 1, 2, 3 + HF_NOISE_PART2 4, 5, 0, 2 + add pix1q, lsizeq + HF_NOISE_PART1 %1, 4, 1, 5, 3 + HF_NOISE_PART2 0, 2, 4, 5 + add pix1q, lsizeq + sub hd, 2 + jne .loop + + mova m0, m6 + punpcklwd m0, m7 + punpckhwd m6, m7 + paddd m6, m0 + mova m0, m6 + psrlq m6, 32 + paddd m0, m6 + movd eax, m0 ; eax = result of hf_noise8; + REP_RET ; return eax; +%endmacro + +INIT_MMX mmx +HF_NOISE 8 +HF_NOISE 16 + +;--------------------------------------------------------------------------------------- +;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h); +;--------------------------------------------------------------------------------------- +;%1 = 8/16 +%macro SAD 1 +cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h + movu m2, [pix2q] + movu m1, [pix2q+strideq] + psadbw m2, [pix1q] + psadbw m1, [pix1q+strideq] + paddw m2, m1 +%if %1 != mmsize + movu m0, [pix2q+8] + movu m1, [pix2q+strideq+8] + psadbw m0, [pix1q+8] + psadbw m1, [pix1q+strideq+8] + paddw m2, m0 + paddw m2, m1 +%endif + sub hd, 2 + +align 16 +.loop: + lea pix1q, [pix1q+strideq*2] + lea pix2q, [pix2q+strideq*2] + movu m0, [pix2q] + movu m1, [pix2q+strideq] + psadbw m0, [pix1q] + psadbw m1, [pix1q+strideq] + paddw m2, m0 + paddw m2, m1 +%if %1 != mmsize + movu m0, [pix2q+8] + movu m1, [pix2q+strideq+8] + psadbw m0, [pix1q+8] + psadbw m1, [pix1q+strideq+8] + paddw m2, m0 + paddw m2, m1 +%endif + sub hd, 2 + jg .loop +%if mmsize == 16 + movhlps m0, m2 + paddw m2, m0 +%endif + movd eax, m2 + RET +%endmacro + +INIT_MMX mmxext +SAD 8 +SAD 16 +INIT_XMM sse2 +SAD 16 + +;------------------------------------------------------------------------------------------ +;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h); +;------------------------------------------------------------------------------------------ +;%1 = 8/16 +%macro SAD_X2 1 +cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h + movu m0, [pix2q] + movu m2, [pix2q+strideq] +%if mmsize == 16 + movu m3, [pix2q+1] + movu m4, [pix2q+strideq+1] + pavgb m0, m3 + pavgb m2, m4 +%else + pavgb m0, [pix2q+1] + pavgb m2, [pix2q+strideq+1] +%endif + psadbw m0, [pix1q] + psadbw m2, [pix1q+strideq] + paddw m0, m2 +%if %1 != mmsize + movu m1, [pix2q+8] + movu m2, [pix2q+strideq+8] + pavgb m1, [pix2q+9] + pavgb m2, [pix2q+strideq+9] + psadbw m1, [pix1q+8] + psadbw m2, [pix1q+strideq+8] + paddw m0, m1 + paddw m0, m2 +%endif + sub hd, 2 + +align 16 +.loop: + lea pix1q, [pix1q+2*strideq] + lea pix2q, [pix2q+2*strideq] + movu m1, [pix2q] + movu m2, [pix2q+strideq] +%if mmsize == 16 + movu m3, [pix2q+1] + movu m4, [pix2q+strideq+1] + pavgb m1, m3 + pavgb m2, m4 +%else + pavgb m1, [pix2q+1] + pavgb m2, 
[pix2q+strideq+1] +%endif + psadbw m1, [pix1q] + psadbw m2, [pix1q+strideq] + paddw m0, m1 + paddw m0, m2 +%if %1 != mmsize + movu m1, [pix2q+8] + movu m2, [pix2q+strideq+8] + pavgb m1, [pix2q+9] + pavgb m2, [pix2q+strideq+9] + psadbw m1, [pix1q+8] + psadbw m2, [pix1q+strideq+8] + paddw m0, m1 + paddw m0, m2 +%endif + sub hd, 2 + jg .loop +%if mmsize == 16 + movhlps m1, m0 + paddw m0, m1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_MMX mmxext +SAD_X2 8 +SAD_X2 16 +INIT_XMM sse2 +SAD_X2 16 + +;------------------------------------------------------------------------------------------ +;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h); +;------------------------------------------------------------------------------------------ +;%1 = 8/16 +%macro SAD_Y2 1 +cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h + movu m1, [pix2q] + movu m0, [pix2q+strideq] + movu m3, [pix2q+2*strideq] + pavgb m1, m0 + pavgb m0, m3 + psadbw m1, [pix1q] + psadbw m0, [pix1q+strideq] + paddw m0, m1 + mova m1, m3 +%if %1 != mmsize + movu m4, [pix2q+8] + movu m5, [pix2q+strideq+8] + movu m6, [pix2q+2*strideq+8] + pavgb m4, m5 + pavgb m5, m6 + psadbw m4, [pix1q+8] + psadbw m5, [pix1q+strideq+8] + paddw m0, m4 + paddw m0, m5 + mova m4, m6 +%endif + add pix2q, strideq + sub hd, 2 + +align 16 +.loop: + lea pix1q, [pix1q+2*strideq] + lea pix2q, [pix2q+2*strideq] + movu m2, [pix2q] + movu m3, [pix2q+strideq] + pavgb m1, m2 + pavgb m2, m3 + psadbw m1, [pix1q] + psadbw m2, [pix1q+strideq] + paddw m0, m1 + paddw m0, m2 + mova m1, m3 +%if %1 != mmsize + movu m5, [pix2q+8] + movu m6, [pix2q+strideq+8] + pavgb m4, m5 + pavgb m5, m6 + psadbw m4, [pix1q+8] + psadbw m5, [pix1q+strideq+8] + paddw m0, m4 + paddw m0, m5 + mova m4, m6 +%endif + sub hd, 2 + jg .loop +%if mmsize == 16 + movhlps m1, m0 + paddw m0, m1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_MMX mmxext +SAD_Y2 8 +SAD_Y2 16 +INIT_XMM sse2 +SAD_Y2 16 + +;------------------------------------------------------------------------------------------- +;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h); +;------------------------------------------------------------------------------------------- +;%1 = 8/16 +%macro SAD_APPROX_XY2 1 +cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h + mova m4, [pb_1] + movu m1, [pix2q] + movu m0, [pix2q+strideq] + movu m3, [pix2q+2*strideq] +%if mmsize == 16 + movu m5, [pix2q+1] + movu m6, [pix2q+strideq+1] + movu m2, [pix2q+2*strideq+1] + pavgb m1, m5 + pavgb m0, m6 + pavgb m3, m2 +%else + pavgb m1, [pix2q+1] + pavgb m0, [pix2q+strideq+1] + pavgb m3, [pix2q+2*strideq+1] +%endif + psubusb m0, m4 + pavgb m1, m0 + pavgb m0, m3 + psadbw m1, [pix1q] + psadbw m0, [pix1q+strideq] + paddw m0, m1 + mova m1, m3 +%if %1 != mmsize + movu m5, [pix2q+8] + movu m6, [pix2q+strideq+8] + movu m7, [pix2q+2*strideq+8] + pavgb m5, [pix2q+1+8] + pavgb m6, [pix2q+strideq+1+8] + pavgb m7, [pix2q+2*strideq+1+8] + psubusb m6, m4 + pavgb m5, m6 + pavgb m6, m7 + psadbw m5, [pix1q+8] + psadbw m6, [pix1q+strideq+8] + paddw m0, m5 + paddw m0, m6 + mova m5, m7 +%endif + add pix2q, strideq + sub hd, 2 + +align 16 +.loop: + lea pix1q, [pix1q+2*strideq] + lea pix2q, [pix2q+2*strideq] + movu m2, [pix2q] + movu m3, [pix2q+strideq] +%if mmsize == 16 + movu m5, [pix2q+1] + movu m6, [pix2q+strideq+1] + pavgb m2, m5 + pavgb m3, m6 +%else + pavgb m2, [pix2q+1] + pavgb m3, [pix2q+strideq+1] +%endif + psubusb m2, m4 + pavgb m1, m2 + pavgb m2, m3 + psadbw m1, [pix1q] + psadbw m2, 
[pix1q+strideq] + paddw m0, m1 + paddw m0, m2 + mova m1, m3 +%if %1 != mmsize + movu m6, [pix2q+8] + movu m7, [pix2q+strideq+8] + pavgb m6, [pix2q+8+1] + pavgb m7, [pix2q+strideq+8+1] + psubusb m6, m4 + pavgb m5, m6 + pavgb m6, m7 + psadbw m5, [pix1q+8] + psadbw m6, [pix1q+strideq+8] + paddw m0, m5 + paddw m0, m6 + mova m5, m7 +%endif + sub hd, 2 + jg .loop +%if mmsize == 16 + movhlps m1, m0 + paddw m0, m1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_MMX mmxext +SAD_APPROX_XY2 8 +SAD_APPROX_XY2 16 +INIT_XMM sse2 +SAD_APPROX_XY2 16 + +;-------------------------------------------------------------------- +;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, +; ptrdiff_t line_size, int h); +;-------------------------------------------------------------------- +; %1 = 8/16 +%macro VSAD_INTRA 1 +cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h + mova m0, [pix1q] +%if %1 == mmsize + mova m2, [pix1q+lsizeq] + psadbw m0, m2 +%else + mova m2, [pix1q+lsizeq] + mova m3, [pix1q+8] + mova m4, [pix1q+lsizeq+8] + psadbw m0, m2 + psadbw m3, m4 + paddw m0, m3 +%endif + sub hd, 2 + +.loop + lea pix1q, [pix1q + 2*lsizeq] +%if %1 == mmsize + mova m1, [pix1q] + psadbw m2, m1 + paddw m0, m2 + mova m2, [pix1q+lsizeq] + psadbw m1, m2 + paddw m0, m1 +%else + mova m1, [pix1q] + mova m3, [pix1q+8] + psadbw m2, m1 + psadbw m4, m3 + paddw m0, m2 + paddw m0, m4 + mova m2, [pix1q+lsizeq] + mova m4, [pix1q+lsizeq+8] + psadbw m1, m2 + psadbw m3, m4 + paddw m0, m1 + paddw m0, m3 +%endif + sub hd, 2 + jg .loop + +%if mmsize == 16 + pshufd m1, m0, 0xe + paddd m0, m1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_MMX mmxext +VSAD_INTRA 8 +VSAD_INTRA 16 +INIT_XMM sse2 +VSAD_INTRA 16 + +;--------------------------------------------------------------------- +;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, +; ptrdiff_t line_size, int h); +;--------------------------------------------------------------------- +; %1 = 8/16 +%macro VSAD_APPROX 1 +cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h + mova m1, [pb_80] + mova m0, [pix1q] +%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2 + mova m4, [pix1q+lsizeq] +%if mmsize == 16 + movu m3, [pix2q] + movu m2, [pix2q+lsizeq] + psubb m0, m3 + psubb m4, m2 +%else + psubb m0, [pix2q] + psubb m4, [pix2q+lsizeq] +%endif + pxor m0, m1 + pxor m4, m1 + psadbw m0, m4 +%else ; vsad16_mmxext + mova m3, [pix1q+8] + psubb m0, [pix2q] + psubb m3, [pix2q+8] + pxor m0, m1 + pxor m3, m1 + mova m4, [pix1q+lsizeq] + mova m5, [pix1q+lsizeq+8] + psubb m4, [pix2q+lsizeq] + psubb m5, [pix2q+lsizeq+8] + pxor m4, m1 + pxor m5, m1 + psadbw m0, m4 + psadbw m3, m5 + paddw m0, m3 +%endif + sub hd, 2 + +.loop + lea pix1q, [pix1q + 2*lsizeq] + lea pix2q, [pix2q + 2*lsizeq] + mova m2, [pix1q] +%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2 +%if mmsize == 16 + movu m3, [pix2q] + psubb m2, m3 +%else + psubb m2, [pix2q] +%endif + pxor m2, m1 + psadbw m4, m2 + paddw m0, m4 + mova m4, [pix1q+lsizeq] + movu m3, [pix2q+lsizeq] + psubb m4, m3 + pxor m4, m1 + psadbw m2, m4 + paddw m0, m2 +%else ; vsad16_mmxext + mova m3, [pix1q+8] + psubb m2, [pix2q] + psubb m3, [pix2q+8] + pxor m2, m1 + pxor m3, m1 + psadbw m4, m2 + psadbw m5, m3 + paddw m0, m4 + paddw m0, m5 + mova m4, [pix1q+lsizeq] + mova m5, [pix1q+lsizeq+8] + psubb m4, [pix2q+lsizeq] + psubb m5, [pix2q+lsizeq+8] + pxor m4, m1 + pxor m5, m1 + psadbw m2, m4 + psadbw m3, m5 + paddw m0, m2 + paddw m0, m3 +%endif + sub hd, 2 + jg .loop + +%if mmsize == 16 + pshufd m1, m0, 0xe + paddd m0, m1 +%endif + movd eax, m0 + RET +%endmacro + 
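+; Annotation (illustrative sketch, not from the patch): the pb_80 XOR above
+; is what lets psadbw work on signed data. psadbw sums absolute differences
+; of unsigned bytes only, so the wrapped signed difference produced by psubb
+; is biased by 0x80 into unsigned range first. In rough scalar C, one
+; vsad16_approx call computes ("approx" because psubb wraps instead of
+; saturating):
+;
+;     int vsad16_approx_ref(const uint8_t *pix1, const uint8_t *pix2,
+;                           ptrdiff_t stride, int h)
+;     {
+;         int sum = 0;
+;         for (int y = 0; y < h - 1; y++) {
+;             for (int x = 0; x < 16; x++) {
+;                 int8_t d0 = pix1[x]          - pix2[x];  /* wraps like psubb */
+;                 int8_t d1 = pix1[x + stride] - pix2[x + stride];
+;                 sum += FFABS(d0 - d1);       /* psadbw after the 0x80 bias */
+;             }
+;             pix1 += stride;
+;             pix2 += stride;
+;         }
+;         return sum;
+;     }
+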
+INIT_MMX mmxext +VSAD_APPROX 8 +VSAD_APPROX 16 +INIT_XMM sse2 +VSAD_APPROX 16 diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c index f6c8e5b565..255df5065d 100644 --- a/libavcodec/x86/me_cmp_init.c +++ b/libavcodec/x86/me_cmp_init.c @@ -5,20 +5,20 @@ * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -29,382 +29,67 @@ #include "libavcodec/me_cmp.h" #include "libavcodec/mpegvideo.h" -#if HAVE_INLINE_ASM - -static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - ptrdiff_t stride, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %4, %%ecx \n" - "shr $1, %%ecx \n" - "pxor %%mm0, %%mm0 \n" /* mm0 = 0 */ - "pxor %%mm7, %%mm7 \n" /* mm7 holds the sum */ - "1: \n" - "movq (%0), %%mm1 \n" /* mm1 = pix1[0][0 - 7] */ - "movq (%1), %%mm2 \n" /* mm2 = pix2[0][0 - 7] */ - "movq (%0, %3), %%mm3 \n" /* mm3 = pix1[1][0 - 7] */ - "movq (%1, %3), %%mm4 \n" /* mm4 = pix2[1][0 - 7] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1, %%mm5 \n" - "movq %%mm3, %%mm6 \n" - "psubusb %%mm2, %%mm1 \n" - "psubusb %%mm4, %%mm3 \n" - "psubusb %%mm5, %%mm2 \n" - "psubusb %%mm6, %%mm4 \n" - - "por %%mm1, %%mm2 \n" - "por %%mm3, %%mm4 \n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2, %%mm1 \n" - "movq %%mm4, %%mm3 \n" - - "punpckhbw %%mm0, %%mm2 \n" - "punpckhbw %%mm0, %%mm4 \n" - "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */ - "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */ - - "pmaddwd %%mm2, %%mm2 \n" - "pmaddwd %%mm4, %%mm4 \n" - "pmaddwd %%mm1, %%mm1 \n" - "pmaddwd %%mm3, %%mm3 \n" - - "lea (%0, %3, 2), %0 \n" /* pix1 += 2 * stride */ - "lea (%1, %3, 2), %1 \n" /* pix2 += 2 * stride */ - - "paddd %%mm2, %%mm1 \n" - "paddd %%mm4, %%mm3 \n" - "paddd %%mm1, %%mm7 \n" - "paddd %%mm3, %%mm7 \n" - - "decl %%ecx \n" - "jnz 1b \n" - - "movq %%mm7, %%mm1 \n" - "psrlq $32, %%mm7 \n" /* shift hi dword to lo */ - "paddd %%mm7, %%mm1 \n" - "movd %%mm1, %2 \n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" (stride), "m" (h) - : "%ecx"); - - return tmp; -} - -static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - ptrdiff_t stride, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %4, %%ecx\n" - "pxor %%mm0, %%mm0\n" /* mm0 = 0 */ - "pxor %%mm7, %%mm7\n" /* mm7 holds the sum */ - "1:\n" - "movq (%0), %%mm1\n" /* mm1 = pix1[0 - 7] */ - "movq (%1), %%mm2\n" /* mm2 = pix2[0 - 7] */ - "movq 8(%0), %%mm3\n" /* mm3 = pix1[8 - 15] */ - 
"movq 8(%1), %%mm4\n" /* mm4 = pix2[8 - 15] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1, %%mm5\n" - "movq %%mm3, %%mm6\n" - "psubusb %%mm2, %%mm1\n" - "psubusb %%mm4, %%mm3\n" - "psubusb %%mm5, %%mm2\n" - "psubusb %%mm6, %%mm4\n" - - "por %%mm1, %%mm2\n" - "por %%mm3, %%mm4\n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2, %%mm1\n" - "movq %%mm4, %%mm3\n" - - "punpckhbw %%mm0, %%mm2\n" - "punpckhbw %%mm0, %%mm4\n" - "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */ - "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */ - - "pmaddwd %%mm2, %%mm2\n" - "pmaddwd %%mm4, %%mm4\n" - "pmaddwd %%mm1, %%mm1\n" - "pmaddwd %%mm3, %%mm3\n" - - "add %3, %0\n" - "add %3, %1\n" - - "paddd %%mm2, %%mm1\n" - "paddd %%mm4, %%mm3\n" - "paddd %%mm1, %%mm7\n" - "paddd %%mm3, %%mm7\n" - - "decl %%ecx\n" - "jnz 1b\n" - - "movq %%mm7, %%mm1\n" - "psrlq $32, %%mm7\n" /* shift hi dword to lo */ - "paddd %%mm7, %%mm1\n" - "movd %%mm1, %2\n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" (stride), "m" (h) - : "%ecx"); - - return tmp; -} - -static int hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %3, %%ecx\n" - "pxor %%mm7, %%mm7\n" - "pxor %%mm6, %%mm6\n" - - "movq (%0), %%mm0\n" - "movq %%mm0, %%mm1\n" - "psllq $8, %%mm0\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm0\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq %%mm4, %%mm1\n" - "psllq $8, %%mm4\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm4\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "1:\n" - - "movq (%0), %%mm0\n" - "movq %%mm0, %%mm1\n" - "psllq $8, %%mm0\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm0\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - "psubw %%mm0, %%mm4\n" - "psubw %%mm2, %%mm5\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm4, %%mm3\n\t" - "pcmpgtw %%mm5, %%mm1\n\t" - "pxor %%mm3, %%mm4\n" - "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" - "psubw %%mm1, %%mm5\n" - "paddw %%mm4, %%mm5\n" - "paddw %%mm5, %%mm6\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq %%mm4, %%mm1\n" - "psllq $8, %%mm4\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm4\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - 
"pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "subl $2, %%ecx\n" - " jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "punpcklwd %%mm7, %%mm0\n" - "punpckhwd %%mm7, %%mm6\n" - "paddd %%mm0, %%mm6\n" - - "movq %%mm6, %%mm0\n" - "psrlq $32, %%mm6\n" - "paddd %%mm6, %%mm0\n" - "movd %%mm0, %1\n" - : "+r" (pix1), "=r" (tmp) - : "r" (stride), "g" (h - 2) - : "%ecx"); - - return tmp; -} - -static int hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h) -{ - int tmp; - uint8_t *pix = pix1; - - __asm__ volatile ( - "movl %3, %%ecx\n" - "pxor %%mm7, %%mm7\n" - "pxor %%mm6, %%mm6\n" - - "movq (%0), %%mm0\n" - "movq 1(%0), %%mm1\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq 1(%0), %%mm1\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "1:\n" - - "movq (%0), %%mm0\n" - "movq 1(%0), %%mm1\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - "psubw %%mm0, %%mm4\n" - "psubw %%mm2, %%mm5\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm4, %%mm3\n\t" - "pcmpgtw %%mm5, %%mm1\n\t" - "pxor %%mm3, %%mm4\n" - "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" - "psubw %%mm1, %%mm5\n" - "paddw %%mm4, %%mm5\n" - "paddw %%mm5, %%mm6\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq 1(%0), %%mm1\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "subl $2, %%ecx\n" - " jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "punpcklwd %%mm7, %%mm0\n" - "punpckhwd %%mm7, %%mm6\n" - "paddd %%mm0, %%mm6\n" +int ff_sum_abs_dctelem_mmx(int16_t *block); +int ff_sum_abs_dctelem_mmxext(int16_t *block); +int ff_sum_abs_dctelem_sse2(int16_t *block); +int ff_sum_abs_dctelem_ssse3(int16_t *block); +int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h); +int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h); +int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int 
h); +int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); +int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + ptrdiff_t stride, int h); - "movq %%mm6, %%mm0\n" - "psrlq $32, %%mm6\n" - "paddd %%mm6, %%mm0\n" - "movd %%mm0, %1\n" - : "+r" (pix1), "=r" (tmp) - : "r" (stride), "g" (h - 2) - : "%ecx"); +#define hadamard_func(cpu) \ + int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ + uint8_t *src2, ptrdiff_t stride, int h); \ + int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \ + uint8_t *src2, ptrdiff_t stride, int h); - return tmp + hf_noise8_mmx(pix + 8, stride, h); -} +hadamard_func(mmx) +hadamard_func(mmxext) +hadamard_func(sse2) +hadamard_func(ssse3) +#if HAVE_YASM static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h) { @@ -413,9 +98,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, if (c) score1 = c->mecc.sse[0](c, pix1, pix2, stride, h); else - score1 = sse16_mmx(c, pix1, pix2, stride, h); - score2 = hf_noise16_mmx(pix1, stride, h) - - hf_noise16_mmx(pix2, stride, h); + score1 = ff_sse16_mmx(c, pix1, pix2, stride, h); + score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h) + - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h); if (c) return score1 + FFABS(score2) * c->avctx->nsse_weight; @@ -426,9 +111,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h) { - int score1 = sse8_mmx(c, pix1, pix2, stride, h); - int score2 = hf_noise8_mmx(pix1, stride, h) - - hf_noise8_mmx(pix2, stride, h); + int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h); + int score2 = ff_hf_noise8_mmx(pix1, stride, h) - + ff_hf_noise8_mmx(pix2, stride, h); if (c) return score1 + FFABS(score2) * c->avctx->nsse_weight; @@ -436,13 +121,17 @@ static int 
nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, return score1 + FFABS(score2) * 8; } +#endif /* HAVE_YASM */ + +#if HAVE_INLINE_ASM + static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, ptrdiff_t stride, int h) { int tmp; - assert((((int) pix) & 7) == 0); - assert((stride & 7) == 0); + av_assert2((((int) pix) & 7) == 0); + av_assert2((stride & 7) == 0); #define SUM(in0, in1, out0, out1) \ "movq (%0), %%mm2\n" \ @@ -500,57 +189,14 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, } #undef SUM -static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, - ptrdiff_t stride, int h) -{ - int tmp; - - assert((((int) pix) & 7) == 0); - assert((stride & 7) == 0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0), " #out0 "\n" \ - "movq 8(%0), " #out1 "\n" \ - "add %2, %0\n" \ - "psadbw " #out0 ", " #in0 "\n" \ - "psadbw " #out1 ", " #in1 "\n" \ - "paddw " #in1 ", " #in0 "\n" \ - "paddw " #in0 ", %%mm6\n" - - __asm__ volatile ( - "movl %3, %%ecx\n" - "pxor %%mm6, %%mm6\n" - "pxor %%mm7, %%mm7\n" - "movq (%0), %%mm0\n" - "movq 8(%0), %%mm1\n" - "add %2, %0\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movd %%mm6, %1\n" - : "+r" (pix), "=r" (tmp) - : "r" (stride), "m" (h) - : "%ecx"); - - return tmp; -} -#undef SUM - static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h) { int tmp; - assert((((int) pix1) & 7) == 0); - assert((((int) pix2) & 7) == 0); - assert((stride & 7) == 0); + av_assert2((((int) pix1) & 7) == 0); + av_assert2((((int) pix2) & 7) == 0); + av_assert2((stride & 7) == 0); #define SUM(in0, in1, out0, out1) \ "movq (%0), %%mm2\n" \ @@ -624,191 +270,16 @@ static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, } #undef SUM -static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - ptrdiff_t stride, int h) -{ - int tmp; - - assert((((int) pix1) & 7) == 0); - assert((((int) pix2) & 7) == 0); - assert((stride & 7) == 0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0), " #out0 "\n" \ - "movq (%1), %%mm2\n" \ - "movq 8(%0), " #out1 "\n" \ - "movq 8(%1), %%mm3\n" \ - "add %3, %0\n" \ - "add %3, %1\n" \ - "psubb %%mm2, " #out0 "\n" \ - "psubb %%mm3, " #out1 "\n" \ - "pxor %%mm7, " #out0 "\n" \ - "pxor %%mm7, " #out1 "\n" \ - "psadbw " #out0 ", " #in0 "\n" \ - "psadbw " #out1 ", " #in1 "\n" \ - "paddw " #in1 ", " #in0 "\n" \ - "paddw " #in0 ", %%mm6\n " - - __asm__ volatile ( - "movl %4, %%ecx\n" - "pxor %%mm6, %%mm6\n" - "pcmpeqw %%mm7, %%mm7\n" - "psllw $15, %%mm7\n" - "packsswb %%mm7, %%mm7\n" - "movq (%0), %%mm0\n" - "movq (%1), %%mm2\n" - "movq 8(%0), %%mm1\n" - "movq 8(%1), %%mm3\n" - "add %3, %0\n" - "add %3, %1\n" - "psubb %%mm2, %%mm0\n" - "psubb %%mm3, %%mm1\n" - "pxor %%mm7, %%mm0\n" - "pxor %%mm7, %%mm1\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movd %%mm6, %2\n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" (stride), "m" (h) - : "%ecx"); - - return tmp; -} -#undef SUM - -#define MMABS_MMX(a,z) \ - "pxor " #z ", " #z " \n\t" \ - "pcmpgtw " #a ", " #z " \n\t" \ - "pxor " #z ", " #a " \n\t" \ - "psubw " #z ", " #a " \n\t" - -#define MMABS_MMXEXT(a, z) \ - "pxor " #z ", " #z " \n\t" \ - "psubw " #a ", " #z " \n\t" \ - "pmaxsw " #z ", " #a " \n\t" - -#define MMABS_SSSE3(a,z) \ - "pabsw " #a ", " #a " \n\t" - -#define 
MMABS_SUM(a,z, sum) \ - MMABS(a,z) \ - "paddusw " #a ", " #sum " \n\t" - -/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get - * up to about 100k on extreme inputs. But that's very unlikely to occur in - * natural video, and it's even more unlikely to not have any alternative - * mvs/modes with lower cost. */ -#define HSUM_MMX(a, t, dst) \ - "movq " #a ", " #t " \n\t" \ - "psrlq $32, " #a " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "movq " #a ", " #t " \n\t" \ - "psrlq $16, " #a " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "movd " #a ", " #dst " \n\t" \ - -#define HSUM_MMXEXT(a, t, dst) \ - "pshufw $0x0E, " #a ", " #t " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "pshufw $0x01, " #a ", " #t " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "movd " #a ", " #dst " \n\t" \ - -#define HSUM_SSE2(a, t, dst) \ - "movhlps " #a ", " #t " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "pshuflw $0x0E, " #a ", " #t " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "pshuflw $0x01, " #a ", " #t " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "movd " #a ", " #dst " \n\t" \ - -#define DCT_SAD4(m, mm, o) \ - "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \ - "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \ - "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \ - "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \ - MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \ - MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \ - MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \ - MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \ - -#define DCT_SAD_MMX \ - "pxor %%mm0, %%mm0 \n\t" \ - "pxor %%mm1, %%mm1 \n\t" \ - DCT_SAD4(q, %%mm, 0) \ - DCT_SAD4(q, %%mm, 8) \ - DCT_SAD4(q, %%mm, 64) \ - DCT_SAD4(q, %%mm, 72) \ - "paddusw %%mm1, %%mm0 \n\t" \ - HSUM(%%mm0, %%mm1, %0) - -#define DCT_SAD_SSE2 \ - "pxor %%xmm0, %%xmm0 \n\t" \ - "pxor %%xmm1, %%xmm1 \n\t" \ - DCT_SAD4(dqa, %%xmm, 0) \ - DCT_SAD4(dqa, %%xmm, 64) \ - "paddusw %%xmm1, %%xmm0 \n\t" \ - HSUM(%%xmm0, %%xmm1, %0) - -#define DCT_SAD_FUNC(cpu) \ -static int sum_abs_dctelem_ ## cpu(int16_t *block) \ -{ \ - int sum; \ - __asm__ volatile ( \ - DCT_SAD \ - :"=r"(sum) \ - :"r"(block)); \ - return sum & 0xFFFF; \ -} - -#define DCT_SAD DCT_SAD_MMX -#define HSUM(a, t, dst) HSUM_MMX(a, t, dst) -#define MMABS(a, z) MMABS_MMX(a, z) -DCT_SAD_FUNC(mmx) -#undef MMABS -#undef HSUM - -#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst) -#define MMABS(a, z) MMABS_MMXEXT(a, z) -DCT_SAD_FUNC(mmxext) -#undef HSUM -#undef DCT_SAD - -#define DCT_SAD DCT_SAD_SSE2 -#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst) -DCT_SAD_FUNC(sse2) -#undef MMABS - -#if HAVE_SSSE3_INLINE -#define MMABS(a, z) MMABS_SSSE3(a, z) -DCT_SAD_FUNC(ssse3) -#undef MMABS -#endif -#undef HSUM -#undef DCT_SAD - - DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = { 0x0000000000000000ULL, 0x0001000100010001ULL, 0x0002000200020002ULL, }; -DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL; - static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, ptrdiff_t stride, int h) { - x86_reg len = -(stride * h); + x86_reg len = -stride * h; __asm__ volatile ( ".p2align 4 \n\t" "1: \n\t" @@ -841,133 +312,10 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)); } -static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2, - ptrdiff_t stride, int h) -{ - __asm__ volatile ( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub 
$2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" (stride)); -} - -static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1, - ptrdiff_t stride, int h) -{ - int ret; - __asm__ volatile ( - "pxor %%xmm2, %%xmm2 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1, %4), %%xmm1 \n\t" - "psadbw (%2), %%xmm0 \n\t" - "psadbw (%2, %4), %%xmm1 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "paddw %%xmm1, %%xmm2 \n\t" - "lea (%1,%4,2), %1 \n\t" - "lea (%2,%4,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - "movhlps %%xmm2, %%xmm0 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "movd %%xmm2, %3 \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret) - : "r" (stride)); - return ret; -} - -static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2, - ptrdiff_t stride, int h) -{ - __asm__ volatile ( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "pavgb 1(%1), %%mm0 \n\t" - "pavgb 1(%1, %3), %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" (stride)); -} - -static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2, - ptrdiff_t stride, int h) -{ - __asm__ volatile ( - "movq (%1), %%mm0 \n\t" - "add %3, %1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "pavgb %%mm1, %%mm0 \n\t" - "pavgb %%mm2, %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" (stride)); -} - -static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2, - ptrdiff_t stride, int h) -{ - __asm__ volatile ( - "movq "MANGLE(bone)", %%mm5 \n\t" - "movq (%1), %%mm0 \n\t" - "pavgb 1(%1), %%mm0 \n\t" - "add %3, %1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1,%3), %%mm2 \n\t" - "pavgb 1(%1), %%mm1 \n\t" - "pavgb 1(%1,%3), %%mm2 \n\t" - "psubusb %%mm5, %%mm1 \n\t" - "pavgb %%mm1, %%mm0 \n\t" - "pavgb %%mm2, %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2,%3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" (stride)); -} - static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, ptrdiff_t stride, int h) { - x86_reg len = -(stride * h); + x86_reg len = -stride * h; __asm__ volatile ( ".p2align 4 \n\t" "1: \n\t" @@ -1006,7 +354,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, ptrdiff_t stride, int h) { - x86_reg len = -(stride * h); + x86_reg len = -stride * h; __asm__ volatile ( "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq 1(%1, %%"REG_a"), %%mm2 \n\t" @@ -1030,7 +378,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, "punpckhbw %%mm7, %%mm5 \n\t" "paddw %%mm4, %%mm2 \n\t" "paddw %%mm5, %%mm3 \n\t" - "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" + "movq %5, %%mm5 \n\t" "paddw %%mm2, %%mm0 \n\t" "paddw %%mm3, %%mm1 \n\t" "paddw %%mm5, %%mm0 \n\t" @@ -1054,7 +402,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t 
*blk2, " js 1b \n\t" : "+a" (len) : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), - "r" (stride)); + "r" (stride), "m" (round_tab[2])); } static inline int sum_mmx(void) @@ -1072,15 +420,6 @@ static inline int sum_mmx(void) return ret & 0xFFFF; } -static inline int sum_mmxext(void) -{ - int ret; - __asm__ volatile ( - "movd %%mm6, %0 \n\t" - : "=r" (ret)); - return ret; -} - static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, ptrdiff_t stride, int h) { @@ -1097,7 +436,7 @@ static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \ uint8_t *blk1, ptrdiff_t stride, int h) \ { \ - assert(h == 8); \ + av_assert2(h == 8); \ __asm__ volatile ( \ "pxor %%mm7, %%mm7 \n\t" \ "pxor %%mm6, %%mm6 \n\t" \ @@ -1111,7 +450,7 @@ static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \ static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ uint8_t *blk1, ptrdiff_t stride, int h) \ { \ - assert(h == 8); \ + av_assert2(h == 8); \ __asm__ volatile ( \ "pxor %%mm7, %%mm7 \n\t" \ "pxor %%mm6, %%mm6 \n\t" \ @@ -1126,7 +465,7 @@ static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ uint8_t *blk1, ptrdiff_t stride, int h) \ { \ - assert(h == 8); \ + av_assert2(h == 8); \ __asm__ volatile ( \ "pxor %%mm7, %%mm7 \n\t" \ "pxor %%mm6, %%mm6 \n\t" \ @@ -1141,7 +480,7 @@ static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ uint8_t *blk1, ptrdiff_t stride, int h) \ { \ - assert(h == 8); \ + av_assert2(h == 8); \ __asm__ volatile ( \ "pxor %%mm7, %%mm7 \n\t" \ "pxor %%mm6, %%mm6 \n\t" \ @@ -1211,32 +550,15 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ } \ PIX_SAD(mmx) -PIX_SAD(mmxext) #endif /* HAVE_INLINE_ASM */ -int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - ptrdiff_t stride, int h); - -#define hadamard_func(cpu) \ - int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ - uint8_t *src2, ptrdiff_t stride, int h); \ - int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \ - uint8_t *src2, ptrdiff_t stride, int h); - -hadamard_func(mmx) -hadamard_func(mmxext) -hadamard_func(sse2) -hadamard_func(ssse3) - av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx) { int cpu_flags = av_get_cpu_flags(); #if HAVE_INLINE_ASM if (INLINE_MMX(cpu_flags)) { - c->sum_abs_dctelem = sum_abs_dctelem_mmx; - c->pix_abs[0][0] = sad16_mmx; c->pix_abs[0][1] = sad16_x2_mmx; c->pix_abs[0][2] = sad16_y2_mmx; @@ -1249,77 +571,81 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx) c->sad[0] = sad16_mmx; c->sad[1] = sad8_mmx; - c->sse[0] = sse16_mmx; - c->sse[1] = sse8_mmx; c->vsad[4] = vsad_intra16_mmx; - c->nsse[0] = nsse16_mmx; - c->nsse[1] = nsse8_mmx; - if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { c->vsad[0] = vsad16_mmx; } } - if (INLINE_MMXEXT(cpu_flags)) { - c->sum_abs_dctelem = sum_abs_dctelem_mmxext; - - c->vsad[4] = vsad_intra16_mmxext; - - c->pix_abs[0][0] = sad16_mmxext; - c->pix_abs[1][0] = sad8_mmxext; - - c->sad[0] = sad16_mmxext; - c->sad[1] = sad8_mmxext; - - if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { - c->pix_abs[0][1] = sad16_x2_mmxext; - c->pix_abs[0][2] = sad16_y2_mmxext; - c->pix_abs[0][3] = sad16_xy2_mmxext; - c->pix_abs[1][1] = sad8_x2_mmxext; - c->pix_abs[1][2] = sad8_y2_mmxext; - c->pix_abs[1][3] = sad8_xy2_mmxext; - - c->vsad[0] = vsad16_mmxext; - } - } - - if 
(INLINE_SSE2(cpu_flags)) { - c->sum_abs_dctelem = sum_abs_dctelem_sse2; - } - - if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) { - c->sad[0] = sad16_sse2; - } - -#if HAVE_SSSE3_INLINE - if (INLINE_SSSE3(cpu_flags)) { - c->sum_abs_dctelem = sum_abs_dctelem_ssse3; - } -#endif #endif /* HAVE_INLINE_ASM */ if (EXTERNAL_MMX(cpu_flags)) { c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx; c->hadamard8_diff[1] = ff_hadamard8_diff_mmx; + c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx; + c->sse[0] = ff_sse16_mmx; + c->sse[1] = ff_sse8_mmx; +#if HAVE_YASM + c->nsse[0] = nsse16_mmx; + c->nsse[1] = nsse8_mmx; +#endif } if (EXTERNAL_MMXEXT(cpu_flags)) { c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; + c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext; + + c->sad[0] = ff_sad16_mmxext; + c->sad[1] = ff_sad8_mmxext; + + c->pix_abs[0][0] = ff_sad16_mmxext; + c->pix_abs[0][1] = ff_sad16_x2_mmxext; + c->pix_abs[0][2] = ff_sad16_y2_mmxext; + c->pix_abs[1][0] = ff_sad8_mmxext; + c->pix_abs[1][1] = ff_sad8_x2_mmxext; + c->pix_abs[1][2] = ff_sad8_y2_mmxext; + + c->vsad[4] = ff_vsad_intra16_mmxext; + c->vsad[5] = ff_vsad_intra8_mmxext; + + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext; + c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext; + + c->vsad[0] = ff_vsad16_approx_mmxext; + c->vsad[1] = ff_vsad8_approx_mmxext; + } } if (EXTERNAL_SSE2(cpu_flags)) { c->sse[0] = ff_sse16_sse2; + c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2; #if HAVE_ALIGNED_STACK c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; #endif + if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { + c->sad[0] = ff_sad16_sse2; + c->pix_abs[0][0] = ff_sad16_sse2; + c->pix_abs[0][1] = ff_sad16_x2_sse2; + c->pix_abs[0][2] = ff_sad16_y2_sse2; + + c->vsad[4] = ff_vsad_intra16_sse2; + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2; + c->vsad[0] = ff_vsad16_approx_sse2; + } + } } - if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) { + if (EXTERNAL_SSSE3(cpu_flags)) { + c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3; +#if HAVE_ALIGNED_STACK c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; +#endif } } diff --git a/libavcodec/x86/mlpdsp.asm b/libavcodec/x86/mlpdsp.asm new file mode 100644 index 0000000000..ce656af145 --- /dev/null +++ b/libavcodec/x86/mlpdsp.asm @@ -0,0 +1,196 @@ +;****************************************************************************** +;* SIMD-optimized MLP DSP functions +;* Copyright (c) 2014 James Almer <jamrial@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_TEXT + +%if ARCH_X86_64 + +%macro SHLX 2 +%if cpuflag(bmi2) + shlx %1, %1, %2q +%else + shl %1, %2b +%endif +%endmacro + +%macro REMATRIX 0 + movdqa m0, [samplesq] + movdqa m1, [coeffsq ] + pshufd m2, m0, q2301 + pshufd m3, m1, q2301 + pmuldq m0, m1 + pmuldq m3, m2 + paddq m0, m3 +%if notcpuflag(avx2) + movdqa m1, [samplesq + 16] + movdqa m2, [coeffsq + 16] + pshufd m3, m1, q2301 + pshufd m4, m2, q2301 + pmuldq m1, m2 + pmuldq m4, m3 + paddq m0, m1 + paddq m0, m4 +%else + vextracti128 xm1, m0, 1 + paddq xm0, xm1 +%endif +%endmacro + +%macro LOOP_END 0 + pshufd xm1, xm0, q0032 + paddq xm0, xm1 + movq accumq, xm0 + movzx blsbsd, byte [blsbs_ptrq] ; load *bypassed_lsbs + sar accumq, 14 ; accum >>= 14 + and accumd, maskd ; accum &= mask + add accumd, blsbsd ; accum += *bypassed_lsbs + mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum + add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS; + add samplesq, 32 ; samples += MAX_CHANNELS; + cmp blsbs_ptrq, cntq +%endmacro + +%macro LOOP_SHIFT_END 0 + pshufd xm1, xm0, q0032 + paddq xm0, xm1 + movq accumq, xm0 + and indexd, auspd ; index &= access_unit_size_pow2; + movsx noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index] + add indexd, index2d ; index += index2 + SHLX noiseq, mns ; noise_buffer[index] <<= matrix_noise_shift + add accumq, noiseq ; accum += noise_buffer[index] + movzx noised, byte [blsbs_ptrq] ; load *bypassed_lsbs (reuse tmp noise register) + sar accumq, 14 ; accum >>= 14 + and accumd, maskd ; accum &= mask + add accumd, noised ; accum += *bypassed_lsbs + mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum + add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS; + add samplesq, 32 ; samples += MAX_CHANNELS; + cmp blsbs_ptrq, cntq +%endmacro + +;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs, +; const uint8_t *bypassed_lsbs, const int8_t *noise_buffer, +; int index, unsigned int dest_ch, uint16_t blockpos, +; unsigned int maxchan, int matrix_noise_shift, +; int access_unit_size_pow2, int32_t mask) +%macro MLP_REMATRIX_CHANNEL 0 +cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \ + index, dest_ch, blockpos, maxchan, mns, \ + accum, mask, cnt + mov mnsd, mnsm ; load matrix_noise_shift + movzx blockposq, word blockposm ; load and zero extend blockpos (16bit) + mov maxchand, maxchanm ; load maxchan + mov maskd, maskm ; load mask +%if WIN64 + mov dest_chd, dest_chm ; load dest_chd (not needed on UNIX64) +%endif + shl dest_chd, 2 + lea cntq, [blsbs_ptrq + blockposq*8] + test mnsd, mnsd ; is matrix_noise_shift != 0? + jne .shift ; jump if true + cmp maxchand, 4 ; is maxchan < 4? 
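+; Annotation (rough scalar sketch, not from the patch): per output sample,
+; each iteration of the loops below computes approximately
+;
+;     int64_t accum = 0;
+;     for (i = 0; i < nch; i++)                 /* nch = 4 or 8 lanes */
+;         accum += (int64_t)samples[i] * coeffs[i];
+;     samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
+;
+; with a shifted noise_buffer term added to accum before the >> 14 on the
+; .shift paths. The cmp above pairs with the jl below: streams with
+; maxchan < 4 take the narrower .loop4 path.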
+ jl .loop4 ; jump if true + +align 16 +.loop8: + ; Process 5 or more channels + REMATRIX + LOOP_END + jne .loop8 + RET + +align 16 +.loop4: + ; Process up to 4 channels + movdqa xm0, [samplesq] + movdqa xm1, [coeffsq ] + pshufd xm2, xm0, q2301 + pshufd xm3, xm1, q2301 + pmuldq xm0, xm1 + pmuldq xm3, xm2 + paddq xm0, xm3 + LOOP_END + jne .loop4 + RET + +.shift: +%if WIN64 + mov indexd, indexm ; load index (not needed on UNIX64) +%endif + mov r9d, r9m ; load access_unit_size_pow2 +%if cpuflag(bmi2) + ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place. + DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \ + index, dest_ch, accum, index2, mns, \ + ausp, mask, cnt, noise + add mnsd, 7 ; matrix_noise_shift += 7 +%else ; sse4 + mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift +%if WIN64 + ; r0 = rcx + DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \ + index2, accum, ausp, mask, cnt, noise +%else ; UNIX64 + ; r3 = rcx + DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \ + index2, accum, ausp, mask, cnt, noise +%endif + lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7 +%endif ; cpuflag + sub auspd, 1 ; access_unit_size_pow2 -= 1 + cmp r7d, 4 ; is maxchan < 4? + lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1; + jl .loop4_shift ; jump if maxchan < 4 + +align 16 +.loop8_shift: + ; Process 5 or more channels + REMATRIX + LOOP_SHIFT_END + jne .loop8_shift + RET + +align 16 +.loop4_shift: + ; Process up to 4 channels + movdqa xm0, [samplesq] + movdqa xm1, [coeffsq ] + pshufd xm2, xm0, q2301 + pshufd xm3, xm1, q2301 + pmuldq xm0, xm1 + pmuldq xm3, xm2 + paddq xm0, xm3 + LOOP_SHIFT_END + jne .loop4_shift + RET +%endmacro + +INIT_XMM sse4 +MLP_REMATRIX_CHANNEL +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2, bmi2 +MLP_REMATRIX_CHANNEL +%endif + +%endif ; ARCH_X86_64 diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp_init.c index 72fc637764..e9d9b1bf18 100644 --- a/libavcodec/x86/mlpdsp.c +++ b/libavcodec/x86/mlpdsp_init.c @@ -2,32 +2,47 @@ * MLP DSP functions x86-optimized * Copyright (c) 2009 Ramiro Polla * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" -#include "libavutil/internal.h" #include "libavutil/cpu.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/mlpdsp.h" #include "libavcodec/mlp.h" -#if HAVE_7REGS && HAVE_INLINE_ASM +#define REMATRIX_CHANNEL_FUNC(opt) \ +void ff_mlp_rematrix_channel_##opt(int32_t *samples, \ + const int32_t *coeffs, \ + const uint8_t *bypassed_lsbs, \ + const int8_t *noise_buffer, \ + int index, \ + unsigned int dest_ch, \ + uint16_t blockpos, \ + unsigned int maxchan, \ + int matrix_noise_shift, \ + int access_unit_size_pow2, \ + int32_t mask); + +REMATRIX_CHANNEL_FUNC(sse4) +REMATRIX_CHANNEL_FUNC(avx2_bmi2) + +#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS extern char ff_mlp_firorder_8; extern char ff_mlp_firorder_7; @@ -45,12 +60,12 @@ extern char ff_mlp_iirorder_2; extern char ff_mlp_iirorder_1; extern char ff_mlp_iirorder_0; -static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1, +static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1, &ff_mlp_firorder_2, &ff_mlp_firorder_3, &ff_mlp_firorder_4, &ff_mlp_firorder_5, &ff_mlp_firorder_6, &ff_mlp_firorder_7, &ff_mlp_firorder_8 }; -static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1, +static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1, &ff_mlp_iirorder_2, &ff_mlp_iirorder_3, &ff_mlp_iirorder_4 }; @@ -133,8 +148,8 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff, FIRMUL (ff_mlp_firorder_6, 0x14 ) FIRMUL (ff_mlp_firorder_5, 0x10 ) FIRMUL (ff_mlp_firorder_4, 0x0c ) - FIRMULREG(ff_mlp_firorder_3, 0x08,10) - FIRMULREG(ff_mlp_firorder_2, 0x04, 9) + FIRMUL (ff_mlp_firorder_3, 0x08 ) + FIRMUL (ff_mlp_firorder_2, 0x04 ) FIRMULREG(ff_mlp_firorder_1, 0x00, 8) LABEL_MANGLE(ff_mlp_firorder_0)":\n\t" "jmp *%6 \n\t" @@ -163,8 +178,6 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff, : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump), /* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift) , /* 8*/"r"((int64_t)coeff[0]) - , /* 9*/"r"((int64_t)coeff[1]) - , /*10*/"r"((int64_t)coeff[2]) : "rax", "rdx", "rsi" #else /* ARCH_X86_32 */ /* 3*/"+m"(blocksize) @@ -179,9 +192,13 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff, av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c) { -#if HAVE_7REGS && HAVE_INLINE_ASM int cpu_flags = av_get_cpu_flags(); +#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS if (INLINE_MMX(cpu_flags)) c->mlp_filter_channel = mlp_filter_channel_x86; #endif + if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags)) + c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4; + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2) + c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2; } diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c index 533b4a7c3f..27231674ae 100644 --- a/libavcodec/x86/mpegaudiodsp.c +++ b/libavcodec/x86/mpegaudiodsp.c @@ -2,20 +2,20 @@ * SIMD-optimized MP3 decoding functions * Copyright (c) 2010 Vitor Sessak * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -26,11 +26,18 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/mpegaudiodsp.h" -void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win); +#define DECL(CPU)\ +static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\ +void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win); + +#if ARCH_X86_32 +DECL(sse) +#endif +DECL(sse2) +DECL(sse3) +DECL(ssse3) +DECL(avx) + void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, float *tmpbuf); void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, @@ -38,7 +45,7 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; -#if HAVE_SSE2_INLINE +#if HAVE_6REGS && HAVE_SSE_INLINE #define MACS(rt, ra, rb) rt+=(ra)*(rb) #define MLSS(rt, ra, rb) rt-=(ra)*(rb) @@ -182,7 +189,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out, *out = sum; } -#endif /* HAVE_SSE2_INLINE */ +#endif /* HAVE_6REGS && HAVE_SSE_INLINE */ #if HAVE_YASM #define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ @@ -217,11 +224,17 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ } \ } +#if HAVE_SSE +#if ARCH_X86_32 DECL_IMDCT_BLOCKS(sse,sse) +#endif DECL_IMDCT_BLOCKS(sse2,sse) DECL_IMDCT_BLOCKS(sse3,sse) DECL_IMDCT_BLOCKS(ssse3,sse) +#endif +#if HAVE_AVX_EXTERNAL DECL_IMDCT_BLOCKS(avx,avx) +#endif #endif /* HAVE_YASM */ av_cold void ff_mpadsp_init_x86(MPADSPContext *s) @@ -242,16 +255,19 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s) } } -#if HAVE_SSE2_INLINE - if (INLINE_SSE2(cpu_flags)) { +#if HAVE_6REGS && HAVE_SSE_INLINE + if (INLINE_SSE(cpu_flags)) { s->apply_window_float = apply_window_mp3; } -#endif /* HAVE_SSE2_INLINE */ +#endif /* HAVE_SSE_INLINE */ #if HAVE_YASM +#if HAVE_SSE +#if ARCH_X86_32 if (EXTERNAL_SSE(cpu_flags)) { s->imdct36_blocks_float = imdct36_blocks_sse; } +#endif if (EXTERNAL_SSE2(cpu_flags)) { s->imdct36_blocks_float = imdct36_blocks_sse2; } @@ -261,8 +277,11 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s) if (EXTERNAL_SSSE3(cpu_flags)) { s->imdct36_blocks_float = imdct36_blocks_ssse3; } +#endif +#if HAVE_AVX_EXTERNAL if (EXTERNAL_AVX(cpu_flags)) { s->imdct36_blocks_float = imdct36_blocks_avx; } 
+#endif #endif /* HAVE_YASM */ } diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c index d9fb4d24fd..133ae80a27 100644 --- a/libavcodec/x86/mpegvideo.c +++ b/libavcodec/x86/mpegvideo.c @@ -2,20 +2,20 @@ * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -26,7 +26,7 @@ #include "libavcodec/avcodec.h" #include "libavcodec/mpegvideo.h" -#if HAVE_INLINE_ASM +#if HAVE_MMX_INLINE static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, int16_t *block, int n, int qscale) @@ -35,7 +35,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, qmul = qscale << 1; - assert(s->block_last_index[n]>=0 || s->h263_aic); + av_assert2(s->block_last_index[n]>=0 || s->h263_aic); if (!s->h263_aic) { if (n < 4) @@ -111,7 +111,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, qmul = qscale << 1; qadd = (qscale - 1) | 1; - assert(s->block_last_index[n]>=0 || s->h263_aic); + av_assert2(s->block_last_index[n]>=0 || s->h263_aic); nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; @@ -171,7 +171,7 @@ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, const uint16_t *quant_matrix; int block0; - assert(s->block_last_index[n]>=0); + av_assert2(s->block_last_index[n]>=0); nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; @@ -239,7 +239,7 @@ static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, x86_reg nCoeffs; const uint16_t *quant_matrix; - assert(s->block_last_index[n]>=0); + av_assert2(s->block_last_index[n]>=0); nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; @@ -306,7 +306,7 @@ static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, const uint16_t *quant_matrix; int block0; - assert(s->block_last_index[n]>=0); + av_assert2(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; @@ -371,7 +371,7 @@ static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, x86_reg nCoeffs; const uint16_t *quant_matrix; - assert(s->block_last_index[n]>=0); + av_assert2(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; @@ -442,11 +442,11 @@ __asm__ volatile( ); } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_MMX_INLINE */ av_cold void ff_mpv_common_init_x86(MpegEncContext *s) { -#if HAVE_INLINE_ASM +#if HAVE_MMX_INLINE int cpu_flags = av_get_cpu_flags(); if (INLINE_MMX(cpu_flags)) { 
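
The mpegvideo.c hunks above only swap assert() for av_assert2() and retag the guards from HAVE_INLINE_ASM to HAVE_MMX_INLINE; the dequantization math is untouched. For reference, the H.263 inter path computes the standard qmul/qadd reconstruction set up in the function prologue. A minimal scalar sketch (hypothetical name, not the FFmpeg entry point):

#include <stdint.h>

/* block[i] = 2*qscale*level +/- ((qscale - 1) | 1) for each nonzero level
 * up to and including the last coded coefficient. */
static void dequant_h263_inter_sketch(int16_t *block, int ncoeffs, int qscale)
{
    const int qmul = qscale << 1;
    const int qadd = (qscale - 1) | 1;   /* odd offset, as in the asm */
    for (int i = 0; i <= ncoeffs; i++) {
        int level = block[i];
        if (level < 0)
            block[i] = level * qmul - qadd;
        else if (level > 0)
            block[i] = level * qmul + qadd;
    }
}

The MMX version performs the same computation several coefficients at a time with qmul and qadd broadcast into registers.
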
@@ -458,5 +458,5 @@ av_cold void ff_mpv_common_init_x86(MpegEncContext *s) s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_MMX_INLINE */ } diff --git a/libavcodec/x86/mpegvideodsp.c b/libavcodec/x86/mpegvideodsp.c index 0e5dd0f153..941a8e2e4c 100644 --- a/libavcodec/x86/mpegvideodsp.c +++ b/libavcodec/x86/mpegvideodsp.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -22,6 +22,7 @@ #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/mpegvideodsp.h" +#include "libavcodec/videodsp.h" #if HAVE_INLINE_ASM @@ -43,20 +44,24 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys }; const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys }; const uint64_t shift2 = 2 * shift; +#define MAX_STRIDE 4096U +#define MAX_H 8U + uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE]; int x, y; const int dxw = (dxx - (1 << (16 + shift))) * (w - 1); const int dyh = (dyy - (1 << (16 + shift))) * (h - 1); const int dxh = dxy * (h - 1); const int dyw = dyx * (w - 1); + int need_emu = (unsigned) ix >= width - w || + (unsigned) iy >= height - h; if ( // non-constant fullpel offset (3% of blocks) ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) | (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) || // uses more than 16 bits of subpel mv (only at huge resolution) (dxx | dxy | dyx | dyy) & 15 || - (unsigned) ix >= width - w || - (unsigned) iy >= height - h) { + (need_emu && (h > MAX_H || stride > MAX_STRIDE))) { // FIXME could still use mmx for some of the rows ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); @@ -64,6 +69,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, } src += ix + iy * stride; + if (need_emu) { + ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height); + src = edge_buf; + } __asm__ volatile ( "movd %0, %%mm6 \n\t" @@ -150,4 +159,3 @@ av_cold void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c) c->gmc = gmc_mmx; #endif /* HAVE_INLINE_ASM */ } - diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c index 47349d17ec..b410511c6a 100644 --- a/libavcodec/x86/mpegvideoenc.c +++ b/libavcodec/x86/mpegvideoenc.c @@ -2,20 +2,20 @@ * The simplest mpeg encoder (well, it was the simplest!) * Copyright (c) 2000,2001 Fabrice Bellard * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -30,6 +30,8 @@ /* not permutated inverse zigzag_direct + 1 for MMX quantizer */ DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64]; +#if HAVE_6REGS + #if HAVE_MMX_INLINE #define COMPILE_TEMPLATE_MMXEXT 0 #define COMPILE_TEMPLATE_SSE2 0 @@ -81,6 +83,8 @@ DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64]; #include "mpegvideoenc_template.c" #endif /* HAVE_SSSE3_INLINE */ +#endif /* HAVE_6REGS */ + #if HAVE_INLINE_ASM static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){ const int intra= s->mb_intra; @@ -193,7 +197,7 @@ static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){ } #endif /* HAVE_INLINE_ASM */ -av_cold void ff_mpv_encode_init_x86(MpegEncContext *s) +av_cold void ff_dct_encode_init_x86(MpegEncContext *s) { const int dct_algo = s->avctx->dct_algo; int i; @@ -205,21 +209,25 @@ av_cold void ff_mpv_encode_init_x86(MpegEncContext *s) #if HAVE_MMX_INLINE int cpu_flags = av_get_cpu_flags(); if (INLINE_MMX(cpu_flags)) { +#if HAVE_6REGS s->dct_quantize = dct_quantize_mmx; +#endif s->denoise_dct = denoise_dct_mmx; } #endif -#if HAVE_MMXEXT_INLINE +#if HAVE_6REGS && HAVE_MMXEXT_INLINE if (INLINE_MMXEXT(cpu_flags)) s->dct_quantize = dct_quantize_mmxext; #endif #if HAVE_SSE2_INLINE if (INLINE_SSE2(cpu_flags)) { +#if HAVE_6REGS s->dct_quantize = dct_quantize_sse2; +#endif s->denoise_dct = denoise_dct_sse2; } #endif -#if HAVE_SSSE3_INLINE +#if HAVE_6REGS && HAVE_SSSE3_INLINE if (INLINE_SSSE3(cpu_flags)) s->dct_quantize = dct_quantize_ssse3; #endif diff --git a/libavcodec/x86/mpegvideoenc_qns_template.c b/libavcodec/x86/mpegvideoenc_qns_template.c index 8d8d68762a..882d486205 100644 --- a/libavcodec/x86/mpegvideoenc_qns_template.c +++ b/libavcodec/x86/mpegvideoenc_qns_template.c @@ -5,26 +5,26 @@ * MMX optimization by Michael Niedermayer <michaelni@gmx.at> * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include <assert.h> #include <stdint.h> +#include "libavutil/avassert.h" #include "libavutil/common.h" #include "libavutil/x86/asm.h" @@ -36,7 +36,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[ { x86_reg i=0; - assert(FFABS(scale) < MAX_ABS); + av_assert2(FFABS(scale) < MAX_ABS); scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; SET_RND(mm6); diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c index 1274c13e05..1899ba23c6 100644 --- a/libavcodec/x86/mpegvideoenc_template.c +++ b/libavcodec/x86/mpegvideoenc_template.c @@ -3,20 +3,20 @@ * * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -107,7 +107,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, const uint16_t *qmat, *bias; LOCAL_ALIGNED_16(int16_t, temp_block, [64]); - assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? + av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? //s->fdct (block); RENAME_FDCT(ff_fdct)(block); // cannot be anything else ... 
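
Before the table-selection hunk that follows, it helps to spell out what this template's SIMD inner loop computes. The 16-bit qmat/bias tables are constructed so the bias can be added to |coef| before the high-16-bit multiply (paddusw then pmulhw), and last_non_zero is tracked with a packed max over the inverse zigzag table. A scalar model with illustrative names, ignoring saturation:

#include <stdint.h>

static int dct_quantize_sketch(int16_t *out, const int16_t *in,
                               const uint16_t *qmat, const uint16_t *bias,
                               const uint16_t *inv_zigzag)
{
    int last_non_zero_p1 = 0;
    for (int i = 0; i < 64; i++) {
        int coef   = in[i];
        unsigned a = coef < 0 ? -coef : coef;
        unsigned level = (unsigned)(((uint64_t)(a + bias[i]) * qmat[i]) >> 16);
        out[i] = coef < 0 ? -(int)level : (int)level;   /* sign restored */
        if (level && inv_zigzag[i] > last_non_zero_p1)  /* pmaxsw over the table */
            last_non_zero_p1 = inv_zigzag[i];
    }
    return last_non_zero_p1;
}

The next hunk selects those tables per block; splitting q_intra_matrix16 from the new q_chroma_intra_matrix16 is what lets luma and chroma use different intra matrices.
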
@@ -117,10 +117,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s, if (s->mb_intra) { int dummy; - if (n < 4) + if (n < 4){ q = s->y_dc_scale; - else + bias = s->q_intra_matrix16[qscale][1]; + qmat = s->q_intra_matrix16[qscale][0]; + }else{ q = s->c_dc_scale; + bias = s->q_chroma_intra_matrix16[qscale][1]; + qmat = s->q_chroma_intra_matrix16[qscale][0]; + } /* note: block[0] is assumed to be positive */ if (!s->h263_aic) { __asm__ volatile ( @@ -135,8 +140,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s, block[0]=0; //avoid fake overflow // temp_block[0] = (block[0] + (q >> 1)) / q; last_non_zero_p1 = 1; - bias = s->q_intra_matrix16[qscale][1]; - qmat = s->q_intra_matrix16[qscale][0]; } else { last_non_zero_p1 = 0; bias = s->q_inter_matrix16[qscale][1]; @@ -172,7 +175,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, " js 1b \n\t" PMAX(MM"3", MM"0") "movd "MM"3, %%"REG_a" \n\t" - "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 + "movzbl %%al, %%eax \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) : "r" (block+64), "r" (qmat), "r" (bias), "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64) @@ -206,7 +209,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, " js 1b \n\t" PMAX(MM"3", MM"0") "movd "MM"3, %%"REG_a" \n\t" - "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 + "movzbl %%al, %%eax \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) : "r" (block+64), "r" (qmat+64), "r" (bias+64), "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64) @@ -220,7 +223,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "psubusw "MM"1, "MM"4 \n\t" "packuswb "MM"4, "MM"4 \n\t" #if COMPILE_TEMPLATE_SSE2 - "packuswb "MM"4, "MM"4 \n\t" + "packsswb "MM"4, "MM"4 \n\t" #endif "movd "MM"4, %0 \n\t" // *overflow : "=g" (*overflow) @@ -274,6 +277,50 @@ static int RENAME(dct_quantize)(MpegEncContext *s, block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; + }else if(s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2){ + if(last_non_zero_p1 <= 1) goto end; + block[0x04] = temp_block[0x01]; + block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; + if(last_non_zero_p1 <= 4) goto end; + block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02]; + block[0x05] = temp_block[0x03]; + if(last_non_zero_p1 <= 7) goto end; + block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11]; + block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; + if(last_non_zero_p1 <= 11) goto end; + block[0x1C] = temp_block[0x19]; + block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B]; + block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05]; + if(last_non_zero_p1 <= 16) goto end; + block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13]; + block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21]; + block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; + block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22]; + if(last_non_zero_p1 <= 24) goto end; + block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14]; + block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06]; + block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E]; + block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C]; + if(last_non_zero_p1 <= 32) goto end; + block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A]; + block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38]; + block[0x3C] = temp_block[0x39]; 
block[0x31] = temp_block[0x32]; + block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24]; + if(last_non_zero_p1 <= 40) goto end; + block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16]; + block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; + block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25]; + block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33]; + if(last_non_zero_p1 <= 48) goto end; + block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B]; + block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D]; + block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; + block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E]; + if(last_non_zero_p1 <= 56) goto end; + block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C]; + block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36]; + block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; + block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; }else{ if(last_non_zero_p1 <= 1) goto end; block[0x01] = temp_block[0x01]; diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm index 9326ee776d..aec73f82dc 100644 --- a/libavcodec/x86/mpegvideoencdsp.asm +++ b/libavcodec/x86/mpegvideoencdsp.asm @@ -4,92 +4,151 @@ ;* Copyright (c) 2000, 2001 Fabrice Bellard ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;***************************************************************************** %include "libavutil/x86/x86util.asm" -SECTION .text +SECTION_RODATA -INIT_MMX mmx +cextern pw_1 + +SECTION .text ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) -cglobal pix_sum16, 2, 3 +; %1 = number of loops +; %2 = number of GPRs used +%macro PIX_SUM16 3 +cglobal pix_sum16, 2, %2, 6 movsxdifnidn r1, r1d - mov r2, r1 - neg r2 - shl r2, 4 - sub r0, r2 - pxor m7, m7 - pxor m6, m6 + mov r2, %1 +%if mmsize == 16 + lea r3, [r1*3] +%endif +%if notcpuflag(xop) + pxor m5, m5 +%endif + pxor m4, m4 .loop: - mova m0, [r0+r2+0] - mova m1, [r0+r2+0] - mova m2, [r0+r2+8] - mova m3, [r0+r2+8] - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 +%if cpuflag(xop) + vphaddubq m0, [r0] + vphaddubq m1, [r0+r1] + vphaddubq m2, [r0+r1*2] + vphaddubq m3, [r0+r3] +%else + mova m0, [r0] +%if mmsize == 8 + mova m1, [r0+8] +%if cpuflag(mmxext) + mova m2, [r0+r1] + mova m3, [r0+r1+8] +%endif +%else ; sse2 + mova m1, [r0+r1] + mova m2, [r0+r1*2] + mova m3, [r0+r3] +%endif +%if cpuflag(mmxext) + psadbw m0, m5 + psadbw m1, m5 + psadbw m2, m5 + psadbw m3, m5 +%else ; mmx + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif ; cpuflag(mmxext) +%endif ; cpuflag(xop) paddw m1, m0 paddw m3, m2 paddw m3, m1 - paddw m6, m3 - add r2, r1 - js .loop - mova m5, m6 - psrlq m6, 32 - paddw m6, m5 - mova m5, m6 - psrlq m6, 16 - paddw m6, m5 - movd eax, m6 - and eax, 0xffff + paddw m4, m3 +%if cpuflag(mmxext) + lea r0, [r0+r1*%3] +%else + add r0, r1 +%endif + dec r2 + jne .loop +%if mmsize == 16 + pshufd m0, m4, q0032 + paddd m4, m0 +%elif notcpuflag(mmxext) + HADDW m4, m5 +%endif + movd eax, m4 RET +%endmacro +%if ARCH_X86_32 INIT_MMX mmx +PIX_SUM16 16, 3, 0 +INIT_MMX mmxext +PIX_SUM16 8, 4, 2 +%endif +INIT_XMM sse2 +PIX_SUM16 4, 4, 4 +%if HAVE_XOP_EXTERNAL +INIT_XMM xop +PIX_SUM16 4, 4, 4 +%endif + ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) -cglobal pix_norm1, 2, 4 +; %1 = number of xmm registers used +; %2 = number of loops +%macro PIX_NORM1 2 +cglobal pix_norm1, 2, 3, %1 movsxdifnidn r1, r1d - mov r2, 16 + mov r2, %2 pxor m0, m0 - pxor m7, m7 + pxor m5, m5 .loop: mova m2, [r0+0] +%if mmsize == 8 mova m3, [r0+8] - mova m1, m2 - punpckhbw m1, m0 +%else + mova m3, [r0+r1] +%endif + punpckhbw m1, m2, m0 punpcklbw m2, m0 - mova m4, m3 - punpckhbw m3, m0 - punpcklbw m4, m0 + punpckhbw m4, m3, m0 + punpcklbw m3, m0 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m2, m1 paddd m4, m3 - paddd m7, m2 + paddd m5, m2 + paddd m5, m4 +%if mmsize == 8 add r0, r1 - paddd m7, m4 +%else + lea r0, [r0+r1*2] +%endif dec r2 jne .loop - mova m1, m7 - psrlq m7, 32 - paddd m1, m7 - movd eax, m1 + HADDD m5, m1 + movd eax, m5 RET +%endmacro + +INIT_MMX mmx +PIX_NORM1 0, 16 +INIT_XMM sse2 +PIX_NORM1 6, 8 diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c index 7732e7307f..2a4db61511 100644 --- a/libavcodec/x86/mpegvideoencdsp_init.c +++ b/libavcodec/x86/mpegvideoencdsp_init.c @@ -1,29 +1,34 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. 
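
The rewritten PIX_SUM16 and PIX_NORM1 macros above add mmxext, sse2 and xop paths, but the quantities themselves are simple 16x16 block statistics. Scalar equivalents for reference (sketches, not the exported ff_ symbols):

#include <stdint.h>

/* Sum of all pixels of a 16x16 block (pix_sum16). */
static int pix_sum16_sketch(const uint8_t *pix, int line_size)
{
    int sum = 0;
    for (int y = 0; y < 16; y++, pix += line_size)
        for (int x = 0; x < 16; x++)
            sum += pix[x];
    return sum;
}

/* Sum of squared pixels of a 16x16 block (pix_norm1). */
static int pix_norm1_sketch(const uint8_t *pix, int line_size)
{
    int sum = 0;
    for (int y = 0; y < 16; y++, pix += line_size)
        for (int x = 0; x < 16; x++)
            sum += pix[x] * pix[x];
    return sum;
}

The mmxext path gets each row's byte sum almost for free from psadbw against a zero register, which is why it can drop the unpack/add sequence of the plain mmx version.
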
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" +#include "libavutil/avassert.h" #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" #include "libavcodec/avcodec.h" #include "libavcodec/mpegvideoencdsp.h" int ff_pix_sum16_mmx(uint8_t *pix, int line_size); +int ff_pix_sum16_mmxext(uint8_t *pix, int line_size); +int ff_pix_sum16_sse2(uint8_t *pix, int line_size); +int ff_pix_sum16_xop(uint8_t *pix, int line_size); int ff_pix_norm1_mmx(uint8_t *pix, int line_size); +int ff_pix_norm1_sse2(uint8_t *pix, int line_size); #if HAVE_INLINE_ASM @@ -123,7 +128,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, : "+r" (ptr) : "r" ((x86_reg) wrap), "r" ((x86_reg) width), "r" (ptr + wrap * height)); - } else { + } else if (w == 16) { __asm__ volatile ( "1: \n\t" "movd (%0), %%mm0 \n\t" @@ -141,6 +146,25 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, "add %1, %0 \n\t" "cmp %3, %0 \n\t" "jb 1b \n\t" + : "+r"(ptr) + : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) + ); + } else { + av_assert1(w == 4); + __asm__ volatile ( + "1: \n\t" + "movd (%0), %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklwd %%mm0, %%mm0 \n\t" + "movd %%mm0, -4(%0) \n\t" + "movd -4(%0, %2), %%mm1 \n\t" + "punpcklbw %%mm1, %%mm1 \n\t" + "punpckhwd %%mm1, %%mm1 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, (%0, %2) \n\t" + "add %1, %0 \n\t" + "cmp %3, %0 \n\t" + "jb 1b \n\t" : "+r" (ptr) : "r" ((x86_reg) wrap), "r" ((x86_reg) width), "r" (ptr + wrap * height)); @@ -195,11 +219,26 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, { int cpu_flags = av_get_cpu_flags(); +#if ARCH_X86_32 if (EXTERNAL_MMX(cpu_flags)) { c->pix_sum = ff_pix_sum16_mmx; c->pix_norm1 = ff_pix_norm1_mmx; } + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->pix_sum = ff_pix_sum16_mmxext; + } +#endif + + if (EXTERNAL_SSE2(cpu_flags)) { + c->pix_sum = ff_pix_sum16_sse2; + c->pix_norm1 = ff_pix_norm1_sse2; + } + + if (EXTERNAL_XOP(cpu_flags)) { + c->pix_sum = ff_pix_sum16_xop; + } + #if HAVE_INLINE_ASM if (INLINE_MMX(cpu_flags)) { diff --git a/libavcodec/x86/pixblockdsp.asm b/libavcodec/x86/pixblockdsp.asm index c8fd1b24a1..7c5377b2bb 100644 --- a/libavcodec/x86/pixblockdsp.asm +++ b/libavcodec/x86/pixblockdsp.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2000, 2001 Fabrice Bellard ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;***************************************************************************** @@ -26,9 +26,8 @@ SECTION .text INIT_MMX mmx -; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size) +; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size) cglobal get_pixels, 3,4 - movsxdifnidn r2, r2d add r0, 128 mov r3, -128 pxor m7, m7 @@ -51,8 +50,7 @@ cglobal get_pixels, 3,4 REP_RET INIT_XMM sse2 -cglobal get_pixels, 3, 4 - movsxdifnidn r2, r2d +cglobal get_pixels, 3, 4, 5 lea r3, [r2*3] pxor m4, m4 movh m0, [r1] @@ -108,3 +106,28 @@ cglobal diff_pixels, 4,5 add r4, 16 jne .loop REP_RET + +INIT_XMM sse2 +cglobal diff_pixels, 4, 5, 5 + movsxdifnidn r3, r3d + pxor m4, m4 + add r0, 128 + mov r4, -128 +.loop: + movh m0, [r1] + movh m2, [r2] + movh m1, [r1+r3] + movh m3, [r2+r3] + punpcklbw m0, m4 + punpcklbw m1, m4 + punpcklbw m2, m4 + punpcklbw m3, m4 + psubw m0, m2 + psubw m1, m3 + mova [r0+r4+0 ], m0 + mova [r0+r4+16], m1 + lea r1, [r1+r3*2] + lea r2, [r2+r3*2] + add r4, 32 + jne .loop + RET diff --git a/libavcodec/x86/pixblockdsp_init.c b/libavcodec/x86/pixblockdsp_init.c index 9582e0b5c2..4d06a44c6d 100644 --- a/libavcodec/x86/pixblockdsp_init.c +++ b/libavcodec/x86/pixblockdsp_init.c @@ -1,20 +1,20 @@ /* * SIMD-optimized pixel operations * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
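
Alongside the get_pixels prototype change to ptrdiff_t, the pixblockdsp diff above adds an SSE2 diff_pixels that handles two rows per iteration. Its output is just the widened per-pixel difference of two 8x8 blocks; a scalar sketch:

#include <stdint.h>

/* block[i] = s1[i] - s2[i], widened to 16 bits, over an 8x8 block. */
static void diff_pixels_sketch(int16_t *block, const uint8_t *s1,
                               const uint8_t *s2, int stride)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            block[x] = s1[x] - s2[x];
        block += 8;
        s1 += stride;
        s2 += stride;
    }
}
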
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -23,10 +23,12 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/pixblockdsp.h" -void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size); -void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size); +void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size); +void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size); void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride); +void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2, + int stride); av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c, AVCodecContext *avctx, @@ -43,5 +45,6 @@ av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c, if (EXTERNAL_SSE2(cpu_flags)) { if (!high_bit_depth) c->get_pixels = ff_get_pixels_sse2; + c->diff_pixels = ff_diff_pixels_sse2; } } diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm index c05f3da017..7bd1ab5c07 100644 --- a/libavcodec/x86/pngdsp.asm +++ b/libavcodec/x86/pngdsp.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i and waq, ~(mmsize*2-1) jmp .end_v .loop_v: - mova m0, [src1q+iq] - mova m1, [src1q+iq+mmsize] - paddb m0, [src2q+iq] - paddb m1, [src2q+iq+mmsize] - mova [dstq+iq ], m0 - mova [dstq+iq+mmsize], m1 + movu m0, [src2q+iq] + movu m1, [src2q+iq+mmsize] + paddb m0, [src1q+iq] + paddb m1, [src1q+iq+mmsize] + movu [dstq+iq ], m0 + movu [dstq+iq+mmsize], m1 add iq, mmsize*2 .end_v: cmp iq, waq @@ -157,7 +157,7 @@ cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr movh [dstq], m3 add dstq, bppq cmp dstq, endq - jle .loop + jl .loop mov dstq, [rsp] dec cntrq diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/x86/pngdsp_init.c index 34a3da36d7..7dca62c675 100644 --- a/libavcodec/x86/pngdsp_init.c +++ b/libavcodec/x86/pngdsp_init.c @@ -2,20 +2,20 @@ * x86 PNG optimizations. * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm index a0e97b3951..632ece6ebf 100644 --- a/libavcodec/x86/proresdsp.asm +++ b/libavcodec/x86/proresdsp.asm @@ -1,23 +1,24 @@ ;****************************************************************************** ;* x86-SIMD-optimized IDCT for prores -;* this is identical to "simple" IDCT except for the clip range +;* this is identical to "simple" IDCT written by Michael Niedermayer +;* except for the clip range ;* ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -47,10 +48,10 @@ w1_plus_w5: times 4 dw W1sh2, +W5sh2 w5_min_w1: times 4 dw W5sh2, -W1sh2 w5_plus_w7: times 4 dw W5sh2, +W7sh2 w7_min_w5: times 4 dw W7sh2, -W5sh2 -row_round: times 8 dw (1<<14) +pw_88: times 8 dw 0x2008 +cextern pw_1 cextern pw_4 -cextern pw_8 cextern pw_512 cextern pw_1019 @@ -91,14 +92,12 @@ section .text align=16 ; a2 -= W6 * row[2]; ; a3 -= W2 * row[2]; %ifidn %1, col - paddw m10,[pw_8] + paddw m10,[pw_88] %endif - SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] %ifidn %1, row - psubw m10,[row_round] + paddw m10,[pw_1] %endif - SIGNEXTEND m8, m9, m14 ; { row[2] }[0-3] / [4-7] - SIGNEXTEND m10, m11, m14 ; { row[0] }[0-3] / [4-7] + SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] pmaddwd m2, m0, [w4_plus_w6] pmaddwd m3, m1, [w4_plus_w6] pmaddwd m4, m0, [w4_min_w6] @@ -107,75 +106,33 @@ section .text align=16 pmaddwd m7, m1, [w4_min_w2] pmaddwd m0, [w4_plus_w2] pmaddwd m1, [w4_plus_w2] - pslld m2, 2 - pslld m3, 2 - pslld m4, 2 - pslld m5, 2 - pslld m6, 2 - pslld m7, 2 - pslld m0, 2 - pslld m1, 2 ; a0: -1*row[0]-1*row[2] ; a1: -1*row[0] ; a2: -1*row[0] ; a3: -1*row[0]+1*row[2] - psubd m2, m10 ; a1[0-3] - psubd m3, m11 ; a1[4-7] - psubd m4, m10 ; a2[0-3] - psubd m5, m11 ; a2[4-7] - psubd m0, m10 - psubd m1, m11 - psubd m6, m10 - psubd m7, m11 - psubd m0, m8 ; a0[0-3] - psubd m1, m9 ; a0[4-7] - paddd m6, m8 ; a3[0-3] - paddd m7, m9 ; a3[4-7] ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4] ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6] ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6] ; a3 += W4*row[4] - W6*row[6]; i.e. 
-1*row[4] SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7] - SIGNEXTEND m13, m14, m10 ; { row[4] }[0-3] / [4-7] pmaddwd m10, m8, [w4_plus_w6] pmaddwd m11, m9, [w4_plus_w6] - pslld m10, 2 - pslld m11, 2 - psubd m10, m13 - psubd m11, m14 paddd m0, m10 ; a0[0-3] paddd m1, m11 ; a0[4-7] pmaddwd m10, m8, [w4_min_w6] pmaddwd m11, m9, [w4_min_w6] - pslld m10, 2 - pslld m11, 2 - psubd m10, m13 - psubd m11, m14 paddd m6, m10 ; a3[0-3] paddd m7, m11 ; a3[4-7] pmaddwd m10, m8, [w4_min_w2] pmaddwd m11, m9, [w4_min_w2] pmaddwd m8, [w4_plus_w2] pmaddwd m9, [w4_plus_w2] - pslld m10, 2 - pslld m11, 2 - pslld m8, 2 - pslld m9, 2 - psubd m10, m13 - psubd m11, m14 - psubd m8, m13 - psubd m9, m14 psubd m4, m10 ; a2[0-3] intermediate psubd m5, m11 ; a2[4-7] intermediate psubd m2, m8 ; a1[0-3] intermediate psubd m3, m9 ; a1[4-7] intermediate - SIGNEXTEND m12, m13, m10 ; { row[6] }[0-3] / [4-7] - psubd m4, m12 ; a2[0-3] - psubd m5, m13 ; a2[4-7] - paddd m2, m12 ; a1[0-3] - paddd m3, m13 ; a1[4-7] ; load/store mova [r2+ 0], m0 @@ -206,8 +163,6 @@ section .text align=16 ; b3 = MUL(W7, row[1]); ; MAC(b3, -W5, row[3]); SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7] - SIGNEXTEND m10, m11, m12 ; { row[1] }[0-3] / [4-7] - SIGNEXTEND m8, m9, m12 ; { row[3] }[0-3] / [4-7] pmaddwd m2, m0, [w3_min_w7] pmaddwd m3, m1, [w3_min_w7] pmaddwd m4, m0, [w5_min_w1] @@ -216,35 +171,11 @@ section .text align=16 pmaddwd m7, m1, [w7_min_w5] pmaddwd m0, [w1_plus_w3] pmaddwd m1, [w1_plus_w3] - pslld m2, 2 - pslld m3, 2 - pslld m4, 2 - pslld m5, 2 - pslld m6, 2 - pslld m7, 2 - pslld m0, 2 - pslld m1, 2 ; b0: +1*row[1]+2*row[3] ; b1: +2*row[1]-1*row[3] ; b2: -1*row[1]-1*row[3] ; b3: +1*row[1]+1*row[3] - psubd m2, m8 - psubd m3, m9 - paddd m0, m8 - paddd m1, m9 - paddd m8, m10 ; { row[1] + row[3] }[0-3] - paddd m9, m11 ; { row[1] + row[3] }[4-7] - paddd m10, m10 - paddd m11, m11 - paddd m0, m8 ; b0[0-3] - paddd m1, m9 ; b0[4-7] - paddd m2, m10 ; b1[0-3] - paddd m3, m11 ; b2[4-7] - psubd m4, m8 ; b2[0-3] - psubd m5, m9 ; b2[4-7] - paddd m6, m8 ; b3[0-3] - paddd m7, m9 ; b3[4-7] ; MAC(b0, W5, row[5]); ; MAC(b0, W7, row[7]); @@ -255,38 +186,16 @@ section .text align=16 ; MAC(b3, W3, row[5]); ; MAC(b3, -W1, row[7]); SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7] - SIGNEXTEND m13, m12, m11 ; { row[5] }[0-3] / [4-7] - SIGNEXTEND m14, m11, m10 ; { row[7] }[0-3] / [4-7] ; b0: -1*row[5]+1*row[7] ; b1: -1*row[5]+1*row[7] ; b2: +1*row[5]+2*row[7] ; b3: +2*row[5]-1*row[7] - paddd m4, m13 - paddd m5, m12 - paddd m6, m13 - paddd m7, m12 - psubd m13, m14 ; { row[5] - row[7] }[0-3] - psubd m12, m11 ; { row[5] - row[7] }[4-7] - paddd m14, m14 - paddd m11, m11 - psubd m0, m13 - psubd m1, m12 - psubd m2, m13 - psubd m3, m12 - paddd m4, m14 - paddd m5, m11 - paddd m6, m13 - paddd m7, m12 pmaddwd m10, m8, [w1_plus_w5] pmaddwd m11, m9, [w1_plus_w5] pmaddwd m12, m8, [w5_plus_w7] pmaddwd m13, m9, [w5_plus_w7] - pslld m10, 2 - pslld m11, 2 - pslld m12, 2 - pslld m13, 2 psubd m2, m10 ; b1[0-3] psubd m3, m11 ; b1[4-7] paddd m0, m12 ; b0[0-3] @@ -295,10 +204,6 @@ section .text align=16 pmaddwd m13, m9, [w7_plus_w3] pmaddwd m8, [w3_min_w1] pmaddwd m9, [w3_min_w1] - pslld m12, 2 - pslld m13, 2 - pslld m8, 2 - pslld m9, 2 paddd m4, m12 ; b2[0-3] paddd m5, m13 ; b2[4-7] paddd m6, m8 ; b3[0-3] @@ -345,7 +250,7 @@ cglobal prores_idct_put_10, 4, 4, %1 pmullw m13,[r3+64] pmullw m12,[r3+96] - IDCT_1D row, 17 + IDCT_1D row, 15 ; transpose for second part of IDCT TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 @@ -360,20 +265,11 @@ cglobal 
prores_idct_put_10, 4, 4, %1 ; for (i = 0; i < 8; i++) ; idctSparseColAdd(dest + i, line_size, block + i); - IDCT_1D col, 20 + IDCT_1D col, 18 ; clip/store - mova m6, [pw_512] mova m3, [pw_4] mova m5, [pw_1019] - paddw m8, m6 - paddw m0, m6 - paddw m1, m6 - paddw m2, m6 - paddw m4, m6 - paddw m11, m6 - paddw m9, m6 - paddw m10, m6 pmaxsw m8, m3 pmaxsw m0, m3 pmaxsw m1, m3 @@ -404,25 +300,11 @@ cglobal prores_idct_put_10, 4, 4, %1 RET %endmacro -%macro SIGNEXTEND 2-3 -%if cpuflag(sse4) ; dstlow, dsthigh - movhlps %2, %1 - pmovsxwd %1, %1 - pmovsxwd %2, %2 -%elif cpuflag(sse2) ; dstlow, dsthigh, tmp - pxor %3, %3 - pcmpgtw %3, %1 - mova %2, %1 - punpcklwd %1, %3 - punpckhwd %2, %3 -%endif -%endmacro - INIT_XMM sse2 idct_put_fn 16 -INIT_XMM sse4 -idct_put_fn 16 +%if HAVE_AVX_EXTERNAL INIT_XMM avx idct_put_fn 16 +%endif %endif diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c index e82dac0448..ead11ae9c1 100644 --- a/libavcodec/x86/proresdsp_init.c +++ b/libavcodec/x86/proresdsp_init.c @@ -3,20 +3,20 @@ * * Copyright (c) 2010-2011 Maxim Poliakovski * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -27,12 +27,10 @@ void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize, int16_t *block, const int16_t *qmat); -void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize, - int16_t *block, const int16_t *qmat); void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize, int16_t *block, const int16_t *qmat); -av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp) +av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx) { #if ARCH_X86_64 int cpu_flags = av_get_cpu_flags(); @@ -42,11 +40,6 @@ av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp) dsp->idct_put = ff_prores_idct_put_10_sse2; } - if (EXTERNAL_SSE4(cpu_flags)) { - dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE; - dsp->idct_put = ff_prores_idct_put_10_sse4; - } - if (EXTERNAL_AVX(cpu_flags)) { dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE; dsp->idct_put = ff_prores_idct_put_10_avx; diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm index 27a1c63b8a..4e72d5084f 100644 --- a/libavcodec/x86/qpel.asm +++ b/libavcodec/x86/qpel.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2003-2013 Michael Niedermayer ;* Copyright (c) 2013 Daniel Kang ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm index 8f65550e60..dc0f900c5b 100644 --- a/libavcodec/x86/qpeldsp.asm +++ b/libavcodec/x86/qpeldsp.asm @@ -1,22 +1,23 @@ ;****************************************************************************** -;* quarterpel DSP functions -;* +;* mpeg4 qpel +;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> ;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2013 Daniel Kang ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c index cdefe50a3c..3268d907ab 100644 --- a/libavcodec/x86/qpeldsp_init.c +++ b/libavcodec/x86/qpeldsp_init.c @@ -1,20 +1,22 @@ /* * quarterpel DSP functions + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -77,13 +79,13 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride); -#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext -#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext +#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx +#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx #if HAVE_YASM -CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8) -CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8) +#define ff_put_pixels16_mmxext ff_put_pixels16_mmx +#define ff_put_pixels8_mmxext ff_put_pixels8_mmx #define QPEL_OP(OPNAME, RND, MMX) \ static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, \ diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c index a9fb13234b..c9fd71eeef 100644 --- a/libavcodec/x86/rnd_template.c +++ b/libavcodec/x86/rnd_template.c @@ -7,20 +7,20 @@ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> * and improved by Zdenek Kabelac <kabi@users.sf.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm index 4d9c35b600..7732d65b2a 100644 --- a/libavcodec/x86/rv34dsp.asm +++ b/libavcodec/x86/rv34dsp.asm @@ -2,20 +2,20 @@ ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c index 586e4e9a6d..99c56f9d09 100644 --- a/libavcodec/x86/rv34dsp_init.c +++ b/libavcodec/x86/rv34dsp_init.c @@ -2,20 +2,20 @@ * RV30/40 MMX/SSE2 optimizations * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm index 0a242b54e3..fdd81a0a37 100644 --- a/libavcodec/x86/rv40dsp.asm +++ b/libavcodec/x86/rv40dsp.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com> ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c index e006c76584..bbf9c785f6 100644 --- a/libavcodec/x86/rv40dsp_init.c +++ b/libavcodec/x86/rv40dsp_init.c @@ -2,20 +2,20 @@ * RV40 decoder motion compensation functions x86-optimised * Copyright (c) 2008 Konstantin Shishkov * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -32,6 +32,13 @@ #include "libavutil/x86/cpu.h" #include "hpeldsp.h" +#define DEFINE_FN(op, size, insn) \ +static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \ + ptrdiff_t stride) \ +{ \ + ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \ +} + #if HAVE_YASM void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src, int stride, int h, int x, int y); @@ -75,7 +82,7 @@ static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \ { \ int i; \ if (PH && PV) { \ - DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \ + LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]); \ uint8_t *tmpptr = tmp + SIZE * 2; \ src -= stride * 2; \ \ @@ -127,8 +134,8 @@ QPEL_FUNCS_DECL(OP, 3, 2, OPT) /** @} */ #define LOOPSIZE 8 -#define HCOFF(x) (32 * (x - 1)) -#define VCOFF(x) (32 * (x - 1)) +#define HCOFF(x) (32 * ((x) - 1)) +#define VCOFF(x) (32 * ((x) - 1)) QPEL_MC_DECL(put_, _ssse3) QPEL_MC_DECL(avg_, _ssse3) @@ -136,8 +143,8 @@ QPEL_MC_DECL(avg_, _ssse3) #undef HCOFF #undef VCOFF #define LOOPSIZE 8 -#define HCOFF(x) (64 * (x - 1)) -#define VCOFF(x) (64 * (x - 1)) +#define HCOFF(x) (64 * ((x) - 1)) +#define VCOFF(x) (64 * ((x) - 1)) QPEL_MC_DECL(put_, _sse2) QPEL_MC_DECL(avg_, _sse2) @@ -146,8 +153,8 @@ QPEL_MC_DECL(avg_, _sse2) #undef HCOFF #undef VCOFF #define LOOPSIZE 4 -#define HCOFF(x) (64 * (x - 1)) -#define VCOFF(x) (64 * (x - 1)) +#define HCOFF(x) (64 * ((x) - 1)) +#define VCOFF(x) (64 * ((x) - 1)) QPEL_MC_DECL(put_, _mmx) @@ -186,30 +193,24 @@ QPEL_FUNCS_SET (OP, 3, 1, OPT) \ QPEL_FUNCS_SET (OP, 3, 2, OPT) /** @} */ +DEFINE_FN(put, 8, ssse3) + +DEFINE_FN(put, 16, sse2) +DEFINE_FN(put, 16, ssse3) + +DEFINE_FN(avg, 8, mmxext) +DEFINE_FN(avg, 8, ssse3) + +DEFINE_FN(avg, 16, sse2) +DEFINE_FN(avg, 16, ssse3) #endif /* HAVE_YASM */ #if HAVE_MMX_INLINE -static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_put_pixels8_xy2_mmx(dst, src, stride, 8); -} -static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_put_pixels16_xy2_mmx(dst, src, stride, 16); -} -static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_avg_pixels8_xy2_mmx(dst, src, stride, 8); -} -static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride) -{ - ff_avg_pixels16_xy2_mmx(dst, src, stride, 16); -} -#endif /* HAVE_MMX_INLINE */ +DEFINE_FN(put, 8, mmx) +DEFINE_FN(avg, 8, mmx) +DEFINE_FN(put, 16, mmx) +DEFINE_FN(avg, 16, mmx) +#endif av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c) { @@ -240,6 +241,7 @@ av_cold void 
ff_rv40dsp_init_x86(RV34DSPContext *c) #endif } if (EXTERNAL_MMXEXT(cpu_flags)) { + c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmxext; c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext; c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext; c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext; @@ -251,6 +253,8 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c) #endif } if (EXTERNAL_SSE2(cpu_flags)) { + c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_sse2; + c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_sse2; c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2; c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2; c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2; @@ -259,6 +263,10 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c) QPEL_MC_SET(avg_, _sse2) } if (EXTERNAL_SSSE3(cpu_flags)) { + c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_ssse3; + c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_ssse3; + c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_ssse3; + c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_ssse3; c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3; c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3; c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3; diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm index d7164b6496..083461a107 100644 --- a/libavcodec/x86/sbrdsp.asm +++ b/libavcodec/x86/sbrdsp.asm @@ -2,20 +2,20 @@ ;* AAC Spectral Band Replication decoding functions ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
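
The HCOFF()/VCOFF() change in rv40dsp_init.c above is routine macro hygiene: the argument now gets its own parentheses, so arguments containing low-precedence operators expand correctly. A small illustration (hypothetical use, not from this patch):

    /* old: (32 * (x - 1))    new: (32 * ((x) - 1)) */
    #define HCOFF_OLD(x) (32 * (x - 1))
    #define HCOFF_NEW(x) (32 * ((x) - 1))
    /* HCOFF_OLD(i & 3) expands to (32 * (i & 3 - 1)) == (32 * (i & 2)) -- wrong,
     * because '-' binds tighter than '&'.
     * HCOFF_NEW(i & 3) expands to (32 * ((i & 3) - 1)) -- the intended value. */

The same hunk also folds the four hand-written mc33 wrappers into the new DEFINE_FN macro and swaps DECLARE_ALIGNED for LOCAL_ALIGNED on the qpel temp buffer, which preserves the 16-byte guarantee even on ABIs that cannot promise an aligned stack.
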
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -25,7 +25,14 @@ SECTION_RODATA ; mask equivalent for multiply by -1.0 1.0 ps_mask times 2 dd 1<<31, 0 ps_mask2 times 2 dd 0, 1<<31 -ps_neg times 4 dd 1<<31 +ps_mask3 dd 0, 0, 0, 1<<31 +ps_noise0 times 2 dd 1.0, 0.0, +ps_noise2 times 2 dd -1.0, 0.0 +ps_noise13 dd 0.0, 1.0, 0.0, -1.0 + dd 0.0, -1.0, 0.0, 1.0 + dd 0.0, 1.0, 0.0, -1.0 +cextern sbr_noise_table +cextern ps_neg SECTION_TEXT @@ -136,7 +143,6 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1) mova m3, m1 mova m4, m2 - mova m7, [ps_mask] ; Set pointers %if ARCH_X86_64 == 0 || WIN64 @@ -156,30 +162,28 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E shl start, 3 ; offset from num loops mova m0, [X_lowq + start] - movlhps m1, m1 ; (a2 a3 a2 a3) - movlhps m2, m2 ; (a0 a1 a0 a1) - shufps m3, m3, q0101 ; (a3 a2 a3 a2) - shufps m4, m4, q0101 ; (a1 a0 a1 a0) - xorps m3, m7 ; (-a3 a2 -a3 a2) - xorps m4, m7 ; (-a1 a0 -a1 a0) + shufps m3, m3, q1111 + shufps m4, m4, q1111 + xorps m3, [ps_mask] + shufps m1, m1, q0000 + shufps m2, m2, q0000 + xorps m4, [ps_mask] .loop2: - mova m5, m0 + movu m7, [X_lowq + start + 8] ; BbCc mova m6, m0 - shufps m0, m0, q2200 ; {Xl[-2][0],",Xl[-1][0],"} - shufps m5, m5, q3311 ; {Xl[-2][1],",Xl[-1][1],"} - mulps m0, m2 - mulps m5, m4 - mova m7, m6 - addps m5, m0 - mova m0, [X_lowq + start + 2*2*4] - shufps m6, m0, q0022 ; {Xl[-1][0],",Xl[0][0],"} - shufps m7, m0, q1133 ; {Xl[-1][1],",Xl[1][1],"} - mulps m6, m1 + mova m5, m7 + shufps m0, m0, q2301 ; aAbB + shufps m7, m7, q2301 ; bBcC + mulps m0, m4 mulps m7, m3 - addps m5, m6 + mulps m6, m2 + mulps m5, m1 + addps m7, m0 + mova m0, [X_lowq + start +16] ; CcDd addps m7, m0 - addps m5, m7 - mova [X_highq + start], m5 + addps m6, m5 + addps m7, m6 + mova [X_highq + start], m7 add start, 16 jnz .loop2 RET @@ -246,33 +250,47 @@ cglobal sbr_neg_odd_64, 1,2,4,z jne .loop REP_RET -INIT_XMM sse2 ; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1) +%macro SBR_QMF_DEINT_BFLY 0 cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c mov cq, 64*4-2*mmsize lea vrevq, [vq + 64*4] .loop: mova m0, [src0q+cq] mova m1, [src1q] - mova m2, [src0q+cq+mmsize] - mova m3, [src1q+mmsize] - pshufd m4, m0, q0123 - pshufd m5, m1, q0123 - pshufd m6, m2, q0123 - pshufd m7, m3, q0123 - addps m3, m4 + mova m4, [src0q+cq+mmsize] + mova m5, [src1q+mmsize] +%if cpuflag(sse2) + pshufd m2, m0, q0123 + pshufd m3, m1, q0123 + pshufd m6, m4, q0123 + pshufd m7, m5, q0123 +%else + shufps m2, m0, m0, q0123 + shufps m3, m1, m1, q0123 + shufps m6, m4, m4, q0123 + shufps m7, m5, m5, q0123 +%endif + addps m5, m2 subps m0, m7 addps m1, m6 - subps m2, m5 + subps m4, m3 mova [vrevq], m1 - mova [vrevq+mmsize], m3 + mova [vrevq+mmsize], m5 mova [vq+cq], m0 - mova [vq+cq+mmsize], m2 + mova [vq+cq+mmsize], m4 add src1q, 2*mmsize add vrevq, 2*mmsize sub cq, 2*mmsize jge .loop REP_RET +%endmacro + +INIT_XMM sse +SBR_QMF_DEINT_BFLY + +INIT_XMM sse2 +SBR_QMF_DEINT_BFLY INIT_XMM sse2 cglobal sbr_qmf_pre_shuffle, 1,4,6,z @@ -303,3 +321,243 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z movq m2, [zq] movq [r2q], m2 REP_RET + +%ifdef PIC +%define NREGS 1 +%if UNIX64 
+%define NOISE_TABLE r6q ; r5q is m_max +%else +%define NOISE_TABLE r5q +%endif +%else +%define NREGS 0 +%define NOISE_TABLE sbr_noise_table +%endif + +%macro LOAD_NST 1 +%ifdef PIC + lea NOISE_TABLE, [%1] + mova m0, [kxq + NOISE_TABLE] +%else + mova m0, [kxq + %1] +%endif +%endmacro + +INIT_XMM sse2 +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + mova m0, [ps_noise0] + jmp apply_noise_main + +; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + and kxq, 1 + shl kxq, 4 + LOAD_NST ps_noise13 + jmp apply_noise_main + +; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + mova m0, [ps_noise2] + jmp apply_noise_main + +; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + and kxq, 1 + shl kxq, 4 + LOAD_NST ps_noise13+16 + +apply_noise_main: +%if ARCH_X86_64 == 0 || WIN64 + mov kxd, m_maxm +%define count kxq +%else +%define count m_maxq +%endif + dec noiseq + shl count, 2 +%ifdef PIC + lea NOISE_TABLE, [sbr_noise_table] +%endif + lea Yq, [Yq + 2*count] + add s_mq, count + add q_filtq, count + shl noiseq, 3 + pxor m5, m5 + neg count +.loop: + mova m1, [q_filtq + count] + movu m3, [noiseq + NOISE_TABLE + 1*mmsize] + movu m4, [noiseq + NOISE_TABLE + 2*mmsize] + add noiseq, 2*mmsize + and noiseq, 0x1ff<<3 + punpckhdq m2, m1, m1 + punpckldq m1, m1 + mulps m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise] + mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise] + mova m3, [s_mq + count] + ; TODO: replace by a vpermd in AVX2 + punpckhdq m4, m3, m3 + punpckldq m3, m3 + pcmpeqd m6, m3, m5 ; m6 == 0 + pcmpeqd m7, m4, m5 ; m7 == 0 + mulps m3, m0 ; s_m[m] * phi_sign + mulps m4, m0 ; s_m[m] * phi_sign + pand m1, m6 + pand m2, m7 + movu m6, [Yq + 2*count] + movu m7, [Yq + 2*count + mmsize] + addps m3, m1 + addps m4, m2 + addps m6, m3 + addps m7, m4 + movu [Yq + 2*count], m6 + movu [Yq + 2*count + mmsize], m7 + add count, mmsize + jl .loop + RET + +INIT_XMM sse +cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c +%define COUNT 32*4 +%define OFFSET 32*4 + mov cq, -COUNT + lea vrevq, [vq + OFFSET + COUNT] + add vq, OFFSET-mmsize + add srcq, 2*COUNT + mova m3, [ps_neg] +.loop: + mova m0, [srcq + 2*cq + 0*mmsize] + mova m1, [srcq + 2*cq + 1*mmsize] + shufps m2, m0, m1, q2020 + shufps m1, m0, q1313 + xorps m2, m3 + mova [vq], m1 + mova [vrevq + cq], m2 + sub vq, mmsize + add cq, mmsize + jl .loop + REP_RET + +%macro SBR_AUTOCORRELATE 0 +cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt + mov cntq, 37*8 + add xq, cntq + neg cntq + +%if cpuflag(sse3) +%define MOVH movsd + movddup m5, [xq+cntq] +%else +%define MOVH movlps + movlps m5, [xq+cntq] + movlhps m5, m5 +%endif + MOVH m7, [xq+cntq+8 ] + MOVH m1, [xq+cntq+16] + shufps m7, m7, q0110 + shufps m1, m1, q0110 + mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0] + mulps m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1]; + mulps m5, m1 ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0] + movaps [rsp ], m3 + movaps 
[rsp+16], m4 + add cntq, 8 + + MOVH m2, [xq+cntq+16] + movlhps m7, m7 + shufps m2, m2, q0110 + mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0] + mulps m4, m7, m2 + mulps m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1]; + addps m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0] + +align 16 +.loop: + add cntq, 8 + MOVH m0, [xq+cntq+16] + movlhps m1, m1 + shufps m0, m0, q0110 + mulps m3, m1, m2 + mulps m4, m1, m0 + mulps m1, m1 + addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]; + addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]; + addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]; + add cntq, 8 + MOVH m1, [xq+cntq+16] + movlhps m2, m2 + shufps m1, m1, q0110 + mulps m3, m2, m0 + mulps m4, m2, m1 + mulps m2, m2 + addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]; + addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]; + addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]; + add cntq, 8 + MOVH m2, [xq+cntq+16] + movlhps m0, m0 + shufps m2, m2, q0110 + mulps m3, m0, m1 + mulps m4, m0, m2 + mulps m0, m0 + addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]; + addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]; + addps m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]; + jl .loop + + movlhps m1, m1 + mulps m4, m1, m2 + mulps m1, m1 + addps m4, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0]; + addps m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1]; + addps m6, [rsp ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0]; + addps m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1]; + + xorps m4, [ps_mask3] + xorps m5, [ps_mask3] + xorps m6, [ps_mask3] +%if cpuflag(sse3) + movshdup m2, m1 + haddps m4, m5 + haddps m7, m6 + addss m1, m2 +%else + movaps m3, m4 + movaps m2, m5 + movaps m0, m6 + shufps m3, m3, q0301 + shufps m2, m2, q0301 + shufps m0, m0, q0301 + addps m4, m3 + addps m5, m2 + addps m6, m0 + + movss m2, m7 + movss m3, m1 + shufps m7, m7, q0001 + shufps m1, m1, q0001 + addss m7, m2 + addss m1, m3 + shufps m4, m5, q2020 + shufps m7, m6, q2020 +%endif + movaps [phiq ], m4 + movhps [phiq+0x18], m7 + movss [phiq+0x28], m7 + movss [phiq+0x10], m1 + RET +%endmacro + +INIT_XMM sse +SBR_AUTOCORRELATE +INIT_XMM sse3 +SBR_AUTOCORRELATE diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c index 9600852163..6911a1a515 100644 --- a/libavcodec/x86/sbrdsp_init.c +++ b/libavcodec/x86/sbrdsp_init.c @@ -2,20 +2,20 @@ * AAC Spectral Band Replication decoding functions * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. 
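
The ff_sbr_autocorrelate kernels added above compute lag-0/1/2 complex autocorrelations over the 40-sample window, as their inline comments spell out. A rough scalar restatement of the sums (a sketch only: the exact summation split for the first/last terms and the final phi[] packing follow the SBR layout and are omitted here):

    static void sbr_autocorrelate_sums(const float x[40][2], float sums[5])
    {
        float r0 = 0, r1 = 0, i1 = 0, r2 = 0, i2 = 0;
        for (int i = 0; i < 38; i++) {
            r0 += x[i][0] * x[i][0]   + x[i][1] * x[i][1];   /* lag 0 */
            r1 += x[i][0] * x[i+1][0] + x[i][1] * x[i+1][1]; /* lag 1, real */
            i1 += x[i][0] * x[i+1][1] - x[i][1] * x[i+1][0]; /* lag 1, imag (sign per ps_mask3) */
            r2 += x[i][0] * x[i+2][0] + x[i][1] * x[i+2][1]; /* lag 2, real */
            i2 += x[i][0] * x[i+2][1] - x[i][1] * x[i+2][0]; /* lag 2, imag */
        }
        sums[0] = r0; sums[1] = r1; sums[2] = i1; sums[3] = r2; sums[4] = i2;
    }

The SSE body keeps real and imaginary partial products interleaved in one register (hence the q0110 shuffles and the ps_mask3 sign flips at the end); the SSE3 variant only swaps movlps/movlhps for movddup and the final shuffle-adds for haddps/movshdup.
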
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -34,9 +34,28 @@ void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2], float bw, int start, int end); void ff_sbr_neg_odd_64_sse(float *z); void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z); +void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1); void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1); void ff_sbr_qmf_pre_shuffle_sse2(float *z); +void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); + +void ff_sbr_qmf_deint_neg_sse(float *v, const float *src); + +void ff_sbr_autocorrelate_sse (const float x[40][2], float phi[3][2][2]); +void ff_sbr_autocorrelate_sse3(const float x[40][2], float phi[3][2][2]); + av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -48,10 +67,21 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s) s->hf_g_filt = ff_sbr_hf_g_filt_sse; s->hf_gen = ff_sbr_hf_gen_sse; s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse; + s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse; + s->qmf_deint_neg = ff_sbr_qmf_deint_neg_sse; + s->autocorrelate = ff_sbr_autocorrelate_sse; } if (EXTERNAL_SSE2(cpu_flags)) { s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2; s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2; + s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2; + s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2; + s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2; + s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2; + } + + if (EXTERNAL_SSE3(cpu_flags)) { + s->autocorrelate = ff_sbr_autocorrelate_sse3; } } diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c index 71763dbf75..1d46212a13 100644 --- a/libavcodec/x86/simple_idct.c +++ b/libavcodec/x86/simple_idct.c @@ -3,24 +3,23 @@ * * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
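
The four hf_apply_noise kernels wired up in sbrdsp_init.c above differ only in the phase-sign pattern applied to s_m, which is why they share the apply_noise_main body and differ just in the ps_noise0/ps_noise13/ps_noise2 constant loaded on entry. In scalar terms the operation is roughly (a sketch, not the exact C template; ff_sbr_noise_table is the 512-entry table cextern'd in the asm):

    static void sbr_hf_apply_noise_sketch(float (*Y)[2], const float *s_m,
                                          const float *q_filt, int noise,
                                          float phi0, float phi1, int m_max)
    {
        for (int m = 0; m < m_max; m++) {
            noise = (noise + 1) & 0x1ff;        /* walk the 512-entry table */
            if (s_m[m]) {                       /* sinusoid present: add it */
                Y[m][0] += s_m[m] * phi0;       /* variants 1/3 alternate the sign
                                                 * per m; folded into the vector
                                                 * constants in the asm */
                Y[m][1] += s_m[m] * phi1;
            } else {                            /* otherwise inject noise */
                Y[m][0] += q_filt[m] * ff_sbr_noise_table[noise][0];
                Y[m][1] += q_filt[m] * ff_sbr_noise_table[noise][1];
            }
        }
    }

The SIMD version turns the branch into pcmpeqd masks (noise contributions are pand'ed away wherever s_m[m] != 0, and s_m[m] * phi is zero in the opposite case) and keeps the table index wrapped with "and noiseq, 0x1ff<<3".
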
* - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" @@ -86,7 +85,7 @@ DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { static inline void idct(int16_t *block) { - DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; + LOCAL_ALIGNED_8(int64_t, align_tmp, [16]); int16_t * const temp= (int16_t*)align_tmp; __asm__ volatile( @@ -1148,6 +1147,7 @@ Temp "9: \n\t" :: "r" (block), "r" (temp), "r" (coeffs) + NAMED_CONSTRAINTS_ADD(wm1010,d40000) : "%eax" ); } diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h index 4fc29141b5..4a98732503 100644 --- a/libavcodec/x86/simple_idct.h +++ b/libavcodec/x86/simple_idct.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/snowdsp.c b/libavcodec/x86/snowdsp.c new file mode 100644 index 0000000000..e2ad511d0a --- /dev/null +++ b/libavcodec/x86/snowdsp.c @@ -0,0 +1,908 @@ +/* + * MMX and SSE2 optimized snow DSP utils + * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
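
snowdsp.c, new in this merge, carries the MMX/SSE2 inverse-DWT code for Snow. The vertical compose is easiest to read from its own scalar lead-in loop, which the SIMD body replicates sixteen (or eight) columns at a time; pulled out for orientation (IDWTELEM and the W_* lifting constants come from snow_dwt.h):

    /* one column step of the snow 9/7 integer lifting, taken verbatim from the
     * scalar prologue of ff_snow_vertical_compose97i_sse2 below */
    static void vertical_compose97i_scalar(IDWTELEM *b0, IDWTELEM *b1,
                                           IDWTELEM *b2, IDWTELEM *b3,
                                           IDWTELEM *b4, IDWTELEM *b5, int width)
    {
        for (int i = 0; i < width; i++) {
            b4[i] -= (W_DM * (b3[i] + b5[i]) + W_DO) >> W_DS;
            b3[i] -= (W_CM * (b2[i] + b4[i]) + W_CO) >> W_CS;
            b2[i] += (W_BM * (b1[i] + b3[i]) + 4 * b2[i] + W_BO) >> W_BS;
            b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
        }
    }

The horizontal version performs the same four lifts along a row and then interleaves the low- and high-pass halves back into place (the punpcklwd/punpckhwd block at the end of each function).
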
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/snow.h" +#include "libavcodec/snow_dwt.h" + +#if HAVE_INLINE_ASM + +static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){ + const int w2= (width+1)>>1; + const int w_l= (width>>1); + const int w_r= w2 - 1; + int i; + + { // Lift 0 + IDWTELEM * const ref = b + w2 - 1; + IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice + // (the first time erroneously), we allow the SSE2 code to run an extra pass. + // The savings in code and time are well worth having to store this value and + // calculate b[0] correctly afterwards. + + i = 0; + __asm__ volatile( + "pcmpeqd %%xmm7, %%xmm7 \n\t" + "pcmpeqd %%xmm3, %%xmm3 \n\t" + "psllw $1, %%xmm3 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "psllw $13, %%xmm3 \n\t" + ::); + for(; i<w_l-15; i+=16){ + __asm__ volatile( + "movdqu (%1), %%xmm1 \n\t" + "movdqu 16(%1), %%xmm5 \n\t" + "movdqu 2(%1), %%xmm2 \n\t" + "movdqu 18(%1), %%xmm6 \n\t" + "paddw %%xmm1, %%xmm2 \n\t" + "paddw %%xmm5, %%xmm6 \n\t" + "paddw %%xmm7, %%xmm2 \n\t" + "paddw %%xmm7, %%xmm6 \n\t" + "pmulhw %%xmm3, %%xmm2 \n\t" + "pmulhw %%xmm3, %%xmm6 \n\t" + "paddw (%0), %%xmm2 \n\t" + "paddw 16(%0), %%xmm6 \n\t" + "movdqa %%xmm2, (%0) \n\t" + "movdqa %%xmm6, 16(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); + b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); + } + + { // Lift 1 + IDWTELEM * const dst = b+w2; + + i = 0; + for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){ + dst[i] = dst[i] - (b[i] + b[i + 1]); + } + for(; i<w_r-15; i+=16){ + __asm__ volatile( + "movdqu (%1), %%xmm1 \n\t" + "movdqu 16(%1), %%xmm5 \n\t" + "movdqu 2(%1), %%xmm2 \n\t" + "movdqu 18(%1), %%xmm6 \n\t" + "paddw %%xmm1, %%xmm2 \n\t" + "paddw %%xmm5, %%xmm6 \n\t" + "movdqa (%0), %%xmm0 \n\t" + "movdqa 16(%0), %%xmm4 \n\t" + "psubw %%xmm2, %%xmm0 \n\t" + "psubw %%xmm6, %%xmm4 \n\t" + "movdqa %%xmm0, (%0) \n\t" + "movdqa %%xmm4, 16(%0) \n\t" + :: "r"(&dst[i]), "r"(&b[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); + } + + { // Lift 2 + IDWTELEM * const ref = b+w2 - 1; + IDWTELEM b_0 = b[0]; + + i = 0; + __asm__ volatile( + "psllw $15, %%xmm7 \n\t" + "pcmpeqw %%xmm6, %%xmm6 \n\t" + "psrlw $13, %%xmm6 \n\t" + "paddw %%xmm7, %%xmm6 \n\t" + ::); + for(; i<w_l-15; i+=16){ + __asm__ volatile( + "movdqu (%1), %%xmm0 \n\t" + "movdqu 16(%1), %%xmm4 \n\t" + "movdqu 2(%1), %%xmm1 \n\t" + "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts + "paddw %%xmm6, %%xmm0 \n\t" + "paddw %%xmm6, %%xmm4 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm5 \n\t" + "pavgw %%xmm1, %%xmm0 \n\t" + "pavgw %%xmm5, %%xmm4 \n\t" + "psubw %%xmm7, %%xmm0 \n\t" + "psubw %%xmm7, %%xmm4 \n\t" + "psraw $1, %%xmm0 \n\t" + "psraw $1, %%xmm4 \n\t" + "movdqa (%0), %%xmm1 \n\t" + "movdqa 16(%0), %%xmm5 \n\t" + "paddw %%xmm1, %%xmm0 \n\t" + "paddw %%xmm5, %%xmm4 \n\t" + "psraw $2, %%xmm0 \n\t" + "psraw $2, %%xmm4 \n\t" + "paddw %%xmm1, %%xmm0 \n\t" + "paddw %%xmm5, %%xmm4 \n\t" + "movdqa %%xmm0, (%0) \n\t" + "movdqa %%xmm4, 16(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + 
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); + b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS); + } + + { // Lift 3 + IDWTELEM * const src = b+w2; + + i = 0; + for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){ + temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS); + } + for(; i<w_r-7; i+=8){ + __asm__ volatile( + "movdqu 2(%1), %%xmm2 \n\t" + "movdqu 18(%1), %%xmm6 \n\t" + "paddw (%1), %%xmm2 \n\t" + "paddw 16(%1), %%xmm6 \n\t" + "movdqu (%0), %%xmm0 \n\t" + "movdqu 16(%0), %%xmm4 \n\t" + "paddw %%xmm2, %%xmm0 \n\t" + "paddw %%xmm6, %%xmm4 \n\t" + "psraw $1, %%xmm2 \n\t" + "psraw $1, %%xmm6 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "paddw %%xmm4, %%xmm6 \n\t" + "movdqa %%xmm2, (%2) \n\t" + "movdqa %%xmm6, 16(%2) \n\t" + :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); + } + + { + snow_interleave_line_header(&i, width, b, temp); + + for (; (i & 0x3E) != 0x3E; i-=2){ + b[i+1] = temp[i>>1]; + b[i] = b[i>>1]; + } + for (i-=62; i>=0; i-=64){ + __asm__ volatile( + "movdqa (%1), %%xmm0 \n\t" + "movdqa 16(%1), %%xmm2 \n\t" + "movdqa 32(%1), %%xmm4 \n\t" + "movdqa 48(%1), %%xmm6 \n\t" + "movdqa (%1), %%xmm1 \n\t" + "movdqa 16(%1), %%xmm3 \n\t" + "movdqa 32(%1), %%xmm5 \n\t" + "movdqa 48(%1), %%xmm7 \n\t" + "punpcklwd (%2), %%xmm0 \n\t" + "punpcklwd 16(%2), %%xmm2 \n\t" + "punpcklwd 32(%2), %%xmm4 \n\t" + "punpcklwd 48(%2), %%xmm6 \n\t" + "movdqa %%xmm0, (%0) \n\t" + "movdqa %%xmm2, 32(%0) \n\t" + "movdqa %%xmm4, 64(%0) \n\t" + "movdqa %%xmm6, 96(%0) \n\t" + "punpckhwd (%2), %%xmm1 \n\t" + "punpckhwd 16(%2), %%xmm3 \n\t" + "punpckhwd 32(%2), %%xmm5 \n\t" + "punpckhwd 48(%2), %%xmm7 \n\t" + "movdqa %%xmm1, 16(%0) \n\t" + "movdqa %%xmm3, 48(%0) \n\t" + "movdqa %%xmm5, 80(%0) \n\t" + "movdqa %%xmm7, 112(%0) \n\t" + :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) + : "memory" + ); + } + } +} + +static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){ + const int w2= (width+1)>>1; + const int w_l= (width>>1); + const int w_r= w2 - 1; + int i; + + { // Lift 0 + IDWTELEM * const ref = b + w2 - 1; + + i = 1; + b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "pcmpeqw %%mm3, %%mm3 \n\t" + "psllw $1, %%mm3 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "psllw $13, %%mm3 \n\t" + ::); + for(; i<w_l-7; i+=8){ + __asm__ volatile( + "movq (%1), %%mm2 \n\t" + "movq 8(%1), %%mm6 \n\t" + "paddw 2(%1), %%mm2 \n\t" + "paddw 10(%1), %%mm6 \n\t" + "paddw %%mm7, %%mm2 \n\t" + "paddw %%mm7, %%mm6 \n\t" + "pmulhw %%mm3, %%mm2 \n\t" + "pmulhw %%mm3, %%mm6 \n\t" + "paddw (%0), %%mm2 \n\t" + "paddw 8(%0), %%mm6 \n\t" + "movq %%mm2, (%0) \n\t" + "movq %%mm6, 8(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); + } + + { // Lift 1 + IDWTELEM * const dst = b+w2; + + i = 0; + for(; i<w_r-7; i+=8){ + __asm__ volatile( + "movq (%1), %%mm2 \n\t" + "movq 8(%1), %%mm6 \n\t" + "paddw 2(%1), %%mm2 \n\t" + "paddw 10(%1), %%mm6 \n\t" + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm4 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm6, %%mm4 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm4, 8(%0) \n\t" + :: "r"(&dst[i]), "r"(&b[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); + } + + { // Lift 2 + IDWTELEM * const ref = b+w2 - 1; + + i = 1; + b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 
* b[0]) >> W_BS); + __asm__ volatile( + "psllw $15, %%mm7 \n\t" + "pcmpeqw %%mm6, %%mm6 \n\t" + "psrlw $13, %%mm6 \n\t" + "paddw %%mm7, %%mm6 \n\t" + ::); + for(; i<w_l-7; i+=8){ + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm4 \n\t" + "movq 2(%1), %%mm1 \n\t" + "movq 10(%1), %%mm5 \n\t" + "paddw %%mm6, %%mm0 \n\t" + "paddw %%mm6, %%mm4 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm5 \n\t" + "pavgw %%mm1, %%mm0 \n\t" + "pavgw %%mm5, %%mm4 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psubw %%mm7, %%mm4 \n\t" + "psraw $1, %%mm0 \n\t" + "psraw $1, %%mm4 \n\t" + "movq (%0), %%mm1 \n\t" + "movq 8(%0), %%mm5 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm5, %%mm4 \n\t" + "psraw $2, %%mm0 \n\t" + "psraw $2, %%mm4 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm5, %%mm4 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm4, 8(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); + } + + { // Lift 3 + IDWTELEM * const src = b+w2; + i = 0; + + for(; i<w_r-7; i+=8){ + __asm__ volatile( + "movq 2(%1), %%mm2 \n\t" + "movq 10(%1), %%mm6 \n\t" + "paddw (%1), %%mm2 \n\t" + "paddw 8(%1), %%mm6 \n\t" + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm4 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm6, %%mm4 \n\t" + "psraw $1, %%mm2 \n\t" + "psraw $1, %%mm6 \n\t" + "paddw %%mm0, %%mm2 \n\t" + "paddw %%mm4, %%mm6 \n\t" + "movq %%mm2, (%2) \n\t" + "movq %%mm6, 8(%2) \n\t" + :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); + } + + { + snow_interleave_line_header(&i, width, b, temp); + + for (; (i & 0x1E) != 0x1E; i-=2){ + b[i+1] = temp[i>>1]; + b[i] = b[i>>1]; + } + for (i-=30; i>=0; i-=32){ + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 16(%1), %%mm4 \n\t" + "movq 24(%1), %%mm6 \n\t" + "movq (%1), %%mm1 \n\t" + "movq 8(%1), %%mm3 \n\t" + "movq 16(%1), %%mm5 \n\t" + "movq 24(%1), %%mm7 \n\t" + "punpcklwd (%2), %%mm0 \n\t" + "punpcklwd 8(%2), %%mm2 \n\t" + "punpcklwd 16(%2), %%mm4 \n\t" + "punpcklwd 24(%2), %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, 16(%0) \n\t" + "movq %%mm4, 32(%0) \n\t" + "movq %%mm6, 48(%0) \n\t" + "punpckhwd (%2), %%mm1 \n\t" + "punpckhwd 8(%2), %%mm3 \n\t" + "punpckhwd 16(%2), %%mm5 \n\t" + "punpckhwd 24(%2), %%mm7 \n\t" + "movq %%mm1, 8(%0) \n\t" + "movq %%mm3, 24(%0) \n\t" + "movq %%mm5, 40(%0) \n\t" + "movq %%mm7, 56(%0) \n\t" + :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) + : "memory" + ); + } + } +} + +#if HAVE_7REGS +#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\ + ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ + ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\ + ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\ + ""op" 48("r",%%"REG_d"), %%"t3" \n\t" + +#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\ + snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3) + +#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\ + snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3) + +#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\ + "psubw %%"s0", %%"t0" \n\t"\ + "psubw %%"s1", %%"t1" \n\t"\ + "psubw %%"s2", %%"t2" \n\t"\ + "psubw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\ + "movdqa %%"s0", ("w",%%"REG_d") \n\t"\ + "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\ + "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\ + "movdqa %%"s3", 48("w",%%"REG_d") \n\t" + +#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\ + "psraw $"n", 
%%"t0" \n\t"\ + "psraw $"n", %%"t1" \n\t"\ + "psraw $"n", %%"t2" \n\t"\ + "psraw $"n", %%"t3" \n\t" + +#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\ + "paddw %%"s0", %%"t0" \n\t"\ + "paddw %%"s1", %%"t1" \n\t"\ + "paddw %%"s2", %%"t2" \n\t"\ + "paddw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\ + "pmulhw %%"s0", %%"t0" \n\t"\ + "pmulhw %%"s1", %%"t1" \n\t"\ + "pmulhw %%"s2", %%"t2" \n\t"\ + "pmulhw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ + "movdqa %%"s0", %%"t0" \n\t"\ + "movdqa %%"s1", %%"t1" \n\t"\ + "movdqa %%"s2", %%"t2" \n\t"\ + "movdqa %%"s3", %%"t3" \n\t" + +static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ + x86_reg i = width; + + while(i & 0x1F) + { + i--; + b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; + b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; + b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; + b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; + } + i+=i; + + __asm__ volatile ( + "jmp 2f \n\t" + "1: \n\t" + snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") + + + "pcmpeqw %%xmm0, %%xmm0 \n\t" + "pcmpeqw %%xmm2, %%xmm2 \n\t" + "paddw %%xmm2, %%xmm2 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "psllw $13, %%xmm2 \n\t" + snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") + + "pcmpeqw %%xmm7, %%xmm7 \n\t" + "pcmpeqw %%xmm5, %%xmm5 \n\t" + "psllw $15, %%xmm7 \n\t" + "psrlw $13, %%xmm5 \n\t" + "paddw %%xmm7, %%xmm5 \n\t" + snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") + "movq (%2,%%"REG_d"), %%xmm1 \n\t" + "movq 8(%2,%%"REG_d"), %%xmm3 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "pavgw %%xmm1, %%xmm0 \n\t" + "pavgw %%xmm3, %%xmm2 \n\t" + "movq 16(%2,%%"REG_d"), %%xmm1 \n\t" + "movq 24(%2,%%"REG_d"), %%xmm3 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "pavgw %%xmm1, %%xmm4 \n\t" + "pavgw %%xmm3, %%xmm6 \n\t" + snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") + + snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") + + "2: \n\t" + "sub $64, 
%%"REG_d" \n\t" + "jge 1b \n\t" + :"+d"(i) + :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); +} + +#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ + ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ + ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\ + ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\ + ""op" 24("r",%%"REG_d"), %%"t3" \n\t" + +#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\ + snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3) + +#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\ + snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3) + +#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\ + "movq %%"s0", ("w",%%"REG_d") \n\t"\ + "movq %%"s1", 8("w",%%"REG_d") \n\t"\ + "movq %%"s2", 16("w",%%"REG_d") \n\t"\ + "movq %%"s3", 24("w",%%"REG_d") \n\t" + +#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ + "movq %%"s0", %%"t0" \n\t"\ + "movq %%"s1", %%"t1" \n\t"\ + "movq %%"s2", %%"t2" \n\t"\ + "movq %%"s3", %%"t3" \n\t" + + +static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ + x86_reg i = width; + while(i & 15) + { + i--; + b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; + b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; + b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; + b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; + } + i+=i; + __asm__ volatile( + "jmp 2f \n\t" + "1: \n\t" + + snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") + "pcmpeqw %%mm0, %%mm0 \n\t" + "pcmpeqw %%mm2, %%mm2 \n\t" + "paddw %%mm2, %%mm2 \n\t" + "paddw %%mm0, %%mm2 \n\t" + "psllw $13, %%mm2 \n\t" + snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") + snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") + snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") + "pcmpeqw %%mm7, %%mm7 \n\t" + "pcmpeqw %%mm5, %%mm5 \n\t" + "psllw $15, %%mm7 \n\t" + "psrlw $13, %%mm5 \n\t" + "paddw %%mm7, %%mm5 \n\t" + snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") + "movq (%2,%%"REG_d"), %%mm1 \n\t" + "movq 8(%2,%%"REG_d"), %%mm3 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "pavgw %%mm1, %%mm0 \n\t" + "pavgw %%mm3, %%mm2 \n\t" + "movq 16(%2,%%"REG_d"), %%mm1 \n\t" + "movq 24(%2,%%"REG_d"), %%mm3 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "pavgw %%mm1, %%mm4 \n\t" + "pavgw %%mm3, %%mm6 \n\t" + snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") + + snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") + snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") + snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") + 
snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") + + "2: \n\t" + "sub $32, %%"REG_d" \n\t" + "jge 1b \n\t" + :"+d"(i) + :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); +} +#endif //HAVE_7REGS + +#if HAVE_6REGS +#define snow_inner_add_yblock_sse2_header \ + IDWTELEM * * dst_array = sb->line + src_y;\ + x86_reg tmp;\ + __asm__ volatile(\ + "mov %7, %%"REG_c" \n\t"\ + "mov %6, %2 \n\t"\ + "mov %4, %%"REG_S" \n\t"\ + "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ + "pcmpeqd %%xmm3, %%xmm3 \n\t"\ + "psllw $15, %%xmm3 \n\t"\ + "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ + "1: \n\t"\ + "mov %1, %%"REG_D" \n\t"\ + "mov (%%"REG_D"), %%"REG_D" \n\t"\ + "add %3, %%"REG_D" \n\t" + +#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movq (%%"REG_d"), %%"out_reg1" \n\t"\ + "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ + "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ + "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + "pmullw %%xmm0, %%"out_reg1" \n\t"\ + "pmullw %%xmm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movq (%%"REG_d"), %%"out_reg1" \n\t"\ + "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ + "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ + "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + "pmullw %%xmm0, %%"out_reg1" \n\t"\ + "pmullw %%xmm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ + snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ + "paddusw %%xmm2, %%xmm1 \n\t"\ + "paddusw %%xmm6, %%xmm5 \n\t" + +#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ + snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ + "paddusw %%xmm2, %%xmm1 \n\t"\ + "paddusw %%xmm6, %%xmm5 \n\t" + +#define snow_inner_add_yblock_sse2_end_common1\ + "add $32, %%"REG_S" \n\t"\ + "add %%"REG_c", %0 \n\t"\ + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ + "add %%"REG_c", (%%"REG_a") \n\t" + +#define snow_inner_add_yblock_sse2_end_common2\ + "jnz 1b \n\t"\ + :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ + :\ + "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ + XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\ + "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); + +#define snow_inner_add_yblock_sse2_end_8\ + "sal $1, %%"REG_c" \n\t"\ + "add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\ + snow_inner_add_yblock_sse2_end_common1\ + "sar $1, %%"REG_c" \n\t"\ + "sub $2, %2 \n\t"\ + snow_inner_add_yblock_sse2_end_common2 + +#define snow_inner_add_yblock_sse2_end_16\ + "add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\ + snow_inner_add_yblock_sse2_end_common1\ + "dec %2 \n\t"\ + snow_inner_add_yblock_sse2_end_common2 + +static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int 
add, uint8_t * dst8){ +snow_inner_add_yblock_sse2_header +snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") +snow_inner_add_yblock_sse2_accum_8("2", "8") +snow_inner_add_yblock_sse2_accum_8("1", "128") +snow_inner_add_yblock_sse2_accum_8("0", "136") + + "mov %0, %%"REG_d" \n\t" + "movdqa (%%"REG_D"), %%xmm0 \n\t" + "movdqa %%xmm1, %%xmm2 \n\t" + + "punpckhwd %%xmm7, %%xmm1 \n\t" + "punpcklwd %%xmm7, %%xmm2 \n\t" + "paddd %%xmm2, %%xmm0 \n\t" + "movdqa 16(%%"REG_D"), %%xmm2 \n\t" + "paddd %%xmm1, %%xmm2 \n\t" + "paddd %%xmm3, %%xmm0 \n\t" + "paddd %%xmm3, %%xmm2 \n\t" + + "mov %1, %%"REG_D" \n\t" + "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" + "add %3, %%"REG_D" \n\t" + + "movdqa (%%"REG_D"), %%xmm4 \n\t" + "movdqa %%xmm5, %%xmm6 \n\t" + "punpckhwd %%xmm7, %%xmm5 \n\t" + "punpcklwd %%xmm7, %%xmm6 \n\t" + "paddd %%xmm6, %%xmm4 \n\t" + "movdqa 16(%%"REG_D"), %%xmm6 \n\t" + "paddd %%xmm5, %%xmm6 \n\t" + "paddd %%xmm3, %%xmm4 \n\t" + "paddd %%xmm3, %%xmm6 \n\t" + + "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ + "packssdw %%xmm2, %%xmm0 \n\t" + "packuswb %%xmm7, %%xmm0 \n\t" + "movq %%xmm0, (%%"REG_d") \n\t" + + "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ + "packssdw %%xmm6, %%xmm4 \n\t" + "packuswb %%xmm7, %%xmm4 \n\t" + "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" +snow_inner_add_yblock_sse2_end_8 +} + +static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_sse2_header +snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") +snow_inner_add_yblock_sse2_accum_16("2", "16") +snow_inner_add_yblock_sse2_accum_16("1", "512") +snow_inner_add_yblock_sse2_accum_16("0", "528") + + "mov %0, %%"REG_d" \n\t" + "psrlw $4, %%xmm1 \n\t" + "psrlw $4, %%xmm5 \n\t" + "paddw (%%"REG_D"), %%xmm1 \n\t" + "paddw 16(%%"REG_D"), %%xmm5 \n\t" + "paddw %%xmm3, %%xmm1 \n\t" + "paddw %%xmm3, %%xmm5 \n\t" + "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ + "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. 
*/ + "packuswb %%xmm5, %%xmm1 \n\t" + + "movdqu %%xmm1, (%%"REG_d") \n\t" + +snow_inner_add_yblock_sse2_end_16 +} + +#define snow_inner_add_yblock_mmx_header \ + IDWTELEM * * dst_array = sb->line + src_y;\ + x86_reg tmp;\ + __asm__ volatile(\ + "mov %7, %%"REG_c" \n\t"\ + "mov %6, %2 \n\t"\ + "mov %4, %%"REG_S" \n\t"\ + "pxor %%mm7, %%mm7 \n\t" /* 0 */\ + "pcmpeqd %%mm3, %%mm3 \n\t"\ + "psllw $15, %%mm3 \n\t"\ + "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ + "1: \n\t"\ + "mov %1, %%"REG_D" \n\t"\ + "mov (%%"REG_D"), %%"REG_D" \n\t"\ + "add %3, %%"REG_D" \n\t" + +#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ + "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ + "punpcklbw %%mm7, %%"out_reg1" \n\t"\ + "punpcklbw %%mm7, %%"out_reg2" \n\t"\ + "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ + "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "pmullw %%mm0, %%"out_reg1" \n\t"\ + "pmullw %%mm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ + snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ + "paddusw %%mm2, %%mm1 \n\t"\ + "paddusw %%mm6, %%mm5 \n\t" + +#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ + "mov %0, %%"REG_d" \n\t"\ + "psrlw $4, %%mm1 \n\t"\ + "psrlw $4, %%mm5 \n\t"\ + "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\ + "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psraw $4, %%mm1 \n\t"\ + "psraw $4, %%mm5 \n\t"\ + "packuswb %%mm5, %%mm1 \n\t"\ + "movq %%mm1, "write_offset"(%%"REG_d") \n\t" + +#define snow_inner_add_yblock_mmx_end(s_step)\ + "add $"s_step", %%"REG_S" \n\t"\ + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ + "add %%"REG_c", (%%"REG_a") \n\t"\ + "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\ + "add %%"REG_c", %0 \n\t"\ + "dec %2 \n\t"\ + "jnz 1b \n\t"\ + :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ + :\ + "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ + "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); + +static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_mmx_header +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") +snow_inner_add_yblock_mmx_accum("2", "8", "0") +snow_inner_add_yblock_mmx_accum("1", "128", "0") +snow_inner_add_yblock_mmx_accum("0", "136", "0") +snow_inner_add_yblock_mmx_mix("0", "0") +snow_inner_add_yblock_mmx_end("16") +} + +static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_mmx_header +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") +snow_inner_add_yblock_mmx_accum("2", "16", "0") +snow_inner_add_yblock_mmx_accum("1", "512", "0") +snow_inner_add_yblock_mmx_accum("0", "528", "0") +snow_inner_add_yblock_mmx_mix("0", "0") + +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") +snow_inner_add_yblock_mmx_accum("2", "24", "8") +snow_inner_add_yblock_mmx_accum("1", 
"520", "8") +snow_inner_add_yblock_mmx_accum("0", "536", "8") +snow_inner_add_yblock_mmx_mix("16", "8") +snow_inner_add_yblock_mmx_end("32") +} + +static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ + + if (b_w == 16) + inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else if (b_w == 8 && obmc_stride == 16) { + if (!(b_h & 1)) + inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else + inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + } else + ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); +} + +static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ + if (b_w == 16) + inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else if (b_w == 8 && obmc_stride == 16) + inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else + ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); +} +#endif /* HAVE_6REGS */ + +#endif /* HAVE_INLINE_ASM */ + +av_cold void ff_dwt_init_x86(SnowDWTContext *c) +{ +#if HAVE_INLINE_ASM + int mm_flags = av_get_cpu_flags(); + + if (mm_flags & AV_CPU_FLAG_MMX) { + if(mm_flags & AV_CPU_FLAG_SSE2 & 0){ + c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; +#if HAVE_7REGS + c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; +#endif +#if HAVE_6REGS + c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; +#endif + } + else{ + if (mm_flags & AV_CPU_FLAG_MMXEXT) { + c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; +#if HAVE_7REGS + c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; +#endif + } +#if HAVE_6REGS + c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; +#endif + } + } +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/x86/svq1enc.asm b/libavcodec/x86/svq1enc.asm new file mode 100644 index 0000000000..24ee70f108 --- /dev/null +++ b/libavcodec/x86/svq1enc.asm @@ -0,0 +1,61 @@ +;****************************************************************************** +;* SIMD-optimized SVQ1 encoder functions +;* Copyright (c) 2007 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_TEXT + +%macro SSD_INT8_VS_INT16 0 +cglobal ssd_int8_vs_int16, 3, 3, 3, pix1, pix2, size + pxor m0, m0 +.loop + sub sizeq, 8 + movq m1, [pix1q + sizeq] + mova m2, [pix2q + sizeq*2] +%if mmsize == 8 + movq m3, [pix2q + sizeq*2 + mmsize] + punpckhbw m4, m1 + punpcklbw m1, m1 + psraw m4, 8 + psraw m1, 8 + psubw m3, m4 + psubw m2, m1 + pmaddwd m3, m3 + pmaddwd m2, m2 + paddd m0, m3 + paddd m0, m2 +%else + punpcklbw m1, m1 + psraw m1, 8 + psubw m2, m1 + pmaddwd m2, m2 + paddd m0, m2 +%endif + jg .loop + HADDD m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_MMX mmx +SSD_INT8_VS_INT16 +INIT_XMM sse2 +SSD_INT8_VS_INT16 diff --git a/libavcodec/x86/svq1enc.c b/libavcodec/x86/svq1enc.c deleted file mode 100644 index 02b0a84b8c..0000000000 --- a/libavcodec/x86/svq1enc.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/svq1enc.h" - -#if HAVE_INLINE_ASM - -static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, - int size) -{ - int sum; - x86_reg i = size; - - __asm__ volatile ( - "pxor %%mm4, %%mm4 \n" - "1: \n" - "sub $8, %0 \n" - "movq (%2, %0), %%mm2 \n" - "movq (%3, %0, 2), %%mm0 \n" - "movq 8(%3, %0, 2), %%mm1 \n" - "punpckhbw %%mm2, %%mm3 \n" - "punpcklbw %%mm2, %%mm2 \n" - "psraw $8, %%mm3 \n" - "psraw $8, %%mm2 \n" - "psubw %%mm3, %%mm1 \n" - "psubw %%mm2, %%mm0 \n" - "pmaddwd %%mm1, %%mm1 \n" - "pmaddwd %%mm0, %%mm0 \n" - "paddd %%mm1, %%mm4 \n" - "paddd %%mm0, %%mm4 \n" - "jg 1b \n" - "movq %%mm4, %%mm3 \n" - "psrlq $32, %%mm3 \n" - "paddd %%mm3, %%mm4 \n" - "movd %%mm4, %1 \n" - : "+r" (i), "=r" (sum) - : "r" (pix1), "r" (pix2)); - - return sum; -} - -#endif /* HAVE_INLINE_ASM */ - -av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c) -{ -#if HAVE_INLINE_ASM - int cpu_flags = av_get_cpu_flags(); - - if (INLINE_MMX(cpu_flags)) { - c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; - } -#endif /* HAVE_INLINE_ASM */ -} diff --git a/libavcodec/x86/svq1enc_init.c b/libavcodec/x86/svq1enc_init.c new file mode 100644 index 0000000000..40b4b0e183 --- /dev/null +++ b/libavcodec/x86/svq1enc_init.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2007 Loren Merritt + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/svq1enc.h" + +int ff_ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, + intptr_t size); +int ff_ssd_int8_vs_int16_sse2(const int8_t *pix1, const int16_t *pix2, + intptr_t size); + +av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_mmx; + } + if (EXTERNAL_SSE2(cpu_flags)) { + c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_sse2; + } +} diff --git a/libavcodec/x86/ttadsp.asm b/libavcodec/x86/ttadsp.asm new file mode 100644 index 0000000000..8f489498a3 --- /dev/null +++ b/libavcodec/x86/ttadsp.asm @@ -0,0 +1,119 @@ +;****************************************************************************** +;* TTA DSP SIMD optimizations +;* +;* Copyright (C) 2014 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
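
The TTA filter that follows is a straight vectorization of TTA's adaptive hybrid filter; its per-sample recurrence, reconstructed from the comments threaded through the asm (so a sketch derived from this patch, not necessarily the exact ttadsp.c), is:

    static void ttafilter_process_dec_scalar(int32_t *qm, int32_t *dx, int32_t *dl,
                                             int32_t *error, int32_t *in,
                                             int32_t shift, int32_t round)
    {
        int32_t sum = round;

        if (*error < 0)                       /* adapt coefficients by the sign */
            for (int i = 0; i < 8; i++) qm[i] -= dx[i];   /* of the last error  */
        else if (*error > 0)
            for (int i = 0; i < 8; i++) qm[i] += dx[i];

        for (int i = 0; i < 8; i++)           /* 8-tap FIR part */
            sum += dl[i] * qm[i];

        dx[0] = dx[1]; dx[1] = dx[2]; dx[2] = dx[3]; dx[3] = dx[4];
        dx[4] = ((dl[4] >> 30) | 1);          /* sign-derived adaption steps */
        dx[5] = ((dl[5] >> 30) | 2) & ~1;
        dx[6] = ((dl[6] >> 30) | 2) & ~1;
        dx[7] = ((dl[7] >> 30) | 4) & ~3;

        dl[0] = dl[1]; dl[1] = dl[2]; dl[2] = dl[3]; dl[3] = dl[4];

        *error = *in;
        *in += (sum >> shift);

        dl[4] = -dl[5]; dl[5] = -dl[6];       /* rebuild the difference chain */
        dl[6] = *in - dl[7]; dl[7] = *in;
        dl[5] += dl[6]; dl[4] += dl[5];
    }

The asm keeps qm, dx and dl each in a pair of xmm registers and expresses the shifts of dx/dl as palignr, which is why SSSE3 is the baseline.
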
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pd_n0113: dd ~0, ~1, ~1, ~3 +pd_1224: dd 1, 2, 2, 4 + +SECTION .text + +%macro TTA_FILTER 2 +INIT_XMM %1 +cglobal ttafilter_process_dec, 5,5,%2, qm, dx, dl, error, in, shift, round + mova m2, [qmq ] + mova m3, [qmq + 0x10] + mova m4, [dxq ] + mova m5, [dxq + 0x10] + + movd m6, [errorq] ; if (filter->error < 0) { + SPLATD m6 ; for (int i = 0; i < 8; i++) + psignd m0, m4, m6 ; filter->qm[i] -= filter->dx[i]; + psignd m1, m5, m6 ; } else if (filter->error > 0) { + paddd m2, m0 ; for (int i = 0; i < 8; i++) + paddd m3, m1 ; filter->qm[i] += filter->dx[i]; + mova [qmq ], m2 ; } + mova [qmq + 0x10], m3 ; + + mova m0, [dlq ] + mova m1, [dlq + 0x10] + +%if cpuflag(sse4) + pmulld m2, m0 + pmulld m3, m1 +%else + pshufd m6, m0, 0xb1 + pshufd m7, m2, 0xb1 + pmuludq m6, m7 + pshufd m6, m6, 0xd8 + pmuludq m2, m0 + pshufd m2, m2, 0xd8 + punpckldq m2, m6 + + pshufd m6, m1, 0xb1 + pshufd m7, m3, 0xb1 + pmuludq m6, m7 + pshufd m6, m6, 0xd8 + pmuludq m3, m1 + pshufd m3, m3, 0xd8 + punpckldq m3, m6 +%endif + ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around + paddd m2, m3 ; int sum = filter->round + + ; filter->dl[0] * filter->qm[0] + + pshufd m3, m2, 0xe ; filter->dl[1] * filter->qm[1] + + paddd m2, m3 ; filter->dl[2] * filter->qm[2] + + ; filter->dl[3] * filter->qm[3] + + movd m6, roundm ; filter->dl[4] * filter->qm[4] + + paddd m6, m2 ; filter->dl[5] * filter->qm[5] + + pshufd m2, m2, 0x1 ; filter->dl[6] * filter->qm[6] + + paddd m6, m2 ; filter->dl[7] * filter->qm[7]; + + palignr m5, m4, 4 ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2]; + ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4]; + + palignr m2, m1, m0, 4 ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2]; + ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4]; + + psrad m4, m1, 30 ; filter->dx[4] = ((filter->dl[4] >> 30) | 1); + por m4, [pd_1224 ] ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1; + pand m4, [pd_n0113] ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1; + ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3; + + mova [dlq ], m2 + mova [dxq ], m5 + mova [dxq + 0x10], m4 + movd m0, [inq] ; filter->error = *in; + movd [errorq], m0 ; + + movd m2, shiftm ; *in += (sum >> filter->shift); + psrad m6, m2 ; + paddd m0, m6 ; + movd [inq], m0 ; + + psrldq m1, 4 ; + pslldq m0, 12 ; filter->dl[4] = -filter->dl[5]; + pshufd m0, m0, 0xf0 ; filter->dl[5] = -filter->dl[6]; + psubd m0, m1 ; filter->dl[6] = *in - filter->dl[7]; + psrldq m1, m0, 4 ; filter->dl[7] = *in; + pshufd m1, m1, 0xf4 ; filter->dl[5] += filter->dl[6]; + paddd m0, m1 ; filter->dl[4] += filter->dl[5]; + psrldq m1, 4 ; + paddd m0, m1 ; + mova [dlq + 0x10], m0 ; + RET +%endmacro + +TTA_FILTER ssse3, 8 +TTA_FILTER sse4, 7 diff --git a/libavcodec/x86/ttadsp_init.c b/libavcodec/x86/ttadsp_init.c new file mode 100644 index 0000000000..47dc87f6af --- /dev/null +++ b/libavcodec/x86/ttadsp_init.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2014 James Almer + * + * This file is part of FFmpeg. 
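
The only real difference between the two TTA_FILTER instantiations is the 32-bit multiply: SSE4.1 has pmulld, while the SSSE3 build emulates it with pmuludq on even/odd lanes plus shuffles. The same trick in intrinsics, for reference (a generic sketch, not code from the tree):

    #include <emmintrin.h>

    /* lane-wise low-32-bit multiply on SSE2-class hardware, mirroring the
     * pshufd/pmuludq/punpckldq sequence in the %else branch above */
    static inline __m128i mullo_epi32_sse2(__m128i a, __m128i b)
    {
        __m128i even = _mm_mul_epu32(a, b);                  /* lanes 0 and 2 */
        __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4),   /* lanes 1 and 3 */
                                     _mm_srli_si128(b, 4));
        even = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0));
        odd  = _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0));
        return _mm_unpacklo_epi32(even, odd);                /* p0 p1 p2 p3 */
    }

The low 32 bits of an unsigned product equal those of the signed product, so the emulation is exact for this use.
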
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/ttadsp.h" +#include "libavutil/x86/cpu.h" +#include "config.h" + +void ff_ttafilter_process_dec_ssse3(int32_t *qm, int32_t *dx, int32_t *dl, + int32_t *error, int32_t *in, int32_t shift, + int32_t round); +void ff_ttafilter_process_dec_sse4(int32_t *qm, int32_t *dx, int32_t *dl, + int32_t *error, int32_t *in, int32_t shift, + int32_t round); + +av_cold void ff_ttadsp_init_x86(TTADSPContext *c) +{ +#if HAVE_YASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSSE3(cpu_flags)) + c->ttafilter_process_dec = ff_ttafilter_process_dec_ssse3; + if (EXTERNAL_SSE4(cpu_flags)) + c->ttafilter_process_dec = ff_ttafilter_process_dec_sse4; +#endif +} diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c new file mode 100644 index 0000000000..dfdfd2631b --- /dev/null +++ b/libavcodec/x86/v210-init.c @@ -0,0 +1,48 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
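To make the heavily annotated ttadsp.asm above easier to follow, here is the whole filter step restated in scalar C. This is reconstructed from the comments in the assembly, so treat it as a sketch of the semantics (and the _c name suffix as illustrative) rather than the canonical ttadsp.c implementation:

    static void ttafilter_process_dec_c(int32_t *qm, int32_t *dx, int32_t *dl,
                                        int32_t *error, int32_t *in,
                                        int32_t shift, int32_t round)
    {
        int32_t sum = round;

        /* adapt coefficients by the sign of the previous prediction error */
        if (*error < 0) {
            for (int i = 0; i < 8; i++)
                qm[i] -= dx[i];
        } else if (*error > 0) {
            for (int i = 0; i < 8; i++)
                qm[i] += dx[i];
        }

        /* 8-tap prediction */
        for (int i = 0; i < 8; i++)
            sum += dl[i] * qm[i];

        /* shift the adaptation history and rebuild its top half from the
         * sign bits of dl[4..7] (the psrad/por/pand with pd_1224/pd_n0113) */
        for (int i = 0; i < 4; i++)
            dx[i] = dx[i + 1];
        dx[4] =  (dl[4] >> 30) | 1;
        dx[5] = ((dl[5] >> 30) | 2) & ~1;
        dx[6] = ((dl[6] >> 30) | 2) & ~1;
        dx[7] = ((dl[7] >> 30) | 4) & ~3;

        *error = *in;          /* remember the residual ...      */
        *in   += sum >> shift; /* ... and reconstruct the sample */

        /* shift the sample history; the top half stores differences */
        for (int i = 0; i < 4; i++)
            dl[i] = dl[i + 1];
        dl[4] = -dl[5];
        dl[5] = -dl[6];
        dl[6] = *in - dl[7];
        dl[7] = *in;
        dl[5] += dl[6];
        dl[4] += dl[5];
    }

The SSSE3 variant needs one more XMM register than the SSE4 one (the 8 vs 7 in the TTA_FILTER invocations) because it emulates SSE4.1's 32-bit pmulld with pmuludq plus shuffles, as the %if cpuflag(sse4) block shows.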
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavcodec/v210dec.h" + +extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); + +extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); + +av_cold void ff_v210_x86_init(V210DecContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_YASM + if (s->aligned_input) { + if (cpu_flags & AV_CPU_FLAG_SSSE3) + s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3; + + if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) + s->unpack_frame = ff_v210_planar_unpack_aligned_avx; + } + else { + if (cpu_flags & AV_CPU_FLAG_SSSE3) + s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3; + + if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) + s->unpack_frame = ff_v210_planar_unpack_unaligned_avx; + } +#endif +} diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm new file mode 100644 index 0000000000..400a1f3f9e --- /dev/null +++ b/libavcodec/x86/v210.asm @@ -0,0 +1,90 @@ +;****************************************************************************** +;* V210 SIMD unpack +;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu> +;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
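For orientation, v210 stores three 10-bit components in each little-endian 32-bit word, six pixels per group of four words. A scalar sketch of one group (a hedged reconstruction with an illustrative helper name; the real C fallback lives in v210dec.c):

    static void v210_unpack_group(const uint32_t *src,
                                  uint16_t *y, uint16_t *u, uint16_t *v)
    {
        /* each word: bits 0-9, 10-19, 20-29; bits 30-31 unused */
        uint32_t w0 = src[0], w1 = src[1], w2 = src[2], w3 = src[3];

        u[0] = w0 & 0x3ff; y[0] = (w0 >> 10) & 0x3ff; v[0] = (w0 >> 20) & 0x3ff;
        y[1] = w1 & 0x3ff; u[1] = (w1 >> 10) & 0x3ff; y[2] = (w1 >> 20) & 0x3ff;
        v[1] = w2 & 0x3ff; y[3] = (w2 >> 10) & 0x3ff; u[2] = (w2 >> 20) & 0x3ff;
        y[4] = w3 & 0x3ff; v[2] = (w3 >> 10) & 0x3ff; y[5] = (w3 >> 20) & 0x3ff;
    }

The SSSE3/AVX kernel in v210.asm below does the same job in two halves: pmullw/psrlw aligns one set of 10-bit fields while psrld/pand extracts the other, and the two pshufb shuffles (v210_luma_shuf / v210_chroma_shuf) scatter the results into planar order.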
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +v210_mask: times 4 dd 0x3ff +v210_mult: dw 64,4,64,4,64,4,64,4 +v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 +v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 + +SECTION .text + +%macro v210_planar_unpack 1 + +; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) +cglobal v210_planar_unpack_%1, 5, 5, 7 + movsxdifnidn r4, r4d + lea r1, [r1+2*r4] + add r2, r4 + add r3, r4 + neg r4 + + mova m3, [v210_mult] + mova m4, [v210_mask] + mova m5, [v210_luma_shuf] + mova m6, [v210_chroma_shuf] +.loop +%ifidn %1, unaligned + movu m0, [r0] +%else + mova m0, [r0] +%endif + + pmullw m1, m0, m3 + psrld m0, 10 + psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5 + pand m0, m4 ; y0 __ u1 __ y3 __ v2 __ + + shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __ + pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __ + movu [r1+2*r4], m2 + + shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __ + pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __ + movq [r2+r4], m1 + movhps [r3+r4], m1 + + add r0, mmsize + add r4, 6 + jl .loop + + REP_RET +%endmacro + +INIT_XMM ssse3 +v210_planar_unpack unaligned + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +v210_planar_unpack unaligned +%endif + +INIT_XMM ssse3 +v210_planar_unpack aligned + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +v210_planar_unpack aligned +%endif diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index 595c8907b3..751675fc5e 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -2,20 +2,20 @@ ;* V210 SIMD pack ;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -23,7 +23,8 @@ SECTION_RODATA -v210_enc_min_10: times 8 dw 0x4 +cextern pw_4 +%define v210_enc_min_10 pw_4 v210_enc_max_10: times 8 dw 0x3fb v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0 @@ -32,8 +33,10 @@ v210_enc_luma_shuf_10: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11 v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0 v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1 -v210_enc_min_8: times 16 db 0x1 -v210_enc_max_8: times 16 db 0xfe +cextern pb_1 +%define v210_enc_min_8 pb_1 +cextern pb_FE +%define v210_enc_max_8 pb_FE v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1 v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0 diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c index 95b999bc05..2afb1b2d7b 100644 --- a/libavcodec/x86/v210enc_init.c +++ b/libavcodec/x86/v210enc_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm index adf08d7d84..546688cf9d 100644 --- a/libavcodec/x86/vc1dsp.asm +++ b/libavcodec/x86/vc1dsp.asm @@ -2,20 +2,20 @@ ;* VC1 deblocking optimizations ;* Copyright (c) 2009 David Conrad ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/vc1dsp.h b/libavcodec/x86/vc1dsp.h index 9b6c8ada26..fdd4de1813 100644 --- a/libavcodec/x86/vc1dsp.h +++ b/libavcodec/x86/vc1dsp.h @@ -1,20 +1,20 @@ /* * VC-1 and WMV3 decoder - X86 DSP init functions * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c index aff4b264e3..2bef5f5fb5 100644 --- a/libavcodec/x86/vc1dsp_init.c +++ b/libavcodec/x86/vc1dsp_init.c @@ -27,6 +27,7 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" +#include "libavutil/x86/asm.h" #include "libavcodec/vc1dsp.h" #include "fpel.h" #include "vc1dsp.h" @@ -62,12 +63,17 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq) ff_vc1_h_loop_filter8_sse4(src, stride, pq); ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq); } - static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd) { ff_avg_pixels8_mmxext(dst, src, stride, 8); } +static void avg_vc1_mspel_mc00_16_sse2(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride, int rnd) +{ + ff_avg_pixels16_sse2(dst, src, stride, 16); +} + #endif /* HAVE_YASM */ void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src, @@ -86,10 +92,10 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) { int cpu_flags = av_get_cpu_flags(); - if (INLINE_MMX(cpu_flags)) + if (HAVE_6REGS && INLINE_MMX(cpu_flags)) ff_vc1dsp_init_mmx(dsp); - if (INLINE_MMXEXT(cpu_flags)) + if (HAVE_6REGS && INLINE_MMXEXT(cpu_flags)) ff_vc1dsp_init_mmxext(dsp); #define ASSIGN_LF(EXT) \ @@ -111,13 +117,14 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) ASSIGN_LF(mmxext); dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext; - dsp->avg_vc1_mspel_pixels_tab[0] = avg_vc1_mspel_mc00_mmxext; + dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) { dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2; dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2; dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2; dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2; + dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_sse2; } if (EXTERNAL_SSSE3(cpu_flags)) { ASSIGN_LF(ssse3); diff 
--git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c index 046affbc26..a7eb59df47 100644 --- a/libavcodec/x86/vc1dsp_mmx.c +++ b/libavcodec/x86/vc1dsp_mmx.c @@ -25,7 +25,6 @@ */ #include "libavutil/cpu.h" -#include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" @@ -34,7 +33,7 @@ #include "fpel.h" #include "vc1dsp.h" -#if HAVE_INLINE_ASM +#if HAVE_6REGS && HAVE_INLINE_ASM #define OP_PUT(S,D) #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t" @@ -81,7 +80,7 @@ "movq %%mm"#R1", "#OFF"(%1) \n\t" \ "add %2, %0 \n\t" -/** Sacrifying mm6 allows to pipeline loads from src */ +/** Sacrificing mm6 makes it possible to pipeline loads from src */ static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src, x86_reg stride, int rnd, int64_t shift) @@ -111,6 +110,7 @@ static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, : "+r"(src), "+r"(dst) : "r"(stride), "r"(-2*stride), "m"(shift), "m"(rnd), "r"(9*stride-4) + NAMED_CONSTRAINTS_ADD(ff_pw_9) : "%"REG_c, "memory" ); } @@ -155,6 +155,7 @@ static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\ "jnz 1b \n\t"\ : "+r"(h), "+r" (src), "+r" (dst)\ : "r"(stride), "m"(rnd)\ + NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\ : "memory"\ );\ } @@ -213,6 +214,7 @@ static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\ : "+r"(src), "+r"(dst)\ : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\ "g"(stride-offset)\ + NAMED_CONSTRAINTS_ADD(ff_pw_9)\ : "%"REG_c, "memory"\ );\ } @@ -315,6 +317,7 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \ : "+r"(h), "+r" (src), "+r" (dst) \ : "r"(src_stride), "r"(3*src_stride), \ "m"(rnd), "m"(shift) \ + NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \ : "memory" \ ); \ } @@ -352,6 +355,7 @@ OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \ "jnz 1b \n\t" \ : "+r"(h), "+r" (src), "+r" (dst) \ : "r"(stride), "m"(rnd) \ + NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \ : "memory" \ ); \ } @@ -387,6 +391,7 @@ OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \ "jnz 1b \n\t" \ : "+r"(h), "+r" (src), "+r" (dst) \ : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \ + NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \ : "memory" \ ); \ } @@ -441,7 +446,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ static const int shift_value[] = { 0, 5, 1, 5 };\ int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ int r;\ - DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\ + LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);\ \ r = (1<<(shift-1)) + rnd-1;\ vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\ @@ -457,6 +462,15 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ \ /* Horizontal mode with no vertical mode */\ vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ +} \ +static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \ + int stride, int hmode, int vmode, int rnd)\ +{ \ + OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ + OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ + dst += 8*stride; src += 8*stride; \ + OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ + OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ } VC1_MSPEL_MC(put_) @@ -477,6 +491,20 @@ static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \ int rnd) \ { \ avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ +}\ +static void 
put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \ + const uint8_t *src, \ + ptrdiff_t stride, \ + int rnd) \ +{ \ + put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ +}\ +static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \ + const uint8_t *src,\ + ptrdiff_t stride, \ + int rnd) \ +{ \ + avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ } DECLARE_FUNCTION(0, 1) @@ -700,59 +728,83 @@ static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize, ); } +#if HAVE_MMX_EXTERNAL static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd) { ff_put_pixels8_mmx(dst, src, stride, 8); } +static void put_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride, int rnd) +{ + ff_put_pixels16_mmx(dst, src, stride, 16); +} +static void avg_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride, int rnd) +{ + ff_avg_pixels8_mmx(dst, src, stride, 8); +} +static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride, int rnd) +{ + ff_avg_pixels16_mmx(dst, src, stride, 16); +} +#endif + +#define FN_ASSIGN(OP, X, Y, INSN) \ + dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \ + dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) { - dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx; - dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; - dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; - dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx; - dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx; - dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx; - dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx; - dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx; - dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx; - dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; - dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; - dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; - dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; +#if HAVE_MMX_EXTERNAL + FN_ASSIGN(put_, 0, 0, _mmx); + FN_ASSIGN(avg_, 0, 0, _mmx); +#endif + FN_ASSIGN(put_, 0, 1, _mmx); + FN_ASSIGN(put_, 0, 2, _mmx); + FN_ASSIGN(put_, 0, 3, _mmx); + + FN_ASSIGN(put_, 1, 0, _mmx); + FN_ASSIGN(put_, 1, 1, _mmx); + FN_ASSIGN(put_, 1, 2, _mmx); + FN_ASSIGN(put_, 1, 3, _mmx); + + FN_ASSIGN(put_, 2, 0, _mmx); + FN_ASSIGN(put_, 2, 1, _mmx); + FN_ASSIGN(put_, 2, 2, _mmx); + FN_ASSIGN(put_, 2, 3, _mmx); + + FN_ASSIGN(put_, 3, 0, _mmx); + FN_ASSIGN(put_, 3, 1, _mmx); + FN_ASSIGN(put_, 3, 2, _mmx); + FN_ASSIGN(put_, 3, 3, _mmx); } av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp) { - dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext; - dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext; + FN_ASSIGN(avg_, 0, 1, _mmxext); + FN_ASSIGN(avg_, 0, 2, _mmxext); + FN_ASSIGN(avg_, 0, 3, _mmxext); - dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext; - dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext; + FN_ASSIGN(avg_, 
1, 0, _mmxext); + FN_ASSIGN(avg_, 1, 1, _mmxext); + FN_ASSIGN(avg_, 1, 2, _mmxext); + FN_ASSIGN(avg_, 1, 3, _mmxext); - dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext; - dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext; - dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext; + FN_ASSIGN(avg_, 2, 0, _mmxext); + FN_ASSIGN(avg_, 2, 1, _mmxext); + FN_ASSIGN(avg_, 2, 2, _mmxext); + FN_ASSIGN(avg_, 2, 3, _mmxext); - dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext; - dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext; - dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext; + FN_ASSIGN(avg_, 3, 0, _mmxext); + FN_ASSIGN(avg_, 3, 1, _mmxext); + FN_ASSIGN(avg_, 3, 2, _mmxext); + FN_ASSIGN(avg_, 3, 3, _mmxext); dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext; dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext; dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext; dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext; } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_6REGS && HAVE_INLINE_ASM */ diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm index 53b9e8292c..25d43640ab 100644 --- a/libavcodec/x86/videodsp.asm +++ b/libavcodec/x86/videodsp.asm @@ -2,20 +2,20 @@ ;* Core video DSP functions ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
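Context for the FN_ASSIGN change running through vc1dsp_mmx.c above: the mspel function tables gained a leading dimension, with index [0] holding the new 16x16 variants and [1] the original 8x8 ones, and the quarter-pel position still encoded as X + 4*Y. A hypothetical caller-side illustration (helper name and call shape are for exposition only, not code from the tree):

    static void mspel_put(VC1DSPContext *dsp, int block_is_8x8,
                          uint8_t *dst, const uint8_t *src,
                          ptrdiff_t stride, int hmode, int vmode, int rnd)
    {
        void (*mc)(uint8_t *, const uint8_t *, ptrdiff_t, int) =
            dsp->put_vc1_mspel_pixels_tab[block_is_8x8 ? 1 : 0][hmode + 4 * vmode];
        mc(dst, src, stride, rnd);
    }

The 16x16 variants themselves are just four 8x8 calls arranged in a square, as the *_mc_16 wrappers generated inside VC1_MSPEL_MC show.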
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -54,13 +54,13 @@ SECTION .text ; | | <- bottom is copied from last line in body of source ; '----' <- bh %if ARCH_X86_64 -cglobal emu_edge_vvar, 7, 8, 1, dst, src, dst_stride, src_stride, \ +cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \ start_y, end_y, bh, w %else ; x86-32 cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w %define src_strideq r3mp -%define dst_strideq r2mp - mov srcq, r1mp +%define dst_strideq r1mp + mov srcq, r2mp mov start_yq, r4mp mov end_yq, r5mp mov bhq, r6mp @@ -97,7 +97,10 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w neg n_wordsq lea start_xq, [start_xq+n_wordsq*2] .y_loop: ; do { - ; FIXME also write a ssse3 version using pshufb +%if cpuflag(avx2) + vpbroadcastb m0, [dstq+start_xq] + mov wq, n_wordsq ; initialize w +%else movzx wd, byte [dstq+start_xq] ; w = read(1) imul wd, 0x01010101 ; w *= 0x01010101 movd m0, wd @@ -107,6 +110,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w %else ; mmx punpckldq m0, m0 ; splat %endif ; mmx/sse +%endif ; avx2 .x_loop: ; do { movu [dstq+wq*2], m0 ; write($reg, $mmsize) add wq, mmsize/2 ; w -= $mmsize/2 @@ -127,6 +131,11 @@ hvar_fn INIT_XMM sse2 hvar_fn +%if HAVE_AVX2_EXTERNAL +INIT_XMM avx2 +hvar_fn +%endif + ; macro to read/write a horizontal number of pixels (%2) to/from registers ; on sse, - fills xmm0-15 for consecutive sets of 16 pixels ; - if (%2 & 8) fills 8 bytes into xmm$next @@ -262,30 +271,30 @@ hvar_fn %rep 1+%2-%1 %if %%n <= 3 %if ARCH_X86_64 -cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, src, dst_stride, src_stride, \ +cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \ start_y, end_y, val, bh mov bhq, r6mp ; r6mp = bhmp %else ; x86-32 cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh mov dstq, r0mp - mov srcq, r1mp + mov srcq, r2mp mov start_yq, r4mp mov end_yq, r5mp mov bhq, r6mp -%define dst_strideq r2mp +%define dst_strideq r1mp %define src_strideq r3mp %endif ; x86-64/32 %else %if ARCH_X86_64 -cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, src, dst_stride, src_stride, \ +cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \ start_y, end_y, bh %else ; x86-32 cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh - mov srcq, r1mp + mov srcq, r2mp mov start_yq, r4mp mov end_yq, r5mp mov bhq, r6mp -%define dst_strideq r2mp +%define dst_strideq r1mp %define src_strideq r3mp %endif ; x86-64/32 %endif @@ -344,9 +353,8 @@ VERTICAL_EXTEND 16, 22 ; obviously not the same on both sides. 
%macro READ_V_PIXEL 2 -%if %1 == 2 - movzx valw, byte %2 - imul valw, 0x0101 +%if cpuflag(avx2) + vpbroadcastb m0, %2 %else movzx vald, byte %2 imul vald, 0x01010101 @@ -356,13 +364,16 @@ VERTICAL_EXTEND 16, 22 pshufd m0, m0, q0000 %else punpckldq m0, m0 -%endif -%endif ; %1 >= 8 -%endif +%endif ; mmsize == 16 +%endif ; %1 > 16 +%endif ; avx2 %endmacro ; READ_V_PIXEL %macro WRITE_V_PIXEL 2 %assign %%off 0 + +%if %1 >= 8 + %rep %1/mmsize movu [%2+%%off], m0 %assign %%off %%off+mmsize @@ -378,34 +389,44 @@ VERTICAL_EXTEND 16, 22 %assign %%off %%off+8 %endif %endif ; %1-%%off >= 8 -%endif +%endif ; mmsize == 16 %if %1-%%off >= 4 %if %1 > 8 && %1-%%off > 4 movq [%2+%1-8], m0 %assign %%off %1 -%elif %1 >= 8 && %1-%%off >= 4 - movd [%2+%%off], m0 -%assign %%off %%off+4 %else - mov [%2+%%off], vald + movd [%2+%%off], m0 %assign %%off %%off+4 %endif %endif ; %1-%%off >= 4 -%if %1-%%off >= 2 -%if %1 >= 8 - movd [%2+%1-4], m0 +%else ; %1 < 8 + +%rep %1/4 + mov [%2+%%off], vald +%assign %%off %%off+4 +%endrep ; %1/4 + +%endif ; %1 >=/< 8 + +%if %1-%%off == 2 +%if cpuflag(avx2) + movd [%2+%%off-2], m0 %else mov [%2+%%off], valw -%endif +%endif ; avx2 %endif ; (%1-%%off)/2 %endmacro ; WRITE_V_PIXEL %macro H_EXTEND 2 %assign %%n %1 %rep 1+(%2-%1)/2 +%if cpuflag(avx2) +cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh +%else cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val +%endif .loop_y: ; do { READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) @@ -426,6 +447,11 @@ H_EXTEND 16, 22 INIT_XMM sse2 H_EXTEND 16, 22 +%if HAVE_AVX2_EXTERNAL +INIT_XMM avx2 +H_EXTEND 8, 22 +%endif + %macro PREFETCH_FN 1 cglobal prefetch, 3, 3, 0, buf, stride, h .loop: diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c index 8ee837096a..885cdf1d8c 100644 --- a/libavcodec/x86/videodsp_init.c +++ b/libavcodec/x86/videodsp_init.c @@ -1,25 +1,27 @@ /* + * Copyright (C) 2002-2012 Michael Niedermayer * Copyright (C) 2012 Ronald S. Bultje * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "config.h" #include "libavutil/attributes.h" +#include "libavutil/avassert.h" #include "libavutil/common.h" #include "libavutil/cpu.h" #include "libavutil/mem.h" @@ -28,11 +30,11 @@ #include "libavcodec/videodsp.h" #if HAVE_YASM -typedef void emu_edge_vfix_func(uint8_t *dst, const uint8_t *src, - x86_reg dst_stride, x86_reg src_stride, +typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride, + const uint8_t *src, x86_reg src_stride, x86_reg start_y, x86_reg end_y, x86_reg bh); -typedef void emu_edge_vvar_func(uint8_t *dst, const uint8_t *src, - x86_reg dst_stride, x86_reg src_stride, +typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride, + const uint8_t *src, x86_reg src_stride, x86_reg start_y, x86_reg end_y, x86_reg bh, x86_reg w); @@ -59,7 +61,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx; extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx; extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx; #if ARCH_X86_32 -static emu_edge_vfix_func *vfixtbl_mmx[22] = { +static emu_edge_vfix_func * const vfixtbl_mmx[22] = { &ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx, &ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx, &ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx, @@ -78,7 +80,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix19_sse; extern emu_edge_vfix_func ff_emu_edge_vfix20_sse; extern emu_edge_vfix_func ff_emu_edge_vfix21_sse; extern emu_edge_vfix_func ff_emu_edge_vfix22_sse; -static emu_edge_vfix_func *vfixtbl_sse[22] = { +static emu_edge_vfix_func * const vfixtbl_sse[22] = { ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx, ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx, ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx, @@ -107,7 +109,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx; extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx; extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx; #if ARCH_X86_32 -static emu_edge_hfix_func *hfixtbl_mmx[11] = { +static emu_edge_hfix_func * const hfixtbl_mmx[11] = { ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx, @@ -119,13 +121,30 @@ extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; -static emu_edge_hfix_func *hfixtbl_sse2[11] = { +static emu_edge_hfix_func * const hfixtbl_sse2[11] = { ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 }; extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; +#if HAVE_AVX2_EXTERNAL +extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2; +extern emu_edge_hfix_func 
ff_emu_edge_hfix18_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2; +static emu_edge_hfix_func * const hfixtbl_avx2[11] = { + ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, + ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2, + ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2, + ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2 +}; +extern emu_edge_hvar_func ff_emu_edge_hvar_avx2; +#endif static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, ptrdiff_t dst_stride, @@ -133,22 +152,24 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, x86_reg block_w, x86_reg block_h, x86_reg src_x, x86_reg src_y, x86_reg w, x86_reg h, - emu_edge_vfix_func **vfix_tbl, + emu_edge_vfix_func * const *vfix_tbl, emu_edge_vvar_func *v_extend_var, - emu_edge_hfix_func **hfix_tbl, + emu_edge_hfix_func * const *hfix_tbl, emu_edge_hvar_func *h_extend_var) { x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p; if (!w || !h) - return; + return; if (src_y >= h) { - src -= src_y * src_stride; - src_y = src_y_add = h - 1; + src -= src_y*src_stride; + src_y_add = h - 1; + src_y = h - 1; } else if (src_y <= -block_h) { - src -= src_y*src_stride; - src_y = src_y_add = 1 - block_h; + src -= src_y*src_stride; + src_y_add = 1 - block_h; + src_y = 1 - block_h; } if (src_x >= w) { src += w - 1 - src_x; @@ -162,18 +183,17 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, start_x = FFMAX(0, -src_x); end_y = FFMIN(block_h, h-src_y); end_x = FFMIN(block_w, w-src_x); - assert(start_x < end_x && block_w > 0); - assert(start_y < end_y && block_h > 0); + av_assert2(start_x < end_x && block_w > 0); + av_assert2(start_y < end_y && block_h > 0); // fill in the to-be-copied part plus all above/below src += (src_y_add + start_y) * src_stride + start_x; w = end_x - start_x; if (w <= 22) { - vfix_tbl[w - 1](dst + start_x, src, - dst_stride, src_stride, + vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride, start_y, end_y, block_h); } else { - v_extend_var(dst + start_x, src, dst_stride, src_stride, + v_extend_var(dst + start_x, dst_stride, src, src_stride, start_y, end_y, block_h, w); } @@ -212,7 +232,7 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, hfixtbl_mmx, &ff_emu_edge_hvar_mmx); } -static av_noinline void emulated_edge_mc_sse(uint8_t * buf,const uint8_t *src, +static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, ptrdiff_t buf_stride, ptrdiff_t src_stride, int block_w, int block_h, @@ -231,10 +251,24 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src, int src_x, int src_y, int w, int h) { - emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x, - src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, hfixtbl_sse2, &ff_emu_edge_hvar_sse2); } + +#if HAVE_AVX2_EXTERNAL +static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src, + ptrdiff_t buf_stride, + ptrdiff_t src_stride, + int block_w, int block_h, + int src_x, int src_y, int w, + int h) +{ + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, + hfixtbl_avx2, &ff_emu_edge_hvar_avx2); +} +#endif /* HAVE_AVX2_EXTERNAL */ #endif /* 
HAVE_YASM */ void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h); @@ -264,5 +298,10 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) { ctx->emulated_edge_mc = emulated_edge_mc_sse2; } +#if HAVE_AVX2_EXTERNAL + if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) { + ctx->emulated_edge_mc = emulated_edge_mc_avx2; + } +#endif #endif /* HAVE_YASM */ } diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm index c54650eef5..b25d838868 100644 --- a/libavcodec/x86/vorbisdsp.asm +++ b/libavcodec/x86/vorbisdsp.asm @@ -2,20 +2,20 @@ ;* Vorbis x86 optimizations ;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c index bbd83195cc..bc1cc43a18 100644 --- a/libavcodec/x86/vorbisdsp_init.c +++ b/libavcodec/x86/vorbisdsp_init.c @@ -1,20 +1,20 @@ /* * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index fc8a047224..ee5a6bf67a 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -2,20 +2,20 @@ ;* MMX/SSE2-optimized functions for the VP3 decoder ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
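One structural point about the videodsp changes above: emulated_edge_mc first clamps src_x/src_y into the frame, copies the valid region, then extends the borders, picking a kernel by width. Paraphrasing the dispatch (the same logic as in the C above, shown in isolation):

    w = end_x - start_x;
    if (w <= 22)      /* fully unrolled kernel, width baked into the code */
        vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
                        start_y, end_y, block_h);
    else              /* generic loop, width passed as an argument */
        v_extend_var(dst + start_x, dst_stride, src, src_stride,
                     start_y, end_y, block_h, w);

The 22-entry tables exist because motion-compensation block widths are small and constant, so an unrolled store sequence per width beats a general loop; note that the AVX2 additions only replace the horizontal-fix entries from width 8 upward and keep the MMX versions below that, as hfixtbl_avx2 shows.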
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -40,6 +40,7 @@ pb_81: times 8 db 0x81 cextern pb_1 cextern pb_3 cextern pb_80 +cextern pb_FE cextern pw_8 @@ -147,6 +148,49 @@ cglobal vp3_h_loop_filter, 3, 4 STORE_4_WORDS m3 RET +%macro PAVGB_NO_RND 0 + mova m4, m0 + mova m5, m2 + pand m4, m1 + pand m5, m3 + pxor m1, m0 + pxor m3, m2 + pand m1, m6 + pand m3, m6 + psrlq m1, 1 + psrlq m3, 1 + paddb m4, m1 + paddb m5, m3 +%endmacro + +INIT_MMX mmx +cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3 + mova m6, [pb_FE] + lea stride3q,[strideq+strideq*2] +.loop + mova m0, [src1q] + mova m1, [src2q] + mova m2, [src1q+strideq] + mova m3, [src2q+strideq] + PAVGB_NO_RND + mova [dstq], m4 + mova [dstq+strideq], m5 + + mova m0, [src1q+strideq*2] + mova m1, [src2q+strideq*2] + mova m2, [src1q+stride3q] + mova m3, [src2q+stride3q] + PAVGB_NO_RND + mova [dstq+strideq*2], m4 + mova [dstq+stride3q], m5 + + lea src1q, [src1q+strideq*4] + lea src2q, [src2q+strideq*4] + lea dstq, [dstq+strideq*4] + sub hd, 4 + jnz .loop + RET + ; from original comments: The Macro does IDct on 4 1-D Dcts %macro BeginIDCT 0 movq m2, I(3) diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c index ed38a8e4df..354e1a1944 100644 --- a/libavcodec/x86/vp3dsp_init.c +++ b/libavcodec/x86/vp3dsp_init.c @@ -1,18 +1,20 @@ /* - * This file is part of Libav. + * Copyright (c) 2009 David Conrad <lessen42@gmail.com> * - * Libav is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
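The PAVGB_NO_RND macro added to vp3dsp.asm above deserves a gloss: pavgb rounds the average up, but the VP3/Theora half-pel predictor needs truncation, so the macro uses the identity (a + b) >> 1 == (a & b) + ((a ^ b) >> 1). In scalar form (a sketch, helper name illustrative):

    static inline uint8_t avg_no_rnd(uint8_t a, uint8_t b)
    {
        /* the 0xFE mask clears bit 0 of each byte before the shift --
         * needed in the MMX version because psrlq would otherwise drag
         * bits across byte boundaries (hence the pb_FE constant) */
        return (a & b) + (((a ^ b) & 0xFE) >> 1);
    }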
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -23,7 +25,6 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/avcodec.h" #include "libavcodec/vp3dsp.h" -#include "config.h" void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block); void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block); @@ -39,16 +40,21 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride, void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride, int *bounding_values); +void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, + const uint8_t *b, ptrdiff_t stride, + int h); + av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags) { int cpu_flags = av_get_cpu_flags(); -#if ARCH_X86_32 if (EXTERNAL_MMX(cpu_flags)) { + c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx; +#if ARCH_X86_32 c->idct_put = ff_vp3_idct_put_mmx; c->idct_add = ff_vp3_idct_add_mmx; - } #endif + } if (EXTERNAL_MMXEXT(cpu_flags)) { c->idct_dc_add = ff_vp3_idct_dc_add_mmxext; diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h index 0a693684af..810cc8dcd8 100644 --- a/libavcodec/x86/vp56_arith.h +++ b/libavcodec/x86/vp56_arith.h @@ -4,49 +4,46 @@ * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> * Copyright (C) 2010 Eli Friedman * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef AVCODEC_X86_VP56_ARITH_H #define AVCODEC_X86_VP56_ARITH_H -#if HAVE_INLINE_ASM && HAVE_FAST_CMOV +#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS #define vp56_rac_get_prob vp56_rac_get_prob static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob) { unsigned int code_word = vp56_rac_renorm(c); - unsigned int high = c->high; - unsigned int low = 1 + (((high - 1) * prob) >> 8); + unsigned int low = 1 + (((c->high - 1) * prob) >> 8); unsigned int low_shift = low << 16; int bit = 0; + c->code_word = code_word; __asm__( "subl %4, %1 \n\t" "subl %3, %2 \n\t" - "leal (%2, %3), %3 \n\t" "setae %b0 \n\t" "cmovb %4, %1 \n\t" - "cmovb %3, %2 \n\t" - : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift) - : "r"(low) + "cmovb %5, %2 \n\t" + : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word) + : "r"(low_shift), "r"(low), "r"(code_word) ); - c->high = high; - c->code_word = code_word; return bit; } #endif diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm index 80f8ca5f38..3d874ea62a 100644 --- a/libavcodec/x86/vp6dsp.asm +++ b/libavcodec/x86/vp6dsp.asm @@ -3,20 +3,20 @@ ;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> ;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c index cd94f3e038..82baee7e97 100644 --- a/libavcodec/x86/vp6dsp_init.c +++ b/libavcodec/x86/vp6dsp_init.c @@ -3,20 +3,20 @@ * Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> * Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
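The vp56_rac_get_prob rewrite above is easiest to check against a scalar restatement of the range-coder step it implements (a sketch of the semantics with an illustrative name; the inline asm computes the compare via sub + setae and the selects via cmovb):

    static inline int rac_get_prob(VP56RangeCoder *c, uint8_t prob)
    {
        unsigned code_word = vp56_rac_renorm(c);
        unsigned low       = 1 + (((c->high - 1) * prob) >> 8);
        unsigned low_shift = low << 16;
        int      bit       = code_word >= low_shift;  /* pick the subinterval */

        c->high      = bit ? c->high - low         : low;
        c->code_word = bit ? code_word - low_shift : code_word;
        return bit;
    }

The new HAVE_6REGS guard presumably reflects that the rewritten asm needs six general-purpose registers, which x86-32 builds with a reserved frame pointer cannot always provide.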
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index adc9730dfa..538b3f4a9b 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -143,13 +143,13 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 -pw_256: times 8 dw 256 pw_20091: times 4 dw 20091 pw_17734: times 4 dw 17734 cextern pw_3 cextern pw_4 cextern pw_64 +cextern pw_256 SECTION .text diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c index e5afd493bb..8d5d033744 100644 --- a/libavcodec/x86/vp8dsp_init.c +++ b/libavcodec/x86/vp8dsp_init.c @@ -3,20 +3,20 @@ * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> * Copyright (c) 2010 Fiona Glaser <fiona@x264.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -169,7 +169,7 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ ptrdiff_t srcstride, int height, int mx, int my) \ { \ - DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \ + LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \ uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \ src -= srcstride * (TAPNUMY / 2 - 1); \ ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \ @@ -214,7 +214,7 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ ptrdiff_t srcstride, int height, int mx, int my) \ { \ - DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \ + LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \ ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \ tmp, SIZE, src, srcstride, height + 1, mx, my); \ ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \ @@ -347,7 +347,7 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; } - if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { + if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) { VP8_LUMA_MC_FUNC(0, 16, sse2); VP8_MC_FUNC(1, 8, sse2); VP8_BILINEAR_MC_FUNC(0, 16, sse2); @@ -417,7 +417,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c) c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; } - if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { + if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) { c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; @@ -430,7 +430,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c) if (EXTERNAL_SSE2(cpu_flags)) { c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; - c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; @@ -455,7 +455,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c) } if (EXTERNAL_SSE4(cpu_flags)) { - c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; + c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; diff --git a/libavcodec/x86/vp8dsp_loopfilter.asm b/libavcodec/x86/vp8dsp_loopfilter.asm index 5d792e8207..98bb6696a0 100644 --- a/libavcodec/x86/vp8dsp_loopfilter.asm +++ b/libavcodec/x86/vp8dsp_loopfilter.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. 
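A subtle change in vp8dsp_init.c above: the old guard required both conditions, EXTERNAL_SSE2(cpu_flags) and the AV_CPU_FLAG_SSE2SLOW bit, while the new one fires when either SSE2 bit is present and moves the assembler check to compile time. Schematically:

    /* before: SSE2 available (via EXTERNAL_SSE2) AND the SSE2SLOW bit set */
    if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { }

    /* after: either bit is enough; HAVE_SSE2_EXTERNAL is compile-time */
    if (HAVE_SSE2_EXTERNAL &&
        (cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW))) { }

The practical effect, presumably, is that these loop filters and MC functions are no longer skipped on CPUs where only one of the two flags happens to be set.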
;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9dsp.asm deleted file mode 100644 index 6488f3092d..0000000000 --- a/libavcodec/x86/vp9dsp.asm +++ /dev/null @@ -1,277 +0,0 @@ -;****************************************************************************** -;* VP9 SIMD optimizations -;* -;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -; FIXME share with vp8dsp.asm -pw_256: times 8 dw 256 - -%macro F8_TAPS 8 -times 8 db %1, %2 -times 8 db %3, %4 -times 8 db %5, %6 -times 8 db %7, %8 -%endmacro -; int8_t ff_filters_ssse3[3][15][4][16] -const filters_ssse3 ; smooth - F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0 - F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0 - F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0 - F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0 - F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0 - F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0 - F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1 - F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1 - F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1 - F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1 - F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2 - F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2 - F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2 - F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2 - F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3 - ; regular - F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0 - F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0 - F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1 - F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1 - F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1 - F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1 - F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1 - F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1 - F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1 - F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1 - F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1 - F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1 - F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1 - F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1 - F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0 - ; sharp - F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0 - F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1 - F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2 - F8_TAPS -4, 9, -20, 115, 37, -13, 
6, -2 - F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3 - F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3 - F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4 - F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4 - F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4 - F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4 - F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4 - F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4 - F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3 - F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2 - F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1 - -SECTION .text - -%macro filter_h_fn 1 -%assign %%px mmsize/2 -cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filtery - mova m6, [pw_256] - mova m7, [filteryq+ 0] -%if ARCH_X86_64 && mmsize > 8 - mova m8, [filteryq+16] - mova m9, [filteryq+32] - mova m10, [filteryq+48] -%endif -.loop: - movh m0, [srcq-3] - movh m1, [srcq-2] - movh m2, [srcq-1] - movh m3, [srcq+0] - movh m4, [srcq+1] - movh m5, [srcq+2] - punpcklbw m0, m1 - punpcklbw m2, m3 - movh m1, [srcq+3] - movh m3, [srcq+4] - add srcq, sstrideq - punpcklbw m4, m5 - punpcklbw m1, m3 - pmaddubsw m0, m7 -%if ARCH_X86_64 && mmsize > 8 - pmaddubsw m2, m8 - pmaddubsw m4, m9 - pmaddubsw m1, m10 -%else - pmaddubsw m2, [filteryq+16] - pmaddubsw m4, [filteryq+32] - pmaddubsw m1, [filteryq+48] -%endif - paddw m0, m2 - paddw m4, m1 - paddsw m0, m4 - pmulhrsw m0, m6 -%ifidn %1, avg - movh m1, [dstq] -%endif - packuswb m0, m0 -%ifidn %1, avg - pavgb m0, m1 -%endif - movh [dstq], m0 - add dstq, dstrideq - dec hd - jg .loop - RET -%endmacro - -INIT_MMX ssse3 -filter_h_fn put -filter_h_fn avg - -INIT_XMM ssse3 -filter_h_fn put -filter_h_fn avg - -%macro filter_v_fn 1 -%assign %%px mmsize/2 -%if ARCH_X86_64 -cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, src, dstride, sstride, h, filtery, src4, sstride3 -%else -cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, src, dstride, sstride, filtery, src4, sstride3 - mov filteryq, r5mp -%define hd r4mp -%endif - sub srcq, sstrideq - lea sstride3q, [sstrideq*3] - sub srcq, sstrideq - mova m6, [pw_256] - sub srcq, sstrideq - mova m7, [filteryq+ 0] - lea src4q, [srcq+sstrideq*4] -%if ARCH_X86_64 && mmsize > 8 - mova m8, [filteryq+16] - mova m9, [filteryq+32] - mova m10, [filteryq+48] -%endif -.loop: - ; FIXME maybe reuse loads from previous rows, or just more generally - ; unroll this to prevent multiple loads of the same data? 
- movh m0, [srcq] - movh m1, [srcq+sstrideq] - movh m2, [srcq+sstrideq*2] - movh m3, [srcq+sstride3q] - movh m4, [src4q] - movh m5, [src4q+sstrideq] - punpcklbw m0, m1 - punpcklbw m2, m3 - movh m1, [src4q+sstrideq*2] - movh m3, [src4q+sstride3q] - add srcq, sstrideq - add src4q, sstrideq - punpcklbw m4, m5 - punpcklbw m1, m3 - pmaddubsw m0, m7 -%if ARCH_X86_64 && mmsize > 8 - pmaddubsw m2, m8 - pmaddubsw m4, m9 - pmaddubsw m1, m10 -%else - pmaddubsw m2, [filteryq+16] - pmaddubsw m4, [filteryq+32] - pmaddubsw m1, [filteryq+48] -%endif - paddw m0, m2 - paddw m4, m1 - paddsw m0, m4 - pmulhrsw m0, m6 -%ifidn %1, avg - movh m1, [dstq] -%endif - packuswb m0, m0 -%ifidn %1, avg - pavgb m0, m1 -%endif - movh [dstq], m0 - add dstq, dstrideq - dec hd - jg .loop - RET -%endmacro - -INIT_MMX ssse3 -filter_v_fn put -filter_v_fn avg - -INIT_XMM ssse3 -filter_v_fn put -filter_v_fn avg - -%macro fpel_fn 6 -%if %2 == 4 -%define %%srcfn movh -%define %%dstfn movh -%else -%define %%srcfn movu -%define %%dstfn mova -%endif - -%if %2 <= 16 -cglobal %1%2, 5, 7, 4, dst, src, dstride, sstride, h, dstride3, sstride3 - lea sstride3q, [sstrideq*3] - lea dstride3q, [dstrideq*3] -%else -cglobal %1%2, 5, 5, 4, dst, src, dstride, sstride, h -%endif -.loop: - %%srcfn m0, [srcq] - %%srcfn m1, [srcq+s%3] - %%srcfn m2, [srcq+s%4] - %%srcfn m3, [srcq+s%5] - lea srcq, [srcq+sstrideq*%6] -%ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+d%3] - pavgb m2, [dstq+d%4] - pavgb m3, [dstq+d%5] -%endif - %%dstfn [dstq], m0 - %%dstfn [dstq+d%3], m1 - %%dstfn [dstq+d%4], m2 - %%dstfn [dstq+d%5], m3 - lea dstq, [dstq+dstrideq*%6] - sub hd, %6 - jnz .loop - RET -%endmacro - -%define d16 16 -%define s16 16 -INIT_MMX mmx -fpel_fn put, 4, strideq, strideq*2, stride3q, 4 -fpel_fn put, 8, strideq, strideq*2, stride3q, 4 -INIT_MMX sse -fpel_fn avg, 4, strideq, strideq*2, stride3q, 4 -fpel_fn avg, 8, strideq, strideq*2, stride3q, 4 -INIT_XMM sse -fpel_fn put, 16, strideq, strideq*2, stride3q, 4 -fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2 -fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1 -INIT_XMM sse2 -fpel_fn avg, 16, strideq, strideq*2, stride3q, 4 -fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2 -fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1 -%undef s16 -%undef d16 diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index ce58c08a3b..00e7125a0c 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -1,240 +1,511 @@ /* * VP9 SIMD optimizations * - * Copyright (c) 2013 Ronald S. Bultje <rsbultje@gmail.com> + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" -#include "libavcodec/vp9.h" +#include "libavcodec/vp9dsp.h" #if HAVE_YASM -#define fpel_func(avg, sz, opt) \ -void ff_ ## avg ## sz ## _ ## opt(uint8_t *dst, const uint8_t *src, \ - ptrdiff_t dst_stride, \ - ptrdiff_t src_stride, \ - int h, int mx, int my) - +#define fpel_func(avg, sz, opt) \ +void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) fpel_func(put, 4, mmx); fpel_func(put, 8, mmx); fpel_func(put, 16, sse); fpel_func(put, 32, sse); fpel_func(put, 64, sse); -fpel_func(avg, 4, sse); -fpel_func(avg, 8, sse); +fpel_func(avg, 4, mmxext); +fpel_func(avg, 8, mmxext); fpel_func(avg, 16, sse2); fpel_func(avg, 32, sse2); fpel_func(avg, 64, sse2); +fpel_func(put, 32, avx); +fpel_func(put, 64, avx); +fpel_func(avg, 32, avx2); +fpel_func(avg, 64, avx2); #undef fpel_func -#define mc_func(avg, sz, dir, opt) \ -void \ -ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \ - const uint8_t *src, \ - ptrdiff_t dst_stride, \ - ptrdiff_t src_stride, \ - int h, \ - const int8_t (*filter)[16]) - -#define mc_funcs(sz) \ - mc_func(put, sz, h, ssse3); \ - mc_func(avg, sz, h, ssse3); \ - mc_func(put, sz, v, ssse3); \ - mc_func(avg, sz, v, ssse3) - -mc_funcs(4); -mc_funcs(8); +#define mc_func(avg, sz, dir, opt, type, f_sz) \ +void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, const type (*filter)[f_sz]) +#define mc_funcs(sz, opt, type, fsz) \ +mc_func(put, sz, h, opt, type, fsz); \ +mc_func(avg, sz, h, opt, type, fsz); \ +mc_func(put, sz, v, opt, type, fsz); \ +mc_func(avg, sz, v, opt, type, fsz) + +mc_funcs(4, mmxext, int16_t, 8); +mc_funcs(8, sse2, int16_t, 8); +mc_funcs(4, ssse3, int8_t, 32); +mc_funcs(8, ssse3, int8_t, 32); +#if ARCH_X86_64 +mc_funcs(16, ssse3, int8_t, 32); +mc_funcs(32, avx2, int8_t, 32); +#endif #undef mc_funcs #undef mc_func -#define mc_rep_func(avg, sz, hsz, dir, opt) \ -static av_always_inline void \ -ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \ - const uint8_t *src, \ - ptrdiff_t dst_stride, \ - ptrdiff_t src_stride, \ - int h, \ - const int8_t (*filter)[16]) \ -{ \ - ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst, src, \ - dst_stride, \ - src_stride, \ - h, \ - filter); \ - ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst + hsz, \ - src + hsz, \ - dst_stride, \ - src_stride, \ - h, filter); \ +#define mc_rep_func(avg, sz, hsz, dir, opt, type, f_sz) \ +static av_always_inline void \ +ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, const type (*filter)[f_sz]) \ +{ \ + ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst, dst_stride, src, \ + src_stride, h, filter); \ + ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, \ + src_stride, h, filter); \ } -#define mc_rep_funcs(sz, hsz) \ - mc_rep_func(put, sz, hsz, h, ssse3); \ - mc_rep_func(avg, sz, hsz, h, ssse3); \ - 
mc_rep_func(put, sz, hsz, v, ssse3); \ - mc_rep_func(avg, sz, hsz, v, ssse3) - -mc_rep_funcs(16, 8); -mc_rep_funcs(32, 16); -mc_rep_funcs(64, 32); +#define mc_rep_funcs(sz, hsz, opt, type, fsz) \ +mc_rep_func(put, sz, hsz, h, opt, type, fsz); \ +mc_rep_func(avg, sz, hsz, h, opt, type, fsz); \ +mc_rep_func(put, sz, hsz, v, opt, type, fsz); \ +mc_rep_func(avg, sz, hsz, v, opt, type, fsz) + +mc_rep_funcs(16, 8, sse2, int16_t, 8); +#if ARCH_X86_32 +mc_rep_funcs(16, 8, ssse3, int8_t, 32); +#endif +mc_rep_funcs(32, 16, sse2, int16_t, 8); +mc_rep_funcs(32, 16, ssse3, int8_t, 32); +mc_rep_funcs(64, 32, sse2, int16_t, 8); +mc_rep_funcs(64, 32, ssse3, int8_t, 32); +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +mc_rep_funcs(64, 32, avx2, int8_t, 32); +#endif #undef mc_rep_funcs #undef mc_rep_func -extern const int8_t ff_filters_ssse3[3][15][4][16]; - -#define filter_8tap_2d_fn(op, sz, f, fname) \ -static void \ -op ## _8tap_ ## fname ## _ ## sz ## hv_ssse3(uint8_t *dst, \ - const uint8_t *src, \ - ptrdiff_t dst_stride, \ - ptrdiff_t src_stride, \ - int h, int mx, int my) \ -{ \ - LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]); \ - ff_put_8tap_1d_h_ ## sz ## _ssse3(temp, src - 3 * src_stride, \ - 64, src_stride, \ - h + 7, \ - ff_filters_ssse3[f][mx - 1]); \ - ff_ ## op ## _8tap_1d_v_ ## sz ## _ssse3(dst, temp + 3 * 64, \ - dst_stride, 64, \ - h, \ - ff_filters_ssse3[f][my - 1]); \ +extern const int8_t ff_filters_ssse3[3][15][4][32]; +extern const int16_t ff_filters_sse2[3][15][8][8]; + +#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, opt) \ +static void op##_8tap_##fname##_##sz##hv_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64]); \ + ff_vp9_put_8tap_1d_h_##sz##_##opt(temp, 64, src - 3 * src_stride, src_stride, \ + h + 7, ff_filters_##f_opt[f][mx - 1]); \ + ff_vp9_##op##_8tap_1d_v_##sz##_##opt(dst, dst_stride, temp + 3 * 64, 64, \ + h, ff_filters_##f_opt[f][my - 1]); \ } -#define filters_8tap_2d_fn(op, sz) \ - filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \ - filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp) \ - filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth) - -#define filters_8tap_2d_fn2(op) \ - filters_8tap_2d_fn(op, 64) \ - filters_8tap_2d_fn(op, 32) \ - filters_8tap_2d_fn(op, 16) \ - filters_8tap_2d_fn(op, 8) \ - filters_8tap_2d_fn(op, 4) - -filters_8tap_2d_fn2(put) -filters_8tap_2d_fn2(avg) +#define filters_8tap_2d_fn(op, sz, align, opt, f_opt) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, opt) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, opt) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, opt) + +#define filters_8tap_2d_fn2(op, align, opt4, opt8, f_opt) \ +filters_8tap_2d_fn(op, 64, align, opt8, f_opt) \ +filters_8tap_2d_fn(op, 32, align, opt8, f_opt) \ +filters_8tap_2d_fn(op, 16, align, opt8, f_opt) \ +filters_8tap_2d_fn(op, 8, align, opt8, f_opt) \ +filters_8tap_2d_fn(op, 4, align, opt4, f_opt) + +filters_8tap_2d_fn2(put, 16, mmxext, sse2, sse2) +filters_8tap_2d_fn2(avg, 16, mmxext, sse2, sse2) +filters_8tap_2d_fn2(put, 16, ssse3, ssse3, ssse3) +filters_8tap_2d_fn2(avg, 16, ssse3, ssse3, ssse3) +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +filters_8tap_2d_fn(put, 64, 32, avx2, ssse3) +filters_8tap_2d_fn(put, 32, 32, avx2, ssse3) +filters_8tap_2d_fn(avg, 64, 32, avx2, ssse3) +filters_8tap_2d_fn(avg, 32, 32, avx2, ssse3) +#endif #undef filters_8tap_2d_fn2 #undef filters_8tap_2d_fn #undef 
filter_8tap_2d_fn -#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar) \ -static void \ -op ## _8tap_ ## fname ## _ ## sz ## dir ## _ssse3(uint8_t *dst, \ - const uint8_t *src, \ - ptrdiff_t dst_stride, \ - ptrdiff_t src_stride, \ - int h, int mx, \ - int my) \ -{ \ - ff_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ssse3(dst, src, \ - dst_stride, \ - src_stride, h, \ - ff_filters_ssse3[f][dvar - 1]); \ +#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, opt) \ +static void op##_8tap_##fname##_##sz##dir##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + ff_vp9_##op##_8tap_1d_##dir##_##sz##_##opt(dst, dst_stride, src, src_stride, \ + h, ff_filters_##f_opt[f][dvar - 1]); \ } -#define filters_8tap_1d_fn(op, sz, dir, dvar) \ - filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \ - filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar) \ - filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar) - -#define filters_8tap_1d_fn2(op, sz) \ - filters_8tap_1d_fn(op, sz, h, mx) \ - filters_8tap_1d_fn(op, sz, v, my) - -#define filters_8tap_1d_fn3(op) \ - filters_8tap_1d_fn2(op, 64) \ - filters_8tap_1d_fn2(op, 32) \ - filters_8tap_1d_fn2(op, 16) \ - filters_8tap_1d_fn2(op, 8) \ - filters_8tap_1d_fn2(op, 4) - -filters_8tap_1d_fn3(put) -filters_8tap_1d_fn3(avg) +#define filters_8tap_1d_fn(op, sz, dir, dvar, opt, f_opt) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, opt) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, opt) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, opt) + +#define filters_8tap_1d_fn2(op, sz, opt, f_opt) \ +filters_8tap_1d_fn(op, sz, h, mx, opt, f_opt) \ +filters_8tap_1d_fn(op, sz, v, my, opt, f_opt) + +#define filters_8tap_1d_fn3(op, opt4, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 64, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 32, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 16, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 8, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 4, opt4, f_opt) + +filters_8tap_1d_fn3(put, mmxext, sse2, sse2) +filters_8tap_1d_fn3(avg, mmxext, sse2, sse2) +filters_8tap_1d_fn3(put, ssse3, ssse3, ssse3) +filters_8tap_1d_fn3(avg, ssse3, ssse3, ssse3) +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +filters_8tap_1d_fn2(put, 64, avx2, ssse3) +filters_8tap_1d_fn2(put, 32, avx2, ssse3) +filters_8tap_1d_fn2(avg, 64, avx2, ssse3) +filters_8tap_1d_fn2(avg, 32, avx2, ssse3) +#endif #undef filters_8tap_1d_fn #undef filters_8tap_1d_fn2 #undef filters_8tap_1d_fn3 #undef filter_8tap_1d_fn +#define itxfm_func(typea, typeb, size, opt) \ +void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \ + int16_t *block, int eob) +#define itxfm_funcs(size, opt) \ +itxfm_func(idct, idct, size, opt); \ +itxfm_func(iadst, idct, size, opt); \ +itxfm_func(idct, iadst, size, opt); \ +itxfm_func(iadst, iadst, size, opt) + +itxfm_func(idct, idct, 4, mmxext); +itxfm_func(idct, iadst, 4, sse2); +itxfm_func(iadst, idct, 4, sse2); +itxfm_func(iadst, iadst, 4, sse2); +itxfm_funcs(4, ssse3); +itxfm_funcs(8, sse2); +itxfm_funcs(8, ssse3); +itxfm_funcs(8, avx); +itxfm_funcs(16, sse2); +itxfm_funcs(16, ssse3); +itxfm_funcs(16, avx); +itxfm_func(idct, idct, 32, sse2); +itxfm_func(idct, idct, 32, ssse3); +itxfm_func(idct, idct, 32, avx); +itxfm_func(iwht, iwht, 4, mmx); + +#undef itxfm_func +#undef itxfm_funcs + +#define lpf_funcs(size1, size2, opt) \ +void 
ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H); \ +void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) + +lpf_funcs(16, 16, sse2); +lpf_funcs(16, 16, ssse3); +lpf_funcs(16, 16, avx); +lpf_funcs(44, 16, sse2); +lpf_funcs(44, 16, ssse3); +lpf_funcs(44, 16, avx); +lpf_funcs(84, 16, sse2); +lpf_funcs(84, 16, ssse3); +lpf_funcs(84, 16, avx); +lpf_funcs(48, 16, sse2); +lpf_funcs(48, 16, ssse3); +lpf_funcs(48, 16, avx); +lpf_funcs(88, 16, sse2); +lpf_funcs(88, 16, ssse3); +lpf_funcs(88, 16, avx); + +#undef lpf_funcs + +#define ipred_func(size, type, opt) \ +void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \ + const uint8_t *l, const uint8_t *a) + +ipred_func(8, v, mmx); + +#define ipred_dc_funcs(size, opt) \ +ipred_func(size, dc, opt); \ +ipred_func(size, dc_left, opt); \ +ipred_func(size, dc_top, opt) + +ipred_dc_funcs(4, mmxext); +ipred_dc_funcs(8, mmxext); + +#define ipred_dir_tm_funcs(size, opt) \ +ipred_func(size, tm, opt); \ +ipred_func(size, dl, opt); \ +ipred_func(size, dr, opt); \ +ipred_func(size, hd, opt); \ +ipred_func(size, hu, opt); \ +ipred_func(size, vl, opt); \ +ipred_func(size, vr, opt) + +ipred_dir_tm_funcs(4, mmxext); + +ipred_func(16, v, sse); +ipred_func(32, v, sse); + +ipred_dc_funcs(16, sse2); +ipred_dc_funcs(32, sse2); + +#define ipred_dir_tm_h_funcs(size, opt) \ +ipred_dir_tm_funcs(size, opt); \ +ipred_func(size, h, opt) + +ipred_dir_tm_h_funcs(8, sse2); +ipred_dir_tm_h_funcs(16, sse2); +ipred_dir_tm_h_funcs(32, sse2); + +ipred_func(4, h, sse2); + +#define ipred_all_funcs(size, opt) \ +ipred_dc_funcs(size, opt); \ +ipred_dir_tm_h_funcs(size, opt) + +// FIXME hd/vl_4x4_ssse3 does not exist +ipred_all_funcs(4, ssse3); +ipred_all_funcs(8, ssse3); +ipred_all_funcs(16, ssse3); +ipred_all_funcs(32, ssse3); + +ipred_dir_tm_h_funcs(8, avx); +ipred_dir_tm_h_funcs(16, avx); +ipred_dir_tm_h_funcs(32, avx); + +ipred_func(32, v, avx); + +ipred_dc_funcs(32, avx2); +ipred_func(32, h, avx2); +ipred_func(32, tm, avx2); + +#undef ipred_func +#undef ipred_dir_tm_h_funcs +#undef ipred_dir_tm_funcs +#undef ipred_dc_funcs + #endif /* HAVE_YASM */ -av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) +av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp) { #if HAVE_YASM - int cpu_flags = av_get_cpu_flags(); + int cpu_flags; + if (bpp != 8) return; -#define init_fpel(idx1, idx2, sz, type, opt) \ - dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ - dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ - dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ - dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_ ## type ## sz ## _ ## opt + cpu_flags = av_get_cpu_flags(); +#define init_fpel(idx1, idx2, sz, type, opt) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##_##opt #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \ - dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv] = type ## _8tap_smooth_ ## sz ## dir ## _ ## opt; \ - dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \ - dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv] = type ## _8tap_sharp_ ## sz ## dir ## _ ## opt - -#define init_subpel2(idx, idxh, idxv, dir, type, opt) \ - init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \ - init_subpel1(1, 
idx, idxh, idxv, 32, dir, type, opt); \ - init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \ - init_subpel1(3, idx, idxh, idxv, 8, dir, type, opt); \ - init_subpel1(4, idx, idxh, idxv, 4, dir, type, opt) - -#define init_subpel3(idx, type, opt) \ - init_subpel2(idx, 1, 1, hv, type, opt); \ - init_subpel2(idx, 0, 1, v, type, opt); \ - init_subpel2(idx, 1, 0, h, type, opt) + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt + +#define init_subpel2(idx1, idx2, sz, type, opt) \ + init_subpel1(idx1, idx2, 1, 1, sz, hv, type, opt); \ + init_subpel1(idx1, idx2, 0, 1, sz, v, type, opt); \ + init_subpel1(idx1, idx2, 1, 0, sz, h, type, opt) + +#define init_subpel3_32_64(idx, type, opt) \ + init_subpel2(0, idx, 64, type, opt); \ + init_subpel2(1, idx, 32, type, opt) + +#define init_subpel3_8to64(idx, type, opt) \ + init_subpel3_32_64(idx, type, opt); \ + init_subpel2(2, idx, 16, type, opt); \ + init_subpel2(3, idx, 8, type, opt) + +#define init_subpel3(idx, type, opt) \ + init_subpel3_8to64(idx, type, opt); \ + init_subpel2(4, idx, 4, type, opt) + +#define init_lpf(opt) do { \ + dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \ + dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \ + dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \ + dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \ + dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \ + dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \ + dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \ + dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \ + dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \ + dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \ +} while (0) + +#define init_ipred(sz, opt, t, e) \ + dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt + +#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext +#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext +#define init_dir_tm_ipred(sz, opt) do { \ + init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \ + init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \ + init_ipred(sz, opt, hd, HOR_DOWN); \ + init_ipred(sz, opt, vl, VERT_LEFT); \ + init_ipred(sz, opt, hu, HOR_UP); \ + init_ipred(sz, opt, tm, TM_VP8); \ + init_ipred(sz, opt, vr, VERT_RIGHT); \ +} while (0) +#define init_dir_tm_h_ipred(sz, opt) do { \ + init_dir_tm_ipred(sz, opt); \ + init_ipred(sz, opt, h, HOR); \ +} while (0) +#define init_dc_ipred(sz, opt) do { \ + init_ipred(sz, opt, dc, DC); \ + init_ipred(sz, opt, dc_left, LEFT_DC); \ + init_ipred(sz, opt, dc_top, TOP_DC); \ +} while (0) +#define init_all_ipred(sz, opt) do { \ + init_dc_ipred(sz, opt); \ + init_dir_tm_h_ipred(sz, opt); \ +} while (0) if (EXTERNAL_MMX(cpu_flags)) { init_fpel(4, 0, 4, put, mmx); init_fpel(3, 0, 8, put, mmx); + dsp->itxfm_add[4 /* lossless */][DCT_DCT] = + dsp->itxfm_add[4 /* lossless */][ADST_DCT] = + dsp->itxfm_add[4 /* lossless */][DCT_ADST] = + dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx; + init_ipred(8, mmx, v, VERT); + } + + if (EXTERNAL_MMXEXT(cpu_flags)) { + init_subpel2(4, 0, 4, put, mmxext); + init_subpel2(4, 1, 4, avg, mmxext); + init_fpel(4, 1, 4, avg, mmxext); + init_fpel(3, 1, 8, avg, mmxext); 
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; + init_dc_ipred(4, mmxext); + init_dc_ipred(8, mmxext); + init_dir_tm_ipred(4, mmxext); } if (EXTERNAL_SSE(cpu_flags)) { init_fpel(2, 0, 16, put, sse); init_fpel(1, 0, 32, put, sse); init_fpel(0, 0, 64, put, sse); - init_fpel(4, 1, 4, avg, sse); - init_fpel(3, 1, 8, avg, sse); + init_ipred(16, sse, v, VERT); + init_ipred(32, sse, v, VERT); } if (EXTERNAL_SSE2(cpu_flags)) { + init_subpel3_8to64(0, put, sse2); + init_subpel3_8to64(1, avg, sse2); init_fpel(2, 1, 16, avg, sse2); init_fpel(1, 1, 32, avg, sse2); init_fpel(0, 1, 64, avg, sse2); + init_lpf(sse2); + dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2; + dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2; + dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2; + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2; + dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2; + dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2; + dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2; + init_dc_ipred(16, sse2); + init_dc_ipred(32, sse2); + init_dir_tm_h_ipred(8, sse2); + init_dir_tm_h_ipred(16, sse2); + init_dir_tm_h_ipred(32, sse2); + init_ipred(4, sse2, h, HOR); } if (EXTERNAL_SSSE3(cpu_flags)) { init_subpel3(0, put, ssse3); init_subpel3(1, avg, ssse3); + dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3; + dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3; + dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3; + dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3; + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3; + dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3; + dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3; + dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; + init_lpf(ssse3); + init_all_ipred(4, ssse3); + init_all_ipred(8, ssse3); + init_all_ipred(16, ssse3); + init_all_ipred(32, ssse3); + } + + if (EXTERNAL_AVX(cpu_flags)) { + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx; + dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx; + dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx; + dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx; + dsp->itxfm_add[TX_16X16][DCT_ADST] = 
ff_vp9_iadst_idct_16x16_add_avx; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; + init_lpf(avx); + init_dir_tm_h_ipred(8, avx); + init_dir_tm_h_ipred(16, avx); + init_dir_tm_h_ipred(32, avx); + } + if (EXTERNAL_AVX_FAST(cpu_flags)) { + init_fpel(1, 0, 32, put, avx); + init_fpel(0, 0, 64, put, avx); + init_ipred(32, avx, v, VERT); + } + + if (EXTERNAL_AVX2(cpu_flags)) { + init_fpel(1, 1, 32, avg, avx2); + init_fpel(0, 1, 64, avg, avx2); + if (ARCH_X86_64) { +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL + init_subpel3_32_64(0, put, avx2); + init_subpel3_32_64(1, avg, avx2); +#endif + } + init_dc_ipred(32, avx2); + init_ipred(32, avx2, h, HOR); + init_ipred(32, avx2, tm, TM_VP8); } #undef init_fpel diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm new file mode 100644 index 0000000000..31f7d449fd --- /dev/null +++ b/libavcodec/x86/vp9intrapred.asm @@ -0,0 +1,2044 @@ +;****************************************************************************** +;* VP9 Intra prediction SIMD optimizations +;* +;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* Parts based on: +;* H.264 intra prediction asm optimizations +;* Copyright (c) 2010 Fiona Glaser +;* Copyright (c) 2010 Holger Lubitz +;* Copyright (c) 2010 Loren Merritt +;* Copyright (c) 2010 Ronald S. Bultje +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pw_m256: times 16 dw -256 +pw_m255: times 16 dw -255 +pw_4096: times 8 dw 4096 + +pb_4x3_4x2_4x1_4x0: times 4 db 3 + times 4 db 2 + times 4 db 1 + times 4 db 0 +pb_8x1_8x0: times 8 db 1 + times 8 db 0 +pb_8x3_8x2: times 8 db 3 + times 8 db 2 +pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7 + times 8 db -1 +pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6 + times 9 db 7 +pb_1to6_10x7: db 1, 2, 3, 4, 5, 6 + times 10 db 7 +pb_2to6_3x7: +pb_2to6_11x7: db 2, 3, 4, 5, 6 + times 11 db 7 +pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 +pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 +pb_13456_3xm1: db 1, 3, 4, 5, 6 + times 3 db -1 +pb_6012_4xm1: db 6, 0, 1, 2 + times 4 db -1 +pb_6xm1_246_8toE: times 6 db -1 + db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14 +pb_6xm1_BDF_0to6: times 6 db -1 + db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6 +pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + +pb_15x0_1xm1: times 15 db 0 + db -1 +pb_0to2_5x3: db 0, 1, 2 + times 5 db 3 +pb_6xm1_2x0: times 6 db -1 + times 2 db 0 +pb_6x0_2xm1: times 6 db 0 + times 2 db -1 + +cextern pb_1 +cextern pb_2 +cextern pb_3 +cextern pb_15 +cextern pw_2 +cextern pw_4 +cextern pw_8 +cextern pw_16 +cextern pw_32 +cextern pw_255 +cextern pw_512 +cextern pw_1024 +cextern pw_2048 +cextern pw_8192 + +SECTION .text + +; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) + +%macro DC_4to8_FUNCS 0 +cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [lq] + punpckldq m0, [aq] + pxor m1, m1 + psadbw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_4096] + pshufb m0, m1 +%else + paddw m0, [pw_4] + psraw m0, 3 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + RET + +cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a + movq m0, [lq] + movq m1, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] + pshufb m0, m2 +%else + paddw m0, [pw_8] + psraw m0, 4 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET +%endmacro + +INIT_MMX mmxext +DC_4to8_FUNCS +INIT_MMX ssse3 +DC_4to8_FUNCS + +%macro DC_16to32_FUNCS 0 +cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a + mova m0, [lq] + mova m1, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_1024] + pshufb m0, m2 +%else + paddw m0, [pw_16] + psraw m0, 5 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + 
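The dc_NxN predictors above all reduce to the same scalar arithmetic: sum the 2N neighbour pixels from l and a, add N for rounding, and shift right by log2(2N). In the SSSE3 paths, pmulhrsw against pw_4096/pw_2048/pw_1024 (for 4x4/8x8/16x16) computes exactly the same rounded shift as the explicit paddw/psraw pair in the pre-SSSE3 paths, since pmulhrsw(sum, 1024) = (2*1024*sum + 32768) >> 16 == (sum + 16) >> 5. A minimal C model of the 16x16 case follows; the function name and _c suffix are illustrative only, not part of this patch:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Scalar sketch of vp9_ipred_dc_16x16:
 * dc = (sum(l[0..15]) + sum(a[0..15]) + 16) >> 5, splatted over the block. */
static void ipred_dc_16x16_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *l, const uint8_t *a)
{
    unsigned sum = 16;                  /* rounding bias: half of the 32 samples */
    for (int i = 0; i < 16; i++)
        sum += l[i] + a[i];
    for (int y = 0; y < 16; y++, dst += stride)
        memset(dst, sum >> 5, 16);      /* fill one row with the DC value */
}
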
+cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a + mova m0, [lq] + mova m1, [lq+16] + mova m2, [aq] + mova m3, [aq+16] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m4, m4 + psadbw m0, m4 + psadbw m1, m4 + psadbw m2, m4 + psadbw m3, m4 + paddw m0, m1 + paddw m2, m3 + paddw m0, m2 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_512] + pshufb m0, m4 +%else + paddw m0, [pw_32] + psraw m0, 6 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 8 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DC_16to32_FUNCS +INIT_XMM ssse3 +DC_16to32_FUNCS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [lq] + mova m1, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + movhlps xm1, xm0 + paddw xm0, xm1 + pmulhrsw xm0, [pw_512] + vpbroadcastb m0, xm0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif + +; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) + +%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l) +cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [%2q] + pxor m1, m1 + psadbw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_8192] + pshufb m0, m1 +%else + paddw m0, [pw_2] + psraw m0, 2 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + RET + +cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a + movq m0, [%2q] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pxor m1, m1 + psadbw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_4096] + pshufb m0, m1 +%else + paddw m0, [pw_4] + psraw m0, 3 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET +%endmacro + +INIT_MMX mmxext +DC_1D_4to8_FUNCS top, a +DC_1D_4to8_FUNCS left, l +INIT_MMX ssse3 +DC_1D_4to8_FUNCS top, a +DC_1D_4to8_FUNCS left, l + +%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l) +cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a + mova m0, [%2q] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] + pshufb m0, m2 +%else + paddw m0, [pw_8] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg 
.loop + RET + +cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [%2q] + mova m1, [%2q+16] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_1024] + pshufb m0, m2 +%else + paddw m0, [pw_16] + psraw m0, 5 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 8 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DC_1D_16to32_FUNCS top, a +DC_1D_16to32_FUNCS left, l +INIT_XMM ssse3 +DC_1D_16to32_FUNCS top, a +DC_1D_16to32_FUNCS left, l + +%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l) +%if HAVE_AVX2_EXTERNAL +cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [%2q] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + movhlps xm1, xm0 + paddw xm0, xm1 + pmulhrsw xm0, [pw_1024] + vpbroadcastb m0, xm0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif +%endmacro + +INIT_YMM avx2 +DC_1D_AVX2_FUNCS top, a +DC_1D_AVX2_FUNCS left, l + +; v + +INIT_MMX mmx +cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a + movq m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse +cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a + mova m0, [aq] + mova m1, [aq+16] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 8 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_YMM avx +cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +; h + +%macro H_XMM_FUNCS 2 +%if notcpuflag(avx) +cglobal 
vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3 + movd m0, [lq] +%if cpuflag(ssse3) + pshufb m0, [pb_4x3_4x2_4x1_4x0] +%else + punpcklbw m0, m0 + pshuflw m0, m0, q0123 + punpcklwd m0, m0 +%endif + lea stride3q, [strideq*3] + movd [dstq+strideq*0], m0 + psrldq m0, 4 + movd [dstq+strideq*1], m0 + psrldq m0, 4 + movd [dstq+strideq*2], m0 + psrldq m0, 4 + movd [dstq+stride3q ], m0 + RET +%endif + +cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) + mova m2, [pb_8x1_8x0] + mova m3, [pb_8x3_8x2] +%endif + lea stride3q, [strideq*3] + mov cntq, 1 +.loop: + movd m0, [lq+cntq*4] +%if cpuflag(ssse3) + pshufb m1, m0, m3 + pshufb m0, m2 +%else + punpcklbw m0, m0 + punpcklwd m0, m0 + pshufd m1, m0, q2233 + pshufd m0, m0, q0011 +%endif + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + movq [dstq+strideq*2], m0 + movhps [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET + +cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) + mova m5, [pb_1] + mova m6, [pb_2] + mova m7, [pb_3] + pxor m4, m4 +%endif + lea stride3q, [strideq*3] + mov cntq, 3 +.loop: + movd m3, [lq+cntq*4] +%if cpuflag(ssse3) + pshufb m0, m3, m7 + pshufb m1, m3, m6 +%else + punpcklbw m3, m3 + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 +%endif + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 +%if cpuflag(ssse3) + pshufb m2, m3, m5 + pshufb m3, m4 +%else + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 +%endif + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET + +cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) + mova m5, [pb_1] + mova m6, [pb_2] + mova m7, [pb_3] + pxor m4, m4 +%endif + lea stride3q, [strideq*3] + mov cntq, 7 +.loop: + movd m3, [lq+cntq*4] +%if cpuflag(ssse3) + pshufb m0, m3, m7 + pshufb m1, m3, m6 +%else + punpcklbw m3, m3 + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 +%endif + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 +%if cpuflag(ssse3) + pshufb m2, m3, m5 + pshufb m3, m4 +%else + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 +%endif + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET +%endmacro + +INIT_XMM sse2 +H_XMM_FUNCS 2, 4 +INIT_XMM ssse3 +H_XMM_FUNCS 4, 8 +INIT_XMM avx +H_XMM_FUNCS 4, 8 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt + mova m5, [pb_1] + mova m6, [pb_2] + mova m7, [pb_3] + pxor m4, m4 + lea stride3q, [strideq*3] + mov cntq, 7 +.loop: + movd xm3, [lq+cntq*4] + vinserti128 m3, m3, xm3, 1 + pshufb m0, m3, m7 + pshufb m1, m3, m6 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufb m2, m3, m5 + pshufb m3, m4 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET +%endif + +; tm + +%macro TM_MMX_FUNCS 0 +cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a + pxor m1, m1 + movd m0, [aq] + pinsrw m2, [aq-1], 0 + punpcklbw m0, m1 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m3, [pw_m256] + mova m1, [pw_m255] + pshufb m2, m3 +%else + punpcklbw m2, m1 + pshufw m2, m2, q0000 +%endif + psubw m0, m2 + mov cntq, 1 +.loop: + pinsrw m2, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m4, m2, m1 + pshufb m2, m3 +%else + 
punpcklbw m2, m1 + pshufw m4, m2, q1111 + pshufw m2, m2, q0000 +%endif + paddw m4, m0 + paddw m2, m0 + packuswb m4, m4 + packuswb m2, m2 + movd [dstq+strideq*0], m4 + movd [dstq+strideq*1], m2 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET +%endmacro + +INIT_MMX mmxext +TM_MMX_FUNCS +INIT_MMX ssse3 +TM_MMX_FUNCS + +%macro TM_XMM_FUNCS 0 +cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a + pxor m1, m1 + movh m0, [aq] + pinsrw m2, [aq-1], 0 + punpcklbw m0, m1 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m3, [pw_m256] + mova m1, [pw_m255] + pshufb m2, m3 +%else + punpcklbw m2, m1 + punpcklwd m2, m2 + pshufd m2, m2, q0000 +%endif + psubw m0, m2 + mov cntq, 3 +.loop: + pinsrw m2, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m4, m2, m1 + pshufb m2, m3 +%else + punpcklbw m2, m1 + punpcklwd m2, m2 + pshufd m4, m2, q1111 + pshufd m2, m2, q0000 +%endif + paddw m4, m0 + paddw m2, m0 + packuswb m4, m2 + movh [dstq+strideq*0], m4 + movhps [dstq+strideq*1], m4 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET + +cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a + pxor m3, m3 + mova m0, [aq] + pinsrw m2, [aq-1], 0 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m4, [pw_m256] + mova m3, [pw_m255] + pshufb m2, m4 +%else + punpcklbw m2, m3 + punpcklwd m2, m2 + pshufd m2, m2, q0000 +%endif + psubw m1, m2 + psubw m0, m2 + mov cntq, 7 +.loop: + pinsrw m7, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m5, m7, m3 + pshufb m7, m4 +%else + punpcklbw m7, m3 + punpcklwd m7, m7 + pshufd m5, m7, q1111 + pshufd m7, m7, q0000 +%endif + paddw m2, m5, m0 + paddw m5, m1 + paddw m6, m7, m0 + paddw m7, m1 + packuswb m2, m5 + packuswb m6, m7 + mova [dstq+strideq*0], m2 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET + +%if ARCH_X86_64 +%define mem 0 +%else +%define mem 64 +%endif +cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a + pxor m5, m5 + pinsrw m4, [aq-1], 0 + mova m0, [aq] + mova m2, [aq+16] + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) +%if ARCH_X86_64 + mova m12, [pw_m256] + mova m13, [pw_m255] +%define pw_m256_reg m12 +%define pw_m255_reg m13 +%else +%define pw_m256_reg [pw_m256] +%define pw_m255_reg [pw_m255] +%endif + pshufb m4, pw_m256_reg +%else + punpcklbw m4, m5 + punpcklwd m4, m4 + pshufd m4, m4, q0000 +%endif + punpckhbw m1, m0, m5 + punpckhbw m3, m2, m5 + punpcklbw m0, m5 + punpcklbw m2, m5 + psubw m1, m4 + psubw m0, m4 + psubw m3, m4 + psubw m2, m4 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 +%else + mova [rsp+0*16], m0 + mova [rsp+1*16], m1 + mova [rsp+2*16], m2 + mova [rsp+3*16], m3 +%endif + mov cntq, 15 +.loop: + pinsrw m3, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m7, m3, pw_m255_reg + pshufb m3, pw_m256_reg +%else + pxor m7, m7 + punpcklbw m3, m7 + punpcklwd m3, m3 + pshufd m7, m3, q1111 + pshufd m3, m3, q0000 +%endif +%if ARCH_X86_64 + paddw m4, m7, m8 + paddw m5, m7, m9 + paddw m6, m7, m10 + paddw m7, m11 + paddw m0, m3, m8 + paddw m1, m3, m9 + paddw m2, m3, m10 + paddw m3, m11 +%else + paddw m4, m7, [rsp+0*16] + paddw m5, m7, [rsp+1*16] + paddw m6, m7, [rsp+2*16] + paddw m7, [rsp+3*16] + paddw m0, m3, [rsp+0*16] + paddw m1, m3, [rsp+1*16] + paddw m2, m3, [rsp+2*16] + paddw m3, [rsp+3*16] +%endif + packuswb m4, m5 + packuswb m6, m7 + packuswb m0, m1 + packuswb m2, m3 + mova [dstq+strideq*0+ 0], m4 + mova [dstq+strideq*0+16], m6 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m2 + lea dstq, [dstq+strideq*2] + dec 
cntq + jge .loop + RET +%undef pw_m256_reg +%undef pw_m255_reg +%undef mem +%endmacro + +INIT_XMM sse2 +TM_XMM_FUNCS +INIT_XMM ssse3 +TM_XMM_FUNCS +INIT_XMM avx +TM_XMM_FUNCS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a + pxor m3, m3 + pinsrw xm2, [aq-1], 0 + vinserti128 m2, m2, xm2, 1 + mova m0, [aq] + DEFINE_ARGS dst, stride, l, cnt + mova m4, [pw_m256] + mova m5, [pw_m255] + pshufb m2, m4 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + psubw m1, m2 + psubw m0, m2 + mov cntq, 15 +.loop: + pinsrw xm7, [lq+cntq*2], 0 + vinserti128 m7, m7, xm7, 1 + pshufb m3, m7, m5 + pshufb m7, m4 + paddw m2, m3, m0 + paddw m3, m1 + paddw m6, m7, m0 + paddw m7, m1 + packuswb m2, m3 + packuswb m6, m7 + mova [dstq+strideq*0], m2 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET +%endif + +; dl + +%macro LOWPASS 4 ; left [dst], center, right, tmp + pxor m%4, m%1, m%3 + pand m%4, [pb_1] + pavgb m%1, m%3 + psubusb m%1, m%4 + pavgb m%1, m%2 +%endmacro + +%macro DL_MMX_FUNCS 0 +cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a + movq m1, [aq] +%if cpuflag(ssse3) + pshufb m0, m1, [pb_0to5_2x7] + pshufb m2, m1, [pb_2to6_3x7] +%else + punpckhbw m3, m1, m1 ; 44556677 + pand m0, m1, [pb_6xm1_2x0] ; 012345__ + pand m3, [pb_6x0_2xm1] ; ______77 + psrlq m2, m1, 16 ; 234567__ + por m0, m3 ; 01234577 + por m2, m3 ; 23456777 +%endif + psrlq m1, 8 + LOWPASS 0, 1, 2, 3 + + pshufw m1, m0, q3321 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*2], m1 + psrlq m0, 8 + psrlq m1, 8 + add dstq, strideq + movd [dstq+strideq*0], m0 + movd [dstq+strideq*2], m1 + RET +%endmacro + +INIT_MMX mmxext +DL_MMX_FUNCS +INIT_MMX ssse3 +DL_MMX_FUNCS + +%macro DL_XMM_FUNCS 0 +cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a + movq m0, [aq] + lea stride5q, [strideq*5] +%if cpuflag(ssse3) + pshufb m1, m0, [pb_1to6_10x7] +%else + punpcklbw m1, m0, m0 ; 0011223344556677 + punpckhwd m1, m1 ; 4x4,4x5,4x6,4x7 +%endif + shufps m0, m1, q3310 +%if notcpuflag(ssse3) + psrldq m1, m0, 1 + shufps m1, m0, q3210 +%endif + psrldq m2, m1, 1 + LOWPASS 0, 1, 2, 3 + + pshufd m1, m0, q3321 + movq [dstq+strideq*0], m0 + movq [dstq+strideq*4], m1 + psrldq m0, 1 + psrldq m1, 1 + movq [dstq+strideq*1], m0 + movq [dstq+stride5q ], m1 + lea dstq, [dstq+strideq*2] + psrldq m0, 1 + psrldq m1, 1 + movq [dstq+strideq*0], m0 + movq [dstq+strideq*4], m1 + psrldq m0, 1 + psrldq m1, 1 + movq [dstq+strideq*1], m0 + movq [dstq+stride5q ], m1 + RET + +cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a + mova m0, [aq] +%if cpuflag(ssse3) + mova m5, [pb_1toE_2xF] + pshufb m1, m0, m5 + pshufb m2, m1, m5 + pshufb m4, m0, [pb_15] +%else + pand m5, m0, [pb_15x0_1xm1] ; _______________F + psrldq m1, m0, 1 ; 123456789ABCDEF_ + por m1, m5 ; 123456789ABCDEFF + psrldq m2, m1, 1 ; 23456789ABCDEFF_ + por m2, m5 ; 23456789ABCDEFFF + pshufhw m4, m1, q3333 ; xxxxxxxxFFFFFFFF +%endif + LOWPASS 0, 1, 2, 3 + DEFINE_ARGS dst, stride, cnt, stride9 + lea stride9q, [strideq+strideq*8] + mov cntd, 4 + +.loop: + movhlps m4, m0 + mova [dstq+strideq*0], m0 +%if cpuflag(ssse3) + pshufb m0, m5 +%else + psrldq m0, 1 + por m0, m5 +%endif + mova [dstq+strideq*8], m4 + movhlps m4, m0 + mova [dstq+strideq*1], m0 +%if cpuflag(ssse3) + pshufb m0, m5 +%else + psrldq m0, 1 + por m0, m5 +%endif + mova [dstq+stride9q ], m4 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16 + mova m0, [aq] + mova m1, [aq+16] + PALIGNR m2, m1, m0, 1, m4 + PALIGNR m3, m1, 
m0, 2, m4 + LOWPASS 0, 2, 3, 4 +%if cpuflag(ssse3) + mova m5, [pb_1toE_2xF] + pshufb m2, m1, m5 + pshufb m3, m2, m5 + pshufb m6, m1, [pb_15] + mova m7, m6 +%else + pand m5, m1, [pb_15x0_1xm1] ; _______________F + psrldq m2, m1, 1 ; 123456789ABCDEF_ + por m2, m5 ; 123456789ABCDEFF + psrldq m3, m2, 1 ; 23456789ABCDEFF_ + por m3, m5 ; 23456789ABCDEFFF + pshufhw m7, m2, q3333 ; xxxxxxxxFFFFFFFF + pshufd m6, m7, q3333 +%endif + LOWPASS 1, 2, 3, 4 + lea dst16q, [dstq +strideq*8] + mov cntd, 8 + lea dst16q, [dst16q+strideq*8] +.loop: + movhlps m7, m1 + mova [dstq +strideq*0+ 0], m0 + mova [dstq +strideq*0+16], m1 + movhps [dstq+strideq*8+ 0], m0 + movq [dstq +strideq*8+ 8], m1 + mova [dstq +strideq*8+16], m7 + mova [dst16q+strideq*0+ 0], m1 + mova [dst16q+strideq*0+16], m6 + mova [dst16q+strideq*8+ 0], m7 + mova [dst16q+strideq*8+16], m6 +%if cpuflag(avx) + vpalignr m0, m1, m0, 1 + pshufb m1, m5 +%elif cpuflag(ssse3) + palignr m2, m1, m0, 1 + pshufb m1, m5 + mova m0, m2 +%else + mova m4, m1 + psrldq m0, 1 + pslldq m4, 15 + psrldq m1, 1 + por m0, m4 + por m1, m5 +%endif + add dstq, strideq + add dst16q, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DL_XMM_FUNCS +INIT_XMM ssse3 +DL_XMM_FUNCS +INIT_XMM avx +DL_XMM_FUNCS + +; dr + +%macro DR_MMX_FUNCS 0 +cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [lq] + punpckldq m0, [aq-1] + movd m1, [aq+3] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + PALIGNR m1, m0, 1, m3 + psrlq m2, m1, 8 + LOWPASS 0, 1, 2, 3 + + movd [dstq+stride3q ], m0 + psrlq m0, 8 + movd [dstq+strideq*2], m0 + psrlq m0, 8 + movd [dstq+strideq*1], m0 + psrlq m0, 8 + movd [dstq+strideq*0], m0 + RET +%endmacro + +INIT_MMX mmxext +DR_MMX_FUNCS +INIT_MMX ssse3 +DR_MMX_FUNCS + +%macro DR_XMM_FUNCS 0 +cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a + movq m1, [lq] + movhps m1, [aq-1] + movd m2, [aq+7] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pslldq m0, m1, 1 + PALIGNR m2, m1, 1, m3 + LOWPASS 0, 1, 2, 3 + + movhps [dstq+strideq*0], m0 + pslldq m0, 1 + movhps [dstq+strideq*1], m0 + pslldq m0, 1 + movhps [dstq+strideq*2], m0 + pslldq m0, 1 + movhps [dstq+stride3q ], m0 + pslldq m0, 1 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], m0 + pslldq m0, 1 + movhps [dstq+strideq*1], m0 + pslldq m0, 1 + movhps [dstq+strideq*2], m0 + pslldq m0, 1 + movhps [dstq+stride3q ], m0 + RET + +cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a + mova m1, [lq] + movu m2, [aq-1] + movd m4, [aq+15] + DEFINE_ARGS dst, stride, stride9, cnt + lea stride9q, [strideq *3] + mov cntd, 4 + lea stride9q, [stride9q*3] + PALIGNR m4, m2, 1, m5 + PALIGNR m3, m2, m1, 15, m5 + LOWPASS 3, 2, 4, 5 + pslldq m0, m1, 1 + PALIGNR m2, m1, 1, m4 + LOWPASS 0, 1, 2, 4 + +.loop: + mova [dstq+strideq*0 ], m3 + movhps [dstq+strideq*8+0], m0 + movq [dstq+strideq*8+8], m3 + PALIGNR m3, m0, 15, m1 + pslldq m0, 1 + mova [dstq+strideq*1 ], m3 + movhps [dstq+stride9q +0], m0 + movq [dstq+stride9q +8], m3 + PALIGNR m3, m0, 15, m1 + pslldq m0, 1 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a + mova m1, [lq] + mova m2, [lq+16] + movu m3, [aq-1] + movu m4, [aq+15] + movd m5, [aq+31] + DEFINE_ARGS dst, stride, stride8, cnt + lea stride8q, [strideq*8] + PALIGNR m5, m4, 1, m7 + PALIGNR m6, m4, m3, 15, m7 + LOWPASS 5, 4, 6, 7 + PALIGNR m4, m3, 1, m7 + PALIGNR m6, m3, m2, 15, m7 + LOWPASS 4, 3, 6, 7 + PALIGNR m3, m2, 1, m7 + PALIGNR m6, m2, m1, 15, m7 + LOWPASS 3, 2, 6, 7 + PALIGNR m2, m1, 1, m6 + 
pslldq m0, m1, 1 + LOWPASS 2, 1, 0, 6 + mov cntd, 16 + + ; out=m2/m3/m4/m5 +.loop: + mova [dstq+stride8q*0+ 0], m4 + mova [dstq+stride8q*0+16], m5 + mova [dstq+stride8q*2+ 0], m3 + mova [dstq+stride8q*2+16], m4 + PALIGNR m5, m4, 15, m6 + PALIGNR m4, m3, 15, m6 + PALIGNR m3, m2, 15, m6 + pslldq m2, 1 + add dstq, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DR_XMM_FUNCS +INIT_XMM ssse3 +DR_XMM_FUNCS +INIT_XMM avx +DR_XMM_FUNCS + +; vl + +INIT_MMX mmxext +cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a + movq m0, [aq] + psrlq m1, m0, 8 + psrlq m2, m1, 8 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + movd [dstq+strideq*0], m1 + movd [dstq+strideq*1], m2 + lea dstq, [dstq+strideq*2] + psrlq m1, 8 + psrlq m2, 8 + movd [dstq+strideq*0], m1 + movd [dstq+strideq*1], m2 + RET + +%macro VL_XMM_FUNCS 0 +cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a + movq m0, [aq] +%if cpuflag(ssse3) + pshufb m0, [pb_0to6_9x7] +%else + punpcklbw m1, m0, m0 + punpckhwd m1, m1 + shufps m0, m1, q3310 +%endif + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psrldq m1, m0, 1 + psrldq m2, m0, 2 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + + movq [dstq+strideq*0], m1 + movq [dstq+strideq*1], m2 + psrldq m1, 1 + psrldq m2, 1 + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + psrldq m1, 1 + psrldq m2, 1 + movq [dstq+strideq*0], m1 + movq [dstq+strideq*1], m2 + psrldq m1, 1 + psrldq m2, 1 + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] +%if cpuflag(ssse3) + mova m4, [pb_1toE_2xF] + pshufb m1, m0, m4 + pshufb m2, m1, m4 +%else + pand m4, m0, [pb_15x0_1xm1] ; _______________F + psrldq m1, m0, 1 ; 123456789ABCDEF_ + por m1, m4 ; 123456789ABCDEFF + psrldq m2, m1, 1 ; 23456789ABCDEFF_ + por m2, m4 ; 23456789ABCDEFFF +%endif + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 +%if cpuflag(ssse3) + pshufb m1, m4 + pshufb m2, m4 +%else + psrldq m1, 1 + psrldq m2, 1 + por m1, m4 + por m2, m4 +%endif + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 +%if cpuflag(ssse3) + pshufb m1, m4 + pshufb m2, m4 +%else + psrldq m1, 1 + psrldq m2, 1 + por m1, m4 + por m2, m4 +%endif + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a + mova m0, [aq] + mova m5, [aq+16] + DEFINE_ARGS dst, stride, dst16, cnt + PALIGNR m2, m5, m0, 1, m4 + PALIGNR m3, m5, m0, 2, m4 + lea dst16q, [dstq +strideq*8] + LOWPASS 3, 2, 0, 6 + pavgb m2, m0 +%if cpuflag(ssse3) + mova m4, [pb_1toE_2xF] + pshufb m0, m5, m4 + pshufb m1, m0, m4 +%else + pand m4, m5, [pb_15x0_1xm1] ; _______________F + psrldq m0, m5, 1 ; 123456789ABCDEF_ + por m0, m4 ; 123456789ABCDEFF + psrldq m1, m0, 1 ; 23456789ABCDEFF_ + por m1, m4 ; 23456789ABCDEFFF +%endif + lea dst16q, [dst16q+strideq*8] + LOWPASS 1, 0, 5, 6 + pavgb m0, m5 +%if cpuflag(ssse3) + pshufb m5, [pb_15] +%else + punpckhbw m5, m4, m4 + pshufhw m5, m5, q3333 + punpckhqdq m5, m5 +%endif + mov cntd, 8 + +.loop: +%macro %%write 3 + mova [dstq+stride%1+ 0], %2 + mova [dstq+stride%1+16], %3 + movhps [dst16q+stride%1 ], %2 + movu [dst16q+stride%1+ 8], %3 + movq [dst16q+stride%1+24], m5 +%if cpuflag(avx) + palignr %2, %3, %2, 1 + pshufb %3, m4 +%elif cpuflag(ssse3) + palignr m6, %3, %2, 1 + pshufb %3, m4 + mova %2, m6 +%else + pslldq m6, %3, 15 + psrldq %3, 1 + psrldq %2, 1 + por %3, m4 + por %2, 
m6 +%endif +%endmacro + + %%write q*0, m2, m0 + %%write q*1, m3, m1 + lea dstq, [dstq +strideq*2] + lea dst16q, [dst16q+strideq*2] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +VL_XMM_FUNCS +INIT_XMM ssse3 +VL_XMM_FUNCS +INIT_XMM avx +VL_XMM_FUNCS + +; vr + +%macro VR_MMX_FUNCS 0 +cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a + movq m1, [aq-1] + punpckldq m2, [lq] + movd m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pavgb m0, m1 + PALIGNR m1, m2, 5, m3 + psrlq m2, m1, 8 + psllq m3, m1, 8 + LOWPASS 2, 1, 3, 4 + + ; ABCD <- for the following predictor: + ; EFGH + ; IABC | m0 contains ABCDxxxx + ; JEFG | m2 contains xJIEFGHx + +%if cpuflag(ssse3) + punpckldq m0, m2 + pshufb m2, [pb_13456_3xm1] + movd [dstq+strideq*0], m0 + pshufb m0, [pb_6012_4xm1] + movd [dstq+stride3q ], m2 + psrlq m2, 8 + movd [dstq+strideq*2], m0 + movd [dstq+strideq*1], m2 +%else + psllq m1, m2, 40 + psrlq m2, 24 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m2 + PALIGNR m0, m1, 7, m3 + psllq m1, 8 + PALIGNR m2, m1, 7, m3 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m2 +%endif + RET +%endmacro + +INIT_MMX mmxext +VR_MMX_FUNCS +INIT_MMX ssse3 +VR_MMX_FUNCS + +%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16 +cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a + movu m1, [aq-1] + movhps m2, [lq] + movq m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pavgb m0, m1 + PALIGNR m1, m2, 9, m3 + pslldq m2, m1, 1 + pslldq m3, m1, 2 + LOWPASS 1, 2, 3, 4 + + ; ABCDEFGH <- for the following predictor: + ; IJKLMNOP + ; QABCDEFG | m0 contains ABCDEFGHxxxxxxxx + ; RIJKLMNO | m1 contains xxVUTSRQIJKLMNOP + ; SQABCDEF + ; TRIJKLMN + ; USQABCDE + ; VTRIJKLM + +%if cpuflag(ssse3) + punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ +%endif + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m1 +%if cpuflag(ssse3) + pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG + pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO +%else + psrlw m2, m1, 8 ; x_U_S_Q_xxxxxxxx + pand m3, m1, [pw_255] ; x_V_T_R_xxxxxxxx + packuswb m3, m2 ; xVTRxxxxxUSQxxxx + pslldq m3, 4 ; xxxxxVTRxxxxxUSQ + PALIGNR m0, m3, 7, m4 ; xxxxxxUSQABCDEFG + psrldq m1, 8 + pslldq m3, 8 + PALIGNR m1, m3, 7, m4 ; xxxxxxVTRIJKLMNO +%endif + movhps [dstq+strideq*2], m0 + movhps [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + pslldq m0, 1 + pslldq m1, 1 + movhps [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m1 + pslldq m0, 1 + pslldq m1, 1 + movhps [dstq+strideq*2], m0 + movhps [dstq+stride3q ], m1 + RET + +cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a + mova m0, [aq] + movu m1, [aq-1] + mova m2, [lq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + PALIGNR m3, m1, m2, 15, m6 + LOWPASS 3, 1, 0, 4 + pavgb m0, m1 + PALIGNR m1, m2, 1, m6 + pslldq m4, m2, 1 + LOWPASS 1, 2, 4, 5 +%if cpuflag(ssse3) + pshufb m1, [pb_02468ACE_13579BDF] +%else + psrlw m5, m1, 8 + pand m1, [pw_255] + packuswb m1, m5 +%endif + mov cntd, 4 + +.loop: + movlhps m2, m1 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m3 + PALIGNR m4, m0, m1, 15, m6 + PALIGNR m5, m3, m2, 15, m6 + mova [dstq+strideq*2], m4 + mova [dstq+stride3q ], m5 + lea dstq, [dstq+strideq*4] + PALIGNR m0, m1, 14, m6 + PALIGNR m3, m2, 14, m6 + pslldq m1, 2 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a + mova m0, [aq] + mova m2, [aq+16] + movu m1, [aq-1] + PALIGNR m3, m2, m0, 15, m6 + PALIGNR m4, m2, m0, 14, m6 + LOWPASS 4, 3, 2, 5 + pavgb m3, m2 + mova m2, [lq+16] + PALIGNR m5, m1, m2, 15, m6 + LOWPASS 
5, 1, 0, 6 + pavgb m0, m1 + mova m6, [lq] +%if ARCH_X86_64 + SWAP 0, 8 +%else + mova [dstq], m0 +%endif + PALIGNR m1, m2, 1, m0 + PALIGNR m7, m2, m6, 15, m0 + LOWPASS 1, 2, 7, 0 + PALIGNR m2, m6, 1, m0 + pslldq m7, m6, 1 + LOWPASS 2, 6, 7, 0 +%if cpuflag(ssse3) + pshufb m1, [pb_02468ACE_13579BDF] + pshufb m2, [pb_02468ACE_13579BDF] +%else + psrlw m0, m1, 8 + psrlw m6, m2, 8 + pand m1, [pw_255] + pand m2, [pw_255] + packuswb m1, m0 + packuswb m2, m6 +%endif + DEFINE_ARGS dst, stride, dst16, cnt + lea dst16q, [dstq +strideq*8] + lea dst16q, [dst16q+strideq*8] + SBUTTERFLY qdq, 2, 1, 6 +%if ARCH_X86_64 + SWAP 0, 8 +%else + mova m0, [dstq] +%endif + mov cntd, 8 + +.loop: + ; even lines (0, 2, 4, ...): m1 | m0, m3 + ; odd lines (1, 3, 5, ...): m2 | m5, m4 +%macro %%write 4 + mova [dstq+stride%1+ 0], %3 + mova [dstq+stride%1+16], %4 + movhps [dst16q+stride%1 ], %2 + movu [dst16q+stride%1+ 8], %3 + movq [dst16q+stride%1+24], %4 + PALIGNR %4, %3, 15, m6 + PALIGNR %3, %2, 15, m6 + pslldq %2, 1 +%endmacro + + %%write q*0, m1, m0, m3 + %%write q*1, m2, m5, m4 + lea dstq, [dstq +strideq*2] + lea dst16q, [dst16q+strideq*2] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +VR_XMM_FUNCS 7 +INIT_XMM ssse3 +VR_XMM_FUNCS 6 +INIT_XMM avx +VR_XMM_FUNCS 6 + +; hd + +INIT_MMX mmxext +cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [lq] + punpckldq m0, [aq-1] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psrlq m1, m0, 8 + psrlq m2, m1, 8 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + + ; DHIJ <- for the following predictor: + ; CGDH + ; BFCG | m1 contains ABCDxxxx + ; AEBF | m2 contains EFGHIJxx + + punpcklbw m1, m2 + punpckhdq m0, m1, m2 + + ; m1 contains AEBFCGDH + ; m0 contains CGDHIJxx + + movd [dstq+stride3q ], m1 + movd [dstq+strideq*1], m0 + psrlq m1, 16 + psrlq m0, 16 + movd [dstq+strideq*2], m1 + movd [dstq+strideq*0], m0 + RET + +%macro HD_XMM_FUNCS 0 +cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a + movq m0, [lq] + movhps m0, [aq-1] + DEFINE_ARGS dst, stride, stride3, dst4 + lea stride3q, [strideq*3] + lea dst4q, [dstq+strideq*4] + psrldq m1, m0, 1 + psrldq m2, m1, 1 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + + ; HPQRSTUV <- for the following predictor + ; GOHPQRST + ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx + ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx + ; DLEMFNGO + ; CKDLEMFN + ; BJCKDLEM + ; AIBJCKDL + + punpcklbw m1, m2 + movhlps m2, m2 + + ; m1 contains AIBJCKDLEMFNGOHP + ; m2 contains QRSTUVxxxxxxxxxx + + movhps [dstq +stride3q ], m1 + movq [dst4q+stride3q ], m1 + PALIGNR m3, m2, m1, 2, m4 + movhps [dstq +strideq*2], m3 + movq [dst4q+strideq*2], m3 + PALIGNR m3, m2, m1, 4, m4 + movhps [dstq +strideq*1], m3 + movq [dst4q+strideq*1], m3 + PALIGNR m2, m1, 6, m4 + movhps [dstq +strideq*0], m2 + movq [dst4q+strideq*0], m2 + RET + +cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a + mova m0, [lq] + movu m3, [aq-1] + DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12 + lea stride4q, [strideq*4] + lea dst4q, [dstq +stride4q] + lea dst8q, [dst4q+stride4q] + lea dst12q, [dst8q+stride4q] + psrldq m4, m3, 1 + psrldq m5, m3, 2 + LOWPASS 5, 4, 3, 6 + PALIGNR m1, m3, m0, 1, m6 + PALIGNR m2, m3, m0, 2, m6 + LOWPASS 2, 1, 0, 6 + pavgb m1, m0 + SBUTTERFLY bw, 1, 2, 6 + + ; I PROBABLY INVERTED L0 and L16 here + ; m1, m2, m5 +.loop: + sub stride4q, strideq + movhps [dstq +stride4q +0], m2 + movq [dstq +stride4q +8], m5 + mova [dst4q+stride4q ], m2 + movhps [dst8q+stride4q +0], m1 + movq [dst8q+stride4q +8], m2 + mova [dst12q+stride4q ], m1 +%if cpuflag(avx) + palignr m1, m2, 
m1, 2 + palignr m2, m5, m2, 2 +%elif cpuflag(ssse3) + palignr m3, m2, m1, 2 + palignr m0, m5, m2, 2 + mova m1, m3 + mova m2, m0 +%else + ; slightly modified version of PALIGNR + mova m6, m2 + mova m4, m5 + pslldq m6, 14 + pslldq m4, 14 + psrldq m1, 2 + psrldq m2, 2 + por m1, m6 + por m2, m4 +%endif + psrldq m5, 2 + jg .loop + RET + +cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a + mova m0, [lq] + mova m1, [lq+16] + movu m2, [aq-1] + movu m3, [aq+15] + DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24 + lea stride8q, [strideq*8] + lea dst8q, [dstq +stride8q] + lea dst16q, [dst8q +stride8q] + lea dst24q, [dst16q+stride8q] + psrldq m4, m3, 1 + psrldq m5, m3, 2 + LOWPASS 5, 4, 3, 6 + PALIGNR m4, m3, m2, 2, m6 + PALIGNR m3, m2, 1, m6 + LOWPASS 4, 3, 2, 6 + PALIGNR m3, m2, m1, 2, m6 + PALIGNR m2, m1, 1, m6 + LOWPASS 3, 2, 1, 6 + pavgb m2, m1 + PALIGNR m6, m1, m0, 1, m7 + PALIGNR m1, m0, 2, m7 + LOWPASS 1, 6, 0, 7 + pavgb m0, m6 + SBUTTERFLY bw, 2, 3, 6 + SBUTTERFLY bw, 0, 1, 6 + + ; m0, m1, m2, m3, m4, m5 +.loop: + sub stride8q, strideq + mova [dstq +stride8q+ 0], m3 + mova [dstq +stride8q+16], m4 + mova [dst8q +stride8q+ 0], m2 + mova [dst8q +stride8q+16], m3 + mova [dst16q+stride8q+ 0], m1 + mova [dst16q+stride8q+16], m2 + mova [dst24q+stride8q+ 0], m0 + mova [dst24q+stride8q+16], m1 +%if cpuflag(avx) + palignr m0, m1, m0, 2 + palignr m1, m2, m1, 2 + palignr m2, m3, m2, 2 + palignr m3, m4, m3, 2 + palignr m4, m5, m4, 2 + psrldq m5, 2 +%elif cpuflag(ssse3) + psrldq m6, m5, 2 + palignr m5, m4, 2 + palignr m4, m3, 2 + palignr m3, m2, 2 + palignr m2, m1, 2 + palignr m1, m0, 2 + mova m0, m1 + mova m1, m2 + mova m2, m3 + mova m3, m4 + mova m4, m5 + mova m5, m6 +%else + ; sort of a half-integrated version of PALIGNR + pslldq m7, m4, 14 + pslldq m6, m5, 14 + psrldq m4, 2 + psrldq m5, 2 + por m4, m6 + pslldq m6, m3, 14 + psrldq m3, 2 + por m3, m7 + pslldq m7, m2, 14 + psrldq m2, 2 + por m2, m6 + pslldq m6, m1, 14 + psrldq m1, 2 + por m1, m7 + psrldq m0, 2 + por m0, m6 +%endif + jg .loop + RET +%endmacro + +INIT_XMM sse2 +HD_XMM_FUNCS +INIT_XMM ssse3 +HD_XMM_FUNCS +INIT_XMM avx +HD_XMM_FUNCS + +%macro HU_MMX_FUNCS 0 +cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l + movd m0, [lq] +%if cpuflag(ssse3) + pshufb m0, [pb_0to2_5x3] +%else + punpcklbw m1, m0, m0 ; 00112233 + pshufw m1, m1, q3333 ; 33333333 + punpckldq m0, m1 ; 01233333 +%endif + psrlq m1, m0, 8 + psrlq m2, m1, 8 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + SBUTTERFLY bw, 1, 2, 0 + PALIGNR m2, m1, 2, m0 + movd [dstq+strideq*0], m1 + movd [dstq+strideq*1], m2 + punpckhdq m1, m1 + punpckhdq m2, m2 + movd [dstq+strideq*2], m1 + movd [dstq+stride3q ], m2 + RET +%endmacro + +INIT_MMX mmxext +HU_MMX_FUNCS +INIT_MMX ssse3 +HU_MMX_FUNCS + +%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32 +cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l + movq m0, [lq] +%if cpuflag(ssse3) + pshufb m0, [pb_0to6_9x7] +%else + punpcklbw m1, m0, m0 ; 0011223344556677 + punpckhwd m1, m1 ; 4444555566667777 + shufps m0, m1, q3310 ; 0123456777777777 +%endif + psrldq m1, m0, 1 + psrldq m2, m1, 1 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + DEFINE_ARGS dst, stride, stride3, dst4 + lea stride3q, [strideq*3] + lea dst4q, [dstq+strideq*4] + SBUTTERFLY bw, 1, 2, 0 + movq [dstq +strideq*0], m1 + movhps [dst4q+strideq*0], m1 + PALIGNR m0, m2, m1, 2, m3 + movq [dstq +strideq*1], m0 + movhps [dst4q+strideq*1], m0 + PALIGNR m0, m2, m1, 4, m3 + movq [dstq +strideq*2], m0 + movhps [dst4q+strideq*2], m0 + PALIGNR m2, m1, 6, m3 + 
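; (a rough C model of the hu prediction assembled above, per the usual +; VP9 definition, with l[] the left edge and the index clipped to its +; last element: +; i = r + (c >> 1); +; dst[r][c] = (c & 1) ? (l[i] + 2*l[i+1] + l[i+2] + 2) >> 2 +; : (l[i] + l[i+1] + 1) >> 1; +; pavgb supplies the 2-tap terms, LOWPASS the 3-tap terms, and each +; PALIGNR step advances the interleaved pair by one row) + 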
movq [dstq +stride3q ], m2 + movhps [dst4q+stride3q ], m2 + RET + +cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l + mova m0, [lq] +%if cpuflag(ssse3) + mova m3, [pb_2toE_3xF] + pshufb m1, m0, [pb_1toE_2xF] + pshufb m2, m0, m3 +%else + pand m3, m0, [pb_15x0_1xm1] + psrldq m1, m0, 1 + por m1, m3 + punpckhbw m3, m3 + psrldq m2, m0, 2 + por m2, m3 +%endif + LOWPASS 2, 1, 0, 4 + pavgb m1, m0 + DEFINE_ARGS dst, stride, stride9, cnt + lea stride9q, [strideq*8+strideq] + mov cntd, 4 + SBUTTERFLY bw, 1, 2, 0 + +.loop: + mova [dstq+strideq*0], m1 + mova [dstq+strideq*8], m2 + PALIGNR m0, m2, m1, 2, m4 +%if cpuflag(ssse3) + pshufb m2, m3 +%else + psrldq m2, 2 + por m2, m3 +%endif + mova [dstq+strideq*1], m0 + mova [dstq+stride9q ], m2 + PALIGNR m1, m2, m0, 2, m4 +%if cpuflag(ssse3) + pshufb m2, m3 +%else + psrldq m2, 2 + por m2, m3 +%endif + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l + mova m1, [lq] + mova m0, [lq+16] + PALIGNR m2, m0, m1, 1, m5 + PALIGNR m3, m0, m1, 2, m5 + LOWPASS 3, 2, 1, 5 + pavgb m2, m1 +%if cpuflag(ssse3) + mova m4, [pb_2toE_3xF] + pshufb m5, m0, [pb_1toE_2xF] + pshufb m1, m0, m4 +%else + pand m4, m0, [pb_15x0_1xm1] + psrldq m5, m0, 1 + por m5, m4 + punpckhbw m4, m4 + psrldq m1, m0, 2 + por m1, m4 +%endif + LOWPASS 1, 5, 0, 6 + pavgb m0, m5 + DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24 + mov cntd, 8 + xor stride0q, stride0q + lea dst8q, [dstq +strideq*8] + lea dst16q, [dst8q +strideq*8] + lea dst24q, [dst16q+strideq*8] + SBUTTERFLY bw, 0, 1, 5 + SBUTTERFLY bw, 2, 3, 5 +%if cpuflag(ssse3) + pshufb m6, m1, [pb_15] +%else + pshufhw m6, m4, q3333 + punpckhqdq m6, m6 +%endif + +.loop: + mova [dstq +stride0q+ 0], m2 + mova [dstq +stride0q+16], m3 + mova [dst8q +stride0q+ 0], m3 + mova [dst8q +stride0q+16], m0 + mova [dst16q+stride0q+ 0], m0 + mova [dst16q+stride0q+16], m1 + mova [dst24q+stride0q+ 0], m1 + mova [dst24q+stride0q+16], m6 +%if cpuflag(avx) + palignr m2, m3, m2, 2 + palignr m3, m0, m3, 2 + palignr m0, m1, m0, 2 + pshufb m1, m4 +%elif cpuflag(ssse3) + pshufb m5, m1, m4 + palignr m1, m0, 2 + palignr m0, m3, 2 + palignr m3, m2, 2 + mova m2, m3 + mova m3, m0 + mova m0, m1 + mova m1, m5 +%else + ; half-integrated version of PALIGNR + pslldq m5, m1, 14 + pslldq m7, m0, 14 + psrldq m1, 2 + psrldq m0, 2 + por m1, m4 + por m0, m5 + pslldq m5, m3, 14 + psrldq m3, 2 + por m3, m7 + psrldq m2, 2 + por m2, m5 +%endif + add stride0q, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +HU_XMM_FUNCS 8 +INIT_XMM ssse3 +HU_XMM_FUNCS 7 +INIT_XMM avx +HU_XMM_FUNCS 7 + +; FIXME 127, 128, 129 ? diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm new file mode 100644 index 0000000000..d9fb36f710 --- /dev/null +++ b/libavcodec/x86/vp9itxfm.asm @@ -0,0 +1,2723 @@ +;****************************************************************************** +;* VP9 IDCT SIMD optimizations +;* +;* Copyright (C) 2013 Clément Bœsch <u pkh me> +;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pw_m11585x2: times 8 dw -23170 +pw_m11585_11585: times 4 dw -11585, 11585 +pw_11585_11585: times 8 dw 11585 + +%macro VP9_IDCT_COEFFS 2-3 0 +pw_%1x2: times 8 dw %1*2 +pw_m%1x2: times 8 dw -%1*2 +pw_%2x2: times 8 dw %2*2 +pw_m%2x2: times 8 dw -%2*2 +pw_m%1_%2: times 4 dw -%1, %2 +pw_%2_%1: times 4 dw %2, %1 +pw_m%2_m%1: times 4 dw -%2, -%1 +%if %3 == 1 +pw_m%2_%1: times 4 dw -%2, %1 +pw_%1_%2: times 4 dw %1, %2 +%endif +%endmacro + +VP9_IDCT_COEFFS 15137, 6270, 1 +VP9_IDCT_COEFFS 16069, 3196, 1 +VP9_IDCT_COEFFS 9102, 13623, 1 +VP9_IDCT_COEFFS 16305, 1606 +VP9_IDCT_COEFFS 10394, 12665 +VP9_IDCT_COEFFS 14449, 7723 +VP9_IDCT_COEFFS 4756, 15679 +VP9_IDCT_COEFFS 16364, 804 +VP9_IDCT_COEFFS 11003, 12140 +VP9_IDCT_COEFFS 14811, 7005 +VP9_IDCT_COEFFS 5520, 15426 +VP9_IDCT_COEFFS 15893, 3981 +VP9_IDCT_COEFFS 8423, 14053 +VP9_IDCT_COEFFS 13160, 9760 +VP9_IDCT_COEFFS 2404, 16207 + +pw_5283_13377: times 4 dw 5283, 13377 +pw_9929_13377: times 4 dw 9929, 13377 +pw_15212_m13377: times 4 dw 15212, -13377 +pw_15212_9929: times 4 dw 15212, 9929 +pw_m5283_m15212: times 4 dw -5283, -15212 +pw_13377x2: times 8 dw 13377*2 +pw_13377_m13377: times 4 dw 13377, -13377 + +pd_8192: times 4 dd 8192 + +cextern pw_8 +cextern pw_16 +cextern pw_32 +cextern pw_512 +cextern pw_1024 +cextern pw_2048 +cextern pw_m1 + +SECTION .text + +; (a*x + b*y + round) >> shift +%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2 + pmaddwd m%1, m%2, %4 + pmaddwd m%2, %5 + paddd m%1, %3 + paddd m%2, %3 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2 + VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3] + VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2 +%if %0 == 7 + punpckhwd m%6, m%2, m%1 + punpcklwd m%2, m%1 + VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7 +%else + punpckhwd m%8, m%4, m%3 + punpcklwd m%2, m%4, m%3 + VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9 +%endif +%endmacro + +%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2 + punpckhwd m%4, m%2, m%1 + punpcklwd m%2, m%1 + pmaddwd m%3, m%4, [pw_m%5_%6] + pmaddwd m%4, [pw_%6_%5] + pmaddwd m%1, m%2, [pw_m%5_%6] + pmaddwd m%2, [pw_%6_%5] +%endmacro + +%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round + SUMSUB_BA d, %1, %2, %5 + SUMSUB_BA d, %3, %4, %5 + paddd m%1, %6 + paddd m%2, %6 + paddd m%3, %6 + paddd m%4, %6 + psrad m%1, 14 + psrad m%2, 14 + psrad m%3, 14 + psrad m%4, 14 + packssdw m%1, m%3 + packssdw m%2, m%4 +%endmacro + +%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst + movh m%3, [%6] + movh m%4, [%6+strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%3, m%1 + paddw m%4, m%2 + packuswb m%3, m%5 + packuswb m%4, m%5 + movh [%6], m%3 + movh [%6+strideq], m%4 +%endmacro + +%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg +%assign %%y 0 +%rep %3 +%assign %%x 0 +%rep %3*2/mmsize + mova [%1+%%y+%%x], %4 +%assign 
%%x (%%x+mmsize) +%endrep +%assign %%y (%%y+%2) +%endrep +%endmacro + +;------------------------------------------------------------------------------------------- +; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +%macro VP9_IWHT4_1D 0 + SWAP 1, 2, 3 + paddw m0, m2 + psubw m3, m1 + psubw m4, m0, m3 + psraw m4, 1 + psubw m5, m4, m1 + SWAP 5, 1 + psubw m4, m2 + SWAP 4, 2 + psubw m0, m1 + paddw m3, m2 + SWAP 3, 2, 1 +%endmacro + +INIT_MMX mmx +cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob + mova m0, [blockq+0*8] + mova m1, [blockq+1*8] + mova m2, [blockq+2*8] + mova m3, [blockq+3*8] + psraw m0, 2 + psraw m1, 2 + psraw m2, 2 + psraw m3, 2 + + VP9_IWHT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IWHT4_1D + + pxor m4, m4 + VP9_STORE_2X 0, 1, 5, 6, 4 + lea dstq, [dstq+strideq*2] + VP9_STORE_2X 2, 3, 5, 6, 4 + ZERO_BLOCK blockq, 8, 4, m4 + RET + +;------------------------------------------------------------------------------------------- +; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +%macro VP9_IDCT4_1D_FINALIZE 0 + SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0 + SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1 + SWAP 0, 3, 2 ; 3102 -> 0123 +%endmacro + +%macro VP9_IDCT4_1D 0 +%if cpuflag(ssse3) + SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2) + pmulhrsw m2, m6 ; m2=t0 + pmulhrsw m0, m6 ; m0=t1 +%else ; <= sse2 + VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m2=t0 +%endif + VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3 + VP9_IDCT4_1D_FINALIZE +%endmacro + +; 2x2 top left corner +%macro VP9_IDCT4_2x2_1D 0 + pmulhrsw m0, m5 ; m0=t1 + mova m2, m0 ; m2=t0 + mova m3, m1 + pmulhrsw m1, m6 ; m1=t2 + pmulhrsw m3, m7 ; m3=t3 + VP9_IDCT4_1D_FINALIZE +%endmacro + +%macro VP9_IDCT4_WRITEOUT 0 +%if cpuflag(ssse3) + mova m5, [pw_2048] + pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 + pmulhrsw m1, m5 +%else + mova m5, [pw_8] + paddw m0, m5 + paddw m1, m5 + psraw m0, 4 + psraw m1, 4 +%endif + VP9_STORE_2X 0, 1, 6, 7, 4 + lea dstq, [dstq+2*strideq] +%if cpuflag(ssse3) + pmulhrsw m2, m5 + pmulhrsw m3, m5 +%else + paddw m2, m5 + paddw m3, m5 + psraw m2, 4 + psraw m3, 4 +%endif + VP9_STORE_2X 2, 3, 6, 7, 4 +%endmacro + +%macro IDCT_4x4_FN 1 +INIT_MMX %1 +cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob + +%if cpuflag(ssse3) + cmp eobd, 4 ; 2x2 or smaller + jg .idctfull + + cmp eobd, 1 ; faster path for when only DC is set + jne .idct2x2 +%else + cmp eobd, 1 + jg .idctfull +%endif + +%if cpuflag(ssse3) + movd m0, [blockq] + mova m5, [pw_11585x2] + pmulhrsw m0, m5 + pmulhrsw m0, m5 +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (8 << 14) + 8192 + sar coefd, 14 + 4 + movd m0, coefd +%endif + pshufw m0, m0, 0 + pxor m4, m4 + movh [blockq], m4 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 +%endif + VP9_STORE_2X 0, 0, 6, 7, 4 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 0, 0, 6, 7, 4 + RET + +%if cpuflag(ssse3) +; faster path for when only top left 2x2 block is set +.idct2x2: + movd m0, [blockq+0] + movd m1, [blockq+8] + mova m5, [pw_11585x2] + mova m6, [pw_6270x2] + mova m7, [pw_15137x2] + VP9_IDCT4_2x2_1D + ; partial 2x4 
transpose + punpcklwd m0, m1 + punpcklwd m2, m3 + SBUTTERFLY dq, 0, 2, 1 + SWAP 1, 2 + VP9_IDCT4_2x2_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + movh [blockq+ 0], m4 + movh [blockq+ 8], m4 + VP9_IDCT4_WRITEOUT + RET +%endif + +.idctfull: ; generic full 4x4 idct/idct + mova m0, [blockq+ 0] + mova m1, [blockq+ 8] + mova m2, [blockq+16] + mova m3, [blockq+24] +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif + mova m7, [pd_8192] ; rounding + VP9_IDCT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IDCT4_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + mova [blockq+ 0], m4 + mova [blockq+ 8], m4 + mova [blockq+16], m4 + mova [blockq+24], m4 + VP9_IDCT4_WRITEOUT + RET +%endmacro + +IDCT_4x4_FN mmxext +IDCT_4x4_FN ssse3 + +;------------------------------------------------------------------------------------------- +; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +%macro VP9_IADST4_1D 0 + movq2dq xmm0, m0 + movq2dq xmm1, m1 + movq2dq xmm2, m2 + movq2dq xmm3, m3 +%if cpuflag(ssse3) + paddw m3, m0 +%else + paddw xmm6, xmm3, xmm0 + punpcklwd xmm6, xmm2 +%endif + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + pmaddwd xmm1, xmm0, [pw_5283_13377] + pmaddwd xmm4, xmm0, [pw_9929_13377] + pmaddwd xmm0, [pw_15212_m13377] + pmaddwd xmm3, xmm2, [pw_15212_9929] + pmaddwd xmm2, [pw_m5283_m15212] +%if cpuflag(ssse3) + psubw m3, m2 +%else + pmaddwd xmm6, [pw_13377_m13377] +%endif + paddd xmm0, xmm2 + paddd xmm3, xmm5 + paddd xmm2, xmm5 +%if notcpuflag(ssse3) + paddd xmm6, xmm5 +%endif + paddd xmm1, xmm3 + paddd xmm0, xmm3 + paddd xmm4, xmm2 + psrad xmm1, 14 + psrad xmm0, 14 + psrad xmm4, 14 +%if cpuflag(ssse3) + pmulhrsw m3, [pw_13377x2] ; out2 +%else + psrad xmm6, 14 +%endif + packssdw xmm0, xmm0 + packssdw xmm1, xmm1 + packssdw xmm4, xmm4 +%if notcpuflag(ssse3) + packssdw xmm6, xmm6 +%endif + movdq2q m0, xmm0 ; out3 + movdq2q m1, xmm1 ; out0 + movdq2q m2, xmm4 ; out1 +%if notcpuflag(ssse3) + movdq2q m3, xmm6 ; out2 +%endif + SWAP 0, 1, 2, 3 +%endmacro + +%macro IADST4_FN 5 +INIT_MMX %5 +cglobal vp9_%1_%3_4x4_add, 3, 3, 6 + notcpuflag(ssse3), dst, stride, block, eob +%if WIN64 && notcpuflag(ssse3) +WIN64_SPILL_XMM 7 +%endif + movdqa xmm5, [pd_8192] + mova m0, [blockq+ 0] + mova m1, [blockq+ 8] + mova m2, [blockq+16] + mova m3, [blockq+24] +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif +%ifnidn %1%3, iadstiadst + movdq2q m7, xmm5 +%endif + VP9_%2_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_%4_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + mova [blockq+ 0], m4 + mova [blockq+ 8], m4 + mova [blockq+16], m4 + mova [blockq+24], m4 + VP9_IDCT4_WRITEOUT + RET +%endmacro + +IADST4_FN idct, IDCT4, iadst, IADST4, sse2 +IADST4_FN iadst, IADST4, idct, IDCT4, sse2 +IADST4_FN iadst, IADST4, iadst, IADST4, sse2 + +IADST4_FN idct, IDCT4, iadst, IADST4, ssse3 +IADST4_FN iadst, IADST4, idct, IDCT4, ssse3 +IADST4_FN iadst, IADST4, iadst, IADST4, ssse3 + +%macro SCRATCH 3 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova [%3], m%1 +%endif +%endmacro + +%macro UNSCRATCH 3 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%endmacro + +;------------------------------------------------------------------------------------------- +; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +%macro 
VP9_IDCT8_1D_FINALIZE 0 + SUMSUB_BA w, 3, 6, 5 ; m3=t0+t7, m6=t0-t7 + SUMSUB_BA w, 1, 2, 5 ; m1=t1+t6, m2=t1-t6 + SUMSUB_BA w, 7, 0, 5 ; m7=t2+t5, m0=t2-t5 + + UNSCRATCH 5, 8, blockq+ 0 + SCRATCH 2, 8, blockq+ 0 + + SUMSUB_BA w, 5, 4, 2 ; m5=t3+t4, m4=t3-t4 + SWAP 7, 6, 2 + SWAP 3, 5, 0 + +%if ARCH_X86_64 + SWAP 6, 8 +%endif +%endmacro + +; x86-32 +; - in: m0/m4 is in mem +; - out: m6 is in mem +; x86-64: +; - everything is in registers (m0-7) +%macro VP9_IDCT8_1D 0 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 4, 9 +%endif + + VP9_UNPACK_MULSUB_2W_4X 5, 3, 9102, 13623, D_8192_REG, 0, 4 ; m5=t5a, m3=t6a + VP9_UNPACK_MULSUB_2W_4X 1, 7, 16069, 3196, D_8192_REG, 0, 4 ; m1=t4a, m7=t7a + SUMSUB_BA w, 5, 1, 0 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a) + SUMSUB_BA w, 3, 7, 0 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a) +%if cpuflag(ssse3) + SUMSUB_BA w, 1, 7, 0 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5) + pmulhrsw m1, W_11585x2_REG ; m1=t6 + pmulhrsw m7, W_11585x2_REG ; m7=t5 +%else + VP9_UNPACK_MULSUB_2W_4X 7, 1, 11585, 11585, D_8192_REG, 0, 4 +%endif + VP9_UNPACK_MULSUB_2W_4X 2, 6, 15137, 6270, D_8192_REG, 0, 4 ; m2=t2a, m6=t3a + + UNSCRATCH 0, 8, blockq+ 0 ; IN(0) + UNSCRATCH 4, 9, blockq+64 ; IN(4) + SCRATCH 5, 8, blockq+ 0 + +%if cpuflag(ssse3) + SUMSUB_BA w, 4, 0, 5 ; m4=IN(0)+IN(4) m0=IN(0)-IN(4) + pmulhrsw m4, W_11585x2_REG ; m4=t0a + pmulhrsw m0, W_11585x2_REG ; m0=t1a +%else + SCRATCH 7, 9, blockq+64 + VP9_UNPACK_MULSUB_2W_4X 0, 4, 11585, 11585, D_8192_REG, 5, 7 + UNSCRATCH 7, 9, blockq+64 +%endif + SUMSUB_BA w, 6, 4, 5 ; m6=t0a+t3a (t0), m4=t0a-t3a (t3) + SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) + + VP9_IDCT8_1D_FINALIZE +%endmacro + +%macro VP9_IDCT8_4x4_1D 0 + pmulhrsw m0, W_11585x2_REG ; m0=t1a/t0a + pmulhrsw m6, m2, [pw_15137x2] ; m6=t3a + pmulhrsw m2, [pw_6270x2] ; m2=t2a + pmulhrsw m7, m1, [pw_16069x2] ; m7=t7a + pmulhrsw m1, [pw_3196x2] ; m1=t4a + pmulhrsw m5, m3, [pw_9102x2] ; m5=-t5a + pmulhrsw m3, [pw_13623x2] ; m3=t6a + SUMSUB_BA w, 5, 1, 4 ; m1=t4a+t5a (t4), m5=t4a-t5a (t5a) + SWAP 1, 5 + SUMSUB_BA w, 3, 7, 4 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a) + SUMSUB_BA w, 1, 7, 4 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5) + pmulhrsw m1, W_11585x2_REG ; m1=t6 + pmulhrsw m7, W_11585x2_REG ; m7=t5 + psubw m4, m0, m6 ; m4=t0a-t3a (t3) + paddw m6, m0 ; m6=t0a+t3a (t0) + SCRATCH 5, 8, blockq+ 0 + SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) + VP9_IDCT8_1D_FINALIZE +%endmacro + +%macro VP9_IDCT8_2x2_1D 1 + pmulhrsw m0, W_11585x2_REG ; m0=t0 + pmulhrsw m3, m1, W_16069x2_REG ; m3=t7 + pmulhrsw m1, W_3196x2_REG ; m1=t4 + psubw m7, m3, m1 ; t5 = t7a - t4a + paddw m5, m3, m1 ; t6 = t7a + t4a + pmulhrsw m7, W_11585x2_REG ; m7=t5 + pmulhrsw m5, W_11585x2_REG ; m5=t6 + SWAP 5, 1 + ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier + psubw m6, m0, m3 ; m6=t0-t7 + paddw m3, m0 ; m3=t0+t7 + psubw m2, m0, m1 ; m2=t1-t6 + paddw m1, m0 ; m1=t1+t6 +%if %1 == 1 + punpcklwd m3, m1 +%define SCRATCH_REG 1 +%elif ARCH_X86_32 + mova [blockq+ 0], m2 +%define SCRATCH_REG 2 +%else +%define SCRATCH_REG 8 +%endif + psubw m4, m0, m5 ; m4=t3-t4 + paddw m5, m0 ; m5=t3+t4 + SUMSUB_BA w, 7, 0, SCRATCH_REG ; m7=t2+t5, m0=t2-t5 + SWAP 7, 6, 2 + SWAP 3, 5, 0 +%undef SCRATCH_REG +%endmacro + +%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift +%if cpuflag(ssse3) + pmulhrsw m%1, %6 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 + pmulhrsw m%2, %6 +%else + paddw m%1, %6 + paddw m%2, %6 + psraw m%1, %7 + psraw m%2, %7 +%endif +%if %0 <= 7 + VP9_STORE_2X %1, %2, %3, %4, %5 +%else + VP9_STORE_2X %1, %2, 
%3, %4, %5, %8 +%endif +%endmacro + +; x86-32: +; - m6 is in mem +; x86-64: +; - m8 holds m6 (SWAP) +; - m6 holds zero +%macro VP9_IDCT8_WRITEOUT 0 +%if ARCH_X86_64 +%if cpuflag(ssse3) + mova m9, [pw_1024] +%else + mova m9, [pw_16] +%endif +%define ROUND_REG m9 +%else +%if cpuflag(ssse3) +%define ROUND_REG [pw_1024] +%else +%define ROUND_REG [pw_16] +%endif +%endif + SCRATCH 5, 10, blockq+16 + SCRATCH 7, 11, blockq+32 + VP9_IDCT8_WRITEx2 0, 1, 5, 7, 6, ROUND_REG + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 2, 3, 5, 7, 6, ROUND_REG + lea dstq, [dstq+2*strideq] + UNSCRATCH 5, 10, blockq+16 + UNSCRATCH 7, 11, blockq+32 + VP9_IDCT8_WRITEx2 4, 5, 0, 1, 6, ROUND_REG + lea dstq, [dstq+2*strideq] + UNSCRATCH 5, 8, blockq+ 0 + VP9_IDCT8_WRITEx2 5, 7, 0, 1, 6, ROUND_REG + +%undef ROUND_REG +%endmacro + +%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2 +INIT_XMM %1 +cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob + +%if cpuflag(ssse3) +%if ARCH_X86_64 + mova m12, [pw_11585x2] ; often used +%define W_11585x2_REG m12 +%else +%define W_11585x2_REG [pw_11585x2] +%endif + + cmp eobd, 12 ; top left half or less + jg .idctfull + + cmp eobd, 3 ; top left corner or less + jg .idcthalf + + cmp eobd, 1 ; faster path for when only DC is set + jne .idcttopleftcorner +%else + cmp eobd, 1 + jg .idctfull +%endif + +%if cpuflag(ssse3) + movd m0, [blockq] + pmulhrsw m0, W_11585x2_REG + pmulhrsw m0, W_11585x2_REG +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (16 << 14) + 8192 + sar coefd, 14 + 5 + movd m0, coefd +%endif + SPLATW m0, m0, 0 + pxor m4, m4 + movd [blockq], m4 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_1024] ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 +%endif +%rep 3 + VP9_STORE_2X 0, 0, 6, 7, 4 + lea dstq, [dstq+2*strideq] +%endrep + VP9_STORE_2X 0, 0, 6, 7, 4 + RET +
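+; (the dc-only path above is, in C terms, two Q14 roundings followed by +; the final write-out bias of the 8x8 path: +; dc = (in * 11585 + 8192) >> 14; +; dc = (dc * 11585 + 8192) >> 14; +; out = (dc + 16) >> 5; +; the sse2 branch folds the last two steps into a single add/sar pair) +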
+%if cpuflag(ssse3) +; faster path for when only the top-left corner is set (3 inputs: DC, +; the coef right of DC and the coef below DC); note: this also works +; with a 2x2 block +.idcttopleftcorner: + movd m0, [blockq+0] + movd m1, [blockq+16] +%if ARCH_X86_64 + mova m10, [pw_3196x2] + mova m11, [pw_16069x2] +%define W_3196x2_REG m10 +%define W_16069x2_REG m11 +%else +%define W_3196x2_REG [pw_3196x2] +%define W_16069x2_REG [pw_16069x2] +%endif + VP9_IDCT8_2x2_1D 1 + ; partial 2x8 transpose + ; punpcklwd m0, m1 already done inside idct + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 + punpckldq m0, m2 + punpckldq m4, m6 + SBUTTERFLY qdq, 0, 4, 1 + SWAP 1, 4 + VP9_IDCT8_2x2_1D 2 +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 ; used for the block reset, and VP9_STORE_2X + VP9_IDCT8_WRITEOUT +%if ARCH_X86_64 + movd [blockq+ 0], m6 + movd [blockq+16], m6 +%else + mova [blockq+ 0], m6 + mova [blockq+16], m6 + mova [blockq+32], m6 +%endif + RET + +.idcthalf: + movh m0, [blockq + 0] + movh m1, [blockq +16] + movh m2, [blockq +32] + movh m3, [blockq +48] + VP9_IDCT8_4x4_1D + ; partial 4x8 transpose +%if ARCH_X86_32 + mova m6, [blockq+ 0] +%endif + punpcklwd m0, m1 + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 + SBUTTERFLY dq, 0, 2, 1 + SBUTTERFLY dq, 4, 6, 5 + SBUTTERFLY qdq, 0, 4, 1 + SBUTTERFLY qdq, 2, 6, 5 + SWAP 1, 4 + SWAP 3, 6 + VP9_IDCT8_4x4_1D +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 + VP9_IDCT8_WRITEOUT +%if ARCH_X86_64 + movh [blockq+ 0], m6 + movh [blockq+16], m6 + movh [blockq+32], m6 +%else + mova [blockq+ 0], m6 + mova [blockq+16], m6 + mova [blockq+32], m6 +%endif + movh [blockq+48], m6 + RET +%endif + +.idctfull: ; generic full 8x8 idct/idct +%if ARCH_X86_64 + mova m0, [blockq+ 0] ; IN(0) +%endif + mova m1, [blockq+ 16] ; IN(1) + mova m2, [blockq+ 32] ; IN(2) + mova m3, [blockq+ 48] ; IN(3) +%if ARCH_X86_64 + mova m4, [blockq+ 64] ; IN(4) +%endif + mova m5, [blockq+ 80] ; IN(5) + mova m6, [blockq+ 96] ; IN(6) + mova m7, [blockq+112] ; IN(7) +%if ARCH_X86_64 + mova m11, [pd_8192] ; rounding +%define D_8192_REG m11 +%else +%define D_8192_REG [pd_8192] +%endif + VP9_IDCT8_1D +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1 + mova [blockq+0], m0 +%endif + VP9_IDCT8_1D + +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 ; used for the block reset, and VP9_STORE_2X + VP9_IDCT8_WRITEOUT + ZERO_BLOCK blockq, 16, 8, m6 + RET +%undef W_11585x2_REG +%endmacro + +VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12 +VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13 +VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13 + +;--------------------------------------------------------------------------------------------- +; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +; x86-32: +; - in: m0/3/4/7 are in mem [blockq+N*16] +; - out: m6 is in mem [blockq+0] +; x86-64: +; - everything is in registers +%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 3, 9 + SWAP 4, 10 + SWAP 7, 11 +%endif + + VP9_UNPACK_MULSUB_2D_4X 5, 2, 0, 3, 14449, 7723 ; m5/2=t3[d], m2/4=t2[d] + VP9_UNPACK_MULSUB_2D_4X 1, 6, 4, 7, 4756, 15679 ; m1/4=t7[d], m6/7=t6[d] + SCRATCH 4, 12, blockq+1*16 + VP9_RND_SH_SUMSUB_BA 6, 2, 7, 3, 4, D_8192_REG ; m6=t2[w], m2=t6[w] + UNSCRATCH 4, 12, blockq+1*16 + VP9_RND_SH_SUMSUB_BA 1, 5, 4, 0, 3, D_8192_REG ; m1=t3[w], m5=t7[w] + + UNSCRATCH 0, 8, blockq+16*0 + UNSCRATCH 3, 9, blockq+16*3 + UNSCRATCH 4, 10, blockq+16*4 + UNSCRATCH 7, 11, blockq+16*7 + SCRATCH 1, 8, blockq+16*1 + SCRATCH 
2, 9, blockq+16*2 + SCRATCH 5, 10, blockq+16*5 + SCRATCH 6, 11, blockq+16*6 + + VP9_UNPACK_MULSUB_2D_4X 7, 0, 1, 2, 16305, 1606 ; m7/1=t1[d], m0/2=t0[d] + VP9_UNPACK_MULSUB_2D_4X 3, 4, 5, 6, 10394, 12665 ; m3/5=t5[d], m4/6=t4[d] + SCRATCH 1, 12, blockq+ 0*16 + VP9_RND_SH_SUMSUB_BA 4, 0, 6, 2, 1, D_8192_REG ; m4=t0[w], m0=t4[w] + UNSCRATCH 1, 12, blockq+ 0*16 + VP9_RND_SH_SUMSUB_BA 3, 7, 5, 1, 2, D_8192_REG ; m3=t1[w], m7=t5[w] + + UNSCRATCH 2, 9, blockq+16*2 + UNSCRATCH 5, 10, blockq+16*5 + SCRATCH 3, 9, blockq+16*3 + SCRATCH 4, 10, blockq+16*4 + + ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7 + + VP9_UNPACK_MULSUB_2D_4X 0, 7, 1, 3, 15137, 6270 ; m0/1=t5[d], m7/3=t4[d] + VP9_UNPACK_MULSUB_2D_4X 5, 2, 4, 6, 6270, 15137 ; m5/4=t6[d], m2/6=t7[d] + SCRATCH 1, 12, blockq+ 0*16 + VP9_RND_SH_SUMSUB_BA 5, 7, 4, 3, 1, D_8192_REG + UNSCRATCH 1, 12, blockq+ 0*16 + PSIGNW m5, W_M1_REG ; m5=out1[w], m7=t6[w] + VP9_RND_SH_SUMSUB_BA 2, 0, 6, 1, 3, D_8192_REG ; m2=out6[w], m0=t7[w] + + UNSCRATCH 1, 8, blockq+16*1 + UNSCRATCH 3, 9, blockq+16*3 + UNSCRATCH 4, 10, blockq+16*4 + UNSCRATCH 6, 11, blockq+16*6 + SCRATCH 2, 8, blockq+16*0 + + SUMSUB_BA w, 6, 4, 2 ; m6=out0[w], m4=t2[w] + SUMSUB_BA w, 1, 3, 2 + PSIGNW m1, W_M1_REG ; m1=out7[w], m3=t3[w] + + ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7 + + ; unfortunately, the code below overflows in some cases +%if 0; cpuflag(ssse3) + SUMSUB_BA w, 3, 4, 2 + SUMSUB_BA w, 0, 7, 2 + pmulhrsw m3, W_11585x2_REG + pmulhrsw m7, W_11585x2_REG + pmulhrsw m4, W_11585x2_REG ; out4 + pmulhrsw m0, W_11585x2_REG ; out2 +%else + SCRATCH 5, 9, blockq+16*1 + VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, D_8192_REG, 2, 5 + VP9_UNPACK_MULSUB_2W_4X 7, 0, 11585, 11585, D_8192_REG, 2, 5 + UNSCRATCH 5, 9, blockq+16*1 +%endif + PSIGNW m3, W_M1_REG ; out3 + PSIGNW m7, W_M1_REG ; out5 + + ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7 + +%if ARCH_X86_64 + SWAP 2, 8 +%endif + SWAP 0, 6, 2 + SWAP 7, 1, 5 +%endmacro + +%macro IADST8_FN 6 +INIT_XMM %5 +cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob + +%ifidn %1, idct +%define first_is_idct 1 +%else +%define first_is_idct 0 +%endif + +%ifidn %3, idct +%define second_is_idct 1 +%else +%define second_is_idct 0 +%endif + +%if ARCH_X86_64 + mova m0, [blockq+ 0] ; IN(0) +%endif + mova m1, [blockq+ 16] ; IN(1) + mova m2, [blockq+ 32] ; IN(2) +%if ARCH_X86_64 || first_is_idct + mova m3, [blockq+ 48] ; IN(3) +%endif +%if ARCH_X86_64 + mova m4, [blockq+ 64] ; IN(4) +%endif + mova m5, [blockq+ 80] ; IN(5) + mova m6, [blockq+ 96] ; IN(6) +%if ARCH_X86_64 || first_is_idct + mova m7, [blockq+112] ; IN(7) +%endif +%if ARCH_X86_64 +%if cpuflag(ssse3) + mova m15, [pw_11585x2] ; often used +%endif + mova m13, [pd_8192] ; rounding + mova m14, [pw_m1] +%define W_11585x2_REG m15 +%define D_8192_REG m13 +%define W_M1_REG m14 +%else +%define W_11585x2_REG [pw_11585x2] +%define D_8192_REG [pd_8192] +%define W_M1_REG [pw_m1] +%endif + + ; note different calling conventions for idct8 vs. 
iadst8 on x86-32 + VP9_%2_1D +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1 + mova [blockq+ 0], m0 +%if second_is_idct == 0 + mova [blockq+ 48], m3 + mova [blockq+112], m7 +%endif +%endif + VP9_%4_1D + +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 ; used for the block reset, and VP9_STORE_2X + VP9_IDCT8_WRITEOUT + ZERO_BLOCK blockq, 16, 8, m6 + RET + +%undef W_11585x2_REG +%undef first_is_idct +%undef second_is_idct + +%endmacro + +%define PSIGNW PSIGNW_MMX +IADST8_FN idct, IDCT8, iadst, IADST8, sse2, 15 +IADST8_FN iadst, IADST8, idct, IDCT8, sse2, 15 +IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15 +%define PSIGNW PSIGNW_SSSE3 +IADST8_FN idct, IDCT8, iadst, IADST8, ssse3, 16 +IADST8_FN idct, IDCT8, iadst, IADST8, avx, 16 +IADST8_FN iadst, IADST8, idct, IDCT8, ssse3, 16 +IADST8_FN iadst, IADST8, idct, IDCT8, avx, 16 +IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16 +IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 +%undef PSIGNW + +;--------------------------------------------------------------------------------------------- +; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +; x86-64: +; at the end of this macro, m7 is stored in [%4+15*%5] +; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15 +; the following sumsubs have not been done yet: +; SUMSUB_BA w, 6, 9, 15 ; t6, t9 +; SUMSUB_BA w, 7, 8, 15 ; t7, t8 +; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1, +; and the following sumsubs have not been done yet: +; SUMSUB_BA w, x13, x14, 7 ; t6, t9 +; SUMSUB_BA w, x15, x12, 7 ; t7, t8 + +%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst +%if %2 <= 4 + mova m3, [%1+ 1*%3] ; IN(1) + mova m0, [%1+ 3*%3] ; IN(3) + + pmulhrsw m4, m3, [pw_16305x2] ; t14-15 + pmulhrsw m3, [pw_1606x2] ; t8-9 + pmulhrsw m7, m0, [pw_m4756x2] ; t10-11 + pmulhrsw m0, [pw_15679x2] ; t12-13 + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 + ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 + + VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 1, 6 ; t9, t14 + SCRATCH 4, 10, %4+ 1*%5 + SCRATCH 5, 11, %4+ 7*%5 + VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13 + UNSCRATCH 5, 11, %4+ 7*%5 + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 +%else + mova m5, [%1+ 1*%3] ; IN(1) + mova m4, [%1+ 7*%3] ; IN(7) +%if %2 <= 8 + pmulhrsw m2, m5, [pw_16305x2] ; t15 + pmulhrsw m5, [pw_1606x2] ; t8 + pmulhrsw m3, m4, [pw_m10394x2] ; t9 + pmulhrsw m4, [pw_12665x2] ; t14 +%else + mova m3, [%1+ 9*%3] ; IN(9) + mova m2, [%1+15*%3] ; IN(15) + + ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7 + ; m11=in8, m3=in9, m12=in10, m0=in11, m8=in12, m1=in13, m13=in14, m2=in15 + + VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 0, 1 ; t8, t15 + VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 0, 1 ; t9, t14 +%endif + + SUMSUB_BA w, 3, 5, 0 ; t8, t9 + SUMSUB_BA w, 4, 2, 0 ; t15, t14 + + VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 0, 1 ; t9, t14 + + SCRATCH 4, 10, %4+ 1*%5 + SCRATCH 5, 11, %4+ 7*%5 + + mova m6, [%1+ 3*%3] ; IN(3) + mova m7, [%1+ 5*%3] ; IN(5) +%if %2 <= 8 + pmulhrsw m0, m7, [pw_14449x2] ; t13 + pmulhrsw m7, [pw_7723x2] ; t10 + pmulhrsw m1, m6, [pw_m4756x2] ; t11 + 
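; (a worked note on the pw_*x2 constants in these small-eob branches: +; pmulhrsw computes (x*y + (1 << 14)) >> 15, so multiplying by coef*2 +; matches the full-precision form t = (in * coef + 8192) >> 14; with +; only one nonzero input per rotation, each butterfly collapses to two +; such multiplies) + 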
pmulhrsw m6, [pw_15679x2] ; t12 +%else + mova m0, [%1+11*%3] ; IN(11) + mova m1, [%1+13*%3] ; IN(13) + + VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 4, 5 ; t10, t13 + VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 4, 5 ; t11, t12 +%endif + + ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7 + ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15 + + SUMSUB_BA w, 7, 1, 4 ; t11, t10 + SUMSUB_BA w, 0, 6, 4 ; t12, t13 + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 + ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 + + VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13 + + UNSCRATCH 5, 11, %4+ 7*%5 +%endif + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7 + ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15 + + SUMSUB_BA w, 7, 3, 4 ; t8, t11 + + ; backup first register + mova [%4+15*%5], m7 + + SUMSUB_BA w, 6, 2, 7 ; t9, t10 + UNSCRATCH 4, 10, %4+ 1*%5 + SUMSUB_BA w, 0, 4, 7 ; t15, t12 + SUMSUB_BA w, 1, 5, 7 ; t14, t13 + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 + +%if cpuflag(ssse3) && %6 == 0 + SUMSUB_BA w, 2, 5, 7 + SUMSUB_BA w, 3, 4, 7 + pmulhrsw m5, [pw_11585x2] ; t10 + pmulhrsw m4, [pw_11585x2] ; t11 + pmulhrsw m3, [pw_11585x2] ; t12 + pmulhrsw m2, [pw_11585x2] ; t13 +%else + SCRATCH 6, 10, %4+ 1*%5 + VP9_UNPACK_MULSUB_2W_4X 5, 2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13 + VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12 + UNSCRATCH 6, 10, %4+ 1*%5 +%endif + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15 + + SCRATCH 0, 8, %4+ 1*%5 + SCRATCH 1, 9, %4+ 3*%5 + SCRATCH 2, 10, %4+ 5*%5 + SCRATCH 3, 11, %4+ 7*%5 + SCRATCH 4, 12, %4+ 9*%5 + SCRATCH 5, 13, %4+11*%5 + SCRATCH 6, 14, %4+13*%5 + + ; even (tx8x8) +%if %2 <= 4 + mova m3, [%1+ 0*%3] ; IN(0) + mova m4, [%1+ 2*%3] ; IN(2) + + pmulhrsw m3, [pw_11585x2] ; t0-t3 + pmulhrsw m7, m4, [pw_16069x2] ; t6-7 + pmulhrsw m4, [pw_3196x2] ; t4-5 + + paddw m6, m7, m4 + psubw m5, m7, m4 + pmulhrsw m5, [pw_11585x2] ; t5 + pmulhrsw m6, [pw_11585x2] ; t6 + + psubw m0, m3, m7 + paddw m7, m3 + psubw m1, m3, m6 + paddw m6, m3 + psubw m2, m3, m5 + paddw m5, m3 + +%if ARCH_X86_32 + SWAP 0, 7 +%endif + SCRATCH 7, 15, %4+12*%5 +%else + mova m6, [%1+ 2*%3] ; IN(2) + mova m1, [%1+ 4*%3] ; IN(4) + mova m7, [%1+ 6*%3] ; IN(6) +%if %2 <= 8 + pmulhrsw m0, m1, [pw_15137x2] ; t3 + pmulhrsw m1, [pw_6270x2] ; t2 + pmulhrsw m5, m6, [pw_16069x2] ; t7 + pmulhrsw m6, [pw_3196x2] ; t4 + pmulhrsw m4, m7, [pw_m9102x2] ; t5 + pmulhrsw m7, [pw_13623x2] ; t6 +%else + mova m4, [%1+10*%3] ; IN(10) + mova m0, [%1+12*%3] ; IN(12) + mova m5, [%1+14*%3] ; IN(14) + + VP9_UNPACK_MULSUB_2W_4X 1, 0, 15137, 6270, [pd_8192], 2, 3 ; t2, t3 + VP9_UNPACK_MULSUB_2W_4X 6, 5, 16069, 3196, [pd_8192], 2, 3 ; t4, t7 + VP9_UNPACK_MULSUB_2W_4X 4, 7, 9102, 13623, [pd_8192], 2, 3 ; t5, t6 +%endif + + SUMSUB_BA w, 4, 6, 2 ; t4, t5 + SUMSUB_BA w, 7, 5, 2 ; t7, t6 + +%if cpuflag(ssse3) && %6 == 0 + SUMSUB_BA w, 6, 5, 2 + pmulhrsw m5, [pw_11585x2] ; t5 + pmulhrsw m6, [pw_11585x2] ; t6 +%else + VP9_UNPACK_MULSUB_2W_4X 5, 6, 11585, 11585, [pd_8192], 2, 3 ; t5, t6 +%endif + + SCRATCH 5, 15, %4+10*%5 + mova m2, [%1+ 0*%3] ; IN(0) +%if %2 <= 8 + pmulhrsw m2, [pw_11585x2] ; t0 and t1 + psubw m3, m2, m0 + paddw m0, m2 + + SUMSUB_BA w, 7, 0, 5 ; t0, t7 +%else + mova m3, [%1+ 8*%3] ; IN(8) + + ; from 3 stages 
back +%if cpuflag(ssse3) && %6 == 0 + SUMSUB_BA w, 3, 2, 5 + pmulhrsw m3, [pw_11585x2] ; t0 + pmulhrsw m2, [pw_11585x2] ; t1 +%else + mova [%1+ 0*%3], m0 + VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 0 ; t0, t1 + mova m0, [%1+ 0*%3] +%endif + + ; from 2 stages back + SUMSUB_BA w, 0, 3, 5 ; t0, t3 + + SUMSUB_BA w, 7, 0, 5 ; t0, t7 +%endif + UNSCRATCH 5, 15, %4+10*%5 +%if ARCH_X86_32 + SWAP 0, 7 +%endif + SCRATCH 7, 15, %4+12*%5 + SUMSUB_BA w, 1, 2, 7 ; t1, t2 + + ; from 1 stage back + SUMSUB_BA w, 6, 1, 7 ; t1, t6 + SUMSUB_BA w, 5, 2, 7 ; t2, t5 +%endif + SUMSUB_BA w, 4, 3, 7 ; t3, t4 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 + SWAP 5, 13 + SWAP 6, 14 + + SUMSUB_BA w, 0, 15, 7 ; t0, t15 + SUMSUB_BA w, 1, 14, 7 ; t1, t14 + SUMSUB_BA w, 2, 13, 7 ; t2, t13 + SUMSUB_BA w, 3, 12, 7 ; t3, t12 + SUMSUB_BA w, 4, 11, 7 ; t4, t11 + SUMSUB_BA w, 5, 10, 7 ; t5, t10 +%else + SWAP 1, 6 + SWAP 2, 5 + SWAP 3, 4 + mova [%4+14*%5], m6 + +%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride + mova m6, [%4+%2*%5] + SUMSUB_BA w, 6, %1, 7 + SWAP %1, 6 + mova [%4+%3*%5], m6 +%endmacro + + %%SUMSUB_BA_STORE 0, 1, 1, %4, %5 ; t0, t15 + %%SUMSUB_BA_STORE 1, 3, 3, %4, %5 ; t1, t14 + %%SUMSUB_BA_STORE 2, 5, 5, %4, %5 ; t2, t13 + %%SUMSUB_BA_STORE 3, 7, 7, %4, %5 ; t3, t12 + %%SUMSUB_BA_STORE 4, 9, 9, %4, %5 ; t4, t11 + %%SUMSUB_BA_STORE 5, 11, 11, %4, %5 ; t5, t10 +%endif +%endmacro + +%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst +%if %2 == 1 + VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4 + +%if ARCH_X86_64 + ; backup a different register + mova m7, [tmpq+15*16] + mova [tmpq+ 1*16], m15 + + SUMSUB_BA w, 6, 9, 15 ; t6, t9 + SUMSUB_BA w, 7, 8, 15 ; t7, t8 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15 + mova [tmpq+ 0], m0 + mova [tmpq+ 32], m1 + mova [tmpq+ 64], m2 + mova [tmpq+ 96], m3 + mova [tmpq+128], m4 + mova [tmpq+160], m5 + mova [tmpq+192], m6 + mova [tmpq+224], m7 + + mova m15, [tmpq+ 1*16] + TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 + mova [tmpq+ 16], m8 + mova [tmpq+ 48], m9 + mova [tmpq+ 80], m10 + mova [tmpq+112], m11 + mova [tmpq+144], m12 + mova [tmpq+176], m13 + mova [tmpq+208], m14 + mova [tmpq+240], m15 +%else + mova m6, [tmpq+13*16] + mova m7, [tmpq+14*16] + SUMSUB_BA w, 6, 7 ; t6, t9 + mova [tmpq+14*16], m6 + mova [tmpq+13*16], m7 + mova m7, [tmpq+15*16] + mova m6, [tmpq+12*16] + SUMSUB_BA w, 7, 6 ; t7, t8 + mova [tmpq+15*16], m6 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1 + mova [tmpq+ 0*16], m0 + mova [tmpq+ 2*16], m1 + mova [tmpq+ 4*16], m2 + mova [tmpq+ 6*16], m3 + mova [tmpq+10*16], m5 + mova [tmpq+12*16], m6 + mova [tmpq+14*16], m7 + + mova m0, [tmpq+15*16] + mova m1, [tmpq+13*16] + mova m2, [tmpq+11*16] + mova m3, [tmpq+ 9*16] + mova m4, [tmpq+ 7*16] + mova m5, [tmpq+ 5*16] + mova m7, [tmpq+ 1*16] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1 + mova [tmpq+ 1*16], m0 + mova [tmpq+ 3*16], m1 + mova [tmpq+ 5*16], m2 + mova [tmpq+ 7*16], m3 + mova [tmpq+11*16], m5 + mova [tmpq+13*16], m6 + mova [tmpq+15*16], m7 +%endif +%else ; %2 == 2 + VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4 + +%if cpuflag(ssse3) +%define ROUND_REG [pw_512] +%else +%define ROUND_REG [pw_32] +%endif + + pxor m7, m7 +%if ARCH_X86_64 + ; backup more registers + mova [%1+ 2*32], m8 + mova [%1+ 3*32], m9 + + VP9_IDCT8_WRITEx2 0, 1, 8, 9, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 2, 3, 8, 9, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 4, 5, 8, 9, 
7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + ; restore from cache + SWAP 0, 7 ; move zero from m7 to m0 + mova m7, [%1+15*32] + mova m8, [%1+ 2*32] + mova m9, [%1+ 3*32] + + SUMSUB_BA w, 6, 9, 3 ; t6, t9 + SUMSUB_BA w, 7, 8, 3 ; t7, t8 + + VP9_IDCT8_WRITEx2 6, 7, 3, 4, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 8, 9, 3, 4, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 10, 11, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 12, 13, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 14, 15, 1, 2, 0, ROUND_REG, 6 +%else + mova [tmpq+ 0*32], m5 + + VP9_IDCT8_WRITEx2 0, 1, 5, 6, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 2, 3, 5, 6, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + SWAP 0, 7 ; move zero from m7 to m0 + mova m5, [tmpq+ 0*32] + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m4, [tmpq+13*32] + mova m7, [tmpq+14*32] + mova m5, [tmpq+15*32] + mova m6, [tmpq+12*32] + SUMSUB_BADC w, 4, 7, 5, 6, 1 + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m4, [tmpq+11*32] + mova m5, [tmpq+ 9*32] + mova m6, [tmpq+ 7*32] + mova m7, [tmpq+ 5*32] + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m4, [tmpq+ 3*32] + mova m5, [tmpq+ 1*32] + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] +%endif + +%undef ROUND_REG +%endif ; %2 == 1/2 +%endmacro + +%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride + mova m%3, [dstq] + mova m%5, [dstq+%7] + punpcklbw m%2, m%3, m%6 + punpckhbw m%3, m%6 + punpcklbw m%4, m%5, m%6 + punpckhbw m%5, m%6 + paddw m%2, m%1 + paddw m%3, m%1 + paddw m%4, m%1 + paddw m%5, m%1 + packuswb m%2, m%3 + packuswb m%4, m%5 + mova [dstq], m%2 + mova [dstq+%7], m%4 +%endmacro + +%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1 +INIT_XMM %1 +cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob +%if cpuflag(ssse3) + ; 2x2=eob=3, 4x4=eob=10 + cmp eobd, 38 + jg .idctfull + cmp eobd, 1 ; faster path for when only DC is set + jne .idct8x8 +%else + cmp eobd, 1 ; faster path for when only DC is set + jg .idctfull +%endif + + ; dc-only +%if cpuflag(ssse3) + movd m0, [blockq] + mova m1, [pw_11585x2] + pmulhrsw m0, m1 + pmulhrsw m0, m1 +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (32 << 14) + 8192 + sar coefd, 14 + 6 + movd m0, coefd +%endif + SPLATW m0, m0, q0000 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_512] +%endif + pxor m5, m5 + movd [blockq], m5 +%rep 7 + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 + lea dstq, [dstq+2*strideq] +%endrep + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 + RET + + DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp +%if cpuflag(ssse3) +.idct8x8: + mov tmpq, rsp + VP9_IDCT16_1D blockq, 1, 8, 0 + + mov cntd, 2 + mov dst_bakq, dstq +.loop2_8x8: + VP9_IDCT16_1D tmpq, 2, 8, 0 + lea dstq, [dst_bakq+8] + add tmpq, 16 + dec cntd + jg .loop2_8x8 + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 8, m0 + RET +%endif + +.idctfull: + mov cntd, 2 + mov tmpq, rsp +.loop1_full: + VP9_IDCT16_1D blockq, 1, 16, 0 + add blockq, 16 + add tmpq, 256 + dec cntd + jg .loop1_full + sub blockq, 32 + 
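+; (a C-level sketch of the two-pass layout, with idct16_1d_cols and +; idct16_1d_rows as stand-in names and the pointer stepping simplified: +; pass 1 above transformed the columns of each 8-wide half of the coef +; block into the stack buffer, pass 2 below transforms the rows and +; adds the (x + 32) >> 6 rounded result to dst: +; for (i = 0; i < 2; i++) idct16_1d_cols(block + 8 * i, tmp + 8 * i); +; for (i = 0; i < 2; i++) idct16_1d_rows(tmp + 8 * i, dst + 8 * i); +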
+ mov cntd, 2 + mov tmpq, rsp + mov dst_bakq, dstq +.loop2_full: + VP9_IDCT16_1D tmpq, 2, 16, 0 + lea dstq, [dst_bakq+8] + add tmpq, 16 + dec cntd + jg .loop2_full + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +VP9_IDCT_IDCT_16x16_ADD_XMM sse2 +VP9_IDCT_IDCT_16x16_ADD_XMM ssse3 +VP9_IDCT_IDCT_16x16_ADD_XMM avx + +;--------------------------------------------------------------------------------------------- +; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +%macro VP9_IADST16_1D 2 ; src, pass +%assign %%str 16*%2 + mova m0, [%1+ 0*32] ; in0 + mova m1, [%1+15*32] ; in15 + mova m2, [%1+ 7*32] ; in7 + mova m3, [%1+ 8*32] ; in8 + + VP9_UNPACK_MULSUB_2D_4X 1, 0, 4, 5, 16364, 804 ; m1/4=t1[d], m0/5=t0[d] + VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 6, 11003, 12140 ; m2/7=t9[d], m3/6=t8[d] + SCRATCH 4, 8, tmpq+ 0*%%str + VP9_RND_SH_SUMSUB_BA 3, 0, 6, 5, 4, [pd_8192] ; m3=t0[w], m0=t8[w] + UNSCRATCH 4, 8, tmpq+ 0*%%str + VP9_RND_SH_SUMSUB_BA 2, 1, 7, 4, 5, [pd_8192] ; m2=t1[w], m1=t9[w] + + SCRATCH 0, 10, tmpq+ 0*%%str + SCRATCH 1, 11, tmpq+15*%%str + mova [tmpq+ 7*%%str], m2 + mova [tmpq+ 8*%%str], m3 + + mova m1, [%1+ 2*32] ; in2 + mova m0, [%1+13*32] ; in13 + mova m3, [%1+ 5*32] ; in5 + mova m2, [%1+10*32] ; in10 + + VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 15893, 3981 ; m0/6=t3[d], m1/7=t2[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d] + SCRATCH 4, 12, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 2, 1, 5, 7, 4, [pd_8192] ; m2=t2[w], m1=t10[w] + UNSCRATCH 4, 12, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 3, 0, 4, 6, 5, [pd_8192] ; m3=t3[w], m0=t11[w] + + SCRATCH 0, 12, tmpq+ 2*%%str + SCRATCH 1, 13, tmpq+13*%%str + mova [tmpq+ 5*%%str], m2 + mova [tmpq+10*%%str], m3 + + mova m2, [%1+ 4*32] ; in4 + mova m3, [%1+11*32] ; in11 + mova m0, [%1+ 3*32] ; in3 + mova m1, [%1+12*32] ; in12 + + VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 14811, 7005 ; m3/7=t5[d], m2/6=t4[d] + VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 5520, 15426 ; m0/4=t13[d], m1/5=t12[d] + SCRATCH 4, 9, tmpq+ 4*%%str + VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t4[w], m2=t12[w] + UNSCRATCH 4, 9, tmpq+ 4*%%str + VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t5[w], m3=t13[w] + + SCRATCH 0, 8, tmpq+ 4*%%str + mova [tmpq+11*%%str], m1 ; t4:m1->r11 + UNSCRATCH 0, 10, tmpq+ 0*%%str + UNSCRATCH 1, 11, tmpq+15*%%str + + ; round 2 interleaved part 1 + VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 5, 4, 3196, 16069 ; m3/5=t12[d], m2/4=t13[d] + SCRATCH 4, 9, tmpq+ 3*%%str + VP9_RND_SH_SUMSUB_BA 3, 1, 5, 7, 4, [pd_8192] ; m3=t8[w], m1=t12[w] + UNSCRATCH 4, 9, tmpq+ 3*%%str + VP9_RND_SH_SUMSUB_BA 2, 0, 4, 6, 5, [pd_8192] ; m2=t9[w], m0=t13[w] + + SCRATCH 0, 10, tmpq+ 0*%%str + SCRATCH 1, 11, tmpq+15*%%str + SCRATCH 2, 14, tmpq+ 3*%%str + SCRATCH 3, 15, tmpq+12*%%str + + mova m2, [%1+ 6*32] ; in6 + mova m3, [%1+ 9*32] ; in9 + mova m0, [%1+ 1*32] ; in1 + mova m1, [%1+14*32] ; in14 + + VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d] + VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 2404, 16207 ; m0/4=t15[d], m1/5=t14[d] + SCRATCH 4, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t6[w], m2=t14[w] + UNSCRATCH 4, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t7[w], m3=t15[w] + + ; r8=t0, 
r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7 + ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15 + + UNSCRATCH 4, 12, tmpq+ 2*%%str + UNSCRATCH 5, 13, tmpq+13*%%str + SCRATCH 0, 12, tmpq+ 1*%%str + SCRATCH 1, 13, tmpq+14*%%str + + ; remainder of round 2 (rest of t8-15) + VP9_UNPACK_MULSUB_2D_4X 5, 4, 6, 7, 9102, 13623 ; m5/6=t11[d], m4/7=t10[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 1, 0, 13623, 9102 ; m3/1=t14[d], m2/0=t15[d] + SCRATCH 0, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 3, 4, 1, 7, 0, [pd_8192] ; m3=t10[w], m4=t14[w] + UNSCRATCH 0, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 2, 5, 0, 6, 1, [pd_8192] ; m2=t11[w], m5=t15[w] + + ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15 + + UNSCRATCH 6, 14, tmpq+ 3*%%str + UNSCRATCH 7, 15, tmpq+12*%%str + + SUMSUB_BA w, 3, 7, 1 + PSIGNW m3, [pw_m1] ; m3=out1[w], m7=t10[w] + SUMSUB_BA w, 2, 6, 1 ; m2=out14[w], m6=t11[w] + + ; unfortunately, the code below overflows in some cases, e.g. + ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm +%if 0; cpuflag(ssse3) + SUMSUB_BA w, 7, 6, 1 + pmulhrsw m7, [pw_11585x2] ; m7=out6[w] + pmulhrsw m6, [pw_11585x2] ; m6=out9[w] +%else + VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, 11585, [pd_8192], 1, 0 +%endif + + mova [tmpq+ 3*%%str], m6 + mova [tmpq+ 6*%%str], m7 + UNSCRATCH 6, 10, tmpq+ 0*%%str + UNSCRATCH 7, 11, tmpq+15*%%str + mova [tmpq+13*%%str], m2 + SCRATCH 3, 11, tmpq+ 9*%%str + + VP9_UNPACK_MULSUB_2D_4X 7, 6, 2, 3, 15137, 6270 ; m6/3=t13[d], m7/2=t12[d] + VP9_UNPACK_MULSUB_2D_4X 5, 4, 1, 0, 6270, 15137 ; m5/1=t14[d], m4/0=t15[d] + SCRATCH 0, 9, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 5, 6, 1, 3, 0, [pd_8192] ; m5=out2[w], m6=t14[w] + UNSCRATCH 0, 9, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192] + PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w] + + ; unfortunately, the code below overflows in some cases +%if 0; cpuflag(ssse3) + SUMSUB_BA w, 7, 6, 1 + pmulhrsw m7, [pw_m11585x2] ; m7=out5[w] + pmulhrsw m6, [pw_11585x2] ; m6=out10[w] +%else + PSIGNW m7, [pw_m1] + VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 1, 0 +%endif + + ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14 + + mova m2, [tmpq+ 8*%%str] + mova m3, [tmpq+ 7*%%str] + mova m1, [tmpq+11*%%str] + mova [tmpq+ 7*%%str], m6 + mova [tmpq+11*%%str], m4 + mova m4, [tmpq+ 5*%%str] + SCRATCH 5, 14, tmpq+ 5*%%str + SCRATCH 7, 15, tmpq+ 8*%%str + UNSCRATCH 6, 8, tmpq+ 4*%%str + UNSCRATCH 5, 12, tmpq+ 1*%%str + UNSCRATCH 7, 13, tmpq+14*%%str + + ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7 + ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14 + + SUMSUB_BA w, 1, 2, 0 ; m1=t0[w], m2=t4[w] + mova m0, [tmpq+10*%%str] + SCRATCH 1, 12, tmpq+ 1*%%str + SUMSUB_BA w, 6, 3, 1 ; m8=t1[w], m3=t5[w] + SCRATCH 6, 13, tmpq+ 4*%%str + SUMSUB_BA w, 7, 4, 1 ; m13=t2[w], m9=t6[w] + SCRATCH 7, 8, tmpq+10*%%str + SUMSUB_BA w, 5, 0, 1 ; m12=t3[w], m0=t7[w] + SCRATCH 5, 9, tmpq+14*%%str + + VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 5, 15137, 6270 ; m2/6=t5[d], m3/10=t4[d] + VP9_UNPACK_MULSUB_2D_4X 0, 4, 1, 6, 6270, 15137 ; m0/14=t6[d], m9/15=t7[d] + SCRATCH 6, 10, tmpq+ 0*%%str + VP9_RND_SH_SUMSUB_BA 0, 3, 1, 5, 6, [pd_8192] + UNSCRATCH 6, 10, tmpq+ 0*%%str + PSIGNW m0, [pw_m1] ; m0=out3[w], m3=t6[w] + VP9_RND_SH_SUMSUB_BA 4, 2, 6, 7, 5, [pd_8192] ; m9=out12[w], m2=t7[w] + + UNSCRATCH 1, 8, tmpq+10*%%str + UNSCRATCH 5, 9, tmpq+14*%%str + UNSCRATCH 6, 12, tmpq+ 1*%%str + UNSCRATCH 7, 13, tmpq+ 
4*%%str + SCRATCH 4, 9, tmpq+14*%%str + + SUMSUB_BA w, 1, 6, 4 ; m13=out0[w], m1=t2[w] + SUMSUB_BA w, 5, 7, 4 + PSIGNW m5, [pw_m1] ; m12=out15[w], m8=t3[w] + + ; unfortunately, the code below overflows in some cases, e.g. + ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm +%if 0 ; cpuflag(ssse3) + SUMSUB_BA w, 7, 6, 4 + pmulhrsw m7, [pw_m11585x2] ; m8=out7[w] + pmulhrsw m6, [pw_11585x2] ; m1=out8[w] + SUMSUB_BA w, 3, 2, 4 + pmulhrsw m3, [pw_11585x2] ; m3=out4[w] + pmulhrsw m2, [pw_11585x2] ; m2=out11[w] +%else + SCRATCH 5, 8, tmpq+10*%%str + PSIGNW m7, [pw_m1] + VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 5, 4 + VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4 + UNSCRATCH 5, 8, tmpq+10*%%str +%endif + + ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15 + ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14 + +%if %2 == 1 +%if ARCH_X86_64 + mova m13, [tmpq+ 6*%%str] + TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 7, 10 + mova [tmpq+ 0*16], m1 + mova [tmpq+ 2*16], m11 + mova [tmpq+ 4*16], m14 + mova [tmpq+ 6*16], m0 + mova m1, [tmpq+ 3*%%str] + mova m11, [tmpq+ 7*%%str] + mova m14, [tmpq+11*%%str] + mova m0, [tmpq+13*%%str] + mova [tmpq+ 8*16], m3 + mova [tmpq+10*16], m15 + mova [tmpq+12*16], m13 + mova [tmpq+14*16], m7 + + TRANSPOSE8x8W 6, 1, 11, 2, 9, 14, 0, 5, 10 + mova [tmpq+ 1*16], m6 + mova [tmpq+ 3*16], m1 + mova [tmpq+ 5*16], m11 + mova [tmpq+ 7*16], m2 + mova [tmpq+ 9*16], m9 + mova [tmpq+11*16], m14 + mova [tmpq+13*16], m0 + mova [tmpq+15*16], m5 +%else + mova [tmpq+12*%%str], m2 + mova [tmpq+ 1*%%str], m5 + mova [tmpq+15*%%str], m6 + mova m2, [tmpq+ 9*%%str] + mova m5, [tmpq+ 5*%%str] + mova m6, [tmpq+ 8*%%str] + TRANSPOSE8x8W 1, 2, 5, 0, 3, 6, 4, 7, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1 + mova [tmpq+ 0*16], m1 + mova [tmpq+ 2*16], m2 + mova [tmpq+ 4*16], m5 + mova [tmpq+ 6*16], m0 + mova [tmpq+10*16], m6 + mova m3, [tmpq+12*%%str] + mova [tmpq+12*16], m4 + mova m4, [tmpq+14*%%str] + mova [tmpq+14*16], m7 + + mova m0, [tmpq+15*%%str] + mova m1, [tmpq+ 3*%%str] + mova m2, [tmpq+ 7*%%str] + mova m5, [tmpq+11*%%str] + mova m7, [tmpq+ 1*%%str] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1 + mova [tmpq+ 1*16], m0 + mova [tmpq+ 3*16], m1 + mova [tmpq+ 5*16], m2 + mova [tmpq+ 7*16], m3 + mova [tmpq+11*16], m5 + mova [tmpq+13*16], m6 + mova [tmpq+15*16], m7 +%endif +%else + pxor m4, m4 + +%if cpuflag(ssse3) +%define ROUND_REG [pw_512] +%else +%define ROUND_REG [pw_32] +%endif + +%if ARCH_X86_64 + mova m12, [tmpq+ 6*%%str] + VP9_IDCT8_WRITEx2 1, 11, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 14, 0, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 12, 7, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m1, [tmpq+ 3*%%str] + mova m11, [tmpq+ 7*%%str] + mova m14, [tmpq+11*%%str] + mova m0, [tmpq+13*%%str] + + VP9_IDCT8_WRITEx2 6, 1, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 9, 14, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 0, 5, 10, 8, 4, ROUND_REG, 6 +%else + mova [tmpq+ 0*%%str], m2 + mova [tmpq+ 1*%%str], m5 + mova [tmpq+ 2*%%str], m6 + mova m2, [tmpq+ 9*%%str] + VP9_IDCT8_WRITEx2 1, 2, 5, 6, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m5, [tmpq+ 5*%%str] + 
VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m5, [tmpq+ 8*%%str] + VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m5, [tmpq+ 6*%%str] + VP9_IDCT8_WRITEx2 5, 7, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m0, [tmpq+ 2*%%str] + mova m3, [tmpq+ 3*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m0, [tmpq+ 7*%%str] + mova m3, [tmpq+ 0*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m0, [tmpq+14*%%str] + mova m3, [tmpq+11*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m0, [tmpq+13*%%str] + mova m3, [tmpq+ 1*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 +%endif + + SWAP 0, 4 ; zero +%undef ROUND_REG +%endif +%endmacro + +%macro IADST16_FN 5 +INIT_XMM %5 +cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp + mov cntd, 2 + mov tmpq, rsp +.loop1_full: + VP9_%2_1D blockq, 1 + add blockq, 16 + add tmpq, 256 + dec cntd + jg .loop1_full + sub blockq, 32 + + mov cntd, 2 + mov tmpq, rsp + mov dst_bakq, dstq +.loop2_full: + VP9_%4_1D tmpq, 2 + lea dstq, [dst_bakq+8] + add tmpq, 16 + dec cntd + jg .loop2_full + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +%define PSIGNW PSIGNW_MMX +IADST16_FN idct, IDCT16, iadst, IADST16, sse2 +IADST16_FN iadst, IADST16, idct, IDCT16, sse2 +IADST16_FN iadst, IADST16, iadst, IADST16, sse2 +%define PSIGNW PSIGNW_SSSE3 +IADST16_FN idct, IDCT16, iadst, IADST16, ssse3 +IADST16_FN iadst, IADST16, idct, IDCT16, ssse3 +IADST16_FN iadst, IADST16, iadst, IADST16, ssse3 +IADST16_FN idct, IDCT16, iadst, IADST16, avx +IADST16_FN iadst, IADST16, idct, IDCT16, avx +IADST16_FN iadst, IADST16, iadst, IADST16, avx +%undef PSIGNW + +;--------------------------------------------------------------------------------------------- +; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc +%assign %%str 16*%2*%2 + ; first do t0-15, this can be done identical to idct16x16 + VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1 + + ; store everything on stack to make space available for t16-31 + ; we store interleaved with the output of the second half (t16-31) + ; so we don't need to allocate extra stack space + mova [tmpq+ 0*%%str], m0 ; t0 + mova [tmpq+ 4*%%str], m1 ; t1 + mova [tmpq+ 8*%%str], m2 ; t2 + mova [tmpq+12*%%str], m3 ; t3 + mova [tmpq+16*%%str], m4 ; t4 + mova [tmpq+20*%%str], m5 ; t5 +%if ARCH_X86_64 + mova [tmpq+22*%%str], m10 ; t10 + mova [tmpq+18*%%str], m11 ; t11 + mova [tmpq+14*%%str], m12 ; t12 + mova [tmpq+10*%%str], m13 ; t13 + mova [tmpq+ 6*%%str], m14 ; t14 + mova [tmpq+ 2*%%str], m15 ; t15 +%endif + + mova m0, [tmpq+ 30*%%str] + UNSCRATCH 1, 6, tmpq+26*%%str + UNSCRATCH 2, 8, tmpq+24*%%str + UNSCRATCH 3, 9, tmpq+28*%%str + SUMSUB_BA w, 1, 3, 4 ; t6, t9 + SUMSUB_BA w, 0, 2, 4 ; t7, t8 + + mova [tmpq+24*%%str], m1 ; t6 + mova [tmpq+28*%%str], m0 ; t7 + mova [tmpq+30*%%str], m2 ; t8 + mova [tmpq+26*%%str], m3 ; t9 + + ; then, secondly, do t16-31 +%if %3 <= 8 + mova m4, [%1+ 1*64] + mova m7, [%1+ 7*64] + + pmulhrsw m1, m4, [pw_16364x2] ;t31 + pmulhrsw m4, [pw_804x2] ;t16 + + VP9_UNPACK_MULSUB_2W_4X 5, 0, 1, 4, 16069, 3196, [pd_8192], 6, 2 ; t17, 
t30 + + pmulhrsw m3, m7, [pw_m5520x2] ;t19 + pmulhrsw m7, [pw_15426x2] ;t28 + + SCRATCH 4, 13, tmpq+ 1*%%str + SCRATCH 5, 12, tmpq+15*%%str + + VP9_UNPACK_MULSUB_2W_4X 2, 6, 7, 3, 3196, m16069, [pd_8192], 4, 5 ; t18, t29 +%else + mova m0, [%1+ 1*64] + mova m1, [%1+15*64] +%if %3 <= 16 + pmulhrsw m5, m0, [pw_16364x2] + pmulhrsw m0, [pw_804x2] + pmulhrsw m4, m1, [pw_m11003x2] + pmulhrsw m1, [pw_12140x2] +%else + mova m4, [%1+17*64] + mova m5, [%1+31*64] + + VP9_UNPACK_MULSUB_2W_4X 0, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31 + VP9_UNPACK_MULSUB_2W_4X 4, 1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30 +%endif + SUMSUB_BA w, 4, 0, 2 + SUMSUB_BA w, 1, 5, 2 + + VP9_UNPACK_MULSUB_2W_4X 5, 0, 16069, 3196, [pd_8192], 2, 3 ; t17, t30 + + SCRATCH 4, 13, tmpq+ 1*%%str + SCRATCH 5, 12, tmpq+15*%%str + + mova m2, [%1+ 7*64] + mova m3, [%1+ 9*64] +%if %3 <= 16 + pmulhrsw m7, m3, [pw_14811x2] + pmulhrsw m3, [pw_7005x2] + pmulhrsw m6, m2, [pw_m5520x2] + pmulhrsw m2, [pw_15426x2] +%else + mova m7, [%1+23*64] + mova m6, [%1+25*64] + + VP9_UNPACK_MULSUB_2W_4X 3, 7, 14811, 7005, [pd_8192], 4, 5 ; t18, t29 + VP9_UNPACK_MULSUB_2W_4X 6, 2, 5520, 15426, [pd_8192], 4, 5 ; t19, t28 +%endif + SUMSUB_BA w, 3, 6, 4 + SUMSUB_BA w, 7, 2, 4 + + VP9_UNPACK_MULSUB_2W_4X 2, 6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29 +%endif + + UNSCRATCH 5, 12, tmpq+15*%%str + SUMSUB_BA w, 6, 0, 4 + mova [tmpq+25*%%str], m6 ; t19 + UNSCRATCH 4, 13, tmpq+ 1*%%str + SUMSUB_BA w, 7, 1, 6 + SUMSUB_BA w, 3, 4, 6 + mova [tmpq+23*%%str], m3 ; t16 + SUMSUB_BA w, 2, 5, 6 + + VP9_UNPACK_MULSUB_2W_4X 0, 5, 15137, 6270, [pd_8192], 6, 3 ; t18, t29 + VP9_UNPACK_MULSUB_2W_4X 1, 4, 15137, 6270, [pd_8192], 6, 3 ; t19, t28 + + SCRATCH 0, 10, tmpq+ 1*%%str + SCRATCH 1, 11, tmpq+ 7*%%str + SCRATCH 2, 9, tmpq+ 9*%%str + SCRATCH 4, 14, tmpq+15*%%str + SCRATCH 5, 15, tmpq+17*%%str + SCRATCH 7, 13, tmpq+31*%%str + +%if %3 <= 8 + mova m0, [%1+ 5*64] + mova m3, [%1+ 3*64] + + pmulhrsw m5, m0, [pw_15893x2] ;t27 + pmulhrsw m0, [pw_3981x2] ;t20 + + VP9_UNPACK_MULSUB_2W_4X 1, 4, 5, 0, 9102, 13623, [pd_8192], 7, 2 ; t21, t26 + + pmulhrsw m6, m3, [pw_m2404x2] ;t23 + pmulhrsw m3, [pw_16207x2] ;t24 + + SCRATCH 5, 8, tmpq+ 5*%%str + SCRATCH 4, 12, tmpq+11*%%str + + VP9_UNPACK_MULSUB_2W_4X 7, 2, 3, 6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25 +%else + mova m4, [%1+ 5*64] + mova m5, [%1+11*64] +%if %3 <= 16 + pmulhrsw m1, m4, [pw_15893x2] + pmulhrsw m4, [pw_3981x2] + pmulhrsw m0, m5, [pw_m8423x2] + pmulhrsw m5, [pw_14053x2] +%else + mova m0, [%1+21*64] + mova m1, [%1+27*64] + + VP9_UNPACK_MULSUB_2W_4X 4, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27 + VP9_UNPACK_MULSUB_2W_4X 0, 5, 8423, 14053, [pd_8192], 2, 3 ; t21, t26 +%endif + SUMSUB_BA w, 0, 4, 2 + SUMSUB_BA w, 5, 1, 2 + + VP9_UNPACK_MULSUB_2W_4X 1, 4, 9102, 13623, [pd_8192], 2, 3 ; t21, t26 + + SCRATCH 5, 8, tmpq+ 5*%%str + SCRATCH 4, 12, tmpq+11*%%str + + mova m7, [%1+ 3*64] + mova m6, [%1+13*64] +%if %3 <= 16 + pmulhrsw m3, m6, [pw_13160x2] + pmulhrsw m6, [pw_9760x2] + pmulhrsw m2, m7, [pw_m2404x2] + pmulhrsw m7, [pw_16207x2] +%else + mova m2, [%1+29*64] + mova m3, [%1+19*64] + VP9_UNPACK_MULSUB_2W_4X 6, 3, 13160, 9760, [pd_8192], 4, 5 ; t22, t25 + VP9_UNPACK_MULSUB_2W_4X 2, 7, 2404, 16207, [pd_8192], 4, 5 ; t23, t24 +%endif + SUMSUB_BA w, 6, 2, 4 + SUMSUB_BA w, 3, 7, 4 + + VP9_UNPACK_MULSUB_2W_4X 7, 2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25 +%endif + + ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23, + ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31 + + UNSCRATCH 4, 12, tmpq+11*%%str + 
SUMSUB_BA w, 0, 6, 5 + SUMSUB_BA w, 4, 2, 5 + UNSCRATCH 5, 8, tmpq+ 5*%%str + SCRATCH 4, 8, tmpq+11*%%str + SUMSUB_BA w, 1, 7, 4 + SUMSUB_BA w, 5, 3, 4 + SCRATCH 5, 12, tmpq+ 5*%%str + + VP9_UNPACK_MULSUB_2W_4X 3, 6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27 + VP9_UNPACK_MULSUB_2W_4X 2, 7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26 + + ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23, + ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31 + + UNSCRATCH 5, 9, tmpq+ 9*%%str + mova m4, [tmpq+23*%%str] ; t16 +%if ARCH_X86_64 + SUMSUB_BA w, 1, 5, 9 + SUMSUB_BA w, 0, 4, 9 +%else + SUMSUB_BADC w, 1, 5, 0, 4 +%endif + mova [tmpq+29*%%str], m1 ; t17 + mova [tmpq+21*%%str], m0 ; t16 + UNSCRATCH 0, 10, tmpq+ 1*%%str + UNSCRATCH 1, 11, tmpq+ 7*%%str +%if ARCH_X86_64 + SUMSUB_BA w, 2, 0, 9 + SUMSUB_BA w, 3, 1, 9 +%else + SUMSUB_BADC w, 2, 0, 3, 1 +%endif + mova [tmpq+ 9*%%str], m2 ; t18 + mova [tmpq+13*%%str], m3 ; t19 + SCRATCH 0, 10, tmpq+23*%%str + SCRATCH 1, 11, tmpq+27*%%str + + UNSCRATCH 2, 14, tmpq+15*%%str + UNSCRATCH 3, 15, tmpq+17*%%str + SUMSUB_BA w, 6, 2, 0 + SUMSUB_BA w, 7, 3, 0 + SCRATCH 6, 14, tmpq+ 3*%%str + SCRATCH 7, 15, tmpq+ 7*%%str + + UNSCRATCH 0, 8, tmpq+11*%%str + mova m1, [tmpq+25*%%str] ; t19 + UNSCRATCH 6, 12, tmpq+ 5*%%str + UNSCRATCH 7, 13, tmpq+31*%%str +%if ARCH_X86_64 + SUMSUB_BA w, 0, 1, 9 + SUMSUB_BA w, 6, 7, 9 +%else + SUMSUB_BADC w, 0, 1, 6, 7 +%endif + + ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23, + ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31 + +%if 0; cpuflag(ssse3) +%if ARCH_X86_64 + SUMSUB_BA w, 4, 7, 8 + SUMSUB_BA w, 5, 1, 8 +%else + SUMSUB_BADC w, 4, 7, 5, 1 +%endif + + pmulhrsw m7, [pw_11585x2] + pmulhrsw m4, [pw_11585x2] + pmulhrsw m1, [pw_11585x2] + pmulhrsw m5, [pw_11585x2] + + mova [tmpq+ 5*%%str], m7 ; t23 + SCRATCH 1, 13, tmpq+25*%%str + UNSCRATCH 7, 10, tmpq+23*%%str + UNSCRATCH 1, 11, tmpq+27*%%str + +%if ARCH_X86_64 + SUMSUB_BA w, 7, 3, 10 + SUMSUB_BA w, 1, 2, 10 +%else + SUMSUB_BADC w, 7, 3, 1, 2 +%endif + + pmulhrsw m3, [pw_11585x2] + pmulhrsw m7, [pw_11585x2] + pmulhrsw m2, [pw_11585x2] + pmulhrsw m1, [pw_11585x2] +%else + SCRATCH 0, 8, tmpq+15*%%str + SCRATCH 6, 9, tmpq+17*%%str + VP9_UNPACK_MULSUB_2W_4X 7, 4, 11585, 11585, [pd_8192], 0, 6 + mova [tmpq+ 5*%%str], m7 ; t23 + UNSCRATCH 7, 10, tmpq+23*%%str + VP9_UNPACK_MULSUB_2W_4X 1, 5, 11585, 11585, [pd_8192], 0, 6 + SCRATCH 1, 13, tmpq+25*%%str + UNSCRATCH 1, 11, tmpq+27*%%str + VP9_UNPACK_MULSUB_2W_4X 3, 7, 11585, 11585, [pd_8192], 0, 6 + VP9_UNPACK_MULSUB_2W_4X 2, 1, 11585, 11585, [pd_8192], 0, 6 + UNSCRATCH 0, 8, tmpq+15*%%str + UNSCRATCH 6, 9, tmpq+17*%%str +%endif + + ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23, + ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31 + + ; then do final pass to sumsub+store the two halves +%if %2 == 1 + mova [tmpq+17*%%str], m2 ; t20 + mova [tmpq+ 1*%%str], m3 ; t21 +%if ARCH_X86_64 + mova [tmpq+25*%%str], m13 ; t22 + + mova m8, [tmpq+ 0*%%str] ; t0 + mova m9, [tmpq+ 4*%%str] ; t1 + mova m12, [tmpq+ 8*%%str] ; t2 + mova m11, [tmpq+12*%%str] ; t3 + mova m2, [tmpq+16*%%str] ; t4 + mova m3, [tmpq+20*%%str] ; t5 + mova m13, [tmpq+24*%%str] ; t6 + + SUMSUB_BA w, 6, 8, 10 + mova [tmpq+ 3*%%str], m8 ; t15 + mova m10, [tmpq+28*%%str] ; t7 + SUMSUB_BA w, 0, 9, 8 + SUMSUB_BA w, 15, 12, 8 + SUMSUB_BA w, 14, 11, 8 + SUMSUB_BA w, 1, 2, 8 + SUMSUB_BA w, 7, 3, 8 + SUMSUB_BA w, 5, 13, 8 + SUMSUB_BA w, 4, 10, 8 + + TRANSPOSE8x8W 6, 0, 15, 14, 1, 7, 5, 
4, 8 + mova [tmpq+ 0*%%str], m6 + mova [tmpq+ 4*%%str], m0 + mova [tmpq+ 8*%%str], m15 + mova [tmpq+12*%%str], m14 + mova [tmpq+16*%%str], m1 + mova [tmpq+20*%%str], m7 + mova [tmpq+24*%%str], m5 + mova [tmpq+28*%%str], m4 + + mova m8, [tmpq+ 3*%%str] ; t15 + TRANSPOSE8x8W 10, 13, 3, 2, 11, 12, 9, 8, 0 + mova [tmpq+ 3*%%str], m10 + mova [tmpq+ 7*%%str], m13 + mova [tmpq+11*%%str], m3 + mova [tmpq+15*%%str], m2 + mova [tmpq+19*%%str], m11 + mova [tmpq+23*%%str], m12 + mova [tmpq+27*%%str], m9 + mova [tmpq+31*%%str], m8 + + mova m15, [tmpq+30*%%str] ; t8 + mova m14, [tmpq+26*%%str] ; t9 + mova m13, [tmpq+22*%%str] ; t10 + mova m12, [tmpq+18*%%str] ; t11 + mova m11, [tmpq+14*%%str] ; t12 + mova m10, [tmpq+10*%%str] ; t13 + mova m9, [tmpq+ 6*%%str] ; t14 + mova m8, [tmpq+ 2*%%str] ; t15 + mova m7, [tmpq+21*%%str] ; t16 + mova m6, [tmpq+29*%%str] ; t17 + mova m5, [tmpq+ 9*%%str] ; t18 + mova m4, [tmpq+13*%%str] ; t19 + mova m3, [tmpq+17*%%str] ; t20 + mova m2, [tmpq+ 1*%%str] ; t21 + mova m1, [tmpq+25*%%str] ; t22 + + SUMSUB_BA w, 7, 8, 0 + mova [tmpq+ 2*%%str], m8 + mova m0, [tmpq+ 5*%%str] ; t23 + SUMSUB_BA w, 6, 9, 8 + SUMSUB_BA w, 5, 10, 8 + SUMSUB_BA w, 4, 11, 8 + SUMSUB_BA w, 3, 12, 8 + SUMSUB_BA w, 2, 13, 8 + SUMSUB_BA w, 1, 14, 8 + SUMSUB_BA w, 0, 15, 8 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + mova [tmpq+ 1*%%str], m0 + mova [tmpq+ 5*%%str], m1 + mova [tmpq+ 9*%%str], m2 + mova [tmpq+13*%%str], m3 + mova [tmpq+17*%%str], m4 + mova [tmpq+21*%%str], m5 + mova [tmpq+25*%%str], m6 + mova [tmpq+29*%%str], m7 + + mova m8, [tmpq+ 2*%%str] + TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 + mova [tmpq+ 2*%%str], m8 + mova [tmpq+ 6*%%str], m9 + mova [tmpq+10*%%str], m10 + mova [tmpq+14*%%str], m11 + mova [tmpq+18*%%str], m12 + mova [tmpq+22*%%str], m13 + mova [tmpq+26*%%str], m14 + mova [tmpq+30*%%str], m15 +%else + mova m2, [tmpq+24*%%str] ; t6 + mova m3, [tmpq+28*%%str] ; t7 + SUMSUB_BADC w, 5, 2, 4, 3 + mova [tmpq+24*%%str], m5 + mova [tmpq+23*%%str], m2 + mova [tmpq+28*%%str], m4 + mova [tmpq+19*%%str], m3 + + mova m2, [tmpq+16*%%str] ; t4 + mova m3, [tmpq+20*%%str] ; t5 + SUMSUB_BA w, 1, 2, 5 + SUMSUB_BA w, 7, 3, 5 + mova [tmpq+15*%%str], m2 + mova [tmpq+11*%%str], m3 + + mova m2, [tmpq+ 0*%%str] ; t0 + mova m3, [tmpq+ 4*%%str] ; t1 + SUMSUB_BA w, 6, 2, 5 + SUMSUB_BA w, 0, 3, 5 + mova [tmpq+31*%%str], m2 + mova [tmpq+27*%%str], m3 + + mova m2, [tmpq+ 8*%%str] ; t2 + mova m3, [tmpq+12*%%str] ; t3 + mova m5, [tmpq+ 7*%%str] + mova m4, [tmpq+ 3*%%str] + SUMSUB_BADC w, 5, 2, 4, 3 + mova [tmpq+ 7*%%str], m2 + mova [tmpq+ 3*%%str], m3 + + mova m3, [tmpq+28*%%str] + TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1 + mova [tmpq+ 0*%%str], m6 + mova [tmpq+ 4*%%str], m0 + mova [tmpq+ 8*%%str], m5 + mova [tmpq+12*%%str], m4 + mova [tmpq+20*%%str], m7 + mova [tmpq+24*%%str], m2 + mova [tmpq+28*%%str], m3 + + mova m6, [tmpq+19*%%str] + mova m0, [tmpq+23*%%str] + mova m5, [tmpq+11*%%str] + mova m4, [tmpq+15*%%str] + mova m1, [tmpq+ 3*%%str] + mova m7, [tmpq+ 7*%%str] + mova m3, [tmpq+31*%%str] + TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1 + mova [tmpq+ 3*%%str], m6 + mova [tmpq+ 7*%%str], m0 + mova [tmpq+11*%%str], m5 + mova [tmpq+15*%%str], m4 + mova [tmpq+23*%%str], m7 + mova [tmpq+27*%%str], m2 + mova [tmpq+31*%%str], m3 + + mova m1, [tmpq+ 6*%%str] ; t14 + mova m0, [tmpq+ 2*%%str] ; t15 + mova m7, [tmpq+21*%%str] ; t16 + mova m6, [tmpq+29*%%str] ; t17 + SUMSUB_BA w, 7, 0, 2 + SUMSUB_BA w, 6, 1, 2 + mova [tmpq+29*%%str], m7 + mova [tmpq+ 
2*%%str], m0 + mova [tmpq+21*%%str], m6 + mova [tmpq+ 6*%%str], m1 + + mova m1, [tmpq+14*%%str] ; t12 + mova m0, [tmpq+10*%%str] ; t13 + mova m5, [tmpq+ 9*%%str] ; t18 + mova m4, [tmpq+13*%%str] ; t19 + SUMSUB_BA w, 5, 0, 2 + SUMSUB_BA w, 4, 1, 2 + mova [tmpq+10*%%str], m0 + mova [tmpq+14*%%str], m1 + + mova m1, [tmpq+22*%%str] ; t10 + mova m0, [tmpq+18*%%str] ; t11 + mova m3, [tmpq+17*%%str] ; t20 + mova m2, [tmpq+ 1*%%str] ; t21 + SUMSUB_BA w, 3, 0, 6 + SUMSUB_BA w, 2, 1, 6 + mova [tmpq+18*%%str], m0 + mova [tmpq+22*%%str], m1 + + mova m7, [tmpq+30*%%str] ; t8 + mova m6, [tmpq+26*%%str] ; t9 + mova m1, [tmpq+25*%%str] ; t22 + mova m0, [tmpq+ 5*%%str] ; t23 + SUMSUB_BADC w, 1, 6, 0, 7 + mova [tmpq+26*%%str], m6 + mova [tmpq+30*%%str], m7 + + mova m7, [tmpq+29*%%str] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1 + mova [tmpq+ 1*%%str], m0 + mova [tmpq+ 5*%%str], m1 + mova [tmpq+ 9*%%str], m2 + mova [tmpq+13*%%str], m3 + mova [tmpq+21*%%str], m5 + mova [tmpq+25*%%str], m6 + mova [tmpq+29*%%str], m7 + + mova m0, [tmpq+ 2*%%str] + mova m1, [tmpq+ 6*%%str] + mova m2, [tmpq+10*%%str] + mova m3, [tmpq+14*%%str] + mova m4, [tmpq+18*%%str] + mova m5, [tmpq+22*%%str] + mova m7, [tmpq+30*%%str] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1 + mova [tmpq+ 2*%%str], m0 + mova [tmpq+ 6*%%str], m1 + mova [tmpq+10*%%str], m2 + mova [tmpq+14*%%str], m3 + mova [tmpq+22*%%str], m5 + mova [tmpq+26*%%str], m6 + mova [tmpq+30*%%str], m7 +%endif +%else + ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str] + ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str] + ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str] + ; t20-22 is in m4-6 + ; t24-31 is in m8-15 + +%if cpuflag(ssse3) +%define ROUND_REG [pw_512] +%else +%define ROUND_REG [pw_32] +%endif + +%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs + SUMSUB_BA w, %4, %1, %5 + SUMSUB_BA w, %3, %2, %5 + VP9_IDCT8_WRITEx2 %4, %3, %5, %6, %7, ROUND_REG, 6 +%if %8 == 1 + add dstq, stride2q +%endif + VP9_IDCT8_WRITEx2 %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq +%if %8 == 1 + sub dst_endq, stride2q +%endif +%endmacro + +%if ARCH_X86_64 + pxor m10, m10 + + ; store t0-1 and t30-31 + mova m8, [tmpq+ 0*%%str] + mova m9, [tmpq+ 4*%%str] + %%STORE_2X2 8, 9, 0, 6, 12, 11, 10 + + ; store t2-3 and t28-29 + mova m8, [tmpq+ 8*%%str] + mova m9, [tmpq+12*%%str] + %%STORE_2X2 8, 9, 14, 15, 12, 11, 10 + + ; store t4-5 and t26-27 + mova m8, [tmpq+16*%%str] + mova m9, [tmpq+20*%%str] + %%STORE_2X2 8, 9, 7, 1, 12, 11, 10 + + ; store t6-7 and t24-25 + mova m8, [tmpq+24*%%str] + mova m9, [tmpq+28*%%str] + %%STORE_2X2 8, 9, 4, 5, 12, 11, 10 + + ; store t8-9 and t22-23 + mova m8, [tmpq+30*%%str] + mova m9, [tmpq+26*%%str] + mova m0, [tmpq+ 5*%%str] + %%STORE_2X2 8, 9, 13, 0, 12, 11, 10 + + ; store t10-11 and t20-21 + mova m8, [tmpq+22*%%str] + mova m9, [tmpq+18*%%str] + %%STORE_2X2 8, 9, 2, 3, 12, 11, 10 + + ; store t12-13 and t18-19 + mova m8, [tmpq+14*%%str] + mova m9, [tmpq+10*%%str] + mova m5, [tmpq+13*%%str] + mova m4, [tmpq+ 9*%%str] + %%STORE_2X2 8, 9, 4, 5, 12, 11, 10 + + ; store t14-17 + mova m8, [tmpq+ 6*%%str] + mova m9, [tmpq+ 2*%%str] + mova m5, [tmpq+29*%%str] + mova m4, [tmpq+21*%%str] + %%STORE_2X2 8, 9, 4, 5, 12, 11, 10, 0 + + SWAP 1, 10 ; zero +%else + mova [tmpq+ 1*%%str], m1 + mova [tmpq+11*%%str], m2 + mova [tmpq+15*%%str], m3 + mova [tmpq+17*%%str], m4 + mova [tmpq+19*%%str], m5 + pxor m1, m1 + + ; store t0-1 and t30-31 + mova m2, [tmpq+ 0*%%str] + mova m3, [tmpq+ 4*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 
5, 1
+
+    ; store t2-3 and t28-29
+    mova m2, [tmpq+ 8*%%str]
+    mova m3, [tmpq+12*%%str]
+    mova m0, [tmpq+ 3*%%str]
+    mova m6, [tmpq+ 7*%%str]
+    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+    ; store t4-5 and t26-27
+    mova m2, [tmpq+16*%%str]
+    mova m3, [tmpq+20*%%str]
+    mova m0, [tmpq+ 1*%%str]
+    %%STORE_2X2 2, 3, 7, 0, 4, 5, 1
+
+    ; store t6-7 and t24-25
+    mova m2, [tmpq+24*%%str]
+    mova m3, [tmpq+28*%%str]
+    mova m0, [tmpq+17*%%str]
+    mova m6, [tmpq+19*%%str]
+    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+    ; store t8-9 and t22-23
+    mova m2, [tmpq+30*%%str]
+    mova m3, [tmpq+26*%%str]
+    mova m0, [tmpq+25*%%str]
+    mova m6, [tmpq+ 5*%%str]
+    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+    ; store t10-11 and t20-21
+    mova m2, [tmpq+22*%%str]
+    mova m3, [tmpq+18*%%str]
+    mova m0, [tmpq+11*%%str]
+    mova m6, [tmpq+15*%%str]
+    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+    ; store t12-13 and t18-19
+    mova m2, [tmpq+14*%%str]
+    mova m3, [tmpq+10*%%str]
+    mova m6, [tmpq+13*%%str]
+    mova m0, [tmpq+ 9*%%str]
+    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1
+
+    ; store t14-17
+    mova m2, [tmpq+ 6*%%str]
+    mova m3, [tmpq+ 2*%%str]
+    mova m6, [tmpq+29*%%str]
+    mova m0, [tmpq+21*%%str]
+    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1, 0
+%endif
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
+    movifnidn eobd, dword eobm
+%if cpuflag(ssse3)
+    cmp eobd, 135
+    jg .idctfull
+    cmp eobd, 34
+    jg .idct16x16
+    cmp eobd, 1
+    jg .idct8x8
+%else
+    cmp eobd, 1
+    jg .idctfull
+%endif
+
+    ; dc-only case
+    movifnidn blockq, blockmp
+    movifnidn dstq, dstmp
+    movifnidn strideq, stridemp
+%if cpuflag(ssse3)
+    movd m0, [blockq]
+    mova m1, [pw_11585x2]
+    pmulhrsw m0, m1
+    pmulhrsw m0, m1
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx coefd, word [blockq]
+    imul coefd, 11585
+    add coefd, 8192
+    sar coefd, 14
+    imul coefd, 11585
+    add coefd, (32 << 14) + 8192
+    sar coefd, 14 + 6
+    movd m0, coefd
+%endif
+    SPLATW m0, m0, q0000
+%if cpuflag(ssse3)
+    pmulhrsw m0, [pw_512]
+%endif
+    pxor m5, m5
+    movd [blockq], m5
+%rep 31
+    VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
+    add dstq, strideq
+%endrep
+    VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
+    RET
+
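+    ; As a rough C reference, the dc-only path above boils down to the
+    ; following (the non-ssse3 branch; av_clip_uint8() as in libavutil,
+    ; loop form illustrative):
+    ;
+    ;   int dc = (block[0] * 11585 + 8192) >> 14;         // pass 1 scaling, rounded
+    ;   dc     = (dc * 11585 + (32 << 14) + 8192) >> 20;  // pass 2 + final (x + 32) >> 6
+    ;   for (y = 0; y < 32; y++)
+    ;       for (x = 0; x < 32; x++)
+    ;           dst[y * stride + x] = av_clip_uint8(dst[y * stride + x] + dc);
+    ;
+    ; The two pmulhrsw by pw_11585x2 in the ssse3 branch perform the same two
+    ; multiply-round steps, and the pmulhrsw by pw_512 the final (x + 32) >> 6.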
+%if ARCH_X86_64
+    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+%else
+%define dst_bakq r0mp
+%endif
+%if cpuflag(ssse3)
+.idct8x8:
+%if ARCH_X86_32
+    DEFINE_ARGS block, u1, u2, u3, u4, tmp
+    mov blockq, r2mp
+%endif
+    mov tmpq, rsp
+    VP9_IDCT32_1D blockq, 1, 8
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov strideq, r1mp
+%define cntd dword r3m
+%endif
+    mov stride30q, strideq  ; stride
+    lea stride2q, [strideq*2]  ; stride*2
+    shl stride30q, 5  ; stride*32
+    mov cntd, 4
+    sub stride30q, stride2q  ; stride*30
+.loop2_8x8:
+    mov dstq, dst_bakq
+    lea dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D tmpq, 2, 8
+    add dst_bakq, 8
+    add tmpq, 16
+    dec cntd
+    jg .loop2_8x8
+
+    ; at the end of the loop, m1 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov blockq, r2mp
+%endif
+    ZERO_BLOCK blockq, 64, 8, m1
+    RET
+
+.idct16x16:
+%if ARCH_X86_32
+    DEFINE_ARGS block, tmp, cnt
+    mov blockq, r2mp
+%endif
+    mov cntd, 2
+    mov tmpq, rsp
+.loop1_16x16:
+    VP9_IDCT32_1D blockq, 1, 16
+    add blockq, 16
+    add tmpq, 512
+    dec cntd
+    jg .loop1_16x16
+
+%if ARCH_X86_64
+    sub blockq, 32
+%else
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov strideq, r1mp
+%define cntd dword r3m
+%endif
+
+    mov stride30q, strideq  ; stride
+    lea stride2q, [strideq*2]  ; stride*2
+    shl stride30q, 5  ; stride*32
+    mov cntd, 4
+    mov tmpq, rsp
+    sub stride30q, stride2q  ; stride*30
+.loop2_16x16:
+    mov dstq, dst_bakq
+    lea dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D tmpq, 2, 16
+    add dst_bakq, 8
+    add tmpq, 16
+    dec cntd
+    jg .loop2_16x16
+
+    ; at the end of the loop, m1 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov blockq, r2mp
+%endif
+    ZERO_BLOCK blockq, 64, 16, m1
+    RET
+%endif
+
+.idctfull:
+%if ARCH_X86_32
+    DEFINE_ARGS block, tmp, cnt
+    mov blockq, r2mp
+%endif
+    mov cntd, 4
+    mov tmpq, rsp
+.loop1_full:
+    VP9_IDCT32_1D blockq, 1
+    add blockq, 16
+    add tmpq, 512
+    dec cntd
+    jg .loop1_full
+
+%if ARCH_X86_64
+    sub blockq, 64
+%else
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov strideq, r1mp
+%define cntd dword r3m
+%endif
+
+    mov stride30q, strideq  ; stride
+    lea stride2q, [strideq*2]  ; stride*2
+    shl stride30q, 5  ; stride*32
+    mov cntd, 4
+    mov tmpq, rsp
+    sub stride30q, stride2q  ; stride*30
+.loop2_full:
+    mov dstq, dst_bakq
+    lea dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D tmpq, 2
+    add dst_bakq, 8
+    add tmpq, 16
+    dec cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m1 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov blockq, r2mp
+%endif
+    ZERO_BLOCK blockq, 64, 32, m1
+    RET
+%endmacro
+
+VP9_IDCT_IDCT_32x32_ADD_XMM sse2
+VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
+VP9_IDCT_IDCT_32x32_ADD_XMM avx
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
new file mode 100644
index 0000000000..2c4fe214da
--- /dev/null
+++ b/libavcodec/x86/vp9lpf.asm
@@ -0,0 +1,1139 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
+;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pb_3 +cextern pb_80 + +pb_4: times 16 db 0x04 +pb_10: times 16 db 0x10 +pb_40: times 16 db 0x40 +pb_81: times 16 db 0x81 +pb_f8: times 16 db 0xf8 +pb_fe: times 16 db 0xfe +pb_ff: times 16 db 0xff + +cextern pw_4 +cextern pw_8 + +; with mix functions, two 8-bit thresholds are stored in a 16-bit storage, +; the following mask is used to splat both in the same register +mask_mix: times 8 db 0 + times 8 db 1 + +mask_mix84: times 8 db 0xff + times 8 db 0x00 +mask_mix48: times 8 db 0x00 + times 8 db 0xff + +SECTION .text + +%macro SCRATCH 3 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova [%3], m%1 +%endif +%endmacro + +%macro UNSCRATCH 3 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%endmacro + +; %1 = abs(%2-%3) +%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp +%if ARCH_X86_64 + psubusb %1, %3, %2 + psubusb %4, %2, %3 +%else + mova %1, %3 + mova %4, %2 + psubusb %1, %2 + psubusb %4, %3 +%endif + por %1, %4 +%endmacro + +; %1 = %1>%2 +%macro CMP_GT 2-3 ; src/dst, cmp, pb_80 +%if %0 == 3 + pxor %1, %3 +%endif + pcmpgtb %1, %2 +%endmacro + +; %1 = abs(%2-%3) > %4 +%macro ABSSUB_GT 5-6 [pb_80]; dst, src1, src2, cmp, tmp, [pb_80] + ABSSUB %1, %2, %3, %5 ; dst = abs(src1-src2) + CMP_GT %1, %4, %6 ; dst > cmp +%endmacro + +%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp + pand %1, %3 ; new &= mask + pandn %4, %3, %2 ; tmp = ~mask & old + por %1, %4 ; new&mask | old&~mask +%endmacro + +%macro UNPACK 4 +%if ARCH_X86_64 + punpck%1bw %2, %3, %4 +%else + mova %2, %3 + punpck%1bw %2, %4 +%endif +%endmacro + +%macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1 + ; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32] + psubw %3, [rsp+%4+%5*32] + psubw %3, [rsp+%4+%6*32] + paddw %3, [rsp+%4+%7*32] +%ifnidn %10, "" +%if %11 == 0 + punpck%2bw %1, %10, m0 +%else + UNPACK %2, %1, %10, m0 +%endif + mova [rsp+%4+%8*32], %1 + paddw %3, %1 +%else + paddw %3, [rsp+%4+%8*32] +%endif + psraw %1, %3, %9 +%endmacro + +; FIXME interleave l/h better (for instruction pairing) +%macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source + FILTER%7_INIT %1, l, %3, %6 + 0 + FILTER%7_INIT %2, h, %4, %6 + 16 + packuswb %1, %2 + MASK_APPLY %1, %9, %8, %2 + mova %5, %1 +%endmacro + + +%macro FILTER_UPDATE 12-16 "", "", "", 0 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, -, -, +, +, rshift, + ; mask, [source], [unpack + src], [unpack_is_mem_on_x86_32] +; FIXME interleave this properly with the subx2/addx2 +%ifnidn %15, "" +%if %16 == 0 || ARCH_X86_64 + mova %14, %15 +%endif +%endif + FILTER_SUBx2_ADDx2 %1, l, %3, %6 + 0, %7, %8, %9, %10, %11, %14, %16 + FILTER_SUBx2_ADDx2 %2, h, %4, %6 + 16, %7, %8, %9, %10, %11, %14, %16 + packuswb %1, %2 +%ifnidn %13, "" + MASK_APPLY %1, %13, %12, %2 +%else + MASK_APPLY %1, %5, %12, %2 +%endif + mova %5, %1 +%endmacro + +%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp + mova %4, [pb_f8] + pand %1, %4 + pand %2, %4 + psrlq %1, 3 + psrlq %2, 3 + pxor %1, %3 + pxor %2, %3 + psubb %1, %3 + psubb %2, %3 +%endmacro + +%macro EXTRACT_POS_NEG 3 ; i8, neg, pos + pxor %3, %3 + pxor %2, %2 + pcmpgtb %3, %1 ; i8 < 0 mask + psubb %2, %1 ; neg 
values (only the originally - will be kept) + pand %2, %3 ; negative values of i8 (but stored as +) + pandn %3, %1 ; positive values of i8 +%endmacro + +; clip_u8(u8 + i8) +%macro SIGN_ADD 4 ; dst, u8, i8, tmp1 + EXTRACT_POS_NEG %3, %4, %1 + paddusb %1, %2 ; add the positives + psubusb %1, %4 ; sub the negatives +%endmacro + +; clip_u8(u8 - i8) +%macro SIGN_SUB 4 ; dst, u8, i8, tmp1 + EXTRACT_POS_NEG %3, %1, %4 + paddusb %1, %2 ; add the negatives + psubusb %1, %4 ; sub the positives +%endmacro + +%macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off + UNPACK %2, %1, rp3, m0 ; p3: B->W + mova [rsp+%4+0*32], %1 + paddw %3, %1, %1 ; p3*2 + paddw %3, %1 ; p3*3 + punpck%2bw %1, m1, m0 ; p2: B->W + mova [rsp+%4+1*32], %1 + paddw %3, %1 ; p3*3 + p2 + paddw %3, %1 ; p3*3 + p2*2 + UNPACK %2, %1, rp1, m0 ; p1: B->W + mova [rsp+%4+2*32], %1 + paddw %3, %1 ; p3*3 + p2*2 + p1 + UNPACK %2, %1, rp0, m0 ; p0: B->W + mova [rsp+%4+3*32], %1 + paddw %3, %1 ; p3*3 + p2*2 + p1 + p0 + UNPACK %2, %1, rq0, m0 ; q0: B->W + mova [rsp+%4+4*32], %1 + paddw %3, %1 ; p3*3 + p2*2 + p1 + p0 + q0 + paddw %3, [pw_4] ; p3*3 + p2*2 + p1 + p0 + q0 + 4 + psraw %1, %3, 3 ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3 +%endmacro + +%macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off + punpck%2bw %1, m2, m0 ; p7: B->W + mova [rsp+%4+ 8*32], %1 + psllw %3, %1, 3 ; p7*8 + psubw %3, %1 ; p7*7 + punpck%2bw %1, m3, m0 ; p6: B->W + mova [rsp+%4+ 9*32], %1 + paddw %3, %1 ; p7*7 + p6 + paddw %3, %1 ; p7*7 + p6*2 + UNPACK %2, %1, rp5, m0 ; p5: B->W + mova [rsp+%4+10*32], %1 + paddw %3, %1 ; p7*7 + p6*2 + p5 + UNPACK %2, %1, rp4, m0 ; p4: B->W + mova [rsp+%4+11*32], %1 + paddw %3, %1 ; p7*7 + p6*2 + p5 + p4 + paddw %3, [rsp+%4+ 0*32] ; p7*7 + p6*2 + p5 + p4 + p3 + paddw %3, [rsp+%4+ 1*32] ; p7*7 + p6*2 + p5 + .. + p2 + paddw %3, [rsp+%4+ 2*32] ; p7*7 + p6*2 + p5 + .. + p1 + paddw %3, [rsp+%4+ 3*32] ; p7*7 + p6*2 + p5 + .. + p0 + paddw %3, [rsp+%4+ 4*32] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + paddw %3, [pw_8] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8 + psraw %1, %3, 4 ; (p7*7 + p6*2 + p5 + .. 
+ p0 + q0 + 8) >> 4 +%endmacro + +%macro TRANSPOSE16x16B 17 + mova %17, m%16 + SBUTTERFLY bw, %1, %2, %16 + SBUTTERFLY bw, %3, %4, %16 + SBUTTERFLY bw, %5, %6, %16 + SBUTTERFLY bw, %7, %8, %16 + SBUTTERFLY bw, %9, %10, %16 + SBUTTERFLY bw, %11, %12, %16 + SBUTTERFLY bw, %13, %14, %16 + mova m%16, %17 + mova %17, m%14 + SBUTTERFLY bw, %15, %16, %14 + SBUTTERFLY wd, %1, %3, %14 + SBUTTERFLY wd, %2, %4, %14 + SBUTTERFLY wd, %5, %7, %14 + SBUTTERFLY wd, %6, %8, %14 + SBUTTERFLY wd, %9, %11, %14 + SBUTTERFLY wd, %10, %12, %14 + SBUTTERFLY wd, %13, %15, %14 + mova m%14, %17 + mova %17, m%12 + SBUTTERFLY wd, %14, %16, %12 + SBUTTERFLY dq, %1, %5, %12 + SBUTTERFLY dq, %2, %6, %12 + SBUTTERFLY dq, %3, %7, %12 + SBUTTERFLY dq, %4, %8, %12 + SBUTTERFLY dq, %9, %13, %12 + SBUTTERFLY dq, %10, %14, %12 + SBUTTERFLY dq, %11, %15, %12 + mova m%12, %17 + mova %17, m%8 + SBUTTERFLY dq, %12, %16, %8 + SBUTTERFLY qdq, %1, %9, %8 + SBUTTERFLY qdq, %2, %10, %8 + SBUTTERFLY qdq, %3, %11, %8 + SBUTTERFLY qdq, %4, %12, %8 + SBUTTERFLY qdq, %5, %13, %8 + SBUTTERFLY qdq, %6, %14, %8 + SBUTTERFLY qdq, %7, %15, %8 + mova m%8, %17 + mova %17, m%1 + SBUTTERFLY qdq, %8, %16, %1 + mova m%1, %17 + SWAP %2, %9 + SWAP %3, %5 + SWAP %4, %13 + SWAP %6, %11 + SWAP %8, %15 + SWAP %12, %14 +%endmacro + +%macro TRANSPOSE8x8B 13 + SBUTTERFLY bw, %1, %2, %7 + movdq%10 m%7, %9 + movdqa %11, m%2 + SBUTTERFLY bw, %3, %4, %2 + SBUTTERFLY bw, %5, %6, %2 + SBUTTERFLY bw, %7, %8, %2 + SBUTTERFLY wd, %1, %3, %2 + movdqa m%2, %11 + movdqa %11, m%3 + SBUTTERFLY wd, %2, %4, %3 + SBUTTERFLY wd, %5, %7, %3 + SBUTTERFLY wd, %6, %8, %3 + SBUTTERFLY dq, %1, %5, %3 + SBUTTERFLY dq, %2, %6, %3 + movdqa m%3, %11 + movh %12, m%2 + movhps %13, m%2 + SBUTTERFLY dq, %3, %7, %2 + SBUTTERFLY dq, %4, %8, %2 + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +%macro DEFINE_REAL_P7_TO_Q7 0-1 0 +%define P7 dstq + 4*mstrideq + %1 +%define P6 dstq + mstride3q + %1 +%define P5 dstq + 2*mstrideq + %1 +%define P4 dstq + mstrideq + %1 +%define P3 dstq + %1 +%define P2 dstq + strideq + %1 +%define P1 dstq + 2* strideq + %1 +%define P0 dstq + stride3q + %1 +%define Q0 dstq + 4* strideq + %1 +%define Q1 dst2q + mstride3q + %1 +%define Q2 dst2q + 2*mstrideq + %1 +%define Q3 dst2q + mstrideq + %1 +%define Q4 dst2q + %1 +%define Q5 dst2q + strideq + %1 +%define Q6 dst2q + 2* strideq + %1 +%define Q7 dst2q + stride3q + %1 +%endmacro + +%macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0 +%define P3 rsp + 0 + %1 +%define P2 rsp + 16 + %1 +%define P1 rsp + 32 + %1 +%define P0 rsp + 48 + %1 +%define Q0 rsp + 64 + %1 +%define Q1 rsp + 80 + %1 +%define Q2 rsp + 96 + %1 +%define Q3 rsp + 112 + %1 +%define P7 rsp + 128 + %1 +%define P6 rsp + 144 + %1 +%define P5 rsp + 160 + %1 +%define P4 rsp + 176 + %1 +%define Q4 rsp + 192 + %1 +%define Q5 rsp + 208 + %1 +%define Q6 rsp + 224 + %1 +%define Q7 rsp + 240 + %1 +%endmacro + +; ..............AB -> AAAAAAAABBBBBBBB +%macro SPLATB_MIX 1-2 [mask_mix] +%if cpuflag(ssse3) + pshufb %1, %2 +%else + punpcklbw %1, %1 + punpcklwd %1, %1 + punpckldq %1, %1 +%endif +%endmacro + +%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=32bit stack only +%if UNIX64 +cglobal vp9_loop_filter_%1_%2_16, 5, 9, 16, %3 + %4, dst, stride, E, I, H, mstride, dst2, stride3, mstride3 +%else +%if WIN64 +cglobal vp9_loop_filter_%1_%2_16, 4, 8, 16, %3 + %4, dst, stride, E, I, mstride, dst2, stride3, mstride3 +%else +cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, dst2, stride3, mstride3 +%define Ed dword r2m +%define Id dword r3m +%endif +%define Hd dword 
r4m +%endif + + mov mstrideq, strideq + neg mstrideq + + lea stride3q, [strideq*3] + lea mstride3q, [mstrideq*3] + +%ifidn %1, h +%if %2 > 16 +%define movx movh + lea dstq, [dstq + 4*strideq - 4] +%else +%define movx movu + lea dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos) +%endif + lea dst2q, [dstq + 8*strideq] +%else + lea dstq, [dstq + 4*mstrideq] + lea dst2q, [dstq + 8*strideq] +%endif + + DEFINE_REAL_P7_TO_Q7 + +%ifidn %1, h + movx m0, [P7] + movx m1, [P6] + movx m2, [P5] + movx m3, [P4] + movx m4, [P3] + movx m5, [P2] +%if ARCH_X86_64 || %2 != 16 + movx m6, [P1] +%endif + movx m7, [P0] +%if ARCH_X86_64 + movx m8, [Q0] + movx m9, [Q1] + movx m10, [Q2] + movx m11, [Q3] + movx m12, [Q4] + movx m13, [Q5] + movx m14, [Q6] + movx m15, [Q7] + DEFINE_TRANSPOSED_P7_TO_Q7 +%if %2 == 16 + TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] + mova [P7], m0 + mova [P6], m1 + mova [P5], m2 + mova [P4], m3 +%else ; %2 == 44/48/84/88 + ; 8x16 transpose + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 + punpcklbw m8, m9 + punpcklbw m10, m11 + punpcklbw m12, m13 + punpcklbw m14, m15 + TRANSPOSE8x8W 0, 2, 4, 6, 8, 10, 12, 14, 15 + SWAP 0, 4 + SWAP 2, 5 + SWAP 0, 6 + SWAP 0, 7 + SWAP 10, 9 + SWAP 12, 10 + SWAP 14, 11 +%endif ; %2 + mova [P3], m4 + mova [P2], m5 + mova [P1], m6 + mova [P0], m7 + mova [Q0], m8 + mova [Q1], m9 + mova [Q2], m10 + mova [Q3], m11 +%if %2 == 16 + mova [Q4], m12 + mova [Q5], m13 + mova [Q6], m14 + mova [Q7], m15 +%endif ; %2 +%else ; x86-32 +%if %2 == 16 + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [P1], u, [rsp+%3+%4], [rsp+64], [rsp+80] + DEFINE_TRANSPOSED_P7_TO_Q7 + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 + DEFINE_REAL_P7_TO_Q7 + movx m0, [Q0] + movx m1, [Q1] + movx m2, [Q2] + movx m3, [Q3] + movx m4, [Q4] + movx m5, [Q5] + movx m7, [Q7] + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [Q6], u, [rsp+%3+%4], [rsp+72], [rsp+88] + DEFINE_TRANSPOSED_P7_TO_Q7 8 + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 + DEFINE_TRANSPOSED_P7_TO_Q7 +%else ; %2 == 44/48/84/88 + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 + movx m1, [Q0] + movx m3, [Q1] + movx m5, [Q2] + movx m7, [Q3] + punpcklbw m1, m3 + punpcklbw m5, m7 + movx m3, [Q4] + movx m7, [Q5] + punpcklbw m3, m7 + mova [rsp], m3 + movx m3, [Q6] + movx m7, [Q7] + punpcklbw m3, m7 + DEFINE_TRANSPOSED_P7_TO_Q7 + TRANSPOSE8x8W 0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1 + mova [P3], m0 + mova [P2], m2 + mova [P1], m4 + mova [P0], m6 + mova [Q1], m5 + mova [Q2], m7 + mova [Q3], m3 +%endif ; %2 +%endif ; x86-32/64 +%endif ; %1 == h + + ; calc fm mask +%if %2 == 16 +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m2, I, m0 ; I I I I ... + SPLATB_REG m3, E, m0 ; E E E E ... 
+%else +%if cpuflag(ssse3) + mova m0, [mask_mix] +%endif + movd m2, Id + movd m3, Ed + SPLATB_MIX m2, m0 + SPLATB_MIX m3, m0 +%endif + mova m0, [pb_80] + pxor m2, m0 + pxor m3, m0 +%if ARCH_X86_64 +%ifidn %1, v + mova m8, [P3] + mova m9, [P2] + mova m10, [P1] + mova m11, [P0] + mova m12, [Q0] + mova m13, [Q1] + mova m14, [Q2] + mova m15, [Q3] +%else + ; In case of horizontal, P3..Q3 are already present in some registers due + ; to the previous transpose, so we just swap registers. + SWAP 8, 4, 12 + SWAP 9, 5, 13 + SWAP 10, 6, 14 + SWAP 11, 7, 15 +%endif +%define rp3 m8 +%define rp2 m9 +%define rp1 m10 +%define rp0 m11 +%define rq0 m12 +%define rq1 m13 +%define rq2 m14 +%define rq3 m15 +%else +%define rp3 [P3] +%define rp2 [P2] +%define rp1 [P1] +%define rp0 [P0] +%define rq0 [Q0] +%define rq1 [Q1] +%define rq2 [Q2] +%define rq3 [Q3] +%endif + ABSSUB_GT m5, rp3, rp2, m2, m7, m0 ; m5 = abs(p3-p2) <= I + ABSSUB_GT m1, rp2, rp1, m2, m7, m0 ; m1 = abs(p2-p1) <= I + por m5, m1 + ABSSUB_GT m1, rp1, rp0, m2, m7, m0 ; m1 = abs(p1-p0) <= I + por m5, m1 + ABSSUB_GT m1, rq0, rq1, m2, m7, m0 ; m1 = abs(q1-q0) <= I + por m5, m1 + ABSSUB_GT m1, rq1, rq2, m2, m7, m0 ; m1 = abs(q2-q1) <= I + por m5, m1 + ABSSUB_GT m1, rq2, rq3, m2, m7, m0 ; m1 = abs(q3-q2) <= I + por m5, m1 + ABSSUB m1, rp0, rq0, m7 ; abs(p0-q0) + paddusb m1, m1 ; abs(p0-q0) * 2 + ABSSUB m2, rp1, rq1, m7 ; abs(p1-q1) + pand m2, [pb_fe] ; drop lsb so shift can work + psrlq m2, 1 ; abs(p1-q1)/2 + paddusb m1, m2 ; abs(p0-q0)*2 + abs(p1-q1)/2 + pxor m1, m0 + pcmpgtb m1, m3 + por m1, m5 ; fm final value + SWAP 1, 3 + pxor m3, [pb_ff] + + ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3) + ; calc flat8in (if not 44_16) and hev masks +%if %2 != 44 + mova m6, [pb_81] ; [1 1 1 1 ...] ^ 0x80 + ABSSUB_GT m2, rp3, rp0, m6, m5 ; abs(p3 - p0) <= 1 +%if ARCH_X86_64 + mova m8, [pb_80] +%define rb80 m8 +%else +%define rb80 [pb_80] +%endif + ABSSUB_GT m1, rp2, rp0, m6, m5, rb80 ; abs(p2 - p0) <= 1 + por m2, m1 + ABSSUB m4, rp1, rp0, m5 ; abs(p1 - p0) +%if %2 == 16 +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m7, H, m0 ; H H H H ... 
+%else + movd m7, Hd + SPLATB_MIX m7 +%endif + pxor m7, rb80 + pxor m4, rb80 + pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition) + CMP_GT m4, m6 ; abs(p1 - p0) <= 1 + por m2, m4 ; (flat8in) + ABSSUB m4, rq1, rq0, m1 ; abs(q1 - q0) + pxor m4, rb80 + pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition) + por m0, m5 ; hev final value + CMP_GT m4, m6 ; abs(q1 - q0) <= 1 + por m2, m4 ; (flat8in) + ABSSUB_GT m1, rq2, rq0, m6, m5, rb80 ; abs(q2 - q0) <= 1 + por m2, m1 + ABSSUB_GT m1, rq3, rq0, m6, m5, rb80 ; abs(q3 - q0) <= 1 + por m2, m1 ; flat8in final value + pxor m2, [pb_ff] +%if %2 == 84 || %2 == 48 + pand m2, [mask_mix%2] +%endif +%else + mova m6, [pb_80] + movd m7, Hd + SPLATB_MIX m7 + pxor m7, m6 + ABSSUB m4, rp1, rp0, m1 ; abs(p1 - p0) + pxor m4, m6 + pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition) + ABSSUB m4, rq1, rq0, m1 ; abs(q1 - q0) + pxor m4, m6 + pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition) + por m0, m5 ; hev final value +%endif + +%if %2 == 16 + ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3) + ; calc flat8out mask +%if ARCH_X86_64 + mova m8, [P7] + mova m9, [P6] +%define rp7 m8 +%define rp6 m9 +%else +%define rp7 [P7] +%define rp6 [P6] +%endif + ABSSUB_GT m1, rp7, rp0, m6, m5 ; abs(p7 - p0) <= 1 + ABSSUB_GT m7, rp6, rp0, m6, m5 ; abs(p6 - p0) <= 1 + por m1, m7 +%if ARCH_X86_64 + mova m8, [P5] + mova m9, [P4] +%define rp5 m8 +%define rp4 m9 +%else +%define rp5 [P5] +%define rp4 [P4] +%endif + ABSSUB_GT m7, rp5, rp0, m6, m5 ; abs(p5 - p0) <= 1 + por m1, m7 + ABSSUB_GT m7, rp4, rp0, m6, m5 ; abs(p4 - p0) <= 1 + por m1, m7 +%if ARCH_X86_64 + mova m14, [Q4] + mova m15, [Q5] +%define rq4 m14 +%define rq5 m15 +%else +%define rq4 [Q4] +%define rq5 [Q5] +%endif + ABSSUB_GT m7, rq4, rq0, m6, m5 ; abs(q4 - q0) <= 1 + por m1, m7 + ABSSUB_GT m7, rq5, rq0, m6, m5 ; abs(q5 - q0) <= 1 + por m1, m7 +%if ARCH_X86_64 + mova m14, [Q6] + mova m15, [Q7] +%define rq6 m14 +%define rq7 m15 +%else +%define rq6 [Q6] +%define rq7 [Q7] +%endif + ABSSUB_GT m7, rq6, rq0, m6, m5 ; abs(q4 - q0) <= 1 + por m1, m7 + ABSSUB_GT m7, rq7, rq0, m6, m5 ; abs(q5 - q0) <= 1 + por m1, m7 ; flat8out final value + pxor m1, [pb_ff] +%endif + + ; if (fm) { + ; if (out && in) filter_14() + ; else if (in) filter_6() + ; else if (hev) filter_2() + ; else filter_4() + ; } + ; + ; f14: fm & out & in + ; f6: fm & ~f14 & in => fm & ~(out & in) & in => fm & ~out & in + ; f2: fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev => fm & ~in & hev + ; f4: fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm & ~in & ~hev + + ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7) + ; filter2() +%if %2 != 44 + mova m6, [pb_80] ; already in m6 if 44_16 + SCRATCH 2, 15, rsp+%3+%4 +%if %2 == 16 + SCRATCH 1, 8, rsp+%3+%4+16 +%endif +%endif + pxor m2, m6, rq0 ; q0 ^ 0x80 + pxor m4, m6, rp0 ; p0 ^ 0x80 + psubsb m2, m4 ; (signed) q0 - p0 + pxor m4, m6, rp1 ; p1 ^ 0x80 + pxor m5, m6, rq1 ; q1 ^ 0x80 + psubsb m4, m5 ; (signed) p1 - q1 + paddsb m4, m2 ; (q0 - p0) + (p1 - q1) + paddsb m4, m2 ; 2*(q0 - p0) + (p1 - q1) + paddsb m4, m2 ; 3*(q0 - p0) + (p1 - q1) + paddsb m6, m4, [pb_4] ; m6: f1 = clip(f + 4, 127) + paddsb m4, [pb_3] ; m4: f2 = clip(f + 3, 127) +%if ARCH_X86_64 + mova m14, [pb_10] ; will be reused in filter4() +%define rb10 m14 +%else +%define rb10 [pb_10] +%endif + SRSHIFT3B_2X m6, m4, rb10, m7 ; f1 and f2 sign byte shift by 3 + SIGN_SUB m7, rq0, m6, m5 ; m7 = q0 - f1 + SIGN_ADD m1, rp0, m4, m5 ; m1 = p0 + f2 
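+    ; In C terms, the narrow filter computed here (filter2) and just below
+    ; (filter4) is roughly the following, working on pixel ^ 0x80 read as
+    ; int8_t; clamp8() stands in for the signed saturation, and the fm/hev
+    ; mask application done with MASK_APPLY is omitted:
+    ;
+    ;   int f  = hev ? clamp8(p1 - q1) : 0;
+    ;   f      = clamp8(f + 3 * (q0 - p0));
+    ;   int f1 = clamp8(f + 4) >> 3;
+    ;   int f2 = clamp8(f + 3) >> 3;
+    ;   q0     = clamp8(q0 - f1);
+    ;   p0     = clamp8(p0 + f2);
+    ;   if (!hev) {                  // filter4 additionally adjusts p1/q1
+    ;       f  = (f1 + 1) >> 1;
+    ;       q1 = clamp8(q1 - f);
+    ;       p1 = clamp8(p1 + f);
+    ;   }
+    ;
+    ; The code stays in the unsigned domain instead and emulates the signed
+    ; clipped add/sub with the SIGN_ADD/SIGN_SUB saturation tricks above.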
+%if %2 != 44 +%if ARCH_X86_64 + pandn m6, m15, m3 ; ~mask(in) & mask(fm) +%else + mova m6, [rsp+%3+%4] + pandn m6, m3 +%endif + pand m6, m0 ; (~mask(in) & mask(fm)) & mask(hev) +%else + pand m6, m3, m0 +%endif + MASK_APPLY m7, rq0, m6, m5 ; m7 = filter2(q0) & mask / we write it in filter4() + MASK_APPLY m1, rp0, m6, m5 ; m1 = filter2(p0) & mask / we write it in filter4() + + ; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in], ) + ; filter4() + mova m4, m2 + paddsb m2, m4 ; 2 * (q0 - p0) + paddsb m2, m4 ; 3 * (q0 - p0) + paddsb m6, m2, [pb_4] ; m6: f1 = clip(f + 4, 127) + paddsb m2, [pb_3] ; m2: f2 = clip(f + 3, 127) + SRSHIFT3B_2X m6, m2, rb10, m4 ; f1 and f2 sign byte shift by 3 +%if %2 != 44 +%if ARCH_X86_64 + pandn m5, m15, m3 ; ~mask(in) & mask(fm) +%else + mova m5, [rsp+%3+%4] + pandn m5, m3 +%endif + pandn m0, m5 ; ~mask(hev) & (~mask(in) & mask(fm)) +%else + pandn m0, m3 +%endif + SIGN_SUB m5, rq0, m6, m4 ; q0 - f1 + MASK_APPLY m5, m7, m0, m4 ; filter4(q0) & mask + mova [Q0], m5 + SIGN_ADD m7, rp0, m2, m4 ; p0 + f2 + MASK_APPLY m7, m1, m0, m4 ; filter4(p0) & mask + mova [P0], m7 + paddb m6, [pb_80] ; + pxor m1, m1 ; f=(f1+1)>>1 + pavgb m6, m1 ; + psubb m6, [pb_40] ; + SIGN_ADD m1, rp1, m6, m2 ; p1 + f + SIGN_SUB m4, rq1, m6, m2 ; q1 - f + MASK_APPLY m1, rp1, m0, m2 ; m1 = filter4(p1) + MASK_APPLY m4, rq1, m0, m2 ; m4 = filter4(q1) + mova [P1], m1 + mova [Q1], m4 + +%if %2 != 44 + UNSCRATCH 2, 15, rsp+%3+%4 +%endif + + ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1) + ; filter6() +%if %2 != 44 + pxor m0, m0 +%if %2 > 16 + pand m3, m2 +%else + pand m2, m3 ; mask(fm) & mask(in) +%if ARCH_X86_64 + pandn m3, m8, m2 ; ~mask(out) & (mask(fm) & mask(in)) +%else + mova m3, [rsp+%3+%4+16] + pandn m3, m2 +%endif +%endif +%if ARCH_X86_64 + mova m14, [P3] + mova m9, [Q3] +%define rp3 m14 +%define rq3 m9 +%else +%define rp3 [P3] +%define rq3 [Q3] +%endif + mova m1, [P2] + FILTER_INIT m4, m5, m6, m7, [P2], %4, 6, m3, m1 ; [p2] + mova m1, [Q2] + FILTER_UPDATE m4, m5, m6, m7, [P1], %4, 0, 1, 2, 5, 3, m3, "", rq1, "", 1 ; [p1] -p3 -p2 +p1 +q1 + FILTER_UPDATE m4, m5, m6, m7, [P0], %4, 0, 2, 3, 6, 3, m3, "", m1 ; [p0] -p3 -p1 +p0 +q2 + FILTER_UPDATE m4, m5, m6, m7, [Q0], %4, 0, 3, 4, 7, 3, m3, "", rq3, "", 1 ; [q0] -p3 -p0 +q0 +q3 + FILTER_UPDATE m4, m5, m6, m7, [Q1], %4, 1, 4, 5, 7, 3, m3, "" ; [q1] -p2 -q0 +q1 +q3 + FILTER_UPDATE m4, m5, m6, m7, [Q2], %4, 2, 5, 6, 7, 3, m3, m1 ; [q2] -p1 -q1 +q2 +q3 +%endif + +%if %2 == 16 + UNSCRATCH 1, 8, rsp+%3+%4+16 +%endif + + ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2) + ; filter14() + ; + ; m2 m3 m8 m9 m14 m15 m10 m11 m12 m13 + ; + ; q2 q3 p3 p2 p1 p0 q0 q1 + ; p6 -7 p7 p6 p5 p4 . . . . . + ; p5 -6 -p7 -p6 +p5 +q1 . . . . + ; p4 -5 -p7 -p5 +p4 +q2 . . . q2 + ; p3 -4 -p7 -p4 +p3 +q3 . . . q3 + ; p2 -3 -p7 -p3 +p2 +q4 . . . q4 + ; p1 -2 -p7 -p2 +p1 +q5 . . . q5 + ; p0 -1 -p7 -p1 +p0 +q6 . . . q6 + ; q0 +0 -p7 -p0 +q0 +q7 . . . q7 + ; q1 +1 -p6 -q0 +q1 +q7 q1 . . . + ; q2 +2 -p5 -q1 +q2 +q7 . q2 . . + ; q3 +3 -p4 -q2 +q3 +q7 . q3 . . + ; q4 +4 -p3 -q3 +q4 +q7 . q4 . . + ; q5 +5 -p2 -q4 +q5 +q7 . q5 . . + ; q6 +6 -p1 -q5 +q6 +q7 . q6 . . 
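+    ; As a rough C reference for the table above, with x[0..15] = p7..q7 and
+    ; out[] the 14 filtered pixels (per-pixel masking with fm/in/out omitted):
+    ;
+    ;   for (i = 1; i <= 14; i++) {            // outputs p6 .. q6
+    ;       int t, sum = x[i] + 8;             // centre tap + rounding bias
+    ;       for (t = i - 7; t <= i + 7; t++)   // 15 neighbours, edges replicated
+    ;           sum += x[t < 0 ? 0 : (t > 15 ? 15 : t)];
+    ;       out[i - 1] = sum >> 4;             // 16 taps in total
+    ;   }
+    ;
+    ; Rather than re-summing 16 taps per pixel, the code below keeps a single
+    ; running sum and applies the -/-/+/+ updates listed in the table, so each
+    ; output costs two subtracts and two adds.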
+ +%if %2 == 16 + pand m1, m2 ; mask(out) & (mask(fm) & mask(in)) + mova m2, [P7] + mova m3, [P6] +%if ARCH_X86_64 + mova m8, [P5] + mova m9, [P4] +%define rp5 m8 +%define rp4 m9 +%define rp5s m8 +%define rp4s m9 +%define rp3s m14 +%define rq4 m8 +%define rq5 m9 +%define rq6 m14 +%define rq7 m15 +%define rq4s m8 +%define rq5s m9 +%define rq6s m14 +%else +%define rp5 [P5] +%define rp4 [P4] +%define rp5s "" +%define rp4s "" +%define rp3s "" +%define rq4 [Q4] +%define rq5 [Q5] +%define rq6 [Q6] +%define rq7 [Q7] +%define rq4s "" +%define rq5s "" +%define rq6s "" +%endif + FILTER_INIT m4, m5, m6, m7, [P6], %4, 14, m1, m3 ; [p6] + FILTER_UPDATE m4, m5, m6, m7, [P5], %4, 8, 9, 10, 5, 4, m1, rp5s ; [p5] -p7 -p6 +p5 +q1 + FILTER_UPDATE m4, m5, m6, m7, [P4], %4, 8, 10, 11, 6, 4, m1, rp4s ; [p4] -p7 -p5 +p4 +q2 + FILTER_UPDATE m4, m5, m6, m7, [P3], %4, 8, 11, 0, 7, 4, m1, rp3s ; [p3] -p7 -p4 +p3 +q3 + FILTER_UPDATE m4, m5, m6, m7, [P2], %4, 8, 0, 1, 12, 4, m1, "", rq4, [Q4], 1 ; [p2] -p7 -p3 +p2 +q4 + FILTER_UPDATE m4, m5, m6, m7, [P1], %4, 8, 1, 2, 13, 4, m1, "", rq5, [Q5], 1 ; [p1] -p7 -p2 +p1 +q5 + FILTER_UPDATE m4, m5, m6, m7, [P0], %4, 8, 2, 3, 14, 4, m1, "", rq6, [Q6], 1 ; [p0] -p7 -p1 +p0 +q6 + FILTER_UPDATE m4, m5, m6, m7, [Q0], %4, 8, 3, 4, 15, 4, m1, "", rq7, [Q7], 1 ; [q0] -p7 -p0 +q0 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q1], %4, 9, 4, 5, 15, 4, m1, "" ; [q1] -p6 -q0 +q1 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q2], %4, 10, 5, 6, 15, 4, m1, "" ; [q2] -p5 -q1 +q2 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q3], %4, 11, 6, 7, 15, 4, m1, "" ; [q3] -p4 -q2 +q3 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q4], %4, 0, 7, 12, 15, 4, m1, rq4s ; [q4] -p3 -q3 +q4 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q5], %4, 1, 12, 13, 15, 4, m1, rq5s ; [q5] -p2 -q4 +q5 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q6], %4, 2, 13, 14, 15, 4, m1, rq6s ; [q6] -p1 -q5 +q6 +q7 +%endif + +%ifidn %1, h +%if %2 == 16 + mova m0, [P7] + mova m1, [P6] + mova m2, [P5] + mova m3, [P4] + mova m4, [P3] + mova m5, [P2] +%if ARCH_X86_64 + mova m6, [P1] +%endif + mova m7, [P0] +%if ARCH_X86_64 + mova m8, [Q0] + mova m9, [Q1] + mova m10, [Q2] + mova m11, [Q3] + mova m12, [Q4] + mova m13, [Q5] + mova m14, [Q6] + mova m15, [Q7] + TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] + DEFINE_REAL_P7_TO_Q7 + movu [P7], m0 + movu [P6], m1 + movu [P5], m2 + movu [P4], m3 + movu [P3], m4 + movu [P2], m5 + movu [P1], m6 + movu [P0], m7 + movu [Q0], m8 + movu [Q1], m9 + movu [Q2], m10 + movu [Q3], m11 + movu [Q4], m12 + movu [Q5], m13 + movu [Q6], m14 + movu [Q7], m15 +%else + DEFINE_REAL_P7_TO_Q7 + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+32], a, [rsp+%3+%4], [Q0], [Q1] + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 + DEFINE_TRANSPOSED_P7_TO_Q7 + mova m0, [Q0] + mova m1, [Q1] + mova m2, [Q2] + mova m3, [Q3] + mova m4, [Q4] + mova m5, [Q5] + mova m7, [Q7] + DEFINE_REAL_P7_TO_Q7 8 + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+224], a, [rsp+%3+%4], [Q0], [Q1] + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 +%endif +%elif %2 == 44 + SWAP 0, 1 ; m0 = p1 + SWAP 1, 7 ; m1 = p0 + SWAP 2, 5 ; m2 = q0 + SWAP 3, 4 ; m3 = q1 + DEFINE_REAL_P7_TO_Q7 2 + SBUTTERFLY bw, 0, 1, 4 + 
SBUTTERFLY bw, 2, 3, 4 + SBUTTERFLY wd, 0, 2, 4 + SBUTTERFLY wd, 1, 3, 4 + movd [P7], m0 + movd [P3], m2 + movd [Q0], m1 + movd [Q4], m3 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P6], m0 + movd [P2], m2 + movd [Q1], m1 + movd [Q5], m3 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P5], m0 + movd [P1], m2 + movd [Q2], m1 + movd [Q6], m3 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P4], m0 + movd [P0], m2 + movd [Q3], m1 + movd [Q7], m3 +%else + ; the following code do a transpose of 8 full lines to 16 half + ; lines (high part). It is inlined to avoid the need of a staging area + mova m0, [P3] + mova m1, [P2] + mova m2, [P1] + mova m3, [P0] + mova m4, [Q0] + mova m5, [Q1] +%if ARCH_X86_64 + mova m6, [Q2] +%endif + mova m7, [Q3] + DEFINE_REAL_P7_TO_Q7 +%if ARCH_X86_64 + SBUTTERFLY bw, 0, 1, 8 + SBUTTERFLY bw, 2, 3, 8 + SBUTTERFLY bw, 4, 5, 8 + SBUTTERFLY bw, 6, 7, 8 + SBUTTERFLY wd, 0, 2, 8 + SBUTTERFLY wd, 1, 3, 8 + SBUTTERFLY wd, 4, 6, 8 + SBUTTERFLY wd, 5, 7, 8 + SBUTTERFLY dq, 0, 4, 8 + SBUTTERFLY dq, 1, 5, 8 + SBUTTERFLY dq, 2, 6, 8 + SBUTTERFLY dq, 3, 7, 8 +%else + SBUTTERFLY bw, 0, 1, 6 + mova [rsp+64], m1 + mova m6, [rsp+96] + SBUTTERFLY bw, 2, 3, 1 + SBUTTERFLY bw, 4, 5, 1 + SBUTTERFLY bw, 6, 7, 1 + SBUTTERFLY wd, 0, 2, 1 + mova [rsp+96], m2 + mova m1, [rsp+64] + SBUTTERFLY wd, 1, 3, 2 + SBUTTERFLY wd, 4, 6, 2 + SBUTTERFLY wd, 5, 7, 2 + SBUTTERFLY dq, 0, 4, 2 + SBUTTERFLY dq, 1, 5, 2 + movh [Q0], m1 + movhps [Q1], m1 + mova m2, [rsp+96] + SBUTTERFLY dq, 2, 6, 1 + SBUTTERFLY dq, 3, 7, 1 +%endif + SWAP 3, 6 + SWAP 1, 4 + movh [P7], m0 + movhps [P6], m0 + movh [P5], m1 + movhps [P4], m1 + movh [P3], m2 + movhps [P2], m2 + movh [P1], m3 + movhps [P0], m3 +%if ARCH_X86_64 + movh [Q0], m4 + movhps [Q1], m4 +%endif + movh [Q2], m5 + movhps [Q3], m5 + movh [Q4], m6 + movhps [Q5], m6 + movh [Q6], m7 + movhps [Q7], m7 +%endif +%endif + + RET +%endmacro + +%macro LPF_16_VH 5 +INIT_XMM %5 +LOOPFILTER v, %1, %2, 0, %4 +LOOPFILTER h, %1, %2, %3, %4 +%endmacro + +%macro LPF_16_VH_ALL_OPTS 4 +LPF_16_VH %1, %2, %3, %4, sse2 +LPF_16_VH %1, %2, %3, %4, ssse3 +LPF_16_VH %1, %2, %3, %4, avx +%endmacro + +LPF_16_VH_ALL_OPTS 16, 512, 256, 32 +LPF_16_VH_ALL_OPTS 44, 0, 128, 0 +LPF_16_VH_ALL_OPTS 48, 256, 128, 16 +LPF_16_VH_ALL_OPTS 84, 256, 128, 16 +LPF_16_VH_ALL_OPTS 88, 256, 128, 16 diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm new file mode 100644 index 0000000000..53939579fc --- /dev/null +++ b/libavcodec/x86/vp9mc.asm @@ -0,0 +1,623 @@ +;****************************************************************************** +;* VP9 MC SIMD optimizations +;* +;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +cextern pw_256 +cextern pw_64 + +%macro F8_SSSE3_TAPS 8 +times 16 db %1, %2 +times 16 db %3, %4 +times 16 db %5, %6 +times 16 db %7, %8 +%endmacro + +%macro F8_SSE2_TAPS 8 +times 8 dw %1 +times 8 dw %2 +times 8 dw %3 +times 8 dw %4 +times 8 dw %5 +times 8 dw %6 +times 8 dw %7 +times 8 dw %8 +%endmacro + +%macro FILTER 1 +const filters_%1 ; smooth + F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0 + F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0 + F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0 + F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0 + F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0 + F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0 + F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1 + F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1 + F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1 + F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1 + F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2 + F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2 + F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2 + F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2 + F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3 + ; regular + F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0 + F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0 + F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1 + F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1 + F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1 + F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1 + F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1 + F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1 + F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1 + F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1 + F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1 + F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1 + F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1 + F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1 + F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0 + ; sharp + F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0 + F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1 + F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2 + F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2 + F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3 + F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3 + F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4 + F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4 + F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4 + F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4 + F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4 + F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4 + F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3 + F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2 + F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1 +%endmacro + +%define F8_TAPS F8_SSSE3_TAPS +; int8_t ff_filters_ssse3[3][15][4][32] +FILTER ssse3 +%define F8_TAPS F8_SSE2_TAPS +; int16_t ff_filters_sse2[3][15][8][8] +FILTER sse2 + +SECTION .text + +%macro filter_sse2_h_fn 1 +%assign %%px mmsize/2 +cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 15, dst, dstride, src, sstride, h, filtery + pxor m5, m5 + mova m6, [pw_64] + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+ 16] + mova m9, [filteryq+ 32] + mova m10, [filteryq+ 48] + mova m11, [filteryq+ 64] + mova m12, [filteryq+ 80] + mova m13, [filteryq+ 96] + mova m14, [filteryq+112] +%endif +.loop: + movh m0, [srcq-3] + movh m1, [srcq-2] + movh m2, [srcq-1] + movh m3, [srcq+0] + movh m4, [srcq+1] + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m8 + pmullw m2, m9 + pmullw m3, m10 + pmullw m4, m11 +%else + pmullw m1, [filteryq+ 16] + pmullw m2, 
[filteryq+ 32] + pmullw m3, [filteryq+ 48] + pmullw m4, [filteryq+ 64] +%endif + paddw m0, m1 + paddw m2, m3 + paddw m0, m4 + movh m1, [srcq+2] + movh m3, [srcq+3] + movh m4, [srcq+4] + add srcq, sstrideq + punpcklbw m1, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m12 + pmullw m3, m13 + pmullw m4, m14 +%else + pmullw m1, [filteryq+ 80] + pmullw m3, [filteryq+ 96] + pmullw m4, [filteryq+112] +%endif + paddw m0, m1 + paddw m3, m4 + paddw m0, m6 + paddw m2, m3 + paddsw m0, m2 + psraw m0, 7 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX mmxext +filter_sse2_h_fn put +filter_sse2_h_fn avg + +INIT_XMM sse2 +filter_sse2_h_fn put +filter_sse2_h_fn avg + +%macro filter_h_fn 1 +%assign %%px mmsize/2 +cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery + mova m6, [pw_256] + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+32] + mova m9, [filteryq+64] + mova m10, [filteryq+96] +%endif +.loop: + movh m0, [srcq-3] + movh m1, [srcq-2] + movh m2, [srcq-1] + movh m3, [srcq+0] + movh m4, [srcq+1] + movh m5, [srcq+2] + punpcklbw m0, m1 + punpcklbw m2, m3 + movh m1, [srcq+3] + movh m3, [srcq+4] + add srcq, sstrideq + punpcklbw m4, m5 + punpcklbw m1, m3 + pmaddubsw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddubsw m2, m8 + pmaddubsw m4, m9 + pmaddubsw m1, m10 +%else + pmaddubsw m2, [filteryq+32] + pmaddubsw m4, [filteryq+64] + pmaddubsw m1, [filteryq+96] +%endif + paddw m0, m4 + paddw m2, m1 + paddsw m0, m2 + pmulhrsw m0, m6 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX ssse3 +filter_h_fn put +filter_h_fn avg + +INIT_XMM ssse3 +filter_h_fn put +filter_h_fn avg + +%if ARCH_X86_64 +%macro filter_hx2_fn 1 +%assign %%px mmsize +cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery + mova m13, [pw_256] + mova m8, [filteryq+ 0] + mova m9, [filteryq+32] + mova m10, [filteryq+64] + mova m11, [filteryq+96] +.loop: + movu m0, [srcq-3] + movu m1, [srcq-2] + movu m2, [srcq-1] + movu m3, [srcq+0] + movu m4, [srcq+1] + movu m5, [srcq+2] + movu m6, [srcq+3] + movu m7, [srcq+4] + add srcq, sstrideq + SBUTTERFLY bw, 0, 1, 12 + SBUTTERFLY bw, 2, 3, 12 + SBUTTERFLY bw, 4, 5, 12 + SBUTTERFLY bw, 6, 7, 12 + pmaddubsw m0, m8 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + paddsw m0, m2 + paddsw m1, m3 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 +%ifidn %1, avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +filter_hx2_fn put +filter_hx2_fn avg + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +filter_hx2_fn put +filter_hx2_fn avg +%endif + +%endif ; ARCH_X86_64 + +%macro filter_sse2_v_fn 1 +%assign %%px mmsize/2 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%define hd r4mp +%endif + pxor m5, m5 + mova m6, [pw_64] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m7, 
[filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+ 16] + mova m9, [filteryq+ 32] + mova m10, [filteryq+ 48] + mova m11, [filteryq+ 64] + mova m12, [filteryq+ 80] + mova m13, [filteryq+ 96] + mova m14, [filteryq+112] +%endif +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? + movh m0, [srcq] + movh m1, [srcq+sstrideq] + movh m2, [srcq+sstrideq*2] + movh m3, [srcq+sstride3q] + add srcq, sstrideq + movh m4, [src4q] + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m8 + pmullw m2, m9 + pmullw m3, m10 + pmullw m4, m11 +%else + pmullw m1, [filteryq+ 16] + pmullw m2, [filteryq+ 32] + pmullw m3, [filteryq+ 48] + pmullw m4, [filteryq+ 64] +%endif + paddw m0, m1 + paddw m2, m3 + paddw m0, m4 + movh m1, [src4q+sstrideq] + movh m3, [src4q+sstrideq*2] + movh m4, [src4q+sstride3q] + add src4q, sstrideq + punpcklbw m1, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m12 + pmullw m3, m13 + pmullw m4, m14 +%else + pmullw m1, [filteryq+ 80] + pmullw m3, [filteryq+ 96] + pmullw m4, [filteryq+112] +%endif + paddw m0, m1 + paddw m3, m4 + paddw m0, m6 + paddw m2, m3 + paddsw m0, m2 + psraw m0, 7 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX mmxext +filter_sse2_v_fn put +filter_sse2_v_fn avg + +INIT_XMM sse2 +filter_sse2_v_fn put +filter_sse2_v_fn avg + +%macro filter_v_fn 1 +%assign %%px mmsize/2 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%define hd r4mp +%endif + mova m6, [pw_256] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+32] + mova m9, [filteryq+64] + mova m10, [filteryq+96] +%endif +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? 
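+ ; In this ssse3 variant the rows are interleaved in adjacent pairs with
+ ; punpcklbw, so each pmaddubsw applies two of the eight filter taps at
+ ; once; since the taps sum to 128, the final pmulhrsw against pw_256
+ ; implements the (sum + 64) >> 7 rounding.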
+ movh m0, [srcq] + movh m1, [srcq+sstrideq] + movh m2, [srcq+sstrideq*2] + movh m3, [srcq+sstride3q] + movh m4, [src4q] + movh m5, [src4q+sstrideq] + punpcklbw m0, m1 + punpcklbw m2, m3 + movh m1, [src4q+sstrideq*2] + movh m3, [src4q+sstride3q] + add srcq, sstrideq + add src4q, sstrideq + punpcklbw m4, m5 + punpcklbw m1, m3 + pmaddubsw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddubsw m2, m8 + pmaddubsw m4, m9 + pmaddubsw m1, m10 +%else + pmaddubsw m2, [filteryq+32] + pmaddubsw m4, [filteryq+64] + pmaddubsw m1, [filteryq+96] +%endif + paddw m0, m4 + paddw m2, m1 + paddsw m0, m2 + pmulhrsw m0, m6 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX ssse3 +filter_v_fn put +filter_v_fn avg + +INIT_XMM ssse3 +filter_v_fn put +filter_v_fn avg + +%if ARCH_X86_64 + +%macro filter_vx2_fn 1 +%assign %%px mmsize +cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3 + mova m13, [pw_256] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m8, [filteryq+ 0] + mova m9, [filteryq+32] + mova m10, [filteryq+64] + mova m11, [filteryq+96] +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? + movu m0, [srcq] + movu m1, [srcq+sstrideq] + movu m2, [srcq+sstrideq*2] + movu m3, [srcq+sstride3q] + movu m4, [src4q] + movu m5, [src4q+sstrideq] + movu m6, [src4q+sstrideq*2] + movu m7, [src4q+sstride3q] + add srcq, sstrideq + add src4q, sstrideq + SBUTTERFLY bw, 0, 1, 12 + SBUTTERFLY bw, 2, 3, 12 + SBUTTERFLY bw, 4, 5, 12 + SBUTTERFLY bw, 6, 7, 12 + pmaddubsw m0, m8 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + paddsw m0, m2 + paddsw m1, m3 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 +%ifidn %1, avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +filter_vx2_fn put +filter_vx2_fn avg + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +filter_vx2_fn put +filter_vx2_fn avg +%endif + +%endif ; ARCH_X86_64 + +%macro fpel_fn 6 +%if %2 == 4 +%define %%srcfn movh +%define %%dstfn movh +%else +%define %%srcfn movu +%define %%dstfn mova +%endif + +%if %2 <= mmsize +cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3 + lea sstride3q, [sstrideq*3] + lea dstride3q, [dstrideq*3] +%else +cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h +%endif +.loop: + %%srcfn m0, [srcq] + %%srcfn m1, [srcq+s%3] + %%srcfn m2, [srcq+s%4] + %%srcfn m3, [srcq+s%5] + lea srcq, [srcq+sstrideq*%6] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+d%3] + pavgb m2, [dstq+d%4] + pavgb m3, [dstq+d%5] +%endif + %%dstfn [dstq], m0 + %%dstfn [dstq+d%3], m1 + %%dstfn [dstq+d%4], m2 + %%dstfn [dstq+d%5], m3 + lea dstq, [dstq+dstrideq*%6] + sub hd, %6 + jnz .loop + RET +%endmacro + +%define d16 16 +%define s16 16 +%define d32 32 +%define s32 32 +INIT_MMX mmx +fpel_fn put, 4, strideq, strideq*2, stride3q, 4 +fpel_fn put, 8, strideq, strideq*2, stride3q, 4 +INIT_MMX mmxext +fpel_fn avg, 4, strideq, strideq*2, stride3q, 4 +fpel_fn avg, 8, strideq, strideq*2, stride3q, 4 +INIT_XMM sse +fpel_fn put, 16, strideq, strideq*2, stride3q, 4 +fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2 +fpel_fn put, 64, 
mmsize, mmsize*2, mmsize*3, 1 +INIT_XMM sse2 +fpel_fn avg, 16, strideq, strideq*2, stride3q, 4 +fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2 +fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1 +INIT_YMM avx +fpel_fn put, 32, strideq, strideq*2, stride3q, 4 +fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +fpel_fn avg, 32, strideq, strideq*2, stride3q, 4 +fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2 +%endif +%undef s16 +%undef d16 +%undef s32 +%undef d32 diff --git a/libavcodec/x86/w64xmmtest.c b/libavcodec/x86/w64xmmtest.c index 2f064cad7b..25e833fef3 100644 --- a/libavcodec/x86/w64xmmtest.c +++ b/libavcodec/x86/w64xmmtest.c @@ -2,20 +2,20 @@ * check XMM registers for clobbers on Win64 * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -65,6 +65,13 @@ wrap(avcodec_encode_audio2(AVCodecContext *avctx, got_packet_ptr); } +wrap(avcodec_encode_video(AVCodecContext *avctx, + uint8_t *buf, int buf_size, + const AVFrame *pict)) +{ + testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict); +} + wrap(avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size, const AVSubtitle *sub)) diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm new file mode 100644 index 0000000000..0220885da6 --- /dev/null +++ b/libavcodec/x86/xvididct.asm @@ -0,0 +1,983 @@ +; XVID MPEG-4 VIDEO CODEC +; +; Conversion from gcc syntax to x264asm syntax with modifications +; by Christophe Gisquet <christophe.gisquet@gmail.com> +; +; =========== SSE2 inverse discrete cosine transform =========== +; +; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net> +; +; Conversion to gcc syntax with modifications +; by Alexander Strange <astrange@ithinksw.com> +; +; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. +; +; Vertical pass is an implementation of the scheme: +; Loeffler C., Ligtenberg A., and Moschytz C.S.: +; Practical Fast 1D DCT Algorithm with Eleven Multiplications, +; Proc. ICASSP 1989, 988-991. 
+; +; Horizontal pass is a double 4x4 vector/matrix multiplication, +; (see also Intel's Application Note 922: +; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm +; Copyright (C) 1999 Intel Corporation) +; +; More details at http://skal.planet-d.net/coding/dct.html +; +; ======= MMX and XMM forward discrete cosine transform ======= +; +; Copyright(C) 2001 Peter Ross <pross@xvid.org> +; +; Originally provided by Intel at AP-922 +; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm +; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm) +; but in a limited edition. +; New macro implements a column part for precise iDCT +; The routine precision now satisfies IEEE standard 1180-1990. +; +; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru> +; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org> +; +; http://www.elecard.com/peter/idct.html +; http://www.linuxvideo.org/mpeg2dec/ +; +; These examples contain code fragments for first stage iDCT 8x8 +; (for rows) and first stage DCT 8x8 (for columns) +; +; conversion to gcc syntax by Michael Niedermayer +; +; ====================================================================== +; +; This file is part of FFmpeg. +; +; FFmpeg is free software; you can redistribute it and/or +; modify it under the terms of the GNU Lesser General Public +; License as published by the Free Software Foundation; either +; version 2.1 of the License, or (at your option) any later version. +; +; FFmpeg is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; Lesser General Public License for more details. +; +; You should have received a copy of the GNU Lesser General Public License +; along with FFmpeg; if not, write to the Free Software Foundation, +; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA +; Similar to tg_1_16 in MMX code +tan1: times 8 dw 13036 +tan2: times 8 dw 27146 +tan3: times 8 dw 43790 +sqrt2: times 8 dw 23170 + +; SSE2 tables +iTab1: dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d + dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61 + dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7 + dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b +iTab2: dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5 + dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04 + dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41 + dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df +iTab3: dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf + dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf + dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d + dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 +iTab4: dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746 + dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac + dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df + dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e + +%if ARCH_X86_32 +; ----------------------------------------------------------------------------- +; +; The first stage iDCT 8x8 - inverse DCTs of rows +; +; ----------------------------------------------------------------------------- +; The 8-point inverse DCT 
direct algorithm +; ----------------------------------------------------------------------------- +; +; static const short w[32] = { +; FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16), +; FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16), +; FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16), +; FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16), +; FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16), +; FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16), +; FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16), +; FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) }; +; +; #define DCT_8_INV_ROW(x, y) +; { +; int a0, a1, a2, a3, b0, b1, b2, b3; +; +; a0 = x[0] * w[0] + x[2] * w[1] + x[4] * w[2] + x[6] * w[3]; +; a1 = x[0] * w[4] + x[2] * w[5] + x[4] * w[6] + x[6] * w[7]; +; a2 = x[0] * w[8] + x[2] * w[9] + x[4] * w[10] + x[6] * w[11]; +; a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15]; +; b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19]; +; b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23]; +; b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27]; +; b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31]; +; +; y[0] = SHIFT_ROUND(a0 + b0); +; y[1] = SHIFT_ROUND(a1 + b1); +; y[2] = SHIFT_ROUND(a2 + b2); +; y[3] = SHIFT_ROUND(a3 + b3); +; y[4] = SHIFT_ROUND(a3 - b3); +; y[5] = SHIFT_ROUND(a2 - b2); +; y[6] = SHIFT_ROUND(a1 - b1); +; y[7] = SHIFT_ROUND(a0 - b0); +; } +; +; ----------------------------------------------------------------------------- +; +; In this implementation the outputs of the iDCT-1D are multiplied +; for rows 0,4 - by cos_4_16, +; for rows 1,7 - by cos_1_16, +; for rows 2,6 - by cos_2_16, +; for rows 3,5 - by cos_3_16 +; and are shifted to the left for better accuracy. 
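+;
+; (Concretely, in the code below the row pass, DCT_8_INV_ROW, adds a
+; per-row rounder and shifts its 32-bit sums right by 11, while the
+; column pass, DCT_8_INV_COL, applies the remaining >> 6 before the
+; results are stored.)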
+;
+; For the constants used,
+; FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
+;
+; -----------------------------------------------------------------------------

+; -----------------------------------------------------------------------------
+; Tables for mmx processors
+; -----------------------------------------------------------------------------

+; Table for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_mmx: dw 16384, 16384, 16384, -16384
+ dw 21407, 8867, 8867, -21407 ; w07 w05 w03 w01
+ dw 16384, -16384, 16384, 16384 ; w14 w12 w10 w08
+ dw -8867, 21407, -21407, -8867 ; w15 w13 w11 w09
+ dw 22725, 12873, 19266, -22725 ; w22 w20 w18 w16
+ dw 19266, 4520, -4520, -12873 ; w23 w21 w19 w17
+ dw 12873, 4520, 4520, 19266 ; w30 w28 w26 w24
+ dw -22725, 19266, -12873, -22725 ; w31 w29 w27 w25
+; Table for rows 1,7 - constants are multiplied by cos_1_16
+ dw 22725, 22725, 22725, -22725 ; movq-> w06 w04 w02 w00
+ dw 29692, 12299, 12299, -29692 ; w07 w05 w03 w01
+ dw 22725, -22725, 22725, 22725 ; w14 w12 w10 w08
+ dw -12299, 29692, -29692, -12299 ; w15 w13 w11 w09
+ dw 31521, 17855, 26722, -31521 ; w22 w20 w18 w16
+ dw 26722, 6270, -6270, -17855 ; w23 w21 w19 w17
+ dw 17855, 6270, 6270, 26722 ; w30 w28 w26 w24
+ dw -31521, 26722, -17855, -31521 ; w31 w29 w27 w25
+; Table for rows 2,6 - constants are multiplied by cos_2_16
+ dw 21407, 21407, 21407, -21407 ; movq-> w06 w04 w02 w00
+ dw 27969, 11585, 11585, -27969 ; w07 w05 w03 w01
+ dw 21407, -21407, 21407, 21407 ; w14 w12 w10 w08
+ dw -11585, 27969, -27969, -11585 ; w15 w13 w11 w09
+ dw 29692, 16819, 25172, -29692 ; w22 w20 w18 w16
+ dw 25172, 5906, -5906, -16819 ; w23 w21 w19 w17
+ dw 16819, 5906, 5906, 25172 ; w30 w28 w26 w24
+ dw -29692, 25172, -16819, -29692 ; w31 w29 w27 w25
+; Table for rows 3,5 - constants are multiplied by cos_3_16
+ dw 19266, 19266, 19266, -19266 ; movq-> w06 w04 w02 w00
+ dw 25172, 10426, 10426, -25172 ; w07 w05 w03 w01
+ dw 19266, -19266, 19266, 19266 ; w14 w12 w10 w08
+ dw -10426, 25172, -25172, -10426 ; w15 w13 w11 w09
+ dw 26722, 15137, 22654, -26722 ; w22 w20 w18 w16
+ dw 22654, 5315, -5315, -15137 ; w23 w21 w19 w17
+ dw 15137, 5315, 5315, 22654 ; w30 w28 w26 w24
+ dw -26722, 22654, -15137, -26722 ; w31 w29 w27 w25

+; -----------------------------------------------------------------------------
+; Tables for xmm processors
+; -----------------------------------------------------------------------------

+; Table for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_xmm: dw 16384, 21407, 16384, 8867 ; movq-> w05 w04 w01 w00
+ dw 16384, 8867, -16384, -21407 ; w07 w06 w03 w02
+ dw 16384, -8867, 16384, -21407 ; w13 w12 w09 w08
+ dw -16384, 21407, 16384, -8867 ; w15 w14 w11 w10
+ dw 22725, 19266, 19266, -4520 ; w21 w20 w17 w16
+ dw 12873, 4520, -22725, -12873 ; w23 w22 w19 w18
+ dw 12873, -22725, 4520, -12873 ; w29 w28 w25 w24
+ dw 4520, 19266, 19266, -22725 ; w31 w30 w27 w26
+; Table for rows 1,7 - constants are multiplied by cos_1_16
+ dw 22725, 29692, 22725, 12299 ; movq-> w05 w04 w01 w00
+ dw 22725, 12299, -22725, -29692 ; w07 w06 w03 w02
+ dw 22725, -12299, 22725, -29692 ; w13 w12 w09 w08
+ dw -22725, 29692, 22725, -12299 ; w15 w14 w11 w10
+ dw 31521, 26722, 26722, -6270 ; w21 w20 w17 w16
+ dw 17855, 6270, -31521, -17855 ; w23 w22 w19 w18
+ dw 17855, -31521, 6270, -17855 ; w29 w28 w25 w24
+ dw 6270, 26722, 26722, -31521 ; w31 w30 w27 w26
+; Table for rows 2,6 - constants are multiplied by cos_2_16
+ dw 21407, 27969, 21407, 11585 ; movq-> w05 w04 w01 w00
+ dw 21407, 11585, -21407,
-27969 ; w07 w06 w03 w02
+ dw 21407, -11585, 21407, -27969 ; w13 w12 w09 w08
+ dw -21407, 27969, 21407, -11585 ; w15 w14 w11 w10
+ dw 29692, 25172, 25172, -5906 ; w21 w20 w17 w16
+ dw 16819, 5906, -29692, -16819 ; w23 w22 w19 w18
+ dw 16819, -29692, 5906, -16819 ; w29 w28 w25 w24
+ dw 5906, 25172, 25172, -29692 ; w31 w30 w27 w26
+; Table for rows 3,5 - constants are multiplied by cos_3_16
+ dw 19266, 25172, 19266, 10426 ; movq-> w05 w04 w01 w00
+ dw 19266, 10426, -19266, -25172 ; w07 w06 w03 w02
+ dw 19266, -10426, 19266, -25172 ; w13 w12 w09 w08
+ dw -19266, 25172, 19266, -10426 ; w15 w14 w11 w10
+ dw 26722, 22654, 22654, -5315 ; w21 w20 w17 w16
+ dw 15137, 5315, -26722, -15137 ; w23 w22 w19 w18
+ dw 15137, -26722, 5315, -15137 ; w29 w28 w25 w24
+ dw 5315, 22654, 22654, -26722 ; w31 w30 w27 w26
+%endif ; ~ARCH_X86_32
+
+; Similar to rounder_0 in MMX code
+; The first 4 are the same, then: 4*8->6*16 5*8->4*16 6/7*8->5*16
+walkenIdctRounders: times 4 dd 65536
+ times 4 dd 3597
+ times 4 dd 2260
+ times 4 dd 1203
+ times 4 dd 120
+ times 4 dd 512
+ times 2 dd 0
+
+pb_127: times 8 db 127
+
+SECTION .text
+
+; Temporary storage before the column pass
+%define ROW1 xmm6
+%define ROW3 xmm4
+%define ROW5 xmm5
+%define ROW7 xmm7
+
+%macro CLEAR_ODD 1
+ pxor %1, %1
+%endmacro
+%macro PUT_ODD 1
+ pshufhw %1, xmm2, 0x1B
+%endmacro
+
+%macro MOV32 2
+%if ARCH_X86_32
+ movdqa %2, %1
+%endif
+%endmacro
+
+%macro CLEAR_EVEN 1
+%if ARCH_X86_64
+ CLEAR_ODD %1
+%endif
+%endmacro
+
+%macro PUT_EVEN 1
+%if ARCH_X86_64
+ PUT_ODD %1
+%else
+ pshufhw xmm2, xmm2, 0x1B
+ movdqa %1, xmm2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+%define ROW0 xmm8
+%define REG0 ROW0
+%define ROW2 xmm9
+%define REG2 ROW2
+%define ROW4 xmm10
+%define REG4 ROW4
+%define ROW6 xmm11
+%define REG6 ROW6
+%define XMMS xmm12
+%define SREG2 REG2
+%define TAN3 xmm13
+%define TAN1 xmm14
+%else
+%define ROW0 [BLOCK + 0*16]
+%define REG0 xmm4
+%define ROW2 [BLOCK + 2*16]
+%define REG2 xmm4
+%define ROW4 [BLOCK + 4*16]
+%define REG4 xmm6
+%define ROW6 [BLOCK + 6*16]
+%define REG6 xmm6
+%define XMMS xmm2
+%define SREG2 xmm7
+%define TAN3 xmm0
+%define TAN1 xmm2
+%endif
+
+%macro JZ 2
+ test %1, %1
+ jz .%2
+%endmacro
+
+%macro JNZ 2
+ test %1, %1
+ jnz .%2
+%endmacro
+
+%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
+ %3 %4
+ movq mm1, [%1]
+ por mm1, [%1 + 8]
+ paddusb mm1, mm0
+ pmovmskb %2, mm1
+%endmacro
+
+;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
+%macro TEST_TWO_ROWS 8
+ %5 %6
+ %7 %8
+ movq mm1, [%1 + 0]
+ por mm1, [%1 + 8]
+ movq mm2, [%2 + 0]
+ por mm2, [%2 + 8]
+ paddusb mm1, mm0
+ paddusb mm2, mm0
+ pmovmskb %3, mm1
+ pmovmskb %4, mm2
+%endmacro
+
+; IDCT pass on rows.
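+; Each iMTX_MULT below computes one 8-point row iDCT as two 4x4
+; vector/matrix products: pshufd/punpcklqdq split the row into its even
+; and odd coefficients, pmaddwd multiplies them against the four table
+; quarters, and the sums are rounded, shifted right by 11 and repacked.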
+%macro iMTX_MULT 4-5 ; src, table, put, arg, rounder + movdqa xmm3, [%1] + movdqa xmm0, xmm3 + pshufd xmm1, xmm3, 0x11 ; 4602 + punpcklqdq xmm0, xmm0 ; 0246 + pmaddwd xmm0, [%2] + pmaddwd xmm1, [%2+16] + pshufd xmm2, xmm3, 0xBB ; 5713 + punpckhqdq xmm3, xmm3 ; 1357 + pmaddwd xmm2, [%2+32] + pmaddwd xmm3, [%2+48] + paddd xmm0, xmm1 + paddd xmm2, xmm3 +%if %0 == 5 + paddd xmm0, [walkenIdctRounders+%5] +%endif + movdqa xmm3, xmm2 + paddd xmm2, xmm0 + psubd xmm0, xmm3 + psrad xmm2, 11 + psrad xmm0, 11 + packssdw xmm2, xmm0 + %3 %4 +%endmacro + +%macro iLLM_HEAD 0 + movdqa TAN3, [tan3] + movdqa TAN1, [tan1] +%endmacro + +%macro FIRST_HALF 2 ; %1=dct %2=type(normal,add,put) + psraw xmm5, 6 + psraw REG0, 6 + psraw TAN3, 6 + psraw xmm3, 6 + ; dct coeffs must still be written for AC prediction +%if %2 == 0 + movdqa [%1+1*16], TAN3 + movdqa [%1+2*16], xmm3 + movdqa [%1+5*16], REG0 + movdqa [%1+6*16], xmm5 +%else + ; Must now load args as gprs are no longer used for masks + ; DEST is set to where address of dest was loaded + %if ARCH_X86_32 + %if %2 == 2 ; Not enough xmms, store + movdqa [%1+1*16], TAN3 + movdqa [%1+2*16], xmm3 + movdqa [%1+5*16], REG0 + movdqa [%1+6*16], xmm5 + %endif + %xdefine DEST r2q ; BLOCK is r0, stride r1 + movifnidn DEST, destm + movifnidn strideq, stridem + %else + %xdefine DEST r0q + %endif + lea r3q, [3*strideq] + %if %2 == 1 + packuswb TAN3, xmm3 + packuswb xmm5, REG0 + movq [DEST + strideq], TAN3 + movhps [DEST + 2*strideq], TAN3 + ; REG0 and TAN3 are now available (and likely used in second half) + %endif +%endif +%endmacro + +%macro SECOND_HALF 6 ; %1=dct %2=type(normal,add,put) 3-6: xmms + psraw %3, 6 + psraw %4, 6 + psraw %5, 6 + psraw %6, 6 + ; dct coeffs must still be written for AC prediction +%if %2 == 0 + movdqa [%1+0*16], %3 + movdqa [%1+3*16], %5 + movdqa [%1+4*16], %6 + movdqa [%1+7*16], %4 +%elif %2 == 1 + packuswb %3, %5 + packuswb %6, %4 + ; address of dest may have been loaded + movq [DEST], %3 + movhps [DEST + r3q], %3 + lea DEST, [DEST + 4*strideq] + movq [DEST], %6 + movhps [DEST + r3q], %6 + ; and now write remainder of first half + movq [DEST + 2*strideq], xmm5 + movhps [DEST + strideq], xmm5 +%elif %2 == 2 + pxor xmm0, xmm0 + %if ARCH_X86_32 + ; free: m3 REG0=m4 m5 + ; input: m1, m7, m2, m6 + movq xmm3, [DEST+0*strideq] + movq xmm4, [DEST+1*strideq] + punpcklbw xmm3, xmm0 + punpcklbw xmm4, xmm0 + paddsw xmm3, %3 + paddsw xmm4, [%1 + 1*16] + movq %3, [DEST+2*strideq] + movq xmm5, [DEST+ r3q] + punpcklbw %3, xmm0 + punpcklbw xmm5, xmm0 + paddsw %3, [%1 + 2*16] + paddsw xmm5, %5 + packuswb xmm3, xmm4 + packuswb %3, xmm5 + movq [DEST+0*strideq], xmm3 + movhps [DEST+1*strideq], xmm3 + movq [DEST+2*strideq], %3 + movhps [DEST+ r3q], %3 + lea DEST, [DEST+4*strideq] + movq xmm3, [DEST+0*strideq] + movq xmm4, [DEST+1*strideq] + movq %3, [DEST+2*strideq] + movq xmm5, [DEST+ r3q] + punpcklbw xmm3, xmm0 + punpcklbw xmm4, xmm0 + punpcklbw %3, xmm0 + punpcklbw xmm5, xmm0 + paddsw xmm3, %6 + paddsw xmm4, [%1 + 5*16] + paddsw %3, [%1 + 6*16] + paddsw xmm5, %4 + packuswb xmm3, xmm4 + packuswb %3, xmm5 + movq [DEST+0*strideq], xmm3 + movhps [DEST+1*strideq], xmm3 + movq [DEST+2*strideq], %3 + movhps [DEST+ r3q], %3 + %else + ; l1:TAN3=m13 l2:m3 l5:REG0=m8 l6=m5 + ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10 + movq xmm2, [DEST+0*strideq] + movq xmm4, [DEST+1*strideq] + movq xmm12, [DEST+2*strideq] + movq xmm11, [DEST+ r3q] + punpcklbw xmm2, xmm0 + punpcklbw xmm4, xmm0 + punpcklbw xmm12, xmm0 + punpcklbw xmm11, xmm0 + paddsw xmm2, %3 + paddsw xmm4, TAN3 + paddsw 
xmm12, xmm3 + paddsw xmm11, %5 + packuswb xmm2, xmm4 + packuswb xmm12, xmm11 + movq [DEST+0*strideq], xmm2 + movhps [DEST+1*strideq], xmm2 + movq [DEST+2*strideq], xmm12 + movhps [DEST+ r3q], xmm12 + lea DEST, [DEST+4*strideq] + movq xmm2, [DEST+0*strideq] + movq xmm4, [DEST+1*strideq] + movq xmm12, [DEST+2*strideq] + movq xmm11, [DEST+ r3q] + punpcklbw xmm2, xmm0 + punpcklbw xmm4, xmm0 + punpcklbw xmm12, xmm0 + punpcklbw xmm11, xmm0 + paddsw xmm2, %6 + paddsw xmm4, REG0 + paddsw xmm12, xmm5 + paddsw xmm11, %4 + packuswb xmm2, xmm4 + packuswb xmm12, xmm11 + movq [DEST+0*strideq], xmm2 + movhps [DEST+1*strideq], xmm2 + movq [DEST+2*strideq], xmm12 + movhps [DEST+ r3q], xmm12 + %endif +%endif +%endmacro + + +; IDCT pass on columns. +%macro iLLM_PASS 2 ; %1=dct %2=type(normal,add,put) + movdqa xmm1, TAN3 + movdqa xmm3, TAN1 + pmulhw TAN3, xmm4 + pmulhw xmm1, xmm5 + paddsw TAN3, xmm4 + paddsw xmm1, xmm5 + psubsw TAN3, xmm5 + paddsw xmm1, xmm4 + pmulhw xmm3, xmm7 + pmulhw TAN1, xmm6 + paddsw xmm3, xmm6 + psubsw TAN1, xmm7 + movdqa xmm7, xmm3 + movdqa xmm6, TAN1 + psubsw xmm3, xmm1 + psubsw TAN1, TAN3 + paddsw xmm1, xmm7 + paddsw TAN3, xmm6 + movdqa xmm6, xmm3 + psubsw xmm3, TAN3 + paddsw TAN3, xmm6 + movdqa xmm4, [sqrt2] + pmulhw xmm3, xmm4 + pmulhw TAN3, xmm4 + paddsw TAN3, TAN3 + paddsw xmm3, xmm3 + movdqa xmm7, [tan2] + MOV32 ROW2, REG2 + MOV32 ROW6, REG6 + movdqa xmm5, xmm7 + pmulhw xmm7, REG6 + pmulhw xmm5, REG2 + paddsw xmm7, REG2 + psubsw xmm5, REG6 + MOV32 ROW0, REG0 + MOV32 ROW4, REG4 + MOV32 TAN1, [BLOCK] + movdqa XMMS, REG0 + psubsw REG0, REG4 + paddsw REG4, XMMS + movdqa XMMS, REG4 + psubsw REG4, xmm7 + paddsw xmm7, XMMS + movdqa XMMS, REG0 + psubsw REG0, xmm5 + paddsw xmm5, XMMS + movdqa XMMS, xmm5 + psubsw xmm5, TAN3 + paddsw TAN3, XMMS + movdqa XMMS, REG0 + psubsw REG0, xmm3 + paddsw xmm3, XMMS + MOV32 [BLOCK], TAN1 + + FIRST_HALF %1, %2 + + movdqa xmm0, xmm7 + movdqa xmm4, REG4 + psubsw xmm7, xmm1 + psubsw REG4, TAN1 + paddsw xmm1, xmm0 + paddsw TAN1, xmm4 + + SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4 +%endmacro + +; IDCT pass on columns, assuming rows 4-7 are zero +%macro iLLM_PASS_SPARSE 2 ; %1=dct %2=type(normal,put,add) + pmulhw TAN3, xmm4 + paddsw TAN3, xmm4 + movdqa xmm3, xmm6 + pmulhw TAN1, xmm6 + movdqa xmm1, xmm4 + psubsw xmm3, xmm1 + paddsw xmm1, xmm6 + movdqa xmm6, TAN1 + psubsw TAN1, TAN3 + paddsw TAN3, xmm6 + movdqa xmm6, xmm3 + psubsw xmm3, TAN3 + paddsw TAN3, xmm6 + movdqa xmm4, [sqrt2] + pmulhw xmm3, xmm4 + pmulhw TAN3, xmm4 + paddsw TAN3, TAN3 + paddsw xmm3, xmm3 + movdqa xmm5, [tan2] + MOV32 ROW2, SREG2 + pmulhw xmm5, SREG2 + MOV32 ROW0, REG0 + movdqa xmm6, REG0 + psubsw xmm6, SREG2 + paddsw SREG2, REG0 + MOV32 TAN1, [BLOCK] + movdqa XMMS, REG0 + psubsw REG0, xmm5 + paddsw xmm5, XMMS + movdqa XMMS, xmm5 + psubsw xmm5, TAN3 + paddsw TAN3, XMMS + movdqa XMMS, REG0 + psubsw REG0, xmm3 + paddsw xmm3, XMMS + MOV32 [BLOCK], TAN1 + + FIRST_HALF %1, %2 + + movdqa xmm0, SREG2 + movdqa xmm4, xmm6 + psubsw SREG2, xmm1 + psubsw xmm6, TAN1 + paddsw xmm1, xmm0 + paddsw TAN1, xmm4 + + SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6 +%endmacro + +%macro IDCT_SSE2 1 ; 0=normal 1=put 2=add +%if %1 == 0 || ARCH_X86_32 + %define GPR0 r1d + %define GPR1 r2d + %define GPR2 r3d + %define GPR3 r4d + %define NUM_GPRS 5 +%else + %define GPR0 r3d + %define GPR1 r4d + %define GPR2 r5d + %define GPR3 r6d + %define NUM_GPRS 7 +%endif +%if %1 == 0 +cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block +%xdefine BLOCK blockq +%else + %if %1 == 1 +cglobal xvid_idct_put, 0, NUM_GPRS, 
8+7*ARCH_X86_64, dest, stride, block + %else +cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block + %endif + %if ARCH_X86_64 + %xdefine BLOCK blockq + %else + mov r0q, blockm + %xdefine BLOCK r0q + %endif +%endif + movq mm0, [pb_127] + iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16 + iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16 + iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16 + + TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c + JZ GPR0, col1 + iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16 +.col1: + TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d + TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi + + iLLM_HEAD + JNZ GPR1, 2 + JNZ GPR0, 3 + JNZ GPR2, 4 + JNZ GPR3, 5 + iLLM_PASS_SPARSE BLOCK, %1 + jmp .6 +.2: + iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4 +.3: + iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16 + JZ GPR2, col2 +.4: + iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16 +.col2: + JZ GPR3, col3 +.5: + iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16 +.col3: +%if ARCH_X86_32 + iLLM_HEAD +%endif + iLLM_PASS BLOCK, %1 +.6: + RET +%endmacro + +INIT_XMM sse2 +IDCT_SSE2 0 +IDCT_SSE2 1 +IDCT_SSE2 2 + +%if ARCH_X86_32 + +; %1=offset %2=tab_offset +; %3=rnd_offset where 4*8->6*16 5*8->4*16 6/7*8->5*16 +%macro DCT_8_INV_ROW 3 + movq mm0, [r0+16*%1+0] ; 0 ; x3 x2 x1 x0 + movq mm1, [r0+16*%1+8] ; 1 ; x7 x6 x5 x4 + movq mm2, mm0 ; 2 ; x3 x2 x1 x0 + movq mm3, [%2+ 0] ; 3 ; w06 w04 w02 w00 +%if cpuflag(mmxext) + pshufw mm0, mm0, 0x88 ; x2 x0 x2 x0 + movq mm4, [%2+ 8] ; 4 ; w07 w06 w03 w02 + movq mm5, mm1 ; 5 ; x7 x6 x5 x4 + pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 + movq mm6, [%2+32] ; 6 ; w21 w20 w17 w16 + pshufw mm1, mm1, 0x88 ; x6 x4 x6 x4 + pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 + movq mm7, [%2+40] ; 7; w23 w22 w19 w18 + pshufw mm2, mm2, 0xdd ; x3 x1 x3 x1 + pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 + pshufw mm5, mm5, 0xdd ; x7 x5 x7 x5 + pmaddwd mm7, mm5 ; x7*w23+x5*w22 x7*w19+x5*w18 + paddd mm3, [walkenIdctRounders + %3] ; +%3 + pmaddwd mm0, [%2+16] ; x2*w13+x0*w12 x2*w09+x0*w08 + paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) + pmaddwd mm1, [%2+24] ; x6*w15+x4*w14 x6*w11+x4*w10 + movq mm4, mm3 ; 4 ; a1 a0 + pmaddwd mm2, [%2+48] ; x3*w29+x1*w28 x3*w25+x1*w24 + paddd mm6, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) + pmaddwd mm5, [%2+56] ; x7*w31+x5*w30 x7*w27+x5*w26 + paddd mm3, mm6 ; a1+b1 a0+b0 + paddd mm0, [walkenIdctRounders + %3] ; +%3 + psrad mm3, 11 ; y1=a1+b1 y0=a0+b0 + paddd mm0, mm1 ; 1 ; a3=sum(even3) a2=sum(even2) + psubd mm4, mm6 ; 6 ; a1-b1 a0-b0 + movq mm7, mm0 ; 7 ; a3 a2 + paddd mm2, mm5 ; 5 ; b3=sum(odd3) b2=sum(odd2) + paddd mm0, mm2 ; a3+b3 a2+b2 + psrad mm4, 11 ; y6=a1-b1 y7=a0-b0 + psubd mm7, mm2 ; 2 ; a3-b3 a2-b2 + psrad mm0, 11 ; y3=a3+b3 y2=a2+b2 + psrad mm7, 11 ; y4=a3-b3 y5=a2-b2 + packssdw mm3, mm0 ; 0 ; y3 y2 y1 y0 + packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 + movq [r0+16*%1+0], mm3 ; 3 ; save y3 y2 y1 y0 + pshufw mm7, mm7, 0xb1 ; y7 y6 y5 y4 +%else + punpcklwd mm0, mm1 ; x5 x1 x4 x0 + movq mm5, mm0 ; 5 ; x5 x1 x4 x0 + punpckldq mm0, mm0 ; x4 x0 x4 x0 + movq mm4, [%2+ 8] ; 4 ; w07 w05 w03 w01 + punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2 + pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00 + movq mm6, mm2 ; 6 ; x7 x3 x6 x2 + movq mm1, [%2+32] ; 1 ; w22 w20 w18 w16 + punpckldq mm2, mm2 ; x6 x2 x6 x2 + pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01 + punpckhdq mm5, mm5 ; x5 x1 x5 x1 + pmaddwd mm0, [%2+16] ; x4*w14+x0*w12 
x4*w10+x0*w08
+ punpckhdq mm6, mm6 ; x7 x3 x7 x3
+ movq mm7, [%2+40] ; 7 ; w23 w21 w19 w17
+ pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16
+ paddd mm3, [walkenIdctRounders + %3] ; +%3
+ pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17
+ pmaddwd mm2, [%2+24] ; x6*w15+x2*w13 x6*w11+x2*w09
+ paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0)
+ pmaddwd mm5, [%2+48] ; x5*w30+x1*w28 x5*w26+x1*w24
+ movq mm4, mm3 ; 4 ; a1 a0
+ pmaddwd mm6, [%2+56] ; x7*w31+x3*w29 x7*w27+x3*w25
+ paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0)
+ paddd mm0, [walkenIdctRounders + %3] ; +%3
+ psubd mm3, mm1 ; a1-b1 a0-b0
+ psrad mm3, 11 ; y6=a1-b1 y7=a0-b0
+ paddd mm1, mm4 ; 4 ; a1+b1 a0+b0
+ paddd mm0, mm2 ; 2 ; a3=sum(even3) a2=sum(even2)
+ psrad mm1, 11 ; y1=a1+b1 y0=a0+b0
+ paddd mm5, mm6 ; 6 ; b3=sum(odd3) b2=sum(odd2)
+ movq mm4, mm0 ; 4 ; a3 a2
+ paddd mm0, mm5 ; a3+b3 a2+b2
+ psubd mm4, mm5 ; 5 ; a3-b3 a2-b2
+ psrad mm0, 11 ; y3=a3+b3 y2=a2+b2
+ psrad mm4, 11 ; y4=a3-b3 y5=a2-b2
+ packssdw mm1, mm0 ; 0 ; y3 y2 y1 y0
+ packssdw mm4, mm3 ; 3 ; y6 y7 y4 y5
+ movq mm7, mm4 ; 7 ; y6 y7 y4 y5
+ psrld mm4, 16 ; 0 y6 0 y4
+ pslld mm7, 16 ; y7 0 y5 0
+ movq [r0+16*%1+0], mm1 ; 1 ; save y3 y2 y1 y0
+ por mm7, mm4 ; 4 ; y7 y6 y5 y4
+%endif
+ movq [r0+16*%1+8], mm7 ; 7 ; save y7 y6 y5 y4
+%endmacro
+
+; -----------------------------------------------------------------------------
+;
+; The first stage DCT 8x8 - forward DCTs of columns
+;
+; The outputs are multiplied
+; for rows 0,4 - by cos_4_16,
+; for rows 1,7 - by cos_1_16,
+; for rows 2,6 - by cos_2_16,
+; for rows 3,5 - by cos_3_16
+; and are shifted to the left for better accuracy
+;
+; -----------------------------------------------------------------------------
+;
+; The 8-point scaled forward DCT algorithm (26a8m)
+;
+; -----------------------------------------------------------------------------
+;
+;#define DCT_8_FRW_COL(x, y)
+; {
+; short t0, t1, t2, t3, t4, t5, t6, t7;
+; short tp03, tm03, tp12, tm12, tp65, tm65;
+; short tp465, tm465, tp765, tm765;
+;
+; t0 = LEFT_SHIFT(x[0] + x[7]);
+; t1 = LEFT_SHIFT(x[1] + x[6]);
+; t2 = LEFT_SHIFT(x[2] + x[5]);
+; t3 = LEFT_SHIFT(x[3] + x[4]);
+; t4 = LEFT_SHIFT(x[3] - x[4]);
+; t5 = LEFT_SHIFT(x[2] - x[5]);
+; t6 = LEFT_SHIFT(x[1] - x[6]);
+; t7 = LEFT_SHIFT(x[0] - x[7]);
+;
+; tp03 = t0 + t3;
+; tm03 = t0 - t3;
+; tp12 = t1 + t2;
+; tm12 = t1 - t2;
+;
+; y[0] = tp03 + tp12;
+; y[4] = tp03 - tp12;
+;
+; y[2] = tm03 + tm12 * tg_2_16;
+; y[6] = tm03 * tg_2_16 - tm12;
+;
+; tp65 = (t6 + t5) * cos_4_16;
+; tm65 = (t6 - t5) * cos_4_16;
+;
+; tp765 = t7 + tp65;
+; tm765 = t7 - tp65;
+; tp465 = t4 + tm65;
+; tm465 = t4 - tm65;
+;
+; y[1] = tp765 + tp465 * tg_1_16;
+; y[7] = tp765 * tg_1_16 - tp465;
+; y[5] = tm765 * tg_3_16 + tm465;
+; y[3] = tm765 - tm465 * tg_3_16;
+; }
+;
+; -----------------------------------------------------------------------------

+; -----------------------------------------------------------------------------
+; DCT_8_INV_COL_4 INP,OUT
+; -----------------------------------------------------------------------------
+%macro DCT_8_INV_COL 1
+ movq mm0, [tan3]
+ movq mm3, [%1+16*3]
+ movq mm1, mm0 ; tg_3_16
+ movq mm5, [%1+16*5]
+ pmulhw mm0, mm3 ; x3*(tg_3_16-1)
+ movq mm4, [tan1]
+ pmulhw mm1, mm5 ; x5*(tg_3_16-1)
+ movq mm7, [%1+16*7]
+ movq mm2, mm4 ; tg_1_16
+ movq mm6, [%1+16*1]
+ pmulhw mm4, mm7 ; x7*tg_1_16
+ paddsw mm0, mm3 ; x3*tg_3_16
+ pmulhw mm2, mm6 ; x1*tg_1_16
+ paddsw mm1, mm3 ; x3+x5*(tg_3_16-1)
+ psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35
+ movq mm3, [sqrt2]
+ paddsw mm1, mm5 ;
x3+x5*tg_3_16 = tp35 + paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 + psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17 + movq mm5, mm4 ; tp17 + movq mm6, mm2 ; tm17 + paddsw mm5, mm1 ; tp17+tp35 = b0 + psubsw mm6, mm0 ; tm17-tm35 = b3 + psubsw mm4, mm1 ; tp17-tp35 = t1 + paddsw mm2, mm0 ; tm17+tm35 = t2 + movq mm7, [tan2] + movq mm1, mm4 ; t1 + movq [%1+3*16], mm5 ; save b0 + paddsw mm1, mm2 ; t1+t2 + movq [%1+5*16], mm6 ; save b3 + psubsw mm4, mm2 ; t1-t2 + movq mm5, [%1+2*16] + movq mm0, mm7 ; tg_2_16 + movq mm6, [%1+6*16] + pmulhw mm0, mm5 ; x2*tg_2_16 + pmulhw mm7, mm6 ; x6*tg_2_16 + pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2 + movq mm2, [%1+0*16] + pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2 + psubsw mm0, mm6 ; t2*tg_2_16-x6 = tm26 + movq mm3, mm2 ; x0 + movq mm6, [%1+4*16] + paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26 + paddsw mm2, mm6 ; x0+x4 = tp04 + psubsw mm3, mm6 ; x0-x4 = tm04 + movq mm5, mm2 ; tp04 + movq mm6, mm3 ; tm04 + psubsw mm2, mm7 ; tp04-tp26 = a3 + paddsw mm3, mm0 ; tm04+tm26 = a1 + paddsw mm1, mm1 ; b1 + paddsw mm4, mm4 ; b2 + paddsw mm5, mm7 ; tp04+tp26 = a0 + psubsw mm6, mm0 ; tm04-tm26 = a2 + movq mm7, mm3 ; a1 + movq mm0, mm6 ; a2 + paddsw mm3, mm1 ; a1+b1 + paddsw mm6, mm4 ; a2+b2 + psraw mm3, 6 ; dst1 + psubsw mm7, mm1 ; a1-b1 + psraw mm6, 6 ; dst2 + psubsw mm0, mm4 ; a2-b2 + movq mm1, [%1+3*16] ; load b0 + psraw mm7, 6 ; dst6 + movq mm4, mm5 ; a0 + psraw mm0, 6 ; dst5 + movq [%1+1*16], mm3 + paddsw mm5, mm1 ; a0+b0 + movq [%1+2*16], mm6 + psubsw mm4, mm1 ; a0-b0 + movq mm3, [%1+5*16] ; load b3 + psraw mm5, 6 ; dst0 + movq mm6, mm2 ; a3 + psraw mm4, 6 ; dst7 + movq [%1+5*16], mm0 + paddsw mm2, mm3 ; a3+b3 + movq [%1+6*16], mm7 + psubsw mm6, mm3 ; a3-b3 + movq [%1+0*16], mm5 + psraw mm2, 6 ; dst3 + movq [%1+7*16], mm4 + psraw mm6, 6 ; dst4 + movq [%1+3*16], mm2 + movq [%1+4*16], mm6 +%endmacro + +%macro XVID_IDCT_MMX 0 +cglobal xvid_idct, 1, 1, 0, block +%if cpuflag(mmxext) +%define TAB tab_i_04_xmm +%else +%define TAB tab_i_04_mmx +%endif + ; Process each row - beware of rounder offset + DCT_8_INV_ROW 0, TAB + 64 * 0, 0*16 + DCT_8_INV_ROW 1, TAB + 64 * 1, 1*16 + DCT_8_INV_ROW 2, TAB + 64 * 2, 2*16 + DCT_8_INV_ROW 3, TAB + 64 * 3, 3*16 + DCT_8_INV_ROW 4, TAB + 64 * 0, 6*16 + DCT_8_INV_ROW 5, TAB + 64 * 3, 4*16 + DCT_8_INV_ROW 6, TAB + 64 * 2, 5*16 + DCT_8_INV_ROW 7, TAB + 64 * 1, 5*16 + + ; Process the columns (4 at a time) + DCT_8_INV_COL r0+0 + DCT_8_INV_COL r0+8 + + RET +%endmacro + +INIT_MMX mmx +XVID_IDCT_MMX +INIT_MMX mmxext +XVID_IDCT_MMX + +%endif ; ~ARCH_X86_32 diff --git a/libavcodec/x86/xvididct.h b/libavcodec/x86/xvididct.h index 13a4e85890..573b25c6b5 100644 --- a/libavcodec/x86/xvididct.h +++ b/libavcodec/x86/xvididct.h @@ -1,20 +1,20 @@ /* * XVID MPEG-4 VIDEO CODEC * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -37,7 +37,7 @@ void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block); void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block); void ff_xvid_idct_sse2(short *block); -void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block); -void ff_xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block); +void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block); +void ff_xvid_idct_add_sse2(uint8_t *dest, int line_size, short *block); #endif /* AVCODEC_X86_XVIDIDCT_H */ diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c index e4f7345795..8b9d8de0cd 100644 --- a/libavcodec/x86/xvididct_init.c +++ b/libavcodec/x86/xvididct_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -26,9 +26,36 @@ #include "idctdsp.h" #include "xvididct.h" +#if ARCH_X86_32 && HAVE_YASM +static void xvid_idct_mmx_put(uint8_t *dest, int line_size, short *block) +{ + ff_xvid_idct_mmx(block); + ff_put_pixels_clamped(block, dest, line_size); +} + +static void xvid_idct_mmx_add(uint8_t *dest, int line_size, short *block) +{ + ff_xvid_idct_mmx(block); + ff_add_pixels_clamped(block, dest, line_size); +} + +static void xvid_idct_mmxext_put(uint8_t *dest, int line_size, short *block) +{ + ff_xvid_idct_mmxext(block); + ff_put_pixels_clamped(block, dest, line_size); +} + +static void xvid_idct_mmxext_add(uint8_t *dest, int line_size, short *block) +{ + ff_xvid_idct_mmxext(block); + ff_add_pixels_clamped(block, dest, line_size); +} +#endif + av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { +#if HAVE_YASM int cpu_flags = av_get_cpu_flags(); if (high_bit_depth || @@ -36,24 +63,27 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, avctx->idct_algo == FF_IDCT_XVID)) return; - if (INLINE_MMX(cpu_flags)) { - c->idct_put = ff_xvid_idct_mmx_put; - c->idct_add = ff_xvid_idct_mmx_add; +#if ARCH_X86_32 + if (EXTERNAL_MMX(cpu_flags)) { + c->idct_put = xvid_idct_mmx_put; + c->idct_add = xvid_idct_mmx_add; c->idct = ff_xvid_idct_mmx; c->perm_type = FF_IDCT_PERM_NONE; } - if (INLINE_MMXEXT(cpu_flags)) { - c->idct_put = ff_xvid_idct_mmxext_put; - c->idct_add = ff_xvid_idct_mmxext_add; + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->idct_put = xvid_idct_mmxext_put; + c->idct_add 
= xvid_idct_mmxext_add; c->idct = ff_xvid_idct_mmxext; c->perm_type = FF_IDCT_PERM_NONE; } +#endif - if (INLINE_SSE2(cpu_flags)) { - c->idct_put = ff_xvid_idct_sse2_put; - c->idct_add = ff_xvid_idct_sse2_add; + if (EXTERNAL_SSE2(cpu_flags)) { + c->idct_put = ff_xvid_idct_put_sse2; + c->idct_add = ff_xvid_idct_add_sse2; c->idct = ff_xvid_idct_sse2; c->perm_type = FF_IDCT_PERM_SSE2; } +#endif /* HAVE_YASM */ } diff --git a/libavcodec/x86/xvididct_mmx.c b/libavcodec/x86/xvididct_mmx.c deleted file mode 100644 index e371142974..0000000000 --- a/libavcodec/x86/xvididct_mmx.c +++ /dev/null @@ -1,548 +0,0 @@ -/* - * XVID MPEG-4 VIDEO CODEC - * - MMX and XMM forward discrete cosine transform - - * - * Copyright(C) 2001 Peter Ross <pross@xvid.org> - * - * Originally provided by Intel at AP-922 - * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm - * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm) - * but in a limited edition. - * New macro implements a column part for precise iDCT - * The routine precision now satisfies IEEE standard 1180-1990. - * - * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru> - * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org> - * - * http://www.elecard.com/peter/idct.html - * http://www.linuxvideo.org/mpeg2dec/ - * - * These examples contain code fragments for first stage iDCT 8x8 - * (for rows) and first stage DCT 8x8 (for columns) - * - * conversion to gcc syntax by Michael Niedermayer - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with Libav; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <inttypes.h> - -#include "config.h" - -#include "libavutil/mem.h" - -#include "libavcodec/avcodec.h" - -#include "idctdsp.h" -#include "xvididct.h" - -#if HAVE_MMX_INLINE - -// ----------------------------------------------------------------------------- -// Various memory constants (trigonometric values or rounding values) -// ----------------------------------------------------------------------------- - -DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4 * 4] = { - 13036, 13036, 13036, 13036, // tg * (2 << 16) + 0.5 - 27146, 27146, 27146, 27146, // tg * (2 << 16) + 0.5 - -21746, -21746, -21746, -21746, // tg * (2 << 16) + 0.5 - 23170, 23170, 23170, 23170 -}; // cos * (2 << 15) + 0.5 - -DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2 * 8] = { - 65536, 65536, - 3597, 3597, - 2260, 2260, - 1203, 1203, - 0, 0, - 120, 120, - 512, 512, - 512, 512 -}; - -// ----------------------------------------------------------------------------- -// -// The first stage iDCT 8x8 - inverse DCTs of rows -// -// ----------------------------------------------------------------------------- -// The 8-point inverse DCT direct algorithm -// ----------------------------------------------------------------------------- -// -// static const short w[32] = { -// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16), -// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16), -// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16), -// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16), -// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16), -// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16), -// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16), -// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) }; -// -// #define DCT_8_INV_ROW(x, y) -// { -// int a0, a1, a2, a3, b0, b1, b2, b3; -// -// a0 = x[0] * w[0] + x[2] * w[1] + x[4] * w[2] + x[6] * w[3]; -// a1 = x[0] * w[4] + x[2] * w[5] + x[4] * w[6] + x[6] * w[7]; -// a2 = x[0] * w[8] + x[2] * w[9] + x[4] * w[10] + x[6] * w[11]; -// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15]; -// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19]; -// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23]; -// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27]; -// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31]; -// -// y[0] = SHIFT_ROUND(a0 + b0); -// y[1] = SHIFT_ROUND(a1 + b1); -// y[2] = SHIFT_ROUND(a2 + b2); -// y[3] = SHIFT_ROUND(a3 + b3); -// y[4] = SHIFT_ROUND(a3 - b3); -// y[5] = SHIFT_ROUND(a2 - b2); -// y[6] = SHIFT_ROUND(a1 - b1); -// y[7] = SHIFT_ROUND(a0 - b0); -// } -// -// ----------------------------------------------------------------------------- -// -// In this implementation the outputs of the iDCT-1D are multiplied -// for rows 0,4 - by cos_4_16, -// for rows 1,7 - by cos_1_16, -// for rows 2,6 - by cos_2_16, -// for rows 3,5 - by cos_3_16 -// and are shifted to the left for better accuracy. 
-// -// For the constants used, -// FIX(float_const) = (short) (float_const * (1 << 15) + 0.5) -// -// ----------------------------------------------------------------------------- - -// ----------------------------------------------------------------------------- -// Tables for mmx processors -// ----------------------------------------------------------------------------- - -// Table for rows 0,4 - constants are multiplied by cos_4_16 -DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32 * 4] = { - 16384, 16384, 16384, -16384, // movq-> w06 w04 w02 w00 - 21407, 8867, 8867, -21407, // w07 w05 w03 w01 - 16384, -16384, 16384, 16384, // w14 w12 w10 w08 - -8867, 21407, -21407, -8867, // w15 w13 w11 w09 - 22725, 12873, 19266, -22725, // w22 w20 w18 w16 - 19266, 4520, -4520, -12873, // w23 w21 w19 w17 - 12873, 4520, 4520, 19266, // w30 w28 w26 w24 - -22725, 19266, -12873, -22725, // w31 w29 w27 w25 -// Table for rows 1,7 - constants are multiplied by cos_1_16 - 22725, 22725, 22725, -22725, // movq-> w06 w04 w02 w00 - 29692, 12299, 12299, -29692, // w07 w05 w03 w01 - 22725, -22725, 22725, 22725, // w14 w12 w10 w08 - -12299, 29692, -29692, -12299, // w15 w13 w11 w09 - 31521, 17855, 26722, -31521, // w22 w20 w18 w16 - 26722, 6270, -6270, -17855, // w23 w21 w19 w17 - 17855, 6270, 6270, 26722, // w30 w28 w26 w24 - -31521, 26722, -17855, -31521, // w31 w29 w27 w25 -// Table for rows 2,6 - constants are multiplied by cos_2_16 - 21407, 21407, 21407, -21407, // movq-> w06 w04 w02 w00 - 27969, 11585, 11585, -27969, // w07 w05 w03 w01 - 21407, -21407, 21407, 21407, // w14 w12 w10 w08 - -11585, 27969, -27969, -11585, // w15 w13 w11 w09 - 29692, 16819, 25172, -29692, // w22 w20 w18 w16 - 25172, 5906, -5906, -16819, // w23 w21 w19 w17 - 16819, 5906, 5906, 25172, // w30 w28 w26 w24 - -29692, 25172, -16819, -29692, // w31 w29 w27 w25 -// Table for rows 3,5 - constants are multiplied by cos_3_16 - 19266, 19266, 19266, -19266, // movq-> w06 w04 w02 w00 - 25172, 10426, 10426, -25172, // w07 w05 w03 w01 - 19266, -19266, 19266, 19266, // w14 w12 w10 w08 - -10426, 25172, -25172, -10426, // w15 w13 w11 w09 - 26722, 15137, 22654, -26722, // w22 w20 w18 w16 - 22654, 5315, -5315, -15137, // w23 w21 w19 w17 - 15137, 5315, 5315, 22654, // w30 w28 w26 w24 - -26722, 22654, -15137, -26722, // w31 w29 w27 w25 -}; -// ----------------------------------------------------------------------------- -// Tables for xmm processors -// ----------------------------------------------------------------------------- - -// %3 for rows 0,4 - constants are multiplied by cos_4_16 -DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32 * 4] = { - 16384, 21407, 16384, 8867, // movq-> w05 w04 w01 w00 - 16384, 8867, -16384, -21407, // w07 w06 w03 w02 - 16384, -8867, 16384, -21407, // w13 w12 w09 w08 - -16384, 21407, 16384, -8867, // w15 w14 w11 w10 - 22725, 19266, 19266, -4520, // w21 w20 w17 w16 - 12873, 4520, -22725, -12873, // w23 w22 w19 w18 - 12873, -22725, 4520, -12873, // w29 w28 w25 w24 - 4520, 19266, 19266, -22725, // w31 w30 w27 w26 -// %3 for rows 1,7 - constants are multiplied by cos_1_16 - 22725, 29692, 22725, 12299, // movq-> w05 w04 w01 w00 - 22725, 12299, -22725, -29692, // w07 w06 w03 w02 - 22725, -12299, 22725, -29692, // w13 w12 w09 w08 - -22725, 29692, 22725, -12299, // w15 w14 w11 w10 - 31521, 26722, 26722, -6270, // w21 w20 w17 w16 - 17855, 6270, -31521, -17855, // w23 w22 w19 w18 - 17855, -31521, 6270, -17855, // w29 w28 w25 w24 - 6270, 26722, 26722, -31521, // w31 w30 w27 w26 -// %3 for rows 2,6 - constants are 
multiplied by cos_2_16 - 21407, 27969, 21407, 11585, // movq-> w05 w04 w01 w00 - 21407, 11585, -21407, -27969, // w07 w06 w03 w02 - 21407, -11585, 21407, -27969, // w13 w12 w09 w08 - -21407, 27969, 21407, -11585, // w15 w14 w11 w10 - 29692, 25172, 25172, -5906, // w21 w20 w17 w16 - 16819, 5906, -29692, -16819, // w23 w22 w19 w18 - 16819, -29692, 5906, -16819, // w29 w28 w25 w24 - 5906, 25172, 25172, -29692, // w31 w30 w27 w26 -// %3 for rows 3,5 - constants are multiplied by cos_3_16 - 19266, 25172, 19266, 10426, // movq-> w05 w04 w01 w00 - 19266, 10426, -19266, -25172, // w07 w06 w03 w02 - 19266, -10426, 19266, -25172, // w13 w12 w09 w08 - -19266, 25172, 19266, -10426, // w15 w14 w11 w10 - 26722, 22654, 22654, -5315, // w21 w20 w17 w16 - 15137, 5315, -26722, -15137, // w23 w22 w19 w18 - 15137, -26722, 5315, -15137, // w29 w28 w25 w24 - 5315, 22654, 22654, -26722, // w31 w30 w27 w26 -}; -// ============================================================================= -// Helper macros for the code -// ============================================================================= - -// ----------------------------------------------------------------------------- -// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER -// ----------------------------------------------------------------------------- - -#define DCT_8_INV_ROW_MMX(A1, A2, A3, A4) \ - "movq "#A1", %%mm0 \n\t" /* 0 ; x3 x2 x1 x0 */ \ - "movq 8+"#A1", %%mm1 \n\t" /* 1 ; x7 x6 x5 x4 */ \ - "movq %%mm0, %%mm2 \n\t" /* 2 ; x3 x2 x1 x0 */ \ - "movq "#A3", %%mm3 \n\t" /* 3 ; w06 w04 w02 w00 */ \ - "punpcklwd %%mm1, %%mm0 \n\t" /* x5 x1 x4 x0 */ \ - "movq %%mm0, %%mm5 \n\t" /* 5 ; x5 x1 x4 x0 */ \ - "punpckldq %%mm0, %%mm0 \n\t" /* x4 x0 x4 x0 */ \ - "movq 8+"#A3", %%mm4 \n\t" /* 4 ; w07 w05 w03 w01 */ \ - "punpckhwd %%mm1, %%mm2 \n\t" /* 1 ; x7 x3 x6 x2 */ \ - "pmaddwd %%mm0, %%mm3 \n\t" /* x4*w06+x0*w04 x4*w02+x0*w00 */ \ - "movq %%mm2, %%mm6 \n\t" /* 6 ; x7 x3 x6 x2 */ \ - "movq 32+"#A3", %%mm1 \n\t" /* 1 ; w22 w20 w18 w16 */ \ - "punpckldq %%mm2, %%mm2 \n\t" /* x6 x2 x6 x2 */ \ - "pmaddwd %%mm2, %%mm4 \n\t" /* x6*w07+x2*w05 x6*w03+x2*w01 */ \ - "punpckhdq %%mm5, %%mm5 \n\t" /* x5 x1 x5 x1 */ \ - "pmaddwd 16+"#A3", %%mm0 \n\t" /* x4*w14+x0*w12 x4*w10+x0*w08 */ \ - "punpckhdq %%mm6, %%mm6 \n\t" /* x7 x3 x7 x3 */ \ - "movq 40+ "#A3", %%mm7 \n\t" /* 7 ; w23 w21 w19 w17 */ \ - "pmaddwd %%mm5, %%mm1 \n\t" /* x5*w22+x1*w20 x5*w18+x1*w16 */ \ - "paddd "#A4", %%mm3 \n\t" /* +%4 */ \ - "pmaddwd %%mm6, %%mm7 \n\t" /* x7*w23+x3*w21 x7*w19+x3*w17 */ \ - "pmaddwd 24+"#A3", %%mm2 \n\t" /* x6*w15+x2*w13 x6*w11+x2*w09 */ \ - "paddd %%mm4, %%mm3 \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */ \ - "pmaddwd 48+"#A3", %%mm5 \n\t" /* x5*w30+x1*w28 x5*w26+x1*w24 */ \ - "movq %%mm3, %%mm4 \n\t" /* 4 ; a1 a0 */ \ - "pmaddwd 56+"#A3", %%mm6 \n\t" /* x7*w31+x3*w29 x7*w27+x3*w25 */ \ - "paddd %%mm7, %%mm1 \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */ \ - "paddd "#A4", %%mm0 \n\t" /* +%4 */ \ - "psubd %%mm1, %%mm3 \n\t" /* a1-b1 a0-b0 */ \ - "psrad $11, %%mm3 \n\t" /* y6=a1-b1 y7=a0-b0 */ \ - "paddd %%mm4, %%mm1 \n\t" /* 4 ; a1+b1 a0+b0 */ \ - "paddd %%mm2, %%mm0 \n\t" /* 2 ; a3=sum(even3) a2=sum(even2) */ \ - "psrad $11, %%mm1 \n\t" /* y1=a1+b1 y0=a0+b0 */ \ - "paddd %%mm6, %%mm5 \n\t" /* 6 ; b3=sum(odd3) b2=sum(odd2) */ \ - "movq %%mm0, %%mm4 \n\t" /* 4 ; a3 a2 */ \ - "paddd %%mm5, %%mm0 \n\t" /* a3+b3 a2+b2 */ \ - "psubd %%mm5, %%mm4 \n\t" /* 5 ; a3-b3 a2-b2 */ \ - "psrad $11, %%mm0 \n\t" /* y3=a3+b3 y2=a2+b2 */ \ - "psrad $11, %%mm4 \n\t" /* y4=a3-b3 y5=a2-b2 */ \ - 
"packssdw %%mm0, %%mm1 \n\t" /* 0 ; y3 y2 y1 y0 */ \ - "packssdw %%mm3, %%mm4 \n\t" /* 3 ; y6 y7 y4 y5 */ \ - "movq %%mm4, %%mm7 \n\t" /* 7 ; y6 y7 y4 y5 */ \ - "psrld $16, %%mm4 \n\t" /* 0 y6 0 y4 */ \ - "pslld $16, %%mm7 \n\t" /* y7 0 y5 0 */ \ - "movq %%mm1, "#A2" \n\t" /* 1 ; save y3 y2 y1 y0 */ \ - "por %%mm4, %%mm7 \n\t" /* 4 ; y7 y6 y5 y4 */ \ - "movq %%mm7, 8+"#A2" \n\t" /* 7 ; save y7 y6 y5 y4 */ \ - - -// ----------------------------------------------------------------------------- -// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER -// ----------------------------------------------------------------------------- - -#define DCT_8_INV_ROW_XMM(A1, A2, A3, A4) \ - "movq "#A1", %%mm0 \n\t" /* 0 ; x3 x2 x1 x0 */ \ - "movq 8+"#A1", %%mm1 \n\t" /* 1 ; x7 x6 x5 x4 */ \ - "movq %%mm0, %%mm2 \n\t" /* 2 ; x3 x2 x1 x0 */ \ - "movq "#A3", %%mm3 \n\t" /* 3 ; w05 w04 w01 w00 */ \ - "pshufw $0x88, %%mm0, %%mm0 \n\t" /* x2 x0 x2 x0 */ \ - "movq 8+"#A3", %%mm4 \n\t" /* 4 ; w07 w06 w03 w02 */ \ - "movq %%mm1, %%mm5 \n\t" /* 5 ; x7 x6 x5 x4 */ \ - "pmaddwd %%mm0, %%mm3 \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00 */ \ - "movq 32+"#A3", %%mm6 \n\t" /* 6 ; w21 w20 w17 w16 */ \ - "pshufw $0x88, %%mm1, %%mm1 \n\t" /* x6 x4 x6 x4 */ \ - "pmaddwd %%mm1, %%mm4 \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02 */ \ - "movq 40+"#A3", %%mm7 \n\t" /* 7; w23 w22 w19 w18 */ \ - "pshufw $0xdd, %%mm2, %%mm2 \n\t" /* x3 x1 x3 x1 */ \ - "pmaddwd %%mm2, %%mm6 \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16 */ \ - "pshufw $0xdd, %%mm5, %%mm5 \n\t" /* x7 x5 x7 x5 */ \ - "pmaddwd %%mm5, %%mm7 \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18 */ \ - "paddd "#A4", %%mm3 \n\t" /* +%4 */ \ - "pmaddwd 16+"#A3", %%mm0 \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08 */ \ - "paddd %%mm4, %%mm3 \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */ \ - "pmaddwd 24+"#A3", %%mm1 \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10 */ \ - "movq %%mm3, %%mm4 \n\t" /* 4 ; a1 a0 */ \ - "pmaddwd 48+"#A3", %%mm2 \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24 */ \ - "paddd %%mm7, %%mm6 \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */ \ - "pmaddwd 56+"#A3", %%mm5 \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26 */ \ - "paddd %%mm6, %%mm3 \n\t" /* a1+b1 a0+b0 */ \ - "paddd "#A4", %%mm0 \n\t" /* +%4 */ \ - "psrad $11, %%mm3 \n\t" /* y1=a1+b1 y0=a0+b0 */ \ - "paddd %%mm1, %%mm0 \n\t" /* 1 ; a3=sum(even3) a2=sum(even2) */ \ - "psubd %%mm6, %%mm4 \n\t" /* 6 ; a1-b1 a0-b0 */ \ - "movq %%mm0, %%mm7 \n\t" /* 7 ; a3 a2 */ \ - "paddd %%mm5, %%mm2 \n\t" /* 5 ; b3=sum(odd3) b2=sum(odd2) */ \ - "paddd %%mm2, %%mm0 \n\t" /* a3+b3 a2+b2 */ \ - "psrad $11, %%mm4 \n\t" /* y6=a1-b1 y7=a0-b0 */ \ - "psubd %%mm2, %%mm7 \n\t" /* 2 ; a3-b3 a2-b2 */ \ - "psrad $11, %%mm0 \n\t" /* y3=a3+b3 y2=a2+b2 */ \ - "psrad $11, %%mm7 \n\t" /* y4=a3-b3 y5=a2-b2 */ \ - "packssdw %%mm0, %%mm3 \n\t" /* 0 ; y3 y2 y1 y0 */ \ - "packssdw %%mm4, %%mm7 \n\t" /* 4 ; y6 y7 y4 y5 */ \ - "movq %%mm3, "#A2" \n\t" /* 3 ; save y3 y2 y1 y0 */ \ - "pshufw $0xb1, %%mm7, %%mm7 \n\t" /* y7 y6 y5 y4 */ \ - "movq %%mm7, 8+"#A2" \n\t" /* 7 ; save y7 y6 y5 y4 */ \ - - -// ----------------------------------------------------------------------------- -// -// The first stage DCT 8x8 - forward DCTs of columns -// -// The %2puts are multiplied -// for rows 0,4 - on cos_4_16, -// for rows 1,7 - on cos_1_16, -// for rows 2,6 - on cos_2_16, -// for rows 3,5 - on cos_3_16 -// and are shifted to the left for rise of accuracy -// -// ----------------------------------------------------------------------------- -// -// The 8-point scaled forward DCT algorithm (26a8m) -// -// 
-// -----------------------------------------------------------------------------
-//
-// The first stage DCT 8x8 - forward DCTs of columns
-//
-// The inputs are multiplied
-// for rows 0,4 - on cos_4_16,
-// for rows 1,7 - on cos_1_16,
-// for rows 2,6 - on cos_2_16,
-// for rows 3,5 - on cos_3_16
-// and are shifted to the left for increased accuracy
-//
-// -----------------------------------------------------------------------------
-//
-// The 8-point scaled forward DCT algorithm (26a8m)
-//
-// -----------------------------------------------------------------------------
-//
-// #define DCT_8_FRW_COL(x, y)
-// {
-//     short t0, t1, t2, t3, t4, t5, t6, t7;
-//     short tp03, tm03, tp12, tm12, tp65, tm65;
-//     short tp465, tm465, tp765, tm765;
-//
-//     t0 = LEFT_SHIFT(x[0] + x[7]);
-//     t1 = LEFT_SHIFT(x[1] + x[6]);
-//     t2 = LEFT_SHIFT(x[2] + x[5]);
-//     t3 = LEFT_SHIFT(x[3] + x[4]);
-//     t4 = LEFT_SHIFT(x[3] - x[4]);
-//     t5 = LEFT_SHIFT(x[2] - x[5]);
-//     t6 = LEFT_SHIFT(x[1] - x[6]);
-//     t7 = LEFT_SHIFT(x[0] - x[7]);
-//
-//     tp03 = t0 + t3;
-//     tm03 = t0 - t3;
-//     tp12 = t1 + t2;
-//     tm12 = t1 - t2;
-//
-//     y[0] = tp03 + tp12;
-//     y[4] = tp03 - tp12;
-//
-//     y[2] = tm03 + tm12 * tg_2_16;
-//     y[6] = tm03 * tg_2_16 - tm12;
-//
-//     tp65 = (t6 + t5) * cos_4_16;
-//     tm65 = (t6 - t5) * cos_4_16;
-//
-//     tp765 = t7 + tp65;
-//     tm765 = t7 - tp65;
-//     tp465 = t4 + tm65;
-//     tm465 = t4 - tm65;
-//
-//     y[1] = tp765 + tp465 * tg_1_16;
-//     y[7] = tp765 * tg_1_16 - tp465;
-//     y[5] = tm765 * tg_3_16 + tm465;
-//     y[3] = tm765 - tm465 * tg_3_16;
-// }
-//
-// -----------------------------------------------------------------------------
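The commented-out pseudocode above translates directly into a runnable scalar routine. A double-precision version (ours, for illustration; LEFT_SHIFT is the fixed-point scaling step and is dropped here since floating point needs no headroom trick):

    #include <math.h>

    static void dct_8_frw_col_ref(const double x[8], double y[8])
    {
        const double pi = acos(-1.0);
        const double c4 = cos(4 * pi / 16); /* cos_4_16 */
        const double g1 = tan(1 * pi / 16); /* tg_1_16  */
        const double g2 = tan(2 * pi / 16); /* tg_2_16  */
        const double g3 = tan(3 * pi / 16); /* tg_3_16  */
        double t[8], tp03, tm03, tp12, tm12, tp65, tm65;
        double tp465, tm465, tp765, tm765;

        t[0] = x[0] + x[7]; t[1] = x[1] + x[6];
        t[2] = x[2] + x[5]; t[3] = x[3] + x[4];
        t[4] = x[3] - x[4]; t[5] = x[2] - x[5];
        t[6] = x[1] - x[6]; t[7] = x[0] - x[7];

        tp03 = t[0] + t[3]; tm03 = t[0] - t[3];
        tp12 = t[1] + t[2]; tm12 = t[1] - t[2];

        y[0] = tp03 + tp12;
        y[4] = tp03 - tp12;
        y[2] = tm03 + tm12 * g2;
        y[6] = tm03 * g2 - tm12;

        tp65 = (t[6] + t[5]) * c4;
        tm65 = (t[6] - t[5]) * c4;

        tp765 = t[7] + tp65; tm765 = t[7] - tp65;
        tp465 = t[4] + tm65; tm465 = t[4] - tm65;

        y[1] = tp765 + tp465 * g1;
        y[7] = tp765 * g1 - tp465;
        y[5] = tm765 * g3 + tm465;
        y[3] = tm765 - tm465 * g3;
    }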
%%mm3 \n\t" /* tm04+tm26 = a1 */ \ - "paddsw %%mm1, %%mm1 \n\t" /* b1 */ \ - "paddsw %%mm4, %%mm4 \n\t" /* b2 */ \ - "paddsw %%mm7, %%mm5 \n\t" /* tp04+tp26 = a0 */ \ - "psubsw %%mm0, %%mm6 \n\t" /* tm04-tm26 = a2 */ \ - "movq %%mm3, %%mm7 \n\t" /* a1 */ \ - "movq %%mm6, %%mm0 \n\t" /* a2 */ \ - "paddsw %%mm1, %%mm3 \n\t" /* a1+b1 */ \ - "paddsw %%mm4, %%mm6 \n\t" /* a2+b2 */ \ - "psraw $6, %%mm3 \n\t" /* dst1 */ \ - "psubsw %%mm1, %%mm7 \n\t" /* a1-b1 */ \ - "psraw $6, %%mm6 \n\t" /* dst2 */ \ - "psubsw %%mm4, %%mm0 \n\t" /* a2-b2 */ \ - "movq 3*16+"#A2", %%mm1 \n\t" /* load b0 */ \ - "psraw $6, %%mm7 \n\t" /* dst6 */ \ - "movq %%mm5, %%mm4 \n\t" /* a0 */ \ - "psraw $6, %%mm0 \n\t" /* dst5 */ \ - "movq %%mm3, 1*16+"#A2" \n\t" \ - "paddsw %%mm1, %%mm5 \n\t" /* a0+b0 */ \ - "movq %%mm6, 2*16+"#A2" \n\t" \ - "psubsw %%mm1, %%mm4 \n\t" /* a0-b0 */ \ - "movq 5*16+"#A2", %%mm3 \n\t" /* load b3 */ \ - "psraw $6, %%mm5 \n\t" /* dst0 */ \ - "movq %%mm2, %%mm6 \n\t" /* a3 */ \ - "psraw $6, %%mm4 \n\t" /* dst7 */ \ - "movq %%mm0, 5*16+"#A2" \n\t" \ - "paddsw %%mm3, %%mm2 \n\t" /* a3+b3 */ \ - "movq %%mm7, 6*16+"#A2" \n\t" \ - "psubsw %%mm3, %%mm6 \n\t" /* a3-b3 */ \ - "movq %%mm5, 0*16+"#A2" \n\t" \ - "psraw $6, %%mm2 \n\t" /* dst3 */ \ - "movq %%mm4, 7*16+"#A2" \n\t" \ - "psraw $6, %%mm6 \n\t" /* dst4 */ \ - "movq %%mm2, 3*16+"#A2" \n\t" \ - "movq %%mm6, 4*16+"#A2" \n\t" \ - -// ============================================================================= -// Code -// ============================================================================= - -// ----------------------------------------------------------------------------- -// void idct_mmx(uint16_t block[64]); -// ----------------------------------------------------------------------------- - -void ff_xvid_idct_mmx(short *block) -{ - __asm__ volatile ( - // # Process each row - DCT_8_INV_ROW_MMX(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1)) - DCT_8_INV_ROW_MMX(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1)) - DCT_8_INV_ROW_MMX(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1)) - DCT_8_INV_ROW_MMX(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1)) - DCT_8_INV_ROW_MMX(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1)) - DCT_8_INV_ROW_MMX(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1)) - DCT_8_INV_ROW_MMX(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1)) - DCT_8_INV_ROW_MMX(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1)) - - // # Process the columns (4 at a time) - DCT_8_INV_COL(0(%0), 0(%0)) - DCT_8_INV_COL(8(%0), 8(%0)) - :: "r" (block), "r" (rounder_0), "r" (tab_i_04_mmx), "r" (tg_1_16)); -} - -void ff_xvid_idct_mmx_put(uint8_t *dest, int line_size, int16_t *block) -{ - ff_xvid_idct_mmx(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_xvid_idct_mmx_add(uint8_t *dest, int line_size, int16_t *block) -{ - ff_xvid_idct_mmx(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} - -#endif /* HAVE_MMX_INLINE */ - -#if HAVE_MMXEXT_INLINE - -// ----------------------------------------------------------------------------- -// void idct_xmm(uint16_t block[64]); -// ----------------------------------------------------------------------------- - -void ff_xvid_idct_mmxext(short *block) -{ - __asm__ volatile ( - // # Process each row - DCT_8_INV_ROW_XMM(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1)) - DCT_8_INV_ROW_XMM(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1)) - DCT_8_INV_ROW_XMM(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1)) - DCT_8_INV_ROW_XMM(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1)) - DCT_8_INV_ROW_XMM(4 * 16(%0), 4 * 16(%0), 
-// =============================================================================
-// Code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// void idct_mmx(int16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmx(short *block)
-{
-    __asm__ volatile (
-        // # Process each row
-        DCT_8_INV_ROW_MMX(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
-        DCT_8_INV_ROW_MMX(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
-        DCT_8_INV_ROW_MMX(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
-        DCT_8_INV_ROW_MMX(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
-        DCT_8_INV_ROW_MMX(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
-        DCT_8_INV_ROW_MMX(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
-        DCT_8_INV_ROW_MMX(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
-        DCT_8_INV_ROW_MMX(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
-        // # Process the columns (4 at a time)
-        DCT_8_INV_COL(0(%0), 0(%0))
-        DCT_8_INV_COL(8(%0), 8(%0))
-        :: "r" (block), "r" (rounder_0), "r" (tab_i_04_mmx), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmx_put(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmx(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmx_add(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmx(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMX_INLINE */
-
-#if HAVE_MMXEXT_INLINE
-
-// -----------------------------------------------------------------------------
-// void idct_xmm(int16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmxext(short *block)
-{
-    __asm__ volatile (
-        // # Process each row
-        DCT_8_INV_ROW_XMM(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
-        DCT_8_INV_ROW_XMM(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
-        DCT_8_INV_ROW_XMM(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
-        DCT_8_INV_ROW_XMM(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
-        DCT_8_INV_ROW_XMM(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
-        DCT_8_INV_ROW_XMM(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
-        DCT_8_INV_ROW_XMM(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
-        DCT_8_INV_ROW_XMM(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
-        // # Process the columns (4 at a time)
-        DCT_8_INV_COL(0(%0), 0(%0))
-        DCT_8_INV_COL(8(%0), 8(%0))
-        :: "r" (block), "r" (rounder_0), "r" (tab_i_04_xmm), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmxext(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmxext(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMXEXT_INLINE */
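The _put and _add wrappers pair the IDCT with a clamped store: _put writes the transformed coefficients as 0..255 pixels, _add sums them onto the existing prediction with the same clamping. Scalar models (ours) of what the ff_put/add_pixels_clamped_mmx calls do:

    #include <stdint.h>

    static uint8_t clip_uint8(int v) { return (uint8_t) (v < 0 ? 0 : v > 255 ? 255 : v); }

    static void put_pixels_clamped_ref(const int16_t *block, uint8_t *dest,
                                       int line_size)
    {
        for (int y = 0; y < 8; y++, dest += line_size, block += 8)
            for (int x = 0; x < 8; x++)
                dest[x] = clip_uint8(block[x]);
    }

    static void add_pixels_clamped_ref(const int16_t *block, uint8_t *dest,
                                       int line_size)
    {
        for (int y = 0; y < 8; y++, dest += line_size, block += 8)
            for (int x = 0; x < 8; x++)
                dest[x] = clip_uint8(dest[x] + block[x]);
    }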
diff --git a/libavcodec/x86/xvididct_sse2.c b/libavcodec/x86/xvididct_sse2.c
deleted file mode 100644
index d4f01693ea..0000000000
--- a/libavcodec/x86/xvididct_sse2.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- *  - SSE2 inverse discrete cosine transform -
- *
- * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
- *
- * Conversion to gcc syntax with modifications
- * by Alexander Strange <astrange@ithinksw.com>
- *
- * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
- *
- * This file is part of Libav.
- *
- * Vertical pass is an implementation of the scheme:
- *  Loeffler C., Ligtenberg A., and Moschytz C.S.:
- *  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
- *  Proc. ICASSP 1989, 988-991.
- *
- * Horizontal pass is a double 4x4 vector/matrix multiplication,
- * (see also Intel's Application Note 922:
- *  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- *  Copyright (C) 1999 Intel Corporation)
- *
- * More details at http://skal.planet-d.net/coding/dct.html
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/internal.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_SSE2_INLINE
-
-/**
- * @file
- * @brief SSE2 IDCT compatible with the Xvid IDCT
- */
-
-#define X8(x) x, x, x, x, x, x, x, x
-
-DECLARE_ASM_CONST(16, int16_t, tan1)[]  = { X8(13036) }; // tan(pi/16)
-DECLARE_ASM_CONST(16, int16_t, tan2)[]  = { X8(27146) }; // tan(2pi/16) = sqrt(2)-1
-DECLARE_ASM_CONST(16, int16_t, tan3)[]  = { X8(43790) }; // tan(3pi/16)-1
-DECLARE_ASM_CONST(16, int16_t, sqrt2)[] = { X8(23170) }; // 0.5/sqrt(2)
-DECLARE_ASM_CONST(8,  uint8_t, m127)[]  = { X8(127) };
-
-DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
-    0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
-    0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
-    0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
-    0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
-    0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
-    0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
-    0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
-    0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
-    0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
-    0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
-    0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
-    0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
-    0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
-    0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
-    0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
-    0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
-};
-
-DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
-    65536, 65536, 65536, 65536,
-     3597,  3597,  3597,  3597,
-     2260,  2260,  2260,  2260,
-     1203,  1203,  1203,  1203,
-      120,   120,   120,   120,
-      512,   512,   512,   512
-};
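The scalar constants above are 1.16 fixed point and wrap modulo 65536 when stored as int16_t, which is exactly what makes the pmulhw "t - 1" trick from the MMX file work here too. A short check (ours, for illustration) that regenerates them:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const double pi = acos(-1.0);
        printf("%.0f\n", floor(tan(pi / 16)     * 65536 + 0.5)); /* 13036 */
        printf("%.0f\n", floor(tan(2 * pi / 16) * 65536 + 0.5)); /* 27146 */
        printf("%.0f\n", floor(tan(3 * pi / 16) * 65536 + 0.5)); /* 43790 */
        printf("%.0f\n", floor(0.5 / sqrt(2.0)  * 65536 + 0.5)); /* 23170 */
        return 0;
    }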
-
-// Temporary storage before the column pass
-#define ROW1 "%%xmm6"
-#define ROW3 "%%xmm4"
-#define ROW5 "%%xmm5"
-#define ROW7 "%%xmm7"
-
-#define CLEAR_ODD(r) "pxor "r","r" \n\t"
-#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
-
-#if ARCH_X86_64
-
-# define ROW0 "%%xmm8"
-# define REG0 ROW0
-# define ROW2 "%%xmm9"
-# define REG2 ROW2
-# define ROW4 "%%xmm10"
-# define REG4 ROW4
-# define ROW6 "%%xmm11"
-# define REG6 ROW6
-# define CLEAR_EVEN(r) CLEAR_ODD(r)
-# define PUT_EVEN(dst) PUT_ODD(dst)
-# define XMMS "%%xmm12"
-# define MOV_32_ONLY "#"
-# define SREG2 REG2
-# define TAN3 "%%xmm13"
-# define TAN1 "%%xmm14"
-
-#else
-
-# define ROW0 "(%0)"
-# define REG0 "%%xmm4"
-# define ROW2 "2*16(%0)"
-# define REG2 "%%xmm4"
-# define ROW4 "4*16(%0)"
-# define REG4 "%%xmm6"
-# define ROW6 "6*16(%0)"
-# define REG6 "%%xmm6"
-# define CLEAR_EVEN(r)
-# define PUT_EVEN(dst) \
-    "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
-    "movdqa %%xmm2, "dst" \n\t"
-# define XMMS "%%xmm2"
-# define MOV_32_ONLY "movdqa "
-# define SREG2 "%%xmm7"
-# define TAN3 "%%xmm0"
-# define TAN1 "%%xmm2"
-
-#endif
-
-#define ROUND(x) "paddd "MANGLE(x)
-
-#define JZ(reg, to) \
-    "testl "reg","reg" \n\t" \
-    "jz "to" \n\t"
-
-#define JNZ(reg, to) \
-    "testl "reg","reg" \n\t" \
-    "jnz "to" \n\t"
-
-#define TEST_ONE_ROW(src, reg, clear) \
-    clear \
-    "movq "src", %%mm1 \n\t" \
-    "por 8+"src", %%mm1 \n\t" \
-    "paddusb %%mm0, %%mm1 \n\t" \
-    "pmovmskb %%mm1, "reg" \n\t"
-
-#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
-    clear1 \
-    clear2 \
-    "movq "row1", %%mm1 \n\t" \
-    "por 8+"row1", %%mm1 \n\t" \
-    "movq "row2", %%mm2 \n\t" \
-    "por 8+"row2", %%mm2 \n\t" \
-    "paddusb %%mm0, %%mm1 \n\t" \
-    "paddusb %%mm0, %%mm2 \n\t" \
-    "pmovmskb %%mm1, "reg1" \n\t" \
-    "pmovmskb %%mm2, "reg2" \n\t"
-
-/// IDCT pass on rows.
-#define iMTX_MULT(src, table, rounder, put) \
-    "movdqa "src", %%xmm3 \n\t" \
-    "movdqa %%xmm3, %%xmm0 \n\t" \
-    "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
-    "punpcklqdq %%xmm0, %%xmm0 \n\t"    /* 0246 */ \
-    "pmaddwd "table", %%xmm0 \n\t" \
-    "pmaddwd 16+"table", %%xmm1 \n\t" \
-    "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
-    "punpckhqdq %%xmm3, %%xmm3 \n\t"    /* 1357 */ \
-    "pmaddwd 32+"table", %%xmm2 \n\t" \
-    "pmaddwd 48+"table", %%xmm3 \n\t" \
-    "paddd %%xmm1, %%xmm0 \n\t" \
-    "paddd %%xmm3, %%xmm2 \n\t" \
-    rounder", %%xmm0 \n\t" \
-    "movdqa %%xmm2, %%xmm3 \n\t" \
-    "paddd %%xmm0, %%xmm2 \n\t" \
-    "psubd %%xmm3, %%xmm0 \n\t" \
-    "psrad $11, %%xmm2 \n\t" \
-    "psrad $11, %%xmm0 \n\t" \
-    "packssdw %%xmm0, %%xmm2 \n\t" \
-    put \
-    "1: \n\t"
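TEST_ONE_ROW and TEST_TWO_ROWS compute a branchless zero-row predicate: por folds the row's two qwords into 8 bytes, paddusb with the m127 constant drives the top bit of every nonzero byte to 1 (saturating, so bytes never wrap back to zero), and pmovmskb gathers those top bits into a general-purpose register. A scalar model (ours, for illustration):

    #include <stdint.h>

    /* Returns nonzero iff any coefficient of the 8-wide row is nonzero;
     * equivalent to the por + paddusb(127) + pmovmskb sequence above. */
    static int row_is_nonzero(const int16_t row[8])
    {
        for (int i = 0; i < 8; i++)
            if (row[i])
                return 1;
        return 0;
    }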
-
-#define iLLM_HEAD \
-    "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
-    "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
-
-/// IDCT pass on columns.
-#define iLLM_PASS(dct) \
-    "movdqa "TAN3", %%xmm1 \n\t" \
-    "movdqa "TAN1", %%xmm3 \n\t" \
-    "pmulhw %%xmm4, "TAN3" \n\t" \
-    "pmulhw %%xmm5, %%xmm1 \n\t" \
-    "paddsw %%xmm4, "TAN3" \n\t" \
-    "paddsw %%xmm5, %%xmm1 \n\t" \
-    "psubsw %%xmm5, "TAN3" \n\t" \
-    "paddsw %%xmm4, %%xmm1 \n\t" \
-    "pmulhw %%xmm7, %%xmm3 \n\t" \
-    "pmulhw %%xmm6, "TAN1" \n\t" \
-    "paddsw %%xmm6, %%xmm3 \n\t" \
-    "psubsw %%xmm7, "TAN1" \n\t" \
-    "movdqa %%xmm3, %%xmm7 \n\t" \
-    "movdqa "TAN1", %%xmm6 \n\t" \
-    "psubsw %%xmm1, %%xmm3 \n\t" \
-    "psubsw "TAN3", "TAN1" \n\t" \
-    "paddsw %%xmm7, %%xmm1 \n\t" \
-    "paddsw %%xmm6, "TAN3" \n\t" \
-    "movdqa %%xmm3, %%xmm6 \n\t" \
-    "psubsw "TAN3", %%xmm3 \n\t" \
-    "paddsw %%xmm6, "TAN3" \n\t" \
-    "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
-    "pmulhw %%xmm4, %%xmm3 \n\t" \
-    "pmulhw %%xmm4, "TAN3" \n\t" \
-    "paddsw "TAN3", "TAN3" \n\t" \
-    "paddsw %%xmm3, %%xmm3 \n\t" \
-    "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
-    MOV_32_ONLY ROW2", "REG2" \n\t" \
-    MOV_32_ONLY ROW6", "REG6" \n\t" \
-    "movdqa %%xmm7, %%xmm5 \n\t" \
-    "pmulhw "REG6", %%xmm7 \n\t" \
-    "pmulhw "REG2", %%xmm5 \n\t" \
-    "paddsw "REG2", %%xmm7 \n\t" \
-    "psubsw "REG6", %%xmm5 \n\t" \
-    MOV_32_ONLY ROW0", "REG0" \n\t" \
-    MOV_32_ONLY ROW4", "REG4" \n\t" \
-    MOV_32_ONLY" "TAN1", (%0) \n\t" \
-    "movdqa "REG0", "XMMS" \n\t" \
-    "psubsw "REG4", "REG0" \n\t" \
-    "paddsw "XMMS", "REG4" \n\t" \
-    "movdqa "REG4", "XMMS" \n\t" \
-    "psubsw %%xmm7, "REG4" \n\t" \
-    "paddsw "XMMS", %%xmm7 \n\t" \
-    "movdqa "REG0", "XMMS" \n\t" \
-    "psubsw %%xmm5, "REG0" \n\t" \
-    "paddsw "XMMS", %%xmm5 \n\t" \
-    "movdqa %%xmm5, "XMMS" \n\t" \
-    "psubsw "TAN3", %%xmm5 \n\t" \
-    "paddsw "XMMS", "TAN3" \n\t" \
-    "movdqa "REG0", "XMMS" \n\t" \
-    "psubsw %%xmm3, "REG0" \n\t" \
-    "paddsw "XMMS", %%xmm3 \n\t" \
-    MOV_32_ONLY" (%0), "TAN1" \n\t" \
-    "psraw $6, %%xmm5 \n\t" \
-    "psraw $6, "REG0" \n\t" \
-    "psraw $6, "TAN3" \n\t" \
-    "psraw $6, %%xmm3 \n\t" \
-    "movdqa "TAN3", 1*16("dct") \n\t" \
-    "movdqa %%xmm3, 2*16("dct") \n\t" \
-    "movdqa "REG0", 5*16("dct") \n\t" \
-    "movdqa %%xmm5, 6*16("dct") \n\t" \
-    "movdqa %%xmm7, %%xmm0 \n\t" \
-    "movdqa "REG4", %%xmm4 \n\t" \
-    "psubsw %%xmm1, %%xmm7 \n\t" \
-    "psubsw "TAN1", "REG4" \n\t" \
-    "paddsw %%xmm0, %%xmm1 \n\t" \
-    "paddsw %%xmm4, "TAN1" \n\t" \
-    "psraw $6, %%xmm1 \n\t" \
-    "psraw $6, %%xmm7 \n\t" \
-    "psraw $6, "TAN1" \n\t" \
-    "psraw $6, "REG4" \n\t" \
-    "movdqa %%xmm1, ("dct") \n\t" \
-    "movdqa "TAN1", 3*16("dct") \n\t" \
-    "movdqa "REG4", 4*16("dct") \n\t" \
-    "movdqa %%xmm7, 7*16("dct") \n\t"
-
-/// IDCT pass on columns, assuming rows 4-7 are zero.
-#define iLLM_PASS_SPARSE(dct) \
-    "pmulhw %%xmm4, "TAN3" \n\t" \
-    "paddsw %%xmm4, "TAN3" \n\t" \
-    "movdqa %%xmm6, %%xmm3 \n\t" \
-    "pmulhw %%xmm6, "TAN1" \n\t" \
-    "movdqa %%xmm4, %%xmm1 \n\t" \
-    "psubsw %%xmm1, %%xmm3 \n\t" \
-    "paddsw %%xmm6, %%xmm1 \n\t" \
-    "movdqa "TAN1", %%xmm6 \n\t" \
-    "psubsw "TAN3", "TAN1" \n\t" \
-    "paddsw %%xmm6, "TAN3" \n\t" \
-    "movdqa %%xmm3, %%xmm6 \n\t" \
-    "psubsw "TAN3", %%xmm3 \n\t" \
-    "paddsw %%xmm6, "TAN3" \n\t" \
-    "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
-    "pmulhw %%xmm4, %%xmm3 \n\t" \
-    "pmulhw %%xmm4, "TAN3" \n\t" \
-    "paddsw "TAN3", "TAN3" \n\t" \
-    "paddsw %%xmm3, %%xmm3 \n\t" \
-    "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
-    MOV_32_ONLY ROW2", "SREG2" \n\t" \
-    "pmulhw "SREG2", %%xmm5 \n\t" \
-    MOV_32_ONLY ROW0", "REG0" \n\t" \
-    "movdqa "REG0", %%xmm6 \n\t" \
-    "psubsw "SREG2", %%xmm6 \n\t" \
-    "paddsw "REG0", "SREG2" \n\t" \
-    MOV_32_ONLY" "TAN1", (%0) \n\t" \
-    "movdqa "REG0", "XMMS" \n\t" \
-    "psubsw %%xmm5, "REG0" \n\t" \
-    "paddsw "XMMS", %%xmm5 \n\t" \
-    "movdqa %%xmm5, "XMMS" \n\t" \
-    "psubsw "TAN3", %%xmm5 \n\t" \
-    "paddsw "XMMS", "TAN3" \n\t" \
-    "movdqa "REG0", "XMMS" \n\t" \
-    "psubsw %%xmm3, "REG0" \n\t" \
-    "paddsw "XMMS", %%xmm3 \n\t" \
-    MOV_32_ONLY" (%0), "TAN1" \n\t" \
-    "psraw $6, %%xmm5 \n\t" \
-    "psraw $6, "REG0" \n\t" \
-    "psraw $6, "TAN3" \n\t" \
-    "psraw $6, %%xmm3 \n\t" \
-    "movdqa "TAN3", 1*16("dct") \n\t" \
-    "movdqa %%xmm3, 2*16("dct") \n\t" \
-    "movdqa "REG0", 5*16("dct") \n\t" \
-    "movdqa %%xmm5, 6*16("dct") \n\t" \
-    "movdqa "SREG2", %%xmm0 \n\t" \
-    "movdqa %%xmm6, %%xmm4 \n\t" \
-    "psubsw %%xmm1, "SREG2" \n\t" \
-    "psubsw "TAN1", %%xmm6 \n\t" \
-    "paddsw %%xmm0, %%xmm1 \n\t" \
-    "paddsw %%xmm4, "TAN1" \n\t" \
-    "psraw $6, %%xmm1 \n\t" \
-    "psraw $6, "SREG2" \n\t" \
-    "psraw $6, "TAN1" \n\t" \
-    "psraw $6, %%xmm6 \n\t" \
-    "movdqa %%xmm1, ("dct") \n\t" \
-    "movdqa "TAN1", 3*16("dct") \n\t" \
-    "movdqa %%xmm6, 4*16("dct") \n\t" \
-    "movdqa "SREG2", 7*16("dct") \n\t"
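The sparse variant exists because, after quantization, the bottom rows of a block are very often all zero. A control-flow sketch (ours, much coarser than the real code, which interleaves the tests with the row passes) of how the function below chooses between the two column passes:

    #include <stdint.h>

    static int row_nonzero(const int16_t *row)
    {
        for (int i = 0; i < 8; i++)
            if (row[i])
                return 1;
        return 0;
    }

    static void xvid_idct_sse2_model(int16_t b[64])
    {
        /* rows 0-2 are always transformed; rows 3-7 only when nonzero */
        int sparse = !row_nonzero(b + 4 * 8) && !row_nonzero(b + 5 * 8) &&
                     !row_nonzero(b + 6 * 8) && !row_nonzero(b + 7 * 8);
        if (sparse) {
            /* iLLM_PASS_SPARSE: column pass assuming rows 4-7 are zero */
        } else {
            /* transform the remaining rows, then the full iLLM_PASS */
        }
    }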
-
-inline void ff_xvid_idct_sse2(short *block)
-{
-    __asm__ volatile (
-        "movq "MANGLE(m127)", %%mm0 \n\t"
-        iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(walkenIdctRounders),          PUT_EVEN(ROW0))
-        iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 1 * 16), PUT_ODD(ROW1))
-        iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 2 * 16), PUT_EVEN(ROW2))
-
-        TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
-        JZ("%%eax", "1f")
-        iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 3 * 16), PUT_ODD(ROW3))
-
-        TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
-        TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
-        iLLM_HEAD
-        ".p2align 4 \n\t"
-        JNZ("%%ecx", "2f")
-        JNZ("%%eax", "3f")
-        JNZ("%%edx", "4f")
-        JNZ("%%esi", "5f")
-        iLLM_PASS_SPARSE("%0")
-        "jmp 6f \n\t"
-        "2: \n\t"
-        iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#",                                PUT_EVEN(ROW4))
-        "3: \n\t"
-        iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 4 * 16), PUT_ODD(ROW5))
-        JZ("%%edx", "1f")
-        "4: \n\t"
-        iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 5 * 16), PUT_EVEN(ROW6))
-        JZ("%%esi", "1f")
-        "5: \n\t"
-        iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 5 * 16), PUT_ODD(ROW7))
-#if ARCH_X86_32
-        iLLM_HEAD
-#endif
-        iLLM_PASS("%0")
-        "6: \n\t"
-        : "+r" (block)
-        :
-        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                       "%xmm4", "%xmm5", "%xmm6", "%xmm7", )
-#if ARCH_X86_64
-          XMM_CLOBBERS("%xmm8", "%xmm9", "%xmm10", "%xmm11",
-                       "%xmm12", "%xmm13", "%xmm14", )
-#endif
-          "%eax", "%ecx", "%edx", "%esi", "memory");
-}
-
-void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block)
-{
-    ff_xvid_idct_sse2(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
-{
-    ff_xvid_idct_sse2(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_SSE2_INLINE */
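A caller selects among the MMX, MMXEXT and SSE2 flavours at init time based on CPU features. A sketch (ours; av_get_cpu_flags() and the AV_CPU_FLAG_* constants are real libavutil API, but the struct and wiring below are an assumption, not the init code removed by this commit):

    #include <stdint.h>
    #include "libavutil/cpu.h"

    typedef struct XvidIdctFns {
        void (*idct_put)(uint8_t *dest, int line_size, int16_t *block);
        void (*idct_add)(uint8_t *dest, int line_size, int16_t *block);
    } XvidIdctFns;

    static void pick_xvid_idct(XvidIdctFns *c)
    {
        int cpu_flags = av_get_cpu_flags();

    #if HAVE_MMX_INLINE
        if (cpu_flags & AV_CPU_FLAG_MMX) {
            c->idct_put = ff_xvid_idct_mmx_put;
            c->idct_add = ff_xvid_idct_mmx_add;
        }
    #endif
    #if HAVE_MMXEXT_INLINE
        if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
            c->idct_put = ff_xvid_idct_mmxext_put;
            c->idct_add = ff_xvid_idct_mmxext_add;
        }
    #endif
    #if HAVE_SSE2_INLINE
        if (cpu_flags & AV_CPU_FLAG_SSE2) {
            c->idct_put = (void (*)(uint8_t *, int, int16_t *)) ff_xvid_idct_sse2_put;
            c->idct_add = (void (*)(uint8_t *, int, int16_t *)) ff_xvid_idct_sse2_add;
        }
    #endif
    }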