diff options
author | Ivan Kalvachev | 2017-08-05 20:18:50 +0300 |
---|---|---|
committer | Rostislav Pehlivanov | 2017-08-18 17:18:32 +0100 |
commit | 30ae07d7ef3555ec45fa53098849223fff204475 (patch) | |
tree | b92a38366756a5c748172b43514a160ac335f984 /libavutil | |
parent | cadab5a2a74d715fc16325bd89f8b8091def1083 (diff) |
Add macros to x86util.asm .
Improved version of VBROADCASTSS that works like the avx2 instruction.
Emulation of vpbroadcastd.
Horizontal sum HSUMPS that places the result in all elements.
Emulation of blendvps and pblendvb.
Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
Diffstat (limited to 'libavutil')
-rw-r--r-- | libavutil/x86/x86util.asm | 106 |
1 files changed, 98 insertions, 8 deletions
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index cc7d272cad..e1220dfc1a 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -832,14 +832,25 @@ pmaxsd %1, %2 %endmacro -%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32 -%if cpuflag(avx) - vbroadcastss %1, %2 -%else ; sse -%ifnidn %1, %2 - movss %1, %2 -%endif - shufps %1, %1, 0 +%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vbroadcastss %1, %2 +%elif cpuflag(avx) + %ifnum sizeof%2 ; avx1 register + shufps xmm%1, xmm%2, xmm%2, q0000 + %if sizeof%1 >= 32 ; mmsize>=32 + vinsertf128 %1, %1, xmm%1, 1 + %endif + %else ; avx1 memory + vbroadcastss %1, %2 + %endif +%else + %ifnum sizeof%2 ; sse register + shufps %1, %2, %2, q0000 + %else ; sse memory + movss %1, %2 + shufps %1, %1, 0 + %endif %endif %endmacro @@ -854,6 +865,21 @@ %endif %endmacro +%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vpbroadcastd %1, %2 +%elif cpuflag(avx) && sizeof%1 >= 32 + %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss +%else + %ifnum sizeof%2 ; sse2 register + pshufd %1, %2, q0000 + %else ; sse memory + movd %1, %2 + pshufd %1, %1, 0 + %endif +%endif +%endmacro + %macro SHUFFLE_MASK_W 8 %rep 8 %if %1>=0x80 @@ -918,3 +944,67 @@ movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst %endif %endmacro + +; Horizontal Sum of Packed Single precision floats +; The resulting sum is in all elements. +%macro HSUMPS 2 ; dst/src, tmp +%if cpuflag(avx) + %if sizeof%1>=32 ; avx + vperm2f128 %2, %1, %1, (0)*16+(1) + addps %1, %2 + %endif + shufps %2, %1, %1, q1032 + addps %1, %2 + shufps %2, %1, %1, q0321 + addps %1, %2 +%else ; this form is a bit faster than the short avx-like emulation. + movaps %2, %1 + shufps %1, %1, q1032 + addps %1, %2 + movaps %2, %1 + shufps %1, %1, q0321 + addps %1, %2 + ; all %1 members should be equal for as long as float a+b==b+a +%endif +%endmacro + +; Emulate blendvps if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro BLENDVPS 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + blendvps %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3d operand, you used %3 + %endif + blendvps %1, %2, %3 +%else + xorps %2, %1 + andps %2, %3 + xorps %1, %2 +%endif +%endmacro + +; Emulate pblendvb if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro PBLENDVB 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32 + %error pblendb not possible with ymm on avx1, try blendvps. + %endif + pblendvb %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3 + %endif + pblendvb %1, %2, %3 +%else + pxor %2, %1 + pand %2, %3 + pxor %1, %2 +%endif +%endmacro |