diff options
author | Shiyou Yin | 2018-09-05 18:31:05 +0800 |
---|---|---|
committer | Michael Niedermayer | 2018-09-05 21:45:52 +0200 |
commit | 776909e42e2a9cef37035f070464bbbfa3441c39 (patch) | |
tree | bdc0ce004c315b9adb9550f3b8742c64be7f2f82 | |
parent | 1a4a8df249426c85ba6c96ef5ab022afaaf4dc8f (diff) |
avcodec/mips: [loongson] reoptimize put and add pixels clamped functions.
Simplify the usage of intermediate variable addr and remove unused variable all64
in following functions:
1. ff_put_pixels_clamped_mmi
2. ff_put_signed_pixels_clamped_mmi
3. ff_add_pixels_clamped_mmi
This optimization speeds up mpeg4 decoding by about 2% on the loongson platform (tested with 3A3000).
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
-rw-r--r-- | libavcodec/mips/idctdsp_mmi.c | 155 |
1 file changed, 62 insertions, 93 deletions
diff --git a/libavcodec/mips/idctdsp_mmi.c b/libavcodec/mips/idctdsp_mmi.c index b7979650fd..a96dac4704 100644 --- a/libavcodec/mips/idctdsp_mmi.c +++ b/libavcodec/mips/idctdsp_mmi.c @@ -29,9 +29,6 @@ void ff_put_pixels_clamped_mmi(const int16_t *block, uint8_t *av_restrict pixels, ptrdiff_t line_size) { double ftmp[8]; - mips_reg addr[1]; - DECLARE_VAR_ALL64; - DECLARE_VAR_ADDRT; __asm__ volatile ( MMI_LDC1(%[ftmp0], %[block], 0x00) @@ -42,60 +39,44 @@ void ff_put_pixels_clamped_mmi(const int16_t *block, MMI_LDC1(%[ftmp5], %[block], 0x28) MMI_LDC1(%[ftmp6], %[block], 0x30) MMI_LDC1(%[ftmp7], %[block], 0x38) - PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" "packushb %[ftmp6], %[ftmp6], %[ftmp7] \n\t" MMI_SDC1(%[ftmp0], %[pixels], 0x00) - MMI_SDC1(%[ftmp2], %[addr0], 0x00) - MMI_SDXC1(%[ftmp4], %[addr0], %[line_size], 0x00) - MMI_SDXC1(%[ftmp6], %[pixels], %[line_sizex3], 0x00) - : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), - [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), - [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), - [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), - RESTRICT_ASM_ALL64 - RESTRICT_ASM_ADDRT - [addr0]"=&r"(addr[0]), - [pixels]"+&r"(pixels) - : [line_size]"r"((mips_reg)line_size), - [line_sizex3]"r"((mips_reg)(line_size*3)), - [block]"r"(block) - : "memory" - ); - - pixels += line_size*4; - block += 32; + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp2], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp4], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp6], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" - __asm__ volatile ( - MMI_LDC1(%[ftmp0], %[block], 0x00) - MMI_LDC1(%[ftmp1], %[block], 0x08) - MMI_LDC1(%[ftmp2], %[block], 0x10) - MMI_LDC1(%[ftmp3], %[block], 0x18) - MMI_LDC1(%[ftmp4], 
%[block], 0x20) - MMI_LDC1(%[ftmp5], %[block], 0x28) - MMI_LDC1(%[ftmp6], %[block], 0x30) - MMI_LDC1(%[ftmp7], %[block], 0x38) - PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t" + MMI_LDC1(%[ftmp0], %[block], 0x40) + MMI_LDC1(%[ftmp1], %[block], 0x48) + MMI_LDC1(%[ftmp2], %[block], 0x50) + MMI_LDC1(%[ftmp3], %[block], 0x58) + MMI_LDC1(%[ftmp4], %[block], 0x60) + MMI_LDC1(%[ftmp5], %[block], 0x68) + MMI_LDC1(%[ftmp6], %[block], 0x70) + MMI_LDC1(%[ftmp7], %[block], 0x78) "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" "packushb %[ftmp6], %[ftmp6], %[ftmp7] \n\t" MMI_SDC1(%[ftmp0], %[pixels], 0x00) - MMI_SDC1(%[ftmp2], %[addr0], 0x00) - MMI_SDXC1(%[ftmp4], %[addr0], %[line_size], 0x00) - MMI_SDXC1(%[ftmp6], %[pixels], %[line_sizex3], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp2], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp4], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp6], %[pixels], 0x00) : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), - RESTRICT_ASM_ALL64 - RESTRICT_ASM_ADDRT - [addr0]"=&r"(addr[0]), [pixels]"+&r"(pixels) : [line_size]"r"((mips_reg)line_size), - [line_sizex3]"r"((mips_reg)(line_size*3)), [block]"r"(block) : "memory" ); @@ -104,15 +85,9 @@ void ff_put_pixels_clamped_mmi(const int16_t *block, void ff_put_signed_pixels_clamped_mmi(const int16_t *block, uint8_t *av_restrict pixels, ptrdiff_t line_size) { - int64_t line_skip = line_size; - int64_t line_skip3 = 0; double ftmp[5]; - mips_reg addr[1]; - DECLARE_VAR_ALL64; - DECLARE_VAR_ADDRT; __asm__ volatile ( - PTR_ADDU "%[line_skip3], %[line_skip], %[line_skip] \n\t" MMI_LDC1(%[ftmp1], %[block], 0x00) MMI_LDC1(%[ftmp0], %[block], 0x08) "packsshb %[ftmp1], %[ftmp1], 
%[ftmp0] \n\t" @@ -130,12 +105,14 @@ void ff_put_signed_pixels_clamped_mmi(const int16_t *block, "paddb %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" "paddb %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" MMI_SDC1(%[ftmp1], %[pixels], 0x00) - MMI_SDXC1(%[ftmp2], %[pixels], %[line_skip], 0x00) - MMI_SDXC1(%[ftmp3], %[pixels], %[line_skip3], 0x00) - PTR_ADDU "%[line_skip3], %[line_skip3], %[line_skip] \n\t" - MMI_SDXC1(%[ftmp4], %[pixels], %[line_skip3], 0x00) - PTR_ADDU "%[addr0], %[line_skip3], %[line_skip] \n\t" - PTR_ADDU "%[pixels], %[pixels], %[addr0] \n\t" + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp2], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp3], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp4], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_LDC1(%[ftmp1], %[block], 0x40) MMI_LDC1(%[ftmp0], %[block], 0x48) "packsshb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" @@ -153,19 +130,18 @@ void ff_put_signed_pixels_clamped_mmi(const int16_t *block, "paddb %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" "paddb %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" MMI_SDC1(%[ftmp1], %[pixels], 0x00) - MMI_SDXC1(%[ftmp2], %[pixels], %[line_skip], 0x00) - PTR_ADDU "%[addr0], %[line_skip], %[line_skip] \n\t" - MMI_SDXC1(%[ftmp3], %[pixels], %[addr0], 0x00) - MMI_SDXC1(%[ftmp4], %[pixels], %[line_skip3], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp2], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp3], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp4], %[pixels], 0x00) : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), - RESTRICT_ASM_ALL64 - RESTRICT_ASM_ADDRT - [addr0]"=&r"(addr[0]), - [pixels]"+&r"(pixels), [line_skip3]"+&r"(line_skip3) + [pixels]"+&r"(pixels) : [block]"r"(block), - 
[line_skip]"r"((mips_reg)line_skip), + [line_size]"r"((mips_reg)line_size), [ff_pb_80]"f"(ff_pb_80) : "memory" ); @@ -174,49 +150,42 @@ void ff_put_signed_pixels_clamped_mmi(const int16_t *block, void ff_add_pixels_clamped_mmi(const int16_t *block, uint8_t *av_restrict pixels, ptrdiff_t line_size) { - double ftmp[8]; + double ftmp[9]; uint64_t tmp[1]; - mips_reg addr[1]; - DECLARE_VAR_ALL64; - DECLARE_VAR_ADDRT; - __asm__ volatile ( - "li %[tmp0], 0x04 \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "1: \n\t" + "li %[tmp0], 0x04 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "1: \n\t" + MMI_LDC1(%[ftmp5], %[pixels], 0x00) + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_LDC1(%[ftmp6], %[pixels], 0x00) + PTR_SUBU "%[pixels], %[pixels], %[line_size] \n\t" MMI_LDC1(%[ftmp1], %[block], 0x00) MMI_LDC1(%[ftmp2], %[block], 0x08) MMI_LDC1(%[ftmp3], %[block], 0x10) MMI_LDC1(%[ftmp4], %[block], 0x18) - MMI_LDC1(%[ftmp5], %[pixels], 0x00) - MMI_LDXC1(%[ftmp6], %[pixels], %[line_size], 0x00) - "mov.d %[ftmp7], %[ftmp5] \n\t" - "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" - "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" - "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" - "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" - "mov.d %[ftmp7], %[ftmp6] \n\t" - "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" - "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" - "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" - "paddh %[ftmp4], %[ftmp4], %[ftmp7] \n\t" - "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "packushb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + PTR_ADDIU "%[block], %[block], 0x20 \n\t" + "punpckhbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" 
+ "packushb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" MMI_SDC1(%[ftmp1], %[pixels], 0x00) - MMI_SDXC1(%[ftmp3], %[pixels], %[line_size], 0x00) - "addi %[tmp0], %[tmp0], -0x01 \n\t" - PTR_ADDIU "%[block], %[block], 0x20 \n\t" - PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" - PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" - "bnez %[tmp0], 1b" + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + MMI_SDC1(%[ftmp3], %[pixels], 0x00) + "addi %[tmp0], %[tmp0], -0x01 \n\t" + PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t" + "bnez %[tmp0], 1b \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), - [tmp0]"=&r"(tmp[0]), - RESTRICT_ASM_ALL64 - RESTRICT_ASM_ADDRT - [addr0]"=&r"(addr[0]), + [ftmp8]"=&f"(ftmp[8]), [tmp0]"=&r"(tmp[0]), [pixels]"+&r"(pixels), [block]"+&r"(block) : [line_size]"r"((mips_reg)line_size) : "memory" |