diff options
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/mips/hevc_mc_uni_msa.c | 372 |
1 files changed, 133 insertions, 239 deletions
diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c index 754fbdbb41..cf22e7f1a1 100644 --- a/libavcodec/mips/hevc_mc_uni_msa.c +++ b/libavcodec/mips/hevc_mc_uni_msa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) * * This file is part of FFmpeg. * @@ -359,16 +359,14 @@ static const uint8_t mc_filt_mask_arr[16 * 3] = { static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { v16u8 mask0, mask1, mask2, mask3, out; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v8i16 filt, out0, out1; - v8i16 rnd_vec; mask0 = LD_UB(&mc_filt_mask_arr[16]); src -= 3; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -382,7 +380,7 @@ static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1); - SRAR_H2_SH(out0, out1, rnd_vec); + SRARI_H2_SH(out0, out1, 6); SAT_SH2_SH(out0, out1, 7); out = PCKEV_XORI128_UB(out0, out1); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -390,17 +388,15 @@ static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride, static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { v16i8 filt0, filt1, filt2, filt3; v16i8 src0, src1, src2, src3; v16u8 mask0, mask1, mask2, mask3, out; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_UB(&mc_filt_mask_arr[16]); src -= 3; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -419,7 +415,7 @@ static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -430,16 +426,14 @@ static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride, static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { v16u8 mask0, mask1, mask2, mask3, out; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_UB(&mc_filt_mask_arr[16]); src -= 3; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -459,7 +453,7 @@ static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride, src += (4 * src_stride); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -479,7 +473,7 @@ static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride, HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -490,30 +484,27 @@ static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride, static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, uint8_t rnd_val) + const int8_t *filter, int32_t height) { if (4 == height) { - common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter); } else if (8 == height) { - common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter); } else if (16 == height) { - common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter, - rnd_val); + common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter); } } static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -528,7 +519,7 @@ static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); tmp1 = PCKEV_XORI128_UB(out2, out3); @@ -537,18 +528,15 @@ static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride, static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -565,7 +553,7 @@ static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); tmp1 = PCKEV_XORI128_UB(out2, out3); @@ -576,32 +564,28 @@ static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride, static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { if (4 == height) { - common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter); } else { common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, - height, rnd_val); + height); } } static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint8_t *src1_ptr, *dst1; uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v8i16 filt, out0, out1, out2, out3; v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1; - v8i16 rnd_vec; mask00 = LD_UB(&mc_filt_mask_arr[0]); mask0 = LD_UB(&mc_filt_mask_arr[16]); - rnd_vec = __msa_fill_h(rnd_val); src1_ptr = src - 3; dst1 = dst; @@ -628,7 +612,7 @@ static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask00, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); tmp1 = PCKEV_XORI128_UB(out2, out3); @@ -642,7 +626,7 @@ static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask4, mask5, mask6, filt0, filt1, filt2, filt3, out0, out1); - SRAR_H2_SH(out0, out1, rnd_vec); + SRARI_H2_SH(out0, out1, 6); SAT_SH2_SH(out0, out1, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride); @@ -652,18 +636,15 @@ static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, out; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -681,7 +662,7 @@ static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); ST_UB(out, dst); @@ -694,8 +675,7 @@ static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; @@ -704,11 +684,9 @@ static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, v16i8 vec11; v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10; v8i16 out11, filt; - v8i16 rnd_vec; mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -754,8 +732,8 @@ static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0, out8, out2, out9); ADDS_SH2_SH(out1, out5, out3, out7, out1, out3); - SRAR_H4_SH(out0, out8, out2, out9, rnd_vec); - SRAR_H2_SH(out1, out3, rnd_vec); + SRARI_H4_SH(out0, out8, out2, out9, 6); + SRARI_H2_SH(out1, out3, 6); SAT_SH4_SH(out0, out8, out2, out9, 7); SAT_SH2_SH(out1, out3, 7); out = PCKEV_XORI128_UB(out8, out9); @@ -771,18 +749,15 @@ static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, out; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -802,7 +777,7 @@ static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); src0 = LD_SB(src); @@ -821,7 +796,7 @@ static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); ST_UB(out, dst); @@ -833,18 +808,15 @@ static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2; v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out; v8i16 filt, out0, out1, out2, out3, out4, out5, out6; - v8i16 rnd_vec; mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -879,8 +851,8 @@ static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, out5 = __msa_dpadd_s_h(out5, vec2, filt3); ADDS_SH2_SH(out0, out3, out1, out4, out0, out1); out2 = __msa_adds_s_h(out2, out5); - SRAR_H2_SH(out0, out1, rnd_vec); - out6 = __msa_srar_h(out2, rnd_vec); + SRARI_H2_SH(out0, out1, 6); + out6 = __msa_srari_h(out2, 6); SAT_SH3_SH(out0, out1, out6, 7); out = PCKEV_XORI128_UB(out0, out1); ST_UB(out, dst); @@ -905,7 +877,8 @@ static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, out5 = __msa_dpadd_s_h(out5, vec2, filt3); ADDS_SH2_SH(out0, out3, out1, out4, out3, out4); out5 = __msa_adds_s_h(out2, out5); - SRAR_H3_SH(out3, out4, out5, rnd_vec); + SRARI_H2_SH(out3, out4, 6); + out5 = __msa_srari_h(out5, 6); SAT_SH3_SH(out3, out4, out5, 7); out = PCKEV_XORI128_UB(out6, out3); ST_UB(out, dst + 16); @@ -917,18 +890,15 @@ static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { int32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, out; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -948,7 +918,7 @@ static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); ST_UB(out, dst); @@ -965,7 +935,7 @@ static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); ST_UB(out, dst + 32); @@ -977,8 +947,7 @@ static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; @@ -987,10 +956,8 @@ static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, v16i8 src10998, filt0, filt1, filt2, filt3; v16u8 out; v8i16 filt, out10, out32; - v8i16 rnd_vec; src -= (3 * src_stride); - rnd_vec = __msa_fill_h(rnd_val); filt = LD_SH(filter); SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); @@ -1017,7 +984,7 @@ static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, filt1, filt2, filt3); out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, filt1, filt2, filt3); - SRAR_H2_SH(out10, out32, rnd_vec); + SRARI_H2_SH(out10, out32, 6); SAT_SH2_SH(out10, out32, 7); out = PCKEV_XORI128_UB(out10, out32); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -1032,8 +999,7 @@ static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; @@ -1041,10 +1007,8 @@ static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; v16u8 tmp0, tmp1; v8i16 filt, out0_r, out1_r, out2_r, out3_r; - v8i16 rnd_vec; src -= (3 * src_stride); - rnd_vec = __msa_fill_h(rnd_val); filt = LD_SH(filter); SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); @@ -1071,7 +1035,7 @@ static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, filt1, filt2, filt3); out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3); - SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); @@ -1090,8 +1054,7 @@ static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { int32_t loop_cnt; uint32_t out2, out3; @@ -1100,11 +1063,9 @@ static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2; v8i16 filt, filt0, filt1, filt2, filt3; - v8i16 rnd_vec; v4i32 mask = { 2, 6, 2, 6 }; src -= (3 * src_stride); - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter_y */ filt = LD_SH(filter); @@ -1140,7 +1101,8 @@ static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, vec45, vec67); tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1, filt2, filt3); - SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec); + SRARI_H2_SH(tmp0, tmp1, 6); + tmp2 = __msa_srari_h(tmp2, 6); SAT_SH3_SH(tmp0, tmp1, tmp2, 7); PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2); XORI_B3_128_SB(res0, res1, res2); @@ -1174,8 +1136,7 @@ static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; @@ -1185,10 +1146,8 @@ static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; v16u8 tmp0, tmp1, tmp2, tmp3; v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; - v8i16 rnd_vec; src -= (3 * src_stride); - rnd_vec = __msa_fill_h(rnd_val); filt = LD_SH(filter); SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); @@ -1228,8 +1187,8 @@ static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, filt1, filt2, filt3); out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, filt1, filt2, filt3); - SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); - SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6); SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, @@ -1257,7 +1216,7 @@ static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, - uint8_t rnd_val, int32_t width) + int32_t width) { uint8_t *src_tmp; uint8_t *dst_tmp; @@ -1269,10 +1228,8 @@ static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride, v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; v16u8 tmp0, tmp1, tmp2, tmp3; v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; - v8i16 rnd_vec; src -= (3 * src_stride); - rnd_vec = __msa_fill_h(rnd_val); filt = LD_SH(filter); SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); @@ -1315,8 +1272,8 @@ static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride, filt0, filt1, filt2, filt3); out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, filt1, filt2, filt3); - SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); - SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6); SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, @@ -1347,37 +1304,37 @@ static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride, static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, uint8_t rnd_val) + const int8_t *filter, int32_t height) { common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, - rnd_val, 16); + 16); common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter, - height, rnd_val); + height); } static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, uint8_t rnd_val) + const int8_t *filter, int32_t height) { common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, - rnd_val, 32); + 32); } static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, uint8_t rnd_val) + const int8_t *filter, int32_t height) { common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, - rnd_val, 48); + 48); } static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, uint8_t rnd_val) + const int8_t *filter, int32_t height) { common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, - rnd_val, 64); + 64); } static void hevc_hv_uni_8t_4w_msa(uint8_t *src, @@ -1736,16 +1693,14 @@ static void hevc_hv_uni_8t_64w_msa(uint8_t *src, static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1; v16u8 out; v8i16 filt, res0; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[16]); src -= 1; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -1757,7 +1712,7 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, XORI_B2_128_SB(src0, src1); VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1); - res0 = __msa_srar_h(res0, rnd_vec); + res0 = __msa_srari_h(res0, 6); res0 = __msa_sat_s_h(res0, 7); out = PCKEV_XORI128_UB(res0, res0); ST4x2_UB(out, dst, dst_stride); @@ -1765,16 +1720,14 @@ static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; v8i16 filt, out0, out1; v16u8 out; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[16]); src -= 1; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -1786,7 +1739,7 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1); - SRAR_H2_SH(out0, out1, rnd_vec); + SRARI_H2_SH(out0, out1, 6); SAT_SH2_SH(out0, out1, 7); out = PCKEV_XORI128_UB(out0, out1); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -1794,16 +1747,14 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; v16u8 out; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[16]); src -= 1; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -1821,7 +1772,7 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -1832,17 +1783,15 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { v16i8 src0, src1, src2, src3, src4, src5, src6, src7; v16i8 filt0, filt1, mask0, mask1; v16u8 out; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[16]); src -= 1; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -1857,7 +1806,7 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, filt0, filt1, out0, out1); HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, filt1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -1873,7 +1822,7 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, filt0, filt1, out0, out1); HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, filt1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -1884,35 +1833,30 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { if (2 == height) { - common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter); } else if (4 == height) { - common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter); } else if (8 == height) { - common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter); } else if (16 == height) { - common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter, - rnd_val); + common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter); } } static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; v16u8 out4, out5; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[0]); src -= 1; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -1927,7 +1871,7 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); out4 = PCKEV_XORI128_UB(out0, out1); @@ -1939,18 +1883,15 @@ static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, filt0, filt1, mask0, mask1; v16u8 out; v8i16 filt, vec0, vec1, vec2, vec3; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[0]); src -= 1; - rnd_vec = __msa_fill_h(rnd_val); filt = LD_SH(filter); SPLATI_H2_SB(filt, 0, 1, filt0, filt1); @@ -1966,7 +1907,7 @@ static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1); VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3); DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1); - SRAR_H2_SH(vec0, vec1, rnd_vec); + SRARI_H2_SH(vec0, vec1, 6); SAT_SH2_SH(vec0, vec1, 7); out = PCKEV_XORI128_UB(vec0, vec1); ST8x2_UB(out, dst, dst_stride); @@ -1976,18 +1917,15 @@ static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1; v16u8 tmp0, tmp1; v8i16 filt, out0, out1, out2, out3; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[0]); src -= 1; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -2002,7 +1940,7 @@ static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); tmp1 = PCKEV_XORI128_UB(out2, out3); @@ -2013,22 +1951,20 @@ static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { if ((2 == height) || (6 == height)) { common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter, - height, rnd_val); + height); } else { common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter, - height, rnd_val); + height); } } static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3; @@ -2036,7 +1972,6 @@ static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, v16i8 vec10, vec11; v16u8 tmp0, tmp1; v8i16 filt, out0, out1, out2, out3, out4, out5; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[0]); mask2 = LD_SB(&mc_filt_mask_arr[32]); @@ -2050,8 +1985,6 @@ static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, mask1 = mask0 + 2; mask3 = mask2 + 2; - rnd_vec = __msa_fill_h(rnd_val); - for (loop_cnt = (height >> 2); loop_cnt--;) { LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); @@ -2069,8 +2002,8 @@ static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1, out2, out3, out4, out5); DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); - SRAR_H2_SH(out4, out5, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); + SRARI_H2_SH(out4, out5, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); SAT_SH2_SH(out4, out5, 7); tmp0 = PCKEV_XORI128_UB(out2, out3); @@ -2084,19 +2017,16 @@ static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7; v16i8 filt0, filt1, mask0, mask1; v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; v16u8 out; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[0]); src -= 1; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -2114,8 +2044,8 @@ static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride, filt1, out0, out1, out2, out3); HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, filt1, out4, out5, out6, out7); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); - SRAR_H4_SH(out4, out5, out6, out7, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); + SRARI_H4_SH(out4, out5, out6, out7, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); SAT_SH4_SH(out4, out5, out6, out7, 7); out = PCKEV_XORI128_UB(out0, out1); @@ -2135,8 +2065,7 @@ static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride, static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint8_t *dst1 = dst + 16; uint32_t loop_cnt; @@ -2145,11 +2074,9 @@ static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, v16i8 filt0, filt1, mask0, mask1, mask00, mask11; v8i16 filt, out0, out1, out2, out3; v16u8 tmp0, tmp1; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[0]); src -= 1; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -2173,7 +2100,7 @@ static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, out0, out1, out2, out3); DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); ST_UB(tmp0, dst); @@ -2190,7 +2117,7 @@ static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, out0, out1, out2, out3); DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); ST_UB(tmp0, dst); @@ -2210,7 +2137,7 @@ static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0, out1, out2, out3); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); tmp1 = PCKEV_XORI128_UB(out2, out3); @@ -2221,19 +2148,16 @@ static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7; v16i8 filt0, filt1, mask0, mask1; v16u8 out; v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7; - v8i16 rnd_vec; mask0 = LD_SB(&mc_filt_mask_arr[0]); src -= 1; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter */ filt = LD_SH(filter); @@ -2257,8 +2181,8 @@ static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride, filt0, filt1, out0, out1, out2, out3); HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, filt1, out4, out5, out6, out7); - SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); - SRAR_H4_SH(out4, out5, out6, out7, rnd_vec); + SRARI_H4_SH(out0, out1, out2, out3, 6); + SRARI_H4_SH(out4, out5, out6, out7, 6); SAT_SH4_SH(out0, out1, out2, out3, 7); SAT_SH4_SH(out4, out5, out6, out7, 7); out = PCKEV_XORI128_UB(out0, out1); @@ -2276,16 +2200,14 @@ static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r; v16i8 src2110, src4332, filt0, filt1; v16u8 out; v8i16 filt, out10; - v8i16 rnd_vec; src -= src_stride; - rnd_vec = __msa_fill_h(rnd_val); filt = LD_SH(filter); SPLATI_H2_SB(filt, 0, 1, filt0, filt1); @@ -2301,7 +2223,7 @@ static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride, src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r); src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128); out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1); - out10 = __msa_srar_h(out10, rnd_vec); + out10 = __msa_srari_h(out10, 6); out10 = __msa_sat_s_h(out10, 7); out = PCKEV_XORI128_UB(out10, out10); ST4x2_UB(out, dst, dst_stride); @@ -2309,8 +2231,7 @@ static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5; @@ -2318,10 +2239,8 @@ static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, v16i8 src2110, src4332, filt0, filt1; v8i16 filt, out10, out32; v16u8 out; - v8i16 rnd_vec; src -= src_stride; - rnd_vec = __msa_fill_h(rnd_val); filt = LD_SH(filter); SPLATI_H2_SB(filt, 0, 1, filt0, filt1); @@ -2348,7 +2267,7 @@ static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r); src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1); - SRAR_H2_SH(out10, out32, rnd_vec); + SRARI_H2_SH(out10, out32, 6); SAT_SH2_SH(out10, out32, 7); out = PCKEV_XORI128_UB(out10, out32); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -2358,30 +2277,26 @@ static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { if (2 == height) { - common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter); } else { common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter, - height, rnd_val); + height); } } static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1; v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3; v8i16 filt, filt0, filt1; - v8i16 rnd_vec; src -= src_stride; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter_y */ filt = LD_SH(filter); @@ -2414,7 +2329,7 @@ static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride, vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1); tmp3 = FILT_4TAP_DPADD_S_H(vec30, vec12, filt0, filt1); - SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec); + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); out0 = PCKEV_XORI128_UB(tmp0, tmp1); out1 = PCKEV_XORI128_UB(tmp2, tmp3); @@ -2425,15 +2340,13 @@ static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { v16i8 src0, src1, src2, src3, src4; v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1; v16u8 out; - v8i16 rnd_vec; src -= src_stride; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter_y */ filt = LD_SH(filter); @@ -2445,7 +2358,7 @@ static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride, tmp0 = FILT_4TAP_DPADD_S_H(src01, src23, filt0, filt1); ILVR_B2_SH(src2, src1, src4, src3, src12, src34); tmp1 = FILT_4TAP_DPADD_S_H(src12, src34, filt0, filt1); - SRAR_H2_SH(tmp0, tmp1, rnd_vec); + SRARI_H2_SH(tmp0, tmp1, 6); SAT_SH2_SH(tmp0, tmp1, 7); out = PCKEV_XORI128_UB(tmp0, tmp1); ST8x2_UB(out, dst, dst_stride); @@ -2453,17 +2366,15 @@ static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, uint8_t rnd_val) + const int8_t *filter) { uint32_t loop_cnt; uint64_t out0, out1, out2; v16i8 src0, src1, src2, src3, src4, src5; v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2; v8i16 filt, filt0, filt1; - v8i16 rnd_vec; src -= src_stride; - rnd_vec = __msa_fill_h(rnd_val); /* rearranging filter_y */ filt = LD_SH(filter); @@ -2484,7 +2395,8 @@ static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride, tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1); tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt0, filt1); tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt0, filt1); - SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec); + SRARI_H2_SH(tmp0, tmp1, 6); + tmp2 = __msa_srari_h(tmp2, 6); SAT_SH3_SH(tmp0, tmp1, tmp2, 7); PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2); XORI_B2_128_SH(tmp0, tmp2); @@ -2507,18 +2419,15 @@ static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src7, src8, src9, src10; v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; v16u8 tmp0, tmp1; v8i16 filt, out0_r, out1_r, out2_r, out3_r; - v8i16 rnd_vec; src -= src_stride; - rnd_vec = __msa_fill_h(rnd_val); filt = LD_SH(filter); SPLATI_H2_SB(filt, 0, 1, filt0, filt1); @@ -2540,7 +2449,7 @@ static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1); out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1); out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1); - SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); @@ -2555,23 +2464,21 @@ static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { if (2 == height) { - common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter); } else if (6 == height) { - common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter, rnd_val); + common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter); } else { common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride, - filter, height, rnd_val); + filter, height); } } static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6; @@ -2580,14 +2487,11 @@ static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride, v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, filt, filt0, filt1; v4u32 mask = { 2, 6, 2, 6 }; - v8i16 rnd_vec; /* rearranging filter_y */ filt = LD_SH(filter); SPLATI_H2_SH(filt, 0, 1, filt0, filt1); - rnd_vec = __msa_fill_h(rnd_val); - src -= src_stride; LD_SB3(src, src_stride, src0, src1, src2); @@ -2613,8 +2517,8 @@ static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride, ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211); tmp4 = FILT_4TAP_DPADD_S_H(src87, src109, filt0, filt1); tmp5 = FILT_4TAP_DPADD_S_H(src109, src1211, filt0, filt1); - SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec); - SRAR_H2_SH(tmp4, tmp5, rnd_vec); + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6); + SRARI_H2_SH(tmp4, tmp5, 6); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); SAT_SH2_SH(tmp4, tmp5, 7); out0 = PCKEV_XORI128_UB(tmp0, tmp1); @@ -2635,8 +2539,7 @@ static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6; @@ -2644,10 +2547,8 @@ static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride, v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; v16u8 tmp0, tmp1, tmp2, tmp3; v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; - v8i16 rnd_vec; src -= src_stride; - rnd_vec = __msa_fill_h(rnd_val); filt = LD_SH(filter); SPLATI_H2_SB(filt, 0, 1, filt0, filt1); @@ -2676,8 +2577,8 @@ static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride, out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1); out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1); - SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); - SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6); SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, @@ -2696,8 +2597,7 @@ static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { uint32_t loop_cnt; uint64_t out0, out1; @@ -2707,15 +2607,12 @@ static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, v16i8 src109_r, src10_l, src32_l, src21_l, src43_l; v16u8 out; v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l; - v8i16 rnd_vec; src -= src_stride; filt = LD_SH(filter); SPLATI_H2_SB(filt, 0, 1, filt0, filt1); - rnd_vec = __msa_fill_h(rnd_val); - /* 16 width */ LD_SB3(src, src_stride, src0, src1, src2); XORI_B3_128_SB(src0, src1, src2); @@ -2752,8 +2649,8 @@ static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1); /* 16 + 8 width */ - SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); - SRAR_H2_SH(out0_l, out1_l, rnd_vec); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); + SRARI_H2_SH(out0_l, out1_l, 6); SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); SAT_SH2_SH(out0_l, out1_l, 7); out = PCKEV_XORI128_UB(out0_r, out0_l); @@ -2792,8 +2689,8 @@ static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, out3_r = FILT_4TAP_DPADD_S_H(src109_r, src87_r, filt0, filt1); /* 16 + 8 width */ - SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec); - SRAR_H2_SH(out0_l, out1_l, rnd_vec); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6); + SRARI_H2_SH(out0_l, out1_l, 6); SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); SAT_SH2_SH(out0_l, out1_l, 7); out = PCKEV_XORI128_UB(out0_r, out0_l); @@ -2812,7 +2709,7 @@ static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, - uint8_t rnd_val, int32_t width) + int32_t width) { uint32_t loop_cnt, cnt; uint8_t *dst_tmp, *src_tmp; @@ -2824,11 +2721,9 @@ static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride, v16i8 src21_l, src43_l, src87_l, src109_l; v8i16 filt; v16i8 filt0, filt1; - v8i16 rnd_vec; v16u8 out; src -= src_stride; - rnd_vec = __msa_fill_h(rnd_val); filt = LD_SH(filter); SPLATI_H2_SB(filt, 0, 1, filt0, filt1); @@ -2866,7 +2761,7 @@ static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride, out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1); /* 16 width */ - SRAR_H4_SH(out0_r, out1_r, out0_l, out1_l, rnd_vec); + SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6); SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7); out = PCKEV_XORI128_UB(out0_r, out0_l); ST_UB(out, dst_tmp); @@ -2893,7 +2788,7 @@ static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride, out3_l = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1); /* next 16 width */ - SRAR_H4_SH(out2_r, out3_r, out2_l, out3_l, rnd_vec); + SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6); SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7); out = PCKEV_XORI128_UB(out2_r, out2_l); ST_UB(out, dst_tmp + 16); @@ -2916,11 +2811,10 @@ static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride, static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int8_t *filter, int32_t height, - uint8_t rnd_val) + const int8_t *filter, int32_t height) { common_vt_4t_32w_mult_msa(src, src_stride, dst, dst_stride, - filter, height, rnd_val, 32); + filter, height, 32); } static void hevc_hv_uni_4t_4x2_msa(uint8_t *src, @@ -3885,7 +3779,7 @@ void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ \ common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \ - filter, height, 6); \ + filter, height); \ } UNI_MC(qpel, h, 4, 8, hz, mx); |