Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(93)

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c

Issue 1169543007: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include "./vp9_rtcd.h" 11 #include "./vp9_rtcd.h"
12 #include "vp9/common/mips/msa/vp9_convolve_msa.h" 12 #include "vp9/common/mips/msa/vp9_convolve_msa.h"
13 13
14 const uint8_t mc_filt_mask_arr[16 * 3] = { 14 const uint8_t mc_filt_mask_arr[16 * 3] = {
15 /* 8 width cases */ 15 /* 8 width cases */
16 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 16 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
17 /* 4 width cases */ 17 /* 4 width cases */
18 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 18 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
19 /* 4 width cases */ 19 /* 4 width cases */
20 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 20 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
21 }; 21 };
22 22
23 static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, 23 static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
24 uint8_t *dst, int32_t dst_stride, 24 uint8_t *dst, int32_t dst_stride,
25 int8_t *filter_horiz, int8_t *filter_vert, 25 int8_t *filter_horiz, int8_t *filter_vert,
26 int32_t height) { 26 int32_t height) {
27 uint32_t loop_cnt; 27 uint32_t loop_cnt;
28 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 28 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
29 v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3; 29 v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
30 v16u8 mask0, mask1, mask2, mask3; 30 v16u8 mask0, mask1, mask2, mask3, out;
31 v8i16 filt_horiz; 31 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
32 v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4; 32 v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
33 v8i16 horiz_out5, horiz_out6, horiz_out7, horiz_out8, horiz_out9; 33 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
34 v8i16 tmp0, tmp1, out0, out1, out2, out3, out4;
35 v8i16 filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3;
36 34
37 mask0 = LOAD_UB(&mc_filt_mask_arr[16]); 35 mask0 = LD_UB(&mc_filt_mask_arr[16]);
38
39 src -= (3 + 3 * src_stride); 36 src -= (3 + 3 * src_stride);
40 37
41 /* rearranging filter */ 38 /* rearranging filter */
42 filt_horiz = LOAD_SH(filter_horiz); 39 filt = LD_SH(filter_horiz);
43 filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0); 40 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
44 filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
45 filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
46 filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
47 41
48 mask1 = mask0 + 2; 42 mask1 = mask0 + 2;
49 mask2 = mask0 + 4; 43 mask2 = mask0 + 4;
50 mask3 = mask0 + 6; 44 mask3 = mask0 + 6;
51 45
52 LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 46 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
47 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
53 src += (7 * src_stride); 48 src += (7 * src_stride);
54 49
55 XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6, 50 hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
56 src0, src1, src2, src3, src4, src5, src6, 128); 51 filt_hz1, filt_hz2, filt_hz3);
52 hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
53 filt_hz1, filt_hz2, filt_hz3);
54 hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
55 filt_hz1, filt_hz2, filt_hz3);
56 hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
57 filt_hz1, filt_hz2, filt_hz3);
58 SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
57 59
58 horiz_out0 = HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3, 60 filt = LD_SH(filter_vert);
59 filt_horiz0, filt_horiz1, filt_horiz2, 61 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
60 filt_horiz3);
61 horiz_out2 = HORIZ_8TAP_FILT_2VECS(src2, src3, mask0, mask1, mask2, mask3,
62 filt_horiz0, filt_horiz1, filt_horiz2,
63 filt_horiz3);
64 horiz_out4 = HORIZ_8TAP_FILT_2VECS(src4, src5, mask0, mask1, mask2, mask3,
65 filt_horiz0, filt_horiz1, filt_horiz2,
66 filt_horiz3);
67 horiz_out5 = HORIZ_8TAP_FILT_2VECS(src5, src6, mask0, mask1, mask2, mask3,
68 filt_horiz0, filt_horiz1, filt_horiz2,
69 filt_horiz3);
70 horiz_out1 = (v8i16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
71 horiz_out3 = (v8i16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
72 62
73 filt = LOAD_SH(filter_vert); 63 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
74 filt_vert0 = __msa_splati_h(filt, 0); 64 out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
75 filt_vert1 = __msa_splati_h(filt, 1);
76 filt_vert2 = __msa_splati_h(filt, 2);
77 filt_vert3 = __msa_splati_h(filt, 3);
78
79 out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
80 out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
81 out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
82 65
83 for (loop_cnt = (height >> 2); loop_cnt--;) { 66 for (loop_cnt = (height >> 2); loop_cnt--;) {
84 LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10); 67 LD_SB4(src, src_stride, src7, src8, src9, src10);
68 XORI_B4_128_SB(src7, src8, src9, src10);
85 src += (4 * src_stride); 69 src += (4 * src_stride);
86 70
87 XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128); 71 hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
72 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
73 hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
74 out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
75 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
76 filt_vt2, filt_vt3);
88 77
89 horiz_out7 = HORIZ_8TAP_FILT_2VECS(src7, src8, mask0, mask1, mask2, mask3, 78 hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
90 filt_horiz0, filt_horiz1, filt_horiz2, 79 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
91 filt_horiz3); 80 hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
92 horiz_out6 = (v8i16)__msa_sldi_b((v16i8)horiz_out7, (v16i8)horiz_out5, 8); 81 out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
93 82 tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
94 out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6); 83 filt_vt2, filt_vt3);
95 84 SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
96 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1, 85 SAT_SH2_SH(tmp0, tmp1, 7);
97 filt_vert2, filt_vert3); 86 out = PCKEV_XORI128_UB(tmp0, tmp1);
98 87 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
99 horiz_out9 = HORIZ_8TAP_FILT_2VECS(src9, src10, mask0, mask1, mask2, mask3,
100 filt_horiz0, filt_horiz1, filt_horiz2,
101 filt_horiz3);
102 horiz_out8 = (v8i16)__msa_sldi_b((v16i8)horiz_out9, (v16i8)horiz_out7, 8);
103
104 out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8);
105
106 tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vert0, filt_vert1,
107 filt_vert2, filt_vert3);
108 tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7);
109 tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7);
110
111 PCKEV_2B_XORI128_STORE_4_BYTES_4(tmp0, tmp1, dst, dst_stride);
112 dst += (4 * dst_stride); 88 dst += (4 * dst_stride);
113 89
114 horiz_out5 = horiz_out9; 90 hz_out5 = hz_out9;
115
116 out0 = out2; 91 out0 = out2;
117 out1 = out3; 92 out1 = out3;
118 out2 = out4; 93 out2 = out4;
119 } 94 }
120 } 95 }
121 96
122 static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, 97 static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
123 uint8_t *dst, int32_t dst_stride, 98 uint8_t *dst, int32_t dst_stride,
124 int8_t *filter_horiz, int8_t *filter_vert, 99 int8_t *filter_horiz, int8_t *filter_vert,
125 int32_t height) { 100 int32_t height) {
126 uint32_t loop_cnt; 101 uint32_t loop_cnt;
127 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 102 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
128 v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3; 103 v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
129 v8i16 filt_horiz, filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3; 104 v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
130 v16u8 mask0, mask1, mask2, mask3; 105 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
131 v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3; 106 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
132 v8i16 horiz_out4, horiz_out5, horiz_out6, horiz_out7; 107 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
133 v8i16 horiz_out8, horiz_out9, horiz_out10;
134 v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; 108 v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
135 v8i16 tmp0, tmp1, tmp2, tmp3;
136 109
137 mask0 = LOAD_UB(&mc_filt_mask_arr[0]); 110 mask0 = LD_UB(&mc_filt_mask_arr[0]);
138
139 src -= (3 + 3 * src_stride); 111 src -= (3 + 3 * src_stride);
140 112
141 /* rearranging filter */ 113 /* rearranging filter */
142 filt_horiz = LOAD_SH(filter_horiz); 114 filt = LD_SH(filter_horiz);
143 filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0); 115 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
144 filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
145 filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
146 filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
147 116
148 mask1 = mask0 + 2; 117 mask1 = mask0 + 2;
149 mask2 = mask0 + 4; 118 mask2 = mask0 + 4;
150 mask3 = mask0 + 6; 119 mask3 = mask0 + 6;
151 120
152 LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 121 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
153 src += (7 * src_stride); 122 src += (7 * src_stride);
154 123
155 XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6, 124 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
156 src0, src1, src2, src3, src4, src5, src6, 128); 125 hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
126 filt_hz1, filt_hz2, filt_hz3);
127 hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
128 filt_hz1, filt_hz2, filt_hz3);
129 hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
130 filt_hz1, filt_hz2, filt_hz3);
131 hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
132 filt_hz1, filt_hz2, filt_hz3);
133 hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
134 filt_hz1, filt_hz2, filt_hz3);
135 hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
136 filt_hz1, filt_hz2, filt_hz3);
137 hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
138 filt_hz1, filt_hz2, filt_hz3);
157 139
158 horiz_out0 = HORIZ_8TAP_FILT(src0, mask0, mask1, mask2, mask3, filt_horiz0, 140 filt = LD_SH(filter_vert);
159 filt_horiz1, filt_horiz2, filt_horiz3); 141 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
160 horiz_out1 = HORIZ_8TAP_FILT(src1, mask0, mask1, mask2, mask3, filt_horiz0,
161 filt_horiz1, filt_horiz2, filt_horiz3);
162 horiz_out2 = HORIZ_8TAP_FILT(src2, mask0, mask1, mask2, mask3, filt_horiz0,
163 filt_horiz1, filt_horiz2, filt_horiz3);
164 horiz_out3 = HORIZ_8TAP_FILT(src3, mask0, mask1, mask2, mask3, filt_horiz0,
165 filt_horiz1, filt_horiz2, filt_horiz3);
166 horiz_out4 = HORIZ_8TAP_FILT(src4, mask0, mask1, mask2, mask3, filt_horiz0,
167 filt_horiz1, filt_horiz2, filt_horiz3);
168 horiz_out5 = HORIZ_8TAP_FILT(src5, mask0, mask1, mask2, mask3, filt_horiz0,
169 filt_horiz1, filt_horiz2, filt_horiz3);
170 horiz_out6 = HORIZ_8TAP_FILT(src6, mask0, mask1, mask2, mask3, filt_horiz0,
171 filt_horiz1, filt_horiz2, filt_horiz3);
172 142
173 filt = LOAD_SH(filter_vert); 143 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
174 filt_vert0 = __msa_splati_h(filt, 0); 144 ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
175 filt_vert1 = __msa_splati_h(filt, 1); 145 ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
176 filt_vert2 = __msa_splati_h(filt, 2);
177 filt_vert3 = __msa_splati_h(filt, 3);
178
179 out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
180 out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
181 out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
182 out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out1);
183 out5 = (v8i16)__msa_ilvev_b((v16i8)horiz_out4, (v16i8)horiz_out3);
184 out6 = (v8i16)__msa_ilvev_b((v16i8)horiz_out6, (v16i8)horiz_out5);
185 146
186 for (loop_cnt = (height >> 2); loop_cnt--;) { 147 for (loop_cnt = (height >> 2); loop_cnt--;) {
187 LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10); 148 LD_SB4(src, src_stride, src7, src8, src9, src10);
188 src += (4 * src_stride); 149 src += (4 * src_stride);
189 150
190 XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128); 151 XORI_B4_128_SB(src7, src8, src9, src10);
191 152
192 horiz_out7 = HORIZ_8TAP_FILT(src7, mask0, mask1, mask2, mask3, filt_horiz0, 153 hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
193 filt_horiz1, filt_horiz2, filt_horiz3); 154 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
155 out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
156 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
157 filt_vt2, filt_vt3);
194 158
195 out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6); 159 hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
196 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1, 160 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
197 filt_vert2, filt_vert3); 161 out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
198 tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7); 162 tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
163 filt_vt2, filt_vt3);
199 164
200 horiz_out8 = HORIZ_8TAP_FILT(src8, mask0, mask1, mask2, mask3, filt_horiz0, 165 hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
201 filt_horiz1, filt_horiz2, filt_horiz3); 166 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
167 out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
168 tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
169 filt_vt2, filt_vt3);
202 170
203 out7 = (v8i16)__msa_ilvev_b((v16i8)horiz_out8, (v16i8)horiz_out7); 171 hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
204 tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vert0, filt_vert1, 172 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
205 filt_vert2, filt_vert3); 173 out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
206 tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7); 174 tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
207 175 filt_vt2, filt_vt3);
208 horiz_out9 = HORIZ_8TAP_FILT(src9, mask0, mask1, mask2, mask3, filt_horiz0, 176 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
209 filt_horiz1, filt_horiz2, filt_horiz3); 177 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
210 178 vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
211 out8 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8); 179 vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
212 tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vert0, filt_vert1, 180 ST8x4_UB(vec0, vec1, dst, dst_stride);
213 filt_vert2, filt_vert3);
214 tmp2 = SRARI_SATURATE_SIGNED_H(tmp2, FILTER_BITS, 7);
215
216 horiz_out10 = HORIZ_8TAP_FILT(src10, mask0, mask1, mask2, mask3,
217 filt_horiz0, filt_horiz1, filt_horiz2,
218 filt_horiz3);
219
220 out9 = (v8i16)__msa_ilvev_b((v16i8)horiz_out10, (v16i8)horiz_out9);
221 tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vert0, filt_vert1,
222 filt_vert2, filt_vert3);
223 tmp3 = SRARI_SATURATE_SIGNED_H(tmp3, FILTER_BITS, 7);
224
225 PCKEV_B_4_XORI128_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
226 dst += (4 * dst_stride); 181 dst += (4 * dst_stride);
227 182
228 horiz_out6 = horiz_out10; 183 hz_out6 = hz_out10;
229
230 out0 = out2; 184 out0 = out2;
231 out1 = out3; 185 out1 = out3;
232 out2 = out8; 186 out2 = out8;
233 out4 = out6; 187 out4 = out6;
234 out5 = out7; 188 out5 = out7;
235 out6 = out9; 189 out6 = out9;
236 } 190 }
237 } 191 }
238 192
239 static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, 193 static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
272 filter_vert, height); 226 filter_vert, height);
273 src += 8; 227 src += 8;
274 dst += 8; 228 dst += 8;
275 } 229 }
276 } 230 }
277 231
278 static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, 232 static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
279 uint8_t *dst, int32_t dst_stride, 233 uint8_t *dst, int32_t dst_stride,
280 int8_t *filter_horiz, 234 int8_t *filter_horiz,
281 int8_t *filter_vert) { 235 int8_t *filter_vert) {
282 uint32_t out0, out1, out2, out3;
283 v16i8 src0, src1, src2, src3, src4, mask; 236 v16i8 src0, src1, src2, src3, src4, mask;
284 v16u8 res0, res1, horiz_vec; 237 v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
285 v16u8 filt_vert, filt_horiz, vec0, vec1; 238 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
286 v8u16 filt, tmp0, tmp1;
287 v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4;
288 239
289 mask = LOAD_SB(&mc_filt_mask_arr[16]); 240 mask = LD_SB(&mc_filt_mask_arr[16]);
290 241
291 /* rearranging filter */ 242 /* rearranging filter */
292 filt = LOAD_UH(filter_horiz); 243 filt = LD_UH(filter_horiz);
293 filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0); 244 filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
294 245
295 filt = LOAD_UH(filter_vert); 246 filt = LD_UH(filter_vert);
296 filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0); 247 filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
297 248
298 LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4); 249 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
250 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
251 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
252 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
253 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
254 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
299 255
300 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0); 256 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
301 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); 257 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
302 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); 258 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
303 259 SAT_UH2_UH(tmp0, tmp1, 7);
304 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2); 260 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
305 horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz); 261 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
306 horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7);
307
308 horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
309 horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz);
310 horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7);
311
312 horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
313 horiz_out3 = (v8u16)__msa_pckod_d((v2i64)horiz_out4, (v2i64)horiz_out2);
314
315 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
316 vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
317
318 tmp0 = __msa_dotp_u_h(vec0, filt_vert);
319 tmp1 = __msa_dotp_u_h(vec1, filt_vert);
320 tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
321 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
322
323 res0 = (v16u8)__msa_pckev_b((v16i8)tmp0, (v16i8)tmp0);
324 res1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp1);
325
326 out0 = __msa_copy_u_w((v4i32)res0, 0);
327 out1 = __msa_copy_u_w((v4i32)res0, 1);
328 out2 = __msa_copy_u_w((v4i32)res1, 0);
329 out3 = __msa_copy_u_w((v4i32)res1, 1);
330
331 STORE_WORD(dst, out0);
332 dst += dst_stride;
333 STORE_WORD(dst, out1);
334 dst += dst_stride;
335 STORE_WORD(dst, out2);
336 dst += dst_stride;
337 STORE_WORD(dst, out3);
338 } 262 }
339 263
340 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, 264 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
341 uint8_t *dst, int32_t dst_stride, 265 uint8_t *dst, int32_t dst_stride,
342 int8_t *filter_horiz, 266 int8_t *filter_horiz,
343 int8_t *filter_vert) { 267 int8_t *filter_vert) {
344 uint32_t out0, out1, out2, out3;
345 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; 268 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
346 v16u8 filt_horiz, filt_vert, horiz_vec;
347 v16u8 vec0, vec1, vec2, vec3;
348 v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
349 v8u16 vec4, vec5, vec6, vec7, filt;
350 v8u16 horiz_out4, horiz_out5, horiz_out6, horiz_out7, horiz_out8;
351 v16i8 res0, res1, res2, res3; 269 v16i8 res0, res1, res2, res3;
270 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
271 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
272 v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
352 273
353 mask = LOAD_SB(&mc_filt_mask_arr[16]); 274 mask = LD_SB(&mc_filt_mask_arr[16]);
354 275
355 /* rearranging filter */ 276 /* rearranging filter */
356 filt = LOAD_UH(filter_horiz); 277 filt = LD_UH(filter_horiz);
357 filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0); 278 filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
358 279
359 filt = LOAD_UH(filter_vert); 280 filt = LD_UH(filter_vert);
360 filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0); 281 filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
361 282
362 LOAD_8VECS_SB(src, src_stride, 283 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
363 src0, src1, src2, src3, src4, src5, src6, src7);
364 src += (8 * src_stride); 284 src += (8 * src_stride);
365 src8 = LOAD_SB(src); 285 src8 = LD_SB(src);
366 286
367 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0); 287 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
368 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); 288 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
369 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); 289 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
290 hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
291 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
292 SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
293 hz_out3, hz_out5, 8);
294 hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
370 295
371 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2); 296 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
372 horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz); 297 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
373 horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7); 298 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
374 299 vec4, vec5, vec6, vec7);
375 horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src4); 300 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
376 horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz); 301 SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
377 horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7); 302 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
378 303 res2, res3);
379 horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src6); 304 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
380 horiz_out6 = __msa_dotp_u_h(horiz_vec, filt_horiz); 305 dst += (4 * dst_stride);
381 horiz_out6 = SRARI_SATURATE_UNSIGNED_H(horiz_out6, FILTER_BITS, 7); 306 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
382
383 horiz_vec = (v16u8)__msa_vshf_b(mask, src8, src8);
384 horiz_out8 = __msa_dotp_u_h(horiz_vec, filt_horiz);
385 horiz_out8 = SRARI_SATURATE_UNSIGNED_H(horiz_out8, FILTER_BITS, 7);
386
387 horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
388 horiz_out3 = (v8u16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
389 horiz_out5 = (v8u16)__msa_sldi_b((v16i8)horiz_out6, (v16i8)horiz_out4, 8);
390 horiz_out7 = (v8u16)__msa_pckod_d((v2i64)horiz_out8, (v2i64)horiz_out6);
391
392 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
393 vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
394 vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
395 vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
396
397 vec4 = __msa_dotp_u_h(vec0, filt_vert);
398 vec5 = __msa_dotp_u_h(vec1, filt_vert);
399 vec6 = __msa_dotp_u_h(vec2, filt_vert);
400 vec7 = __msa_dotp_u_h(vec3, filt_vert);
401
402 vec4 = SRARI_SATURATE_UNSIGNED_H(vec4, FILTER_BITS, 7);
403 vec5 = SRARI_SATURATE_UNSIGNED_H(vec5, FILTER_BITS, 7);
404 vec6 = SRARI_SATURATE_UNSIGNED_H(vec6, FILTER_BITS, 7);
405 vec7 = SRARI_SATURATE_UNSIGNED_H(vec7, FILTER_BITS, 7);
406
407 res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
408 res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
409 res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
410 res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
411
412 out0 = __msa_copy_u_w((v4i32)res0, 0);
413 out1 = __msa_copy_u_w((v4i32)res0, 1);
414 out2 = __msa_copy_u_w((v4i32)res1, 0);
415 out3 = __msa_copy_u_w((v4i32)res1, 1);
416
417 STORE_WORD(dst, out0);
418 dst += dst_stride;
419 STORE_WORD(dst, out1);
420 dst += dst_stride;
421 STORE_WORD(dst, out2);
422 dst += dst_stride;
423 STORE_WORD(dst, out3);
424 dst += dst_stride;
425
426 out0 = __msa_copy_u_w((v4i32)res2, 0);
427 out1 = __msa_copy_u_w((v4i32)res2, 1);
428 out2 = __msa_copy_u_w((v4i32)res3, 0);
429 out3 = __msa_copy_u_w((v4i32)res3, 1);
430
431 STORE_WORD(dst, out0);
432 dst += dst_stride;
433 STORE_WORD(dst, out1);
434 dst += dst_stride;
435 STORE_WORD(dst, out2);
436 dst += dst_stride;
437 STORE_WORD(dst, out3);
438 } 307 }
439 308
440 static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, 309 static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
441 uint8_t *dst, int32_t dst_stride, 310 uint8_t *dst, int32_t dst_stride,
442 int8_t *filter_horiz, 311 int8_t *filter_horiz, int8_t *filter_vert,
443 int8_t *filter_vert,
444 int32_t height) { 312 int32_t height) {
445 if (4 == height) { 313 if (4 == height) {
446 common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, 314 common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
447 filter_horiz, filter_vert); 315 filter_vert);
448 } else if (8 == height) { 316 } else if (8 == height) {
449 common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, 317 common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
450 filter_horiz, filter_vert); 318 filter_vert);
451 } 319 }
452 } 320 }
453 321
454 static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, 322 static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
455 uint8_t *dst, int32_t dst_stride, 323 uint8_t *dst, int32_t dst_stride,
456 int8_t *filter_horiz, 324 int8_t *filter_horiz,
457 int8_t *filter_vert) { 325 int8_t *filter_vert) {
458 v16i8 src0, src1, src2, src3, src4, mask; 326 v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
459 v16u8 filt_horiz, filt_vert, horiz_vec; 327 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
460 v16u8 vec0, vec1, vec2, vec3; 328 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
461 v8u16 horiz_out0, horiz_out1;
462 v8u16 tmp0, tmp1, tmp2, tmp3;
463 v8i16 filt; 329 v8i16 filt;
464 330
465 mask = LOAD_SB(&mc_filt_mask_arr[0]); 331 mask = LD_SB(&mc_filt_mask_arr[0]);
466 332
467 /* rearranging filter */ 333 /* rearranging filter */
468 filt = LOAD_SH(filter_horiz); 334 filt = LD_SH(filter_horiz);
469 filt_horiz = (v16u8)__msa_splati_h(filt, 0); 335 filt_hz = (v16u8)__msa_splati_h(filt, 0);
470 336
471 filt = LOAD_SH(filter_vert); 337 filt = LD_SH(filter_vert);
472 filt_vert = (v16u8)__msa_splati_h(filt, 0); 338 filt_vt = (v16u8)__msa_splati_h(filt, 0);
473 339
474 LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4); 340 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
475 src += (5 * src_stride);
476 341
477 horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0); 342 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
478 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); 343 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
479 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); 344 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
345 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
480 346
481 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); 347 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
482 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); 348 vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
483 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); 349 tmp1 = __msa_dotp_u_h(vec1, filt_vt);
484 350
485 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); 351 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
486 tmp0 = __msa_dotp_u_h(vec0, filt_vert); 352 vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
353 tmp2 = __msa_dotp_u_h(vec2, filt_vt);
487 354
488 horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); 355 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
489 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); 356 vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
490 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); 357 tmp3 = __msa_dotp_u_h(vec3, filt_vt);
491 358
492 vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); 359 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
493 tmp1 = __msa_dotp_u_h(vec1, filt_vert); 360 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
494 361 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
495 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); 362 ST8x4_UB(out0, out1, dst, dst_stride);
496 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
497 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
498
499 vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
500 tmp2 = __msa_dotp_u_h(vec2, filt_vert);
501
502 horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
503 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
504 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
505
506 vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
507 tmp3 = __msa_dotp_u_h(vec3, filt_vert);
508
509 tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
510 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
511 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
512 tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
513
514 PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
515 } 363 }
516 364
517 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, 365 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
518 int32_t src_stride, 366 int32_t src_stride,
519 uint8_t *dst, 367 uint8_t *dst,
520 int32_t dst_stride, 368 int32_t dst_stride,
521 int8_t *filter_horiz, 369 int8_t *filter_horiz,
522 int8_t *filter_vert, 370 int8_t *filter_vert,
523 int32_t height) { 371 int32_t height) {
524 uint32_t loop_cnt; 372 uint32_t loop_cnt;
525 v16i8 src0, src1, src2, src3, src4, mask; 373 v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
526 v16u8 filt_horiz, filt_vert, vec0, horiz_vec; 374 v16u8 filt_hz, filt_vt, vec0;
527 v8u16 horiz_out0, horiz_out1; 375 v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
528 v8u16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
529 v8i16 filt; 376 v8i16 filt;
530 377
531 mask = LOAD_SB(&mc_filt_mask_arr[0]); 378 mask = LD_SB(&mc_filt_mask_arr[0]);
532 379
533 /* rearranging filter */ 380 /* rearranging filter */
534 filt = LOAD_SH(filter_horiz); 381 filt = LD_SH(filter_horiz);
535 filt_horiz = (v16u8)__msa_splati_h(filt, 0); 382 filt_hz = (v16u8)__msa_splati_h(filt, 0);
536 383
537 filt = LOAD_SH(filter_vert); 384 filt = LD_SH(filter_vert);
538 filt_vert = (v16u8)__msa_splati_h(filt, 0); 385 filt_vt = (v16u8)__msa_splati_h(filt, 0);
539 386
540 src0 = LOAD_SB(src); 387 src0 = LD_SB(src);
541 src += src_stride; 388 src += src_stride;
542 389
543 horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0); 390 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
544 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
545 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
546 391
547 for (loop_cnt = (height >> 3); loop_cnt--;) { 392 for (loop_cnt = (height >> 3); loop_cnt--;) {
548 LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4); 393 LD_SB4(src, src_stride, src1, src2, src3, src4);
549 src += (4 * src_stride); 394 src += (4 * src_stride);
550 395
551 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); 396 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
552 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); 397 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
553 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); 398 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
554 399
555 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); 400 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
556 tmp1 = __msa_dotp_u_h(vec0, filt_vert); 401 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
402 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
557 403
558 horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); 404 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
559 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); 405 SAT_UH2_UH(tmp1, tmp2, 7);
560 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
561 406
562 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); 407 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
563 tmp2 = (v8u16)__msa_dotp_u_h(vec0, filt_vert); 408 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
409 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
564 410
565 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); 411 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
566 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); 412 LD_SB4(src, src_stride, src1, src2, src3, src4);
413 src += (4 * src_stride);
414 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
415 tmp4 = __msa_dotp_u_h(vec0, filt_vt);
567 416
568 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); 417 SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
569 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); 418 SAT_UH2_UH(tmp3, tmp4, 7);
570 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); 419 PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
571 420 ST8x4_UB(out0, out1, dst, dst_stride);
572 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
573 tmp3 = __msa_dotp_u_h(vec0, filt_vert);
574
575 horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
576 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
577 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
578
579 LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4);
580 src += (4 * src_stride);
581
582 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
583 tmp4 = __msa_dotp_u_h(vec0, filt_vert);
584
585 tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
586 tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
587
588 PCKEV_B_STORE_8_BYTES_4(tmp1, tmp2, tmp3, tmp4, dst, dst_stride);
589 dst += (4 * dst_stride); 421 dst += (4 * dst_stride);
590 422
591 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); 423 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
592 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); 424 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
593 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); 425 tmp5 = __msa_dotp_u_h(vec0, filt_vt);
594 426
595 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); 427 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
596 tmp5 = __msa_dotp_u_h(vec0, filt_vert); 428 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
429 tmp6 = __msa_dotp_u_h(vec0, filt_vt);
597 430
598 horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); 431 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
599 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); 432 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
600 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); 433 tmp7 = __msa_dotp_u_h(vec0, filt_vt);
601 434
602 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); 435 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
603 tmp6 = __msa_dotp_u_h(vec0, filt_vert); 436 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
437 tmp8 = __msa_dotp_u_h(vec0, filt_vt);
604 438
605 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); 439 SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
606 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); 440 SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
607 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); 441 PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
608 442 ST8x4_UB(out0, out1, dst, dst_stride);
609 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
610 tmp7 = __msa_dotp_u_h(vec0, filt_vert);
611
612 horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
613 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
614 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
615
616 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
617 tmp8 = __msa_dotp_u_h(vec0, filt_vert);
618
619 tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
620 tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
621 tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
622 tmp8 = SRARI_SATURATE_UNSIGNED_H(tmp8, FILTER_BITS, 7);
623
624 PCKEV_B_STORE_8_BYTES_4(tmp5, tmp6, tmp7, tmp8, dst, dst_stride);
625 dst += (4 * dst_stride); 443 dst += (4 * dst_stride);
626 } 444 }
627 } 445 }
628 446
629 static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, 447 static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
630 uint8_t *dst, int32_t dst_stride, 448 uint8_t *dst, int32_t dst_stride,
631 int8_t *filter_horiz, int8_t *filter_vert, 449 int8_t *filter_horiz, int8_t *filter_vert,
632 int32_t height) { 450 int32_t height) {
633 if (4 == height) { 451 if (4 == height) {
634 common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, 452 common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
635 filter_vert); 453 filter_vert);
636 } else { 454 } else {
637 common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, 455 common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
638 filter_horiz, filter_vert, height); 456 filter_horiz, filter_vert, height);
639 } 457 }
640 } 458 }
641 459
642 static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride, 460 static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
643 uint8_t *dst, int32_t dst_stride, 461 uint8_t *dst, int32_t dst_stride,
644 int8_t *filter_horiz, int8_t *filter_vert, 462 int8_t *filter_horiz, int8_t *filter_vert,
645 int32_t height) { 463 int32_t height) {
646 uint32_t loop_cnt; 464 uint32_t loop_cnt;
647 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 465 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
648 v16u8 filt_horiz, filt_vert, vec0, horiz_vec; 466 v16u8 filt_hz, filt_vt, vec0, vec1;
649 v8u16 horiz_vec0, horiz_vec1, tmp1, tmp2; 467 v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
650 v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
651 v8i16 filt; 468 v8i16 filt;
652 469
653 mask = LOAD_SB(&mc_filt_mask_arr[0]); 470 mask = LD_SB(&mc_filt_mask_arr[0]);
654 471
655 /* rearranging filter */ 472 /* rearranging filter */
656 filt = LOAD_SH(filter_horiz); 473 filt = LD_SH(filter_horiz);
657 filt_horiz = (v16u8)__msa_splati_h(filt, 0); 474 filt_hz = (v16u8)__msa_splati_h(filt, 0);
658 475
659 filt = LOAD_SH(filter_vert); 476 filt = LD_SH(filter_vert);
660 filt_vert = (v16u8)__msa_splati_h(filt, 0); 477 filt_vt = (v16u8)__msa_splati_h(filt, 0);
661 478
662 src0 = LOAD_SB(src); 479 LD_SB2(src, 8, src0, src1);
663 src1 = LOAD_SB(src + 8);
664
665 horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
666 horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
667 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
668
669 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
670 horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
671 horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
672
673 src += src_stride; 480 src += src_stride;
674 481
482 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
483 hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
484
675 for (loop_cnt = (height >> 2); loop_cnt--;) { 485 for (loop_cnt = (height >> 2); loop_cnt--;) {
676 LOAD_4VECS_SB(src, src_stride, src0, src2, src4, src6); 486 LD_SB4(src, src_stride, src0, src2, src4, src6);
677 LOAD_4VECS_SB(src + 8, src_stride, src1, src3, src5, src7); 487 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
678 src += (4 * src_stride); 488 src += (4 * src_stride);
679 489
680 horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0); 490 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
681 horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); 491 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
682 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); 492 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
683 493 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
684 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); 494 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
685 horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); 495 SAT_UH2_UH(tmp1, tmp2, 7);
686 horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); 496 PCKEV_ST_SB(tmp1, tmp2, dst);
687
688 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
689 tmp1 = __msa_dotp_u_h(vec0, filt_vert);
690 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
691 tmp2 = __msa_dotp_u_h(vec0, filt_vert);
692 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
693 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
694
695 PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
696 dst += dst_stride; 497 dst += dst_stride;
697 498
698 horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); 499 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
699 horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); 500 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
700 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); 501 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
701 502 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
702 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); 503 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
703 horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); 504 SAT_UH2_UH(tmp1, tmp2, 7);
704 horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); 505 PCKEV_ST_SB(tmp1, tmp2, dst);
705
706 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
707 tmp1 = __msa_dotp_u_h(vec0, filt_vert);
708 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
709 tmp2 = __msa_dotp_u_h(vec0, filt_vert);
710 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
711 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
712
713 PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
714 dst += dst_stride; 506 dst += dst_stride;
715 507
716 horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4); 508 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
717 horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); 509 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
718 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); 510 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
719 511 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
720 horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src5); 512 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
721 horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); 513 SAT_UH2_UH(tmp1, tmp2, 7);
722 horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); 514 PCKEV_ST_SB(tmp1, tmp2, dst);
723
724 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
725 tmp1 = __msa_dotp_u_h(vec0, filt_vert);
726 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
727 tmp2 = __msa_dotp_u_h(vec0, filt_vert);
728 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
729 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
730
731 PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
732 dst += dst_stride; 515 dst += dst_stride;
733 516
734 horiz_vec = (v16u8)__msa_vshf_b(mask, src6, src6); 517 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
735 horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); 518 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
736 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); 519 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
737 520 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
738 horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src7); 521 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
739 horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); 522 SAT_UH2_UH(tmp1, tmp2, 7);
740 horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); 523 PCKEV_ST_SB(tmp1, tmp2, dst);
741
742 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
743 tmp1 = __msa_dotp_u_h(vec0, filt_vert);
744 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
745 tmp2 = __msa_dotp_u_h(vec0, filt_vert);
746 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
747 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
748
749 PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
750 dst += dst_stride; 524 dst += dst_stride;
751 } 525 }
752 } 526 }
753 527
754 static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, 528 static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
755 uint8_t *dst, int32_t dst_stride, 529 uint8_t *dst, int32_t dst_stride,
756 int8_t *filter_horiz, int8_t *filter_vert, 530 int8_t *filter_horiz, int8_t *filter_vert,
757 int32_t height) { 531 int32_t height) {
758 int32_t multiple8_cnt; 532 int32_t multiple8_cnt;
759 for (multiple8_cnt = 2; multiple8_cnt--;) { 533 for (multiple8_cnt = 2; multiple8_cnt--;) {
(...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after
871 filt_hor, filt_ver, (int32_t)h); 645 filt_hor, filt_ver, (int32_t)h);
872 break; 646 break;
873 default: 647 default:
874 vp9_convolve8_c(src, src_stride, dst, dst_stride, 648 vp9_convolve8_c(src, src_stride, dst, dst_stride,
875 filter_x, x_step_q4, filter_y, y_step_q4, 649 filter_x, x_step_q4, filter_y, y_step_q4,
876 w, h); 650 w, h);
877 break; 651 break;
878 } 652 }
879 } 653 }
880 } 654 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698