Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(79)

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c

Issue 1169543007: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include "./vp9_rtcd.h" 11 #include "./vp9_rtcd.h"
12 #include "vp9/common/mips/msa/vp9_convolve_msa.h" 12 #include "vp9/common/mips/msa/vp9_convolve_msa.h"
13 13
14 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, 14 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
15 uint8_t *dst, int32_t dst_stride, 15 uint8_t *dst, int32_t dst_stride,
16 int8_t *filter) { 16 int8_t *filter) {
17 v16i8 filt0, filt1, filt2, filt3; 17 v16u8 mask0, mask1, mask2, mask3, out;
18 v16i8 src0, src1, src2, src3; 18 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
19 v16u8 mask0, mask1, mask2, mask3;
20 v8i16 filt, out0, out1; 19 v8i16 filt, out0, out1;
21 20
22 mask0 = LOAD_UB(&mc_filt_mask_arr[16]); 21 mask0 = LD_UB(&mc_filt_mask_arr[16]);
23
24 src -= 3; 22 src -= 3;
25 23
26 /* rearranging filter */ 24 /* rearranging filter */
27 filt = LOAD_SH(filter); 25 filt = LD_SH(filter);
28 filt0 = (v16i8)__msa_splati_h(filt, 0); 26 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
29 filt1 = (v16i8)__msa_splati_h(filt, 1);
30 filt2 = (v16i8)__msa_splati_h(filt, 2);
31 filt3 = (v16i8)__msa_splati_h(filt, 3);
32 27
33 mask1 = mask0 + 2; 28 mask1 = mask0 + 2;
34 mask2 = mask0 + 4; 29 mask2 = mask0 + 4;
35 mask3 = mask0 + 6; 30 mask3 = mask0 + 6;
36 31
37 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); 32 LD_SB4(src, src_stride, src0, src1, src2, src3);
38 33 XORI_B4_128_SB(src0, src1, src2, src3);
39 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
40
41 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, 34 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
42 filt0, filt1, filt2, filt3, out0, out1); 35 filt0, filt1, filt2, filt3, out0, out1);
43 36 SRARI_H2_SH(out0, out1, FILTER_BITS);
44 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); 37 SAT_SH2_SH(out0, out1, 7);
45 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); 38 out = PCKEV_XORI128_UB(out0, out1);
46 39 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
47 PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
48 } 40 }
49 41
50 static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, 42 static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
51 uint8_t *dst, int32_t dst_stride, 43 uint8_t *dst, int32_t dst_stride,
52 int8_t *filter) { 44 int8_t *filter) {
53 v16i8 filt0, filt1, filt2, filt3; 45 v16i8 filt0, filt1, filt2, filt3;
54 v16i8 src0, src1, src2, src3; 46 v16i8 src0, src1, src2, src3;
55 v16u8 mask0, mask1, mask2, mask3; 47 v16u8 mask0, mask1, mask2, mask3, out;
56 v8i16 filt, out0, out1, out2, out3; 48 v8i16 filt, out0, out1, out2, out3;
57 49
58 mask0 = LOAD_UB(&mc_filt_mask_arr[16]); 50 mask0 = LD_UB(&mc_filt_mask_arr[16]);
59
60 src -= 3; 51 src -= 3;
61 52
62 /* rearranging filter */ 53 /* rearranging filter */
63 filt = LOAD_SH(filter); 54 filt = LD_SH(filter);
64 filt0 = (v16i8)__msa_splati_h(filt, 0); 55 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
65 filt1 = (v16i8)__msa_splati_h(filt, 1);
66 filt2 = (v16i8)__msa_splati_h(filt, 2);
67 filt3 = (v16i8)__msa_splati_h(filt, 3);
68 56
69 mask1 = mask0 + 2; 57 mask1 = mask0 + 2;
70 mask2 = mask0 + 4; 58 mask2 = mask0 + 4;
71 mask3 = mask0 + 6; 59 mask3 = mask0 + 6;
72 60
73 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); 61 LD_SB4(src, src_stride, src0, src1, src2, src3);
62 XORI_B4_128_SB(src0, src1, src2, src3);
74 src += (4 * src_stride); 63 src += (4 * src_stride);
75
76 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
77
78 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, 64 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
79 filt0, filt1, filt2, filt3, out0, out1); 65 filt0, filt1, filt2, filt3, out0, out1);
80 66 LD_SB4(src, src_stride, src0, src1, src2, src3);
81 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); 67 XORI_B4_128_SB(src0, src1, src2, src3);
82
83 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
84
85 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, 68 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
86 filt0, filt1, filt2, filt3, out2, out3); 69 filt0, filt1, filt2, filt3, out2, out3);
87 70 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
88 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); 71 SAT_SH4_SH(out0, out1, out2, out3, 7);
89 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); 72 out = PCKEV_XORI128_UB(out0, out1);
90 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); 73 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
91 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
92
93 PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
94 dst += (4 * dst_stride); 74 dst += (4 * dst_stride);
95 PCKEV_2B_XORI128_STORE_4_BYTES_4(out2, out3, dst, dst_stride); 75 out = PCKEV_XORI128_UB(out2, out3);
76 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
96 } 77 }
97 78
/* Dispatch 4-wide 8-tap horizontal filtering by block height (4 or 8 only;
 * other heights are silently ignored — callers pass only these sizes). */
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}
107 88
108 static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, 89 static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
109 uint8_t *dst, int32_t dst_stride, 90 uint8_t *dst, int32_t dst_stride,
110 int8_t *filter) { 91 int8_t *filter) {
111 v16i8 filt0, filt1, filt2, filt3; 92 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
112 v16i8 src0, src1, src2, src3; 93 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
113 v16u8 mask0, mask1, mask2, mask3;
114 v8i16 filt, out0, out1, out2, out3; 94 v8i16 filt, out0, out1, out2, out3;
115 95
116 mask0 = LOAD_UB(&mc_filt_mask_arr[0]); 96 mask0 = LD_UB(&mc_filt_mask_arr[0]);
117
118 src -= 3; 97 src -= 3;
119 98
120 /* rearranging filter */ 99 /* rearranging filter */
121 filt = LOAD_SH(filter); 100 filt = LD_SH(filter);
122 filt0 = (v16i8)__msa_splati_h(filt, 0); 101 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
123 filt1 = (v16i8)__msa_splati_h(filt, 1);
124 filt2 = (v16i8)__msa_splati_h(filt, 2);
125 filt3 = (v16i8)__msa_splati_h(filt, 3);
126 102
127 mask1 = mask0 + 2; 103 mask1 = mask0 + 2;
128 mask2 = mask0 + 4; 104 mask2 = mask0 + 4;
129 mask3 = mask0 + 6; 105 mask3 = mask0 + 6;
130 106
131 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); 107 LD_SB4(src, src_stride, src0, src1, src2, src3);
132 108 XORI_B4_128_SB(src0, src1, src2, src3);
133 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
134
135 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, 109 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
136 filt0, filt1, filt2, filt3, out0, out1, out2, 110 filt0, filt1, filt2, filt3, out0, out1, out2,
137 out3); 111 out3);
138 112 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
139 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); 113 SAT_SH4_SH(out0, out1, out2, out3, 7);
140 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); 114 tmp0 = PCKEV_XORI128_UB(out0, out1);
141 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); 115 tmp1 = PCKEV_XORI128_UB(out2, out3);
142 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); 116 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
143
144 PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
145 } 117 }
146 118
147 static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, 119 static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
148 uint8_t *dst, int32_t dst_stride, 120 uint8_t *dst, int32_t dst_stride,
149 int8_t *filter, int32_t height) { 121 int8_t *filter, int32_t height) {
150 uint32_t loop_cnt; 122 uint32_t loop_cnt;
151 v16i8 filt0, filt1, filt2, filt3; 123 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
152 v16i8 src0, src1, src2, src3; 124 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
153 v16u8 mask0, mask1, mask2, mask3;
154 v8i16 filt, out0, out1, out2, out3; 125 v8i16 filt, out0, out1, out2, out3;
155 126
156 mask0 = LOAD_UB(&mc_filt_mask_arr[0]); 127 mask0 = LD_UB(&mc_filt_mask_arr[0]);
157
158 src -= 3; 128 src -= 3;
159 129
160 /* rearranging filter */ 130 /* rearranging filter */
161 filt = LOAD_SH(filter); 131 filt = LD_SH(filter);
162 filt0 = (v16i8)__msa_splati_h(filt, 0); 132 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
163 filt1 = (v16i8)__msa_splati_h(filt, 1);
164 filt2 = (v16i8)__msa_splati_h(filt, 2);
165 filt3 = (v16i8)__msa_splati_h(filt, 3);
166 133
167 mask1 = mask0 + 2; 134 mask1 = mask0 + 2;
168 mask2 = mask0 + 4; 135 mask2 = mask0 + 4;
169 mask3 = mask0 + 6; 136 mask3 = mask0 + 6;
170 137
171 for (loop_cnt = (height >> 2); loop_cnt--;) { 138 for (loop_cnt = (height >> 2); loop_cnt--;) {
172 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); 139 LD_SB4(src, src_stride, src0, src1, src2, src3);
140 XORI_B4_128_SB(src0, src1, src2, src3);
173 src += (4 * src_stride); 141 src += (4 * src_stride);
174
175 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
176
177 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 142 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
178 mask3, filt0, filt1, filt2, filt3, out0, out1, 143 mask3, filt0, filt1, filt2, filt3, out0, out1,
179 out2, out3); 144 out2, out3);
180 145 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
181 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); 146 SAT_SH4_SH(out0, out1, out2, out3, 7);
182 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); 147 tmp0 = PCKEV_XORI128_UB(out0, out1);
183 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); 148 tmp1 = PCKEV_XORI128_UB(out2, out3);
184 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); 149 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
185
186 PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
187 dst += (4 * dst_stride); 150 dst += (4 * dst_stride);
188 } 151 }
189 } 152 }
190 153
/* Dispatch 8-wide 8-tap horizontal filtering: dedicated 8x4 path, or the
 * generic multiple-of-4-rows loop for all other heights. */
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}
200 163
201 static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, 164 static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
202 uint8_t *dst, int32_t dst_stride, 165 uint8_t *dst, int32_t dst_stride,
203 int8_t *filter, int32_t height) { 166 int8_t *filter, int32_t height) {
204 uint32_t loop_cnt; 167 uint32_t loop_cnt;
205 v16i8 src0, src1, src2, src3; 168 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
206 v16i8 filt0, filt1, filt2, filt3; 169 v16u8 mask0, mask1, mask2, mask3, out;
207 v16u8 mask0, mask1, mask2, mask3;
208 v8i16 filt, out0, out1, out2, out3; 170 v8i16 filt, out0, out1, out2, out3;
209 171
210 mask0 = LOAD_UB(&mc_filt_mask_arr[0]); 172 mask0 = LD_UB(&mc_filt_mask_arr[0]);
211
212 src -= 3; 173 src -= 3;
213 174
214 /* rearranging filter */ 175 /* rearranging filter */
215 filt = LOAD_SH(filter); 176 filt = LD_SH(filter);
216 filt0 = (v16i8)__msa_splati_h(filt, 0); 177 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
217 filt1 = (v16i8)__msa_splati_h(filt, 1);
218 filt2 = (v16i8)__msa_splati_h(filt, 2);
219 filt3 = (v16i8)__msa_splati_h(filt, 3);
220 178
221 mask1 = mask0 + 2; 179 mask1 = mask0 + 2;
222 mask2 = mask0 + 4; 180 mask2 = mask0 + 4;
223 mask3 = mask0 + 6; 181 mask3 = mask0 + 6;
224 182
225 for (loop_cnt = (height >> 1); loop_cnt--;) { 183 for (loop_cnt = (height >> 1); loop_cnt--;) {
226 src0 = LOAD_SB(src); 184 LD_SB2(src, src_stride, src0, src2);
227 src1 = LOAD_SB(src + 8); 185 LD_SB2(src + 8, src_stride, src1, src3);
228 src += src_stride; 186 XORI_B4_128_SB(src0, src1, src2, src3);
229 src2 = LOAD_SB(src); 187 src += (2 * src_stride);
230 src3 = LOAD_SB(src + 8);
231 src += src_stride;
232
233 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
234
235 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 188 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
236 mask3, filt0, filt1, filt2, filt3, out0, out1, 189 mask3, filt0, filt1, filt2, filt3, out0, out1,
237 out2, out3); 190 out2, out3);
238 191 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
239 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); 192 SAT_SH4_SH(out0, out1, out2, out3, 7);
240 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); 193 out = PCKEV_XORI128_UB(out0, out1);
241 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); 194 ST_UB(out, dst);
242 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
243
244 PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
245 dst += dst_stride; 195 dst += dst_stride;
246 PCKEV_B_XORI128_STORE_VEC(out3, out2, dst); 196 out = PCKEV_XORI128_UB(out2, out3);
197 ST_UB(out, dst);
247 dst += dst_stride; 198 dst += dst_stride;
248 } 199 }
249 } 200 }
250 201
251 static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, 202 static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
252 uint8_t *dst, int32_t dst_stride, 203 uint8_t *dst, int32_t dst_stride,
253 int8_t *filter, int32_t height) { 204 int8_t *filter, int32_t height) {
254 uint32_t loop_cnt; 205 uint32_t loop_cnt;
255 v16i8 src0, src1, src2, src3; 206 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
256 v16i8 filt0, filt1, filt2, filt3; 207 v16u8 mask0, mask1, mask2, mask3, out;
257 v16u8 mask0, mask1, mask2, mask3;
258 v8i16 filt, out0, out1, out2, out3; 208 v8i16 filt, out0, out1, out2, out3;
259 209
260 mask0 = LOAD_UB(&mc_filt_mask_arr[0]); 210 mask0 = LD_UB(&mc_filt_mask_arr[0]);
261
262 src -= 3; 211 src -= 3;
263 212
264 /* rearranging filter */ 213 /* rearranging filter */
265 filt = LOAD_SH(filter); 214 filt = LD_SH(filter);
266 filt0 = (v16i8)__msa_splati_h(filt, 0); 215 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
267 filt1 = (v16i8)__msa_splati_h(filt, 1);
268 filt2 = (v16i8)__msa_splati_h(filt, 2);
269 filt3 = (v16i8)__msa_splati_h(filt, 3);
270 216
271 mask1 = mask0 + 2; 217 mask1 = mask0 + 2;
272 mask2 = mask0 + 4; 218 mask2 = mask0 + 4;
273 mask3 = mask0 + 6; 219 mask3 = mask0 + 6;
274 220
275 for (loop_cnt = (height >> 1); loop_cnt--;) { 221 for (loop_cnt = (height >> 1); loop_cnt--;) {
276 src0 = LOAD_SB(src); 222 src0 = LD_SB(src);
277 src2 = LOAD_SB(src + 16); 223 src2 = LD_SB(src + 16);
278 src3 = LOAD_SB(src + 24); 224 src3 = LD_SB(src + 24);
279 src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8); 225 src1 = __msa_sldi_b(src2, src0, 8);
280 src += src_stride; 226 src += src_stride;
281 227 XORI_B4_128_SB(src0, src1, src2, src3);
282 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
283
284 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 228 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
285 mask3, filt0, filt1, filt2, filt3, out0, out1, 229 mask3, filt0, filt1, filt2, filt3, out0, out1,
286 out2, out3); 230 out2, out3);
231 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
232 SAT_SH4_SH(out0, out1, out2, out3, 7);
287 233
288 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); 234 src0 = LD_SB(src);
289 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); 235 src2 = LD_SB(src + 16);
290 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); 236 src3 = LD_SB(src + 24);
291 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); 237 src1 = __msa_sldi_b(src2, src0, 8);
238 src += src_stride;
292 239
293 src0 = LOAD_SB(src); 240 out = PCKEV_XORI128_UB(out0, out1);
294 src2 = LOAD_SB(src + 16); 241 ST_UB(out, dst);
295 src3 = LOAD_SB(src + 24); 242 out = PCKEV_XORI128_UB(out2, out3);
296 src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8); 243 ST_UB(out, dst + 16);
297
298 PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
299 PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
300 dst += dst_stride; 244 dst += dst_stride;
301 245
302 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); 246 XORI_B4_128_SB(src0, src1, src2, src3);
303
304 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 247 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
305 mask3, filt0, filt1, filt2, filt3, out0, out1, 248 mask3, filt0, filt1, filt2, filt3, out0, out1,
306 out2, out3); 249 out2, out3);
307 250 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
308 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); 251 SAT_SH4_SH(out0, out1, out2, out3, 7);
309 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); 252 out = PCKEV_XORI128_UB(out0, out1);
310 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); 253 ST_UB(out, dst);
311 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); 254 out = PCKEV_XORI128_UB(out2, out3);
312 255 ST_UB(out, dst + 16);
313 PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
314 PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
315
316 src += src_stride;
317 dst += dst_stride; 256 dst += dst_stride;
318 } 257 }
319 } 258 }
320 259
321 static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, 260 static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
322 uint8_t *dst, int32_t dst_stride, 261 uint8_t *dst, int32_t dst_stride,
323 int8_t *filter, int32_t height) { 262 int8_t *filter, int32_t height) {
324 uint32_t loop_cnt, cnt; 263 int32_t loop_cnt;
325 v16i8 src0, src1, src2, src3; 264 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
326 v16i8 filt0, filt1, filt2, filt3; 265 v16u8 mask0, mask1, mask2, mask3, out;
327 v16u8 mask0, mask1, mask2, mask3;
328 v8i16 filt, out0, out1, out2, out3; 266 v8i16 filt, out0, out1, out2, out3;
329 267
330 mask0 = LOAD_UB(&mc_filt_mask_arr[0]); 268 mask0 = LD_UB(&mc_filt_mask_arr[0]);
331
332 src -= 3; 269 src -= 3;
333 270
334 /* rearranging filter */ 271 /* rearranging filter */
335 filt = LOAD_SH(filter); 272 filt = LD_SH(filter);
336 filt0 = (v16i8)__msa_splati_h(filt, 0); 273 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
337 filt1 = (v16i8)__msa_splati_h(filt, 1);
338 filt2 = (v16i8)__msa_splati_h(filt, 2);
339 filt3 = (v16i8)__msa_splati_h(filt, 3);
340 274
341 mask1 = mask0 + 2; 275 mask1 = mask0 + 2;
342 mask2 = mask0 + 4; 276 mask2 = mask0 + 4;
343 mask3 = mask0 + 6; 277 mask3 = mask0 + 6;
344 278
345 for (loop_cnt = height; loop_cnt--;) { 279 for (loop_cnt = height; loop_cnt--;) {
346 for (cnt = 0; cnt < 2; ++cnt) { 280 src0 = LD_SB(src);
347 src0 = LOAD_SB(&src[cnt << 5]); 281 src2 = LD_SB(src + 16);
348 src2 = LOAD_SB(&src[16 + (cnt << 5)]); 282 src3 = LD_SB(src + 24);
349 src3 = LOAD_SB(&src[24 + (cnt << 5)]); 283 src1 = __msa_sldi_b(src2, src0, 8);
350 src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
351 284
352 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); 285 XORI_B4_128_SB(src0, src1, src2, src3);
286 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
287 mask3, filt0, filt1, filt2, filt3, out0, out1,
288 out2, out3);
289 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
290 SAT_SH4_SH(out0, out1, out2, out3, 7);
291 out = PCKEV_XORI128_UB(out0, out1);
292 ST_UB(out, dst);
293 out = PCKEV_XORI128_UB(out2, out3);
294 ST_UB(out, dst + 16);
353 295
354 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 296 src0 = LD_SB(src + 32);
355 mask3, filt0, filt1, filt2, filt3, out0, out1, 297 src2 = LD_SB(src + 48);
356 out2, out3); 298 src3 = LD_SB(src + 56);
299 src1 = __msa_sldi_b(src2, src0, 8);
300 src += src_stride;
357 301
358 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); 302 XORI_B4_128_SB(src0, src1, src2, src3);
359 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); 303 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
360 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); 304 mask3, filt0, filt1, filt2, filt3, out0, out1,
361 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); 305 out2, out3);
362 306 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
363 PCKEV_B_XORI128_STORE_VEC(out1, out0, &dst[cnt << 5]); 307 SAT_SH4_SH(out0, out1, out2, out3, 7);
364 PCKEV_B_XORI128_STORE_VEC(out3, out2, &dst[16 + (cnt << 5)]); 308 out = PCKEV_XORI128_UB(out0, out1);
365 } 309 ST_UB(out, dst + 32);
366 310 out = PCKEV_XORI128_UB(out2, out3);
367 src += src_stride; 311 ST_UB(out, dst + 48);
368 dst += dst_stride; 312 dst += dst_stride;
369 } 313 }
370 } 314 }
371 315
372 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, 316 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
373 uint8_t *dst, int32_t dst_stride, 317 uint8_t *dst, int32_t dst_stride,
374 int8_t *filter) { 318 int8_t *filter) {
375 uint32_t out0, out1, out2, out3;
376 v16i8 src0, src1, src2, src3, mask; 319 v16i8 src0, src1, src2, src3, mask;
377 v16u8 vec0, vec1, filt0; 320 v16u8 filt0, vec0, vec1, res0, res1;
378 v16i8 res0, res1;
379 v8u16 vec2, vec3, filt, const255; 321 v8u16 vec2, vec3, filt, const255;
380 322
381 mask = LOAD_SB(&mc_filt_mask_arr[16]); 323 mask = LD_SB(&mc_filt_mask_arr[16]);
382 324
383 /* rearranging filter */ 325 /* rearranging filter */
384 filt = LOAD_UH(filter); 326 filt = LD_UH(filter);
385 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 327 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
386 328
387 const255 = (v8u16)__msa_ldi_h(255); 329 const255 = (v8u16) __msa_ldi_h(255);
388 330
389 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); 331 LD_SB4(src, src_stride, src0, src1, src2, src3);
390 332 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
391 vec0 = (v16u8)__msa_vshf_b(mask, src1, src0); 333 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
392 vec1 = (v16u8)__msa_vshf_b(mask, src3, src2); 334 SRARI_H2_UH(vec2, vec3, FILTER_BITS);
393 335 MIN_UH2_UH(vec2, vec3, const255);
394 vec2 = __msa_dotp_u_h(vec0, filt0); 336 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
395 vec3 = __msa_dotp_u_h(vec1, filt0); 337 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
396
397 vec2 = (v8u16)__msa_srari_h((v8i16)vec2, FILTER_BITS);
398 vec3 = (v8u16)__msa_srari_h((v8i16)vec3, FILTER_BITS);
399
400 vec2 = __msa_min_u_h(vec2, const255);
401 vec3 = __msa_min_u_h(vec3, const255);
402
403 res0 = __msa_pckev_b((v16i8)vec2, (v16i8)vec2);
404 res1 = __msa_pckev_b((v16i8)vec3, (v16i8)vec3);
405
406 out0 = __msa_copy_u_w((v4i32)res0, 0);
407 out1 = __msa_copy_u_w((v4i32)res0, 1);
408 out2 = __msa_copy_u_w((v4i32)res1, 0);
409 out3 = __msa_copy_u_w((v4i32)res1, 1);
410
411 STORE_WORD(dst, out0);
412 dst += dst_stride;
413 STORE_WORD(dst, out1);
414 dst += dst_stride;
415 STORE_WORD(dst, out2);
416 dst += dst_stride;
417 STORE_WORD(dst, out3);
418 } 338 }
419 339
420 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, 340 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
421 uint8_t *dst, int32_t dst_stride, 341 uint8_t *dst, int32_t dst_stride,
422 int8_t *filter) { 342 int8_t *filter) {
423 uint32_t out0, out1, out2, out3; 343 v16u8 vec0, vec1, vec2, vec3, filt0;
424 v16u8 filt0;
425 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 344 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
426 v16u8 vec0, vec1, vec2, vec3;
427 v8u16 vec4, vec5, vec6, vec7;
428 v16i8 res0, res1, res2, res3; 345 v16i8 res0, res1, res2, res3;
429 v8u16 filt, const255; 346 v8u16 vec4, vec5, vec6, vec7, filt, const255;
430 347
431 mask = LOAD_SB(&mc_filt_mask_arr[16]); 348 mask = LD_SB(&mc_filt_mask_arr[16]);
432 349
433 /* rearranging filter */ 350 /* rearranging filter */
434 filt = LOAD_UH(filter); 351 filt = LD_UH(filter);
435 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 352 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
436 353
437 const255 = (v8u16)__msa_ldi_h(255); 354 const255 = (v8u16) __msa_ldi_h(255);
438 355
439 LOAD_8VECS_SB(src, src_stride, 356 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
440 src0, src1, src2, src3, src4, src5, src6, src7); 357 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
441 358 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
442 vec0 = (v16u8)__msa_vshf_b(mask, src1, src0); 359 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
443 vec1 = (v16u8)__msa_vshf_b(mask, src3, src2); 360 vec6, vec7);
444 vec2 = (v16u8)__msa_vshf_b(mask, src5, src4); 361 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
445 vec3 = (v16u8)__msa_vshf_b(mask, src7, src6); 362 MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
446 363 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
447 vec4 = __msa_dotp_u_h(vec0, filt0); 364 res2, res3);
448 vec5 = __msa_dotp_u_h(vec1, filt0); 365 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
449 vec6 = __msa_dotp_u_h(vec2, filt0); 366 dst += (4 * dst_stride);
450 vec7 = __msa_dotp_u_h(vec3, filt0); 367 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
451
452 vec4 = (v8u16)__msa_srari_h((v8i16)vec4, FILTER_BITS);
453 vec5 = (v8u16)__msa_srari_h((v8i16)vec5, FILTER_BITS);
454 vec6 = (v8u16)__msa_srari_h((v8i16)vec6, FILTER_BITS);
455 vec7 = (v8u16)__msa_srari_h((v8i16)vec7, FILTER_BITS);
456
457 vec4 = __msa_min_u_h(vec4, const255);
458 vec5 = __msa_min_u_h(vec5, const255);
459 vec6 = __msa_min_u_h(vec6, const255);
460 vec7 = __msa_min_u_h(vec7, const255);
461
462 res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
463 res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
464 res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
465 res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
466
467 out0 = __msa_copy_u_w((v4i32)res0, 0);
468 out1 = __msa_copy_u_w((v4i32)res0, 1);
469 out2 = __msa_copy_u_w((v4i32)res1, 0);
470 out3 = __msa_copy_u_w((v4i32)res1, 1);
471
472 STORE_WORD(dst, out0);
473 dst += dst_stride;
474 STORE_WORD(dst, out1);
475 dst += dst_stride;
476 STORE_WORD(dst, out2);
477 dst += dst_stride;
478 STORE_WORD(dst, out3);
479 dst += dst_stride;
480
481 out0 = __msa_copy_u_w((v4i32)res2, 0);
482 out1 = __msa_copy_u_w((v4i32)res2, 1);
483 out2 = __msa_copy_u_w((v4i32)res3, 0);
484 out3 = __msa_copy_u_w((v4i32)res3, 1);
485
486 STORE_WORD(dst, out0);
487 dst += dst_stride;
488 STORE_WORD(dst, out1);
489 dst += dst_stride;
490 STORE_WORD(dst, out2);
491 dst += dst_stride;
492 STORE_WORD(dst, out3);
493 } 368 }
494 369
/* Dispatch 4-wide 2-tap horizontal filtering by block height (4 or 8 only;
 * other heights are silently ignored — callers pass only these sizes). */
static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}
504 379
505 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, 380 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
506 uint8_t *dst, int32_t dst_stride, 381 uint8_t *dst, int32_t dst_stride,
507 int8_t *filter) { 382 int8_t *filter) {
508 v16u8 filt0; 383 v16u8 filt0;
509 v16i8 src0, src1, src2, src3, mask; 384 v16i8 src0, src1, src2, src3, mask;
510 v8u16 vec0, vec1, vec2, vec3; 385 v8u16 vec0, vec1, vec2, vec3, const255, filt;
511 v8u16 out0, out1, out2, out3;
512 v8u16 const255, filt;
513 386
514 mask = LOAD_SB(&mc_filt_mask_arr[0]); 387 mask = LD_SB(&mc_filt_mask_arr[0]);
515 388
516 /* rearranging filter */ 389 /* rearranging filter */
517 filt = LOAD_UH(filter); 390 filt = LD_UH(filter);
518 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 391 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
519 392
520 const255 = (v8u16)__msa_ldi_h(255); 393 const255 = (v8u16) __msa_ldi_h(255);
521 394
522 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); 395 LD_SB4(src, src_stride, src0, src1, src2, src3);
523 396 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
524 vec0 = (v8u16)__msa_vshf_b(mask, src0, src0); 397 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
525 vec1 = (v8u16)__msa_vshf_b(mask, src1, src1); 398 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
526 vec2 = (v8u16)__msa_vshf_b(mask, src2, src2); 399 vec2, vec3);
527 vec3 = (v8u16)__msa_vshf_b(mask, src3, src3); 400 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
528 401 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
529 vec0 = __msa_dotp_u_h((v16u8)vec0, filt0); 402 PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
530 vec1 = __msa_dotp_u_h((v16u8)vec1, filt0); 403 ST8x4_UB(src0, src1, dst, dst_stride);
531 vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
532 vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
533
534 SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
535
536 out0 = __msa_min_u_h(vec0, const255);
537 out1 = __msa_min_u_h(vec1, const255);
538 out2 = __msa_min_u_h(vec2, const255);
539 out3 = __msa_min_u_h(vec3, const255);
540
541 PCKEV_B_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
542 } 404 }
543 405
544 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, 406 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
545 uint8_t *dst, int32_t dst_stride, 407 uint8_t *dst, int32_t dst_stride,
546 int8_t *filter, int32_t height) { 408 int8_t *filter, int32_t height) {
547 v16u8 filt0; 409 v16u8 filt0;
548 v16i8 src0, src1, src2, src3, mask; 410 v16i8 src0, src1, src2, src3, mask, out0, out1;
549 v8u16 vec0, vec1, vec2, vec3; 411 v8u16 vec0, vec1, vec2, vec3, filt, const255;
550 v8u16 filt, const255;
551 412
552 mask = LOAD_SB(&mc_filt_mask_arr[0]); 413 mask = LD_SB(&mc_filt_mask_arr[0]);
553 414
554 /* rearranging filter */ 415 /* rearranging filter */
555 filt = LOAD_UH(filter); 416 filt = LD_UH(filter);
556 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 417 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
557 418
558 const255 = (v8u16)__msa_ldi_h(255); 419 const255 = (v8u16) __msa_ldi_h(255);
559 420
560 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); 421 LD_SB4(src, src_stride, src0, src1, src2, src3);
561 src += (4 * src_stride); 422 src += (4 * src_stride);
562 423
563 vec0 = (v8u16)__msa_vshf_b(mask, src0, src0); 424 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
564 vec1 = (v8u16)__msa_vshf_b(mask, src1, src1); 425 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
565 vec2 = (v8u16)__msa_vshf_b(mask, src2, src2); 426 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
566 vec3 = (v8u16)__msa_vshf_b(mask, src3, src3); 427 vec2, vec3);
428 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
429 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
567 430
568 vec0 = __msa_dotp_u_h((v16u8)vec0, filt0); 431 LD_SB4(src, src_stride, src0, src1, src2, src3);
569 vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
570 vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
571 vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
572
573 SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
574
575 vec0 = __msa_min_u_h(vec0, const255);
576 vec1 = __msa_min_u_h(vec1, const255);
577 vec2 = __msa_min_u_h(vec2, const255);
578 vec3 = __msa_min_u_h(vec3, const255);
579
580 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
581 src += (4 * src_stride); 432 src += (4 * src_stride);
582 433
583 PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride); 434 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
435 ST8x4_UB(out0, out1, dst, dst_stride);
584 dst += (4 * dst_stride); 436 dst += (4 * dst_stride);
585 437
586 vec0 = (v8u16)__msa_vshf_b(mask, src0, src0); 438 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
587 vec1 = (v8u16)__msa_vshf_b(mask, src1, src1); 439 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
588 vec2 = (v8u16)__msa_vshf_b(mask, src2, src2); 440 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
589 vec3 = (v8u16)__msa_vshf_b(mask, src3, src3); 441 vec2, vec3);
590 442 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
591 vec0 = __msa_dotp_u_h((v16u8)vec0, filt0); 443 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
592 vec1 = __msa_dotp_u_h((v16u8)vec1, filt0); 444 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
593 vec2 = __msa_dotp_u_h((v16u8)vec2, filt0); 445 ST8x4_UB(out0, out1, dst, dst_stride);
594 vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
595
596 SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
597
598 vec0 = __msa_min_u_h(vec0, const255);
599 vec1 = __msa_min_u_h(vec1, const255);
600 vec2 = __msa_min_u_h(vec2, const255);
601 vec3 = __msa_min_u_h(vec3, const255);
602
603 PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
604 dst += (4 * dst_stride); 446 dst += (4 * dst_stride);
605 447
606 if (16 == height) { 448 if (16 == height) {
607 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); 449 LD_SB4(src, src_stride, src0, src1, src2, src3);
608 src += (4 * src_stride); 450 src += (4 * src_stride);
609 451
610 vec0 = (v8u16)__msa_vshf_b(mask, src0, src0); 452 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
611 vec1 = (v8u16)__msa_vshf_b(mask, src1, src1); 453 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
612 vec2 = (v8u16)__msa_vshf_b(mask, src2, src2); 454 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
613 vec3 = (v8u16)__msa_vshf_b(mask, src3, src3); 455 vec2, vec3);
614 456 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
615 vec0 = __msa_dotp_u_h((v16u8)vec0, filt0); 457 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
616 vec1 = __msa_dotp_u_h((v16u8)vec1, filt0); 458 LD_SB4(src, src_stride, src0, src1, src2, src3);
617 vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
618 vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
619
620 SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3,
621 vec0, vec1, vec2, vec3, FILTER_BITS);
622
623 vec0 = __msa_min_u_h(vec0, const255);
624 vec1 = __msa_min_u_h(vec1, const255);
625 vec2 = __msa_min_u_h(vec2, const255);
626 vec3 = __msa_min_u_h(vec3, const255);
627
628 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
629 src += (4 * src_stride); 459 src += (4 * src_stride);
630 460
631 PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride); 461 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
632 dst += (4 * dst_stride); 462 ST8x4_UB(out0, out1, dst, dst_stride);
633 463
634 vec0 = (v8u16)__msa_vshf_b(mask, src0, src0); 464 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
635 vec1 = (v8u16)__msa_vshf_b(mask, src1, src1); 465 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
636 vec2 = (v8u16)__msa_vshf_b(mask, src2, src2); 466 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
637 vec3 = (v8u16)__msa_vshf_b(mask, src3, src3); 467 vec2, vec3);
638 468 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
639 vec0 = __msa_dotp_u_h((v16u8)vec0, filt0); 469 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
640 vec1 = __msa_dotp_u_h((v16u8)vec1, filt0); 470 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
641 vec2 = __msa_dotp_u_h((v16u8)vec2, filt0); 471 ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
642 vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
643
644 SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3,
645 vec0, vec1, vec2, vec3, FILTER_BITS);
646
647 vec0 = __msa_min_u_h(vec0, const255);
648 vec1 = __msa_min_u_h(vec1, const255);
649 vec2 = __msa_min_u_h(vec2, const255);
650 vec3 = __msa_min_u_h(vec3, const255);
651
652 PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
653 } 472 }
654 } 473 }
655 474
656 static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, 475 static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
657 uint8_t *dst, int32_t dst_stride, 476 uint8_t *dst, int32_t dst_stride,
658 int8_t *filter, int32_t height) { 477 int8_t *filter, int32_t height) {
659 if (4 == height) { 478 if (4 == height) {
660 common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); 479 common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
661 } else { 480 } else {
662 common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); 481 common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
663 } 482 }
664 } 483 }
665 484
666 static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, 485 static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
667 uint8_t *dst, int32_t dst_stride, 486 uint8_t *dst, int32_t dst_stride,
668 int8_t *filter, int32_t height) { 487 int8_t *filter, int32_t height) {
669 uint32_t loop_cnt; 488 uint32_t loop_cnt;
670 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 489 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
671 v16u8 filt0; 490 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
672 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 491 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
673 v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
674 v8u16 filt, const255;
675 492
676 mask = LOAD_SB(&mc_filt_mask_arr[0]); 493 mask = LD_SB(&mc_filt_mask_arr[0]);
677 494
678 loop_cnt = (height >> 2) - 1; 495 loop_cnt = (height >> 2) - 1;
679 496
680 /* rearranging filter */ 497 /* rearranging filter */
681 filt = LOAD_UH(filter); 498 filt = LD_UH(filter);
682 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 499 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
683 500
684 const255 = (v8u16)__msa_ldi_h(255); 501 const255 = (v8u16) __msa_ldi_h(255);
685 502
686 src0 = LOAD_SB(src); 503 LD_SB4(src, src_stride, src0, src2, src4, src6);
687 src1 = LOAD_SB(src + 8); 504 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
688 src += src_stride; 505 src += (4 * src_stride);
689 src2 = LOAD_SB(src);
690 src3 = LOAD_SB(src + 8);
691 src += src_stride;
692 src4 = LOAD_SB(src);
693 src5 = LOAD_SB(src + 8);
694 src += src_stride;
695 src6 = LOAD_SB(src);
696 src7 = LOAD_SB(src + 8);
697 src += src_stride;
698 506
699 vec0 = (v16u8)__msa_vshf_b(mask, src0, src0); 507 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
700 vec1 = (v16u8)__msa_vshf_b(mask, src1, src1); 508 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
701 vec2 = (v16u8)__msa_vshf_b(mask, src2, src2); 509 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
702 vec3 = (v16u8)__msa_vshf_b(mask, src3, src3); 510 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
703 vec4 = (v16u8)__msa_vshf_b(mask, src4, src4); 511 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
704 vec5 = (v16u8)__msa_vshf_b(mask, src5, src5); 512 out2, out3);
705 vec6 = (v16u8)__msa_vshf_b(mask, src6, src6); 513 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
706 vec7 = (v16u8)__msa_vshf_b(mask, src7, src7); 514 out6, out7);
707 515 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
708 out0 = __msa_dotp_u_h(vec0, filt0); 516 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
709 out1 = __msa_dotp_u_h(vec1, filt0); 517 MIN_UH4_UH(out0, out1, out2, out3, const255);
710 out2 = __msa_dotp_u_h(vec2, filt0); 518 MIN_UH4_UH(out4, out5, out6, out7, const255);
711 out3 = __msa_dotp_u_h(vec3, filt0); 519 PCKEV_ST_SB(out0, out1, dst);
712 out4 = __msa_dotp_u_h(vec4, filt0);
713 out5 = __msa_dotp_u_h(vec5, filt0);
714 out6 = __msa_dotp_u_h(vec6, filt0);
715 out7 = __msa_dotp_u_h(vec7, filt0);
716
717 out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
718 out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
719 out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
720 out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
721 out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
722 out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
723 out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
724 out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
725
726 out0 = __msa_min_u_h(out0, const255);
727 out1 = __msa_min_u_h(out1, const255);
728 out2 = __msa_min_u_h(out2, const255);
729 out3 = __msa_min_u_h(out3, const255);
730 out4 = __msa_min_u_h(out4, const255);
731 out5 = __msa_min_u_h(out5, const255);
732 out6 = __msa_min_u_h(out6, const255);
733 out7 = __msa_min_u_h(out7, const255);
734
735 PCKEV_B_STORE_VEC(out1, out0, dst);
736 dst += dst_stride; 520 dst += dst_stride;
737 PCKEV_B_STORE_VEC(out3, out2, dst); 521 PCKEV_ST_SB(out2, out3, dst);
738 dst += dst_stride; 522 dst += dst_stride;
739 PCKEV_B_STORE_VEC(out5, out4, dst); 523 PCKEV_ST_SB(out4, out5, dst);
740 dst += dst_stride; 524 dst += dst_stride;
741 PCKEV_B_STORE_VEC(out7, out6, dst); 525 PCKEV_ST_SB(out6, out7, dst);
742 dst += dst_stride; 526 dst += dst_stride;
743 527
744 for (; loop_cnt--;) { 528 for (; loop_cnt--;) {
745 src0 = LOAD_SB(src); 529 LD_SB4(src, src_stride, src0, src2, src4, src6);
746 src1 = LOAD_SB(src + 8); 530 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
747 src += src_stride; 531 src += (4 * src_stride);
748 src2 = LOAD_SB(src);
749 src3 = LOAD_SB(src + 8);
750 src += src_stride;
751 src4 = LOAD_SB(src);
752 src5 = LOAD_SB(src + 8);
753 src += src_stride;
754 src6 = LOAD_SB(src);
755 src7 = LOAD_SB(src + 8);
756 src += src_stride;
757 532
758 vec0 = (v16u8)__msa_vshf_b(mask, src0, src0); 533 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
759 vec1 = (v16u8)__msa_vshf_b(mask, src1, src1); 534 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
760 vec2 = (v16u8)__msa_vshf_b(mask, src2, src2); 535 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
761 vec3 = (v16u8)__msa_vshf_b(mask, src3, src3); 536 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
762 vec4 = (v16u8)__msa_vshf_b(mask, src4, src4); 537 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
763 vec5 = (v16u8)__msa_vshf_b(mask, src5, src5); 538 out2, out3);
764 vec6 = (v16u8)__msa_vshf_b(mask, src6, src6); 539 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
765 vec7 = (v16u8)__msa_vshf_b(mask, src7, src7); 540 out6, out7);
766 541 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
767 out0 = __msa_dotp_u_h(vec0, filt0); 542 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
768 out1 = __msa_dotp_u_h(vec1, filt0); 543 MIN_UH4_UH(out0, out1, out2, out3, const255);
769 out2 = __msa_dotp_u_h(vec2, filt0); 544 MIN_UH4_UH(out4, out5, out6, out7, const255);
770 out3 = __msa_dotp_u_h(vec3, filt0); 545 PCKEV_ST_SB(out0, out1, dst);
771 out4 = __msa_dotp_u_h(vec4, filt0);
772 out5 = __msa_dotp_u_h(vec5, filt0);
773 out6 = __msa_dotp_u_h(vec6, filt0);
774 out7 = __msa_dotp_u_h(vec7, filt0);
775
776 out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
777 out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
778 out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
779 out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
780 out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
781 out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
782 out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
783 out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
784
785 out0 = __msa_min_u_h(out0, const255);
786 out1 = __msa_min_u_h(out1, const255);
787 out2 = __msa_min_u_h(out2, const255);
788 out3 = __msa_min_u_h(out3, const255);
789 out4 = __msa_min_u_h(out4, const255);
790 out5 = __msa_min_u_h(out5, const255);
791 out6 = __msa_min_u_h(out6, const255);
792 out7 = __msa_min_u_h(out7, const255);
793
794 PCKEV_B_STORE_VEC(out1, out0, dst);
795 dst += dst_stride; 546 dst += dst_stride;
796 PCKEV_B_STORE_VEC(out3, out2, dst); 547 PCKEV_ST_SB(out2, out3, dst);
797 dst += dst_stride; 548 dst += dst_stride;
798 PCKEV_B_STORE_VEC(out5, out4, dst); 549 PCKEV_ST_SB(out4, out5, dst);
799 dst += dst_stride; 550 dst += dst_stride;
800 PCKEV_B_STORE_VEC(out7, out6, dst); 551 PCKEV_ST_SB(out6, out7, dst);
801 dst += dst_stride; 552 dst += dst_stride;
802 } 553 }
803 } 554 }
804 555
805 static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, 556 static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
806 uint8_t *dst, int32_t dst_stride, 557 uint8_t *dst, int32_t dst_stride,
807 int8_t *filter, int32_t height) { 558 int8_t *filter, int32_t height) {
808 uint32_t loop_cnt; 559 uint32_t loop_cnt;
809 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 560 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
810 v16u8 filt0; 561 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
811 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 562 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
812 v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
813 v8u16 filt, const255;
814 563
815 mask = LOAD_SB(&mc_filt_mask_arr[0]); 564 mask = LD_SB(&mc_filt_mask_arr[0]);
816 565
817 /* rearranging filter */ 566 /* rearranging filter */
818 filt = LOAD_UH(filter); 567 filt = LD_UH(filter);
819 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 568 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
820 569
821 const255 = (v8u16)__msa_ldi_h(255); 570 const255 = (v8u16) __msa_ldi_h(255);
822 571
823 for (loop_cnt = height >> 1; loop_cnt--;) { 572 for (loop_cnt = height >> 1; loop_cnt--;) {
824 src0 = LOAD_SB(src); 573 src0 = LD_SB(src);
825 src2 = LOAD_SB(src + 16); 574 src2 = LD_SB(src + 16);
826 src3 = LOAD_SB(src + 24); 575 src3 = LD_SB(src + 24);
827 src1 = __msa_sld_b(src2, src0, 8); 576 src1 = __msa_sldi_b(src2, src0, 8);
828 src += src_stride; 577 src += src_stride;
829 src4 = LOAD_SB(src); 578 src4 = LD_SB(src);
830 src6 = LOAD_SB(src + 16); 579 src6 = LD_SB(src + 16);
831 src7 = LOAD_SB(src + 24); 580 src7 = LD_SB(src + 24);
832 src5 = __msa_sld_b(src6, src4, 8); 581 src5 = __msa_sldi_b(src6, src4, 8);
833 src += src_stride; 582 src += src_stride;
834 583
835 vec0 = (v16u8)__msa_vshf_b(mask, src0, src0); 584 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
836 vec1 = (v16u8)__msa_vshf_b(mask, src1, src1); 585 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
837 vec2 = (v16u8)__msa_vshf_b(mask, src2, src2); 586 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
838 vec3 = (v16u8)__msa_vshf_b(mask, src3, src3); 587 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
839 vec4 = (v16u8)__msa_vshf_b(mask, src4, src4); 588 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
840 vec5 = (v16u8)__msa_vshf_b(mask, src5, src5); 589 out2, out3);
841 vec6 = (v16u8)__msa_vshf_b(mask, src6, src6); 590 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
842 vec7 = (v16u8)__msa_vshf_b(mask, src7, src7); 591 out6, out7);
843 592 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
844 out0 = __msa_dotp_u_h(vec0, filt0); 593 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
845 out1 = __msa_dotp_u_h(vec1, filt0); 594 MIN_UH4_UH(out0, out1, out2, out3, const255);
846 out2 = __msa_dotp_u_h(vec2, filt0); 595 MIN_UH4_UH(out4, out5, out6, out7, const255);
847 out3 = __msa_dotp_u_h(vec3, filt0); 596 PCKEV_ST_SB(out0, out1, dst);
848 out4 = __msa_dotp_u_h(vec4, filt0); 597 PCKEV_ST_SB(out2, out3, dst + 16);
849 out5 = __msa_dotp_u_h(vec5, filt0);
850 out6 = __msa_dotp_u_h(vec6, filt0);
851 out7 = __msa_dotp_u_h(vec7, filt0);
852
853 out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
854 out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
855 out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
856 out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
857 out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
858 out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
859 out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
860 out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
861
862 out0 = __msa_min_u_h(out0, const255);
863 out1 = __msa_min_u_h(out1, const255);
864 out2 = __msa_min_u_h(out2, const255);
865 out3 = __msa_min_u_h(out3, const255);
866 out4 = __msa_min_u_h(out4, const255);
867 out5 = __msa_min_u_h(out5, const255);
868 out6 = __msa_min_u_h(out6, const255);
869 out7 = __msa_min_u_h(out7, const255);
870
871 PCKEV_B_STORE_VEC(out1, out0, dst);
872 PCKEV_B_STORE_VEC(out3, out2, dst + 16);
873 dst += dst_stride; 598 dst += dst_stride;
874 PCKEV_B_STORE_VEC(out5, out4, dst); 599 PCKEV_ST_SB(out4, out5, dst);
875 PCKEV_B_STORE_VEC(out7, out6, dst + 16); 600 PCKEV_ST_SB(out6, out7, dst + 16);
876 dst += dst_stride; 601 dst += dst_stride;
877 } 602 }
878 } 603 }
879 604
880 static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, 605 static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
881 uint8_t *dst, int32_t dst_stride, 606 uint8_t *dst, int32_t dst_stride,
882 int8_t *filter, int32_t height) { 607 int8_t *filter, int32_t height) {
883 uint32_t loop_cnt; 608 uint32_t loop_cnt;
884 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 609 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
885 v16u8 filt0; 610 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
886 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 611 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
887 v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
888 v8u16 filt, const255;
889 612
890 mask = LOAD_SB(&mc_filt_mask_arr[0]); 613 mask = LD_SB(&mc_filt_mask_arr[0]);
891 614
892 /* rearranging filter */ 615 /* rearranging filter */
893 filt = LOAD_UH(filter); 616 filt = LD_UH(filter);
894 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 617 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
895 618
896 const255 = (v8u16)__msa_ldi_h(255); 619 const255 = (v8u16) __msa_ldi_h(255);
897 620
898 for (loop_cnt = height; loop_cnt--;) { 621 for (loop_cnt = height; loop_cnt--;) {
899 src0 = LOAD_SB(src); 622 src0 = LD_SB(src);
900 src2 = LOAD_SB(src + 16); 623 src2 = LD_SB(src + 16);
901 src4 = LOAD_SB(src + 32); 624 src4 = LD_SB(src + 32);
902 src6 = LOAD_SB(src + 48); 625 src6 = LD_SB(src + 48);
903 src7 = LOAD_SB(src + 56); 626 src7 = LD_SB(src + 56);
904 src1 = __msa_sld_b(src2, src0, 8); 627 SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
905 src3 = __msa_sld_b(src4, src2, 8);
906 src5 = __msa_sld_b(src6, src4, 8);
907 src += src_stride; 628 src += src_stride;
908 629
909 vec0 = (v16u8)__msa_vshf_b(mask, src0, src0); 630 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
910 vec1 = (v16u8)__msa_vshf_b(mask, src1, src1); 631 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
911 vec2 = (v16u8)__msa_vshf_b(mask, src2, src2); 632 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
912 vec3 = (v16u8)__msa_vshf_b(mask, src3, src3); 633 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
913 vec4 = (v16u8)__msa_vshf_b(mask, src4, src4); 634 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
914 vec5 = (v16u8)__msa_vshf_b(mask, src5, src5); 635 out2, out3);
915 vec6 = (v16u8)__msa_vshf_b(mask, src6, src6); 636 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
916 vec7 = (v16u8)__msa_vshf_b(mask, src7, src7); 637 out6, out7);
917 638 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
918 out0 = __msa_dotp_u_h(vec0, filt0); 639 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
919 out1 = __msa_dotp_u_h(vec1, filt0); 640 MIN_UH4_UH(out0, out1, out2, out3, const255);
920 out2 = __msa_dotp_u_h(vec2, filt0); 641 MIN_UH4_UH(out4, out5, out6, out7, const255);
921 out3 = __msa_dotp_u_h(vec3, filt0); 642 PCKEV_ST_SB(out0, out1, dst);
922 out4 = __msa_dotp_u_h(vec4, filt0); 643 PCKEV_ST_SB(out2, out3, dst + 16);
923 out5 = __msa_dotp_u_h(vec5, filt0); 644 PCKEV_ST_SB(out4, out5, dst + 32);
924 out6 = __msa_dotp_u_h(vec6, filt0); 645 PCKEV_ST_SB(out6, out7, dst + 48);
925 out7 = __msa_dotp_u_h(vec7, filt0);
926
927 out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
928 out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
929 out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
930 out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
931 out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
932 out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
933 out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
934 out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
935
936 out0 = __msa_min_u_h(out0, const255);
937 out1 = __msa_min_u_h(out1, const255);
938 out2 = __msa_min_u_h(out2, const255);
939 out3 = __msa_min_u_h(out3, const255);
940 out4 = __msa_min_u_h(out4, const255);
941 out5 = __msa_min_u_h(out5, const255);
942 out6 = __msa_min_u_h(out6, const255);
943 out7 = __msa_min_u_h(out7, const255);
944
945 PCKEV_B_STORE_VEC(out1, out0, dst);
946 PCKEV_B_STORE_VEC(out3, out2, dst + 16);
947 PCKEV_B_STORE_VEC(out5, out4, dst + 32);
948 PCKEV_B_STORE_VEC(out7, out6, dst + 48);
949 dst += dst_stride; 646 dst += dst_stride;
950 } 647 }
951 } 648 }
952 649
953 void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, 650 void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
954 uint8_t *dst, ptrdiff_t dst_stride, 651 uint8_t *dst, ptrdiff_t dst_stride,
955 const int16_t *filter_x, int x_step_q4, 652 const int16_t *filter_x, int x_step_q4,
956 const int16_t *filter_y, int y_step_q4, 653 const int16_t *filter_y, int y_step_q4,
957 int w, int h) { 654 int w, int h) {
958 int8_t cnt, filt_hor[8]; 655 int8_t cnt, filt_hor[8];
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
1036 filt_hor, h); 733 filt_hor, h);
1037 break; 734 break;
1038 default: 735 default:
1039 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, 736 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
1040 filter_x, x_step_q4, filter_y, y_step_q4, 737 filter_x, x_step_q4, filter_y, y_step_q4,
1041 w, h); 738 w, h);
1042 break; 739 break;
1043 } 740 }
1044 } 741 }
1045 } 742 }
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c ('k') | source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698