OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 29 matching lines...)
40 v4i32 zero_m = {0}; \ | 40 v4i32 zero_m = {0}; \ |
41 y_m = LD(psrc_y); \ | 41 y_m = LD(psrc_y); \ |
42 u_m = LW(psrc_u); \ | 42 u_m = LW(psrc_u); \ |
43 v_m = LW(psrc_v); \ | 43 v_m = LW(psrc_v); \ |
44 out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \ | 44 out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \ |
45 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \ | 45 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \ |
46 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ | 46 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ |
47 } | 47 } |
48 | 48 |
49 // Convert 8 pixels of YUV 420 to RGB. | 49 // Convert 8 pixels of YUV 420 to RGB. |
50 #define YUVTORGB(in_y, in_u, in_v, ub, vr, ug, vg, bb, bg, br, yg, out_b, \ | 50 #define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ |
51 out_g, out_r) \ | 51 { \ |
52 { \ | 52 v8i16 vec0_m, vec1_m; \ |
53 v8i16 vec0_m; \ | 53 v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ |
54 v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ | 54 v4i32 reg5_m, reg6_m, reg7_m; \ |
55 v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \ | 55 v4i32 max = __msa_ldi_w(255); \ |
56 v4i32 max_val_m = __msa_ldi_w(255); \ | 56 v16i8 zero = {0}; \ |
57 v8i16 zero_m = {0}; \ | 57 \ |
58 \ | 58 vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ |
59 in_u = (v16u8)__msa_ilvr_b((v16i8)in_u, (v16i8)in_u); \ | 59 vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in_uv); \ |
60 in_v = (v16u8)__msa_ilvr_b((v16i8)in_v, (v16i8)in_v); \ | 60 reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0_m); \ |
61 vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ | 61 reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0_m); \ |
62 reg0_m = (v4i32)__msa_ilvr_h(zero_m, vec0_m); \ | 62 reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1_m); \ |
63 reg1_m = (v4i32)__msa_ilvl_h(zero_m, vec0_m); \ | 63 reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1_m); \ |
64 reg0_m *= vec_yg; \ | 64 reg0_m *= yg; \ |
65 reg1_m *= vec_yg; \ | 65 reg1_m *= yg; \ |
66 reg0_m = __msa_srai_w(reg0_m, 16); \ | 66 reg2_m *= ubvr; \ |
67 reg1_m = __msa_srai_w(reg1_m, 16); \ | 67 reg3_m *= ubvr; \ |
68 reg4_m = reg0_m + br; \ | 68 reg0_m = __msa_srai_w(reg0_m, 16); \ |
69 reg5_m = reg1_m + br; \ | 69 reg1_m = __msa_srai_w(reg1_m, 16); \ |
70 reg2_m = reg0_m + bg; \ | 70 reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ |
71 reg3_m = reg1_m + bg; \ | 71 reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ |
72 reg0_m += bb; \ | 72 reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ |
73 reg1_m += bb; \ | 73 reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ |
74 vec0_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_u); \ | 74 reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ |
75 reg6_m = (v4i32)__msa_ilvr_h(zero_m, (v8i16)vec0_m); \ | 75 reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ |
76 reg7_m = (v4i32)__msa_ilvl_h(zero_m, (v8i16)vec0_m); \ | 76 reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ |
77 vec0_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_v); \ | 77 reg5_m = reg0_m - reg5_m; \ |
78 reg8_m = (v4i32)__msa_ilvr_h(zero_m, (v8i16)vec0_m); \ | 78 reg6_m = reg1_m - reg6_m; \ |
79 reg9_m = (v4i32)__msa_ilvl_h(zero_m, (v8i16)vec0_m); \ | 79 reg2_m = reg0_m - reg2_m; \ |
80 reg0_m -= reg6_m * ub; \ | 80 reg3_m = reg1_m - reg3_m; \ |
81 reg1_m -= reg7_m * ub; \ | 81 reg7_m = reg0_m - reg7_m; \ |
82 reg2_m -= reg6_m * ug; \ | 82 reg4_m = reg1_m - reg4_m; \ |
83 reg3_m -= reg7_m * ug; \ | 83 reg5_m += bb; \ |
84 reg4_m -= reg8_m * vr; \ | 84 reg6_m += bb; \ |
85 reg5_m -= reg9_m * vr; \ | 85 reg7_m += bg; \ |
86 reg2_m -= reg8_m * vg; \ | 86 reg4_m += bg; \ |
87 reg3_m -= reg9_m * vg; \ | 87 reg2_m += br; \ |
88 reg0_m = __msa_srai_w(reg0_m, 6); \ | 88 reg3_m += br; \ |
89 reg1_m = __msa_srai_w(reg1_m, 6); \ | 89 reg5_m = __msa_srai_w(reg5_m, 6); \ |
90 reg2_m = __msa_srai_w(reg2_m, 6); \ | 90 reg6_m = __msa_srai_w(reg6_m, 6); \ |
91 reg3_m = __msa_srai_w(reg3_m, 6); \ | 91 reg7_m = __msa_srai_w(reg7_m, 6); \ |
92 reg4_m = __msa_srai_w(reg4_m, 6); \ | 92 reg4_m = __msa_srai_w(reg4_m, 6); \ |
93 reg5_m = __msa_srai_w(reg5_m, 6); \ | 93 reg2_m = __msa_srai_w(reg2_m, 6); \ |
94 reg0_m = __msa_maxi_s_w(reg0_m, 0); \ | 94 reg3_m = __msa_srai_w(reg3_m, 6); \ |
95 reg1_m = __msa_maxi_s_w(reg1_m, 0); \ | 95 reg5_m = __msa_maxi_s_w(reg5_m, 0); \ |
96 reg2_m = __msa_maxi_s_w(reg2_m, 0); \ | 96 reg6_m = __msa_maxi_s_w(reg6_m, 0); \ |
97 reg3_m = __msa_maxi_s_w(reg3_m, 0); \ | 97 reg7_m = __msa_maxi_s_w(reg7_m, 0); \ |
98 reg4_m = __msa_maxi_s_w(reg4_m, 0); \ | 98 reg4_m = __msa_maxi_s_w(reg4_m, 0); \ |
99 reg5_m = __msa_maxi_s_w(reg5_m, 0); \ | 99 reg2_m = __msa_maxi_s_w(reg2_m, 0); \ |
100 reg0_m = __msa_min_s_w(reg0_m, max_val_m); \ | 100 reg3_m = __msa_maxi_s_w(reg3_m, 0); \ |
101 reg1_m = __msa_min_s_w(reg1_m, max_val_m); \ | 101 reg5_m = __msa_min_s_w(max, reg5_m); \ |
102 reg2_m = __msa_min_s_w(reg2_m, max_val_m); \ | 102 reg6_m = __msa_min_s_w(max, reg6_m); \ |
103 reg3_m = __msa_min_s_w(reg3_m, max_val_m); \ | 103 reg7_m = __msa_min_s_w(max, reg7_m); \ |
104 reg4_m = __msa_min_s_w(reg4_m, max_val_m); \ | 104 reg4_m = __msa_min_s_w(max, reg4_m); \ |
105 reg5_m = __msa_min_s_w(reg5_m, max_val_m); \ | 105 reg2_m = __msa_min_s_w(max, reg2_m); \ |
106 out_b = __msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ | 106 reg3_m = __msa_min_s_w(max, reg3_m); \ |
107 out_g = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ | 107 out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ |
108 out_r = __msa_pckev_h((v8i16)reg5_m, (v8i16)reg4_m); \ | 108 out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ |
| 109 out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ |
109 } | 110 } |
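Note on the arithmetic: both the old and the new YUVTORGB bodies implement the same fixed-point conversion, only with the chroma multiplies regrouped around the interleaved U/V input. A minimal scalar sketch of that per-pixel math, as read from the macro above (the duplicated-Y interleave is modelled as y * 0x0101; the helper names are illustrative, not part of libyuv):

  #include <stdint.h>

  static inline int32_t clamp0_255(int32_t v) {
    return v < 0 ? 0 : (v > 255 ? 255 : v);
  }

  /* y is scaled by yg in Q16, chroma terms are subtracted, the per-channel
   * biases bb/bg/br are added, and the result is >>6 and clamped to 8 bits. */
  static void yuv_pixel_sketch(uint8_t y, uint8_t u, uint8_t v,
                               int32_t ub, int32_t ug, int32_t vg, int32_t vr,
                               int32_t bb, int32_t bg, int32_t br, int32_t yg,
                               uint8_t* b, uint8_t* g, uint8_t* r) {
    int32_t y1 = (int32_t)(((uint32_t)y * 0x0101u * (uint32_t)yg) >> 16);
    *b = (uint8_t)clamp0_255((y1 - u * ub + bb) >> 6);
    *g = (uint8_t)clamp0_255((y1 - u * ug - v * vg + bg) >> 6);
    *r = (uint8_t)clamp0_255((y1 - v * vr + br) >> 6);
  }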
110 | 111 |
111 // Pack and Store 8 ARGB values. | 112 // Pack and Store 8 ARGB values. |
112 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \ | 113 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \ |
113 { \ | 114 { \ |
114 v8i16 vec0_m, vec1_m; \ | 115 v8i16 vec0_m, vec1_m; \ |
115 v16u8 dst0_m, dst1_m; \ | 116 v16u8 dst0_m, dst1_m; \ |
116 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ | 117 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ |
117 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ | 118 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ |
118 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ | 119 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ |
(...skipping 86 matching lines...)
205 void I422ToARGBRow_MSA(const uint8* src_y, | 206 void I422ToARGBRow_MSA(const uint8* src_y, |
206 const uint8* src_u, | 207 const uint8* src_u, |
207 const uint8* src_v, | 208 const uint8* src_v, |
208 uint8* rgb_buf, | 209 uint8* rgb_buf, |
209 const struct YuvConstants* yuvconstants, | 210 const struct YuvConstants* yuvconstants, |
210 int width) { | 211 int width) { |
211 int x; | 212 int x; |
212 v16u8 src0, src1, src2; | 213 v16u8 src0, src1, src2; |
213 v8i16 vec0, vec1, vec2; | 214 v8i16 vec0, vec1, vec2; |
214 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; | 215 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 216 v4i32 vec_ubvr, vec_ugvg; |
215 v16u8 const_255 = (v16u8)__msa_ldi_b(255); | 217 v16u8 const_255 = (v16u8)__msa_ldi_b(255); |
216 | 218 |
217 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 219 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
218 vec_br, vec_yg); | 220 vec_br, vec_yg); |
| 221 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 222 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
219 | 223 |
220 for (x = 0; x < width; x += 8) { | 224 for (x = 0; x < width; x += 8) { |
221 READYUV422(src_y, src_u, src_v, src0, src1, src2); | 225 READYUV422(src_y, src_u, src_v, src0, src1, src2); |
222 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 226 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); |
223 vec_br, vec_yg, vec0, vec1, vec2); | 227 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
| 228 vec0, vec1, vec2); |
224 STOREARGB(vec0, vec1, vec2, const_255, rgb_buf); | 229 STOREARGB(vec0, vec1, vec2, const_255, rgb_buf); |
225 src_y += 8; | 230 src_y += 8; |
226 src_u += 4; | 231 src_u += 4; |
227 src_v += 4; | 232 src_v += 4; |
228 rgb_buf += 32; | 233 rgb_buf += 32; |
229 } | 234 } |
230 } | 235 } |
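For reference, the precomputed vec_ubvr and vec_ugvg used by the rewritten loop are just the old broadcast coefficients interleaved: __msa_ilvr_w(vec_vr, vec_ub) yields a {UB, VR, UB, VR} word pattern and __msa_ilvev_h a {UG, VG, UG, VG, ...} halfword pattern, matching the {u, v} byte pairs created by the extra ilvr_b on src1. A hedged plain-C model of the resulting chroma terms for one pair (names are illustrative only):

  #include <stdint.h>

  static void chroma_terms_sketch(const uint8_t uv[2],  /* {u, v} pair */
                                  int32_t ub, int32_t vr,
                                  int16_t ug, int16_t vg,
                                  int32_t* b_term, int32_t* g_term,
                                  int32_t* r_term) {
    *b_term = uv[0] * ub;               /* even word lanes, times UB */
    *g_term = uv[0] * ug + uv[1] * vg;  /* one __msa_dotp_s_w per pair */
    *r_term = uv[1] * vr;               /* odd word lanes, times VR */
  }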
231 | 236 |
232 void I422ToRGBARow_MSA(const uint8* src_y, | 237 void I422ToRGBARow_MSA(const uint8* src_y, |
233 const uint8* src_u, | 238 const uint8* src_u, |
234 const uint8* src_v, | 239 const uint8* src_v, |
235 uint8* rgb_buf, | 240 uint8* rgb_buf, |
236 const struct YuvConstants* yuvconstants, | 241 const struct YuvConstants* yuvconstants, |
237 int width) { | 242 int width) { |
238 int x; | 243 int x; |
239 v16u8 src0, src1, src2; | 244 v16u8 src0, src1, src2; |
240 v8i16 vec0, vec1, vec2; | 245 v8i16 vec0, vec1, vec2; |
241 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; | 246 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 247 v4i32 vec_ubvr, vec_ugvg; |
242 v16u8 const_255 = (v16u8)__msa_ldi_b(255); | 248 v16u8 const_255 = (v16u8)__msa_ldi_b(255); |
243 | 249 |
244 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 250 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
245 vec_br, vec_yg); | 251 vec_br, vec_yg); |
| 252 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 253 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
246 | 254 |
247 for (x = 0; x < width; x += 8) { | 255 for (x = 0; x < width; x += 8) { |
248 READYUV422(src_y, src_u, src_v, src0, src1, src2); | 256 READYUV422(src_y, src_u, src_v, src0, src1, src2); |
249 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 257 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); |
250 vec_br, vec_yg, vec0, vec1, vec2); | 258 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
| 259 vec0, vec1, vec2); |
251 STOREARGB(const_255, vec0, vec1, vec2, rgb_buf); | 260 STOREARGB(const_255, vec0, vec1, vec2, rgb_buf); |
252 src_y += 8; | 261 src_y += 8; |
253 src_u += 4; | 262 src_u += 4; |
254 src_v += 4; | 263 src_v += 4; |
255 rgb_buf += 32; | 264 rgb_buf += 32; |
256 } | 265 } |
257 } | 266 } |
258 | 267 |
259 void I422AlphaToARGBRow_MSA(const uint8* src_y, | 268 void I422AlphaToARGBRow_MSA(const uint8* src_y, |
260 const uint8* src_u, | 269 const uint8* src_u, |
261 const uint8* src_v, | 270 const uint8* src_v, |
262 const uint8* src_a, | 271 const uint8* src_a, |
263 uint8* rgb_buf, | 272 uint8* rgb_buf, |
264 const struct YuvConstants* yuvconstants, | 273 const struct YuvConstants* yuvconstants, |
265 int width) { | 274 int width) { |
266 int x; | 275 int x; |
267 int64 data_a; | 276 int64 data_a; |
268 v16u8 src0, src1, src2, src3; | 277 v16u8 src0, src1, src2, src3; |
269 v8i16 vec0, vec1, vec2; | 278 v8i16 vec0, vec1, vec2; |
270 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; | 279 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 280 v4i32 vec_ubvr, vec_ugvg; |
271 v4i32 zero = {0}; | 281 v4i32 zero = {0}; |
272 | 282 |
273 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 283 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
274 vec_br, vec_yg); | 284 vec_br, vec_yg); |
| 285 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 286 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
275 | 287 |
276 for (x = 0; x < width; x += 8) { | 288 for (x = 0; x < width; x += 8) { |
277 data_a = LD(src_a); | 289 data_a = LD(src_a); |
278 READYUV422(src_y, src_u, src_v, src0, src1, src2); | 290 READYUV422(src_y, src_u, src_v, src0, src1, src2); |
| 291 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); |
279 src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); | 292 src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); |
280 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 293 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
281 vec_br, vec_yg, vec0, vec1, vec2); | 294 vec0, vec1, vec2); |
282 src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); | 295 src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); |
283 STOREARGB(vec0, vec1, vec2, src3, rgb_buf); | 296 STOREARGB(vec0, vec1, vec2, src3, rgb_buf); |
284 src_y += 8; | 297 src_y += 8; |
285 src_u += 4; | 298 src_u += 4; |
286 src_v += 4; | 299 src_v += 4; |
287 src_a += 8; | 300 src_a += 8; |
288 rgb_buf += 32; | 301 rgb_buf += 32; |
289 } | 302 } |
290 } | 303 } |
291 | 304 |
292 void I422ToRGB24Row_MSA(const uint8* src_y, | 305 void I422ToRGB24Row_MSA(const uint8* src_y, |
293 const uint8* src_u, | 306 const uint8* src_u, |
294 const uint8* src_v, | 307 const uint8* src_v, |
295 uint8* rgb_buf, | 308 uint8* rgb_buf, |
296 const struct YuvConstants* yuvconstants, | 309 const struct YuvConstants* yuvconstants, |
297 int32 width) { | 310 int32 width) { |
298 int x; | 311 int x; |
299 int64 data_u, data_v; | 312 int64 data_u, data_v; |
300 v16u8 src0, src1, src2, src3, src4, src5, dst0, dst1, dst2; | 313 v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; |
301 v8i16 vec0, vec1, vec2, vec3, vec4, vec5; | 314 v8i16 vec0, vec1, vec2, vec3, vec4, vec5; |
302 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; | 315 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 316 v4i32 vec_ubvr, vec_ugvg; |
303 v16u8 reg0, reg1, reg2, reg3; | 317 v16u8 reg0, reg1, reg2, reg3; |
304 v2i64 zero = {0}; | 318 v2i64 zero = {0}; |
305 v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; | 319 v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; |
306 v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; | 320 v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; |
307 v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, | 321 v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, |
308 11, 29, 12, 13, 30, 14, 15, 31}; | 322 11, 29, 12, 13, 30, 14, 15, 31}; |
309 | 323 |
310 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 324 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
311 vec_br, vec_yg); | 325 vec_br, vec_yg); |
| 326 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 327 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
312 | 328 |
313 for (x = 0; x < width; x += 16) { | 329 for (x = 0; x < width; x += 16) { |
314 src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); | 330 src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); |
315 data_u = LD(src_u); | 331 data_u = LD(src_u); |
316 data_v = LD(src_v); | 332 data_v = LD(src_v); |
317 src1 = (v16u8)__msa_insert_d(zero, 0, data_u); | 333 src1 = (v16u8)__msa_insert_d(zero, 0, data_u); |
318 src2 = (v16u8)__msa_insert_d(zero, 0, data_v); | 334 src2 = (v16u8)__msa_insert_d(zero, 0, data_v); |
| 335 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); |
319 src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); | 336 src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); |
320 src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 4); | 337 src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); |
321 src5 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); | 338 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
322 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 339 vec0, vec1, vec2); |
323 vec_br, vec_yg, vec0, vec1, vec2); | 340 YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
324 YUVTORGB(src3, src4, src5, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 341 vec3, vec4, vec5); |
325 vec_br, vec_yg, vec3, vec4, vec5); | |
326 reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); | 342 reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); |
327 reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); | 343 reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); |
328 reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); | 344 reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); |
329 reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); | 345 reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); |
330 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); | 346 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); |
331 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); | 347 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); |
332 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); | 348 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); |
333 ST_UB2(dst0, dst1, rgb_buf, 16); | 349 ST_UB2(dst0, dst1, rgb_buf, 16); |
334 ST_UB(dst2, (rgb_buf + 32)); | 350 ST_UB(dst2, (rgb_buf + 32)); |
335 src_y += 16; | 351 src_y += 16; |
336 src_u += 8; | 352 src_u += 8; |
337 src_v += 8; | 353 src_v += 8; |
338 rgb_buf += 48; | 354 rgb_buf += 48; |
339 } | 355 } |
340 } | 356 } |
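The three shufflers above just byte-pack the converted planes into 3-byte pixels. A hedged scalar equivalent of that packing, assuming libyuv's RGB24 byte order of B, G, R in memory (function name illustrative):

  #include <stdint.h>

  static void pack_rgb24_sketch(const uint8_t* b, const uint8_t* g,
                                const uint8_t* r, uint8_t* dst, int width) {
    int x;
    for (x = 0; x < width; ++x) {
      dst[0] = b[x];
      dst[1] = g[x];
      dst[2] = r[x];
      dst += 3;
    }
  }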
341 | 357 |
342 // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. | 358 // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. |
343 void I422ToRGB565Row_MSA(const uint8* src_y, | 359 void I422ToRGB565Row_MSA(const uint8* src_y, |
344 const uint8* src_u, | 360 const uint8* src_u, |
345 const uint8* src_v, | 361 const uint8* src_v, |
346 uint8* dst_rgb565, | 362 uint8* dst_rgb565, |
347 const struct YuvConstants* yuvconstants, | 363 const struct YuvConstants* yuvconstants, |
348 int width) { | 364 int width) { |
349 int x; | 365 int x; |
350 v16u8 src0, src1, src2, dst0; | 366 v16u8 src0, src1, src2, dst0; |
351 v8i16 vec0, vec1, vec2; | 367 v8i16 vec0, vec1, vec2; |
352 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; | 368 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 369 v4i32 vec_ubvr, vec_ugvg; |
353 | 370 |
354 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 371 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
355 vec_br, vec_yg); | 372 vec_br, vec_yg); |
| 373 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 374 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
356 | 375 |
357 for (x = 0; x < width; x += 8) { | 376 for (x = 0; x < width; x += 8) { |
358 READYUV422(src_y, src_u, src_v, src0, src1, src2); | 377 READYUV422(src_y, src_u, src_v, src0, src1, src2); |
359 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 378 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); |
360 vec_br, vec_yg, vec0, vec2, vec1); | 379 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
| 380 vec0, vec2, vec1); |
361 vec0 = __msa_srai_h(vec0, 3); | 381 vec0 = __msa_srai_h(vec0, 3); |
362 vec1 = __msa_srai_h(vec1, 3); | 382 vec1 = __msa_srai_h(vec1, 3); |
363 vec2 = __msa_srai_h(vec2, 2); | 383 vec2 = __msa_srai_h(vec2, 2); |
364 vec1 = __msa_slli_h(vec1, 11); | 384 vec1 = __msa_slli_h(vec1, 11); |
365 vec2 = __msa_slli_h(vec2, 5); | 385 vec2 = __msa_slli_h(vec2, 5); |
366 vec0 |= vec1; | 386 vec0 |= vec1; |
367 dst0 = (v16u8)(vec2 | vec0); | 387 dst0 = (v16u8)(vec2 | vec0); |
368 ST_UB(dst0, dst_rgb565); | 388 ST_UB(dst0, dst_rgb565); |
369 src_y += 8; | 389 src_y += 8; |
370 src_u += 4; | 390 src_u += 4; |
371 src_v += 4; | 391 src_v += 4; |
372 dst_rgb565 += 16; | 392 dst_rgb565 += 16; |
373 } | 393 } |
374 } | 394 } |
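The shift/or sequence above packs one RGB565 pixel per lane; note YUVTORGB is invoked with (vec0, vec2, vec1), so vec1 holds R and vec2 holds G. A scalar sketch of the packing (helper name illustrative):

  #include <stdint.h>

  static uint16_t pack_rgb565_sketch(uint8_t b, uint8_t g, uint8_t r) {
    /* 5 bits blue, 6 bits green, 5 bits red, red in the top bits */
    return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
  }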
375 | 395 |
376 // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. | 396 // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. |
377 void I422ToARGB4444Row_MSA(const uint8* src_y, | 397 void I422ToARGB4444Row_MSA(const uint8* src_y, |
378 const uint8* src_u, | 398 const uint8* src_u, |
379 const uint8* src_v, | 399 const uint8* src_v, |
380 uint8* dst_argb4444, | 400 uint8* dst_argb4444, |
381 const struct YuvConstants* yuvconstants, | 401 const struct YuvConstants* yuvconstants, |
382 int width) { | 402 int width) { |
383 int x; | 403 int x; |
384 v16u8 src0, src1, src2, dst0; | 404 v16u8 src0, src1, src2, dst0; |
385 v8i16 vec0, vec1, vec2; | 405 v8i16 vec0, vec1, vec2; |
386 v8u16 reg0, reg1, reg2; | 406 v8u16 reg0, reg1, reg2; |
387 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; | 407 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 408 v4i32 vec_ubvr, vec_ugvg; |
388 v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); | 409 v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); |
389 | 410 |
390 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 411 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
391 vec_br, vec_yg); | 412 vec_br, vec_yg); |
| 413 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 414 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
392 | 415 |
393 for (x = 0; x < width; x += 8) { | 416 for (x = 0; x < width; x += 8) { |
394 READYUV422(src_y, src_u, src_v, src0, src1, src2); | 417 READYUV422(src_y, src_u, src_v, src0, src1, src2); |
395 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 418 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); |
396 vec_br, vec_yg, vec0, vec1, vec2); | 419 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
| 420 vec0, vec1, vec2); |
397 reg0 = (v8u16)__msa_srai_h(vec0, 4); | 421 reg0 = (v8u16)__msa_srai_h(vec0, 4); |
398 reg1 = (v8u16)__msa_srai_h(vec1, 4); | 422 reg1 = (v8u16)__msa_srai_h(vec1, 4); |
399 reg2 = (v8u16)__msa_srai_h(vec2, 4); | 423 reg2 = (v8u16)__msa_srai_h(vec2, 4); |
400 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); | 424 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); |
401 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); | 425 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); |
402 reg1 |= const_0xF000; | 426 reg1 |= const_0xF000; |
403 reg0 |= reg2; | 427 reg0 |= reg2; |
404 dst0 = (v16u8)(reg1 | reg0); | 428 dst0 = (v16u8)(reg1 | reg0); |
405 ST_UB(dst0, dst_argb4444); | 429 ST_UB(dst0, dst_argb4444); |
406 src_y += 8; | 430 src_y += 8; |
407 src_u += 4; | 431 src_u += 4; |
408 src_v += 4; | 432 src_v += 4; |
409 dst_argb4444 += 16; | 433 dst_argb4444 += 16; |
410 } | 434 } |
411 } | 435 } |
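Same idea for ARGB4444: four bits per channel with alpha forced to 0xF. A scalar sketch of the packing done by the shifts above (helper name illustrative):

  #include <stdint.h>

  static uint16_t pack_argb4444_sketch(uint8_t b, uint8_t g, uint8_t r) {
    return (uint16_t)((b >> 4) | ((g >> 4) << 4) | ((r >> 4) << 8) | 0xF000);
  }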
412 | 436 |
413 void I422ToARGB1555Row_MSA(const uint8* src_y, | 437 void I422ToARGB1555Row_MSA(const uint8* src_y, |
414 const uint8* src_u, | 438 const uint8* src_u, |
415 const uint8* src_v, | 439 const uint8* src_v, |
416 uint8* dst_argb1555, | 440 uint8* dst_argb1555, |
417 const struct YuvConstants* yuvconstants, | 441 const struct YuvConstants* yuvconstants, |
418 int width) { | 442 int width) { |
419 int x; | 443 int x; |
420 v16u8 src0, src1, src2, dst0; | 444 v16u8 src0, src1, src2, dst0; |
421 v8i16 vec0, vec1, vec2; | 445 v8i16 vec0, vec1, vec2; |
422 v8u16 reg0, reg1, reg2; | 446 v8u16 reg0, reg1, reg2; |
423 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; | 447 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 448 v4i32 vec_ubvr, vec_ugvg; |
424 v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); | 449 v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); |
425 | 450 |
426 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 451 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
427 vec_br, vec_yg); | 452 vec_br, vec_yg); |
| 453 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 454 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
428 | 455 |
429 for (x = 0; x < width; x += 8) { | 456 for (x = 0; x < width; x += 8) { |
430 READYUV422(src_y, src_u, src_v, src0, src1, src2); | 457 READYUV422(src_y, src_u, src_v, src0, src1, src2); |
431 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 458 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); |
432 vec_br, vec_yg, vec0, vec1, vec2); | 459 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
| 460 vec0, vec1, vec2); |
433 reg0 = (v8u16)__msa_srai_h(vec0, 3); | 461 reg0 = (v8u16)__msa_srai_h(vec0, 3); |
434 reg1 = (v8u16)__msa_srai_h(vec1, 3); | 462 reg1 = (v8u16)__msa_srai_h(vec1, 3); |
435 reg2 = (v8u16)__msa_srai_h(vec2, 3); | 463 reg2 = (v8u16)__msa_srai_h(vec2, 3); |
436 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); | 464 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); |
437 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); | 465 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); |
438 reg1 |= const_0x8000; | 466 reg1 |= const_0x8000; |
439 reg0 |= reg2; | 467 reg0 |= reg2; |
440 dst0 = (v16u8)(reg1 | reg0); | 468 dst0 = (v16u8)(reg1 | reg0); |
441 ST_UB(dst0, dst_argb1555); | 469 ST_UB(dst0, dst_argb1555); |
442 src_y += 8; | 470 src_y += 8; |
(...skipping 1573 matching lines...)
2016 res1 = __msa_copy_u_d((v2i64)dst0, 1); | 2044 res1 = __msa_copy_u_d((v2i64)dst0, 1); |
2017 SD(res0, dst_u); | 2045 SD(res0, dst_u); |
2018 SD(res1, dst_v); | 2046 SD(res1, dst_v); |
2019 t += 48; | 2047 t += 48; |
2020 s += 48; | 2048 s += 48; |
2021 dst_u += 8; | 2049 dst_u += 8; |
2022 dst_v += 8; | 2050 dst_v += 8; |
2023 } | 2051 } |
2024 } | 2052 } |
2025 | 2053 |
| 2054 void NV12ToARGBRow_MSA(const uint8* src_y, |
| 2055 const uint8* src_uv, |
| 2056 uint8* rgb_buf, |
| 2057 const struct YuvConstants* yuvconstants, |
| 2058 int width) { |
| 2059 int x; |
| 2060 uint64 val0, val1; |
| 2061 v16u8 src0, src1, res0, res1, dst0, dst1; |
| 2062 v8i16 vec0, vec1, vec2; |
| 2063 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 2064 v4i32 vec_ubvr, vec_ugvg; |
| 2065 v16u8 zero = {0}; |
| 2066 v16u8 const_255 = (v16u8)__msa_ldi_b(255); |
| 2067 |
| 2068 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
| 2069 vec_br, vec_yg); |
| 2070 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 2071 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
| 2072 |
| 2073 for (x = 0; x < width; x += 8) { |
| 2074 val0 = LD(src_y); |
| 2075 val1 = LD(src_uv); |
| 2076 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); |
| 2077 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); |
| 2078 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
| 2079 vec0, vec1, vec2); |
| 2080 res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); |
| 2081 res1 = (v16u8)__msa_ilvev_b((v16i8)const_255, (v16i8)vec1); |
| 2082 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); |
| 2083 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); |
| 2084 ST_UB2(dst0, dst1, rgb_buf, 16); |
| 2085 src_y += 8; |
| 2086 src_uv += 8; |
| 2087 rgb_buf += 32; |
| 2088 } |
| 2089 } |
| 2090 |
| 2091 void NV12ToRGB565Row_MSA(const uint8* src_y, |
| 2092 const uint8* src_uv, |
| 2093 uint8* rgb_buf, |
| 2094 const struct YuvConstants* yuvconstants, |
| 2095 int width) { |
| 2096 int x; |
| 2097 uint64 val0, val1; |
| 2098 v16u8 src0, src1, dst0; |
| 2099 v8i16 vec0, vec1, vec2; |
| 2100 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 2101 v4i32 vec_ubvr, vec_ugvg; |
| 2102 v16u8 zero = {0}; |
| 2103 |
| 2104 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
| 2105 vec_br, vec_yg); |
| 2106 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 2107 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
| 2108 |
| 2109 for (x = 0; x < width; x += 8) { |
| 2110 val0 = LD(src_y); |
| 2111 val1 = LD(src_uv); |
| 2112 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); |
| 2113 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); |
| 2114 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
| 2115 vec0, vec1, vec2); |
| 2116 vec0 = vec0 >> 3; |
| 2117 vec1 = (vec1 >> 2) << 5; |
| 2118 vec2 = (vec2 >> 3) << 11; |
| 2119 dst0 = (v16u8)(vec0 | vec1 | vec2); |
| 2120 ST_UB(dst0, rgb_buf); |
| 2121 src_y += 8; |
| 2122 src_uv += 8; |
| 2123 rgb_buf += 16; |
| 2124 } |
| 2125 } |
| 2126 |
| 2127 void NV21ToARGBRow_MSA(const uint8* src_y, |
| 2128 const uint8* src_vu, |
| 2129 uint8* rgb_buf, |
| 2130 const struct YuvConstants* yuvconstants, |
| 2131 int width) { |
| 2132 int x; |
| 2133 uint64 val0, val1; |
| 2134 v16u8 src0, src1, res0, res1, dst0, dst1; |
| 2135 v8i16 vec0, vec1, vec2; |
| 2136 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 2137 v4i32 vec_ubvr, vec_ugvg; |
| 2138 v16u8 const_255 = (v16u8)__msa_ldi_b(255); |
| 2139 v16u8 zero = {0}; |
| 2140 v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; |
| 2141 |
| 2142 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
| 2143 vec_br, vec_yg); |
| 2144 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 2145 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
| 2146 |
| 2147 for (x = 0; x < width; x += 8) { |
| 2148 val0 = LD(src_y); |
| 2149 val1 = LD(src_vu); |
| 2150 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); |
| 2151 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); |
| 2152 src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); |
| 2153 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
| 2154 vec0, vec1, vec2); |
| 2155 res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); |
| 2156 res1 = (v16u8)__msa_ilvev_b((v16i8)const_255, (v16i8)vec1); |
| 2157 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); |
| 2158 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); |
| 2159 ST_UB2(dst0, dst1, rgb_buf, 16); |
| 2160 src_y += 8; |
| 2161 src_vu += 8; |
| 2162 rgb_buf += 32; |
| 2163 } |
| 2164 } |
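Layout note for the two NV rows above: NV12 chroma is already stored as interleaved {U, V} pairs, which is exactly what the reworked YUVTORGB consumes, so no extra interleave is needed; NV21 stores {V, U} and is byte-swapped first by the {1, 0, 3, 2, ...} shuffler. A hedged scalar model of that swap (function name illustrative):

  #include <stdint.h>

  static void vu_to_uv_sketch(const uint8_t* vu, uint8_t* uv, int pairs) {
    int i;
    for (i = 0; i < pairs; ++i) {
      uv[2 * i + 0] = vu[2 * i + 1];  /* U */
      uv[2 * i + 1] = vu[2 * i + 0];  /* V */
    }
  }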
| 2165 |
| 2166 void SobelRow_MSA(const uint8* src_sobelx, |
| 2167 const uint8* src_sobely, |
| 2168 uint8* dst_argb, |
| 2169 int width) { |
| 2170 int x; |
| 2171 v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; |
| 2172 v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; |
| 2173 v16i8 const_0x4 = __msa_ldi_b(0x4); |
| 2174 v16i8 mask1 = mask0 + const_0x4; |
| 2175 v16i8 mask2 = mask1 + const_0x4; |
| 2176 v16i8 mask3 = mask2 + const_0x4; |
| 2177 v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF); |
| 2178 |
| 2179 for (x = 0; x < width; x += 16) { |
| 2180 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); |
| 2181 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); |
| 2182 vec0 = __msa_adds_u_b(src0, src1); |
| 2183 dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)const_0xFF, (v16i8)vec0); |
| 2184 dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)const_0xFF, (v16i8)vec0); |
| 2185 dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)const_0xFF, (v16i8)vec0); |
| 2186 dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)const_0xFF, (v16i8)vec0); |
| 2187 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); |
| 2188 src_sobelx += 16; |
| 2189 src_sobely += 16; |
| 2190 dst_argb += 64; |
| 2191 } |
| 2192 } |
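SobelRow combines the two gradient planes with a saturating add and fans the result out to gray ARGB; SobelToPlaneRow below is the same saturating add without the ARGB expansion. A scalar sketch of one pixel (helper name illustrative):

  #include <stdint.h>

  static void sobel_pixel_sketch(uint8_t sx, uint8_t sy, uint8_t dst_argb[4]) {
    int s = sx + sy;                          /* __msa_adds_u_b saturates */
    uint8_t v = (uint8_t)(s > 255 ? 255 : s);
    dst_argb[0] = v;    /* B */
    dst_argb[1] = v;    /* G */
    dst_argb[2] = v;    /* R */
    dst_argb[3] = 255;  /* A */
  }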
| 2193 |
| 2194 void SobelToPlaneRow_MSA(const uint8* src_sobelx, |
| 2195 const uint8* src_sobely, |
| 2196 uint8* dst_y, |
| 2197 int width) { |
| 2198 int x; |
| 2199 v16u8 src0, src1, src2, src3, dst0, dst1; |
| 2200 |
| 2201 for (x = 0; x < width; x += 32) { |
| 2202 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); |
| 2203 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16); |
| 2204 src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); |
| 2205 src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16); |
| 2206 dst0 = __msa_adds_u_b(src0, src2); |
| 2207 dst1 = __msa_adds_u_b(src1, src3); |
| 2208 ST_UB2(dst0, dst1, dst_y, 16); |
| 2209 src_sobelx += 32; |
| 2210 src_sobely += 32; |
| 2211 dst_y += 32; |
| 2212 } |
| 2213 } |
| 2214 |
| 2215 void SobelXYRow_MSA(const uint8* src_sobelx, |
| 2216 const uint8* src_sobely, |
| 2217 uint8* dst_argb, |
| 2218 int width) { |
| 2219 int x; |
| 2220 v16u8 src0, src1, vec0, vec1, vec2; |
| 2221 v16u8 reg0, reg1, dst0, dst1, dst2, dst3; |
| 2222 v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF); |
| 2223 |
| 2224 for (x = 0; x < width; x += 16) { |
| 2225 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); |
| 2226 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); |
| 2227 vec0 = __msa_adds_u_b(src0, src1); |
| 2228 vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); |
| 2229 vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); |
| 2230 reg0 = (v16u8)__msa_ilvr_b((v16i8)const_0xFF, (v16i8)vec0); |
| 2231 reg1 = (v16u8)__msa_ilvl_b((v16i8)const_0xFF, (v16i8)vec0); |
| 2232 dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); |
| 2233 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); |
| 2234 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); |
| 2235 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); |
| 2236 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); |
| 2237 src_sobelx += 16; |
| 2238 src_sobely += 16; |
| 2239 dst_argb += 64; |
| 2240 } |
| 2241 } |
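As read from the interleaves in SobelXYRow, each output pixel carries the Y gradient in B, the saturated sum in G, the X gradient in R, and a 255 alpha. A hedged scalar model (helper name illustrative):

  #include <stdint.h>

  static void sobelxy_pixel_sketch(uint8_t sx, uint8_t sy,
                                   uint8_t dst_argb[4]) {
    int s = sx + sy;
    dst_argb[0] = sy;                            /* B: Sobel Y */
    dst_argb[1] = (uint8_t)(s > 255 ? 255 : s);  /* G: saturated sum */
    dst_argb[2] = sx;                            /* R: Sobel X */
    dst_argb[3] = 255;                           /* A */
  }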
| 2242 |
2026 #ifdef __cplusplus | 2243 #ifdef __cplusplus |
2027 } // extern "C" | 2244 } // extern "C" |
2028 } // namespace libyuv | 2245 } // namespace libyuv |
2029 #endif | 2246 #endif |
2030 | 2247 |
2031 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 2248 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |