Chromium Code Reviews

Side by Side Diff: source/row_msa.cc

Issue 2636483002: Add MSA optimized NV12/21 To RGB row functions (Closed)
Patch Set: Incorporated review comments (created 3 years, 11 months ago)
1 /* 1 /*
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 29 matching lines...)
40 v4i32 zero_m = {0}; \ 40 v4i32 zero_m = {0}; \
41 y_m = LD(psrc_y); \ 41 y_m = LD(psrc_y); \
42 u_m = LW(psrc_u); \ 42 u_m = LW(psrc_u); \
43 v_m = LW(psrc_v); \ 43 v_m = LW(psrc_v); \
44 out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \ 44 out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \
45 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \ 45 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \
46 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ 46 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \
47 } 47 }
48 48
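Review note: the tail of READYUV422 shown here inserts the LD/LW results into zeroed vectors, i.e. zero-padded partial loads of 8 Y and 4 U/V samples. A scalar sketch of the equivalent loads (the macro head is elided above, so the helper name and shape below are assumptions):

    /* Sketch only, assuming <stdint.h> and <string.h>. */
    void ReadYUV422Sketch(const uint8_t* psrc_y, const uint8_t* psrc_u,
                          const uint8_t* psrc_v, uint8_t out_y[16],
                          uint8_t out_u[16], uint8_t out_v[16]) {
      memset(out_y, 0, 16); memset(out_u, 0, 16); memset(out_v, 0, 16);
      memcpy(out_y, psrc_y, 8); /* 8 Y bytes -> low 64 bits */
      memcpy(out_u, psrc_u, 4); /* 4 U bytes -> low 32 bits */
      memcpy(out_v, psrc_v, 4); /* 4 V bytes -> low 32 bits */
    }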
49 // Convert 8 pixels of YUV 420 to RGB. 49 // Convert 8 pixels of YUV 420 to RGB.
50 #define YUVTORGB(in_y, in_u, in_v, ub, vr, ug, vg, bb, bg, br, yg, out_b, \ 50 #define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
51 out_g, out_r) \ 51 { \
52 { \ 52 v8i16 vec0_m, vec1_m; \
53 v8i16 vec0_m; \ 53 v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
54 v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ 54 v4i32 reg5_m, reg6_m, reg7_m; \
55 v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \ 55 v4i32 max = __msa_ldi_w(255); \
56 v4i32 max_val_m = __msa_ldi_w(255); \ 56 v16i8 zero = {0}; \
57 v8i16 zero_m = {0}; \ 57 \
58 \ 58 vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
59 in_u = (v16u8)__msa_ilvr_b((v16i8)in_u, (v16i8)in_u); \ 59 vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in_uv); \
60 in_v = (v16u8)__msa_ilvr_b((v16i8)in_v, (v16i8)in_v); \ 60 reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0_m); \
61 vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ 61 reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0_m); \
62 reg0_m = (v4i32)__msa_ilvr_h(zero_m, vec0_m); \ 62 reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1_m); \
63 reg1_m = (v4i32)__msa_ilvl_h(zero_m, vec0_m); \ 63 reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1_m); \
64 reg0_m *= vec_yg; \ 64 reg0_m *= yg; \
65 reg1_m *= vec_yg; \ 65 reg1_m *= yg; \
66 reg0_m = __msa_srai_w(reg0_m, 16); \ 66 reg2_m *= ubvr; \
67 reg1_m = __msa_srai_w(reg1_m, 16); \ 67 reg3_m *= ubvr; \
68 reg4_m = reg0_m + br; \ 68 reg0_m = __msa_srai_w(reg0_m, 16); \
69 reg5_m = reg1_m + br; \ 69 reg1_m = __msa_srai_w(reg1_m, 16); \
70 reg2_m = reg0_m + bg; \ 70 reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
71 reg3_m = reg1_m + bg; \ 71 reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
72 reg0_m += bb; \ 72 reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
73 reg1_m += bb; \ 73 reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
74 vec0_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_u); \ 74 reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
75 reg6_m = (v4i32)__msa_ilvr_h(zero_m, (v8i16)vec0_m); \ 75 reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
76 reg7_m = (v4i32)__msa_ilvl_h(zero_m, (v8i16)vec0_m); \ 76 reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
77 vec0_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_v); \ 77 reg5_m = reg0_m - reg5_m; \
78 reg8_m = (v4i32)__msa_ilvr_h(zero_m, (v8i16)vec0_m); \ 78 reg6_m = reg1_m - reg6_m; \
79 reg9_m = (v4i32)__msa_ilvl_h(zero_m, (v8i16)vec0_m); \ 79 reg2_m = reg0_m - reg2_m; \
80 reg0_m -= reg6_m * ub; \ 80 reg3_m = reg1_m - reg3_m; \
81 reg1_m -= reg7_m * ub; \ 81 reg7_m = reg0_m - reg7_m; \
82 reg2_m -= reg6_m * ug; \ 82 reg4_m = reg1_m - reg4_m; \
83 reg3_m -= reg7_m * ug; \ 83 reg5_m += bb; \
84 reg4_m -= reg8_m * vr; \ 84 reg6_m += bb; \
85 reg5_m -= reg9_m * vr; \ 85 reg7_m += bg; \
86 reg2_m -= reg8_m * vg; \ 86 reg4_m += bg; \
87 reg3_m -= reg9_m * vg; \ 87 reg2_m += br; \
88 reg0_m = __msa_srai_w(reg0_m, 6); \ 88 reg3_m += br; \
89 reg1_m = __msa_srai_w(reg1_m, 6); \ 89 reg5_m = __msa_srai_w(reg5_m, 6); \
90 reg2_m = __msa_srai_w(reg2_m, 6); \ 90 reg6_m = __msa_srai_w(reg6_m, 6); \
91 reg3_m = __msa_srai_w(reg3_m, 6); \ 91 reg7_m = __msa_srai_w(reg7_m, 6); \
92 reg4_m = __msa_srai_w(reg4_m, 6); \ 92 reg4_m = __msa_srai_w(reg4_m, 6); \
93 reg5_m = __msa_srai_w(reg5_m, 6); \ 93 reg2_m = __msa_srai_w(reg2_m, 6); \
94 reg0_m = __msa_maxi_s_w(reg0_m, 0); \ 94 reg3_m = __msa_srai_w(reg3_m, 6); \
95 reg1_m = __msa_maxi_s_w(reg1_m, 0); \ 95 reg5_m = __msa_maxi_s_w(reg5_m, 0); \
96 reg2_m = __msa_maxi_s_w(reg2_m, 0); \ 96 reg6_m = __msa_maxi_s_w(reg6_m, 0); \
97 reg3_m = __msa_maxi_s_w(reg3_m, 0); \ 97 reg7_m = __msa_maxi_s_w(reg7_m, 0); \
98 reg4_m = __msa_maxi_s_w(reg4_m, 0); \ 98 reg4_m = __msa_maxi_s_w(reg4_m, 0); \
99 reg5_m = __msa_maxi_s_w(reg5_m, 0); \ 99 reg2_m = __msa_maxi_s_w(reg2_m, 0); \
100 reg0_m = __msa_min_s_w(reg0_m, max_val_m); \ 100 reg3_m = __msa_maxi_s_w(reg3_m, 0); \
101 reg1_m = __msa_min_s_w(reg1_m, max_val_m); \ 101 reg5_m = __msa_min_s_w(max, reg5_m); \
102 reg2_m = __msa_min_s_w(reg2_m, max_val_m); \ 102 reg6_m = __msa_min_s_w(max, reg6_m); \
103 reg3_m = __msa_min_s_w(reg3_m, max_val_m); \ 103 reg7_m = __msa_min_s_w(max, reg7_m); \
104 reg4_m = __msa_min_s_w(reg4_m, max_val_m); \ 104 reg4_m = __msa_min_s_w(max, reg4_m); \
105 reg5_m = __msa_min_s_w(reg5_m, max_val_m); \ 105 reg2_m = __msa_min_s_w(max, reg2_m); \
106 out_b = __msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ 106 reg3_m = __msa_min_s_w(max, reg3_m); \
107 out_g = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ 107 out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
108 out_r = __msa_pckev_h((v8i16)reg5_m, (v8i16)reg4_m); \ 108 out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
109 out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
109 } 110 }
110 111
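Review note: the old and new macro bodies compute the same fixed-point conversion; the new one consumes interleaved UV and folds the two green chroma multiplies into one __msa_dotp_s_w. Per pixel the math matches libyuv's scalar reference, roughly as sketched below (clamp255 is a hypothetical helper; the constants come from struct YuvConstants):

    static inline int clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

    /* Sketch of one pixel of the conversion, assuming <stdint.h>. */
    void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                        int ub, int vr, int ug, int vg,
                        int bb, int bg, int br, int yg,
                        uint8_t* b, uint8_t* g, uint8_t* r) {
      /* ilvr_b(y, y) duplicates each luma byte, i.e. scales it by 0x0101. */
      uint32_t y1 = ((uint32_t)y * 0x0101 * (uint32_t)yg) >> 16;
      *b = (uint8_t)clamp255(((int32_t)y1 + bb - u * ub) >> 6);
      *g = (uint8_t)clamp255(((int32_t)y1 + bg - (u * ug + v * vg)) >> 6);
      *r = (uint8_t)clamp255(((int32_t)y1 + br - v * vr) >> 6);
    }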
111 // Pack and Store 8 ARGB values. 112 // Pack and Store 8 ARGB values.
112 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \ 113 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \
113 { \ 114 { \
114 v8i16 vec0_m, vec1_m; \ 115 v8i16 vec0_m, vec1_m; \
115 v16u8 dst0_m, dst1_m; \ 116 v16u8 dst0_m, dst1_m; \
116 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ 117 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
117 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ 118 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
118 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ 119 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \
(...skipping 86 matching lines...)
205 void I422ToARGBRow_MSA(const uint8* src_y, 206 void I422ToARGBRow_MSA(const uint8* src_y,
206 const uint8* src_u, 207 const uint8* src_u,
207 const uint8* src_v, 208 const uint8* src_v,
208 uint8* rgb_buf, 209 uint8* rgb_buf,
209 const struct YuvConstants* yuvconstants, 210 const struct YuvConstants* yuvconstants,
210 int width) { 211 int width) {
211 int x; 212 int x;
212 v16u8 src0, src1, src2; 213 v16u8 src0, src1, src2;
213 v8i16 vec0, vec1, vec2; 214 v8i16 vec0, vec1, vec2;
214 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 215 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
216 v4i32 vec_ubvr, vec_ugvg;
215 v16u8 const_255 = (v16u8)__msa_ldi_b(255); 217 v16u8 const_255 = (v16u8)__msa_ldi_b(255);
216 218
217 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 219 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
218 vec_br, vec_yg); 220 vec_br, vec_yg);
221 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
222 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
219 223
220 for (x = 0; x < width; x += 8) { 224 for (x = 0; x < width; x += 8) {
221 READYUV422(src_y, src_u, src_v, src0, src1, src2); 225 READYUV422(src_y, src_u, src_v, src0, src1, src2);
222 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 226 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
223 vec_br, vec_yg, vec0, vec1, vec2); 227 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
228 vec0, vec1, vec2);
224 STOREARGB(vec0, vec1, vec2, const_255, rgb_buf); 229 STOREARGB(vec0, vec1, vec2, const_255, rgb_buf);
225 src_y += 8; 230 src_y += 8;
226 src_u += 4; 231 src_u += 4;
227 src_v += 4; 232 src_v += 4;
228 rgb_buf += 32; 233 rgb_buf += 32;
229 } 234 }
230 } 235 }
231 236
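Review note: the two added setup lines pair the constants once per row. vec_ubvr interleaves the ub and vr words, and vec_ugvg packs ug and vg into adjacent 16-bit lanes so the dot product inside YUVTORGB evaluates the green chroma term for each pixel in one step; per lane it computes, as a sketch:

    /* One 32-bit lane of __msa_dotp_s_w over a {u, v} pair and {ug, vg}: */
    int32_t g_term = (int32_t)u * ug + (int32_t)v * vg;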
232 void I422ToRGBARow_MSA(const uint8* src_y, 237 void I422ToRGBARow_MSA(const uint8* src_y,
233 const uint8* src_u, 238 const uint8* src_u,
234 const uint8* src_v, 239 const uint8* src_v,
235 uint8* rgb_buf, 240 uint8* rgb_buf,
236 const struct YuvConstants* yuvconstants, 241 const struct YuvConstants* yuvconstants,
237 int width) { 242 int width) {
238 int x; 243 int x;
239 v16u8 src0, src1, src2; 244 v16u8 src0, src1, src2;
240 v8i16 vec0, vec1, vec2; 245 v8i16 vec0, vec1, vec2;
241 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 246 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
247 v4i32 vec_ubvr, vec_ugvg;
242 v16u8 const_255 = (v16u8)__msa_ldi_b(255); 248 v16u8 const_255 = (v16u8)__msa_ldi_b(255);
243 249
244 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 250 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
245 vec_br, vec_yg); 251 vec_br, vec_yg);
252 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
253 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
246 254
247 for (x = 0; x < width; x += 8) { 255 for (x = 0; x < width; x += 8) {
248 READYUV422(src_y, src_u, src_v, src0, src1, src2); 256 READYUV422(src_y, src_u, src_v, src0, src1, src2);
249 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 257 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
250 vec_br, vec_yg, vec0, vec1, vec2); 258 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
259 vec0, vec1, vec2);
251 STOREARGB(const_255, vec0, vec1, vec2, rgb_buf); 260 STOREARGB(const_255, vec0, vec1, vec2, rgb_buf);
252 src_y += 8; 261 src_y += 8;
253 src_u += 4; 262 src_u += 4;
254 src_v += 4; 263 src_v += 4;
255 rgb_buf += 32; 264 rgb_buf += 32;
256 } 265 }
257 } 266 }
258 267
259 void I422AlphaToARGBRow_MSA(const uint8* src_y, 268 void I422AlphaToARGBRow_MSA(const uint8* src_y,
260 const uint8* src_u, 269 const uint8* src_u,
261 const uint8* src_v, 270 const uint8* src_v,
262 const uint8* src_a, 271 const uint8* src_a,
263 uint8* rgb_buf, 272 uint8* rgb_buf,
264 const struct YuvConstants* yuvconstants, 273 const struct YuvConstants* yuvconstants,
265 int width) { 274 int width) {
266 int x; 275 int x;
267 int64 data_a; 276 int64 data_a;
268 v16u8 src0, src1, src2, src3; 277 v16u8 src0, src1, src2, src3;
269 v8i16 vec0, vec1, vec2; 278 v8i16 vec0, vec1, vec2;
270 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 279 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
280 v4i32 vec_ubvr, vec_ugvg;
271 v4i32 zero = {0}; 281 v4i32 zero = {0};
272 282
273 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 283 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
274 vec_br, vec_yg); 284 vec_br, vec_yg);
285 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
286 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
275 287
276 for (x = 0; x < width; x += 8) { 288 for (x = 0; x < width; x += 8) {
277 data_a = LD(src_a); 289 data_a = LD(src_a);
278 READYUV422(src_y, src_u, src_v, src0, src1, src2); 290 READYUV422(src_y, src_u, src_v, src0, src1, src2);
291 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
279 src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); 292 src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
280 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 293 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
281 vec_br, vec_yg, vec0, vec1, vec2); 294 vec0, vec1, vec2);
282 src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); 295 src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
283 STOREARGB(vec0, vec1, vec2, src3, rgb_buf); 296 STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
284 src_y += 8; 297 src_y += 8;
285 src_u += 4; 298 src_u += 4;
286 src_v += 4; 299 src_v += 4;
287 src_a += 8; 300 src_a += 8;
288 rgb_buf += 32; 301 rgb_buf += 32;
289 } 302 }
290 } 303 }
291 304
292 void I422ToRGB24Row_MSA(const uint8* src_y, 305 void I422ToRGB24Row_MSA(const uint8* src_y,
293 const uint8* src_u, 306 const uint8* src_u,
294 const uint8* src_v, 307 const uint8* src_v,
295 uint8* rgb_buf, 308 uint8* rgb_buf,
296 const struct YuvConstants* yuvconstants, 309 const struct YuvConstants* yuvconstants,
297 int32 width) { 310 int32 width) {
298 int x; 311 int x;
299 int64 data_u, data_v; 312 int64 data_u, data_v;
300 v16u8 src0, src1, src2, src3, src4, src5, dst0, dst1, dst2; 313 v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
301 v8i16 vec0, vec1, vec2, vec3, vec4, vec5; 314 v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
302 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 315 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
316 v4i32 vec_ubvr, vec_ugvg;
303 v16u8 reg0, reg1, reg2, reg3; 317 v16u8 reg0, reg1, reg2, reg3;
304 v2i64 zero = {0}; 318 v2i64 zero = {0};
305 v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; 319 v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
306 v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; 320 v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
307 v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, 321 v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
308 11, 29, 12, 13, 30, 14, 15, 31}; 322 11, 29, 12, 13, 30, 14, 15, 31};
309 323
310 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 324 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
311 vec_br, vec_yg); 325 vec_br, vec_yg);
326 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
327 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
312 328
313 for (x = 0; x < width; x += 16) { 329 for (x = 0; x < width; x += 16) {
314 src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); 330 src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
315 data_u = LD(src_u); 331 data_u = LD(src_u);
316 data_v = LD(src_v); 332 data_v = LD(src_v);
317 src1 = (v16u8)__msa_insert_d(zero, 0, data_u); 333 src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
318 src2 = (v16u8)__msa_insert_d(zero, 0, data_v); 334 src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
335 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
319 src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); 336 src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
320 src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 4); 337 src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
321 src5 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); 338 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
322 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 339 vec0, vec1, vec2);
323 vec_br, vec_yg, vec0, vec1, vec2); 340 YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
324 YUVTORGB(src3, src4, src5, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 341 vec3, vec4, vec5);
325 vec_br, vec_yg, vec3, vec4, vec5);
326 reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); 342 reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
327 reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); 343 reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
328 reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); 344 reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
329 reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); 345 reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
330 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); 346 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
331 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); 347 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
332 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); 348 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
333 ST_UB2(dst0, dst1, rgb_buf, 16); 349 ST_UB2(dst0, dst1, rgb_buf, 16);
334 ST_UB(dst2, (rgb_buf + 32)); 350 ST_UB(dst2, (rgb_buf + 32));
335 src_y += 16; 351 src_y += 16;
336 src_u += 8; 352 src_u += 8;
337 src_v += 8; 353 src_v += 8;
338 rgb_buf += 48; 354 rgb_buf += 48;
339 } 355 }
340 } 356 }
341 357
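Review note: the three shufflers splice the interleaved B/G byte pairs and the packed R bytes into 48 bytes of RGB24 for 16 pixels. The target layout per pixel, as a scalar sketch:

    /* RGB24 stores 3 bytes per pixel, blue first in memory: */
    dst[3 * i + 0] = b;
    dst[3 * i + 1] = g;
    dst[3 * i + 2] = r;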
342 // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 358 // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
343 void I422ToRGB565Row_MSA(const uint8* src_y, 359 void I422ToRGB565Row_MSA(const uint8* src_y,
344 const uint8* src_u, 360 const uint8* src_u,
345 const uint8* src_v, 361 const uint8* src_v,
346 uint8* dst_rgb565, 362 uint8* dst_rgb565,
347 const struct YuvConstants* yuvconstants, 363 const struct YuvConstants* yuvconstants,
348 int width) { 364 int width) {
349 int x; 365 int x;
350 v16u8 src0, src1, src2, dst0; 366 v16u8 src0, src1, src2, dst0;
351 v8i16 vec0, vec1, vec2; 367 v8i16 vec0, vec1, vec2;
352 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 368 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
369 v4i32 vec_ubvr, vec_ugvg;
353 370
354 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 371 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
355 vec_br, vec_yg); 372 vec_br, vec_yg);
373 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
374 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
356 375
357 for (x = 0; x < width; x += 8) { 376 for (x = 0; x < width; x += 8) {
358 READYUV422(src_y, src_u, src_v, src0, src1, src2); 377 READYUV422(src_y, src_u, src_v, src0, src1, src2);
359 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 378 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
360 vec_br, vec_yg, vec0, vec2, vec1); 379 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
380 vec0, vec2, vec1);
361 vec0 = __msa_srai_h(vec0, 3); 381 vec0 = __msa_srai_h(vec0, 3);
362 vec1 = __msa_srai_h(vec1, 3); 382 vec1 = __msa_srai_h(vec1, 3);
363 vec2 = __msa_srai_h(vec2, 2); 383 vec2 = __msa_srai_h(vec2, 2);
364 vec1 = __msa_slli_h(vec1, 11); 384 vec1 = __msa_slli_h(vec1, 11);
365 vec2 = __msa_slli_h(vec2, 5); 385 vec2 = __msa_slli_h(vec2, 5);
366 vec0 |= vec1; 386 vec0 |= vec1;
367 dst0 = (v16u8)(vec2 | vec0); 387 dst0 = (v16u8)(vec2 | vec0);
368 ST_UB(dst0, dst_rgb565); 388 ST_UB(dst0, dst_rgb565);
369 src_y += 8; 389 src_y += 8;
370 src_u += 4; 390 src_u += 4;
371 src_v += 4; 391 src_v += 4;
372 dst_rgb565 += 16; 392 dst_rgb565 += 16;
373 } 393 }
374 } 394 }
375 395
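Review note: with out_b in vec0, out_r in vec1, and out_g in vec2, the shift/or sequence produces standard RGB565 with red in the top five bits (the TODO above suggests an AND-based variant). Equivalent scalar packing, as a sketch:

    static inline uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }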
376 // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 396 // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
377 void I422ToARGB4444Row_MSA(const uint8* src_y, 397 void I422ToARGB4444Row_MSA(const uint8* src_y,
378 const uint8* src_u, 398 const uint8* src_u,
379 const uint8* src_v, 399 const uint8* src_v,
380 uint8* dst_argb4444, 400 uint8* dst_argb4444,
381 const struct YuvConstants* yuvconstants, 401 const struct YuvConstants* yuvconstants,
382 int width) { 402 int width) {
383 int x; 403 int x;
384 v16u8 src0, src1, src2, dst0; 404 v16u8 src0, src1, src2, dst0;
385 v8i16 vec0, vec1, vec2; 405 v8i16 vec0, vec1, vec2;
386 v8u16 reg0, reg1, reg2; 406 v8u16 reg0, reg1, reg2;
387 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 407 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
408 v4i32 vec_ubvr, vec_ugvg;
388 v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); 409 v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
389 410
390 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 411 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
391 vec_br, vec_yg); 412 vec_br, vec_yg);
413 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
414 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
392 415
393 for (x = 0; x < width; x += 8) { 416 for (x = 0; x < width; x += 8) {
394 READYUV422(src_y, src_u, src_v, src0, src1, src2); 417 READYUV422(src_y, src_u, src_v, src0, src1, src2);
395 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 418 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
396 vec_br, vec_yg, vec0, vec1, vec2); 419 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
420 vec0, vec1, vec2);
397 reg0 = (v8u16)__msa_srai_h(vec0, 4); 421 reg0 = (v8u16)__msa_srai_h(vec0, 4);
398 reg1 = (v8u16)__msa_srai_h(vec1, 4); 422 reg1 = (v8u16)__msa_srai_h(vec1, 4);
399 reg2 = (v8u16)__msa_srai_h(vec2, 4); 423 reg2 = (v8u16)__msa_srai_h(vec2, 4);
400 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); 424 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
401 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); 425 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
402 reg1 |= const_0xF000; 426 reg1 |= const_0xF000;
403 reg0 |= reg2; 427 reg0 |= reg2;
404 dst0 = (v16u8)(reg1 | reg0); 428 dst0 = (v16u8)(reg1 | reg0);
405 ST_UB(dst0, dst_argb4444); 429 ST_UB(dst0, dst_argb4444);
406 src_y += 8; 430 src_y += 8;
407 src_u += 4; 431 src_u += 4;
408 src_v += 4; 432 src_v += 4;
409 dst_argb4444 += 16; 433 dst_argb4444 += 16;
410 } 434 }
411 } 435 }
412 436
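Review note: here const_0xF000 supplies the opaque 4-bit alpha and each channel keeps its top four bits. Equivalent scalar packing, as a sketch:

    static inline uint16_t PackARGB4444(uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)(0xF000u | ((r >> 4) << 8) | ((g >> 4) << 4) | (b >> 4));
    }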
413 void I422ToARGB1555Row_MSA(const uint8* src_y, 437 void I422ToARGB1555Row_MSA(const uint8* src_y,
414 const uint8* src_u, 438 const uint8* src_u,
415 const uint8* src_v, 439 const uint8* src_v,
416 uint8* dst_argb1555, 440 uint8* dst_argb1555,
417 const struct YuvConstants* yuvconstants, 441 const struct YuvConstants* yuvconstants,
418 int width) { 442 int width) {
419 int x; 443 int x;
420 v16u8 src0, src1, src2, dst0; 444 v16u8 src0, src1, src2, dst0;
421 v8i16 vec0, vec1, vec2; 445 v8i16 vec0, vec1, vec2;
422 v8u16 reg0, reg1, reg2; 446 v8u16 reg0, reg1, reg2;
423 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 447 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
448 v4i32 vec_ubvr, vec_ugvg;
424 v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); 449 v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
425 450
426 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 451 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
427 vec_br, vec_yg); 452 vec_br, vec_yg);
453 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
454 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
428 455
429 for (x = 0; x < width; x += 8) { 456 for (x = 0; x < width; x += 8) {
430 READYUV422(src_y, src_u, src_v, src0, src1, src2); 457 READYUV422(src_y, src_u, src_v, src0, src1, src2);
431 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 458 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
432 vec_br, vec_yg, vec0, vec1, vec2); 459 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
460 vec0, vec1, vec2);
433 reg0 = (v8u16)__msa_srai_h(vec0, 3); 461 reg0 = (v8u16)__msa_srai_h(vec0, 3);
434 reg1 = (v8u16)__msa_srai_h(vec1, 3); 462 reg1 = (v8u16)__msa_srai_h(vec1, 3);
435 reg2 = (v8u16)__msa_srai_h(vec2, 3); 463 reg2 = (v8u16)__msa_srai_h(vec2, 3);
436 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); 464 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
437 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); 465 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
438 reg1 |= const_0x8000; 466 reg1 |= const_0x8000;
439 reg0 |= reg2; 467 reg0 |= reg2;
440 dst0 = (v16u8)(reg1 | reg0); 468 dst0 = (v16u8)(reg1 | reg0);
441 ST_UB(dst0, dst_argb1555); 469 ST_UB(dst0, dst_argb1555);
442 src_y += 8; 470 src_y += 8;
(...skipping 1573 matching lines...)
2016 res1 = __msa_copy_u_d((v2i64)dst0, 1); 2044 res1 = __msa_copy_u_d((v2i64)dst0, 1);
2017 SD(res0, dst_u); 2045 SD(res0, dst_u);
2018 SD(res1, dst_v); 2046 SD(res1, dst_v);
2019 t += 48; 2047 t += 48;
2020 s += 48; 2048 s += 48;
2021 dst_u += 8; 2049 dst_u += 8;
2022 dst_v += 8; 2050 dst_v += 8;
2023 } 2051 }
2024 } 2052 }
2025 2053
2054 void NV12ToARGBRow_MSA(const uint8* src_y,
2055 const uint8* src_uv,
2056 uint8* rgb_buf,
2057 const struct YuvConstants* yuvconstants,
2058 int width) {
2059 int x;
2060 uint64 val0, val1;
2061 v16u8 src0, src1, res0, res1, dst0, dst1;
2062 v8i16 vec0, vec1, vec2;
2063 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
2064 v4i32 vec_ubvr, vec_ugvg;
2065 v16u8 zero = {0};
2066 v16u8 const_255 = (v16u8)__msa_ldi_b(255);
2067
2068 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
2069 vec_br, vec_yg);
2070 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2071 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2072
2073 for (x = 0; x < width; x += 8) {
2074 val0 = LD(src_y);
2075 val1 = LD(src_uv);
2076 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2077 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
2078 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
2079 vec0, vec1, vec2);
2080 res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
2081 res1 = (v16u8)__msa_ilvev_b((v16i8)const_255, (v16i8)vec1);
2082 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
2083 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
2084 ST_UB2(dst0, dst1, rgb_buf, 16);
2085 src_y += 8;
2086 src_uv += 8;
2087 rgb_buf += 32;
2088 }
2089 }
2090
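Review note: NV12 chroma arrives pre-interleaved as {U0, V0, U1, V1, ...}, which is exactly the in_uv layout the reworked YUVTORGB expects, so this row skips the __msa_ilvr_b merge that the I422 paths perform. Scalar fetch for the shared 4:2:0 chroma, as a sketch:

    /* Two horizontally adjacent pixels share one UV byte pair. */
    uint8_t u = src_uv[(i & ~1) + 0];
    uint8_t v = src_uv[(i & ~1) + 1];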
2091 void NV12ToRGB565Row_MSA(const uint8* src_y,
2092 const uint8* src_uv,
2093 uint8* rgb_buf,
2094 const struct YuvConstants* yuvconstants,
2095 int width) {
2096 int x;
2097 uint64 val0, val1;
2098 v16u8 src0, src1, dst0;
2099 v8i16 vec0, vec1, vec2;
2100 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
2101 v4i32 vec_ubvr, vec_ugvg;
2102 v16u8 zero = {0};
2103
2104 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
2105 vec_br, vec_yg);
2106 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2107 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2108
2109 for (x = 0; x < width; x += 8) {
2110 val0 = LD(src_y);
2111 val1 = LD(src_uv);
2112 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2113 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
2114 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
2115 vec0, vec1, vec2);
2116 vec0 = vec0 >> 3;
2117 vec1 = (vec1 >> 2) << 5;
2118 vec2 = (vec2 >> 3) << 11;
2119 dst0 = (v16u8)(vec0 | vec1 | vec2);
2120 ST_UB(dst0, rgb_buf);
2121 src_y += 8;
2122 src_uv += 8;
2123 rgb_buf += 16;
2124 }
2125 }
2126
2127 void NV21ToARGBRow_MSA(const uint8* src_y,
2128 const uint8* src_vu,
2129 uint8* rgb_buf,
2130 const struct YuvConstants* yuvconstants,
2131 int width) {
2132 int x;
2133 uint64 val0, val1;
2134 v16u8 src0, src1, res0, res1, dst0, dst1;
2135 v8i16 vec0, vec1, vec2;
2136 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
2137 v4i32 vec_ubvr, vec_ugvg;
2138 v16u8 const_255 = (v16u8)__msa_ldi_b(255);
2139 v16u8 zero = {0};
2140 v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
2141
2142 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
2143 vec_br, vec_yg);
2144 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
2145 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
2146
2147 for (x = 0; x < width; x += 8) {
2148 val0 = LD(src_y);
2149 val1 = LD(src_vu);
2150 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2151 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
2152 src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
2153 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
2154 vec0, vec1, vec2);
2155 res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
2156 res1 = (v16u8)__msa_ilvev_b((v16i8)const_255, (v16i8)vec1);
2157 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
2158 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
2159 ST_UB2(dst0, dst1, rgb_buf, 16);
2160 src_y += 8;
2161 src_vu += 8;
2162 rgb_buf += 32;
2163 }
2164 }
2165
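Review note: NV21 differs from NV12 only in chroma byte order (VU instead of UV), so the shuffler {1, 0, 3, 2, ...} swaps each byte pair before the shared YUVTORGB runs. Scalar equivalent, as a sketch:

    uint8_t v = src_vu[(i & ~1) + 0];
    uint8_t u = src_vu[(i & ~1) + 1];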
2166 void SobelRow_MSA(const uint8* src_sobelx,
2167 const uint8* src_sobely,
2168 uint8* dst_argb,
2169 int width) {
2170 int x;
2171 v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
2172 v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
2173 v16i8 const_0x4 = __msa_ldi_b(0x4);
2174 v16i8 mask1 = mask0 + const_0x4;
2175 v16i8 mask2 = mask1 + const_0x4;
2176 v16i8 mask3 = mask2 + const_0x4;
2177 v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
2178
2179 for (x = 0; x < width; x += 16) {
2180 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
2181 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
2182 vec0 = __msa_adds_u_b(src0, src1);
2183 dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)const_0xFF, (v16i8)vec0);
2184 dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)const_0xFF, (v16i8)vec0);
2185 dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)const_0xFF, (v16i8)vec0);
2186 dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)const_0xFF, (v16i8)vec0);
2187 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2188 src_sobelx += 16;
2189 src_sobely += 16;
2190 dst_argb += 64;
2191 }
2192 }
2193
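Review note: SobelRow fuses the two gradient planes with a saturating add (__msa_adds_u_b) and broadcasts the result to gray ARGB with opaque alpha. Per-pixel sketch:

    int s = src_sobelx[i] + src_sobely[i];
    uint8_t gray = (uint8_t)(s > 255 ? 255 : s); /* saturating add */
    dst_argb[4 * i + 0] = gray; /* B */
    dst_argb[4 * i + 1] = gray; /* G */
    dst_argb[4 * i + 2] = gray; /* R */
    dst_argb[4 * i + 3] = 255;  /* A */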
2194 void SobelToPlaneRow_MSA(const uint8* src_sobelx,
2195 const uint8* src_sobely,
2196 uint8* dst_y,
2197 int width) {
2198 int x;
2199 v16u8 src0, src1, src2, src3, dst0, dst1;
2200
2201 for (x = 0; x < width; x += 32) {
2202 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
2203 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
2204 src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
2205 src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
2206 dst0 = __msa_adds_u_b(src0, src2);
2207 dst1 = __msa_adds_u_b(src1, src3);
2208 ST_UB2(dst0, dst1, dst_y, 16);
2209 src_sobelx += 32;
2210 src_sobely += 32;
2211 dst_y += 32;
2212 }
2213 }
2214
2215 void SobelXYRow_MSA(const uint8* src_sobelx,
2216 const uint8* src_sobely,
2217 uint8* dst_argb,
2218 int width) {
2219 int x;
2220 v16u8 src0, src1, vec0, vec1, vec2;
2221 v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
2222 v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
2223
2224 for (x = 0; x < width; x += 16) {
2225 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
2226 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
2227 vec0 = __msa_adds_u_b(src0, src1);
2228 vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
2229 vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
2230 reg0 = (v16u8)__msa_ilvr_b((v16i8)const_0xFF, (v16i8)vec0);
2231 reg1 = (v16u8)__msa_ilvl_b((v16i8)const_0xFF, (v16i8)vec0);
2232 dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
2233 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
2234 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
2235 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
2236 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2237 src_sobelx += 16;
2238 src_sobely += 16;
2239 dst_argb += 64;
2240 }
2241 }
2242
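Review note: SobelXYRow keeps the gradients separate, mapping the Y gradient to blue, the X gradient to red, and their saturating sum to green. Per-pixel sketch:

    int s = src_sobelx[i] + src_sobely[i];
    dst_argb[4 * i + 0] = src_sobely[i];                /* B */
    dst_argb[4 * i + 1] = (uint8_t)(s > 255 ? 255 : s); /* G */
    dst_argb[4 * i + 2] = src_sobelx[i];                /* R */
    dst_argb[4 * i + 3] = 255;                          /* A */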
2026 #ifdef __cplusplus 2243 #ifdef __cplusplus
2027 } // extern "C" 2244 } // extern "C"
2028 } // namespace libyuv 2245 } // namespace libyuv
2029 #endif 2246 #endif
2030 2247
2031 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 2248 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)