Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(249)

Side by Side Diff: source/row_msa.cc

Issue 2421843002: Add MSA optimized ARGB4444ToI420 and ARGB4444ToARGB functions (Closed)
Patch Set: Incorporated review comments Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_any.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 203 matching lines...) Expand 10 before | Expand all | Expand 10 after
214 dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0); 214 dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
215 dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0); 215 dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
216 ST_UB(dst0, dst_u); 216 ST_UB(dst0, dst_u);
217 ST_UB(dst1, dst_v); 217 ST_UB(dst1, dst_v);
218 src_uyvy += 64; 218 src_uyvy += 64;
219 dst_u += 16; 219 dst_u += 16;
220 dst_v += 16; 220 dst_v += 16;
221 } 221 }
222 } 222 }
223 223
// ARGBToYRow_MSA: produces one output byte per 4-byte source pixel using MSA
// vector operations.
//
// Each loop iteration reads 64 bytes (16 pixels) from src_argb0 and stores
// 16 bytes to dst_y. For every pixel the result is
//   (byte0 * 0x19 + byte1 * 0x81 + byte2 * 0x42 + 0x1080) >> 8
// and byte3 of the pixel is not used. The largest possible sum is
// 255 * (0x19 + 0x81 + 0x42) + 0x1080 = 60324, so it fits in a 16-bit lane.
//
// NOTE(review): byte0/byte1/byte2 are presumably B/G/R (alpha in byte3), which
// would make this a luma computation -- confirm against the ARGB byte order
// used by the rest of the library.
// NOTE(review): there is no tail handling; the loop always processes whole
// groups of 16 pixels, so width is presumably expected to be a multiple of 16
// (or the buffers padded accordingly) -- confirm with the callers.
224 void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
225 int x;
226 v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
227 v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
228 v16i8 zero = { 0 };
// Per-lane 16-bit multipliers and the additive bias.
// NOTE(review): 0x1080 is built with fill_h rather than ldi_h, presumably
// because it does not fit ldi_h's immediate range -- confirm.
229 v8u16 const_0x19 = (v8u16) __msa_ldi_h(0x19);
230 v8u16 const_0x81 = (v8u16) __msa_ldi_h(0x81);
231 v8u16 const_0x42 = (v8u16) __msa_ldi_h(0x42);
232 v8u16 const_0x1080 = (v8u16) __msa_fill_h(0x1080);
233
234 for (x = 0; x < width; x += 16) {
// Load 16 pixels (4 vectors of 16 bytes).
235 src0 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 0);
236 src1 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 16);
237 src2 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 32);
238 src3 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 48);
// Split even bytes (byte0, byte2 of each pixel) from odd bytes (byte1, byte3).
239 vec0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
240 vec1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
241 vec2 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
242 vec3 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
// Zero-extend to 16 bits: reg0/reg1 = byte0, reg2/reg3 = byte1,
// reg4/reg5 = byte2 (first 8 pixels / last 8 pixels respectively).
243 reg0 = (v8u16) __msa_ilvev_b(zero, (v16i8) vec0);
244 reg1 = (v8u16) __msa_ilvev_b(zero, (v16i8) vec1);
245 reg2 = (v8u16) __msa_ilvev_b(zero, (v16i8) vec2);
246 reg3 = (v8u16) __msa_ilvev_b(zero, (v16i8) vec3);
247 reg4 = (v8u16) __msa_ilvod_b(zero, (v16i8) vec0);
248 reg5 = (v8u16) __msa_ilvod_b(zero, (v16i8) vec1);
// Weighted sum of the three channels plus bias, accumulated in reg0/reg1.
249 reg0 *= const_0x19;
250 reg1 *= const_0x19;
251 reg2 *= const_0x81;
252 reg3 *= const_0x81;
253 reg4 *= const_0x42;
254 reg5 *= const_0x42;
255 reg0 += reg2;
256 reg1 += reg3;
257 reg0 += reg4;
258 reg1 += reg5;
259 reg0 += const_0x1080;
260 reg1 += const_0x1080;
// The shift is arithmetic on an unsigned sum, but only the low byte of each
// shifted lane (original bits 15..8) is kept by the pack below, so the
// sign-filled upper byte is discarded.
261 reg0 = (v8u16) __msa_srai_h((v8i16) reg0, 8);
262 reg1 = (v8u16) __msa_srai_h((v8i16) reg1, 8);
263 dst0 = (v16u8) __msa_pckev_b((v16i8) reg1, (v16i8) reg0);
264 ST_UB(dst0, dst_y);
265 src_argb0 += 64;
266 dst_y += 16;
267 }
268 }
269
// ARGBToUVRow_MSA: reads two rows of 4-byte pixels (src_argb0 and the row
// src_stride_argb bytes after it), averages each 2x2 block of pixels per
// channel, and writes one byte to dst_u and one byte to dst_v per block.
//
// Each loop iteration consumes 128 bytes (32 pixels) from each of the two rows
// and stores 16 bytes to dst_u and 16 bytes to dst_v. With a0, a1, a2 being
// the truncated 2x2 averages ((sum of 4) >> 2) of byte0, byte1, byte2:
//   dst_u = (a0 * 0x70 + 0x8080 - (a1 * 0x4A + a2 * 0x26)) >> 8
//   dst_v = (a2 * 0x70 + 0x8080 - (a1 * 0x5E + a0 * 0x12)) >> 8
// byte3 of each pixel is not used.
//
// NOTE(review): byte0/byte1/byte2 are presumably B/G/R -- confirm against the
// ARGB byte order used by the rest of the library.
// NOTE(review): there is no tail handling; width is presumably expected to be
// a multiple of 32 (or the buffers padded accordingly) -- confirm with callers.
270 void ARGBToUVRow_MSA(const uint8* src_argb0, int src_stride_argb,
fbarchard1 2016/10/19 18:10:33 this is kinda HUGE! But I see it does a 16 byte s
271 uint8* dst_u, uint8* dst_v, int width) {
272 int x;
// Second source row, src_stride_argb bytes after the first.
273 const uint8* src_argb0_next = src_argb0 + src_stride_argb;
274 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
275 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
276 v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
277 v16u8 dst0, dst1;
// Per-lane 16-bit multipliers and the additive bias.
278 v8u16 const_0x70 = (v8u16) __msa_ldi_h(0x70);
279 v8u16 const_0x4A = (v8u16) __msa_ldi_h(0x4A);
280 v8u16 const_0x26 = (v8u16) __msa_ldi_h(0x26);
281 v8u16 const_0x5E = (v8u16) __msa_ldi_h(0x5E);
282 v8u16 const_0x12 = (v8u16) __msa_ldi_h(0x12);
283 v8u16 const_0x8080 = (v8u16) __msa_fill_h(0x8080);
284
285 for (x = 0; x < width; x += 32) {
// First row: load 32 pixels (8 vectors of 16 bytes).
286 src0 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 0);
287 src1 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 16);
288 src2 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 32);
289 src3 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 48);
290 src4 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 64);
291 src5 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 80);
292 src6 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 96);
293 src7 = (v16u8) __msa_ld_b((v16u8*) src_argb0, 112);
// Stage 1: vec0..vec3 = even bytes (byte0, byte2), vec4..vec7 = odd bytes
// (byte1, byte3) of each pixel.
294 vec0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
295 vec1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
296 vec2 = (v16u8) __msa_pckev_b((v16i8) src5, (v16i8) src4);
297 vec3 = (v16u8) __msa_pckev_b((v16i8) src7, (v16i8) src6);
298 vec4 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
299 vec5 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
300 vec6 = (v16u8) __msa_pckod_b((v16i8) src5, (v16i8) src4);
301 vec7 = (v16u8) __msa_pckod_b((v16i8) src7, (v16i8) src6);
// Stage 2: fully de-interleave into planes of 16 bytes each:
// vec8/vec9 = byte0, vec4/vec5 = byte1, vec0/vec1 = byte2.
// vec8/vec9 must be computed before vec0/vec1 are overwritten, and
// vec4/vec5 are overwritten in an order that keeps their inputs intact.
302 vec8 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
303 vec9 = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
304 vec4 = (v16u8) __msa_pckev_b((v16i8) vec5, (v16i8) vec4);
305 vec5 = (v16u8) __msa_pckev_b((v16i8) vec7, (v16i8) vec6);
306 vec0 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0);
307 vec1 = (v16u8) __msa_pckod_b((v16i8) vec3, (v16i8) vec2);
// Sum horizontally adjacent pixel pairs into 16-bit lanes:
// reg0/reg1 = byte0 sums, reg2/reg3 = byte1 sums, reg4/reg5 = byte2 sums.
308 reg0 = __msa_hadd_u_h(vec8, vec8);
309 reg1 = __msa_hadd_u_h(vec9, vec9);
310 reg2 = __msa_hadd_u_h(vec4, vec4);
311 reg3 = __msa_hadd_u_h(vec5, vec5);
312 reg4 = __msa_hadd_u_h(vec0, vec0);
313 reg5 = __msa_hadd_u_h(vec1, vec1);
// Second row: same load / de-interleave sequence as above.
314 src0 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 0);
315 src1 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 16);
316 src2 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 32);
317 src3 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 48);
318 src4 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 64);
319 src5 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 80);
320 src6 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 96);
321 src7 = (v16u8) __msa_ld_b((v16u8*) src_argb0_next, 112);
322 vec0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
323 vec1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
324 vec2 = (v16u8) __msa_pckev_b((v16i8) src5, (v16i8) src4);
325 vec3 = (v16u8) __msa_pckev_b((v16i8) src7, (v16i8) src6);
326 vec4 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
327 vec5 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
328 vec6 = (v16u8) __msa_pckod_b((v16i8) src5, (v16i8) src4);
329 vec7 = (v16u8) __msa_pckod_b((v16i8) src7, (v16i8) src6);
330 vec8 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
331 vec9 = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
332 vec4 = (v16u8) __msa_pckev_b((v16i8) vec5, (v16i8) vec4);
333 vec5 = (v16u8) __msa_pckev_b((v16i8) vec7, (v16i8) vec6);
334 vec0 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0);
335 vec1 = (v16u8) __msa_pckod_b((v16i8) vec3, (v16i8) vec2);
// Accumulate the second row's pair sums, giving a 2x2 sum per channel.
336 reg0 += __msa_hadd_u_h(vec8, vec8);
337 reg1 += __msa_hadd_u_h(vec9, vec9);
338 reg2 += __msa_hadd_u_h(vec4, vec4);
339 reg3 += __msa_hadd_u_h(vec5, vec5);
340 reg4 += __msa_hadd_u_h(vec0, vec0);
341 reg5 += __msa_hadd_u_h(vec1, vec1);
// Divide by 4 (truncating, no rounding term) to get the 2x2 averages.
342 reg0 = (v8u16) __msa_srai_h((v8i16) reg0, 2);
343 reg1 = (v8u16) __msa_srai_h((v8i16) reg1, 2);
344 reg2 = (v8u16) __msa_srai_h((v8i16) reg2, 2);
345 reg3 = (v8u16) __msa_srai_h((v8i16) reg3, 2);
346 reg4 = (v8u16) __msa_srai_h((v8i16) reg4, 2);
347 reg5 = (v8u16) __msa_srai_h((v8i16) reg5, 2);
// dst_u terms: reg6/reg7 = a0 * 0x70 + 0x8080, reg8/reg9 = a1 * 0x4A + a2 * 0x26.
// These must be formed before reg0..reg5 are scaled in place below.
348 reg6 = reg0 * const_0x70;
349 reg7 = reg1 * const_0x70;
350 reg8 = reg2 * const_0x4A;
351 reg9 = reg3 * const_0x4A;
352 reg6 += const_0x8080;
353 reg7 += const_0x8080;
354 reg8 += reg4 * const_0x26;
355 reg9 += reg5 * const_0x26;
// dst_v terms, reusing reg0..reg5 in place:
// reg4/reg5 = a2 * 0x70 + 0x8080, reg2/reg3 = a1 * 0x5E + a0 * 0x12.
356 reg0 *= const_0x12;
357 reg1 *= const_0x12;
358 reg2 *= const_0x5E;
359 reg3 *= const_0x5E;
360 reg4 *= const_0x70;
361 reg5 *= const_0x70;
362 reg2 += reg0;
363 reg3 += reg1;
364 reg4 += const_0x8080;
365 reg5 += const_0x8080;
// Subtract the negative-weighted terms: reg6/reg7 -> U, reg4/reg5 -> V.
366 reg6 -= reg8;
367 reg7 -= reg9;
368 reg4 -= reg2;
369 reg5 -= reg3;
// Shift by 8 and keep the low byte of each lane (original bits 15..8); the
// pack below drops whatever the arithmetic shift put in the upper byte.
370 reg6 = (v8u16) __msa_srai_h((v8i16) reg6, 8);
371 reg7 = (v8u16) __msa_srai_h((v8i16) reg7, 8);
372 reg4 = (v8u16) __msa_srai_h((v8i16) reg4, 8);
373 reg5 = (v8u16) __msa_srai_h((v8i16) reg5, 8);
374 dst0 = (v16u8) __msa_pckev_b((v16i8) reg7, (v16i8) reg6);
375 dst1 = (v16u8) __msa_pckev_b((v16i8) reg5, (v16i8) reg4);
376 ST_UB(dst0, dst_u);
377 ST_UB(dst1, dst_v);
378 src_argb0 += 128;
379 src_argb0_next += 128;
380 dst_u += 16;
381 dst_v += 16;
382 }
383 }
384
// ARGB4444ToARGBRow_MSA: expands every 4-bit nibble of the source into a full
// byte by replicating it into both halves of the byte (nibble n -> n * 0x11).
//
// Each loop iteration reads 32 bytes (16 two-byte pixels) from src_argb4444
// and writes 64 bytes (16 four-byte pixels) to dst_argb. Every source byte
// yields two output bytes: first the expanded low nibble, then the expanded
// high nibble.
//
// NOTE(review): there is no tail handling; width is presumably expected to be
// a multiple of 16 (or the buffers padded accordingly) -- confirm with callers.
385 void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, uint8* dst_argb,
386 int width) {
387 int x;
388 v16u8 src0, src1;
389 v8u16 vec0, vec1, vec2, vec3;
390 v16u8 dst0, dst1, dst2, dst3;
391
392 for (x = 0; x < width; x += 16) {
393 src0 = (v16u8) __msa_ld_b((v16u8*) src_argb4444, 0);
394 src1 = (v16u8) __msa_ld_b((v16u8*) src_argb4444, 16);
// Isolate the low nibble (vec0/vec1) and high nibble (vec2/vec3) of each byte.
395 vec0 = (v8u16) __msa_andi_b(src0, 0x0F);
396 vec1 = (v8u16) __msa_andi_b(src1, 0x0F);
397 vec2 = (v8u16) __msa_andi_b(src0, 0xF0);
398 vec3 = (v8u16) __msa_andi_b(src1, 0xF0);
// Replicate each nibble into the other half of its byte.
399 vec0 |= (v8u16) __msa_slli_b((v16i8) vec0, 4);
400 vec1 |= (v8u16) __msa_slli_b((v16i8) vec1, 4);
401 vec2 |= (v8u16) __msa_srli_b((v16i8) vec2, 4);
402 vec3 |= (v8u16) __msa_srli_b((v16i8) vec3, 4);
// Interleave byte-wise so each source byte becomes (low-nibble byte,
// high-nibble byte); ilvr covers the lower 8 source bytes, ilvl the upper 8.
403 dst0 = (v16u8) __msa_ilvr_b((v16i8) vec2, (v16i8) vec0);
404 dst1 = (v16u8) __msa_ilvl_b((v16i8) vec2, (v16i8) vec0);
405 dst2 = (v16u8) __msa_ilvr_b((v16i8) vec3, (v16i8) vec1);
406 dst3 = (v16u8) __msa_ilvl_b((v16i8) vec3, (v16i8) vec1);
// NOTE(review): ST_UB4 is defined elsewhere; presumably it stores the four
// vectors at dst_argb with a 16-byte step -- confirm.
407 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
408 src_argb4444 += 32;
409 dst_argb += 64;
410 }
411 }
412
224 #ifdef __cplusplus 413 #ifdef __cplusplus
225 } // extern "C" 414 } // extern "C"
226 } // namespace libyuv 415 } // namespace libyuv
227 #endif 416 #endif
228 417
229 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 418 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
OLDNEW
« no previous file with comments | « source/row_any.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698