OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 408 matching lines...) | |
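For orientation, a minimal scalar sketch of the per-pixel operation the NEON loop below vectorizes. This is an annotation, not code from the patch; `blend_pixel_scalar` is a hypothetical name, and it assumes premultiplied 8888 pixels with alpha in bits 24-31.

```c
#include <stdint.h>

/* Hypothetical scalar reference for the blend the NEON loop performs:
 * per byte lane, out = src + ((dst * (256 - src_alpha)) >> 8).
 * 256 - a is the collapsed form of SkAlpha255To256(255 - a) = (255 - a) + 1. */
static uint32_t blend_pixel_scalar(uint32_t src, uint32_t dst) {
    unsigned scale = 256 - (src >> 24);      /* alpha assumed in bits 24-31 */
    uint32_t out = 0;
    for (int shift = 0; shift <= 24; shift += 8) {
        unsigned s = (src >> shift) & 0xFF;
        unsigned d = (dst >> shift) & 0xFF;
        /* mirrors vmulq_u16 / vshrn_n_u16 / vadd_u8; the & 0xFF matches
         * the "ignoring any byte lane overflows" wrap of vadd_u8 */
        out |= ((s + ((d * scale) >> 8)) & 0xFF) << shift;
    }
    return out;
}
```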
419 | 419 |
420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | 420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |
421 alpha_mask = vld1_u8(alpha_mask_setup); | 421 alpha_mask = vld1_u8(alpha_mask_setup); |
422 | 422 |
423 /* do the NEON unrolled code */ | 423 /* do the NEON unrolled code */ |
424 #define UNROLL 4 | 424 #define UNROLL 4 |
425 while (count >= UNROLL) { | 425 while (count >= UNROLL) { |
426 uint8x8_t src_raw, dst_raw, dst_final; | 426 uint8x8_t src_raw, dst_raw, dst_final; |
427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | 427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; |
428 | 428 |
429 __builtin_prefetch(src+32); | |
430 __builtin_prefetch(dst+32); | |
djsollen 2013/07/11 15:24:47
can you put the comment here that this *may* be slower
kevin.petit.not.used.account 2013/07/11 15:42:14
Done.
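For context on the removed lines above, and to record the caveat the review asked for: a sketch of what the dropped hints did. `__builtin_prefetch` is the standard GCC/Clang builtin; `prefetch_ahead` is a hypothetical wrapper for illustration, since the surrounding loop consumes only 16 bytes of src and dst per iteration while the hints reach 128 bytes ahead.

```c
#include <stdint.h>

/* What the dropped lines did (annotation, not the patch): hint the loads
 * 32 pixels (128 bytes) ahead of the current position. On cores whose
 * hardware prefetcher already tracks this streaming pattern the explicit
 * hint buys nothing and can cost an issue slot; on other cores, dropping
 * it *may* be slower. */
static void prefetch_ahead(const uint32_t *src, uint32_t *dst) {
    __builtin_prefetch(src + 32);   /* src is read-only in the loop */
    __builtin_prefetch(dst + 32);   /* dst is read, blended, written back */
}
```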
431 | |
429 /* get the source */ | 432 /* get the source */ |
430 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | 433 src_raw = vreinterpret_u8_u32(vld1_u32(src)); |
431 #if UNROLL > 2 | 434 #if UNROLL > 2 |
432 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | 435 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); |
433 #endif | 436 #endif |
434 | 437 |
435 /* get and hold the dst too */ | 438 /* get and hold the dst too */ |
436 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | 439 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); |
437 #if UNROLL > 2 | 440 #if UNROLL > 2 |
438 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | 441 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); |
439 #endif | 442 #endif |
440 | 443 |
441 /* 1st and 2nd bits of the unrolling */ | 444 /* 1st and 2nd bits of the unrolling */ |
442 { | 445 { |
443 uint8x8_t dst_cooked; | 446 uint8x8_t dst_cooked; |
444 uint16x8_t dst_wide; | 447 uint16x8_t dst_wide; |
445 uint8x8_t alpha_narrow; | 448 uint8x8_t alpha_narrow; |
446 uint16x8_t alpha_wide; | 449 uint16x8_t alpha_wide; |
447 | 450 |
448 /* get the alphas spread out properly */ | 451 /* get the alphas spread out properly */ |
449 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | 452 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); |
450 #if 1 | |
451 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
452 /* we collapsed (255-a)+1 ... */ | |
453 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | 453 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |
454 #else | |
455 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); | |
456 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); | |
457 #endif | |
458 | 454 |
459 /* spread the dest */ | 455 /* spread the dest */ |
460 dst_wide = vmovl_u8(dst_raw); | 456 dst_wide = vmovl_u8(dst_raw); |
461 | 457 |
462 /* alpha mul the dest */ | 458 /* alpha mul the dest */ |
463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | 459 dst_wide = vmulq_u16 (dst_wide, alpha_wide); |
464 dst_cooked = vshrn_n_u16(dst_wide, 8); | 460 dst_cooked = vshrn_n_u16(dst_wide, 8); |
465 | 461 |
466 /* sum -- ignoring any byte lane overflows */ | 462 /* sum -- ignoring any byte lane overflows */ |
467 dst_final = vadd_u8(src_raw, dst_cooked); | 463 dst_final = vadd_u8(src_raw, dst_cooked); |
468 } | 464 } |
469 | 465 |
470 #if UNROLL > 2 | 466 #if UNROLL > 2 |
471 /* the 3rd and 4th bits of our unrolling */ | 467 /* the 3rd and 4th bits of our unrolling */ |
472 { | 468 { |
473 uint8x8_t dst_cooked; | 469 uint8x8_t dst_cooked; |
474 uint16x8_t dst_wide; | 470 uint16x8_t dst_wide; |
475 uint8x8_t alpha_narrow; | 471 uint8x8_t alpha_narrow; |
476 uint16x8_t alpha_wide; | 472 uint16x8_t alpha_wide; |
477 | 473 |
478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | 474 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); |
479 #if 1 | |
480 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
481 /* we collapsed (255-a)+1 ... */ | |
482 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | 475 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |
483 #else | |
484 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); | |
485 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); | |
486 #endif | |
487 | 476 |
488 /* spread the dest */ | 477 /* spread the dest */ |
489 dst_wide = vmovl_u8(dst_raw_2); | 478 dst_wide = vmovl_u8(dst_raw_2); |
490 | 479 |
491 /* alpha mul the dest */ | 480 /* alpha mul the dest */ |
492 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | 481 dst_wide = vmulq_u16 (dst_wide, alpha_wide); |
493 dst_cooked = vshrn_n_u16(dst_wide, 8); | 482 dst_cooked = vshrn_n_u16(dst_wide, 8); |
494 | 483 |
495 /* sum -- ignoring any byte lane overflows */ | 484 /* sum -- ignoring any byte lane overflows */ |
496 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | 485 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); |
(...skipping 792 matching lines...) | |
1289 * case where we do not inspect the src alpha. | 1278 * case where we do not inspect the src alpha. |
1290 */ | 1279 */ |
1291 #if SK_A32_SHIFT == 24 | 1280 #if SK_A32_SHIFT == 24 |
1292 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor | 1281 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor |
1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1282 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1294 #else | 1283 #else |
1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1284 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1296 #endif | 1285 #endif |
1297 S32A_Blend_BlitRow32_arm // S32A_Blend | 1286 S32A_Blend_BlitRow32_arm // S32A_Blend |
1298 }; | 1287 }; |
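A note on the `SK_A32_SHIFT == 24` guard above: the src-alpha proc assumes alpha sits in the top byte, which is also what the `{3,3,3,3,7,7,7,7}` vtbl mask earlier in the file relies on (byte indices 3 and 7 select the high byte of each little-endian 32-bit pixel). A hypothetical illustration, assuming `SK_A32_SHIFT` comes from SkColorPriv.h; `pmcolor_alpha` is not a name from the file.

```c
#include <stdint.h>

#ifndef SK_A32_SHIFT
#define SK_A32_SHIFT 24   /* assumption for a standalone build */
#endif

/* Illustration only: alpha extraction depends on where SkPMColor stores it.
 * The specialized proc is selected only when a single shift suffices. */
static inline unsigned pmcolor_alpha(uint32_t c) {
#if SK_A32_SHIFT == 24
    return c >> 24;                      /* alpha in bits 24-31 */
#else
    return (c >> SK_A32_SHIFT) & 0xFF;   /* general placement */
#endif
}
```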