OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 408 matching lines...) | |
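For orientation, a minimal scalar sketch of the per-pixel operation the NEON loop below vectorizes. This is an annotation, not code from the patch; `blend_pixel_scalar` is a hypothetical name, and it assumes premultiplied 8888 pixels with alpha in bits 24-31.

```c
#include <stdint.h>

/* Hypothetical scalar reference for the blend the NEON loop performs:
 * per byte lane, out = src + ((dst * (256 - src_alpha)) >> 8).
 * 256 - a is the collapsed form of SkAlpha255To256(255 - a) = (255 - a) + 1. */
static uint32_t blend_pixel_scalar(uint32_t src, uint32_t dst) {
    unsigned scale = 256 - (src >> 24);      /* alpha assumed in bits 24-31 */
    uint32_t out = 0;
    for (int shift = 0; shift <= 24; shift += 8) {
        unsigned s = (src >> shift) & 0xFF;
        unsigned d = (dst >> shift) & 0xFF;
        /* mirrors vmulq_u16 / vshrn_n_u16 / vadd_u8; the & 0xFF matches
         * the "ignoring any byte lane overflows" wrap of vadd_u8 */
        out |= ((s + ((d * scale) >> 8)) & 0xFF) << shift;
    }
    return out;
}
```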
419 | 419 |
420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | 420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |
421 alpha_mask = vld1_u8(alpha_mask_setup); | 421 alpha_mask = vld1_u8(alpha_mask_setup); |
422 | 422 |
423 /* do the NEON unrolled code */ | 423 /* do the NEON unrolled code */ |
424 #define UNROLL 4 | 424 #define UNROLL 4 |
425 while (count >= UNROLL) { | 425 while (count >= UNROLL) { |
426 uint8x8_t src_raw, dst_raw, dst_final; | 426 uint8x8_t src_raw, dst_raw, dst_final; |
427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | 427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; |
428 | 428 |
429 __builtin_prefetch(src+32); | |
430 __builtin_prefetch(dst+32); | |
djsollen 2013/07/11 15:24:47
can you put the comment here that this *may* be slower
kevin.petit.not.used.account 2013/07/11 15:42:14
Done.
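For context on the removed lines above, and to record the caveat the review asked for: a sketch of what the dropped hints did. `__builtin_prefetch` is the standard GCC/Clang builtin; `prefetch_ahead` is a hypothetical wrapper for illustration, since the surrounding loop consumes only 16 bytes of src and dst per iteration while the hints reach 128 bytes ahead.

```c
#include <stdint.h>

/* What the dropped lines did (annotation, not the patch): hint the loads
 * 32 pixels (128 bytes) ahead of the current position. On cores whose
 * hardware prefetcher already tracks this streaming pattern the explicit
 * hint buys nothing and can cost an issue slot; on other cores, dropping
 * it *may* be slower. */
static void prefetch_ahead(const uint32_t *src, uint32_t *dst) {
    __builtin_prefetch(src + 32);   /* src is read-only in the loop */
    __builtin_prefetch(dst + 32);   /* dst is read, blended, written back */
}
```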
431 | |
429 /* get the source */ | 432 /* get the source */ |
430 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | 433 src_raw = vreinterpret_u8_u32(vld1_u32(src)); |
431 #if UNROLL > 2 | 434 #if UNROLL > 2 |
432 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | 435 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); |
433 #endif | 436 #endif |
434 | 437 |
435 /* get and hold the dst too */ | 438 /* get and hold the dst too */ |
436 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | 439 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); |
437 #if UNROLL > 2 | 440 #if UNROLL > 2 |
438 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | 441 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); |
439 #endif | 442 #endif |
440 | 443 |
441 /* 1st and 2nd bits of the unrolling */ | 444 /* 1st and 2nd bits of the unrolling */ |
442 { | 445 { |
443 uint8x8_t dst_cooked; | 446 uint8x8_t dst_cooked; |
444 uint16x8_t dst_wide; | 447 uint16x8_t dst_wide; |
445 uint8x8_t alpha_narrow; | 448 uint8x8_t alpha_narrow; |
446 uint16x8_t alpha_wide; | 449 uint16x8_t alpha_wide; |
447 | 450 |
448 /* get the alphas spread out properly */ | 451 /* get the alphas spread out properly */ |
449 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | 452 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); |
450 #if 1 | |
451 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
452 /* we collapsed (255-a)+1 ... */ | |
453 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | 453 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |
454 #else | |
455 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); | |
456 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); | |
457 #endif | |
458 | 454 |
459 /* spread the dest */ | 455 /* spread the dest */ |
460 dst_wide = vmovl_u8(dst_raw); | 456 dst_wide = vmovl_u8(dst_raw); |
461 | 457 |
462 /* alpha mul the dest */ | 458 /* alpha mul the dest */ |
463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | 459 dst_wide = vmulq_u16 (dst_wide, alpha_wide); |
464 dst_cooked = vshrn_n_u16(dst_wide, 8); | 460 dst_cooked = vshrn_n_u16(dst_wide, 8); |
465 | 461 |
466 /* sum -- ignoring any byte lane overflows */ | 462 /* sum -- ignoring any byte lane overflows */ |
467 dst_final = vadd_u8(src_raw, dst_cooked); | 463 dst_final = vadd_u8(src_raw, dst_cooked); |
468 } | 464 } |
469 | 465 |
470 #if UNROLL > 2 | 466 #if UNROLL > 2 |
471 /* the 3rd and 4th bits of our unrolling */ | 467 /* the 3rd and 4th bits of our unrolling */ |
472 { | 468 { |
473 uint8x8_t dst_cooked; | 469 uint8x8_t dst_cooked; |
474 uint16x8_t dst_wide; | 470 uint16x8_t dst_wide; |
475 uint8x8_t alpha_narrow; | 471 uint8x8_t alpha_narrow; |
476 uint16x8_t alpha_wide; | 472 uint16x8_t alpha_wide; |
477 | 473 |
478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | 474 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); |
479 #if 1 | |
480 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
481 /* we collapsed (255-a)+1 ... */ | |
482 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | 475 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |
483 #else | |
484 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); | |
485 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); | |
486 #endif | |
487 | 476 |
488 /* spread the dest */ | 477 /* spread the dest */ |
489 dst_wide = vmovl_u8(dst_raw_2); | 478 dst_wide = vmovl_u8(dst_raw_2); |
490 | 479 |
491 /* alpha mul the dest */ | 480 /* alpha mul the dest */ |
492 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | 481 dst_wide = vmulq_u16 (dst_wide, alpha_wide); |
493 dst_cooked = vshrn_n_u16(dst_wide, 8); | 482 dst_cooked = vshrn_n_u16(dst_wide, 8); |
494 | 483 |
495 /* sum -- ignoring any byte lane overflows */ | 484 /* sum -- ignoring any byte lane overflows */ |
496 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | 485 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); |
(...skipping 792 matching lines...) | |
1289 * case where we do not inspect the src alpha. | 1278 * case where we do not inspect the src alpha. |
1290 */ | 1279 */ |
1291 #if SK_A32_SHIFT == 24 | 1280 #if SK_A32_SHIFT == 24 |
1292 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor | 1281 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor |
1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1282 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1294 #else | 1283 #else |
1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1284 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1296 #endif | 1285 #endif |
1297 S32A_Blend_BlitRow32_arm // S32A_Blend | 1286 S32A_Blend_BlitRow32_arm // S32A_Blend |
1298 }; | 1287 }; |
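A note on the `SK_A32_SHIFT == 24` guard above: the src-alpha proc assumes alpha sits in the top byte, which is also what the `{3,3,3,3,7,7,7,7}` vtbl mask earlier in the file relies on (byte indices 3 and 7 select the high byte of each little-endian 32-bit pixel). A hypothetical illustration, assuming `SK_A32_SHIFT` comes from SkColorPriv.h; `pmcolor_alpha` is not a name from the file.

```c
#include <stdint.h>

#ifndef SK_A32_SHIFT
#define SK_A32_SHIFT 24   /* assumption for a standalone build */
#endif

/* Illustration only: alpha extraction depends on where SkPMColor stores it.
 * The specialized proc is selected only when a single shift suffices. */
static inline unsigned pmcolor_alpha(uint32_t c) {
#if SK_A32_SHIFT == 24
    return c >> 24;                      /* alpha in bits 24-31 */
#else
    return (c >> SK_A32_SHIFT) & 0xFF;   /* general placement */
#endif
}
```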