Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| 11 #include "SkBlitRow.h" | 11 #include "SkBlitRow.h" |
| 12 #include "SkColorPriv.h" | 12 #include "SkColorPriv.h" |
| 13 #include "SkDither.h" | 13 #include "SkDither.h" |
| 14 #include "SkMathPriv.h" | 14 #include "SkMathPriv.h" |
| 15 #include "SkUtils.h" | 15 #include "SkUtils.h" |
| 16 | 16 |
| 17 #include "SkColor_opts_neon.h" | 17 #include "SkColor_opts_neon.h" |
| 18 #include <arm_neon.h> | 18 #include <arm_neon.h> |
| 19 | 19 |
| 20 #ifdef SK_CPU_ARM32 | 20 #ifdef SK_CPU_ARM64 |
| 21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) { | |
| 22 uint8x8x4_t vsrc; | |
| 23 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
| 24 | |
| 25 asm ( | |
| 26 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" | |
| 27 "mov %[vsrc0].8b, v0.8b \t\n" | |
| 28 "mov %[vsrc1].8b, v1.8b \t\n" | |
| 29 "mov %[vsrc2].8b, v2.8b \t\n" | |
| 30 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
| 31 [vsrc2] "=w" (vsrc_2), [src] "+&r" (src) | |
| 32 : : "v0", "v1", "v2", "v3" | |
| 33 ); | |
| 34 | |
| 35 vsrc.val[0] = vsrc_0; | |
| 36 vsrc.val[1] = vsrc_1; | |
| 37 vsrc.val[2] = vsrc_2; | |
| 38 | |
| 39 return vsrc; | |
| 40 } | |
| 41 | |
| 42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) { | |
| 43 uint8x8x4_t vsrc; | |
| 44 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3; | |
| 45 | |
| 46 asm ( | |
| 47 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" | |
| 48 "mov %[vsrc0].8b, v0.8b \t\n" | |
| 49 "mov %[vsrc1].8b, v1.8b \t\n" | |
| 50 "mov %[vsrc2].8b, v2.8b \t\n" | |
| 51 "mov %[vsrc3].8b, v3.8b \t\n" | |
| 52 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
| 53 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3), | |
| 54 [src] "+&r" (src) | |
| 55 : : "v0", "v1", "v2", "v3" | |
| 56 ); | |
| 57 | |
| 58 vsrc.val[0] = vsrc_0; | |
| 59 vsrc.val[1] = vsrc_1; | |
| 60 vsrc.val[2] = vsrc_2; | |
| 61 vsrc.val[3] = vsrc_3; | |
| 62 | |
| 63 return vsrc; | |
| 64 } | |
| 65 #endif | |
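
The new helpers above de-interleave eight 32-bit pixels into four planar 8-byte registers and advance the source pointer, which is also what the `vld4_u8` intrinsic expresses. A minimal sketch of the equivalent, assuming a toolchain where the intrinsic compiles well on AArch64 (the patch uses inline asm instead, presumably to pin the codegen and fold the pointer post-increment into the `ld4`); the function name is illustrative, not part of the patch:

```cpp
#include <arm_neon.h>
#include <stdint.h>

// De-interleave 8 RGBA pixels (32 bytes) into four 8-byte planes:
// val[0] = byte 0 of every pixel, val[1] = byte 1, and so on.
static inline uint8x8x4_t load_8_pixels_deinterleaved(const uint32_t*& src) {
    uint8x8x4_t v = vld4_u8(reinterpret_cast<const uint8_t*>(src));
    src += 8;  // the asm helpers fold this into "ld4 ..., [%[src]], #32"
    return v;
}
```
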
| 66 | |
| 21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| 22 const SkPMColor* SK_RESTRICT src, int count, | 68 const SkPMColor* SK_RESTRICT src, int count, |
| 23 U8CPU alpha, int /*x*/, int /*y*/) { | 69 U8CPU alpha, int /*x*/, int /*y*/) { |
| 24 SkASSERT(255 == alpha); | 70 SkASSERT(255 == alpha); |
| 25 | 71 |
| 26 while (count >= 8) { | 72 while (count >= 8) { |
| 27 uint8x8x4_t vsrc; | 73 uint8x8x4_t vsrc; |
| 28 uint16x8_t vdst; | 74 uint16x8_t vdst; |
| 29 | 75 |
| 30 // Load | 76 // Load |
| 77 #ifdef SK_CPU_ARM64 | |
| 78 vsrc = sk_vld4_u8_arm64_3(src); | |
| 79 #else | |
| 31 vsrc = vld4_u8((uint8_t*)src); | 80 vsrc = vld4_u8((uint8_t*)src); |
| 81 src += 8; | |
| 82 #endif | |
| 32 | 83 |
| 33 // Convert src to 565 | 84 // Convert src to 565 |
| 34 vdst = SkPixel32ToPixel16_neon8(vsrc); | 85 vdst = SkPixel32ToPixel16_neon8(vsrc); |
| 35 | 86 |
| 36 // Store | 87 // Store |
| 37 vst1q_u16(dst, vdst); | 88 vst1q_u16(dst, vdst); |
| 38 | 89 |
| 39 // Prepare next iteration | 90 // Prepare next iteration |
| 40 dst += 8; | 91 dst += 8; |
| 41 src += 8; | |
| 42 count -= 8; | 92 count -= 8; |
| 43 }; | 93 }; |
| 44 | 94 |
| 45 // Leftovers | 95 // Leftovers |
| 46 while (count > 0) { | 96 while (count > 0) { |
| 47 SkPMColor c = *src++; | 97 SkPMColor c = *src++; |
| 48 SkPMColorAssert(c); | 98 SkPMColorAssert(c); |
| 49 *dst = SkPixel32ToPixel16_ToU16(c); | 99 *dst = SkPixel32ToPixel16_ToU16(c); |
| 50 dst++; | 100 dst++; |
| 51 count--; | 101 count--; |
| 52 }; | 102 }; |
| 53 } | 103 } |
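
The vector loop above is eight copies of a simple bit-packing step. A scalar model of what `SkPixel32ToPixel16_neon8` computes per pixel, keeping the top 5/6/5 bits of each channel (channel extraction order is illustrative; the real code selects lanes via `NEON_R`/`NEON_G`/`NEON_B` according to `SK_PMCOLOR_BYTE_ORDER`):

```cpp
#include <stdint.h>

// Pack 8-bit R/G/B into a 16-bit 565 pixel by truncating each channel.
static inline uint16_t pack_8888_to_565(uint8_t r, uint8_t g, uint8_t b) {
    return (uint16_t)(((r >> 3) << 11) |  // 5 bits of red at the top
                      ((g >> 2) << 5)  |  // 6 bits of green in the middle
                      (b >> 3));          // 5 bits of blue at the bottom
}
```
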
| 54 | 104 |
| 55 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| 56 const SkPMColor* SK_RESTRICT src, int count, | 106 const SkPMColor* SK_RESTRICT src, int count, |
| 57 U8CPU alpha, int /*x*/, int /*y*/) { | 107 U8CPU alpha, int /*x*/, int /*y*/) { |
| 58 SkASSERT(255 > alpha); | 108 SkASSERT(255 > alpha); |
| 59 | 109 |
| 60 uint16x8_t vmask_blue, vscale; | 110 uint16x8_t vmask_blue, vscale; |
| 61 | 111 |
| 62 // prepare constants | 112 // prepare constants |
| 63 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); | 113 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); |
| 64 vmask_blue = vmovq_n_u16(0x1F); | 114 vmask_blue = vmovq_n_u16(0x1F); |
| 65 | 115 |
| 66 while (count >= 8) { | 116 while (count >= 8) { |
| 117 uint8x8x4_t vsrc; | |
| 67 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 118 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
| 68 uint16x8_t vres_r, vres_g, vres_b; | 119 uint16x8_t vres_r, vres_g, vres_b; |
| 69 uint8x8_t vsrc_r, vsrc_g, vsrc_b; | |
| 70 | 120 |
| 71 // Load src | 121 // Load src |
| 122 #ifdef SK_CPU_ARM64 | |
| 123 vsrc = sk_vld4_u8_arm64_3(src); | |
| 124 #else | |
| 72 { | 125 { |
| 73 register uint8x8_t d0 asm("d0"); | 126 register uint8x8_t d0 asm("d0"); |
| 74 register uint8x8_t d1 asm("d1"); | 127 register uint8x8_t d1 asm("d1"); |
| 75 register uint8x8_t d2 asm("d2"); | 128 register uint8x8_t d2 asm("d2"); |
| 76 register uint8x8_t d3 asm("d3"); | 129 register uint8x8_t d3 asm("d3"); |
| 77 | 130 |
| 78 asm ( | 131 asm ( |
| 79 "vld4.8 {d0-d3},[%[src]]!" | 132 "vld4.8 {d0-d3},[%[src]]!" |
| 80 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 133 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| 81 : | 134 : |
| 82 ); | 135 ); |
| 83 vsrc_g = d1; | 136 vsrc.val[0] = d0; |
| 84 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 137 vsrc.val[1] = d1; |
| 85 vsrc_r = d2; vsrc_b = d0; | 138 vsrc.val[2] = d2; |
| 86 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 139 } |
| 87 vsrc_r = d0; vsrc_b = d2; | |
| 88 #endif | 140 #endif |
| 89 } | |
| 90 | 141 |
| 91 // Load and unpack dst | 142 // Load and unpack dst |
| 92 vdst = vld1q_u16(dst); | 143 vdst = vld1q_u16(dst); |
| 93 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes | 144 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes |
| 94 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue | 145 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue |
| 95 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red | 146 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red |
| 96 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green | 147 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green |
| 97 | 148 |
| 98 // Shift src to 565 | 149 // Shift src to 565 range |
| 99 vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range | 150 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3); |
| 100 vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range | 151 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2); |
| 101 vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range | 152 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3); |
| 102 | 153 |
| 103 // Scale src - dst | 154 // Scale src - dst |
| 104 vres_r = vmovl_u8(vsrc_r) - vdst_r; | 155 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r; |
| 105 vres_g = vmovl_u8(vsrc_g) - vdst_g; | 156 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g; |
| 106 vres_b = vmovl_u8(vsrc_b) - vdst_b; | 157 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b; |
| 107 | 158 |
| 108 vres_r = vshrq_n_u16(vres_r * vscale, 8); | 159 vres_r = vshrq_n_u16(vres_r * vscale, 8); |
| 109 vres_g = vshrq_n_u16(vres_g * vscale, 8); | 160 vres_g = vshrq_n_u16(vres_g * vscale, 8); |
| 110 vres_b = vshrq_n_u16(vres_b * vscale, 8); | 161 vres_b = vshrq_n_u16(vres_b * vscale, 8); |
| 111 | 162 |
| 112 vres_r += vdst_r; | 163 vres_r += vdst_r; |
| 113 vres_g += vdst_g; | 164 vres_g += vdst_g; |
| 114 vres_b += vdst_b; | 165 vres_b += vdst_b; |
| 115 | 166 |
| 116 // Combine | 167 // Combine |
| (...skipping 12 matching lines...) | |
| 129 SkPMColorAssert(c); | 180 SkPMColorAssert(c); |
| 130 uint16_t d = *dst; | 181 uint16_t d = *dst; |
| 131 *dst++ = SkPackRGB16( | 182 *dst++ = SkPackRGB16( |
| 132 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), | 183 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), |
| 133 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), | 184 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), |
| 134 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); | 185 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); |
| 135 } while (--count != 0); | 186 } while (--count != 0); |
| 136 } | 187 } |
| 137 } | 188 } |
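
Both the vector body and the scalar leftovers of `S32_D565_Blend_neon` compute the same per-channel lerp. A minimal scalar sketch: with `scale = SkAlpha255To256(alpha) = alpha + 1` (range 1..256), each 565-width channel moves from dst toward src. The NEON code reaches the identical result through modular uint16 arithmetic; the signed form below is just easier to read:

```cpp
// src and dst are already reduced to the same 565 channel width.
static inline unsigned blend_565_channel(unsigned src, unsigned dst,
                                         unsigned scale /* 1..256 */) {
    int diff = (int)src - (int)dst;           // may be negative
    return dst + ((diff * (int)scale) >> 8);  // dst + (src - dst) * scale / 256
}
```
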
| 138 | 189 |
| 190 #ifdef SK_CPU_ARM32 | |
| 139 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| 140 const SkPMColor* SK_RESTRICT src, int count, | 192 const SkPMColor* SK_RESTRICT src, int count, |
| 141 U8CPU alpha, int /*x*/, int /*y*/) { | 193 U8CPU alpha, int /*x*/, int /*y*/) { |
| 142 SkASSERT(255 == alpha); | 194 SkASSERT(255 == alpha); |
| 143 | 195 |
| 144 if (count >= 8) { | 196 if (count >= 8) { |
| 145 uint16_t* SK_RESTRICT keep_dst = 0; | 197 uint16_t* SK_RESTRICT keep_dst = 0; |
| 146 | 198 |
| 147 asm volatile ( | 199 asm volatile ( |
| 148 "ands ip, %[count], #7 \n\t" | 200 "ands ip, %[count], #7 \n\t" |
| (...skipping 157 matching lines...) | |
| 306 | 358 |
| 307 "21: \n\t" | 359 "21: \n\t" |
| 308 : [count] "+r" (count) | 360 : [count] "+r" (count) |
| 309 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) | 361 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) |
| 310 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", | 362 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", |
| 311 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", | 363 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", |
| 312 "d30","d31" | 364 "d30","d31" |
| 313 ); | 365 ); |
| 314 } | 366 } |
| 315 } | 367 } |
| 368 #endif | |
| 316 | 369 |
| 317 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { | 370 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { |
| 318 prod += vdupq_n_u16(128); | 371 prod += vdupq_n_u16(128); |
| 319 prod += vshrq_n_u16(prod, 8); | 372 prod += vshrq_n_u16(prod, 8); |
| 320 return vshrq_n_u16(prod, 8); | 373 return vshrq_n_u16(prod, 8); |
| 321 } | 374 } |
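
`SkDiv255Round_neon8` is the standard branch-free rounded divide by 255, applied to eight lanes at once. A scalar equivalent for one product of two 8-bit values:

```cpp
// Exact round(x / 255) for x in [0, 255*255]: adding 128 rounds, and the
// extra (x >> 8) term corrects the x/256 approximation up to x/255.
static inline unsigned div255_round(unsigned prod) {
    prod += 128;
    return (prod + (prod >> 8)) >> 8;
}
```
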
| 322 | 375 |
| 323 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 376 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| 324 const SkPMColor* SK_RESTRICT src, int count, | 377 const SkPMColor* SK_RESTRICT src, int count, |
| 325 U8CPU alpha, int /*x*/, int /*y*/) { | 378 U8CPU alpha, int /*x*/, int /*y*/) { |
| (...skipping 13 matching lines...) | |
| 339 valpha = vdup_n_u8(alpha); | 392 valpha = vdup_n_u8(alpha); |
| 340 vmask_blue = vmovq_n_u16(SK_B16_MASK); | 393 vmask_blue = vmovq_n_u16(SK_B16_MASK); |
| 341 | 394 |
| 342 do { | 395 do { |
| 343 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 396 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
| 344 uint16x8_t vres_a, vres_r, vres_g, vres_b; | 397 uint16x8_t vres_a, vres_r, vres_g, vres_b; |
| 345 uint8x8x4_t vsrc; | 398 uint8x8x4_t vsrc; |
| 346 | 399 |
| 347 // load pixels | 400 // load pixels |
| 348 vdst = vld1q_u16(dst); | 401 vdst = vld1q_u16(dst); |
| 402 #ifdef SK_CPU_ARM64 | |
| 403 vsrc = sk_vld4_u8_arm64_4(src); | |
| 404 #else | |
| 349 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 405 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
> **mtklein** 2014/06/06 14:41:17
> Think it makes sense to follow up and do the same
| 350 asm ( | 406 asm ( |
| 351 "vld4.u8 %h[vsrc], [%[src]]!" | 407 "vld4.u8 %h[vsrc], [%[src]]!" |
| 352 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 408 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
| 353 : : | 409 : : |
| 354 ); | 410 ); |
| 355 #else | 411 #else |
| 356 register uint8x8_t d0 asm("d0"); | 412 register uint8x8_t d0 asm("d0"); |
| 357 register uint8x8_t d1 asm("d1"); | 413 register uint8x8_t d1 asm("d1"); |
| 358 register uint8x8_t d2 asm("d2"); | 414 register uint8x8_t d2 asm("d2"); |
| 359 register uint8x8_t d3 asm("d3"); | 415 register uint8x8_t d3 asm("d3"); |
| 360 | 416 |
| 361 asm volatile ( | 417 asm volatile ( |
| 362 "vld4.u8 {d0-d3},[%[src]]!;" | 418 "vld4.u8 {d0-d3},[%[src]]!;" |
| 363 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 419 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
| 364 [src] "+&r" (src) | 420 [src] "+&r" (src) |
| 365 : : | 421 : : |
| 366 ); | 422 ); |
| 367 vsrc.val[0] = d0; | 423 vsrc.val[0] = d0; |
| 368 vsrc.val[1] = d1; | 424 vsrc.val[1] = d1; |
| 369 vsrc.val[2] = d2; | 425 vsrc.val[2] = d2; |
| 370 vsrc.val[3] = d3; | 426 vsrc.val[3] = d3; |
| 371 #endif | 427 #endif |
| 428 #endif // #ifdef SK_CPU_ARM64 | |
| 372 | 429 |
| 373 | 430 |
| 374 // deinterleave dst | 431 // deinterleave dst |
| 375 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes | 432 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes |
| 376 vdst_b = vdst & vmask_blue; // extract blue | 433 vdst_b = vdst & vmask_blue; // extract blue |
| 377 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red | 434 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red |
| 378 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green | 435 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green |
| 379 | 436 |
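
The three-shift sequence above isolates the 565 fields; green is shifted to the top of the lane first so a single right shift can extract it without a mask. A scalar model using the constants from SkColorPriv.h, which this file already includes (SK_R16_BITS=5, SK_G16_BITS=6, SK_B16_BITS=5):

```cpp
#include "SkColorPriv.h"  // SK_R16_SHIFT, SK_G16_SHIFT, SK_G16_MASK, SK_B16_MASK

static inline void unpack_565(uint16_t pix, unsigned* r, unsigned* g, unsigned* b) {
    *r = pix >> SK_R16_SHIFT;                  // top 5 bits
    *g = (pix >> SK_G16_SHIFT) & SK_G16_MASK;  // middle 6 bits
    *b = pix & SK_B16_MASK;                    // bottom 5 bits
}
```
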
| 380 // shift src to 565 | 437 // shift src to 565 |
| 381 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); | 438 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); |
| (...skipping 79 matching lines...) | |
| 461 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 518 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
| 462 | 519 |
| 463 uint8x8_t vdither = vld1_u8(dstart); // load dither values | 520 uint8x8_t vdither = vld1_u8(dstart); // load dither values |
| 464 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values | 521 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values |
| 465 | 522 |
| 466 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg | 523 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg |
| 467 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask | 524 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask |
| 468 | 525 |
| 469 do { | 526 do { |
| 470 | 527 |
| 528 uint8x8x4_t vsrc; | |
| 471 uint8x8_t vsrc_r, vsrc_g, vsrc_b; | 529 uint8x8_t vsrc_r, vsrc_g, vsrc_b; |
| 472 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; | 530 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; |
| 473 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; | 531 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; |
| 474 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; | 532 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; |
| 475 uint16x8_t vdst; | 533 uint16x8_t vdst; |
| 476 uint16x8_t vdst_r, vdst_g, vdst_b; | 534 uint16x8_t vdst_r, vdst_g, vdst_b; |
| 477 int16x8_t vres_r, vres_g, vres_b; | 535 int16x8_t vres_r, vres_g, vres_b; |
| 478 int8x8_t vres8_r, vres8_g, vres8_b; | 536 int8x8_t vres8_r, vres8_g, vres8_b; |
| 479 | 537 |
| 480 // Load source and add dither | 538 // Load source and add dither |
| 539 #ifdef SK_CPU_ARM64 | |
| 540 vsrc = sk_vld4_u8_arm64_3(src); | |
| 541 #else | |
| 481 { | 542 { |
| 482 register uint8x8_t d0 asm("d0"); | 543 register uint8x8_t d0 asm("d0"); |
| 483 register uint8x8_t d1 asm("d1"); | 544 register uint8x8_t d1 asm("d1"); |
| 484 register uint8x8_t d2 asm("d2"); | 545 register uint8x8_t d2 asm("d2"); |
| 485 register uint8x8_t d3 asm("d3"); | 546 register uint8x8_t d3 asm("d3"); |
| 486 | 547 |
| 487 asm ( | 548 asm ( |
| 488 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 549 "vld4.8 {d0-d3},[%[src]]! " |
| 489 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 550 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| 490 : | 551 : |
| 491 ); | 552 ); |
| 492 vsrc_g = d1; | 553 vsrc.val[0] = d0; |
| 493 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 554 vsrc.val[1] = d1; |
| 494 vsrc_r = d2; vsrc_b = d0; | 555 vsrc.val[2] = d2; |
| 495 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 556 } |
| 496 vsrc_r = d0; vsrc_b = d2; | |
| 497 #endif | 557 #endif |
| 498 } | 558 vsrc_r = vsrc.val[NEON_R]; |
| 559 vsrc_g = vsrc.val[NEON_G]; | |
| 560 vsrc_b = vsrc.val[NEON_B]; | |
| 499 | 561 |
| 500 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 | 562 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 |
| 501 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 | 563 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 |
| 502 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 | 564 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 |
| 503 | 565 |
| 504 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen | 566 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen |
| 505 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen | 567 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen |
| 506 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen | 568 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen |
| 507 | 569 |
| 508 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result | 570 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result |
| (...skipping 60 matching lines...) | |
| 569 sb = SkDITHER_B32To565(sb, dither); | 631 sb = SkDITHER_B32To565(sb, dither); |
| 570 | 632 |
| 571 uint16_t d = *dst; | 633 uint16_t d = *dst; |
| 572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 634 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
| 573 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 635 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
| 574 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 636 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
| 575 DITHER_INC_X(x); | 637 DITHER_INC_X(x); |
| 576 } while (--count != 0); | 638 } while (--count != 0); |
| 577 } | 639 } |
| 578 } | 640 } |
| 579 #endif | |
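
The dither step this function vectorizes matches Skia's scalar SkDITHER_*32To565 macros: nudge each 8-bit channel by the matrix value d (0..7; green uses d>>1 since it keeps 6 bits), subtract the channel's top-bit echo so the sum cannot overflow 8 significant bits, then truncate to 565 width. A sketch of the per-channel formula:

```cpp
// bits = 5 for red/blue, 6 for green; pass d >> 1 for green.
static inline unsigned dither_channel_to_565(unsigned c, unsigned d, int bits) {
    return (c + d - (c >> bits)) >> (8 - bits);
}
```
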
| 580 | 641 |
| 581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 642 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| 582 const SkPMColor* SK_RESTRICT src, | 643 const SkPMColor* SK_RESTRICT src, |
| 583 int count, U8CPU alpha) { | 644 int count, U8CPU alpha) { |
| 584 | 645 |
| 585 SkASSERT(255 == alpha); | 646 SkASSERT(255 == alpha); |
| 586 if (count > 0) { | 647 if (count > 0) { |
| 587 | 648 |
| 588 | 649 |
| 589 uint8x8_t alpha_mask; | 650 uint8x8_t alpha_mask; |
| (...skipping 450 matching lines...) | |
| 1040 uint16_t *pc = (uint16_t*) p; | 1101 uint16_t *pc = (uint16_t*) p; |
| 1041 sprintf(buf,"%8s:", str); | 1102 sprintf(buf,"%8s:", str); |
| 1042 len = (len / sizeof(uint16_t)); /* passed as bytes */ | 1103 len = (len / sizeof(uint16_t)); /* passed as bytes */ |
| 1043 for(i=0;i<len;i++) { | 1104 for(i=0;i<len;i++) { |
| 1044 sprintf(tbuf, " %04x", pc[i]); | 1105 sprintf(tbuf, " %04x", pc[i]); |
| 1045 strcat(buf, tbuf); | 1106 strcat(buf, tbuf); |
| 1046 } | 1107 } |
| 1047 SkDebugf("%s\n", buf); | 1108 SkDebugf("%s\n", buf); |
| 1048 } | 1109 } |
| 1049 #endif | 1110 #endif |
| 1111 #endif // #ifdef SK_CPU_ARM32 | |
| 1050 | 1112 |
| 1051 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, | 1113 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
| 1052 const SkPMColor* SK_RESTRICT src, | 1114 const SkPMColor* SK_RESTRICT src, |
| 1053 int count, U8CPU alpha, int x, int y) { | 1115 int count, U8CPU alpha, int x, int y) { |
| 1054 SkASSERT(255 == alpha); | 1116 SkASSERT(255 == alpha); |
| 1055 | 1117 |
| 1056 #define UNROLL 8 | 1118 #define UNROLL 8 |
| 1057 | 1119 |
| 1058 if (count >= UNROLL) { | 1120 if (count >= UNROLL) { |
| 1059 | 1121 |
| 1060 #if defined(DEBUG_OPAQUE_DITHER) | 1122 #if defined(DEBUG_OPAQUE_DITHER) |
| 1061 uint16_t tmpbuf[UNROLL]; | 1123 uint16_t tmpbuf[UNROLL]; |
| 1062 int td[UNROLL]; | 1124 int td[UNROLL]; |
| 1063 int tdv[UNROLL]; | 1125 int tdv[UNROLL]; |
| 1064 int ta[UNROLL]; | 1126 int ta[UNROLL]; |
| 1065 int tap[UNROLL]; | 1127 int tap[UNROLL]; |
| 1066 uint16_t in_dst[UNROLL]; | 1128 uint16_t in_dst[UNROLL]; |
| 1067 int offset = 0; | 1129 int offset = 0; |
| 1068 int noisy = 0; | 1130 int noisy = 0; |
| 1069 #endif | 1131 #endif |
| 1070 | 1132 |
| 1071 uint8x8_t dbase; | 1133 uint8x8_t dbase; |
| 1072 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1134 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
| 1073 dbase = vld1_u8(dstart); | 1135 dbase = vld1_u8(dstart); |
| 1074 | 1136 |
| 1075 do { | 1137 do { |
| 1138 uint8x8x4_t vsrc; | |
| 1076 uint8x8_t sr, sg, sb, sa, d; | 1139 uint8x8_t sr, sg, sb, sa, d; |
| 1077 uint16x8_t dst8, scale8, alpha8; | 1140 uint16x8_t dst8, scale8, alpha8; |
| 1078 uint16x8_t dst_r, dst_g, dst_b; | 1141 uint16x8_t dst_r, dst_g, dst_b; |
| 1079 | 1142 |
| 1080 #if defined(DEBUG_OPAQUE_DITHER) | 1143 #if defined(DEBUG_OPAQUE_DITHER) |
| 1081 // calculate 8 elements worth into a temp buffer | 1144 // calculate 8 elements worth into a temp buffer |
| 1082 { | 1145 { |
| 1083 int my_y = y; | 1146 int my_y = y; |
| 1084 int my_x = x; | 1147 int my_x = x; |
| 1085 SkPMColor* my_src = (SkPMColor*)src; | 1148 SkPMColor* my_src = (SkPMColor*)src; |
| (...skipping 30 matching lines...) | |
| 1116 tmpbuf[i] = *my_dst; | 1179 tmpbuf[i] = *my_dst; |
| 1117 ta[i] = tdv[i] = td[i] = 0xbeef; | 1180 ta[i] = tdv[i] = td[i] = 0xbeef; |
| 1118 } | 1181 } |
| 1119 in_dst[i] = *my_dst; | 1182 in_dst[i] = *my_dst; |
| 1120 my_dst += 1; | 1183 my_dst += 1; |
| 1121 DITHER_INC_X(my_x); | 1184 DITHER_INC_X(my_x); |
| 1122 } | 1185 } |
| 1123 } | 1186 } |
| 1124 #endif | 1187 #endif |
| 1125 | 1188 |
| 1126 | 1189 #ifdef SK_CPU_ARM64 |
| 1190 vsrc = sk_vld4_u8_arm64_4(src); | |
| 1191 #else | |
| 1127 { | 1192 { |
| 1128 register uint8x8_t d0 asm("d0"); | 1193 register uint8x8_t d0 asm("d0"); |
| 1129 register uint8x8_t d1 asm("d1"); | 1194 register uint8x8_t d1 asm("d1"); |
| 1130 register uint8x8_t d2 asm("d2"); | 1195 register uint8x8_t d2 asm("d2"); |
| 1131 register uint8x8_t d3 asm("d3"); | 1196 register uint8x8_t d3 asm("d3"); |
| 1132 | 1197 |
| 1133 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1198 asm ("vld4.8 {d0-d3},[%[src]]! " |
| 1134 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) | 1199 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) |
| 1135 : | 1200 : |
| 1136 ); | 1201 ); |
| 1137 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 1202 vsrc.val[0] = d0; |
| 1138 sr = d2; sg = d1; sb = d0; sa = d3; | 1203 vsrc.val[1] = d1; |
| 1139 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 1204 vsrc.val[2] = d2; |
| 1140 sr = d0; sg = d1; sb = d2; sa = d3; | 1205 vsrc.val[3] = d3; |
| 1206 } | |
| 1141 #endif | 1207 #endif |
| 1142 } | 1208 sa = vsrc.val[NEON_A]; |
| 1209 sr = vsrc.val[NEON_R]; | |
| 1210 sg = vsrc.val[NEON_G]; | |
| 1211 sb = vsrc.val[NEON_B]; | |
| 1143 | 1212 |
| 1144 /* calculate 'd', which will be 0..7 | 1213 /* calculate 'd', which will be 0..7 |
| 1145 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice | 1214 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice |
| 1146 */ | 1215 */ |
| 1147 alpha8 = vmovl_u8(dbase); | 1216 alpha8 = vmovl_u8(dbase); |
| 1148 alpha8 = vmlal_u8(alpha8, sa, dbase); | 1217 alpha8 = vmlal_u8(alpha8, sa, dbase); |
| 1149 d = vshrn_n_u16(alpha8, 8); // narrowing too | 1218 d = vshrn_n_u16(alpha8, 8); // narrowing too |
| 1150 | 1219 |
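
The `vmovl_u8`/`vmlal_u8`/`vshrn_n_u16` triple above scales the dither value by source alpha, so fully transparent pixels receive no dither. Scalar equivalent: with d in 0..7 from `dbase` and the 8-bit alpha `sa`, the accumulator holds d + sa*d = d*(sa+1), then narrows by 8:

```cpp
// Result stays in 0..7: sa = 255 keeps d unchanged, sa = 0 yields 0.
static inline unsigned alpha_scaled_dither(unsigned d, unsigned sa) {
    return (d * (sa + 1)) >> 8;
}
```
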
| 1151 // sr = sr - (sr>>5) + d | 1220 // sr = sr - (sr>>5) + d |
| 1152 /* watching for 8-bit overflow. d is 0..7; risky range of | 1221 /* watching for 8-bit overflow. d is 0..7; risky range of |
| (...skipping 123 matching lines...) | |
| 1276 #define UNROLL 8 | 1345 #define UNROLL 8 |
| 1277 if (count >= UNROLL) { | 1346 if (count >= UNROLL) { |
| 1278 uint8x8_t d; | 1347 uint8x8_t d; |
| 1279 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1348 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
| 1280 d = vld1_u8(dstart); | 1349 d = vld1_u8(dstart); |
| 1281 | 1350 |
| 1282 while (count >= UNROLL) { | 1351 while (count >= UNROLL) { |
| 1283 uint8x8_t sr, sg, sb; | 1352 uint8x8_t sr, sg, sb; |
| 1284 uint16x8_t dr, dg, db; | 1353 uint16x8_t dr, dg, db; |
| 1285 uint16x8_t dst8; | 1354 uint16x8_t dst8; |
| 1355 uint8x8x4_t vsrc; | |
| 1286 | 1356 |
| 1357 #ifdef SK_CPU_ARM64 | |
| 1358 vsrc = sk_vld4_u8_arm64_3(src); | |
| 1359 #else | |
| 1287 { | 1360 { |
| 1288 register uint8x8_t d0 asm("d0"); | 1361 register uint8x8_t d0 asm("d0"); |
| 1289 register uint8x8_t d1 asm("d1"); | 1362 register uint8x8_t d1 asm("d1"); |
| 1290 register uint8x8_t d2 asm("d2"); | 1363 register uint8x8_t d2 asm("d2"); |
| 1291 register uint8x8_t d3 asm("d3"); | 1364 register uint8x8_t d3 asm("d3"); |
| 1292 | 1365 |
| 1293 asm ( | 1366 asm ( |
| 1294 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1367 "vld4.8 {d0-d3},[%[src]]! " |
| 1295 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 1368 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| 1296 : | 1369 : |
| 1297 ); | 1370 ); |
| 1298 sg = d1; | 1371 vsrc.val[0] = d0; |
| 1299 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 1372 vsrc.val[1] = d1; |
| 1300 sr = d2; sb = d0; | 1373 vsrc.val[2] = d2; |
| 1301 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 1374 } |
| 1302 sr = d0; sb = d2; | |
| 1303 #endif | 1375 #endif |
| 1304 } | 1376 sr = vsrc.val[NEON_R]; |
| 1377 sg = vsrc.val[NEON_G]; | |
| 1378 sb = vsrc.val[NEON_B]; | |
| 1379 | |
| 1305 /* XXX: if we want to prefetch, hide it in the above asm() | 1380 /* XXX: if we want to prefetch, hide it in the above asm() |
| 1306 * using the gcc __builtin_prefetch(), the prefetch will | 1381 * using the gcc __builtin_prefetch(), the prefetch will |
| 1307 * fall to the bottom of the loop -- it won't stick up | 1382 * fall to the bottom of the loop -- it won't stick up |
| 1308 * at the top of the loop, just after the vld4. | 1383 * at the top of the loop, just after the vld4. |
| 1309 */ | 1384 */ |
| 1310 | 1385 |
| 1311 // sr = sr - (sr>>5) + d | 1386 // sr = sr - (sr>>5) + d |
| 1312 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); | 1387 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
| 1313 dr = vaddl_u8(sr, d); | 1388 dr = vaddl_u8(sr, d); |
| 1314 | 1389 |
| (...skipping 47 matching lines...) | |
| 1362 SkPMColor c = *src++; | 1437 SkPMColor c = *src++; |
| 1363 SkPMColorAssert(c); | 1438 SkPMColorAssert(c); |
| 1364 SkASSERT(SkGetPackedA32(c) == 255); | 1439 SkASSERT(SkGetPackedA32(c) == 255); |
| 1365 | 1440 |
| 1366 unsigned dither = DITHER_VALUE(x); | 1441 unsigned dither = DITHER_VALUE(x); |
| 1367 *dst++ = SkDitherRGB32To565(c, dither); | 1442 *dst++ = SkDitherRGB32To565(c, dither); |
| 1368 DITHER_INC_X(x); | 1443 DITHER_INC_X(x); |
| 1369 } while (--count != 0); | 1444 } while (--count != 0); |
| 1370 } | 1445 } |
| 1371 } | 1446 } |
| 1372 #endif | |
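
The leftover loop relies on `SkDitherRGB32To565`; a scalar model of the full per-pixel conversion, built from the same per-channel formula sketched earlier (channel extraction order illustrative):

```cpp
#include <stdint.h>

static inline uint16_t dither_rgb32_to_565(uint8_t r, uint8_t g, uint8_t b,
                                           unsigned d /* 0..7 */) {
    unsigned dr = (r + d        - (r >> 5)) >> 3;  // 5-bit red
    unsigned dg = (g + (d >> 1) - (g >> 6)) >> 2;  // 6-bit green, half dither
    unsigned db = (b + d        - (b >> 5)) >> 3;  // 5-bit blue
    return (uint16_t)((dr << 11) | (dg << 5) | db);
}
```
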
| 1373 | 1447 |
| 1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, | 1448 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, |
| 1375 SkPMColor color) { | 1449 SkPMColor color) { |
| 1376 if (count <= 0) { | 1450 if (count <= 0) { |
| 1377 return; | 1451 return; |
| 1378 } | 1452 } |
| 1379 | 1453 |
| 1380 if (0 == color) { | 1454 if (0 == color) { |
| 1381 if (src != dst) { | 1455 if (src != dst) { |
| 1382 memcpy(dst, src, count * sizeof(SkPMColor)); | 1456 memcpy(dst, src, count * sizeof(SkPMColor)); |
| (...skipping 85 matching lines...) | |
| 1468 *dst = color + SkAlphaMulQ(*src, scale); | 1542 *dst = color + SkAlphaMulQ(*src, scale); |
| 1469 src += 1; | 1543 src += 1; |
| 1470 dst += 1; | 1544 dst += 1; |
| 1471 count--; | 1545 count--; |
| 1472 } | 1546 } |
| 1473 } | 1547 } |
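
`Color32_arm_neon`'s general case is src-over with a constant premultiplied color: `scale = SkAlpha255To256(255 - alpha(color))`, and `SkAlphaMulQ` multiplies all four channels of `src` by scale/256 in two paired 16-bit lanes. A scalar sketch of one pixel:

```cpp
#include <stdint.h>

static inline uint32_t color32_one_pixel(uint32_t src, uint32_t color,
                                         unsigned scale /* 0..256 */) {
    // Multiply the two channel pairs separately so each product has
    // headroom, then recombine; this is SkAlphaMulQ's trick.
    uint32_t rb = ((src & 0x00FF00FF) * scale) >> 8;
    uint32_t ag = ((src >> 8) & 0x00FF00FF) * scale;
    return color + ((rb & 0x00FF00FF) | (ag & 0xFF00FF00));
}
```
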
| 1474 | 1548 |
| 1475 /////////////////////////////////////////////////////////////////////////////// | 1549 /////////////////////////////////////////////////////////////////////////////// |
| 1476 | 1550 |
| 1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1551 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
| 1478 #ifdef SK_CPU_ARM32 | |
| 1479 // no dither | 1552 // no dither |
| 1480 S32_D565_Opaque_neon, | 1553 S32_D565_Opaque_neon, |
| 1481 S32_D565_Blend_neon, | 1554 S32_D565_Blend_neon, |
| 1555 #ifdef SK_CPU_ARM32 | |
| 1482 S32A_D565_Opaque_neon, | 1556 S32A_D565_Opaque_neon, |
| 1557 #else | |
| 1558 NULL, | |
| 1559 #endif | |
| 1483 S32A_D565_Blend_neon, | 1560 S32A_D565_Blend_neon, |
| 1484 | 1561 |
| 1485 // dither | 1562 // dither |
| 1486 S32_D565_Opaque_Dither_neon, | 1563 S32_D565_Opaque_Dither_neon, |
| 1487 S32_D565_Blend_Dither_neon, | 1564 S32_D565_Blend_Dither_neon, |
| 1488 S32A_D565_Opaque_Dither_neon, | 1565 S32A_D565_Opaque_Dither_neon, |
| 1489 NULL, // S32A_D565_Blend_Dither | 1566 NULL, // S32A_D565_Blend_Dither |
| 1490 #else | |
| 1491 NULL, NULL, NULL, NULL, | |
| 1492 NULL, NULL, NULL, NULL | |
| 1493 #endif | |
| 1494 }; | 1567 }; |
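
On the ARM64 side the table now supplies real procs for the first two slots and NULL where only the ARM32 asm version exists. A hypothetical sketch (function and table names invented for illustration, not Skia's actual factory code) of why a NULL slot is safe; callers check the platform table first and fall back to the portable routine:

```cpp
// Illustrative only: "platform_procs" and "portable_procs" are stand-in names.
static SkBlitRow::Proc choose_proc(const SkBlitRow::Proc* platform_procs,
                                   const SkBlitRow::Proc* portable_procs,
                                   unsigned flags) {
    SkBlitRow::Proc proc = platform_procs[flags];
    return proc ? proc : portable_procs[flags];  // NULL slot => portable fallback
}
```
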
| 1495 | 1568 |
| 1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1569 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
| 1497 NULL, // S32_Opaque, | 1570 NULL, // S32_Opaque, |
| 1498 S32_Blend_BlitRow32_neon, // S32_Blend, | 1571 S32_Blend_BlitRow32_neon, // S32_Blend, |
| 1499 /* | 1572 /* |
| 1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha | 1573 * We have two choices for S32A_Opaque procs. The one reads the src alpha |
| 1501 * value and attempts to optimize accordingly. The optimization is | 1574 * value and attempts to optimize accordingly. The optimization is |
| 1502 * sensitive to the source content and is not a win in all cases. For | 1575 * sensitive to the source content and is not a win in all cases. For |
| 1503 * example, if there are a lot of transitions between the alpha states, | 1576 * example, if there are a lot of transitions between the alpha states, |
| 1504 * the performance will almost certainly be worse. However, for many | 1577 * the performance will almost certainly be worse. However, for many |
| 1505 * common cases the performance is equivalent or better than the standard | 1578 * common cases the performance is equivalent or better than the standard |
| 1506 * case where we do not inspect the src alpha. | 1579 * case where we do not inspect the src alpha. |
| 1507 */ | 1580 */ |
| 1508 #if SK_A32_SHIFT == 24 | 1581 #if SK_A32_SHIFT == 24 |
| 1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1582 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
| 1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1583 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
| 1511 #else | 1584 #else |
| 1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1585 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
| 1513 #endif | 1586 #endif |
| 1514 #ifdef SK_CPU_ARM32 | 1587 #ifdef SK_CPU_ARM32 |
| 1515 S32A_Blend_BlitRow32_neon // S32A_Blend | 1588 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1516 #else | 1589 #else |
| 1517 NULL | 1590 NULL |
| 1518 #endif | 1591 #endif |
| 1519 }; | 1592 }; |