OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
11 #include "SkBlitRow.h" | 11 #include "SkBlitRow.h" |
12 #include "SkColorPriv.h" | 12 #include "SkColorPriv.h" |
13 #include "SkDither.h" | 13 #include "SkDither.h" |
14 #include "SkMathPriv.h" | 14 #include "SkMathPriv.h" |
15 #include "SkUtils.h" | 15 #include "SkUtils.h" |
16 | 16 |
17 #include "SkColor_opts_neon.h" | 17 #include "SkColor_opts_neon.h" |
18 #include <arm_neon.h> | 18 #include <arm_neon.h> |
19 | 19 |
20 #ifdef SK_CPU_ARM32 | |
21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 20 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
22 const SkPMColor* SK_RESTRICT src, int count, | 21 const SkPMColor* SK_RESTRICT src, int count, |
23 U8CPU alpha, int /*x*/, int /*y*/) { | 22 U8CPU alpha, int /*x*/, int /*y*/) { |
24 SkASSERT(255 == alpha); | 23 SkASSERT(255 == alpha); |
25 | 24 |
26 while (count >= 8) { | 25 while (count >= 8) { |
27 uint8x8x4_t vsrc; | 26 uint8x8x4_t vsrc; |
28 uint16x8_t vdst; | 27 uint16x8_t vdst; |
29 | 28 |
30 // Load | 29 // Load |
30 #ifdef SK_CPU_ARM64 | |
31 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
32 asm ( | |
33 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
mtklein 2014/06/05 16:28:36
Looks like we've got the same load-32-bytes thing
kevin.petit 2014/06/05 17:00:02
There are actually a few more differences. Some us
| |
34 "mov %[vsrc0].8b, v0.8b \t\n" | |
35 "mov %[vsrc1].8b, v1.8b \t\n" | |
36 "mov %[vsrc2].8b, v2.8b \t\n" | |
37 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
38 [vsrc2] "=w" (vsrc_2) | |
39 : [src] "r" (src) | |
40 : "v0", "v1", "v2", "v3" | |
41 ); | |
42 vsrc.val[0] = vsrc_0; | |
43 vsrc.val[1] = vsrc_1; | |
44 vsrc.val[2] = vsrc_2; | |
45 | |
46 #else | |
31 vsrc = vld4_u8((uint8_t*)src); | 47 vsrc = vld4_u8((uint8_t*)src); |
48 #endif | |
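Aside on the review thread above: ld4 {v0.8b - v3.8b} loads 32 bytes, i.e. eight RGBA pixels, and deinterleaves them into four 8-lane vectors; the asm then copies the planes out of the fixed registers v0-v3, and this opaque path simply ignores the alpha plane. A minimal scalar sketch of the load's semantics, with a hypothetical helper name:

    #include <stdint.h>

    /* Sketch only: scalar equivalent of ld4 {v0.8b - v3.8b} / vld4_u8.
     * 32 interleaved bytes (8 pixels x 4 channels) are split into four
     * 8-byte planes, one per channel. */
    static void deinterleave_rgba8x8(const uint8_t src[32],
                                     uint8_t plane[4][8]) {
        for (int lane = 0; lane < 8; lane++) {
            for (int ch = 0; ch < 4; ch++) {
                plane[ch][lane] = src[lane * 4 + ch];
            }
        }
    }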
32 | 49 |
33 // Convert src to 565 | 50 // Convert src to 565 |
34 vdst = SkPixel32ToPixel16_neon8(vsrc); | 51 vdst = SkPixel32ToPixel16_neon8(vsrc); |
35 | 52 |
36 // Store | 53 // Store |
37 vst1q_u16(dst, vdst); | 54 vst1q_u16(dst, vdst); |
38 | 55 |
39 // Prepare next iteration | 56 // Prepare next iteration |
40 dst += 8; | 57 dst += 8; |
41 src += 8; | 58 src += 8; |
(...skipping 15 matching lines...) | |
57 U8CPU alpha, int /*x*/, int /*y*/) { | 74 U8CPU alpha, int /*x*/, int /*y*/) { |
58 SkASSERT(255 > alpha); | 75 SkASSERT(255 > alpha); |
59 | 76 |
60 uint16x8_t vmask_blue, vscale; | 77 uint16x8_t vmask_blue, vscale; |
61 | 78 |
62 // prepare constants | 79 // prepare constants |
63 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); | 80 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); |
64 vmask_blue = vmovq_n_u16(0x1F); | 81 vmask_blue = vmovq_n_u16(0x1F); |
65 | 82 |
66 while (count >= 8) { | 83 while (count >= 8) { |
84 uint8x8x4_t vsrc; | |
67 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 85 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
68 uint16x8_t vres_r, vres_g, vres_b; | 86 uint16x8_t vres_r, vres_g, vres_b; |
69 uint8x8_t vsrc_r, vsrc_g, vsrc_b; | |
70 | 87 |
71 // Load src | 88 // Load src |
89 #ifdef SK_CPU_ARM64 | |
90 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
91 asm ( | |
92 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
93 "mov %[vsrc0].8b, v0.8b \t\n" | |
94 "mov %[vsrc1].8b, v1.8b \t\n" | |
95 "mov %[vsrc2].8b, v2.8b \t\n" | |
96 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
97 [vsrc2] "=w" (vsrc_2) | |
98 : [src] "r" (src) | |
99 : "v0", "v1", "v2", "v3" | |
100 ); | |
101 vsrc.val[0] = vsrc_0; | |
102 vsrc.val[1] = vsrc_1; | |
103 vsrc.val[2] = vsrc_2; | |
104 src += 8; | |
105 #else | |
72 { | 106 { |
73 register uint8x8_t d0 asm("d0"); | 107 register uint8x8_t d0 asm("d0"); |
74 register uint8x8_t d1 asm("d1"); | 108 register uint8x8_t d1 asm("d1"); |
75 register uint8x8_t d2 asm("d2"); | 109 register uint8x8_t d2 asm("d2"); |
76 register uint8x8_t d3 asm("d3"); | 110 register uint8x8_t d3 asm("d3"); |
77 | 111 |
78 asm ( | 112 asm ( |
79 "vld4.8 {d0-d3},[%[src]]!" | 113 "vld4.8 {d0-d3},[%[src]]!" |
80 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 114 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
81 : | 115 : |
82 ); | 116 ); |
83 vsrc_g = d1; | 117 vsrc.val[0] = d0; |
84 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 118 vsrc.val[1] = d1; |
85 vsrc_r = d2; vsrc_b = d0; | 119 vsrc.val[2] = d2; |
86 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 120 } |
87 vsrc_r = d0; vsrc_b = d2; | |
88 #endif | 121 #endif |
89 } | |
90 | 122 |
91 // Load and unpack dst | 123 // Load and unpack dst |
92 vdst = vld1q_u16(dst); | 124 vdst = vld1q_u16(dst); |
93 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes | 125 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes |
94 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue | 126 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue |
95 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red | 127 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red |
96 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green | 128 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green |
97 | 129 |
98 // Shift src to 565 | 130 // Shift src to 565 range |
99 vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range | 131 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3); |
100 vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range | 132 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2); |
101 vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range | 133 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3); |
102 | 134 |
103 // Scale src - dst | 135 // Scale src - dst |
104 vres_r = vmovl_u8(vsrc_r) - vdst_r; | 136 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r; |
105 vres_g = vmovl_u8(vsrc_g) - vdst_g; | 137 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g; |
106 vres_b = vmovl_u8(vsrc_b) - vdst_b; | 138 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b; |
107 | 139 |
108 vres_r = vshrq_n_u16(vres_r * vscale, 8); | 140 vres_r = vshrq_n_u16(vres_r * vscale, 8); |
109 vres_g = vshrq_n_u16(vres_g * vscale, 8); | 141 vres_g = vshrq_n_u16(vres_g * vscale, 8); |
110 vres_b = vshrq_n_u16(vres_b * vscale, 8); | 142 vres_b = vshrq_n_u16(vres_b * vscale, 8); |
111 | 143 |
112 vres_r += vdst_r; | 144 vres_r += vdst_r; |
113 vres_g += vdst_g; | 145 vres_g += vdst_g; |
114 vres_b += vdst_b; | 146 vres_b += vdst_b; |
115 | 147 |
116 // Combine | 148 // Combine |
(...skipping 12 matching lines...) | |
129 SkPMColorAssert(c); | 161 SkPMColorAssert(c); |
130 uint16_t d = *dst; | 162 uint16_t d = *dst; |
131 *dst++ = SkPackRGB16( | 163 *dst++ = SkPackRGB16( |
132 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), | 164 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), |
133 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), | 165 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), |
134 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); | 166 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); |
135 } while (--count != 0); | 167 } while (--count != 0); |
136 } | 168 } |
137 } | 169 } |
138 | 170 |
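For reference, the vector loop in S32_D565_Blend_neon computes per channel the same quantity as the scalar leftover loop just above: SkAlphaBlend(src, dst, scale) = dst + ((src - dst) * scale >> 8), with scale = SkAlpha255To256(alpha) = alpha + 1. A minimal scalar sketch of one pixel of the blend, assuming an arithmetic right shift on negative intermediates (the helper name is illustrative):

    #include <stdint.h>

    /* Sketch: blend one 8888 source pixel into a 565 dest pixel.
     * scale is alpha + 1, so it lies in 1..256. */
    static uint16_t blend_32_to_565(uint8_t r8, uint8_t g8, uint8_t b8,
                                    uint16_t dst, int scale) {
        int dr = (dst >> 11) & 0x1F;               /* extract red   */
        int dg = (dst >> 5)  & 0x3F;               /* extract green */
        int db =  dst        & 0x1F;               /* extract blue  */

        int sr = r8 >> 3, sg = g8 >> 2, sb = b8 >> 3;  /* to 565 range */

        dr += ((sr - dr) * scale) >> 8;   /* dst + ((src - dst) * scale >> 8) */
        dg += ((sg - dg) * scale) >> 8;
        db += ((sb - db) * scale) >> 8;

        return (uint16_t)((dr << 11) | (dg << 5) | db);
    }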
171 #ifdef SK_CPU_ARM32 | |
139 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 172 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
140 const SkPMColor* SK_RESTRICT src, int count, | 173 const SkPMColor* SK_RESTRICT src, int count, |
141 U8CPU alpha, int /*x*/, int /*y*/) { | 174 U8CPU alpha, int /*x*/, int /*y*/) { |
142 SkASSERT(255 == alpha); | 175 SkASSERT(255 == alpha); |
143 | 176 |
144 if (count >= 8) { | 177 if (count >= 8) { |
145 uint16_t* SK_RESTRICT keep_dst = 0; | 178 uint16_t* SK_RESTRICT keep_dst = 0; |
146 | 179 |
147 asm volatile ( | 180 asm volatile ( |
148 "ands ip, %[count], #7 \n\t" | 181 "ands ip, %[count], #7 \n\t" |
(...skipping 157 matching lines...) | |
306 | 339 |
307 "21: \n\t" | 340 "21: \n\t" |
308 : [count] "+r" (count) | 341 : [count] "+r" (count) |
309 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) | 342 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) |
310 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7", | 343 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7", |
311 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29", | 344 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29", |
312 "d30","d31" | 345 "d30","d31" |
313 ); | 346 ); |
314 } | 347 } |
315 } | 348 } |
349 #endif | |
316 | 350 |
317 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { | 351 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { |
318 prod += vdupq_n_u16(128); | 352 prod += vdupq_n_u16(128); |
319 prod += vshrq_n_u16(prod, 8); | 353 prod += vshrq_n_u16(prod, 8); |
320 return vshrq_n_u16(prod, 8); | 354 return vshrq_n_u16(prod, 8); |
321 } | 355 } |
322 | 356 |
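SkDiv255Round_neon8 above uses the standard exact trick for rounding division by 255: add 128, fold the high byte back in, then take the high byte. A brute-force sketch verifying the identity over the full product range (0..255*255):

    #include <assert.h>

    static unsigned div255_round(unsigned prod) {
        prod += 128;
        prod += prod >> 8;
        return prod >> 8;
    }

    int main(void) {
        /* (x + 127) / 255 is the conventionally rounded quotient */
        for (unsigned x = 0; x <= 255u * 255u; x++) {
            assert(div255_round(x) == (x + 127) / 255);
        }
        return 0;
    }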
323 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 357 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
324 const SkPMColor* SK_RESTRICT src, int count, | 358 const SkPMColor* SK_RESTRICT src, int count, |
325 U8CPU alpha, int /*x*/, int /*y*/) { | 359 U8CPU alpha, int /*x*/, int /*y*/) { |
(...skipping 13 matching lines...) | |
339 valpha = vdup_n_u8(alpha); | 373 valpha = vdup_n_u8(alpha); |
340 vmask_blue = vmovq_n_u16(SK_B16_MASK); | 374 vmask_blue = vmovq_n_u16(SK_B16_MASK); |
341 | 375 |
342 do { | 376 do { |
343 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 377 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
344 uint16x8_t vres_a, vres_r, vres_g, vres_b; | 378 uint16x8_t vres_a, vres_r, vres_g, vres_b; |
345 uint8x8x4_t vsrc; | 379 uint8x8x4_t vsrc; |
346 | 380 |
347 // load pixels | 381 // load pixels |
348 vdst = vld1q_u16(dst); | 382 vdst = vld1q_u16(dst); |
383 #ifdef SK_CPU_ARM64 | |
384 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3; | |
385 asm ( | |
386 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
387 "mov %[vsrc0].8b, v0.8b \t\n" | |
388 "mov %[vsrc1].8b, v1.8b \t\n" | |
389 "mov %[vsrc2].8b, v2.8b \t\n" | |
390 "mov %[vsrc3].8b, v3.8b \t\n" | |
391 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
392 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3) | |
393 : [src] "r" (src) | |
394 : "v0", "v1", "v2", "v3" | |
395 ); | |
396 src += 8; | |
397 vsrc.val[0] = vsrc_0; | |
398 vsrc.val[1] = vsrc_1; | |
399 vsrc.val[2] = vsrc_2; | |
400 vsrc.val[3] = vsrc_3; | |
401 #else | |
349 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 402 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
350 asm ( | 403 asm ( |
351 "vld4.u8 %h[vsrc], [%[src]]!" | 404 "vld4.u8 %h[vsrc], [%[src]]!" |
352 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 405 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
353 : : | 406 : : |
354 ); | 407 ); |
355 #else | 408 #else |
356 register uint8x8_t d0 asm("d0"); | 409 register uint8x8_t d0 asm("d0"); |
357 register uint8x8_t d1 asm("d1"); | 410 register uint8x8_t d1 asm("d1"); |
358 register uint8x8_t d2 asm("d2"); | 411 register uint8x8_t d2 asm("d2"); |
359 register uint8x8_t d3 asm("d3"); | 412 register uint8x8_t d3 asm("d3"); |
360 | 413 |
361 asm volatile ( | 414 asm volatile ( |
362 "vld4.u8 {d0-d3},[%[src]]!;" | 415 "vld4.u8 {d0-d3},[%[src]]!;" |
363 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 416 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
364 [src] "+&r" (src) | 417 [src] "+&r" (src) |
365 : : | 418 : : |
366 ); | 419 ); |
367 vsrc.val[0] = d0; | 420 vsrc.val[0] = d0; |
368 vsrc.val[1] = d1; | 421 vsrc.val[1] = d1; |
369 vsrc.val[2] = d2; | 422 vsrc.val[2] = d2; |
370 vsrc.val[3] = d3; | 423 vsrc.val[3] = d3; |
371 #endif | 424 #endif |
425 #endif // #ifdef SK_CPU_ARM64 | |
372 | 426 |
373 | 427 |
374 // deinterleave dst | 428 // deinterleave dst |
375 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes | 429 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes |
376 vdst_b = vdst & vmask_blue; // extract blue | 430 vdst_b = vdst & vmask_blue; // extract blue |
377 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red | 431 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red |
378 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green | 432 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green |
379 | 433 |
380 // shift src to 565 | 434 // shift src to 565 |
381 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); | 435 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); |
(...skipping 79 matching lines...) | |
461 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 515 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
462 | 516 |
463 uint8x8_t vdither = vld1_u8(dstart); // load dither values | 517 uint8x8_t vdither = vld1_u8(dstart); // load dither values |
464 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values | 518 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values |
465 | 519 |
466 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg | 520 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg |
467 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask | 521 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask |
468 | 522 |
469 do { | 523 do { |
470 | 524 |
525 uint8x8x4_t vsrc; | |
471 uint8x8_t vsrc_r, vsrc_g, vsrc_b; | 526 uint8x8_t vsrc_r, vsrc_g, vsrc_b; |
472 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; | 527 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; |
473 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; | 528 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; |
474 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; | 529 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; |
475 uint16x8_t vdst; | 530 uint16x8_t vdst; |
476 uint16x8_t vdst_r, vdst_g, vdst_b; | 531 uint16x8_t vdst_r, vdst_g, vdst_b; |
477 int16x8_t vres_r, vres_g, vres_b; | 532 int16x8_t vres_r, vres_g, vres_b; |
478 int8x8_t vres8_r, vres8_g, vres8_b; | 533 int8x8_t vres8_r, vres8_g, vres8_b; |
479 | 534 |
480 // Load source and add dither | 535 // Load source and add dither |
536 #ifdef SK_CPU_ARM64 | |
537 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
538 asm ( | |
539 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
540 "mov %[vsrc0].8b, v0.8b \t\n" | |
541 "mov %[vsrc1].8b, v1.8b \t\n" | |
542 "mov %[vsrc2].8b, v2.8b \t\n" | |
543 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
544 [vsrc2] "=w" (vsrc_2) | |
545 : [src] "r" (src) | |
546 : "v0", "v1", "v2", "v3" | |
547 ); | |
548 | |
549 src += 8; | |
550 | |
551 vsrc.val[0] = vsrc_0; | |
552 vsrc.val[1] = vsrc_1; | |
553 vsrc.val[2] = vsrc_2; | |
554 #else | |
481 { | 555 { |
482 register uint8x8_t d0 asm("d0"); | 556 register uint8x8_t d0 asm("d0"); |
483 register uint8x8_t d1 asm("d1"); | 557 register uint8x8_t d1 asm("d1"); |
484 register uint8x8_t d2 asm("d2"); | 558 register uint8x8_t d2 asm("d2"); |
485 register uint8x8_t d3 asm("d3"); | 559 register uint8x8_t d3 asm("d3"); |
486 | 560 |
487 asm ( | 561 asm ( |
488 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 562 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
489 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 563 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
490 : | 564 : |
491 ); | 565 ); |
492 vsrc_g = d1; | 566 vsrc.val[0] = d0; |
493 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 567 vsrc.val[1] = d1; |
494 vsrc_r = d2; vsrc_b = d0; | 568 vsrc.val[2] = d2; |
495 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 569 } |
496 vsrc_r = d0; vsrc_b = d2; | |
497 #endif | 570 #endif |
498 } | 571 vsrc_r = vsrc.val[NEON_R]; |
572 vsrc_g = vsrc.val[NEON_G]; | |
573 vsrc_b = vsrc.val[NEON_B]; | |
499 | 574 |
500 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 | 575 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 |
501 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 | 576 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 |
502 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 | 577 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 |
503 | 578 |
504 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen | 579 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen |
505 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen | 580 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen |
506 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen | 581 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen |
507 | 582 |
508 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result | 583 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result |
(...skipping 60 matching lines...) | |
569 sb = SkDITHER_B32To565(sb, dither); | 644 sb = SkDITHER_B32To565(sb, dither); |
570 | 645 |
571 uint16_t d = *dst; | 646 uint16_t d = *dst; |
572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 647 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
573 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 648 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
574 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 649 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
575 DITHER_INC_X(x); | 650 DITHER_INC_X(x); |
576 } while (--count != 0); | 651 } while (--count != 0); |
577 } | 652 } |
578 } | 653 } |
579 #endif | |
580 | 654 |
581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 655 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
582 const SkPMColor* SK_RESTRICT src, | 656 const SkPMColor* SK_RESTRICT src, |
583 int count, U8CPU alpha) { | 657 int count, U8CPU alpha) { |
584 | 658 |
585 SkASSERT(255 == alpha); | 659 SkASSERT(255 == alpha); |
586 if (count > 0) { | 660 if (count > 0) { |
587 | 661 |
588 | 662 |
589 uint8x8_t alpha_mask; | 663 uint8x8_t alpha_mask; |
(...skipping 450 matching lines...) | |
1040 uint16_t *pc = (uint16_t*) p; | 1114 uint16_t *pc = (uint16_t*) p; |
1041 sprintf(buf,"%8s:", str); | 1115 sprintf(buf,"%8s:", str); |
1042 len = (len / sizeof(uint16_t)); /* passed as bytes */ | 1116 len = (len / sizeof(uint16_t)); /* passed as bytes */ |
1043 for(i=0;i<len;i++) { | 1117 for(i=0;i<len;i++) { |
1044 sprintf(tbuf, " %04x", pc[i]); | 1118 sprintf(tbuf, " %04x", pc[i]); |
1045 strcat(buf, tbuf); | 1119 strcat(buf, tbuf); |
1046 } | 1120 } |
1047 SkDebugf("%s\n", buf); | 1121 SkDebugf("%s\n", buf); |
1048 } | 1122 } |
1049 #endif | 1123 #endif |
1124 #endif // #ifdef SK_CPU_ARM32 | |
1050 | 1125 |
1051 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, | 1126 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
1052 const SkPMColor* SK_RESTRICT src, | 1127 const SkPMColor* SK_RESTRICT src, |
1053 int count, U8CPU alpha, int x, int y) { | 1128 int count, U8CPU alpha, int x, int y) { |
1054 SkASSERT(255 == alpha); | 1129 SkASSERT(255 == alpha); |
1055 | 1130 |
1056 #define UNROLL 8 | 1131 #define UNROLL 8 |
1057 | 1132 |
1058 if (count >= UNROLL) { | 1133 if (count >= UNROLL) { |
1059 | 1134 |
1060 #if defined(DEBUG_OPAQUE_DITHER) | 1135 #if defined(DEBUG_OPAQUE_DITHER) |
1061 uint16_t tmpbuf[UNROLL]; | 1136 uint16_t tmpbuf[UNROLL]; |
1062 int td[UNROLL]; | 1137 int td[UNROLL]; |
1063 int tdv[UNROLL]; | 1138 int tdv[UNROLL]; |
1064 int ta[UNROLL]; | 1139 int ta[UNROLL]; |
1065 int tap[UNROLL]; | 1140 int tap[UNROLL]; |
1066 uint16_t in_dst[UNROLL]; | 1141 uint16_t in_dst[UNROLL]; |
1067 int offset = 0; | 1142 int offset = 0; |
1068 int noisy = 0; | 1143 int noisy = 0; |
1069 #endif | 1144 #endif |
1070 | 1145 |
1071 uint8x8_t dbase; | 1146 uint8x8_t dbase; |
1072 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1147 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
1073 dbase = vld1_u8(dstart); | 1148 dbase = vld1_u8(dstart); |
1074 | 1149 |
1075 do { | 1150 do { |
1151 uint8x8x4_t vsrc; | |
1076 uint8x8_t sr, sg, sb, sa, d; | 1152 uint8x8_t sr, sg, sb, sa, d; |
1077 uint16x8_t dst8, scale8, alpha8; | 1153 uint16x8_t dst8, scale8, alpha8; |
1078 uint16x8_t dst_r, dst_g, dst_b; | 1154 uint16x8_t dst_r, dst_g, dst_b; |
1079 | 1155 |
1080 #if defined(DEBUG_OPAQUE_DITHER) | 1156 #if defined(DEBUG_OPAQUE_DITHER) |
1081 // calculate 8 elements worth into a temp buffer | 1157 // calculate 8 elements worth into a temp buffer |
1082 { | 1158 { |
1083 int my_y = y; | 1159 int my_y = y; |
1084 int my_x = x; | 1160 int my_x = x; |
1085 SkPMColor* my_src = (SkPMColor*)src; | 1161 SkPMColor* my_src = (SkPMColor*)src; |
(...skipping 30 matching lines...) Expand all Loading... | |
1116 tmpbuf[i] = *my_dst; | 1192 tmpbuf[i] = *my_dst; |
1117 ta[i] = tdv[i] = td[i] = 0xbeef; | 1193 ta[i] = tdv[i] = td[i] = 0xbeef; |
1118 } | 1194 } |
1119 in_dst[i] = *my_dst; | 1195 in_dst[i] = *my_dst; |
1120 my_dst += 1; | 1196 my_dst += 1; |
1121 DITHER_INC_X(my_x); | 1197 DITHER_INC_X(my_x); |
1122 } | 1198 } |
1123 } | 1199 } |
1124 #endif | 1200 #endif |
1125 | 1201 |
1202 #ifdef SK_CPU_ARM64 | |
1203 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3; | |
1204 asm ( | |
1205 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
1206 "mov %[vsrc0].8b, v0.8b \t\n" | |
1207 "mov %[vsrc1].8b, v1.8b \t\n" | |
1208 "mov %[vsrc2].8b, v2.8b \t\n" | |
1209 "mov %[vsrc3].8b, v3.8b \t\n" | |
1210 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
1211 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3) | |
1212 : [src] "r" (src) | |
1213 : "v0", "v1", "v2", "v3" | |
1214 ); | |
1126 | 1215 |
1216 src += 8; | |
1217 vsrc.val[0] = vsrc_0; | |
1218 vsrc.val[1] = vsrc_1; | |
1219 vsrc.val[2] = vsrc_2; | |
1220 vsrc.val[3] = vsrc_3; | |
1221 #else | |
1127 { | 1222 { |
1128 register uint8x8_t d0 asm("d0"); | 1223 register uint8x8_t d0 asm("d0"); |
1129 register uint8x8_t d1 asm("d1"); | 1224 register uint8x8_t d1 asm("d1"); |
1130 register uint8x8_t d2 asm("d2"); | 1225 register uint8x8_t d2 asm("d2"); |
1131 register uint8x8_t d3 asm("d3"); | 1226 register uint8x8_t d3 asm("d3"); |
1132 | 1227 |
1133 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1228 asm ("vld4.8 {d0-d3},[%[src]]! " |
1134 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) | 1229 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) |
1135 : | 1230 : |
1136 ); | 1231 ); |
1137 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 1232 vsrc.val[0] = d0; |
1138 sr = d2; sg = d1; sb = d0; sa = d3; | 1233 vsrc.val[1] = d1; |
1139 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 1234 vsrc.val[2] = d2; |
1140 sr = d0; sg = d1; sb = d2; sa = d3; | 1235 vsrc.val[3] = d3; |
1236 } | |
1141 #endif | 1237 #endif |
1142 } | 1238 sa = vsrc.val[NEON_A]; |
1239 sr = vsrc.val[NEON_R]; | |
1240 sg = vsrc.val[NEON_G]; | |
1241 sb = vsrc.val[NEON_B]; | |
1143 | 1242 |
1144 /* calculate 'd', which will be 0..7 | 1243 /* calculate 'd', which will be 0..7 |
1145 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice | 1244 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice |
1146 */ | 1245 */ |
1147 alpha8 = vmovl_u8(dbase); | 1246 alpha8 = vmovl_u8(dbase); |
1148 alpha8 = vmlal_u8(alpha8, sa, dbase); | 1247 alpha8 = vmlal_u8(alpha8, sa, dbase); |
1149 d = vshrn_n_u16(alpha8, 8); // narrowing too | 1248 d = vshrn_n_u16(alpha8, 8); // narrowing too |
1150 | 1249 |
1151 // sr = sr - (sr>>5) + d | 1250 // sr = sr - (sr>>5) + d |
1152 /* watching for 8-bit overflow. d is 0..7; risky range of | 1251 /* watching for 8-bit overflow. d is 0..7; risky range of |
(...skipping 123 matching lines...) | |
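The truncated overflow comment just above the elided region is easy to sanity-check: d is 0..7, and computing (sr - (sr >> 5)) + d instead of sr + d keeps the sum inside 8 bits even at sr = 255 (248 + 7 = 255, where sr + d alone would reach 262). A tiny exhaustive check, as a sketch:

    #include <assert.h>

    /* sr + d alone could wrap: 255 + 7 = 262. Folding in -(sr >> 5)
     * first cancels the dither in the risky range: 255 - 7 + 7 = 255. */
    static void check_dither_no_overflow(void) {
        for (unsigned sr = 0; sr < 256; sr++) {
            for (unsigned d = 0; d <= 7; d++) {
                assert(sr - (sr >> 5) + d <= 255);  /* fits in 8 bits */
            }
        }
    }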
1276 #define UNROLL 8 | 1375 #define UNROLL 8 |
1277 if (count >= UNROLL) { | 1376 if (count >= UNROLL) { |
1278 uint8x8_t d; | 1377 uint8x8_t d; |
1279 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1378 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
1280 d = vld1_u8(dstart); | 1379 d = vld1_u8(dstart); |
1281 | 1380 |
1282 while (count >= UNROLL) { | 1381 while (count >= UNROLL) { |
1283 uint8x8_t sr, sg, sb; | 1382 uint8x8_t sr, sg, sb; |
1284 uint16x8_t dr, dg, db; | 1383 uint16x8_t dr, dg, db; |
1285 uint16x8_t dst8; | 1384 uint16x8_t dst8; |
1385 uint8x8x4_t vsrc; | |
1286 | 1386 |
1387 #ifdef SK_CPU_ARM64 | |
1388 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
1389 asm ( | |
1390 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
1391 "mov %[vsrc0].8b, v0.8b \t\n" | |
1392 "mov %[vsrc1].8b, v1.8b \t\n" | |
1393 "mov %[vsrc2].8b, v2.8b \t\n" | |
1394 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
1395 [vsrc2] "=w" (vsrc_2) | |
1396 : [src] "r" (src) | |
1397 : "v0", "v1", "v2", "v3" | |
1398 ); | |
1399 src += 8; | |
1400 | |
1401 vsrc.val[0] = vsrc_0; | |
1402 vsrc.val[1] = vsrc_1; | |
1403 vsrc.val[2] = vsrc_2; | |
1404 #else | |
1287 { | 1405 { |
1288 register uint8x8_t d0 asm("d0"); | 1406 register uint8x8_t d0 asm("d0"); |
1289 register uint8x8_t d1 asm("d1"); | 1407 register uint8x8_t d1 asm("d1"); |
1290 register uint8x8_t d2 asm("d2"); | 1408 register uint8x8_t d2 asm("d2"); |
1291 register uint8x8_t d3 asm("d3"); | 1409 register uint8x8_t d3 asm("d3"); |
1292 | 1410 |
1293 asm ( | 1411 asm ( |
1294 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1412 "vld4.8 {d0-d3},[%[src]]! " |
1295 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 1413 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
1296 : | 1414 : |
1297 ); | 1415 ); |
1298 sg = d1; | 1416 vsrc.val[0] = d0; |
1299 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 1417 vsrc.val[1] = d1; |
1300 sr = d2; sb = d0; | 1418 vsrc.val[2] = d2; |
1301 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 1419 } |
1302 sr = d0; sb = d2; | |
1303 #endif | 1420 #endif |
1304 } | 1421 sr = vsrc.val[NEON_R]; |
1422 sg = vsrc.val[NEON_G]; | |
1423 sb = vsrc.val[NEON_B]; | |
1424 | |
1305 /* XXX: if we want to prefetch, hide it in the above asm() | 1425 /* XXX: if we want to prefetch, hide it in the above asm() |
1306 * using the gcc __builtin_prefetch(), the prefetch will | 1426 * using the gcc __builtin_prefetch(), the prefetch will |
1307 * fall to the bottom of the loop -- it won't stick up | 1427 * fall to the bottom of the loop -- it won't stick up |
1308 * at the top of the loop, just after the vld4. | 1428 * at the top of the loop, just after the vld4. |
1309 */ | 1429 */ |
1310 | 1430 |
1311 // sr = sr - (sr>>5) + d | 1431 // sr = sr - (sr>>5) + d |
1312 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); | 1432 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
1313 dr = vaddl_u8(sr, d); | 1433 dr = vaddl_u8(sr, d); |
1314 | 1434 |
(...skipping 47 matching lines...) | |
1362 SkPMColor c = *src++; | 1482 SkPMColor c = *src++; |
1363 SkPMColorAssert(c); | 1483 SkPMColorAssert(c); |
1364 SkASSERT(SkGetPackedA32(c) == 255); | 1484 SkASSERT(SkGetPackedA32(c) == 255); |
1365 | 1485 |
1366 unsigned dither = DITHER_VALUE(x); | 1486 unsigned dither = DITHER_VALUE(x); |
1367 *dst++ = SkDitherRGB32To565(c, dither); | 1487 *dst++ = SkDitherRGB32To565(c, dither); |
1368 DITHER_INC_X(x); | 1488 DITHER_INC_X(x); |
1369 } while (--count != 0); | 1489 } while (--count != 0); |
1370 } | 1490 } |
1371 } | 1491 } |
1372 #endif | |
1373 | 1492 |
1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, | 1493 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, |
1375 SkPMColor color) { | 1494 SkPMColor color) { |
1376 if (count <= 0) { | 1495 if (count <= 0) { |
1377 return; | 1496 return; |
1378 } | 1497 } |
1379 | 1498 |
1380 if (0 == color) { | 1499 if (0 == color) { |
1381 if (src != dst) { | 1500 if (src != dst) { |
1382 memcpy(dst, src, count * sizeof(SkPMColor)); | 1501 memcpy(dst, src, count * sizeof(SkPMColor)); |
(...skipping 85 matching lines...) | |
1468 *dst = color + SkAlphaMulQ(*src, scale); | 1587 *dst = color + SkAlphaMulQ(*src, scale); |
1469 src += 1; | 1588 src += 1; |
1470 dst += 1; | 1589 dst += 1; |
1471 count--; | 1590 count--; |
1472 } | 1591 } |
1473 } | 1592 } |
1474 | 1593 |
1475 /////////////////////////////////////////////////////////////////////////////// | 1594 /////////////////////////////////////////////////////////////////////////////// |
1476 | 1595 |
1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1596 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
1478 #ifdef SK_CPU_ARM32 | |
1479 // no dither | 1597 // no dither |
1480 S32_D565_Opaque_neon, | 1598 S32_D565_Opaque_neon, |
1481 S32_D565_Blend_neon, | 1599 S32_D565_Blend_neon, |
1600 #ifdef SK_CPU_ARM32 | |
1482 S32A_D565_Opaque_neon, | 1601 S32A_D565_Opaque_neon, |
1602 #else | |
1603 NULL, | |
1604 #endif | |
1483 S32A_D565_Blend_neon, | 1605 S32A_D565_Blend_neon, |
1484 | 1606 |
1485 // dither | 1607 // dither |
1486 S32_D565_Opaque_Dither_neon, | 1608 S32_D565_Opaque_Dither_neon, |
1487 S32_D565_Blend_Dither_neon, | 1609 S32_D565_Blend_Dither_neon, |
1488 S32A_D565_Opaque_Dither_neon, | 1610 S32A_D565_Opaque_Dither_neon, |
1489 NULL, // S32A_D565_Blend_Dither | 1611 NULL, // S32A_D565_Blend_Dither |
1490 #else | |
1491 NULL, NULL, NULL, NULL, | |
1492 NULL, NULL, NULL, NULL | |
1493 #endif | |
1494 }; | 1612 }; |
1495 | 1613 |
1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1614 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
1497 NULL, // S32_Opaque, | 1615 NULL, // S32_Opaque, |
1498 S32_Blend_BlitRow32_neon, // S32_Blend, | 1616 S32_Blend_BlitRow32_neon, // S32_Blend, |
1499 /* | 1617 /* |
1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha | 1618 * We have two choices for S32A_Opaque procs. The one reads the src alpha |
1501 * value and attempts to optimize accordingly. The optimization is | 1619 * value and attempts to optimize accordingly. The optimization is |
1502 * sensitive to the source content and is not a win in all cases. For | 1620 * sensitive to the source content and is not a win in all cases. For |
1503 * example, if there are a lot of transitions between the alpha states, | 1621 * example, if there are a lot of transitions between the alpha states, |
1504 * the performance will almost certainly be worse. However, for many | 1622 * the performance will almost certainly be worse. However, for many |
1505 * common cases the performance is equivalent or better than the standard | 1623 * common cases the performance is equivalent or better than the standard |
1506 * case where we do not inspect the src alpha. | 1624 * case where we do not inspect the src alpha. |
1507 */ | 1625 */ |
1508 #if SK_A32_SHIFT == 24 | 1626 #if SK_A32_SHIFT == 24 |
1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1627 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1628 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1511 #else | 1629 #else |
1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1630 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1513 #endif | 1631 #endif |
1514 #ifdef SK_CPU_ARM32 | 1632 #ifdef SK_CPU_ARM32 |
1515 S32A_Blend_BlitRow32_neon // S32A_Blend | 1633 S32A_Blend_BlitRow32_neon // S32A_Blend |
1516 #else | 1634 #else |
1517 NULL | 1635 NULL |
1518 #endif | 1636 #endif |
1519 }; | 1637 }; |
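The comment above on the two S32A_Opaque choices describes inspecting source alpha per pixel: skip fully transparent pixels, copy fully opaque ones, and fall back to the full src-over blend only in between. A scalar sketch of that shape, not the NEON implementation, assuming SK_A32_SHIFT == 24 (as in the #if above) and premultiplied color as elsewhere in this file:

    #include <stdint.h>

    typedef uint32_t SkPMColor;  /* local stand-in for the Skia type */

    static void s32a_opaque_src_alpha_sketch(SkPMColor *dst,
                                             const SkPMColor *src, int count) {
        for (int i = 0; i < count; i++) {
            uint32_t a = src[i] >> 24;       /* assumes SK_A32_SHIFT == 24 */
            if (a == 0) {
                /* fully transparent: leave dst untouched */
            } else if (a == 255) {
                dst[i] = src[i];             /* fully opaque: plain copy */
            } else {
                /* general src-over: dst = src + dst * (256 - a) >> 8,
                 * applied per byte-pair as in SkAlphaMulQ */
                uint32_t scale = 256 - a;
                uint32_t rb = (((dst[i] & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF;
                uint32_t ag = (((dst[i] >> 8) & 0x00FF00FF) * scale) & 0xFF00FF00;
                dst[i] = src[i] + (rb | ag);
            }
        }
    }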