src/opts/SkSwizzler_opts.h - Issue 1663623002: NEON optimizations for GrayAlpha -> RGBA/BGRA Premul/Unpremul

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1663623002: NEON optimizations for GrayAlpha -> RGBA/BGRA Premul/Unpremul (Closed) Base URL: https://skia.googlesource.com/skia.git@gralpha

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkSwizzler_opts_DEFINED	8 #ifndef SkSwizzler_opts_DEFINED

9 #define SkSwizzler_opts_DEFINED	9 #define SkSwizzler_opts_DEFINED

10	10

(...skipping 80 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
// Expands `count` 8-bit gray samples read from `vsrc` into 32-bit pixels in
// `dst`. Every output pixel carries the gray value in each of its three low
// bytes and 0xFF in its top byte. A non-positive `count` writes nothing.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}

100	100

	// Expands `count` interleaved (gray, alpha) byte pairs read from `vsrc` into
	// 32-bit pixels in `dst`. The gray byte is copied unchanged into each of the
	// three low bytes and the alpha byte goes into the top byte (no
	// premultiplication). A non-positive `count` writes nothing.
	static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
	    const uint8_t* src = (const uint8_t*)vsrc;
	    for (int i = 0; i < count; i++) {
	        uint8_t g = src[0],
	                a = src[1];
	        src += 2;  // Two source bytes are consumed per output pixel.
	        dst[i] = (uint32_t)a << 24
	               | (uint32_t)g << 16
	               | (uint32_t)g <<  8
	               | (uint32_t)g <<  0;
	    }
	}

	113

	// Expands `count` interleaved (gray, alpha) byte pairs read from `vsrc` into
	// premultiplied 32-bit pixels in `dst`. The gray byte is first scaled by
	// alpha with a rounded divide, (g * a + 127) / 255, and the scaled value is
	// then written to each of the three low bytes; alpha goes into the top byte.
	// A non-positive `count` writes nothing.
	static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
	    const uint8_t* src = (const uint8_t*)vsrc;
	    for (int i = 0; i < count; i++) {
	        uint8_t g = src[0],
	                a = src[1];
	        src += 2;  // Two source bytes are consumed per output pixel.
	        // g * a is evaluated in int, so the product cannot overflow; the
	        // quotient is at most 255 and fits back into a byte.
	        g = (uint8_t)((g*a+127)/255);
	        dst[i] = (uint32_t)a << 24
	               | (uint32_t)g << 16
	               | (uint32_t)g <<  8
	               | (uint32_t)g <<  0;
	    }
	}

	127

101 #if defined(SK_ARM_HAS_NEON)	128 #if defined(SK_ARM_HAS_NEON)

102	129

103 // Rounded divide by 255, (x + 127) / 255	130 // Rounded divide by 255, (x + 127) / 255

104 static uint8x8_t div255_round(uint16x8_t x) {	131 static uint8x8_t div255_round(uint16x8_t x) {

105 // result = (x + 127) / 255	132 // result = (x + 127) / 255

106 // result = (x + 127) / 256 + error1	133 // result = (x + 127) / 256 + error1

107 //	134 //

108 // error1 = (x + 127) / (255 * 256)	135 // error1 = (x + 127) / (255 * 256)

109 // error1 = (x + 127) / (256 * 256) + error2	136 // error1 = (x + 127) / (256 * 256) + error2

110 //	137 //

(...skipping 193 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
304 // Store 8 pixels.	331 // Store 8 pixels.

305 vst4_u8((uint8_t*) dst, rgba);	332 vst4_u8((uint8_t*) dst, rgba);

306 src += 8;	333 src += 8;

307 dst += 8;	334 dst += 8;

308 count -= 8;	335 count -= 8;

309 }	336 }

310	337

311 gray_to_RGB1_portable(dst, src, count);	338 gray_to_RGB1_portable(dst, src, count);

312 }	339 }

313	340

	341 template <bool kPremul>

	342 static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {

	343 const uint8_t* src = (const uint8_t*) vsrc;

	344 while (count >= 16) {

	345 // Load 16 pixels.

	346 uint8x16x2_t ga = vld2q_u8(src);
	mtklein 2016/02/03 01:08:15 We sure are getting to use all the vldN / vstN, eh We sure are getting to use all the vldN / vstN, eh? msarett 2016/02/03 14:48:51 Yeah it's fun to have good uses for all the instru Show quoted text On 2016/02/03 01:08:15, mtklein wrote: > We sure are getting to use all the vldN / vstN, eh? Yeah it's fun to have good uses for all the instructions :)
	347

	348 // Premultiply if requested.

	349 if (kPremul) {

	350 ga.val[0] = vcombine_u8(

	351 scale(vget_low_u8(ga.val[0]), vget_low_u8(ga.val[1])),

	352 scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));

	353 }

	354

	355 // Set each of the color channels.

	356 uint8x16x4_t rgba;

	357 rgba.val[0] = ga.val[0];

	358 rgba.val[1] = ga.val[0];

	359 rgba.val[2] = ga.val[0];

	360 rgba.val[3] = ga.val[1];

	361

	362 // Store 16 pixels.

	363 vst4q_u8((uint8_t*) dst, rgba);

	364 src += 16*2;

	365 dst += 16;

	366 count -= 16;

	367 }

	368

	369 if (count >= 8) {

	370 // Load 8 pixels.

	371 uint8x8x2_t ga = vld2_u8(src);

	372

	373 // Premultiply if requested.

	374 if (kPremul) {

	375 ga.val[0] = scale(ga.val[0], ga.val[1]);

	376 }

	377

	378 // Set each of the color channels.

	379 uint8x8x4_t rgba;

	380 rgba.val[0] = ga.val[0];

	381 rgba.val[1] = ga.val[0];

	382 rgba.val[2] = ga.val[0];

	383 rgba.val[3] = ga.val[1];

	384

	385 // Store 8 pixels.

	386 vst4_u8((uint8_t*) dst, rgba);

	387 src += 8*2;

	388 dst += 8;

	389 count -= 8;

	390 }

	391

	392 auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;

	393 proc(dst, src, count);

	394 }

	395

	396 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {

	397 expand_grayA<false>(dst, src, count);

	398 }

	399

	400 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {

	401 expand_grayA<true>(dst, src, count);

	402 }

	403

314 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3	404 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

315	405

316 template <bool kSwapRB>	406 template <bool kSwapRB>

317 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {	407 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {

318 auto src = (const uint32_t*)vsrc;	408 auto src = (const uint32_t*)vsrc;

319	409

320 auto premul8 = [](__m128i* lo, __m128i* hi) {	410 auto premul8 = [](__m128i* lo, __m128i* hi) {

321 const __m128i zeros = _mm_setzero_si128();	411 const __m128i zeros = _mm_setzero_si128();

322 const __m128i _128 = _mm_set1_epi16(128);	412 const __m128i _128 = _mm_set1_epi16(128);

323 const __m128i _257 = _mm_set1_epi16(257);	413 const __m128i _257 = _mm_set1_epi16(257);

(...skipping 151 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
475 _mm_storeu_si128((__m128i*) (dst + 12), ggga3);	565 _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

476	566

477 src += 16;	567 src += 16;

478 dst += 16;	568 dst += 16;

479 count -= 16;	569 count -= 16;

480 }	570 }

481	571

482 gray_to_RGB1_portable(dst, src, count);	572 gray_to_RGB1_portable(dst, src, count);

483 }	573 }

484	574

	575 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {

	576 grayA_to_RGBA_portable(dst, src, count);

	577 }

	578

	579 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {

	580 grayA_to_rgbA_portable(dst, src, count);

	581 }

	582

485 #else	583 #else

486	584

487 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {	585 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {

488 RGBA_to_rgbA_portable(dst, src, count);	586 RGBA_to_rgbA_portable(dst, src, count);

489 }	587 }

490	588

491 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {	589 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {

492 RGBA_to_bgrA_portable(dst, src, count);	590 RGBA_to_bgrA_portable(dst, src, count);

493 }	591 }

494	592

495 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {	593 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {

496 RGBA_to_BGRA_portable(dst, src, count);	594 RGBA_to_BGRA_portable(dst, src, count);

497 }	595 }

498	596

499 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {	597 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {

500 RGB_to_RGB1_portable(dst, src, count);	598 RGB_to_RGB1_portable(dst, src, count);

501 }	599 }

502	600

503 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {	601 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {

504 RGB_to_BGR1_portable(dst, src, count);	602 RGB_to_BGR1_portable(dst, src, count);

505 }	603 }

506	604

507 static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {	605 static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {

508 gray_to_RGB1_portable(dst, src, count);	606 gray_to_RGB1_portable(dst, src, count);

509 }	607 }

510	608

	609 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {

	610 grayA_to_RGBA_portable(dst, src, count);

	611 }

	612

	613 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {

	614 grayA_to_rgbA_portable(dst, src, count);

	615 }

	616

511 #endif	617 #endif

512	618

513 }	619 }

514	620

515 #endif // SkSwizzler_opts_DEFINED	621 #endif // SkSwizzler_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »