src/opts/SkSwizzler_opts.h - Issue 1676773003: Optimize CMYK->RGBA (BGRA) transform for jpeg decodes

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1676773003: Optimize CMYK->RGBA (BGRA) transform for jpeg decodes (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkSwizzler_opts_DEFINED	8 #ifndef SkSwizzler_opts_DEFINED

9 #define SkSwizzler_opts_DEFINED	9 #define SkSwizzler_opts_DEFINED

10	10

(...skipping 107 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
118 a = src[1];	118 a = src[1];

119 src += 2;	119 src += 2;

120 g = (g*a+127)/255;	120 g = (g*a+127)/255;

121 dst[i] = (uint32_t)a << 24	121 dst[i] = (uint32_t)a << 24

122 \| (uint32_t)g << 16	122 \| (uint32_t)g << 16

123 \| (uint32_t)g << 8	123 \| (uint32_t)g << 8

124 \| (uint32_t)g << 0;	124 \| (uint32_t)g << 0;

125 }	125 }

126 }	126 }

127	127

	128 static void CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {

	129 const uint32_t* src = (const uint32_t*)vsrc;

	130 for (int i = 0; i < count; i++) {

	131 uint8_t k = src[i] >> 24,

	132 y = src[i] >> 16,

	133 m = src[i] >> 8,

	134 c = src[i] >> 0;

	135 uint8_t b = (y*k+127)/255,
	scroggo 2016/02/08 14:59:43: This pattern appears a lot. Should it be a macro?
	mtklein 2016/02/08 15:08:47 (in reply to scroggo): Don't think so. When we put it in a macro, people are tempted to change the macro to be faster and less correct.
	136 g = (m*k+127)/255,

	137 r = (c*k+127)/255;
	mtklein 2016/02/08 15:14:26: This is really the math? Having never seen it before, this seems surprising. I'd think r goes inversely in proportion to c and k, g inversely to m and k, b inversely to y and k, etc. Is it not r == ((255-c)(255-k)+127)/255, etc?
	msarett 2016/02/08 15:23:08 (in reply to mtklein): The short answer is that libjpeg-turbo actually outputs inverted CMYK. Maybe these functions should be renamed? Or maybe the below explanation (from SkSwizzler.cpp) belongs here?
	    CMYK is stored as four bytes per pixel. We will implement a crude conversion from CMYK -> RGB using formulas from http://easyrgb.com.
	    CMYK -> CMY
	        C = C * (1 - K) + K
	        M = M * (1 - K) + K
	        Y = Y * (1 - K) + K
	    libjpeg actually gives us inverted CMYK, so we must subtract the original terms from 1.
	    CMYK -> CMY
	        C = (1 - C) * (1 - (1 - K)) + (1 - K)
	        M = (1 - M) * (1 - (1 - K)) + (1 - K)
	        Y = (1 - Y) * (1 - (1 - K)) + (1 - K)
	    Simplifying the above expression.
	    CMYK -> CMY
	        C = 1 - CK
	        M = 1 - MK
	        Y = 1 - YK
	    CMY -> RGB
	        R = (1 - C) * 255
	        G = (1 - M) * 255
	        B = (1 - Y) * 255
	    Therefore the full conversion is below. This can be verified at http://www.rapidtables.com (assuming inverted CMYK).
	    CMYK -> RGB
	        R = C * K * 255
	        G = M * K * 255
	        B = Y * K * 255
	    As a final note, we have treated the CMYK values as if they were on a scale from 0-1, when in fact they are 8-bit ints scaling from 0-255. We must divide each CMYK component by 255 to obtain the true conversion we should perform.
	    CMYK -> RGB
	        R = C * K / 255
	        G = M * K / 255
	        B = Y * K / 255
	mtklein 2016/02/08 16:47:15 (in reply to msarett): Let's go with a name change (inverted_CMYK_to_...) and a pointer to that explanation.
	msarett 2016/02/08 17:22:42 (in reply to mtklein): Done.
	138 dst[i] = (uint32_t)0xFF << 24

	139 \| (uint32_t) b << 16

	140 \| (uint32_t) g << 8

	141 \| (uint32_t) r << 0;

	142 }

	143 }

	144

	145 static void CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {

	146 const uint32_t* src = (const uint32_t*)vsrc;

	147 for (int i = 0; i < count; i++) {

	148 uint8_t k = src[i] >> 24,

	149 y = src[i] >> 16,

	150 m = src[i] >> 8,

	151 c = src[i] >> 0;

	152 uint8_t b = (y*k+127)/255,

	153 g = (m*k+127)/255,

	154 r = (c*k+127)/255;

	155 dst[i] = (uint32_t)0xFF << 24

	156 \| (uint32_t) r << 16

	157 \| (uint32_t) g << 8

	158 \| (uint32_t) b << 0;

	159 }

	160 }

	161

128 #if defined(SK_ARM_HAS_NEON)	162 #if defined(SK_ARM_HAS_NEON)

129	163

130 // Rounded divide by 255, (x + 127) / 255	164 // Rounded divide by 255, (x + 127) / 255

131 static uint8x8_t div255_round(uint16x8_t x) {	165 static uint8x8_t div255_round(uint16x8_t x) {

132 // result = (x + 127) / 255	166 // result = (x + 127) / 255

133 // result = (x + 127) / 256 + error1	167 // result = (x + 127) / 256 + error1

134 //	168 //

135 // error1 = (x + 127) / (255 * 256)	169 // error1 = (x + 127) / (255 * 256)

136 // error1 = (x + 127) / (256 * 256) + error2	170 // error1 = (x + 127) / (256 * 256) + error2

137 //	171 //

(...skipping 256 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
394 }	428 }

395	429

396 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {	430 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {

397 expand_grayA<false>(dst, src, count);	431 expand_grayA<false>(dst, src, count);

398 }	432 }

399	433

400 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {	434 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {

401 expand_grayA<true>(dst, src, count);	435 expand_grayA<true>(dst, src, count);

402 }	436 }

403	437

	438 template <bool kSwapRB>

	439 static void cmyk_should_swapRB(uint32_t* dst, const void* vsrc, int count) {

	440 auto src = (const uint32_t*)vsrc;

	441 while (count >= 8) {

	442 // Load 8 cmyk pixels.

	443 uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

	444

	445 uint8x8_t k = pixels.val[3],

	446 y = pixels.val[2],

	447 m = pixels.val[1],

	448 c = pixels.val[0];

	449

	450 // Scale to r, g, b.

	451 uint8x8_t b = scale(y, k);

	452 uint8x8_t g = scale(m, k);

	453 uint8x8_t r = scale(c, k);

	454

	455 // Store 8 rgba pixels.

	456 if (kSwapRB) {

	457 pixels.val[3] = vdup_n_u8(0xFF);

	458 pixels.val[2] = r;

	459 pixels.val[1] = g;

	460 pixels.val[0] = b;

	461 } else {

	462 pixels.val[3] = vdup_n_u8(0xFF);

	463 pixels.val[2] = b;

	464 pixels.val[1] = g;

	465 pixels.val[0] = r;

	466 }

	467 vst4_u8((uint8_t*) dst, pixels);

	468 src += 8;

	469 dst += 8;

	470 count -= 8;

	471 }

	472

	473 auto proc = kSwapRB ? CMYK_to_BGR1_portable : CMYK_to_RGB1_portable;

	474 proc(dst, src, count);

	475 }

	476

	477 static void CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {

	478 cmyk_should_swapRB<false>(dst, src, count);

	479 }

	480

	481 static void CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {

	482 cmyk_should_swapRB<true>(dst, src, count);

	483 }

	484

404 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3	485 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

405	486

406 // Scale a byte by another.	487 // Scale a byte by another.

407 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.	488 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.

408 static __m128i scale(__m128i x, __m128i y) {	489 static __m128i scale(__m128i x, __m128i y) {

409 const __m128i _128 = _mm_set1_epi16(128);	490 const __m128i _128 = _mm_set1_epi16(128);

410 const __m128i _257 = _mm_set1_epi16(257);	491 const __m128i _257 = _mm_set1_epi16(257);

411	492

412 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.	493 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.

413 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);	494 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);

(...skipping 210 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
624 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);	705 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

625	706

626 src += 8*2;	707 src += 8*2;

627 dst += 8;	708 dst += 8;

628 count -= 8;	709 count -= 8;

629 }	710 }

630	711

631 grayA_to_rgbA_portable(dst, src, count);	712 grayA_to_rgbA_portable(dst, src, count);

632 }	713 }

633	714

	715 template <bool kSwapRB>

	716 static void cmyk_should_swapRB(uint32_t* dst, const void* vsrc, int count) {

	717 auto src = (const uint32_t*)vsrc;

	718

	719 auto convert8 = [](__m128i* lo, __m128i* hi) {

	720 const __m128i zeros = _mm_setzero_si128();

	721 __m128i planar;

	722 if (kSwapRB) {

	723 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);

	724 } else {

	725 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);

	726 }

	727

	728 // Swizzle the pixels to 8-bit planar.

	729 *lo = _mm_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk

	730 *hi = _mm_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK

	731 __m128i cm = _mm_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM

	732 yk = _mm_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK

	733

	734 // Unpack to 16-bit planar.

	735 __m128i c = _mm_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_

	736 m = _mm_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_

	737 y = _mm_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_

	738 k = _mm_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_

	739

	740 // Scale to r, g, b.

	741 __m128i r = scale(c, k),

	742 g = scale(m, k),

	743 b = scale(y, k);

	744

	745 // Repack into interlaced pixels.

	746 __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)), // rgrgrgrg RGRGRGRG

	747 ba = _mm_or_si128(b, _mm_set1_epi16(0xFF00)); // b1b1b1b1 B1B1B1B1

	748 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba

	749 *hi = _mm_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1

	750 };

	751

	752 while (count >= 8) {

	753 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),

	754 hi = _mm_loadu_si128((const __m128i*) (src + 4));

	755

	756 convert8(&lo, &hi);

	757

	758 _mm_storeu_si128((__m128i*) (dst + 0), lo);

	759 _mm_storeu_si128((__m128i*) (dst + 4), hi);

	760

	761 src += 8;

	762 dst += 8;

	763 count -= 8;

	764 }

	765

	766 if (count >= 4) {

	767 __m128i lo = _mm_loadu_si128((const __m128i*) src),

	768 hi = _mm_setzero_si128();

	769

	770 convert8(&lo, &hi);

	771

	772 _mm_storeu_si128((__m128i*) dst, lo);

	773

	774 src += 4;

	775 dst += 4;

	776 count -= 4;

	777 }

	778

	779 auto proc = kSwapRB ? CMYK_to_BGR1_portable : CMYK_to_RGB1_portable;

	780 proc(dst, src, count);

	781 }

	782

	783 static void CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {

	784 cmyk_should_swapRB<false>(dst, src, count);

	785 }

	786

	787 static void CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {

	788 cmyk_should_swapRB<true>(dst, src, count);

	789 }

	790

634 #else	791 #else

635	792

636 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {	793 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {

637 RGBA_to_rgbA_portable(dst, src, count);	794 RGBA_to_rgbA_portable(dst, src, count);

638 }	795 }

639	796

640 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {	797 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {

641 RGBA_to_bgrA_portable(dst, src, count);	798 RGBA_to_bgrA_portable(dst, src, count);

642 }	799 }

643	800

(...skipping 14 matching lines...) Expand all Loading...
658 }	815 }

659	816

660 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {	817 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {

661 grayA_to_RGBA_portable(dst, src, count);	818 grayA_to_RGBA_portable(dst, src, count);

662 }	819 }

663	820

664 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {	821 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {

665 grayA_to_rgbA_portable(dst, src, count);	822 grayA_to_rgbA_portable(dst, src, count);

666 }	823 }

667	824

	825 static void CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {

	826 CMYK_to_RGB1_portable(dst, src, count);

	827 }

	828

	829 static void CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {

	830 CMYK_to_BGR1_portable(dst, src, count);

	831 }

	832

668 #endif	833 #endif

669	834

670 }	835 }

671	836

672 #endif // SkSwizzler_opts_DEFINED	837 #endif // SkSwizzler_opts_DEFINED

OLD	NEW

« src/core/SkOpts.h ('K') | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »