Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4)

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1676773003: Optimize CMYK->RGBA (BGRA) transform for jpeg decodes (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« src/core/SkOpts.h ('K') | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkSwizzler_opts_DEFINED 8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED 9 #define SkSwizzler_opts_DEFINED
10 10
(...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after
118 a = src[1]; 118 a = src[1];
119 src += 2; 119 src += 2;
120 g = (g*a+127)/255; 120 g = (g*a+127)/255;
121 dst[i] = (uint32_t)a << 24 121 dst[i] = (uint32_t)a << 24
122 | (uint32_t)g << 16 122 | (uint32_t)g << 16
123 | (uint32_t)g << 8 123 | (uint32_t)g << 8
124 | (uint32_t)g << 0; 124 | (uint32_t)g << 0;
125 } 125 }
126 } 126 }
127 127
128 static void CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
129 const uint32_t* src = (const uint32_t*)vsrc;
130 for (int i = 0; i < count; i++) {
131 uint8_t k = src[i] >> 24,
132 y = src[i] >> 16,
133 m = src[i] >> 8,
134 c = src[i] >> 0;
135 uint8_t b = (y*k+127)/255,
scroggo 2016/02/08 14:59:43 This pattern appears a lot. Should it be a macro?
mtklein 2016/02/08 15:08:47 Don't think so. When we put it in a macro, people
136 g = (m*k+127)/255,
137 r = (c*k+127)/255;
mtklein 2016/02/08 15:14:26 This is really the math? Having never seen it bef
msarett 2016/02/08 15:23:08 The short answer is that libjpeg-turbo actually ou
mtklein 2016/02/08 16:47:15 Let's go with a name change (inverted_CMYK_to_...)
msarett 2016/02/08 17:22:42 Done.
138 dst[i] = (uint32_t)0xFF << 24
139 | (uint32_t) b << 16
140 | (uint32_t) g << 8
141 | (uint32_t) r << 0;
142 }
143 }
144
145 static void CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
146 const uint32_t* src = (const uint32_t*)vsrc;
147 for (int i = 0; i < count; i++) {
148 uint8_t k = src[i] >> 24,
149 y = src[i] >> 16,
150 m = src[i] >> 8,
151 c = src[i] >> 0;
152 uint8_t b = (y*k+127)/255,
153 g = (m*k+127)/255,
154 r = (c*k+127)/255;
155 dst[i] = (uint32_t)0xFF << 24
156 | (uint32_t) r << 16
157 | (uint32_t) g << 8
158 | (uint32_t) b << 0;
159 }
160 }
161
128 #if defined(SK_ARM_HAS_NEON) 162 #if defined(SK_ARM_HAS_NEON)
129 163
130 // Rounded divide by 255, (x + 127) / 255 164 // Rounded divide by 255, (x + 127) / 255
131 static uint8x8_t div255_round(uint16x8_t x) { 165 static uint8x8_t div255_round(uint16x8_t x) {
132 // result = (x + 127) / 255 166 // result = (x + 127) / 255
133 // result = (x + 127) / 256 + error1 167 // result = (x + 127) / 256 + error1
134 // 168 //
135 // error1 = (x + 127) / (255 * 256) 169 // error1 = (x + 127) / (255 * 256)
136 // error1 = (x + 127) / (256 * 256) + error2 170 // error1 = (x + 127) / (256 * 256) + error2
137 // 171 //
(...skipping 256 matching lines...) Expand 10 before | Expand all | Expand 10 after
394 } 428 }
395 429
396 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { 430 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
397 expand_grayA<false>(dst, src, count); 431 expand_grayA<false>(dst, src, count);
398 } 432 }
399 433
400 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { 434 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
401 expand_grayA<true>(dst, src, count); 435 expand_grayA<true>(dst, src, count);
402 } 436 }
403 437
438 template <bool kSwapRB>
439 static void cmyk_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
440 auto src = (const uint32_t*)vsrc;
441 while (count >= 8) {
442 // Load 8 cmyk pixels.
443 uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
444
445 uint8x8_t k = pixels.val[3],
446 y = pixels.val[2],
447 m = pixels.val[1],
448 c = pixels.val[0];
449
450 // Scale to r, g, b.
451 uint8x8_t b = scale(y, k);
452 uint8x8_t g = scale(m, k);
453 uint8x8_t r = scale(c, k);
454
455 // Store 8 rgba pixels.
456 if (kSwapRB) {
457 pixels.val[3] = vdup_n_u8(0xFF);
458 pixels.val[2] = r;
459 pixels.val[1] = g;
460 pixels.val[0] = b;
461 } else {
462 pixels.val[3] = vdup_n_u8(0xFF);
463 pixels.val[2] = b;
464 pixels.val[1] = g;
465 pixels.val[0] = r;
466 }
467 vst4_u8((uint8_t*) dst, pixels);
468 src += 8;
469 dst += 8;
470 count -= 8;
471 }
472
473 auto proc = kSwapRB ? CMYK_to_BGR1_portable : CMYK_to_RGB1_portable;
474 proc(dst, src, count);
475 }
476
477 static void CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
478 cmyk_should_swapRB<false>(dst, src, count);
479 }
480
481 static void CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
482 cmyk_should_swapRB<true>(dst, src, count);
483 }
484
404 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 485 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
405 486
406 // Scale a byte by another. 487 // Scale a byte by another.
407 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 488 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
408 static __m128i scale(__m128i x, __m128i y) { 489 static __m128i scale(__m128i x, __m128i y) {
409 const __m128i _128 = _mm_set1_epi16(128); 490 const __m128i _128 = _mm_set1_epi16(128);
410 const __m128i _257 = _mm_set1_epi16(257); 491 const __m128i _257 = _mm_set1_epi16(257);
411 492
412 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. 493 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
413 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); 494 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after
624 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); 705 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
625 706
626 src += 8*2; 707 src += 8*2;
627 dst += 8; 708 dst += 8;
628 count -= 8; 709 count -= 8;
629 } 710 }
630 711
631 grayA_to_rgbA_portable(dst, src, count); 712 grayA_to_rgbA_portable(dst, src, count);
632 } 713 }
633 714
715 template <bool kSwapRB>
716 static void cmyk_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
717 auto src = (const uint32_t*)vsrc;
718
719 auto convert8 = [](__m128i* lo, __m128i* hi) {
720 const __m128i zeros = _mm_setzero_si128();
721 __m128i planar;
722 if (kSwapRB) {
723 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
724 } else {
725 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
726 }
727
728 // Swizzle the pixels to 8-bit planar.
729 *lo = _mm_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk
730 *hi = _mm_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK
731 __m128i cm = _mm_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM
732 yk = _mm_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK
733
734 // Unpack to 16-bit planar.
735 __m128i c = _mm_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_
736 m = _mm_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_
737 y = _mm_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_
738 k = _mm_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_
739
740 // Scale to r, g, b.
741 __m128i r = scale(c, k),
742 g = scale(m, k),
743 b = scale(y, k);
744
745 // Repack into interlaced pixels.
746 __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)), // rgrgrgrg RGRGRGRG
747 ba = _mm_or_si128(b, _mm_set1_epi16(0xFF00)); // b1b1b1b1 B1B1B1B1
748 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba
749 *hi = _mm_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1
750 };
751
752 while (count >= 8) {
753 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
754 hi = _mm_loadu_si128((const __m128i*) (src + 4));
755
756 convert8(&lo, &hi);
757
758 _mm_storeu_si128((__m128i*) (dst + 0), lo);
759 _mm_storeu_si128((__m128i*) (dst + 4), hi);
760
761 src += 8;
762 dst += 8;
763 count -= 8;
764 }
765
766 if (count >= 4) {
767 __m128i lo = _mm_loadu_si128((const __m128i*) src),
768 hi = _mm_setzero_si128();
769
770 convert8(&lo, &hi);
771
772 _mm_storeu_si128((__m128i*) dst, lo);
773
774 src += 4;
775 dst += 4;
776 count -= 4;
777 }
778
779 auto proc = kSwapRB ? CMYK_to_BGR1_portable : CMYK_to_RGB1_portable;
780 proc(dst, src, count);
781 }
782
783 static void CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
784 cmyk_should_swapRB<false>(dst, src, count);
785 }
786
787 static void CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
788 cmyk_should_swapRB<true>(dst, src, count);
789 }
790
634 #else 791 #else
635 792
636 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { 793 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
637 RGBA_to_rgbA_portable(dst, src, count); 794 RGBA_to_rgbA_portable(dst, src, count);
638 } 795 }
639 796
640 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { 797 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
641 RGBA_to_bgrA_portable(dst, src, count); 798 RGBA_to_bgrA_portable(dst, src, count);
642 } 799 }
643 800
(...skipping 14 matching lines...) Expand all
658 } 815 }
659 816
660 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { 817 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
661 grayA_to_RGBA_portable(dst, src, count); 818 grayA_to_RGBA_portable(dst, src, count);
662 } 819 }
663 820
664 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { 821 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
665 grayA_to_rgbA_portable(dst, src, count); 822 grayA_to_rgbA_portable(dst, src, count);
666 } 823 }
667 824
825 static void CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
826 CMYK_to_RGB1_portable(dst, src, count);
827 }
828
829 static void CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
830 CMYK_to_BGR1_portable(dst, src, count);
831 }
832
668 #endif 833 #endif
669 834
670 } 835 }
671 836
672 #endif // SkSwizzler_opts_DEFINED 837 #endif // SkSwizzler_opts_DEFINED
OLD | NEW
« src/core/SkOpts.h ('K') | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698