Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(186)

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1680743005: NEON Optimized RGBA->PMColor sampling in SkSwizzler (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Share implementations for sampleSize 4 and 8 Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkSwizzler_opts_DEFINED 8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED 9 #define SkSwizzler_opts_DEFINED
10 10
11 #include "SkColorPriv.h" 11 #include "SkColorPriv.h"
12 12
13 namespace SK_OPTS_NS { 13 namespace SK_OPTS_NS {
14 14
15 template <int kSampleSize>
15 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) { 16 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
16 auto src = (const uint32_t*)vsrc; 17 auto src = (const uint32_t*)vsrc;
18 int j = 0;
17 for (int i = 0; i < count; i++) { 19 for (int i = 0; i < count; i++) {
18 uint8_t a = src[i] >> 24, 20 uint8_t a = src[j] >> 24,
19 b = src[i] >> 16, 21 b = src[j] >> 16,
20 g = src[i] >> 8, 22 g = src[j] >> 8,
21 r = src[i] >> 0; 23 r = src[j] >> 0;
24 j += kSampleSize;
22 b = (b*a+127)/255; 25 b = (b*a+127)/255;
23 g = (g*a+127)/255; 26 g = (g*a+127)/255;
24 r = (r*a+127)/255; 27 r = (r*a+127)/255;
25 dst[i] = (uint32_t)a << 24 28 dst[i] = (uint32_t)a << 24
26 | (uint32_t)b << 16 29 | (uint32_t)b << 16
27 | (uint32_t)g << 8 30 | (uint32_t)g << 8
28 | (uint32_t)r << 0; 31 | (uint32_t)r << 0;
29 } 32 }
30 } 33 }
31 34
35 template <int kSampleSize>
32 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) { 36 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
33 auto src = (const uint32_t*)vsrc; 37 auto src = (const uint32_t*)vsrc;
38 int j = 0;
34 for (int i = 0; i < count; i++) { 39 for (int i = 0; i < count; i++) {
35 uint8_t a = src[i] >> 24, 40 uint8_t a = src[j] >> 24,
36 b = src[i] >> 16, 41 b = src[j] >> 16,
37 g = src[i] >> 8, 42 g = src[j] >> 8,
38 r = src[i] >> 0; 43 r = src[j] >> 0;
44 j += kSampleSize;
39 b = (b*a+127)/255; 45 b = (b*a+127)/255;
40 g = (g*a+127)/255; 46 g = (g*a+127)/255;
41 r = (r*a+127)/255; 47 r = (r*a+127)/255;
42 dst[i] = (uint32_t)a << 24 48 dst[i] = (uint32_t)a << 24
43 | (uint32_t)r << 16 49 | (uint32_t)r << 16
44 | (uint32_t)g << 8 50 | (uint32_t)g << 8
45 | (uint32_t)b << 0; 51 | (uint32_t)b << 0;
46 } 52 }
47 } 53 }
48 54
(...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after
218 rgba.val[1] = g; 224 rgba.val[1] = g;
219 rgba.val[0] = r; 225 rgba.val[0] = r;
220 } 226 }
221 vst4_u8((uint8_t*) dst, rgba); 227 vst4_u8((uint8_t*) dst, rgba);
222 src += 8; 228 src += 8;
223 dst += 8; 229 dst += 8;
224 count -= 8; 230 count -= 8;
225 } 231 }
226 232
227 // Call portable code to finish up the tail of [0,8) pixels. 233 // Call portable code to finish up the tail of [0,8) pixels.
228 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 234 auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>;
229 proc(dst, src, count); 235 proc(dst, src, count);
230 } 236 }
231 237
232 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { 238 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
233 premul_should_swapRB<false>(dst, src, count); 239 premul_should_swapRB<false>(dst, src, count);
234 } 240 }
235 241
236 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { 242 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
237 premul_should_swapRB<true>(dst, src, count); 243 premul_should_swapRB<true>(dst, src, count);
238 } 244 }
(...skipping 238 matching lines...) Expand 10 before | Expand all | Expand 10 after
477 } 483 }
478 484
479 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { 485 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
480 inverted_cmyk_to<kRGB1>(dst, src, count); 486 inverted_cmyk_to<kRGB1>(dst, src, count);
481 } 487 }
482 488
483 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { 489 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
484 inverted_cmyk_to<kBGR1>(dst, src, count); 490 inverted_cmyk_to<kBGR1>(dst, src, count);
485 } 491 }
486 492
493 static void load_rgba_sample2(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b,
494 uint8x8_t* a) {
495 // Load 16 pixels.
496 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
497
498 // Choose 8 pixels.
499 // pxpxpxpxpxpxpxpx -> pppppppp
500 *r = vmovn_u16(vreinterpretq_u16_u8(rgba.val[0]));
501 *g = vmovn_u16(vreinterpretq_u16_u8(rgba.val[1]));
502 *b = vmovn_u16(vreinterpretq_u16_u8(rgba.val[2]));
503 *a = vmovn_u16(vreinterpretq_u16_u8(rgba.val[3]));
504 }
505
506 template <int kSampleSize>
507 static void load_rgba_sample(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b,
508 uint8x8_t* a) {
509 uint8x8_t rgba0 = vld1_u8((const uint8_t*) src); // rgba xxxx
510 uint8x8_t rgba1 = vld1_u8((const uint8_t*) src + kSampleSize); // rgba xxxx
511 uint8x8_t rgba01 = vzip_u8(rgba0, rgba1).val[0]; // rrgg bbaa
512
513 uint8x8_t rgba2 = vld1_u8((const uint8_t*) src + 2*kSampleSize); // rgba xxxx
514 uint8x8_t rgba3 = vld1_u8((const uint8_t*) src + 3*kSampleSize); // rgba xxxx
515 uint8x8_t rgba23 = vzip_u8(rgba2, rgba3).val[0]; // rrgg bbaa
516
517 uint16x4x2_t rgba03 = vzip_u16(vreinterpret_u16_u8(rgba01), // rrrr gggg
518 vreinterpret_u16_u8(rgba23)); // bbbb aaaa
519
520 uint8x8_t rgba4 = vld1_u8((const uint8_t*) src + 4*kSampleSize); // rgba xxxx
521 uint8x8_t rgba5 = vld1_u8((const uint8_t*) src + 5*kSampleSize); // rgba xxxx
522 uint8x8_t rgba45 = vzip_u8(rgba4, rgba5).val[0]; // rrgg bbaa
523
524 uint8x8_t rgba6 = vld1_u8((const uint8_t*) src + 6*kSampleSize); // rgba xxxx
525 uint8x8_t rgba7 = vld1_u8((const uint8_t*) src + 7*kSampleSize); // rgba xxxx
526 uint8x8_t rgba67 = vzip_u8(rgba6, rgba7).val[0]; // rrgg bbaa
527
528 uint16x4x2_t rgba47 = vzip_u16(vreinterpret_u16_u8(rgba45), // rrrr gggg
529 vreinterpret_u16_u8(rgba67)); // bbbb aaaa
530
531 uint32x2x2_t rg = vzip_u32(vreinterpret_u32_u16(rgba03.val[0]), // rrrr rrrr
532 vreinterpret_u32_u16(rgba47.val[0])); // gggg gggg
533 uint32x2x2_t ba = vzip_u32(vreinterpret_u32_u16(rgba03.val[1]), // bbbb bbbb
534 vreinterpret_u32_u16(rgba47.val[1])); // aaaa aaaa
535
536 *r = vreinterpret_u8_u32(rg.val[0]);
537 *g = vreinterpret_u8_u32(rg.val[1]);
538 *b = vreinterpret_u8_u32(ba.val[0]);
539 *a = vreinterpret_u8_u32(ba.val[1]);
540 }
541
542 template <bool kSwapRB, int kSampleSize>
543 static void premul_should_swapRB_sample(uint32_t* dst, const void* vsrc, int count) {
544 auto src = (const uint32_t*)vsrc;
545
546 // We must use 9 as the limit to be sure that we don't read past the end of our memory.
547 while (count >= 9) {
548 // Load pixels.
549 uint8x8_t r, g, b, a;
550 if (2 == kSampleSize) {
551 load_rgba_sample2(src, &r, &g, &b, &a);
552 } else {
553 load_rgba_sample<kSampleSize>(src, &r, &g, &b, &a);
554 }
555
556 // Premultiply.
557 r = scale(r, a);
558 g = scale(g, a);
559 b = scale(b, a);
560
561 // Store 8 premultiplied pixels.
562 uint8x8x4_t result;
563 if (kSwapRB) {
564 result.val[0] = b;
565 result.val[1] = g;
566 result.val[2] = r;
567 result.val[3] = a;
568 } else {
569 result.val[0] = r;
570 result.val[1] = g;
571 result.val[2] = b;
572 result.val[3] = a;
573 }
574 vst4_u8((uint8_t*) dst, result);
575 src += 8*kSampleSize;
576 dst += 8;
577 count -= 8;
578 }
579
580 auto proc = kSwapRB ? RGBA_to_bgrA_portable<kSampleSize> : RGBA_to_rgbA_portable<kSampleSize>;
581 proc(dst, src, count);
582 }
583
584 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
585 premul_should_swapRB_sample<false, 2>(dst, src, count);
586 }
587
588 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
589 premul_should_swapRB_sample<true, 2>(dst, src, count);
590 }
591
592 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
593 premul_should_swapRB_sample<false, 4>(dst, src, count);
594 }
595
596 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
597 premul_should_swapRB_sample<true, 4>(dst, src, count);
598 }
599
600 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
601 premul_should_swapRB_sample<false, 8>(dst, src, count);
602 }
603
604 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
605 premul_should_swapRB_sample<true, 8>(dst, src, count);
606 }
607
487 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 608 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
488 609
489 // Scale a byte by another. 610 // Scale a byte by another.
490 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 611 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
491 static __m128i scale(__m128i x, __m128i y) { 612 static __m128i scale(__m128i x, __m128i y) {
492 const __m128i _128 = _mm_set1_epi16(128); 613 const __m128i _128 = _mm_set1_epi16(128);
493 const __m128i _257 = _mm_set1_epi16(257); 614 const __m128i _257 = _mm_set1_epi16(257);
494 615
495 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. 616 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
496 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); 617 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
554 premul8(&lo, &hi); 675 premul8(&lo, &hi);
555 676
556 _mm_storeu_si128((__m128i*) dst, lo); 677 _mm_storeu_si128((__m128i*) dst, lo);
557 678
558 src += 4; 679 src += 4;
559 dst += 4; 680 dst += 4;
560 count -= 4; 681 count -= 4;
561 } 682 }
562 683
563 // Call portable code to finish up the tail of [0,4) pixels. 684 // Call portable code to finish up the tail of [0,4) pixels.
564 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 685 auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>;
565 proc(dst, src, count); 686 proc(dst, src, count);
566 } 687 }
567 688
568 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { 689 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
569 premul_should_swapRB<false>(dst, src, count); 690 premul_should_swapRB<false>(dst, src, count);
570 } 691 }
571 692
572 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { 693 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
573 premul_should_swapRB<true>(dst, src, count); 694 premul_should_swapRB<true>(dst, src, count);
574 } 695 }
(...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after
784 } 905 }
785 906
786 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { 907 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
787 inverted_cmyk_to<kRGB1>(dst, src, count); 908 inverted_cmyk_to<kRGB1>(dst, src, count);
788 } 909 }
789 910
790 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { 911 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
791 inverted_cmyk_to<kBGR1>(dst, src, count); 912 inverted_cmyk_to<kBGR1>(dst, src, count);
792 } 913 }
793 914
915 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
916 RGBA_to_rgbA_portable<2>(dst, src, count);
917 }
918
919 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
920 RGBA_to_bgrA_portable<2>(dst, src, count);
921 }
922
923 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
924 RGBA_to_rgbA_portable<4>(dst, src, count);
925 }
926
927 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
928 RGBA_to_bgrA_portable<4>(dst, src, count);
929 }
930
931 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
932 RGBA_to_rgbA_portable<8>(dst, src, count);
933 }
934
935 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
936 RGBA_to_bgrA_portable<8>(dst, src, count);
937 }
938
794 #else 939 #else
795 940
796 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { 941 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
797 RGBA_to_rgbA_portable(dst, src, count); 942 RGBA_to_rgbA_portable<1>(dst, src, count);
798 } 943 }
799 944
800 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { 945 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
801 RGBA_to_bgrA_portable(dst, src, count); 946 RGBA_to_bgrA_portable<1>(dst, src, count);
802 } 947 }
803 948
804 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { 949 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
805 RGBA_to_BGRA_portable(dst, src, count); 950 RGBA_to_BGRA_portable(dst, src, count);
806 } 951 }
807 952
808 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { 953 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
809 RGB_to_RGB1_portable(dst, src, count); 954 RGB_to_RGB1_portable(dst, src, count);
810 } 955 }
811 956
(...skipping 14 matching lines...) Expand all
826 } 971 }
827 972
828 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { 973 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
829 inverted_CMYK_to_RGB1_portable(dst, src, count); 974 inverted_CMYK_to_RGB1_portable(dst, src, count);
830 } 975 }
831 976
832 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { 977 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
833 inverted_CMYK_to_BGR1_portable(dst, src, count); 978 inverted_CMYK_to_BGR1_portable(dst, src, count);
834 } 979 }
835 980
981 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
982 RGBA_to_rgbA_portable<2>(dst, src, count);
983 }
984
985 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
986 RGBA_to_bgrA_portable<2>(dst, src, count);
987 }
988
989 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
990 RGBA_to_rgbA_portable<4>(dst, src, count);
991 }
992
993 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
994 RGBA_to_bgrA_portable<4>(dst, src, count);
995 }
996
997 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
998 RGBA_to_rgbA_portable<8>(dst, src, count);
999 }
1000
1001 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
1002 RGBA_to_bgrA_portable<8>(dst, src, count);
1003 }
1004
836 #endif 1005 #endif
837 1006
838 } 1007 }
839 1008
840 #endif // SkSwizzler_opts_DEFINED 1009 #endif // SkSwizzler_opts_DEFINED
OLDNEW
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698