Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(756)

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1680743005: NEON Optimized RGBA->PMColor sampling in SkSwizzler (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Adding NEON implementations Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« src/codec/SkSwizzler.cpp ('K') | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkSwizzler_opts_DEFINED 8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED 9 #define SkSwizzler_opts_DEFINED
10 10
11 #include "SkColorPriv.h" 11 #include "SkColorPriv.h"
12 12
13 namespace SK_OPTS_NS { 13 namespace SK_OPTS_NS {
14 14
15 template <int kSampleSize>
15 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) { 16 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
16 auto src = (const uint32_t*)vsrc; 17 auto src = (const uint32_t*)vsrc;
18 int j = 0;
17 for (int i = 0; i < count; i++) { 19 for (int i = 0; i < count; i++) {
18 uint8_t a = src[i] >> 24, 20 uint8_t a = src[j] >> 24,
19 b = src[i] >> 16, 21 b = src[j] >> 16,
20 g = src[i] >> 8, 22 g = src[j] >> 8,
21 r = src[i] >> 0; 23 r = src[j] >> 0;
24 j += kSampleSize;
22 b = (b*a+127)/255; 25 b = (b*a+127)/255;
23 g = (g*a+127)/255; 26 g = (g*a+127)/255;
24 r = (r*a+127)/255; 27 r = (r*a+127)/255;
25 dst[i] = (uint32_t)a << 24 28 dst[i] = (uint32_t)a << 24
26 | (uint32_t)b << 16 29 | (uint32_t)b << 16
27 | (uint32_t)g << 8 30 | (uint32_t)g << 8
28 | (uint32_t)r << 0; 31 | (uint32_t)r << 0;
29 } 32 }
30 } 33 }
31 34
35 template <int kSampleSize>
32 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) { 36 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
33 auto src = (const uint32_t*)vsrc; 37 auto src = (const uint32_t*)vsrc;
38 int j = 0;
34 for (int i = 0; i < count; i++) { 39 for (int i = 0; i < count; i++) {
35 uint8_t a = src[i] >> 24, 40 uint8_t a = src[j] >> 24,
36 b = src[i] >> 16, 41 b = src[j] >> 16,
37 g = src[i] >> 8, 42 g = src[j] >> 8,
38 r = src[i] >> 0; 43 r = src[j] >> 0;
44 j += kSampleSize;
39 b = (b*a+127)/255; 45 b = (b*a+127)/255;
40 g = (g*a+127)/255; 46 g = (g*a+127)/255;
41 r = (r*a+127)/255; 47 r = (r*a+127)/255;
42 dst[i] = (uint32_t)a << 24 48 dst[i] = (uint32_t)a << 24
43 | (uint32_t)r << 16 49 | (uint32_t)r << 16
44 | (uint32_t)g << 8 50 | (uint32_t)g << 8
45 | (uint32_t)b << 0; 51 | (uint32_t)b << 0;
46 } 52 }
47 } 53 }
48 54
(...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after
218 rgba.val[1] = g; 224 rgba.val[1] = g;
219 rgba.val[0] = r; 225 rgba.val[0] = r;
220 } 226 }
221 vst4_u8((uint8_t*) dst, rgba); 227 vst4_u8((uint8_t*) dst, rgba);
222 src += 8; 228 src += 8;
223 dst += 8; 229 dst += 8;
224 count -= 8; 230 count -= 8;
225 } 231 }
226 232
227 // Call portable code to finish up the tail of [0,8) pixels. 233 // Call portable code to finish up the tail of [0,8) pixels.
228 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 234 auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>;
229 proc(dst, src, count); 235 proc(dst, src, count);
230 } 236 }
231 237
232 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { 238 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
233 premul_should_swapRB<false>(dst, src, count); 239 premul_should_swapRB<false>(dst, src, count);
234 } 240 }
235 241
236 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { 242 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
237 premul_should_swapRB<true>(dst, src, count); 243 premul_should_swapRB<true>(dst, src, count);
238 } 244 }
(...skipping 238 matching lines...) Expand 10 before | Expand all | Expand 10 after
477 } 483 }
478 484
479 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { 485 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
480 inverted_cmyk_to<kRGB1>(dst, src, count); 486 inverted_cmyk_to<kRGB1>(dst, src, count);
481 } 487 }
482 488
483 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { 489 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
484 inverted_cmyk_to<kBGR1>(dst, src, count); 490 inverted_cmyk_to<kBGR1>(dst, src, count);
485 } 491 }
486 492
493 static void load_rgba_sample2(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b,
494 uint8x8_t* a) {
495 // Load 16 pixels.
496 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
497
498 // Choose 8 pixels.
499 // pxpxpxpxpxpxpxpx -> pppppppp
500 *r = vmovn_u16(vreinterpretq_u16_u8(rgba.val[0]));
501 *g = vmovn_u16(vreinterpretq_u16_u8(rgba.val[1]));
502 *b = vmovn_u16(vreinterpretq_u16_u8(rgba.val[2]));
503 *a = vmovn_u16(vreinterpretq_u16_u8(rgba.val[3]));
504 }
505
506 static void load_rgba_sample4(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b,
507 uint8x8_t* a) {
508 // Load 32 pixels.
509 uint8x16x4_t rgba0 = vld4q_u8((const uint8_t*) src);
510 uint8x16x4_t rgba1 = vld4q_u8((const uint8_t*) src + 16);
511
512 // Choose 8 pixels.
513 // pxxxpxxxpxxxpxxx -> pxpxpxpx
514 uint16x4_t r0 = vmovn_u32(vreinterpretq_u32_u8(rgba0.val[0])),
515 g0 = vmovn_u32(vreinterpretq_u32_u8(rgba0.val[1])),
516 b0 = vmovn_u32(vreinterpretq_u32_u8(rgba0.val[2])),
517 a0 = vmovn_u32(vreinterpretq_u32_u8(rgba0.val[3]));
518 uint16x4_t r1 = vmovn_u32(vreinterpretq_u32_u8(rgba1.val[0])),
519 g1 = vmovn_u32(vreinterpretq_u32_u8(rgba1.val[1])),
520 b1 = vmovn_u32(vreinterpretq_u32_u8(rgba1.val[2])),
521 a1 = vmovn_u32(vreinterpretq_u32_u8(rgba1.val[3]));
522
523 // pxpxpxpx, pxpxpxpx -> pppppppp
524 *r = vmovn_u16(vcombine_u16(r0, r1));
525 *g = vmovn_u16(vcombine_u16(g0, g1));
526 *b = vmovn_u16(vcombine_u16(b0, b1));
527 *a = vmovn_u16(vcombine_u16(a0, a1));
528 }
529
530 static void load_rgba_sample8(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b,
531 uint8x8_t* a) {
532 uint8x8_t rgba0 = vld1_u8((const uint8_t*) src); // rgba xxxx
533 uint8x8_t rgba1 = vld1_u8((const uint8_t*) src + 8); // rgba xxxx
534 uint8x8_t rgba01 = vzip_u8(rgba0, rgba1).val[0]; // rrgg bbaa
535
536 uint8x8_t rgba2 = vld1_u8((const uint8_t*) src + 16); // rgba xxxx
537 uint8x8_t rgba3 = vld1_u8((const uint8_t*) src + 24); // rgba xxxx
538 uint8x8_t rgba23 = vzip_u8(rgba2, rgba3).val[0]; // rrgg bbaa
539
540 // rrrr gggg, bbbb aaaa
541 uint16x4x2_t rgba03 = vzip_u16(vreinterpret_u16_u8(rgba01), vreinterpret_u16_u8(rgba23));
542
543 uint8x8_t rgba4 = vld1_u8((const uint8_t*) src + 32); // rgba xxxx
544 uint8x8_t rgba5 = vld1_u8((const uint8_t*) src + 40); // rgba xxxx
545 uint8x8_t rgba45 = vzip_u8(rgba4, rgba5).val[0]; // rrgg bbaa
546
547 uint8x8_t rgba6 = vld1_u8((const uint8_t*) src + 48); // rgba xxxx
548 uint8x8_t rgba7 = vld1_u8((const uint8_t*) src + 56); // rgba xxxx
549 uint8x8_t rgba67 = vzip_u8(rgba6, rgba7).val[0]; // rrgg bbaa
550
551 // rrrr gggg, bbbb aaaa
552 uint16x4x2_t rgba47 = vzip_u16(vreinterpret_u16_u8(rgba45), vreinterpret_u16_u8(rgba67));
553
554 // rrrr rrrr, gggg gggg
555 uint32x2x2_t rg = vzip_u32(vreinterpret_u32_u16(rgba03.val[0]),
556 vreinterpret_u32_u16(rgba47.val[0]));
557 // bbbb bbbb, aaaa aaaa
558 uint32x2x2_t ba = vzip_u32(vreinterpret_u32_u16(rgba03.val[1]),
559 vreinterpret_u32_u16(rgba47.val[1]));
560
561 *r = vreinterpret_u8_u32(rg.val[0]);
562 *g = vreinterpret_u8_u32(rg.val[1]);
563 *b = vreinterpret_u8_u32(ba.val[0]);
564 *a = vreinterpret_u8_u32(ba.val[1]);
565 }
566
567 template <bool kSwapRB, int kSampleSize>
568 static void premul_should_swapRB_sample(uint32_t* dst, const void* vsrc, int count) {
569 auto src = (const uint32_t*)vsrc;
570
571 // We must use 9 as the limit to be sure that we don't read past the end of our memory.
572 while (count >= 9) {
573 // Load pixels.
574 uint8x8_t r, g, b, a;
575 if (2 == kSampleSize) {
576 load_rgba_sample2(src, &r, &g, &b, &a);
577 } else if (4 == kSampleSize) {
578 load_rgba_sample4(src, &r, &g, &b, &a);
579 } else if (8 == kSampleSize) {
580 load_rgba_sample8(src, &r, &g, &b, &a);
581 }
582
583 // Premultiply.
584 r = scale(r, a);
585 g = scale(g, a);
586 b = scale(b, a);
587
588 // Store 8 premultiplied pixels.
589 uint8x8x4_t result;
590 if (kSwapRB) {
591 result.val[0] = b;
592 result.val[1] = g;
593 result.val[2] = r;
594 result.val[3] = a;
595 } else {
596 result.val[0] = r;
597 result.val[1] = g;
598 result.val[2] = b;
599 result.val[3] = a;
600 }
601 vst4_u8((uint8_t*) dst, result);
602 src += 8*kSampleSize;
603 dst += 8;
604 count -= 8;
605 }
606
607 auto proc = kSwapRB ? RGBA_to_bgrA_portable<kSampleSize> : RGBA_to_rgbA_portable<kSampleSize>;
608 proc(dst, src, count);
609 }
610
611 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
612 premul_should_swapRB_sample<false, 2>(dst, src, count);
613 }
614
615 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
616 premul_should_swapRB_sample<true, 2>(dst, src, count);
617 }
618
619 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
620 premul_should_swapRB_sample<false, 4>(dst, src, count);
621 }
622
623 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
624 premul_should_swapRB_sample<true, 4>(dst, src, count);
625 }
626
627 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
628 premul_should_swapRB_sample<false, 8>(dst, src, count);
629 }
630
631 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
632 premul_should_swapRB_sample<true, 8>(dst, src, count);
633 }
634
487 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 635 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
488 636
489 // Scale a byte by another. 637 // Scale a byte by another.
490 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 638 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
491 static __m128i scale(__m128i x, __m128i y) { 639 static __m128i scale(__m128i x, __m128i y) {
492 const __m128i _128 = _mm_set1_epi16(128); 640 const __m128i _128 = _mm_set1_epi16(128);
493 const __m128i _257 = _mm_set1_epi16(257); 641 const __m128i _257 = _mm_set1_epi16(257);
494 642
495 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. 643 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
496 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); 644 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
554 premul8(&lo, &hi); 702 premul8(&lo, &hi);
555 703
556 _mm_storeu_si128((__m128i*) dst, lo); 704 _mm_storeu_si128((__m128i*) dst, lo);
557 705
558 src += 4; 706 src += 4;
559 dst += 4; 707 dst += 4;
560 count -= 4; 708 count -= 4;
561 } 709 }
562 710
563 // Call portable code to finish up the tail of [0,4) pixels. 711 // Call portable code to finish up the tail of [0,4) pixels.
564 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 712 auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>;
565 proc(dst, src, count); 713 proc(dst, src, count);
566 } 714 }
567 715
568 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { 716 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
569 premul_should_swapRB<false>(dst, src, count); 717 premul_should_swapRB<false>(dst, src, count);
570 } 718 }
571 719
572 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { 720 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
573 premul_should_swapRB<true>(dst, src, count); 721 premul_should_swapRB<true>(dst, src, count);
574 } 722 }
(...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after
784 } 932 }
785 933
786 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { 934 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
787 inverted_cmyk_to<kRGB1>(dst, src, count); 935 inverted_cmyk_to<kRGB1>(dst, src, count);
788 } 936 }
789 937
790 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { 938 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
791 inverted_cmyk_to<kBGR1>(dst, src, count); 939 inverted_cmyk_to<kBGR1>(dst, src, count);
792 } 940 }
793 941
942 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
943 RGBA_to_rgbA_portable<2>(dst, src, count);
944 }
945
946 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
947 RGBA_to_bgrA_portable<2>(dst, src, count);
948 }
949
950 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
951 RGBA_to_rgbA_portable<4>(dst, src, count);
952 }
953
954 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
955 RGBA_to_bgrA_portable<4>(dst, src, count);
956 }
957
958 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
959 RGBA_to_rgbA_portable<8>(dst, src, count);
960 }
961
962 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
963 RGBA_to_bgrA_portable<8>(dst, src, count);
964 }
965
794 #else 966 #else
795 967
796 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { 968 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
797 RGBA_to_rgbA_portable(dst, src, count); 969 RGBA_to_rgbA_portable<1>(dst, src, count);
798 } 970 }
799 971
800 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { 972 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
801 RGBA_to_bgrA_portable(dst, src, count); 973 RGBA_to_bgrA_portable<1>(dst, src, count);
802 } 974 }
803 975
804 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { 976 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
805 RGBA_to_BGRA_portable(dst, src, count); 977 RGBA_to_BGRA_portable(dst, src, count);
806 } 978 }
807 979
808 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { 980 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
809 RGB_to_RGB1_portable(dst, src, count); 981 RGB_to_RGB1_portable(dst, src, count);
810 } 982 }
811 983
(...skipping 14 matching lines...) Expand all
826 } 998 }
827 999
828 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { 1000 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
829 inverted_CMYK_to_RGB1_portable(dst, src, count); 1001 inverted_CMYK_to_RGB1_portable(dst, src, count);
830 } 1002 }
831 1003
832 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { 1004 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
833 inverted_CMYK_to_BGR1_portable(dst, src, count); 1005 inverted_CMYK_to_BGR1_portable(dst, src, count);
834 } 1006 }
835 1007
1008 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
1009 RGBA_to_rgbA_portable<2>(dst, src, count);
1010 }
1011
1012 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
1013 RGBA_to_bgrA_portable<2>(dst, src, count);
1014 }
1015
1016 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
1017 RGBA_to_rgbA_portable<4>(dst, src, count);
1018 }
1019
1020 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
1021 RGBA_to_bgrA_portable<4>(dst, src, count);
1022 }
1023
1024 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
1025 RGBA_to_rgbA_portable<8>(dst, src, count);
1026 }
1027
1028 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
1029 RGBA_to_bgrA_portable<8>(dst, src, count);
1030 }
1031
836 #endif 1032 #endif
837 1033
838 } 1034 }
839 1035
840 #endif // SkSwizzler_opts_DEFINED 1036 #endif // SkSwizzler_opts_DEFINED
OLDNEW
« src/codec/SkSwizzler.cpp ('K') | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698