OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
10 | 10 |
11 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
12 | 12 |
13 namespace SK_OPTS_NS { | 13 namespace SK_OPTS_NS { |
14 | 14 |
| 15 template <int kSampleSize> |
15 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) { | 16 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) { |
16 auto src = (const uint32_t*)vsrc; | 17 auto src = (const uint32_t*)vsrc; |
| 18 int j = 0; |
17 for (int i = 0; i < count; i++) { | 19 for (int i = 0; i < count; i++) { |
18 uint8_t a = src[i] >> 24, | 20 uint8_t a = src[j] >> 24, |
19 b = src[i] >> 16, | 21 b = src[j] >> 16, |
20 g = src[i] >> 8, | 22 g = src[j] >> 8, |
21 r = src[i] >> 0; | 23 r = src[j] >> 0; |
| 24 j += kSampleSize; |
22 b = (b*a+127)/255; | 25 b = (b*a+127)/255; |
23 g = (g*a+127)/255; | 26 g = (g*a+127)/255; |
24 r = (r*a+127)/255; | 27 r = (r*a+127)/255; |
25 dst[i] = (uint32_t)a << 24 | 28 dst[i] = (uint32_t)a << 24 |
26 | (uint32_t)b << 16 | 29 | (uint32_t)b << 16 |
27 | (uint32_t)g << 8 | 30 | (uint32_t)g << 8 |
28 | (uint32_t)r << 0; | 31 | (uint32_t)r << 0; |
29 } | 32 } |
30 } | 33 } |
31 | 34 |
| 35 template <int kSampleSize> |
32 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) { | 36 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) { |
33 auto src = (const uint32_t*)vsrc; | 37 auto src = (const uint32_t*)vsrc; |
| 38 int j = 0; |
34 for (int i = 0; i < count; i++) { | 39 for (int i = 0; i < count; i++) { |
35 uint8_t a = src[i] >> 24, | 40 uint8_t a = src[j] >> 24, |
36 b = src[i] >> 16, | 41 b = src[j] >> 16, |
37 g = src[i] >> 8, | 42 g = src[j] >> 8, |
38 r = src[i] >> 0; | 43 r = src[j] >> 0; |
| 44 j += kSampleSize; |
39 b = (b*a+127)/255; | 45 b = (b*a+127)/255; |
40 g = (g*a+127)/255; | 46 g = (g*a+127)/255; |
41 r = (r*a+127)/255; | 47 r = (r*a+127)/255; |
42 dst[i] = (uint32_t)a << 24 | 48 dst[i] = (uint32_t)a << 24 |
43 | (uint32_t)r << 16 | 49 | (uint32_t)r << 16 |
44 | (uint32_t)g << 8 | 50 | (uint32_t)g << 8 |
45 | (uint32_t)b << 0; | 51 | (uint32_t)b << 0; |
46 } | 52 } |
47 } | 53 } |
48 | 54 |
(...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
218 rgba.val[1] = g; | 224 rgba.val[1] = g; |
219 rgba.val[0] = r; | 225 rgba.val[0] = r; |
220 } | 226 } |
221 vst4_u8((uint8_t*) dst, rgba); | 227 vst4_u8((uint8_t*) dst, rgba); |
222 src += 8; | 228 src += 8; |
223 dst += 8; | 229 dst += 8; |
224 count -= 8; | 230 count -= 8; |
225 } | 231 } |
226 | 232 |
227 // Call portable code to finish up the tail of [0,8) pixels. | 233 // Call portable code to finish up the tail of [0,8) pixels. |
228 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; | 234 auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>; |
229 proc(dst, src, count); | 235 proc(dst, src, count); |
230 } | 236 } |
231 | 237 |
232 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 238 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
233 premul_should_swapRB<false>(dst, src, count); | 239 premul_should_swapRB<false>(dst, src, count); |
234 } | 240 } |
235 | 241 |
236 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 242 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
237 premul_should_swapRB<true>(dst, src, count); | 243 premul_should_swapRB<true>(dst, src, count); |
238 } | 244 } |
(...skipping 238 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
477 } | 483 } |
478 | 484 |
479 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { | 485 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { |
480 inverted_cmyk_to<kRGB1>(dst, src, count); | 486 inverted_cmyk_to<kRGB1>(dst, src, count); |
481 } | 487 } |
482 | 488 |
483 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { | 489 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { |
484 inverted_cmyk_to<kBGR1>(dst, src, count); | 490 inverted_cmyk_to<kBGR1>(dst, src, count); |
485 } | 491 } |
486 | 492 |
| 493 static void load_rgba_sample2(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b, |
| 494 uint8x8_t* a) { |
| 495 // Load 16 pixels. |
| 496 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src); |
| 497 |
| 498 // Choose 8 pixels. |
| 499 // pxpxpxpxpxpxpxpx -> pppppppp |
| 500 *r = vmovn_u16(vreinterpretq_u16_u8(rgba.val[0])); |
| 501 *g = vmovn_u16(vreinterpretq_u16_u8(rgba.val[1])); |
| 502 *b = vmovn_u16(vreinterpretq_u16_u8(rgba.val[2])); |
| 503 *a = vmovn_u16(vreinterpretq_u16_u8(rgba.val[3])); |
| 504 } |
| 505 |
| 506 template <int kSampleSize> // Distance, in pixels, between consecutive sampled pixels. |
| 507 static void load_rgba_sample(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b, |
| 508 uint8x8_t* a) { // Gathers src[0], src[kSampleSize], ..., src[7*kSampleSize] into *r, *g, *b, *a (one lane per pixel). |
| 509 uint8x8_t rgba0 = vld1_u8((const uint8_t*) src); // rgba xxxx (each load reads 8 bytes; only the first pixel is kept) |
| 510 uint8x8_t rgba1 = vld1_u8((const uint8_t*) (src + kSampleSize)); // rgba xxxx (offset is in pixels, so add before the byte cast) |
| 511 uint8x8_t rgba01 = vzip_u8(rgba0, rgba1).val[0]; // rrgg bbaa |
| 512 |
| 513 uint8x8_t rgba2 = vld1_u8((const uint8_t*) (src + 2*kSampleSize)); // rgba xxxx |
| 514 uint8x8_t rgba3 = vld1_u8((const uint8_t*) (src + 3*kSampleSize)); // rgba xxxx |
| 515 uint8x8_t rgba23 = vzip_u8(rgba2, rgba3).val[0]; // rrgg bbaa |
| 516 |
| 517 uint16x4x2_t rgba03 = vzip_u16(vreinterpret_u16_u8(rgba01), // rrrr gggg |
| 518 vreinterpret_u16_u8(rgba23)); // bbbb aaaa |
| 519 |
| 520 uint8x8_t rgba4 = vld1_u8((const uint8_t*) (src + 4*kSampleSize)); // rgba xxxx |
| 521 uint8x8_t rgba5 = vld1_u8((const uint8_t*) (src + 5*kSampleSize)); // rgba xxxx |
| 522 uint8x8_t rgba45 = vzip_u8(rgba4, rgba5).val[0]; // rrgg bbaa |
| 523 |
| 524 uint8x8_t rgba6 = vld1_u8((const uint8_t*) (src + 6*kSampleSize)); // rgba xxxx |
| 525 uint8x8_t rgba7 = vld1_u8((const uint8_t*) (src + 7*kSampleSize)); // rgba xxxx |
| 526 uint8x8_t rgba67 = vzip_u8(rgba6, rgba7).val[0]; // rrgg bbaa |
| 527 |
| 528 uint16x4x2_t rgba47 = vzip_u16(vreinterpret_u16_u8(rgba45), // rrrr gggg |
| 529 vreinterpret_u16_u8(rgba67)); // bbbb aaaa |
| 530 |
| 531 uint32x2x2_t rg = vzip_u32(vreinterpret_u32_u16(rgba03.val[0]), // rrrr rrrr |
| 532 vreinterpret_u32_u16(rgba47.val[0])); // gggg gggg |
| 533 uint32x2x2_t ba = vzip_u32(vreinterpret_u32_u16(rgba03.val[1]), // bbbb bbbb |
| 534 vreinterpret_u32_u16(rgba47.val[1])); // aaaa aaaa |
| 535 |
| 536 *r = vreinterpret_u8_u32(rg.val[0]); |
| 537 *g = vreinterpret_u8_u32(rg.val[1]); |
| 538 *b = vreinterpret_u8_u32(ba.val[0]); |
| 539 *a = vreinterpret_u8_u32(ba.val[1]); |
| 540 } |
| 541 |
| 542 template <bool kSwapRB, int kSampleSize> |
| 543 static void premul_should_swapRB_sample(uint32_t* dst, const void* vsrc, int count) { |
| 544 auto src = (const uint32_t*)vsrc; |
| 545 |
| 546 // We must use 9 as the limit to be sure that we don't read past the end of our memory. |
| 547 while (count >= 9) { |
| 548 // Load pixels. |
| 549 uint8x8_t r, g, b, a; |
| 550 if (2 == kSampleSize) { |
| 551 load_rgba_sample2(src, &r, &g, &b, &a); |
| 552 } else { |
| 553 load_rgba_sample<kSampleSize>(src, &r, &g, &b, &a); |
| 554 } |
| 555 |
| 556 // Premultiply. |
| 557 r = scale(r, a); |
| 558 g = scale(g, a); |
| 559 b = scale(b, a); |
| 560 |
| 561 // Store 8 premultiplied pixels. |
| 562 uint8x8x4_t result; |
| 563 if (kSwapRB) { |
| 564 result.val[0] = b; |
| 565 result.val[1] = g; |
| 566 result.val[2] = r; |
| 567 result.val[3] = a; |
| 568 } else { |
| 569 result.val[0] = r; |
| 570 result.val[1] = g; |
| 571 result.val[2] = b; |
| 572 result.val[3] = a; |
| 573 } |
| 574 vst4_u8((uint8_t*) dst, result); |
| 575 src += 8*kSampleSize; |
| 576 dst += 8; |
| 577 count -= 8; |
| 578 } |
| 579 |
| 580 auto proc = kSwapRB ? RGBA_to_bgrA_portable<kSampleSize> : RGBA_to_rgbA_portable<kSampleSize>; |
| 581 proc(dst, src, count); |
| 582 } |
| 583 |
| 584 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) { |
| 585 premul_should_swapRB_sample<false, 2>(dst, src, count); |
| 586 } |
| 587 |
| 588 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) { |
| 589 premul_should_swapRB_sample<true, 2>(dst, src, count); |
| 590 } |
| 591 |
| 592 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) { |
| 593 premul_should_swapRB_sample<false, 4>(dst, src, count); |
| 594 } |
| 595 |
| 596 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) { |
| 597 premul_should_swapRB_sample<true, 4>(dst, src, count); |
| 598 } |
| 599 |
| 600 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) { |
| 601 premul_should_swapRB_sample<false, 8>(dst, src, count); |
| 602 } |
| 603 |
| 604 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) { |
| 605 premul_should_swapRB_sample<true, 8>(dst, src, count); |
| 606 } |
| 607 |
487 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 608 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
488 | 609 |
489 // Scale a byte by another. | 610 // Scale a byte by another. |
490 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits. | 611 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits. |
491 static __m128i scale(__m128i x, __m128i y) { | 612 static __m128i scale(__m128i x, __m128i y) { |
492 const __m128i _128 = _mm_set1_epi16(128); | 613 const __m128i _128 = _mm_set1_epi16(128); |
493 const __m128i _257 = _mm_set1_epi16(257); | 614 const __m128i _257 = _mm_set1_epi16(257); |
494 | 615 |
495 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | 616 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. |
496 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); | 617 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
554 premul8(&lo, &hi); | 675 premul8(&lo, &hi); |
555 | 676 |
556 _mm_storeu_si128((__m128i*) dst, lo); | 677 _mm_storeu_si128((__m128i*) dst, lo); |
557 | 678 |
558 src += 4; | 679 src += 4; |
559 dst += 4; | 680 dst += 4; |
560 count -= 4; | 681 count -= 4; |
561 } | 682 } |
562 | 683 |
563 // Call portable code to finish up the tail of [0,4) pixels. | 684 // Call portable code to finish up the tail of [0,4) pixels. |
564 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; | 685 auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>; |
565 proc(dst, src, count); | 686 proc(dst, src, count); |
566 } | 687 } |
567 | 688 |
568 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 689 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
569 premul_should_swapRB<false>(dst, src, count); | 690 premul_should_swapRB<false>(dst, src, count); |
570 } | 691 } |
571 | 692 |
572 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 693 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
573 premul_should_swapRB<true>(dst, src, count); | 694 premul_should_swapRB<true>(dst, src, count); |
574 } | 695 } |
(...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
784 } | 905 } |
785 | 906 |
786 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { | 907 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { |
787 inverted_cmyk_to<kRGB1>(dst, src, count); | 908 inverted_cmyk_to<kRGB1>(dst, src, count); |
788 } | 909 } |
789 | 910 |
790 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { | 911 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { |
791 inverted_cmyk_to<kBGR1>(dst, src, count); | 912 inverted_cmyk_to<kBGR1>(dst, src, count); |
792 } | 913 } |
793 | 914 |
| 915 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) { |
| 916 RGBA_to_rgbA_portable<2>(dst, src, count); |
| 917 } |
| 918 |
| 919 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) { |
| 920 RGBA_to_bgrA_portable<2>(dst, src, count); |
| 921 } |
| 922 |
| 923 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) { |
| 924 RGBA_to_rgbA_portable<4>(dst, src, count); |
| 925 } |
| 926 |
| 927 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) { |
| 928 RGBA_to_bgrA_portable<4>(dst, src, count); |
| 929 } |
| 930 |
| 931 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) { |
| 932 RGBA_to_rgbA_portable<8>(dst, src, count); |
| 933 } |
| 934 |
| 935 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) { |
| 936 RGBA_to_bgrA_portable<8>(dst, src, count); |
| 937 } |
| 938 |
794 #else | 939 #else |
795 | 940 |
796 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 941 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
797 RGBA_to_rgbA_portable(dst, src, count); | 942 RGBA_to_rgbA_portable<1>(dst, src, count); |
798 } | 943 } |
799 | 944 |
800 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 945 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
801 RGBA_to_bgrA_portable(dst, src, count); | 946 RGBA_to_bgrA_portable<1>(dst, src, count); |
802 } | 947 } |
803 | 948 |
804 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { | 949 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { |
805 RGBA_to_BGRA_portable(dst, src, count); | 950 RGBA_to_BGRA_portable(dst, src, count); |
806 } | 951 } |
807 | 952 |
808 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { | 953 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { |
809 RGB_to_RGB1_portable(dst, src, count); | 954 RGB_to_RGB1_portable(dst, src, count); |
810 } | 955 } |
811 | 956 |
(...skipping 14 matching lines...) Expand all Loading... |
826 } | 971 } |
827 | 972 |
828 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { | 973 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { |
829 inverted_CMYK_to_RGB1_portable(dst, src, count); | 974 inverted_CMYK_to_RGB1_portable(dst, src, count); |
830 } | 975 } |
831 | 976 |
832 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { | 977 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { |
833 inverted_CMYK_to_BGR1_portable(dst, src, count); | 978 inverted_CMYK_to_BGR1_portable(dst, src, count); |
834 } | 979 } |
835 | 980 |
| 981 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) { |
| 982 RGBA_to_rgbA_portable<2>(dst, src, count); |
| 983 } |
| 984 |
| 985 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) { |
| 986 RGBA_to_bgrA_portable<2>(dst, src, count); |
| 987 } |
| 988 |
| 989 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) { |
| 990 RGBA_to_rgbA_portable<4>(dst, src, count); |
| 991 } |
| 992 |
| 993 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) { |
| 994 RGBA_to_bgrA_portable<4>(dst, src, count); |
| 995 } |
| 996 |
| 997 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) { |
| 998 RGBA_to_rgbA_portable<8>(dst, src, count); |
| 999 } |
| 1000 |
| 1001 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) { |
| 1002 RGBA_to_bgrA_portable<8>(dst, src, count); |
| 1003 } |
| 1004 |
836 #endif | 1005 #endif |
837 | 1006 |
838 } | 1007 } |
839 | 1008 |
840 #endif // SkSwizzler_opts_DEFINED | 1009 #endif // SkSwizzler_opts_DEFINED |
OLD | NEW |