| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
| 9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
| 10 | 10 |
| 11 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
| 12 | 12 |
| 13 namespace SK_OPTS_NS { | 13 namespace SK_OPTS_NS { |
| 14 | 14 |
// Premultiplies sampled 32-bit pixels by their alpha, keeping the channel order.
//
// Reads one pixel out of every kSampleSize from vsrc (src[0], src[kSampleSize], ...),
// scales each of the three low bytes by the top (alpha) byte using the rounded
// form (c*a + 127) / 255, and writes `count` pixels to dst.  Nothing is written
// when count <= 0.
template <int kSampleSize>
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = static_cast<const uint32_t*>(vsrc);
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i * kSampleSize];

        // Alpha lives in bits 24..31; the color bytes are masked out one by one.
        const uint32_t a = px >> 24;
        const uint32_t b = (((px >> 16) & 0xFF) * a + 127) / 255;
        const uint32_t g = (((px >>  8) & 0xFF) * a + 127) / 255;
        const uint32_t r = (((px >>  0) & 0xFF) * a + 127) / 255;

        dst[i] = (a << 24) | (b << 16) | (g << 8) | (r << 0);
    }
}
| 31 | 34 |
// Premultiplies sampled 32-bit pixels by their alpha and swaps the R and B bytes.
//
// Reads one pixel out of every kSampleSize from vsrc (src[0], src[kSampleSize], ...),
// scales each color byte by the top (alpha) byte using (c*a + 127) / 255, and
// writes `count` pixels to dst with the byte from bits 0..7 moved to bits 16..23
// and vice versa.  Nothing is written when count <= 0.
template <int kSampleSize>
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = static_cast<const uint32_t*>(vsrc);
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i * kSampleSize];

        const uint32_t a = px >> 24;
        const uint32_t b = (((px >> 16) & 0xFF) * a + 127) / 255;
        const uint32_t g = (((px >>  8) & 0xFF) * a + 127) / 255;
        const uint32_t r = (((px >>  0) & 0xFF) * a + 127) / 255;

        // Same as the rgbA variant except that r and b trade places.
        dst[i] = (a << 24) | (r << 16) | (g << 8) | (b << 0);
    }
}
| 48 | 54 |
| (...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 218 rgba.val[1] = g; | 224 rgba.val[1] = g; |
| 219 rgba.val[0] = r; | 225 rgba.val[0] = r; |
| 220 } | 226 } |
| 221 vst4_u8((uint8_t*) dst, rgba); | 227 vst4_u8((uint8_t*) dst, rgba); |
| 222 src += 8; | 228 src += 8; |
| 223 dst += 8; | 229 dst += 8; |
| 224 count -= 8; | 230 count -= 8; |
| 225 } | 231 } |
| 226 | 232 |
| 227 // Call portable code to finish up the tail of [0,8) pixels. | 233 // Call portable code to finish up the tail of [0,8) pixels. |
| 228 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; | 234 auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>; |
| 229 proc(dst, src, count); | 235 proc(dst, src, count); |
| 230 } | 236 } |
| 231 | 237 |
| 232 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 238 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
| 233 premul_should_swapRB<false>(dst, src, count); | 239 premul_should_swapRB<false>(dst, src, count); |
| 234 } | 240 } |
| 235 | 241 |
| 236 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 242 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
| 237 premul_should_swapRB<true>(dst, src, count); | 243 premul_should_swapRB<true>(dst, src, count); |
| 238 } | 244 } |
| (...skipping 238 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 477 } | 483 } |
| 478 | 484 |
| 479 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { | 485 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { |
| 480 inverted_cmyk_to<kRGB1>(dst, src, count); | 486 inverted_cmyk_to<kRGB1>(dst, src, count); |
| 481 } | 487 } |
| 482 | 488 |
| 483 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { | 489 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { |
| 484 inverted_cmyk_to<kBGR1>(dst, src, count); | 490 inverted_cmyk_to<kBGR1>(dst, src, count); |
| 485 } | 491 } |
| 486 | 492 |
| 493 static void load_rgba_sample2(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, u
int8x8_t* b, |
| 494 uint8x8_t* a) { |
| 495 // Load 16 pixels. |
| 496 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src); |
| 497 |
| 498 // Choose 8 pixels. |
| 499 // pxpxpxpxpxpxpxpx -> pppppppp |
| 500 *r = vmovn_u16(vreinterpretq_u16_u8(rgba.val[0])); |
| 501 *g = vmovn_u16(vreinterpretq_u16_u8(rgba.val[1])); |
| 502 *b = vmovn_u16(vreinterpretq_u16_u8(rgba.val[2])); |
| 503 *a = vmovn_u16(vreinterpretq_u16_u8(rgba.val[3])); |
| 504 } |
| 505 |
| 506 template <int kSampleSize> |
| 507 static void load_rgba_sample(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, ui
nt8x8_t* b, |
| 508 uint8x8_t* a) { |
| 509 uint8x8_t rgba0 = vld1_u8((const uint8_t*) src); // rgba xxx
x |
| 510 uint8x8_t rgba1 = vld1_u8((const uint8_t*) src + kSampleSize); // rgba xxx
x |
| 511 uint8x8_t rgba01 = vzip_u8(rgba0, rgba1).val[0]; // rrgg bba
a |
| 512 |
| 513 uint8x8_t rgba2 = vld1_u8((const uint8_t*) src + 2*kSampleSize); // rgba xxx
x |
| 514 uint8x8_t rgba3 = vld1_u8((const uint8_t*) src + 3*kSampleSize); // rgba xxx
x |
| 515 uint8x8_t rgba23 = vzip_u8(rgba2, rgba3).val[0]; // rrgg bba
a |
| 516 |
| 517 uint16x4x2_t rgba03 = vzip_u16(vreinterpret_u16_u8(rgba01), // rrrr ggg
g |
| 518 vreinterpret_u16_u8(rgba23)); // bbbb aaa
a |
| 519 |
| 520 uint8x8_t rgba4 = vld1_u8((const uint8_t*) src + 4*kSampleSize); // rgba xxx
x |
| 521 uint8x8_t rgba5 = vld1_u8((const uint8_t*) src + 5*kSampleSize); // rgba xxx
x |
| 522 uint8x8_t rgba45 = vzip_u8(rgba4, rgba5).val[0]; // rrgg bba
a |
| 523 |
| 524 uint8x8_t rgba6 = vld1_u8((const uint8_t*) src + 6*kSampleSize); // rgba xxx
x |
| 525 uint8x8_t rgba7 = vld1_u8((const uint8_t*) src + 7*kSampleSize); // rgba xxx
x |
| 526 uint8x8_t rgba67 = vzip_u8(rgba6, rgba7).val[0]; // rrgg bba
a |
| 527 |
| 528 uint16x4x2_t rgba47 = vzip_u16(vreinterpret_u16_u8(rgba45), // rrrr ggg
g |
| 529 vreinterpret_u16_u8(rgba67)); // bbbb aaa
a |
| 530 |
| 531 uint32x2x2_t rg = vzip_u32(vreinterpret_u32_u16(rgba03.val[0]), // rrrr rrr
r |
| 532 vreinterpret_u32_u16(rgba47.val[0])); // gggg ggg
g |
| 533 uint32x2x2_t ba = vzip_u32(vreinterpret_u32_u16(rgba03.val[1]), // bbbb bbb
b |
| 534 vreinterpret_u32_u16(rgba47.val[1])); // aaaa aaa
a |
| 535 |
| 536 *r = vreinterpret_u8_u32(rg.val[0]); |
| 537 *g = vreinterpret_u8_u32(rg.val[1]); |
| 538 *b = vreinterpret_u8_u32(ba.val[0]); |
| 539 *a = vreinterpret_u8_u32(ba.val[1]); |
| 540 } |
| 541 |
| 542 template <bool kSwapRB, int kSampleSize> |
| 543 static void premul_should_swapRB_sample(uint32_t* dst, const void* vsrc, int cou
nt) { |
| 544 auto src = (const uint32_t*)vsrc; |
| 545 |
| 546 // We must use 9 as the limit to be sure that we don't read past the end of
our memory. |
| 547 while (count >= 9) { |
| 548 // Load pixels. |
| 549 uint8x8_t r, g, b, a; |
| 550 if (2 == kSampleSize) { |
| 551 load_rgba_sample2(src, &r, &g, &b, &a); |
| 552 } else { |
| 553 load_rgba_sample<kSampleSize>(src, &r, &g, &b, &a); |
| 554 } |
| 555 |
| 556 // Premultiply. |
| 557 r = scale(r, a); |
| 558 g = scale(g, a); |
| 559 b = scale(b, a); |
| 560 |
| 561 // Store 8 premultiplied pixels. |
| 562 uint8x8x4_t result; |
| 563 if (kSwapRB) { |
| 564 result.val[0] = b; |
| 565 result.val[1] = g; |
| 566 result.val[2] = r; |
| 567 result.val[3] = a; |
| 568 } else { |
| 569 result.val[0] = r; |
| 570 result.val[1] = g; |
| 571 result.val[2] = b; |
| 572 result.val[3] = a; |
| 573 } |
| 574 vst4_u8((uint8_t*) dst, result); |
| 575 src += 8*kSampleSize; |
| 576 dst += 8; |
| 577 count -= 8; |
| 578 } |
| 579 |
| 580 auto proc = kSwapRB ? RGBA_to_bgrA_portable<kSampleSize> : RGBA_to_rgbA_port
able<kSampleSize>; |
| 581 proc(dst, src, count); |
| 582 } |
| 583 |
| 584 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) { |
| 585 premul_should_swapRB_sample<false, 2>(dst, src, count); |
| 586 } |
| 587 |
| 588 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) { |
| 589 premul_should_swapRB_sample<true, 2>(dst, src, count); |
| 590 } |
| 591 |
| 592 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) { |
| 593 premul_should_swapRB_sample<false, 4>(dst, src, count); |
| 594 } |
| 595 |
| 596 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) { |
| 597 premul_should_swapRB_sample<true, 4>(dst, src, count); |
| 598 } |
| 599 |
| 600 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) { |
| 601 premul_should_swapRB_sample<false, 8>(dst, src, count); |
| 602 } |
| 603 |
| 604 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) { |
| 605 premul_should_swapRB_sample<true, 8>(dst, src, count); |
| 606 } |
| 607 |
| 487 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 608 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 488 | 609 |
| 489 // Scale a byte by another. | 610 // Scale a byte by another. |
| 490 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits. | 611 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits. |
| 491 static __m128i scale(__m128i x, __m128i y) { | 612 static __m128i scale(__m128i x, __m128i y) { |
| 492 const __m128i _128 = _mm_set1_epi16(128); | 613 const __m128i _128 = _mm_set1_epi16(128); |
| 493 const __m128i _257 = _mm_set1_epi16(257); | 614 const __m128i _257 = _mm_set1_epi16(257); |
| 494 | 615 |
| 495 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | 616 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. |
| 496 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); | 617 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); |
| (...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 554 premul8(&lo, &hi); | 675 premul8(&lo, &hi); |
| 555 | 676 |
| 556 _mm_storeu_si128((__m128i*) dst, lo); | 677 _mm_storeu_si128((__m128i*) dst, lo); |
| 557 | 678 |
| 558 src += 4; | 679 src += 4; |
| 559 dst += 4; | 680 dst += 4; |
| 560 count -= 4; | 681 count -= 4; |
| 561 } | 682 } |
| 562 | 683 |
| 563 // Call portable code to finish up the tail of [0,4) pixels. | 684 // Call portable code to finish up the tail of [0,4) pixels. |
| 564 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; | 685 auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>; |
| 565 proc(dst, src, count); | 686 proc(dst, src, count); |
| 566 } | 687 } |
| 567 | 688 |
| 568 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 689 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
| 569 premul_should_swapRB<false>(dst, src, count); | 690 premul_should_swapRB<false>(dst, src, count); |
| 570 } | 691 } |
| 571 | 692 |
| 572 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 693 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
| 573 premul_should_swapRB<true>(dst, src, count); | 694 premul_should_swapRB<true>(dst, src, count); |
| 574 } | 695 } |
| (...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 784 } | 905 } |
| 785 | 906 |
| 786 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { | 907 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { |
| 787 inverted_cmyk_to<kRGB1>(dst, src, count); | 908 inverted_cmyk_to<kRGB1>(dst, src, count); |
| 788 } | 909 } |
| 789 | 910 |
| 790 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { | 911 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { |
| 791 inverted_cmyk_to<kBGR1>(dst, src, count); | 912 inverted_cmyk_to<kBGR1>(dst, src, count); |
| 792 } | 913 } |
| 793 | 914 |
| 915 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) { |
| 916 RGBA_to_rgbA_portable<2>(dst, src, count); |
| 917 } |
| 918 |
| 919 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) { |
| 920 RGBA_to_bgrA_portable<2>(dst, src, count); |
| 921 } |
| 922 |
| 923 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) { |
| 924 RGBA_to_rgbA_portable<4>(dst, src, count); |
| 925 } |
| 926 |
| 927 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) { |
| 928 RGBA_to_bgrA_portable<4>(dst, src, count); |
| 929 } |
| 930 |
| 931 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) { |
| 932 RGBA_to_rgbA_portable<8>(dst, src, count); |
| 933 } |
| 934 |
| 935 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) { |
| 936 RGBA_to_bgrA_portable<8>(dst, src, count); |
| 937 } |
| 938 |
| 794 #else | 939 #else |
| 795 | 940 |
| 796 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 941 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
| 797 RGBA_to_rgbA_portable(dst, src, count); | 942 RGBA_to_rgbA_portable<1>(dst, src, count); |
| 798 } | 943 } |
| 799 | 944 |
| 800 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 945 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
| 801 RGBA_to_bgrA_portable(dst, src, count); | 946 RGBA_to_bgrA_portable<1>(dst, src, count); |
| 802 } | 947 } |
| 803 | 948 |
| 804 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { | 949 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { |
| 805 RGBA_to_BGRA_portable(dst, src, count); | 950 RGBA_to_BGRA_portable(dst, src, count); |
| 806 } | 951 } |
| 807 | 952 |
| 808 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { | 953 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { |
| 809 RGB_to_RGB1_portable(dst, src, count); | 954 RGB_to_RGB1_portable(dst, src, count); |
| 810 } | 955 } |
| 811 | 956 |
| (...skipping 14 matching lines...) Expand all Loading... |
| 826 } | 971 } |
| 827 | 972 |
| 828 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { | 973 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) { |
| 829 inverted_CMYK_to_RGB1_portable(dst, src, count); | 974 inverted_CMYK_to_RGB1_portable(dst, src, count); |
| 830 } | 975 } |
| 831 | 976 |
| 832 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { | 977 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { |
| 833 inverted_CMYK_to_BGR1_portable(dst, src, count); | 978 inverted_CMYK_to_BGR1_portable(dst, src, count); |
| 834 } | 979 } |
| 835 | 980 |
| 981 static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) { |
| 982 RGBA_to_rgbA_portable<2>(dst, src, count); |
| 983 } |
| 984 |
| 985 static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) { |
| 986 RGBA_to_bgrA_portable<2>(dst, src, count); |
| 987 } |
| 988 |
| 989 static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) { |
| 990 RGBA_to_rgbA_portable<4>(dst, src, count); |
| 991 } |
| 992 |
| 993 static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) { |
| 994 RGBA_to_bgrA_portable<4>(dst, src, count); |
| 995 } |
| 996 |
| 997 static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) { |
| 998 RGBA_to_rgbA_portable<8>(dst, src, count); |
| 999 } |
| 1000 |
| 1001 static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) { |
| 1002 RGBA_to_bgrA_portable<8>(dst, src, count); |
| 1003 } |
| 1004 |
| 836 #endif | 1005 #endif |
| 837 | 1006 |
| 838 } | 1007 } |
| 839 | 1008 |
| 840 #endif // SkSwizzler_opts_DEFINED | 1009 #endif // SkSwizzler_opts_DEFINED |
| OLD | NEW |