Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(546)

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1666853002: SSE optimizations for GrayAlpha -> RGBA/BGRA Premul/Unpremul (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Response to comments Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « bench/SwizzleBench.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkSwizzler_opts_DEFINED 8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED 9 #define SkSwizzler_opts_DEFINED
10 10
(...skipping 385 matching lines...) Expand 10 before | Expand all | Expand 10 after
396 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { 396 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
397 expand_grayA<false>(dst, src, count); 397 expand_grayA<false>(dst, src, count);
398 } 398 }
399 399
400 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { 400 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
401 expand_grayA<true>(dst, src, count); 401 expand_grayA<true>(dst, src, count);
402 } 402 }
403 403
404 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 404 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
405 405
// Multiplies x by y lane-wise and divides the product by 255 with rounding.
// Both inputs live in 16-bit lanes, but each lane must hold a value that fits
// in 8 bits so that the 16-bit product cannot overflow.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i bias       = _mm_set1_epi16(128);
    const __m128i multiplier = _mm_set1_epi16(257);

    // With p = x*y: (p+127)/255 == ((p+128)*257)>>16 for 0 <= p <= 255*255.
    __m128i product = _mm_mullo_epi16(x, y);
    __m128i biased  = _mm_add_epi16(product, bias);

    // Taking the high 16 bits of the unsigned multiply performs the >>16.
    return _mm_mulhi_epu16(biased, multiplier);
}
415
406 template <bool kSwapRB> 416 template <bool kSwapRB>
407 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { 417 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
408 auto src = (const uint32_t*)vsrc; 418 auto src = (const uint32_t*)vsrc;
409 419
410 auto premul8 = [](__m128i* lo, __m128i* hi) { 420 auto premul8 = [](__m128i* lo, __m128i* hi) {
411 const __m128i zeros = _mm_setzero_si128(); 421 const __m128i zeros = _mm_setzero_si128();
412 const __m128i _128 = _mm_set1_epi16(128);
413 const __m128i _257 = _mm_set1_epi16(257);
414 __m128i planar; 422 __m128i planar;
415 if (kSwapRB) { 423 if (kSwapRB) {
416 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); 424 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
417 } else { 425 } else {
418 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); 426 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
419 } 427 }
420 428
421 // Swizzle the pixels to 8-bit planar. 429 // Swizzle the pixels to 8-bit planar.
422 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bb bbaaaa 430 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bb bbaaaa
423 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BB BBAAAA 431 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BB BBAAAA
424 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR gg ggGGGG 432 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR gg ggGGGG
425 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aa aaAAAA 433 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aa aaAAAA
426 434
427 // Unpack to 16-bit planar. 435 // Unpack to 16-bit planar.
428 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_ R_R_R_ 436 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_ R_R_R_
429 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_ G_G_G_ 437 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_ G_G_G_
430 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_ B_B_B_ 438 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_ B_B_B_
431 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_ A_A_A_ 439 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_ A_A_A_
432 440
433 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. 441 // Premultiply!
434 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); 442 r = scale(r, a);
435 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); 443 g = scale(g, a);
436 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); 444 b = scale(b, a);
437 445
438 // Repack into interlaced pixels. 446 // Repack into interlaced pixels.
439 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RG RGRGRG 447 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RG RGRGRG
440 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BA BABABA 448 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BA BABABA
441 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rg bargba 449 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rg bargba
442 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RG BARGBA 450 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RG BARGBA
443 }; 451 };
444 452
445 while (count >= 8) { 453 while (count >= 8) {
446 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), 454 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after
565 _mm_storeu_si128((__m128i*) (dst + 12), ggga3); 573 _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
566 574
567 src += 16; 575 src += 16;
568 dst += 16; 576 dst += 16;
569 count -= 16; 577 count -= 16;
570 } 578 }
571 579
572 gray_to_RGB1_portable(dst, src, count); 580 gray_to_RGB1_portable(dst, src, count);
573 } 581 }
574 582
575 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { 583 static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
584 const uint8_t* src = (const uint8_t*) vsrc;
585 while (count >= 8) {
586 __m128i ga = _mm_loadu_si128((const __m128i*) src);
587
588 __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
589 _mm_slli_epi16(ga, 8));
590
591 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
592 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
593
594 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
595 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
596
597 src += 8*2;
598 dst += 8;
599 count -= 8;
600 }
601
576 grayA_to_RGBA_portable(dst, src, count); 602 grayA_to_RGBA_portable(dst, src, count);
577 } 603 }
578 604
579 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { 605 static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
606 const uint8_t* src = (const uint8_t*) vsrc;
607 while (count >= 8) {
608 __m128i grayA = _mm_loadu_si128((const __m128i*) src);
609
610 __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
611 __m128i a0 = _mm_srli_epi16(grayA, 8);
612
613 // Premultiply
614 g0 = scale(g0, a0);
615
616 __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
617 __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
618
619
620 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
621 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
622
623 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
624 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
625
626 src += 8*2;
627 dst += 8;
628 count -= 8;
629 }
630
580 grayA_to_rgbA_portable(dst, src, count); 631 grayA_to_rgbA_portable(dst, src, count);
581 } 632 }
582 633
583 #else 634 #else
584 635
585 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { 636 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
586 RGBA_to_rgbA_portable(dst, src, count); 637 RGBA_to_rgbA_portable(dst, src, count);
587 } 638 }
588 639
589 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { 640 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
(...skipping 22 matching lines...) Expand all
612 663
613 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { 664 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
614 grayA_to_rgbA_portable(dst, src, count); 665 grayA_to_rgbA_portable(dst, src, count);
615 } 666 }
616 667
617 #endif 668 #endif
618 669
619 } 670 }
620 671
621 #endif // SkSwizzler_opts_DEFINED 672 #endif // SkSwizzler_opts_DEFINED
OLDNEW
« no previous file with comments | « bench/SwizzleBench.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698