OLD | NEW |
1 #include "SkBlitRow_opts_SSE4.h" | 1 #include "SkBlitRow_opts_SSE4.h" |
2 | 2 |
3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub met
hods. | 3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub met
hods. |
4 // The stubs should never be called, so we make them crash just to confirm that. | 4 // The stubs should never be called, so we make them crash just to confirm that. |
5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 | 5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 |
6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST
RICT, int, U8CPU) { | 6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST
RICT, int, U8CPU) { |
7 sk_throw(); | 7 sk_throw(); |
8 } | 8 } |
9 | 9 |
| 10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y)
{ |
| 11 sk_throw(); |
| 12 } |
| 13 |
10 #else | 14 #else |
11 | 15 |
12 #include <emmintrin.h> // SSE2: Most _mm_foo() in this file. | 16 #include <smmintrin.h> // SSE4.1 intrinsics |
13 #include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128. | |
14 | 17 |
15 #include "SkColorPriv.h" | 18 #include "SkColorPriv.h" |
16 #include "SkColor_opts_SSE2.h" | 19 #include "SkColor_opts_SSE2.h" |
17 | 20 |
18 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, | 21 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, |
19 const SkPMColor* SK_RESTRICT src, | 22 const SkPMColor* SK_RESTRICT src, |
20 int count, | 23 int count, |
21 U8CPU alpha) { | 24 U8CPU alpha) { |
22 SkASSERT(alpha == 255); | 25 SkASSERT(alpha == 255); |
23 // As long as we can, we'll work on 16 pixel pairs at once. | 26 // As long as we can, we'll work on 16 pixel pairs at once. |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
56 | 59 |
57 // Wrap up the last <= 15 pixels. | 60 // Wrap up the last <= 15 pixels. |
58 for (int i = count16*16; i < count; i++) { | 61 for (int i = count16*16; i < count; i++) { |
59 // This check is not really necessary, but it prevents pointless autov
ectorization. | 62 // This check is not really necessary, but it prevents pointless autov
ectorization. |
60 if (src[i] & 0xFF000000) { | 63 if (src[i] & 0xFF000000) { |
61 dst[i] = SkPMSrcOver(src[i], dst[i]); | 64 dst[i] = SkPMSrcOver(src[i], dst[i]); |
62 } | 65 } |
63 } | 66 } |
64 } | 67 } |
65 | 68 |
| 69 static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t s
rc_expand) { |
| 70 uint32_t dst_expand = SkExpand_rgb_16(dst) * scale; |
| 71 return SkCompact_rgb_16((src_expand + dst_expand) >> 5); |
| 72 } |
| 73 |
| 74 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y)
{ |
| 75 SkASSERT(count > 0); |
| 76 |
| 77 uint32_t src_expand = (SkGetPackedG32(src) << 24) | |
| 78 (SkGetPackedR32(src) << 13) | |
| 79 (SkGetPackedB32(src) << 2); |
| 80 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; |
| 81 |
| 82 // Check if we have enough pixels to run SIMD |
| 83 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { |
| 84 __m128i* dst_wide; |
| 85 const __m128i src_expand_wide = _mm_set1_epi32(src_expand); |
| 86 const __m128i scale_wide = _mm_set1_epi32(scale); |
| 87 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | |
| 88 SK_B16_MASK_IN_PLACE | |
| 89 (SK_G16_MASK_IN_PLACE << 16)); |
| 90 |
| 91 // Align dst to an even 16 byte address (0-7 pixels) |
| 92 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { |
| 93 *dst = Color32A_D565_1x(*dst, scale, src_expand); |
| 94 dst += 1; |
| 95 count--; |
| 96 } |
| 97 |
| 98 dst_wide = reinterpret_cast<__m128i*>(dst); |
| 99 do { |
| 100 // Load 8 RGB565 pixels |
| 101 __m128i pixels = _mm_load_si128(dst_wide); |
| 102 |
| 103 // Duplicate and mask |
| 104 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); |
| 105 pixels_high = _mm_and_si128(mask_green, pixels_high); |
| 106 pixels = _mm_unpacklo_epi16(pixels, pixels); |
| 107 pixels = _mm_and_si128(mask_green, pixels); |
| 108 |
| 109 // Scale with alpha |
| 110 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); |
| 111 pixels = _mm_mullo_epi32(pixels, scale_wide); |
| 112 |
| 113 // Add src_expand_wide and shift down again |
| 114 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); |
| 115 pixels_high = _mm_srli_epi32(pixels_high, 5); |
| 116 pixels = _mm_add_epi32(pixels, src_expand_wide); |
| 117 pixels = _mm_srli_epi32(pixels, 5); |
| 118 |
| 119 // Mask |
| 120 pixels_high = _mm_and_si128(mask_green, pixels_high); |
| 121 pixels = _mm_and_si128(mask_green, pixels); |
| 122 |
| 123 // Combine into RGB565 and store |
| 124 pixels = _mm_hadd_epi16(pixels, pixels_high); |
| 125 _mm_store_si128(dst_wide, pixels); |
| 126 count -= 8; |
| 127 dst_wide++; |
| 128 } while (count >= 8); |
| 129 |
| 130 dst = reinterpret_cast<uint16_t*>(dst_wide); |
| 131 } |
| 132 |
| 133 // Small loop to handle remaining pixels. |
| 134 while (count > 0) { |
| 135 *dst = Color32A_D565_1x(*dst, scale, src_expand); |
| 136 dst += 1; |
| 137 count--; |
| 138 } |
| 139 } |
| 140 |
66 #endif | 141 #endif |
OLD | NEW |