OLD | NEW |
1 #include "SkBlitRow_opts_SSE4.h" | 1 #include "SkBlitRow_opts_SSE4.h" |
2 | 2 |
3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. | 3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. |
4 // The stubs should never be called, so we make them crash just to confirm that. | 4 // The stubs should never be called, so we make them crash just to confirm that. |
5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 | 5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 |
6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { | 6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { |
7 sk_throw(); | 7 sk_throw(); |
8 } | 8 } |
9 | 9 |
10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { | |
11 sk_throw(); | |
12 } | |
13 | |
14 #else | 10 #else |
15 | 11 |
16 #include <smmintrin.h> // SSE4.1 intrinsics | 12 #include <smmintrin.h> // SSE4.1 intrinsics |
17 | |
18 #include "SkColorPriv.h" | 13 #include "SkColorPriv.h" |
19 #include "SkColor_opts_SSE2.h" | 14 #include "SkColor_opts_SSE2.h" |
20 | 15 |
21 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, | 16 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, |
22 const SkPMColor* SK_RESTRICT src, | 17 const SkPMColor* SK_RESTRICT src, |
23 int count, | 18 int count, |
24 U8CPU alpha) { | 19 U8CPU alpha) { |
25 SkASSERT(alpha == 255); | 20 SkASSERT(alpha == 255); |
26 // As long as we can, we'll work on 16 pixel pairs at once. | 21 // As long as we can, we'll work on 16 pixel pairs at once. |
27 int count16 = count / 16; | 22 int count16 = count / 16; |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
59 | 54 |
60 // Wrap up the last <= 15 pixels. | 55 // Wrap up the last <= 15 pixels. |
61 for (int i = count16*16; i < count; i++) { | 56 for (int i = count16*16; i < count; i++) { |
62 // This check is not really necessary, but it prevents pointless autovectorization. | 57 // This check is not really necessary, but it prevents pointless autovectorization. |
63 if (src[i] & 0xFF000000) { | 58 if (src[i] & 0xFF000000) { |
64 dst[i] = SkPMSrcOver(src[i], dst[i]); | 59 dst[i] = SkPMSrcOver(src[i], dst[i]); |
65 } | 60 } |
66 } | 61 } |
67 } | 62 } |
68 | 63 |
69 static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) { | |
70 uint32_t dst_expand = SkExpand_rgb_16(dst) * scale; | |
71 return SkCompact_rgb_16((src_expand + dst_expand) >> 5); | |
72 } | |
73 | |
74 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { | |
75 SkASSERT(count > 0); | |
76 | |
77 uint32_t src_expand = (SkGetPackedG32(src) << 24) | | |
78 (SkGetPackedR32(src) << 13) | | |
79 (SkGetPackedB32(src) << 2); | |
80 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; | |
81 | |
82 // Check if we have enough pixels to run SIMD | |
83 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { | |
84 __m128i* dst_wide; | |
85 const __m128i src_expand_wide = _mm_set1_epi32(src_expand); | |
86 const __m128i scale_wide = _mm_set1_epi32(scale); | |
87 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | | |
88 SK_B16_MASK_IN_PLACE | | |
89 (SK_G16_MASK_IN_PLACE << 16)); | |
90 | |
91 // Align dst to an even 16 byte address (0-7 pixels) | |
92 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { | |
93 *dst = Color32A_D565_1x(*dst, scale, src_expand); | |
94 dst += 1; | |
95 count--; | |
96 } | |
97 | |
98 dst_wide = reinterpret_cast<__m128i*>(dst); | |
99 do { | |
100 // Load 8 RGB565 pixels | |
101 __m128i pixels = _mm_load_si128(dst_wide); | |
102 | |
103 // Duplicate and mask | |
104 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); | |
105 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
106 pixels = _mm_unpacklo_epi16(pixels, pixels); | |
107 pixels = _mm_and_si128(mask_green, pixels); | |
108 | |
109 // Scale with alpha | |
110 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); | |
111 pixels = _mm_mullo_epi32(pixels, scale_wide); | |
112 | |
113 // Add src_expand_wide and shift down again | |
114 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); | |
115 pixels_high = _mm_srli_epi32(pixels_high, 5); | |
116 pixels = _mm_add_epi32(pixels, src_expand_wide); | |
117 pixels = _mm_srli_epi32(pixels, 5); | |
118 | |
119 // Mask | |
120 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
121 pixels = _mm_and_si128(mask_green, pixels); | |
122 | |
123 // Combine into RGB565 and store | |
124 pixels = _mm_hadd_epi16(pixels, pixels_high); | |
125 _mm_store_si128(dst_wide, pixels); | |
126 count -= 8; | |
127 dst_wide++; | |
128 } while (count >= 8); | |
129 | |
130 dst = reinterpret_cast<uint16_t*>(dst_wide); | |
131 } | |
132 | |
133 // Small loop to handle remaining pixels. | |
134 while (count > 0) { | |
135 *dst = Color32A_D565_1x(*dst, scale, src_expand); | |
136 dst += 1; | |
137 count--; | |
138 } | |
139 } | |
140 | |
141 #endif | 64 #endif |
OLD | NEW |