OLD | NEW |
---|---|
1 #include "SkBlitRow_opts_SSE4.h" | 1 #include "SkBlitRow_opts_SSE4.h" |
2 | 2 |
3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. | 3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. |
4 // The stubs should never be called, so we make them crash just to confirm that. | 4 // The stubs should never be called, so we make them crash just to confirm that. |
5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 | 5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 |
6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { | 6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { |
7 sk_throw(); | 7 sk_throw(); |
8 } | 8 } |
9 | 9 |
10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { | |
11 sk_throw(); | |
12 } | |
13 | |
10 #else | 14 #else |
11 | 15 |
12 #include <emmintrin.h> // SSE2: Most _mm_foo() in this file. | 16 #include <emmintrin.h> // SSE2: Most _mm_foo() in this file. |
13 #include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128. | 17 #include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128. |
14 | 18 |
15 #include "SkColorPriv.h" | 19 #include "SkColorPriv.h" |
16 #include "SkColor_opts_SSE2.h" | 20 #include "SkColor_opts_SSE2.h" |
17 | 21 |
18 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, | 22 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, |
19 const SkPMColor* SK_RESTRICT src, | 23 const SkPMColor* SK_RESTRICT src, |
(...skipping 37 matching lines...) | |
57 // Wrap up the last <= 15 pixels. | 61 // Wrap up the last <= 15 pixels. |
58 SkASSERT(count - (count16*16) <= 15); | 62 SkASSERT(count - (count16*16) <= 15); |
59 for (int i = count16*16; i < count; i++) { | 63 for (int i = count16*16; i < count; i++) { |
60 // This check is not really necessary, but it prevents pointless autovectorization. | 64 // This check is not really necessary, but it prevents pointless autovectorization. |
61 if (src[i] & 0xFF000000) { | 65 if (src[i] & 0xFF000000) { |
62 dst[i] = SkPMSrcOver(src[i], dst[i]); | 66 dst[i] = SkPMSrcOver(src[i], dst[i]); |
63 } | 67 } |
64 } | 68 } |
65 } | 69 } |
66 | 70 |
71 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { | |
72 SkASSERT(count > 0); | |
73 // dst must be 2-byte aligned. | |
74 SkASSERT((((size_t)dst) & 0x01) == 0); | |
mtklein
2015/01/30 18:17:37
Seems paranoid? If we need to keep this, let's wr
henrik.smiding
2015/02/10 15:11:51
I thought so too, but I actually found it in the n
| |
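If the assertion stays, a slightly more self-documenting spelling might be something like the following; this is only a sketch and assumes Skia's SkIsAlign2 helper is visible in this file:

    // Same check as ((size_t)dst & 0x01) == 0, just clearer about intent.
    SkASSERT(SkIsAlign2(reinterpret_cast<uintptr_t>(dst)));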
75 | |
76 uint32_t src_expand = (SkGetPackedG32(src) << 24) | | |
77 (SkGetPackedR32(src) << 13) | | |
78 (SkGetPackedB32(src) << 2); | |
79 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; | |
80 | |
81 // Check if we have enough pixels to run SIMD, at all | |
82 if (count >= 8) { | |
mtklein
2015/01/30 18:17:37
This logic seems complex, and I'm worried we're ov
henrik.smiding
2015/02/10 15:11:52
I did this optimization based on reports that it w
| |
83 __m128i* dst_wide; | |
84 const __m128i src_expand_wide = _mm_set1_epi32(src_expand); | |
85 const __m128i scale_wide = _mm_set1_epi32(scale); | |
86 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | | |
87 SK_B16_MASK_IN_PLACE | | |
88 (SK_G16_MASK_IN_PLACE << 16)); | |
89 | |
90 // Check if we should run the aligned SIMD loop | |
91 if ((count >= 64) || ((((size_t)dst) & 0x0F) == 0)) { | |
mtklein
2015/01/30 18:17:37
How'd you pick 64?
henrik.smiding
2015/02/10 15:11:52
I measured different widths with skia bench, using
| |
92 // Align dst to an even 16 byte address (0-7 pixels) | |
93 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { | |
94 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; | |
mtklein
2015/01/30 18:17:37
You've got a lot of repeated code here. If it nee
henrik.smiding
2015/02/10 15:11:51
I'll re-factor the code a bit. When you're satisfi
| |
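One possible shape for that refactor, sketched purely as an illustration (the blend_8_pixels helper and its kAligned flag are hypothetical names, not necessarily what the CL ended up with): hoist the per-8-pixel arithmetic into a helper and make the alignment of the load/store a compile-time parameter, so the aligned and unaligned loops below share one body.

    template <bool kAligned>
    static inline void blend_8_pixels(__m128i* dst_wide,
                                      const __m128i& src_expand_wide,
                                      const __m128i& scale_wide,
                                      const __m128i& mask_green) {
        // Load 8 RGB565 pixels, aligned or unaligned depending on the caller.
        __m128i pixels = kAligned ? _mm_load_si128(dst_wide)
                                  : _mm_loadu_si128(dst_wide);

        // Duplicate and mask.
        __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);
        pixels_high = _mm_and_si128(mask_green, pixels_high);
        pixels = _mm_unpacklo_epi16(pixels, pixels);
        pixels = _mm_and_si128(mask_green, pixels);

        // Scale with alpha.
        pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);
        pixels = _mm_mullo_epi32(pixels, scale_wide);

        // Add src_expand_wide and shift down again.
        pixels_high = _mm_srli_epi32(_mm_add_epi32(pixels_high, src_expand_wide), 5);
        pixels = _mm_srli_epi32(_mm_add_epi32(pixels, src_expand_wide), 5);

        // Mask, combine into RGB565, and store.
        pixels = _mm_hadd_epi16(_mm_and_si128(mask_green, pixels),
                                _mm_and_si128(mask_green, pixels_high));
        if (kAligned) {
            _mm_store_si128(dst_wide, pixels);
        } else {
            _mm_storeu_si128(dst_wide, pixels);
        }
    }

With a helper like that, the two do/while loops reduce to blend_8_pixels<true>(...) and blend_8_pixels<false>(...) plus the count and pointer bookkeeping.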
95 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); | |
96 dst += 1; | |
97 count--; | |
98 } | |
99 | |
100 dst_wide = reinterpret_cast<__m128i*>(dst); | |
101 do { | |
102 // Load 8 RGB565 pixels | |
103 __m128i pixels = _mm_load_si128(dst_wide); | |
104 | |
105 // Duplicate and mask | |
106 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); | |
107 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
108 pixels = _mm_unpacklo_epi16(pixels, pixels); | |
109 pixels = _mm_and_si128(mask_green, pixels); | |
110 | |
111 // Scale with alpha | |
112 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); | |
mtklein
2015/01/30 18:17:37
It doesn't seem like SSE4 is essential to the spee
henrik.smiding
2015/02/10 15:11:51
I added the _mm_hadd instruction at the last minut
| |
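For context on that exchange: _mm_mullo_epi32 is the only SSE4.1 intrinsic in this loop, and _mm_hadd_epi16 further down is SSSE3. If an SSE2-only fallback were ever wanted, the 32-bit low multiply can be emulated with the usual even/odd _mm_mul_epu32 pattern, roughly as sketched here (not part of the CL):

    static inline __m128i mullo_epi32_sse2(__m128i a, __m128i b) {
        // 32x32->64 multiplies of the even (0,2) and odd (1,3) lanes,
        // then keep only the low 32 bits of each product.
        __m128i even = _mm_mul_epu32(a, b);
        __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                                  _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0)));
    }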
113 pixels = _mm_mullo_epi32(pixels, scale_wide); | |
114 | |
115 // Add src_expand_wide and shift down again | |
116 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); | |
117 pixels_high = _mm_srli_epi32(pixels_high, 5); | |
118 pixels = _mm_add_epi32(pixels, src_expand_wide); | |
119 pixels = _mm_srli_epi32(pixels, 5); | |
120 | |
121 // Mask | |
122 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
123 pixels = _mm_and_si128(mask_green, pixels); | |
124 | |
125 // Combine into RGB565 and store | |
126 pixels = _mm_hadd_epi16(pixels, pixels_high); | |
127 _mm_store_si128(dst_wide, pixels); | |
128 count -= 8; | |
129 dst_wide++; | |
130 } while (count >= 8); | |
131 } | |
132 else { // Unaligned loop to handle medium widths | |
133 dst_wide = reinterpret_cast<__m128i*>(dst); | |
134 | |
135 do { | |
136 // Load 8 RGB565 pixels | |
137 __m128i pixels = _mm_loadu_si128(dst_wide); | |
138 | |
139 // Duplicate and mask | |
140 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); | |
141 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
142 pixels = _mm_unpacklo_epi16(pixels, pixels); | |
143 pixels = _mm_and_si128(mask_green, pixels); | |
144 | |
145 // Scale with alpha | |
146 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); | |
147 pixels = _mm_mullo_epi32(pixels, scale_wide); | |
148 | |
149 // Add src_expand_wide and shift down again | |
150 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); | |
151 pixels_high = _mm_srli_epi32(pixels_high, 5); | |
152 pixels = _mm_add_epi32(pixels, src_expand_wide); | |
153 pixels = _mm_srli_epi32(pixels, 5); | |
154 | |
155 // Mask | |
156 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
157 pixels = _mm_and_si128(mask_green, pixels); | |
158 | |
159 // Combine into RGB565 and store | |
160 pixels = _mm_hadd_epi16(pixels, pixels_high); | |
161 _mm_storeu_si128(dst_wide, pixels); | |
162 count -= 8; | |
163 dst_wide++; | |
164 } while (count >= 8); | |
165 } | |
166 | |
167 dst = reinterpret_cast<uint16_t*>(dst_wide); | |
168 } | |
169 | |
170 // Small loop to handle remaining 0-7 pixels. | |
171 while (count > 0) { | |
172 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; | |
173 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); | |
174 dst += 1; | |
175 count--; | |
176 } | |
177 } | |
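For reference, the scalar tail loops and the SIMD lanes above compute the same fixed-point blend. A standalone sketch of that math, with expand_565/compact_565 standing in for SkExpand_rgb_16/SkCompact_rgb_16 and the source channels passed in already unpacked, so no particular SkPMColor byte order is assumed:

    #include <stdint.h>

    static inline uint32_t expand_565(uint16_t c) {
        // Move green to the high half so every channel has headroom for a *32 multiply.
        return (c & 0xF81F) | ((uint32_t)(c & 0x07E0) << 16);
    }
    static inline uint16_t compact_565(uint32_t c) {
        return (uint16_t)((c & 0xF81F) | ((c >> 16) & 0x07E0));
    }
    // r, g, b, a: premultiplied 8-bit source channels.
    static inline uint16_t blend_color32a_over_565(uint16_t dst,
                                                   unsigned r, unsigned g,
                                                   unsigned b, unsigned a) {
        // Destination weight in 1/32 steps: (256 - a) / 8, i.e. 0..32.
        uint32_t scale = (256 - a) >> 3;
        // Source channels positioned over the expanded 565 fields and pre-shifted
        // left by 5, so the final >>5 drops them into place while normalizing dst*scale.
        uint32_t src_expand = ((uint32_t)g << 24) | ((uint32_t)r << 13) | ((uint32_t)b << 2);
        uint32_t dst_expand = expand_565(dst) * scale;
        return compact_565((src_expand + dst_expand) >> 5);
    }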
178 | |
67 #endif | 179 #endif |