Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(459)

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1601883002: Add SSSE3 Optimizations for premul and swap (Closed) Base URL: https://skia.googlesource.com/skia.git@f-and-x
Patch Set: Faster repacking, style, comments Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkSwizzler_opts_DEFINED 8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED 9 #define SkSwizzler_opts_DEFINED
10 10
(...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after
167 // Store 8 pixels. 167 // Store 8 pixels.
168 vst4_u8((uint8_t*) dst, bgra); 168 vst4_u8((uint8_t*) dst, bgra);
169 src += 8; 169 src += 8;
170 dst += 8; 170 dst += 8;
171 count -= 8; 171 count -= 8;
172 } 172 }
173 173
174 swaprb_xxxa_portable(dst, src, count); 174 swaprb_xxxa_portable(dst, src, count);
175 } 175 }
176 176
177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
178
179 template <bool kSwapRB>
180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
181 const __m128i zeros = _mm_setzero_si128();
182 const __m128i _128 = _mm_set1_epi16(128);
183 const __m128i _257 = _mm_set1_epi16(257);
184 __m128i planar;
185 if (kSwapRB) {
186 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
187 } else {
188 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
189 }
190
191 while (count >= 8) {
192 // We'll load 8 pixels into 4 registers, each holding a 16-bit component plane.
193
194 // First just load the 8 interlaced pixels.
195 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), // bgrabgra bgrabgra
196 hi = _mm_loadu_si128((const __m128i*) (src + 4)); // BGRABGRA BGRABGRA
197
198 // Swizzle them to 8-bit planar.
199 lo = _mm_shuffle_epi8(lo, planar); // bbbbgggg rrrraaaa
200 hi = _mm_shuffle_epi8(hi, planar); // BBBBGGGG RRRRAAAA
201 __m128i bg = _mm_unpacklo_epi32(lo, hi), // bbbbBBBB ggggGGGG
202 ra = _mm_unpackhi_epi32(lo, hi); // rrrrRRRR aaaaAAAA
203
204 // Unpack to 16-bit planar.
205 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_
206 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_
207 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_
208 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_
209
210 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
211 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);
212 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);
213 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);
214
215 // Repack into interlaced pixels.
216 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG
217 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA
218 lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra
219 hi = _mm_unpackhi_epi16(bg, ra); // BGRABGRA BGRABGRA
220
221 // Store interlaced pixels.
222 _mm_storeu_si128((__m128i*) (dst + 0), lo);
223 _mm_storeu_si128((__m128i*) (dst + 4), hi);
224
225 src += 8;
226 dst += 8;
227 count -= 8;
228 }
229
230 if (count >= 4) {
mtklein 2016/01/19 18:28:30 OK, now that we've got count >= 8 in shape, let's
msarett 2016/01/19 19:17:43 Done.
231 // First just load 4 interlaced pixels.
232 __m128i lo = _mm_loadu_si128((const __m128i*) src); // bgrabgra bgrabgra
233
234 // Swizzle them to 8-bit planar.
235 lo = _mm_shuffle_epi8(lo, planar); // bbbbgggg rrrraaaa
236 __m128i bg = _mm_unpacklo_epi32(lo, zeros), // bbbb____ gggg____
237 ra = _mm_unpackhi_epi32(lo, zeros); // rrrr____ aaaa____
238
239 // Unpack to 16-bit planar.
240 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ ________
241 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ ________
242 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ ________
243 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ ________
244
245 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
246 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);
247 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);
248 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);
249
250 // Repack into interlaced pixels.
251 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg ________
252 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara ________
253 lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra
254
255 // Store interlaced pixels.
256 _mm_storeu_si128((__m128i*) dst, lo);
257
258 src += 4;
259 dst += 4;
260 count -= 4;
261 }
262
263 // Call portable code to finish up the tail of [0,4) pixels.
264 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;
265 proc(dst, src, count);
266 }
267
268 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
269 premul_xxxa_should_swaprb<false>(dst, src, count);
270 }
271
272 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
273 premul_xxxa_should_swaprb<true>(dst, src, count);
274 }
275
276 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
277 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
278
279 while (count >= 4) {
280 __m128i bgra = _mm_loadu_si128((const __m128i*) src);
281 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB);
282 _mm_storeu_si128((__m128i*) dst, rgba);
283
284 src += 4;
285 dst += 4;
286 count -= 4;
287 }
288
289 swaprb_xxxa_portable(dst, src, count);
290 }
291
177 #else 292 #else
178 293
179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { 294 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
180 premul_xxxa_portable(dst, src, count); 295 premul_xxxa_portable(dst, src, count);
181 } 296 }
182 297
183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 298 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
184 premul_swaprb_xxxa_portable(dst, src, count); 299 premul_swaprb_xxxa_portable(dst, src, count);
185 } 300 }
186 301
187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 302 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
188 swaprb_xxxa_portable(dst, src, count); 303 swaprb_xxxa_portable(dst, src, count);
189 } 304 }
190 305
191 #endif 306 #endif
192 307
193 } 308 }
194 309
195 #endif // SkSwizzler_opts_DEFINED 310 #endif // SkSwizzler_opts_DEFINED
OLDNEW
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698