Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(304)

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1618003002: Use NEON optimizations for RGB -> RGB(FF) or BGR(FF) in SkSwizzler (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Bringing back the comments (that still make sense) Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkSwizzler_opts_DEFINED 8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED 9 #define SkSwizzler_opts_DEFINED
10 10
11 #include "SkColorPriv.h" 11 #include "SkColorPriv.h"
12 12
13 namespace SK_OPTS_NS { 13 namespace SK_OPTS_NS {
14 14
15 // These variable names in these functions just pretend the input is BGRA. 15 // These variable names in these functions just pretend the input is BGRA.
mtklein 2016/01/22 14:37:32 (or BGR)
msarett 2016/01/22 15:00:36 Done.
16 // They work fine with both RGBA and BGRA. 16 // They work fine with both RGBA and BGRA.
mtklein 2016/01/22 14:37:32 (or both BGR and RGB).
msarett 2016/01/22 15:00:36 Done.
17 17
18 static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { 18 static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
mtklein 2016/01/22 14:37:33 Do you think we made things unnecessarily complica
mtklein 2016/01/22 14:44:04 Gonna write up a CL to demonstrate what I mean.
msarett 2016/01/22 15:00:36 Yes.
mtklein 2016/01/22 15:23:57 Oh, right, how about RGB_to_RGB1 / RGB_to_BGR1?
msarett 2016/01/22 17:27:23 Done.
19 for (int i = 0; i < count; i++) { 19 for (int i = 0; i < count; i++) {
20 uint8_t a = src[i] >> 24, 20 uint8_t a = src[i] >> 24,
21 r = src[i] >> 16, 21 r = src[i] >> 16,
22 g = src[i] >> 8, 22 g = src[i] >> 8,
23 b = src[i] >> 0; 23 b = src[i] >> 0;
24 r = (r*a+127)/255; 24 r = (r*a+127)/255;
25 g = (g*a+127)/255; 25 g = (g*a+127)/255;
26 b = (b*a+127)/255; 26 b = (b*a+127)/255;
27 dst[i] = (uint32_t)a << 24 27 dst[i] = (uint32_t)a << 24
28 | (uint32_t)r << 16 28 | (uint32_t)r << 16
(...skipping 24 matching lines...) Expand all
53 r = src[i] >> 16, 53 r = src[i] >> 16,
54 g = src[i] >> 8, 54 g = src[i] >> 8,
55 b = src[i] >> 0; 55 b = src[i] >> 0;
56 dst[i] = (uint32_t)a << 24 56 dst[i] = (uint32_t)a << 24
57 | (uint32_t)b << 16 57 | (uint32_t)b << 16
58 | (uint32_t)g << 8 58 | (uint32_t)g << 8
59 | (uint32_t)r << 0; 59 | (uint32_t)r << 0;
60 } 60 }
61 } 61 }
62 62
63 static void xxx_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
mtklein 2016/01/22 14:37:32 src is three-byte right, not some sort of RGBx? S
msarett 2016/01/22 15:00:36 Agreed. I made a similar comment in Patch Set 1.
mtklein 2016/01/22 15:23:57 Good. Was going to suggest that. :) While they'r
msarett 2016/01/22 17:27:23 Done.
64 int i8 = 0;
65 const uint8_t* src8 = (const uint8_t*) src;
66 for (int i32 = 0; i32 < count; i32++) {
67 uint8_t b = src8[i8++],
68 g = src8[i8++],
69 r = src8[i8++];
70 dst[i32] = (uint32_t) b << 0
mtklein 2016/01/22 14:37:32 Let's keep our order consistent with the rest of t
msarett 2016/01/22 15:00:36 Done.
71 | (uint32_t) g << 8
72 | (uint32_t) r << 16
73 | (uint32_t)0xFF << 24;
74 }
75 }
76
77 static void xxx_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
78 int i8 = 0;
79 const uint8_t* src8 = (const uint8_t*) src;
80 for (int i32 = 0; i32 < count; i32++) {
81 uint8_t b = src8[i8++],
82 g = src8[i8++],
83 r = src8[i8++];
84 dst[i32] = (uint32_t) r << 0
85 | (uint32_t) g << 8
86 | (uint32_t) b << 16
87 | (uint32_t)0xFF << 24;
88 }
89 }
90
63 #if defined(SK_ARM_HAS_NEON) 91 #if defined(SK_ARM_HAS_NEON)
64 92
65 // Rounded divide by 255, (x + 127) / 255 93 // Rounded divide by 255, (x + 127) / 255
66 static uint8x8_t div255_round(uint16x8_t x) { 94 static uint8x8_t div255_round(uint16x8_t x) {
67 // result = (x + 127) / 255 95 // result = (x + 127) / 255
68 // result = (x + 127) / 256 + error1 96 // result = (x + 127) / 256 + error1
69 // 97 //
70 // error1 = (x + 127) / (255 * 256) 98 // error1 = (x + 127) / (255 * 256)
71 // error1 = (x + 127) / (256 * 256) + error2 99 // error1 = (x + 127) / (256 * 256) + error2
72 // 100 //
(...skipping 88 matching lines...) Expand 10 before | Expand all | Expand 10 after
161 // Store 8 pixels. 189 // Store 8 pixels.
162 vst4_u8((uint8_t*) dst, bgra); 190 vst4_u8((uint8_t*) dst, bgra);
163 src += 8; 191 src += 8;
164 dst += 8; 192 dst += 8;
165 count -= 8; 193 count -= 8;
166 } 194 }
167 195
168 swaprb_xxxa_portable(dst, src, count); 196 swaprb_xxxa_portable(dst, src, count);
169 } 197 }
170 198
199 template <bool kSwapRB>
200 static void xxx_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
201 const uint8_t* src8 = (const uint8_t*) src;
202 while (count >= 16) {
203 // Load 16 pixels.
204 uint8x16x3_t bgr = vld3q_u8(src8);
205
206 // Insert an opaque alpha channel and swap if needed.
207 uint8x16x4_t bgra;
208 if (kSwapRB) {
209 bgra.val[0] = bgr.val[2];
210 bgra.val[2] = bgr.val[0];
211 } else {
212 bgra.val[0] = bgr.val[0];
213 bgra.val[2] = bgr.val[2];
214 }
215 bgra.val[1] = bgr.val[1];
216 bgra.val[3] = vdupq_n_u8(0xFF);
217
218 // Store 16 pixels.
219 vst4q_u8((uint8_t*) dst, bgra);
220 src8 += 48;
mtklein 2016/01/22 14:37:33 might write this as += 16*3? I find it really ple
msarett 2016/01/22 15:00:36 Done.
221 dst += 16;
222 count -= 16;
223 }
224
225 if (count >= 8) {
226 // Load 8 pixels.
227 uint8x8x3_t bgr = vld3_u8(src8);
228
229 // Insert an opaque alpha channel and swap if needed.
230 uint8x8x4_t bgra;
231 if (kSwapRB) {
232 bgra.val[0] = bgr.val[2];
233 bgra.val[2] = bgr.val[0];
234 } else {
235 bgra.val[0] = bgr.val[0];
236 bgra.val[2] = bgr.val[2];
237 }
238 bgra.val[1] = bgr.val[1];
239 bgra.val[3] = vdup_n_u8(0xFF);
240
241 // Store 8 pixels.
242 vst4_u8((uint8_t*) dst, bgra);
243 src8 += 24;
244 dst += 8;
245 count -= 8;
246 }
247
248 // Call portable code to finish up the tail of [0,8) pixels.
249 auto proc = kSwapRB ? xxx_swaprb_xxxa_portable : xxx_xxxa_portable;
250 proc(dst, (const uint32_t*) src8, count);
251 }
252
253 static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) {
254 xxx_xxxa_should_swaprb<false>(dst, src, count);
255 }
256
257 static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
258 xxx_xxxa_should_swaprb<true>(dst, src, count);
259 }
260
171 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 261 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
172 262
173 template <bool kSwapRB> 263 template <bool kSwapRB>
174 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { 264 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
175 265
176 auto premul8 = [](__m128i* lo, __m128i* hi) { 266 auto premul8 = [](__m128i* lo, __m128i* hi) {
177 const __m128i zeros = _mm_setzero_si128(); 267 const __m128i zeros = _mm_setzero_si128();
178 const __m128i _128 = _mm_set1_epi16(128); 268 const __m128i _128 = _mm_set1_epi16(128);
179 const __m128i _257 = _mm_set1_epi16(257); 269 const __m128i _257 = _mm_set1_epi16(257);
180 __m128i planar; 270 __m128i planar;
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after
257 _mm_storeu_si128((__m128i*) dst, rgba); 347 _mm_storeu_si128((__m128i*) dst, rgba);
258 348
259 src += 4; 349 src += 4;
260 dst += 4; 350 dst += 4;
261 count -= 4; 351 count -= 4;
262 } 352 }
263 353
264 swaprb_xxxa_portable(dst, src, count); 354 swaprb_xxxa_portable(dst, src, count);
265 } 355 }
266 356
357 static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) {
358 xxx_xxxa_portable(dst, src, count);
359 }
360
361 static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
362 xxx_swaprb_xxxa_portable(dst, src, count);
363 }
364
267 #else 365 #else
268 366
269 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { 367 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
270 premul_xxxa_portable(dst, src, count); 368 premul_xxxa_portable(dst, src, count);
271 } 369 }
272 370
273 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 371 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
274 premul_swaprb_xxxa_portable(dst, src, count); 372 premul_swaprb_xxxa_portable(dst, src, count);
275 } 373 }
276 374
277 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 375 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
278 swaprb_xxxa_portable(dst, src, count); 376 swaprb_xxxa_portable(dst, src, count);
279 } 377 }
280 378
379 static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) {
380 xxx_xxxa_portable(dst, src, count);
381 }
382
383 static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
384 xxx_swaprb_xxxa_portable(dst, src, count);
385 }
386
281 #endif 387 #endif
282 388
283 } 389 }
284 390
285 #endif // SkSwizzler_opts_DEFINED 391 #endif // SkSwizzler_opts_DEFINED
OLD | NEW
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698