Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(225)

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1618003002: Use NEON optimizations for RGB -> RGB(FF) or BGR(FF) in SkSwizzler (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Fix Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkSwizzler_opts_DEFINED 8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED 9 #define SkSwizzler_opts_DEFINED
10 10
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
53 b = src[i] >> 16, 53 b = src[i] >> 16,
54 g = src[i] >> 8, 54 g = src[i] >> 8,
55 r = src[i] >> 0; 55 r = src[i] >> 0;
56 dst[i] = (uint32_t)a << 24 56 dst[i] = (uint32_t)a << 24
57 | (uint32_t)r << 16 57 | (uint32_t)r << 16
58 | (uint32_t)g << 8 58 | (uint32_t)g << 8
59 | (uint32_t)b << 0; 59 | (uint32_t)b << 0;
60 } 60 }
61 } 61 }
62 62
63 static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
64 const uint8_t* src = (const uint8_t*)vsrc;
65 for (int i = 0; i < count; i++) {
66 uint8_t r = src[0],
67 g = src[1],
68 b = src[2];
69 src += 3;
70 dst[i] = (uint32_t)0xFF << 24
71 | (uint32_t)b << 16
72 | (uint32_t)g << 8
73 | (uint32_t)r << 0;
74 }
75 }
76
77 static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
78 const uint8_t* src = (const uint8_t*)vsrc;
79 for (int i = 0; i < count; i++) {
80 uint8_t r = src[0],
81 g = src[1],
82 b = src[2];
83 src += 3;
84 dst[i] = (uint32_t)0xFF << 24
85 | (uint32_t)r << 16
86 | (uint32_t)g << 8
87 | (uint32_t)b << 0;
88 }
89 }
90
63 #if defined(SK_ARM_HAS_NEON) 91 #if defined(SK_ARM_HAS_NEON)
64 92
65 // Rounded divide by 255, (x + 127) / 255 93 // Rounded divide by 255, (x + 127) / 255
66 static uint8x8_t div255_round(uint16x8_t x) { 94 static uint8x8_t div255_round(uint16x8_t x) {
67 // result = (x + 127) / 255 95 // result = (x + 127) / 255
68 // result = (x + 127) / 256 + error1 96 // result = (x + 127) / 256 + error1
69 // 97 //
70 // error1 = (x + 127) / (255 * 256) 98 // error1 = (x + 127) / (255 * 256)
71 // error1 = (x + 127) / (256 * 256) + error2 99 // error1 = (x + 127) / (256 * 256) + error2
72 // 100 //
(...skipping 16 matching lines...) Expand all
89 // Scale a byte by another, (x * y + 127) / 255 117 // Scale a byte by another, (x * y + 127) / 255
90 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) { 118 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
91 return div255_round(vmull_u8(x, y)); 119 return div255_round(vmull_u8(x, y));
92 } 120 }
93 121
94 template <bool kSwapRB> 122 template <bool kSwapRB>
95 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { 123 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
96 auto src = (const uint32_t*)vsrc; 124 auto src = (const uint32_t*)vsrc;
97 while (count >= 8) { 125 while (count >= 8) {
98 // Load 8 pixels. 126 // Load 8 pixels.
99 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); 127 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
100 128
101 uint8x8_t a = bgra.val[3], 129 uint8x8_t a = rgba.val[3],
102 b = bgra.val[2], 130 b = rgba.val[2],
103 g = bgra.val[1], 131 g = rgba.val[1],
104 r = bgra.val[0]; 132 r = rgba.val[0];
105 133
106 // Premultiply. 134 // Premultiply.
107 b = scale(b, a); 135 b = scale(b, a);
108 g = scale(g, a); 136 g = scale(g, a);
109 r = scale(r, a); 137 r = scale(r, a);
110 138
111 // Store 8 premultiplied pixels. 139 // Store 8 premultiplied pixels.
112 if (kSwapRB) { 140 if (kSwapRB) {
113 bgra.val[2] = r; 141 rgba.val[2] = r;
114 bgra.val[1] = g; 142 rgba.val[1] = g;
115 bgra.val[0] = b; 143 rgba.val[0] = b;
116 } else { 144 } else {
117 bgra.val[2] = b; 145 rgba.val[2] = b;
118 bgra.val[1] = g; 146 rgba.val[1] = g;
119 bgra.val[0] = r; 147 rgba.val[0] = r;
120 } 148 }
121 vst4_u8((uint8_t*) dst, bgra); 149 vst4_u8((uint8_t*) dst, rgba);
122 src += 8; 150 src += 8;
123 dst += 8; 151 dst += 8;
124 count -= 8; 152 count -= 8;
125 } 153 }
126 154
127 // Call portable code to finish up the tail of [0,8) pixels. 155 // Call portable code to finish up the tail of [0,8) pixels.
128 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 156 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
129 proc(dst, src, count); 157 proc(dst, src, count);
130 } 158 }
131 159
132 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { 160 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
133 premul_should_swapRB<false>(dst, src, count); 161 premul_should_swapRB<false>(dst, src, count);
134 } 162 }
135 163
136 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { 164 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
137 premul_should_swapRB<true>(dst, src, count); 165 premul_should_swapRB<true>(dst, src, count);
138 } 166 }
139 167
140 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { 168 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
141 auto src = (const uint32_t*)vsrc; 169 auto src = (const uint32_t*)vsrc;
142 while (count >= 16) { 170 while (count >= 16) {
143 // Load 16 pixels. 171 // Load 16 pixels.
144 uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src); 172 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
145 173
146 // Swap r and b. 174 // Swap r and b.
147 SkTSwap(bgra.val[0], bgra.val[2]); 175 SkTSwap(rgba.val[0], rgba.val[2]);
148 176
149 // Store 16 pixels. 177 // Store 16 pixels.
150 vst4q_u8((uint8_t*) dst, bgra); 178 vst4q_u8((uint8_t*) dst, rgba);
151 src += 16; 179 src += 16;
152 dst += 16; 180 dst += 16;
153 count -= 16; 181 count -= 16;
154 } 182 }
155 183
156 if (count >= 8) { 184 if (count >= 8) {
157 // Load 8 pixels. 185 // Load 8 pixels.
158 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); 186 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
159 187
160 // Swap r and b. 188 // Swap r and b.
161 SkTSwap(bgra.val[0], bgra.val[2]); 189 SkTSwap(rgba.val[0], rgba.val[2]);
162 190
163 // Store 8 pixels. 191 // Store 8 pixels.
164 vst4_u8((uint8_t*) dst, bgra); 192 vst4_u8((uint8_t*) dst, rgba);
165 src += 8; 193 src += 8;
166 dst += 8; 194 dst += 8;
167 count -= 8; 195 count -= 8;
168 } 196 }
169 197
170 RGBA_to_BGRA_portable(dst, src, count); 198 RGBA_to_BGRA_portable(dst, src, count);
171 } 199 }
172 200
201 template <bool kSwapRB>
202 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
203 const uint8_t* src = (const uint8_t*) vsrc;
204 while (count >= 16) {
205 // Load 16 pixels.
206 uint8x16x3_t rgb = vld3q_u8(src);
207
208 // Insert an opaque alpha channel and swap if needed.
209 uint8x16x4_t rgba;
210 if (kSwapRB) {
211 rgba.val[0] = rgb.val[2];
212 rgba.val[2] = rgb.val[0];
213 } else {
214 rgba.val[0] = rgb.val[0];
215 rgba.val[2] = rgb.val[2];
216 }
217 rgba.val[1] = rgb.val[1];
218 rgba.val[3] = vdupq_n_u8(0xFF);
219
220 // Store 16 pixels.
221 vst4q_u8((uint8_t*) dst, rgba);
222 src += 16*3;
223 dst += 16;
224 count -= 16;
225 }
226
227 if (count >= 8) {
228 // Load 8 pixels.
229 uint8x8x3_t rgb = vld3_u8(src);
230
231 // Insert an opaque alpha channel and swap if needed.
232 uint8x8x4_t rgba;
233 if (kSwapRB) {
234 rgba.val[0] = rgb.val[2];
235 rgba.val[2] = rgb.val[0];
236 } else {
237 rgba.val[0] = rgb.val[0];
238 rgba.val[2] = rgb.val[2];
239 }
240 rgba.val[1] = rgb.val[1];
241 rgba.val[3] = vdup_n_u8(0xFF);
242
243 // Store 8 pixels.
244 vst4_u8((uint8_t*) dst, rgba);
245 src += 8*3;
246 dst += 8;
247 count -= 8;
248 }
249
250 // Call portable code to finish up the tail of [0,8) pixels.
251 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
252 proc(dst, src, count);
253 }
254
255 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
256 insert_alpha_should_swaprb<false>(dst, src, count);
257 }
258
259 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
260 insert_alpha_should_swaprb<true>(dst, src, count);
261 }
262
173 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 263 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
174 264
175 template <bool kSwapRB> 265 template <bool kSwapRB>
176 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { 266 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
177 auto src = (const uint32_t*)vsrc; 267 auto src = (const uint32_t*)vsrc;
178 268
179 auto premul8 = [](__m128i* lo, __m128i* hi) { 269 auto premul8 = [](__m128i* lo, __m128i* hi) {
180 const __m128i zeros = _mm_setzero_si128(); 270 const __m128i zeros = _mm_setzero_si128();
181 const __m128i _128 = _mm_set1_epi16(128); 271 const __m128i _128 = _mm_set1_epi16(128);
182 const __m128i _257 = _mm_set1_epi16(257); 272 const __m128i _257 = _mm_set1_epi16(257);
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after
261 _mm_storeu_si128((__m128i*) dst, bgra); 351 _mm_storeu_si128((__m128i*) dst, bgra);
262 352
263 src += 4; 353 src += 4;
264 dst += 4; 354 dst += 4;
265 count -= 4; 355 count -= 4;
266 } 356 }
267 357
268 RGBA_to_BGRA_portable(dst, src, count); 358 RGBA_to_BGRA_portable(dst, src, count);
269 } 359 }
270 360
361 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
362 RGB_to_RGB1_portable(dst, src, count);
363 }
364
365 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
366 RGB_to_BGR1_portable(dst, src, count);
367 }
368
271 #else 369 #else
272 370
273 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { 371 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
274 RGBA_to_rgbA_portable(dst, src, count); 372 RGBA_to_rgbA_portable(dst, src, count);
275 } 373 }
276 374
277 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { 375 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
278 RGBA_to_bgrA_portable(dst, src, count); 376 RGBA_to_bgrA_portable(dst, src, count);
279 } 377 }
280 378
281 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { 379 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
282 RGBA_to_BGRA_portable(dst, src, count); 380 RGBA_to_BGRA_portable(dst, src, count);
283 } 381 }
284 382
383 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
384 RGB_to_RGB1_portable(dst, src, count);
385 }
386
387 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
388 RGB_to_BGR1_portable(dst, src, count);
389 }
390
285 #endif 391 #endif
286 392
287 } 393 }
288 394
289 #endif // SkSwizzler_opts_DEFINED 395 #endif // SkSwizzler_opts_DEFINED
OLD | NEW
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698