Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(100)

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1626463002: Refactor swizzle names and types. (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | tests/SwizzlerTest.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkSwizzler_opts_DEFINED 8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED 9 #define SkSwizzler_opts_DEFINED
10 10
11 #include "SkColorPriv.h" 11 #include "SkColorPriv.h"
12 12
13 namespace SK_OPTS_NS { 13 namespace SK_OPTS_NS {
14 14
15 // These variable names in these functions just pretend the input is BGRA. 15 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
msarett 2016/01/22 15:35:08 I find this comment to still be useful? There is
mtklein 2016/01/22 15:39:52 Sort of... now that the order and the function nam
msarett 2016/01/22 15:40:51 Agreed that it's obvious. Let's drop it.
16 // They work fine with both RGBA and BGRA. 16 auto src = (const uint32_t*)vsrc;
17
18 static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
19 for (int i = 0; i < count; i++) { 17 for (int i = 0; i < count; i++) {
20 uint8_t a = src[i] >> 24, 18 uint8_t a = src[i] >> 24,
21 r = src[i] >> 16, 19 b = src[i] >> 16,
22 g = src[i] >> 8, 20 g = src[i] >> 8,
23 b = src[i] >> 0; 21 r = src[i] >> 0;
22 b = (b*a+127)/255;
23 g = (g*a+127)/255;
24 r = (r*a+127)/255; 24 r = (r*a+127)/255;
25 dst[i] = (uint32_t)a << 24
26 | (uint32_t)b << 16
27 | (uint32_t)g << 8
28 | (uint32_t)r << 0;
29 }
30 }
31
32 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
33 auto src = (const uint32_t*)vsrc;
34 for (int i = 0; i < count; i++) {
35 uint8_t a = src[i] >> 24,
36 b = src[i] >> 16,
37 g = src[i] >> 8,
38 r = src[i] >> 0;
39 b = (b*a+127)/255;
25 g = (g*a+127)/255; 40 g = (g*a+127)/255;
26 b = (b*a+127)/255; 41 r = (r*a+127)/255;
27 dst[i] = (uint32_t)a << 24 42 dst[i] = (uint32_t)a << 24
28 | (uint32_t)r << 16 43 | (uint32_t)r << 16
29 | (uint32_t)g << 8 44 | (uint32_t)g << 8
30 | (uint32_t)b << 0; 45 | (uint32_t)b << 0;
31 } 46 }
32 } 47 }
33 48
34 static void premul_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { 49 static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
50 auto src = (const uint32_t*)vsrc;
35 for (int i = 0; i < count; i++) { 51 for (int i = 0; i < count; i++) {
36 uint8_t a = src[i] >> 24, 52 uint8_t a = src[i] >> 24,
37 r = src[i] >> 16, 53 b = src[i] >> 16,
38 g = src[i] >> 8, 54 g = src[i] >> 8,
39 b = src[i] >> 0; 55 r = src[i] >> 0;
40 r = (r*a+127)/255;
41 g = (g*a+127)/255;
42 b = (b*a+127)/255;
43 dst[i] = (uint32_t)a << 24 56 dst[i] = (uint32_t)a << 24
44 | (uint32_t)b << 16 57 | (uint32_t)r << 16
45 | (uint32_t)g << 8 58 | (uint32_t)g << 8
46 | (uint32_t)r << 0; 59 | (uint32_t)b << 0;
47 } 60 }
48 } 61 }
49 62
50 static void swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
51 for (int i = 0; i < count; i++) {
52 uint8_t a = src[i] >> 24,
53 r = src[i] >> 16,
54 g = src[i] >> 8,
55 b = src[i] >> 0;
56 dst[i] = (uint32_t)a << 24
57 | (uint32_t)b << 16
58 | (uint32_t)g << 8
59 | (uint32_t)r << 0;
60 }
61 }
62
63 #if defined(SK_ARM_HAS_NEON) 63 #if defined(SK_ARM_HAS_NEON)
64 64
65 // Rounded divide by 255, (x + 127) / 255 65 // Rounded divide by 255, (x + 127) / 255
66 static uint8x8_t div255_round(uint16x8_t x) { 66 static uint8x8_t div255_round(uint16x8_t x) {
67 // result = (x + 127) / 255 67 // result = (x + 127) / 255
68 // result = (x + 127) / 256 + error1 68 // result = (x + 127) / 256 + error1
69 // 69 //
70 // error1 = (x + 127) / (255 * 256) 70 // error1 = (x + 127) / (255 * 256)
71 // error1 = (x + 127) / (256 * 256) + error2 71 // error1 = (x + 127) / (256 * 256) + error2
72 // 72 //
(...skipping 12 matching lines...) Expand all
85 // "add, round, and narrow back to 8-bits" instruction. 85 // "add, round, and narrow back to 8-bits" instruction.
86 return vraddhn_u16(x, vrshrq_n_u16(x, 8)); 86 return vraddhn_u16(x, vrshrq_n_u16(x, 8));
87 } 87 }
88 88
89 // Scale a byte by another, (x * y + 127) / 255 89 // Scale a byte by another, (x * y + 127) / 255
90 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) { 90 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
91 return div255_round(vmull_u8(x, y)); 91 return div255_round(vmull_u8(x, y));
92 } 92 }
93 93
94 template <bool kSwapRB> 94 template <bool kSwapRB>
95 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { 95 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
96 auto src = (const uint32_t*)vsrc;
96 while (count >= 8) { 97 while (count >= 8) {
97 // Load 8 pixels. 98 // Load 8 pixels.
98 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); 99 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);
99 100
100 uint8x8_t a = bgra.val[3], 101 uint8x8_t a = bgra.val[3],
101 r = bgra.val[2], 102 b = bgra.val[2],
102 g = bgra.val[1], 103 g = bgra.val[1],
103 b = bgra.val[0]; 104 r = bgra.val[0];
104 105
105 // Premultiply. 106 // Premultiply.
107 b = scale(b, a);
108 g = scale(g, a);
106 r = scale(r, a); 109 r = scale(r, a);
107 g = scale(g, a);
108 b = scale(b, a);
109 110
110 // Store 8 premultiplied pixels. 111 // Store 8 premultiplied pixels.
111 if (kSwapRB) { 112 if (kSwapRB) {
113 bgra.val[2] = r;
114 bgra.val[1] = g;
115 bgra.val[0] = b;
116 } else {
112 bgra.val[2] = b; 117 bgra.val[2] = b;
113 bgra.val[1] = g; 118 bgra.val[1] = g;
114 bgra.val[0] = r; 119 bgra.val[0] = r;
115 } else {
116 bgra.val[2] = r;
117 bgra.val[1] = g;
118 bgra.val[0] = b;
119 } 120 }
120 vst4_u8((uint8_t*) dst, bgra); 121 vst4_u8((uint8_t*) dst, bgra);
121 src += 8; 122 src += 8;
122 dst += 8; 123 dst += 8;
123 count -= 8; 124 count -= 8;
124 } 125 }
125 126
126 // Call portable code to finish up the tail of [0,8) pixels. 127 // Call portable code to finish up the tail of [0,8) pixels.
127 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; 128 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
128 proc(dst, src, count); 129 proc(dst, src, count);
129 } 130 }
130 131
131 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { 132 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
132 premul_xxxa_should_swaprb<false>(dst, src, count); 133 premul_should_swapRB<false>(dst, src, count);
133 } 134 }
134 135
135 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 136 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
136 premul_xxxa_should_swaprb<true>(dst, src, count); 137 premul_should_swapRB<true>(dst, src, count);
137 } 138 }
138 139
139 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 140 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
141 auto src = (const uint32_t*)vsrc;
140 while (count >= 16) { 142 while (count >= 16) {
141 // Load 16 pixels. 143 // Load 16 pixels.
142 uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src); 144 uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src);
143 145
144 // Swap r and b. 146 // Swap r and b.
145 SkTSwap(bgra.val[0], bgra.val[2]); 147 SkTSwap(bgra.val[0], bgra.val[2]);
146 148
147 // Store 16 pixels. 149 // Store 16 pixels.
148 vst4q_u8((uint8_t*) dst, bgra); 150 vst4q_u8((uint8_t*) dst, bgra);
149 src += 16; 151 src += 16;
150 dst += 16; 152 dst += 16;
151 count -= 16; 153 count -= 16;
152 } 154 }
153 155
154 if (count >= 8) { 156 if (count >= 8) {
155 // Load 8 pixels. 157 // Load 8 pixels.
156 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); 158 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);
157 159
158 // Swap r and b. 160 // Swap r and b.
159 SkTSwap(bgra.val[0], bgra.val[2]); 161 SkTSwap(bgra.val[0], bgra.val[2]);
160 162
161 // Store 8 pixels. 163 // Store 8 pixels.
162 vst4_u8((uint8_t*) dst, bgra); 164 vst4_u8((uint8_t*) dst, bgra);
163 src += 8; 165 src += 8;
164 dst += 8; 166 dst += 8;
165 count -= 8; 167 count -= 8;
166 } 168 }
167 169
168 swaprb_xxxa_portable(dst, src, count); 170 RGBA_to_BGRA_portable(dst, src, count);
169 } 171 }
170 172
171 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 173 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
172 174
173 template <bool kSwapRB> 175 template <bool kSwapRB>
174 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { 176 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
177 auto src = (const uint32_t*)vsrc;
175 178
176 auto premul8 = [](__m128i* lo, __m128i* hi) { 179 auto premul8 = [](__m128i* lo, __m128i* hi) {
177 const __m128i zeros = _mm_setzero_si128(); 180 const __m128i zeros = _mm_setzero_si128();
178 const __m128i _128 = _mm_set1_epi16(128); 181 const __m128i _128 = _mm_set1_epi16(128);
179 const __m128i _257 = _mm_set1_epi16(257); 182 const __m128i _257 = _mm_set1_epi16(257);
180 __m128i planar; 183 __m128i planar;
181 if (kSwapRB) { 184 if (kSwapRB) {
182 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); 185 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
183 } else { 186 } else {
184 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); 187 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
185 } 188 }
186 189
187 // Swizzle the pixels to 8-bit planar. 190 // Swizzle the pixels to 8-bit planar.
188 *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa 191 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa
189 *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA 192 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA
190 __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG 193 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG
191 ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA 194 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA
192 195
193 // Unpack to 16-bit planar. 196 // Unpack to 16-bit planar.
194 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_ 197 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_
195 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_ 198 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_
196 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_ 199 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_
197 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_ 200 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_
198 201
199 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. 202 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
203 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);
204 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);
200 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); 205 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);
201 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);
202 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);
203 206
204 // Repack into interlaced pixels. 207 // Repack into interlaced pixels.
205 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG 208 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
206 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA 209 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA
207 *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra 210 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba
208 *hi = _mm_unpackhi_epi16(bg, ra); // BRGABGRA BGRABGRA 211 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA
209 }; 212 };
210 213
211 while (count >= 8) { 214 while (count >= 8) {
212 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), 215 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
213 hi = _mm_loadu_si128((const __m128i*) (src + 4)); 216 hi = _mm_loadu_si128((const __m128i*) (src + 4));
214 217
215 premul8(&lo, &hi); 218 premul8(&lo, &hi);
216 219
217 _mm_storeu_si128((__m128i*) (dst + 0), lo); 220 _mm_storeu_si128((__m128i*) (dst + 0), lo);
218 _mm_storeu_si128((__m128i*) (dst + 4), hi); 221 _mm_storeu_si128((__m128i*) (dst + 4), hi);
(...skipping 10 matching lines...) Expand all
229 premul8(&lo, &hi); 232 premul8(&lo, &hi);
230 233
231 _mm_storeu_si128((__m128i*) dst, lo); 234 _mm_storeu_si128((__m128i*) dst, lo);
232 235
233 src += 4; 236 src += 4;
234 dst += 4; 237 dst += 4;
235 count -= 4; 238 count -= 4;
236 } 239 }
237 240
238 // Call portable code to finish up the tail of [0,4) pixels. 241 // Call portable code to finish up the tail of [0,4) pixels.
239 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; 242 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
240 proc(dst, src, count); 243 proc(dst, src, count);
241 } 244 }
242 245
243 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { 246 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
244 premul_xxxa_should_swaprb<false>(dst, src, count); 247 premul_should_swapRB<false>(dst, src, count);
245 } 248 }
246 249
247 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 250 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
248 premul_xxxa_should_swaprb<true>(dst, src, count); 251 premul_should_swapRB<true>(dst, src, count);
249 } 252 }
250 253
251 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 254 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
255 auto src = (const uint32_t*)vsrc;
252 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); 256 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
253 257
254 while (count >= 4) { 258 while (count >= 4) {
255 __m128i bgra = _mm_loadu_si128((const __m128i*) src); 259 __m128i rgba = _mm_loadu_si128((const __m128i*) src);
256 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB); 260 __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
257 _mm_storeu_si128((__m128i*) dst, rgba); 261 _mm_storeu_si128((__m128i*) dst, bgra);
258 262
259 src += 4; 263 src += 4;
260 dst += 4; 264 dst += 4;
261 count -= 4; 265 count -= 4;
262 } 266 }
263 267
264 swaprb_xxxa_portable(dst, src, count); 268 RGBA_to_BGRA_portable(dst, src, count);
265 } 269 }
266 270
267 #else 271 #else
268 272
269 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { 273 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
270 premul_xxxa_portable(dst, src, count); 274 RGBA_to_rgbA_portable(dst, src, count);
271 } 275 }
272 276
273 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 277 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
274 premul_swaprb_xxxa_portable(dst, src, count); 278 RGBA_to_bgrA_portable(dst, src, count);
275 } 279 }
276 280
277 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 281 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
278 swaprb_xxxa_portable(dst, src, count); 282 RGBA_to_BGRA_portable(dst, src, count);
279 } 283 }
280 284
281 #endif 285 #endif
282 286
283 } 287 }
284 288
285 #endif // SkSwizzler_opts_DEFINED 289 #endif // SkSwizzler_opts_DEFINED
OLDNEW
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | tests/SwizzlerTest.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698