Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(393)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 156113005: ARM Skia NEON patches - 27 - S32A_D565_Blend (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Add ignored-tests.txt Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 212 matching lines...) Expand 10 before | Expand all | Expand 10 after
223 "21: \n\t" 223 "21: \n\t"
224 : [count] "+r" (count) 224 : [count] "+r" (count)
225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s rc) 225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s rc)
226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7", 226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7",
227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29", 227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29",
228 "d30","d31" 228 "d30","d31"
229 ); 229 );
230 } 230 }
231 } 231 }
232 232
233 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
234 prod += vdupq_n_u16(128);
235 prod += vshrq_n_u16(prod, 8);
236 return vshrq_n_u16(prod, 8);
237 }
238
233 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 239 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
234 const SkPMColor* SK_RESTRICT src, int count, 240 const SkPMColor* SK_RESTRICT src, int count,
235 U8CPU alpha, int /*x*/, int /*y*/) { 241 U8CPU alpha, int /*x*/, int /*y*/) {
242 SkASSERT(255 > alpha);
236 243
237 U8CPU alpha_for_asm = alpha; 244 /* This code implements a Neon version of S32A_D565_Blend. The results have
238 245 * a few mismatches compared to the original code. These mismatches never
239 asm volatile ( 246 * exceed 1.
240 /* This code implements a Neon version of S32A_D565_Blend. The output differ s from
241 * the original in two respects:
242 * 1. The results have a few mismatches compared to the original code. Thes e mismatches
243 * never exceed 1. It's possible to improve accuracy vs. a floating poin t
244 * implementation by introducing rounding right shifts (vrshr) for the f inal stage.
245 * Rounding is not present in the code below, because although results w ould be closer
246 * to a floating point implementation, the number of mismatches compared to the
247 * original code would be far greater.
248 * 2. On certain inputs, the original code can overflow, causing colour cha nnels to
249 * mix. Although the Neon code can also overflow, it doesn't allow one c olour channel
250 * to affect another.
251 */ 247 */
252 248
253 #if 1 249 if (count >= 8) {
254 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ 250 uint16x8_t valpha_max, vmask_blue;
255 "add %[alpha], %[alpha], #1 \n\t" // adjust r ange of alpha 0-256 251 uint8x8_t valpha;
252
253 // prepare constants
254 valpha_max = vmovq_n_u16(255);
255 valpha = vdup_n_u8(alpha);
256 vmask_blue = vmovq_n_u16(SK_B16_MASK);
257
258 do {
259 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
260 uint16x8_t vres_a, vres_r, vres_g, vres_b;
261 uint8x8x4_t vsrc;
262
263 // load pixels
264 vdst = vld1q_u16(dst);
265 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
266 asm (
267 "vld4.u8 %h[vsrc], [%[src]]!"
268 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
269 : :
270 );
256 #else 271 #else
257 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 272 register uint8x8_t d0 asm("d0");
258 #endif 273 register uint8x8_t d1 asm("d1");
259 "vmov.u16 q3, #255 \n\t" // set up constant 274 register uint8x8_t d2 asm("d2");
260 "movs r4, %[count], lsr #3 \n\t" // calc. c ount>>3 275 register uint8x8_t d3 asm("d3");
261 "vmov.u16 d2[0], %[alpha] \n\t" // move al pha to Neon
262 "beq 2f \n\t" // if coun t8 == 0, exit
263 "vmov.u16 q15, #0x1f \n\t" // set up blue mask
264 276
265 "1: \n\t" 277 asm volatile (
266 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load ei ght dst RGB565 pixels 278 "vld4.u8 {d0-d3},[%[src]]!;"
267 "subs r4, r4, #1 \n\t" // decreme nt loop counter 279 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
268 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load ei ght src ABGR32 pixels 280 [src] "+&r" (src)
269 // and deinterleave 281 : :
270 282 );
271 "vshl.u16 q9, q0, #5 \n\t" // shift g reen to top of lanes 283 vsrc.val[0] = d0;
272 "vand q10, q0, q15 \n\t" // extract blue 284 vsrc.val[1] = d1;
273 "vshr.u16 q8, q0, #11 \n\t" // extract red 285 vsrc.val[2] = d2;
274 "vshr.u16 q9, q9, #10 \n\t" // extract green 286 vsrc.val[3] = d3;
275 // dstrgb = {q8, q9, q10}
276
277 "vshr.u8 d24, d24, #3 \n\t" // shift r ed to 565 range
278 "vshr.u8 d25, d25, #2 \n\t" // shift g reen to 565 range
279 "vshr.u8 d26, d26, #3 \n\t" // shift b lue to 565 range
280
281 "vmovl.u8 q11, d24 \n\t" // widen r ed to 16 bits
282 "vmovl.u8 q12, d25 \n\t" // widen g reen to 16 bits
283 "vmovl.u8 q14, d27 \n\t" // widen a lpha to 16 bits
284 "vmovl.u8 q13, d26 \n\t" // widen b lue to 16 bits
285 // srcrgba = {q11, q12, q13, q14}
286
287 "vmul.u16 q2, q14, d2[0] \n\t" // sa * sr c_scale
288 "vmul.u16 q11, q11, d2[0] \n\t" // red res ult = src_red * src_scale
289 "vmul.u16 q12, q12, d2[0] \n\t" // grn res ult = src_grn * src_scale
290 "vmul.u16 q13, q13, d2[0] \n\t" // blu res ult = src_blu * src_scale
291
292 "vshr.u16 q2, q2, #8 \n\t" // sa * sr c_scale >> 8
293 "vsub.u16 q2, q3, q2 \n\t" // 255 - ( sa * src_scale >> 8)
294 // dst_scale = q2
295
296 "vmla.u16 q11, q8, q2 \n\t" // red res ult += dst_red * dst_scale
297 "vmla.u16 q12, q9, q2 \n\t" // grn res ult += dst_grn * dst_scale
298 "vmla.u16 q13, q10, q2 \n\t" // blu res ult += dst_blu * dst_scale
299
300 #if 1
301 // trying for a better match with SkDiv255Round(a)
302 // C alg is: a+=128; (a+a>>8)>>8
303 // we'll use just a rounding shift [q2 is available for scratch]
304 "vrshr.u16 q11, q11, #8 \n\t" // shift down red
305 "vrshr.u16 q12, q12, #8 \n\t" // shift down green
306 "vrshr.u16 q13, q13, #8 \n\t" // shift down blue
307 #else
308 // arm's original "truncating divide by 256"
309 "vshr.u16 q11, q11, #8 \n\t" // shift d own red
310 "vshr.u16 q12, q12, #8 \n\t" // shift d own green
311 "vshr.u16 q13, q13, #8 \n\t" // shift d own blue
312 #endif 287 #endif
313 288
314 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue
315 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue
316 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write p ixel back to dst, update ptr
317 289
318 "bne 1b \n\t" // if coun ter != 0, loop 290 // deinterleave dst
319 "2: \n\t" // exi t 291 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to p of lanes
292 vdst_b = vdst & vmask_blue; // extract blue
293 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
294 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
320 295
321 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [a lpha] "+r" (alpha_for_asm) 296 // shift src to 565
322 : 297 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
323 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d 6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" 298 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
324 ); 299 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
325 300
326 count &= 7; 301 // calc src * src_scale
327 if (count > 0) { 302 vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
328 do { 303 vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
329 SkPMColor sc = *src++; 304 vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
330 if (sc) { 305 vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
331 uint16_t dc = *dst; 306
332 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); 307 // prepare dst_scale
333 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(Sk GetPackedR16(dc), dst_scale); 308 vres_a = SkDiv255Round_neon8(vres_a);
334 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(Sk GetPackedG16(dc), dst_scale); 309 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
335 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(Sk GetPackedB16(dc), dst_scale); 310
336 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv25 5Round(db)); 311 // add dst * dst_scale to previous result
337 } 312 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
338 dst += 1; 313 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
339 } while (--count != 0); 314 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
315
316 #ifdef S32A_D565_BLEND_EXACT
317 // It is possible to get exact results with this but it is slow,
318 // even slower than C code in some cases
319 vres_r = SkDiv255Round_neon8(vres_r);
320 vres_g = SkDiv255Round_neon8(vres_g);
321 vres_b = SkDiv255Round_neon8(vres_b);
322 #else
323 vres_r = vrshrq_n_u16(vres_r, 8);
324 vres_g = vrshrq_n_u16(vres_g, 8);
325 vres_b = vrshrq_n_u16(vres_b, 8);
326 #endif
327 // pack result
328 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
329 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red in to green/blue
330
331 // store
332 vst1q_u16(dst, vres_b);
333 dst += 8;
334 count -= 8;
335 } while (count >= 8);
336 }
337
338 // leftovers
339 while (count-- > 0) {
340 SkPMColor sc = *src++;
341 if (sc) {
342 uint16_t dc = *dst;
343 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alph a);
344 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetP ackedR16(dc), dst_scale);
345 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetP ackedG16(dc), dst_scale);
346 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetP ackedB16(dc), dst_scale);
347 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Rou nd(db));
348 }
349 dst += 1;
340 } 350 }
341 } 351 }
342 352
343 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. 353 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
344 * each dither value is spaced out into byte lanes, and repeated 354 * each dither value is spaced out into byte lanes, and repeated
345 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the 355 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
346 * start of each row. 356 * start of each row.
347 */ 357 */
348 static const uint8_t gDitherMatrix_Neon[48] = { 358 static const uint8_t gDitherMatrix_Neon[48] = {
349 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 359 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
(...skipping 1084 matching lines...) Expand 10 before | Expand all | Expand 10 after
1434 * case where we do not inspect the src alpha. 1444 * case where we do not inspect the src alpha.
1435 */ 1445 */
1436 #if SK_A32_SHIFT == 24 1446 #if SK_A32_SHIFT == 24
1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1447 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1448 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1439 #else 1449 #else
1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1450 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1441 #endif 1451 #endif
1442 S32A_Blend_BlitRow32_neon // S32A_Blend 1452 S32A_Blend_BlitRow32_neon // S32A_Blend
1443 }; 1453 };
OLDNEW
« no previous file with comments | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698