OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
11 #include "SkBlitRow.h" | 11 #include "SkBlitRow.h" |
12 #include "SkColorPriv.h" | 12 #include "SkColorPriv.h" |
13 #include "SkDither.h" | 13 #include "SkDither.h" |
14 #include "SkMathPriv.h" | 14 #include "SkMathPriv.h" |
15 #include "SkUtils.h" | 15 #include "SkUtils.h" |
16 | 16 |
17 #include "SkColor_opts_neon.h" | 17 #include "SkColor_opts_neon.h" |
18 #include <arm_neon.h> | 18 #include <arm_neon.h> |
19 | 19 |
20 #ifdef SK_CPU_ARM32 | |
21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 20 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
22 const SkPMColor* SK_RESTRICT src, int count, | 21 const SkPMColor* SK_RESTRICT src, int count, |
23 U8CPU alpha, int /*x*/, int /*y*/) { | 22 U8CPU alpha, int /*x*/, int /*y*/) { |
24 SkASSERT(255 == alpha); | 23 SkASSERT(255 == alpha); |
25 | 24 |
26 while (count >= 8) { | 25 while (count >= 8) { |
27 uint8x8x4_t vsrc; | 26 uint8x8x4_t vsrc; |
28 uint16x8_t vdst; | 27 uint16x8_t vdst; |
29 | 28 |
30 // Load | 29 // Load |
30 #ifdef SK_CPU_ARM64 | |
31 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
32 asm ( | |
33 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
mtklein 2014/06/05 16:28:36
Looks like we've got the same load-32-bytes thing
kevin.petit 2014/06/05 17:00:02
There are actually a few more differences. Some us
| |
34 "mov %[vsrc0].8b, v0.8b \t\n" | |
35 "mov %[vsrc1].8b, v1.8b \t\n" | |
36 "mov %[vsrc2].8b, v2.8b \t\n" | |
37 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
38 [vsrc2] "=w" (vsrc_2) | |
39 : [src] "r" (src) | |
40 : "v0", "v1", "v2", "v3" | |
41 ); | |
42 vsrc.val[0] = vsrc_0; | |
43 vsrc.val[1] = vsrc_1; | |
44 vsrc.val[2] = vsrc_2; | |
45 | |
46 #else | |
31 vsrc = vld4_u8((uint8_t*)src); | 47 vsrc = vld4_u8((uint8_t*)src); |
48 #endif | |
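Aside on the review thread above: ld4 {v0.8b - v3.8b} loads 32 bytes, i.e. eight RGBA pixels, and deinterleaves them into four 8-lane vectors; the asm then copies the planes out of the fixed registers v0-v3, and this opaque path simply ignores the alpha plane. A minimal scalar sketch of the load's semantics, with a hypothetical helper name:

    #include <stdint.h>

    /* Sketch only: scalar equivalent of ld4 {v0.8b - v3.8b} / vld4_u8.
     * 32 interleaved bytes (8 pixels x 4 channels) are split into four
     * 8-byte planes, one per channel. */
    static void deinterleave_rgba8x8(const uint8_t src[32],
                                     uint8_t plane[4][8]) {
        for (int lane = 0; lane < 8; lane++) {
            for (int ch = 0; ch < 4; ch++) {
                plane[ch][lane] = src[lane * 4 + ch];
            }
        }
    }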
32 | 49 |
33 // Convert src to 565 | 50 // Convert src to 565 |
34 vdst = SkPixel32ToPixel16_neon8(vsrc); | 51 vdst = SkPixel32ToPixel16_neon8(vsrc); |
35 | 52 |
36 // Store | 53 // Store |
37 vst1q_u16(dst, vdst); | 54 vst1q_u16(dst, vdst); |
38 | 55 |
39 // Prepare next iteration | 56 // Prepare next iteration |
40 dst += 8; | 57 dst += 8; |
41 src += 8; | 58 src += 8; |
(...skipping 15 matching lines...) | |
57 U8CPU alpha, int /*x*/, int /*y*/) { | 74 U8CPU alpha, int /*x*/, int /*y*/) { |
58 SkASSERT(255 > alpha); | 75 SkASSERT(255 > alpha); |
59 | 76 |
60 uint16x8_t vmask_blue, vscale; | 77 uint16x8_t vmask_blue, vscale; |
61 | 78 |
62 // prepare constants | 79 // prepare constants |
63 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); | 80 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); |
64 vmask_blue = vmovq_n_u16(0x1F); | 81 vmask_blue = vmovq_n_u16(0x1F); |
65 | 82 |
66 while (count >= 8) { | 83 while (count >= 8) { |
84 uint8x8x4_t vsrc; | |
67 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 85 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
68 uint16x8_t vres_r, vres_g, vres_b; | 86 uint16x8_t vres_r, vres_g, vres_b; |
69 uint8x8_t vsrc_r, vsrc_g, vsrc_b; | |
70 | 87 |
71 // Load src | 88 // Load src |
89 #ifdef SK_CPU_ARM64 | |
90 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
91 asm ( | |
92 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
93 "mov %[vsrc0].8b, v0.8b \t\n" | |
94 "mov %[vsrc1].8b, v1.8b \t\n" | |
95 "mov %[vsrc2].8b, v2.8b \t\n" | |
96 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
97 [vsrc2] "=w" (vsrc_2) | |
98 : [src] "r" (src) | |
99 : "v0", "v1", "v2", "v3" | |
100 ); | |
101 vsrc.val[0] = vsrc_0; | |
102 vsrc.val[1] = vsrc_1; | |
103 vsrc.val[2] = vsrc_2; | |
104 src += 8; | |
105 #else | |
72 { | 106 { |
73 register uint8x8_t d0 asm("d0"); | 107 register uint8x8_t d0 asm("d0"); |
74 register uint8x8_t d1 asm("d1"); | 108 register uint8x8_t d1 asm("d1"); |
75 register uint8x8_t d2 asm("d2"); | 109 register uint8x8_t d2 asm("d2"); |
76 register uint8x8_t d3 asm("d3"); | 110 register uint8x8_t d3 asm("d3"); |
77 | 111 |
78 asm ( | 112 asm ( |
79 "vld4.8 {d0-d3},[%[src]]!" | 113 "vld4.8 {d0-d3},[%[src]]!" |
80 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 114 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
81 : | 115 : |
82 ); | 116 ); |
83 vsrc_g = d1; | 117 vsrc.val[0] = d0; |
84 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 118 vsrc.val[1] = d1; |
85 vsrc_r = d2; vsrc_b = d0; | 119 vsrc.val[2] = d2; |
86 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 120 } |
87 vsrc_r = d0; vsrc_b = d2; | |
88 #endif | 121 #endif |
89 } | |
90 | 122 |
91 // Load and unpack dst | 123 // Load and unpack dst |
92 vdst = vld1q_u16(dst); | 124 vdst = vld1q_u16(dst); |
93 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes | 125 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes |
94 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue | 126 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue |
95 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red | 127 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red |
96 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green | 128 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green |
97 | 129 |
98 // Shift src to 565 | 130 // Shift src to 565 range |
99 vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range | 131 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3); |
100 vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range | 132 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2); |
101 vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range | 133 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3); |
102 | 134 |
103 // Scale src - dst | 135 // Scale src - dst |
104 vres_r = vmovl_u8(vsrc_r) - vdst_r; | 136 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r; |
105 vres_g = vmovl_u8(vsrc_g) - vdst_g; | 137 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g; |
106 vres_b = vmovl_u8(vsrc_b) - vdst_b; | 138 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b; |
107 | 139 |
108 vres_r = vshrq_n_u16(vres_r * vscale, 8); | 140 vres_r = vshrq_n_u16(vres_r * vscale, 8); |
109 vres_g = vshrq_n_u16(vres_g * vscale, 8); | 141 vres_g = vshrq_n_u16(vres_g * vscale, 8); |
110 vres_b = vshrq_n_u16(vres_b * vscale, 8); | 142 vres_b = vshrq_n_u16(vres_b * vscale, 8); |
111 | 143 |
112 vres_r += vdst_r; | 144 vres_r += vdst_r; |
113 vres_g += vdst_g; | 145 vres_g += vdst_g; |
114 vres_b += vdst_b; | 146 vres_b += vdst_b; |
115 | 147 |
116 // Combine | 148 // Combine |
(...skipping 12 matching lines...) | |
129 SkPMColorAssert(c); | 161 SkPMColorAssert(c); |
130 uint16_t d = *dst; | 162 uint16_t d = *dst; |
131 *dst++ = SkPackRGB16( | 163 *dst++ = SkPackRGB16( |
132 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), | 164 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), |
133 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), | 165 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), |
134 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); | 166 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); |
135 } while (--count != 0); | 167 } while (--count != 0); |
136 } | 168 } |
137 } | 169 } |
138 | 170 |
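For reference, the vector loop in S32_D565_Blend_neon computes per channel the same quantity as the scalar leftover loop just above: SkAlphaBlend(src, dst, scale) = dst + ((src - dst) * scale >> 8), with scale = SkAlpha255To256(alpha) = alpha + 1. A minimal scalar sketch of one pixel of the blend, assuming an arithmetic right shift on negative intermediates (the helper name is illustrative):

    #include <stdint.h>

    /* Sketch: blend one 8888 source pixel into a 565 dest pixel.
     * scale is alpha + 1, so it lies in 1..256. */
    static uint16_t blend_32_to_565(uint8_t r8, uint8_t g8, uint8_t b8,
                                    uint16_t dst, int scale) {
        int dr = (dst >> 11) & 0x1F;               /* extract red   */
        int dg = (dst >> 5)  & 0x3F;               /* extract green */
        int db =  dst        & 0x1F;               /* extract blue  */

        int sr = r8 >> 3, sg = g8 >> 2, sb = b8 >> 3;  /* to 565 range */

        dr += ((sr - dr) * scale) >> 8;   /* dst + ((src - dst) * scale >> 8) */
        dg += ((sg - dg) * scale) >> 8;
        db += ((sb - db) * scale) >> 8;

        return (uint16_t)((dr << 11) | (dg << 5) | db);
    }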
171 #ifdef SK_CPU_ARM32 | |
139 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 172 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
140 const SkPMColor* SK_RESTRICT src, int count, | 173 const SkPMColor* SK_RESTRICT src, int count, |
141 U8CPU alpha, int /*x*/, int /*y*/) { | 174 U8CPU alpha, int /*x*/, int /*y*/) { |
142 SkASSERT(255 == alpha); | 175 SkASSERT(255 == alpha); |
143 | 176 |
144 if (count >= 8) { | 177 if (count >= 8) { |
145 uint16_t* SK_RESTRICT keep_dst = 0; | 178 uint16_t* SK_RESTRICT keep_dst = 0; |
146 | 179 |
147 asm volatile ( | 180 asm volatile ( |
148 "ands ip, %[count], #7 \n\t" | 181 "ands ip, %[count], #7 \n\t" |
(...skipping 157 matching lines...) | |
306 | 339 |
307 "21: \n\t" | 340 "21: \n\t" |
308 : [count] "+r" (count) | 341 : [count] "+r" (count) |
309 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) | 342 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) |
310 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7", | 343 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7", |
311 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29", | 344 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29", |
312 "d30","d31" | 345 "d30","d31" |
313 ); | 346 ); |
314 } | 347 } |
315 } | 348 } |
349 #endif | |
316 | 350 |
317 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { | 351 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { |
318 prod += vdupq_n_u16(128); | 352 prod += vdupq_n_u16(128); |
319 prod += vshrq_n_u16(prod, 8); | 353 prod += vshrq_n_u16(prod, 8); |
320 return vshrq_n_u16(prod, 8); | 354 return vshrq_n_u16(prod, 8); |
321 } | 355 } |
322 | 356 |
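SkDiv255Round_neon8 above uses the standard exact trick for rounding division by 255: add 128, fold the high byte back in, then take the high byte. A brute-force sketch verifying the identity over the full product range (0..255*255):

    #include <assert.h>

    static unsigned div255_round(unsigned prod) {
        prod += 128;
        prod += prod >> 8;
        return prod >> 8;
    }

    int main(void) {
        /* (x + 127) / 255 is the conventionally rounded quotient */
        for (unsigned x = 0; x <= 255u * 255u; x++) {
            assert(div255_round(x) == (x + 127) / 255);
        }
        return 0;
    }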
323 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 357 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
324 const SkPMColor* SK_RESTRICT src, int count, | 358 const SkPMColor* SK_RESTRICT src, int count, |
325 U8CPU alpha, int /*x*/, int /*y*/) { | 359 U8CPU alpha, int /*x*/, int /*y*/) { |
(...skipping 13 matching lines...) | |
339 valpha = vdup_n_u8(alpha); | 373 valpha = vdup_n_u8(alpha); |
340 vmask_blue = vmovq_n_u16(SK_B16_MASK); | 374 vmask_blue = vmovq_n_u16(SK_B16_MASK); |
341 | 375 |
342 do { | 376 do { |
343 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 377 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
344 uint16x8_t vres_a, vres_r, vres_g, vres_b; | 378 uint16x8_t vres_a, vres_r, vres_g, vres_b; |
345 uint8x8x4_t vsrc; | 379 uint8x8x4_t vsrc; |
346 | 380 |
347 // load pixels | 381 // load pixels |
348 vdst = vld1q_u16(dst); | 382 vdst = vld1q_u16(dst); |
383 #ifdef SK_CPU_ARM64 | |
384 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3; | |
385 asm ( | |
386 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
387 "mov %[vsrc0].8b, v0.8b \t\n" | |
388 "mov %[vsrc1].8b, v1.8b \t\n" | |
389 "mov %[vsrc2].8b, v2.8b \t\n" | |
390 "mov %[vsrc3].8b, v3.8b \t\n" | |
391 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
392 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3) | |
393 : [src] "r" (src) | |
394 : "v0", "v1", "v2", "v3" | |
395 ); | |
396 src += 8; | |
397 vsrc.val[0] = vsrc_0; | |
398 vsrc.val[1] = vsrc_1; | |
399 vsrc.val[2] = vsrc_2; | |
400 vsrc.val[3] = vsrc_3; | |
401 #else | |
349 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 402 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
350 asm ( | 403 asm ( |
351 "vld4.u8 %h[vsrc], [%[src]]!" | 404 "vld4.u8 %h[vsrc], [%[src]]!" |
352 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 405 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
353 : : | 406 : : |
354 ); | 407 ); |
355 #else | 408 #else |
356 register uint8x8_t d0 asm("d0"); | 409 register uint8x8_t d0 asm("d0"); |
357 register uint8x8_t d1 asm("d1"); | 410 register uint8x8_t d1 asm("d1"); |
358 register uint8x8_t d2 asm("d2"); | 411 register uint8x8_t d2 asm("d2"); |
359 register uint8x8_t d3 asm("d3"); | 412 register uint8x8_t d3 asm("d3"); |
360 | 413 |
361 asm volatile ( | 414 asm volatile ( |
362 "vld4.u8 {d0-d3},[%[src]]!;" | 415 "vld4.u8 {d0-d3},[%[src]]!;" |
363 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 416 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
364 [src] "+&r" (src) | 417 [src] "+&r" (src) |
365 : : | 418 : : |
366 ); | 419 ); |
367 vsrc.val[0] = d0; | 420 vsrc.val[0] = d0; |
368 vsrc.val[1] = d1; | 421 vsrc.val[1] = d1; |
369 vsrc.val[2] = d2; | 422 vsrc.val[2] = d2; |
370 vsrc.val[3] = d3; | 423 vsrc.val[3] = d3; |
371 #endif | 424 #endif |
425 #endif // #ifdef SK_CPU_ARM64 | |
372 | 426 |
373 | 427 |
374 // deinterleave dst | 428 // deinterleave dst |
375 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes | 429 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes |
376 vdst_b = vdst & vmask_blue; // extract blue | 430 vdst_b = vdst & vmask_blue; // extract blue |
377 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red | 431 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red |
378 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green | 432 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green |
379 | 433 |
380 // shift src to 565 | 434 // shift src to 565 |
381 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); | 435 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); |
(...skipping 79 matching lines...) | |
461 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 515 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
462 | 516 |
463 uint8x8_t vdither = vld1_u8(dstart); // load dither values | 517 uint8x8_t vdither = vld1_u8(dstart); // load dither values |
464 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values | 518 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values |
465 | 519 |
466 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg | 520 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg |
467 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask | 521 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask |
468 | 522 |
469 do { | 523 do { |
470 | 524 |
525 uint8x8x4_t vsrc; | |
471 uint8x8_t vsrc_r, vsrc_g, vsrc_b; | 526 uint8x8_t vsrc_r, vsrc_g, vsrc_b; |
472 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; | 527 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; |
473 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; | 528 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; |
474 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; | 529 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; |
475 uint16x8_t vdst; | 530 uint16x8_t vdst; |
476 uint16x8_t vdst_r, vdst_g, vdst_b; | 531 uint16x8_t vdst_r, vdst_g, vdst_b; |
477 int16x8_t vres_r, vres_g, vres_b; | 532 int16x8_t vres_r, vres_g, vres_b; |
478 int8x8_t vres8_r, vres8_g, vres8_b; | 533 int8x8_t vres8_r, vres8_g, vres8_b; |
479 | 534 |
480 // Load source and add dither | 535 // Load source and add dither |
536 #ifdef SK_CPU_ARM64 | |
537 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
538 asm ( | |
539 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
540 "mov %[vsrc0].8b, v0.8b \t\n" | |
541 "mov %[vsrc1].8b, v1.8b \t\n" | |
542 "mov %[vsrc2].8b, v2.8b \t\n" | |
543 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
544 [vsrc2] "=w" (vsrc_2) | |
545 : [src] "r" (src) | |
546 : "v0", "v1", "v2", "v3" | |
547 ); | |
548 | |
549 src += 8; | |
550 | |
551 vsrc.val[0] = vsrc_0; | |
552 vsrc.val[1] = vsrc_1; | |
553 vsrc.val[2] = vsrc_2; | |
554 #else | |
481 { | 555 { |
482 register uint8x8_t d0 asm("d0"); | 556 register uint8x8_t d0 asm("d0"); |
483 register uint8x8_t d1 asm("d1"); | 557 register uint8x8_t d1 asm("d1"); |
484 register uint8x8_t d2 asm("d2"); | 558 register uint8x8_t d2 asm("d2"); |
485 register uint8x8_t d3 asm("d3"); | 559 register uint8x8_t d3 asm("d3"); |
486 | 560 |
487 asm ( | 561 asm ( |
488 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 562 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
489 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 563 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
490 : | 564 : |
491 ); | 565 ); |
492 vsrc_g = d1; | 566 vsrc.val[0] = d0; |
493 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 567 vsrc.val[1] = d1; |
494 vsrc_r = d2; vsrc_b = d0; | 568 vsrc.val[2] = d2; |
495 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 569 } |
496 vsrc_r = d0; vsrc_b = d2; | |
497 #endif | 570 #endif |
498 } | 571 vsrc_r = vsrc.val[NEON_R]; |
572 vsrc_g = vsrc.val[NEON_G]; | |
573 vsrc_b = vsrc.val[NEON_B]; | |
499 | 574 |
500 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 | 575 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 |
501 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 | 576 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 |
502 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 | 577 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 |
503 | 578 |
504 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen | 579 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen |
505 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen | 580 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen |
506 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen | 581 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen |
507 | 582 |
508 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result | 583 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result |
(...skipping 60 matching lines...) | |
569 sb = SkDITHER_B32To565(sb, dither); | 644 sb = SkDITHER_B32To565(sb, dither); |
570 | 645 |
571 uint16_t d = *dst; | 646 uint16_t d = *dst; |
572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 647 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
573 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 648 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
574 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 649 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
575 DITHER_INC_X(x); | 650 DITHER_INC_X(x); |
576 } while (--count != 0); | 651 } while (--count != 0); |
577 } | 652 } |
578 } | 653 } |
579 #endif | |
580 | 654 |
581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 655 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
582 const SkPMColor* SK_RESTRICT src, | 656 const SkPMColor* SK_RESTRICT src, |
583 int count, U8CPU alpha) { | 657 int count, U8CPU alpha) { |
584 | 658 |
585 SkASSERT(255 == alpha); | 659 SkASSERT(255 == alpha); |
586 if (count > 0) { | 660 if (count > 0) { |
587 | 661 |
588 | 662 |
589 uint8x8_t alpha_mask; | 663 uint8x8_t alpha_mask; |
(...skipping 450 matching lines...) | |
1040 uint16_t *pc = (uint16_t*) p; | 1114 uint16_t *pc = (uint16_t*) p; |
1041 sprintf(buf,"%8s:", str); | 1115 sprintf(buf,"%8s:", str); |
1042 len = (len / sizeof(uint16_t)); /* passed as bytes */ | 1116 len = (len / sizeof(uint16_t)); /* passed as bytes */ |
1043 for(i=0;i<len;i++) { | 1117 for(i=0;i<len;i++) { |
1044 sprintf(tbuf, " %04x", pc[i]); | 1118 sprintf(tbuf, " %04x", pc[i]); |
1045 strcat(buf, tbuf); | 1119 strcat(buf, tbuf); |
1046 } | 1120 } |
1047 SkDebugf("%s\n", buf); | 1121 SkDebugf("%s\n", buf); |
1048 } | 1122 } |
1049 #endif | 1123 #endif |
1124 #endif // #ifdef SK_CPU_ARM32 | |
1050 | 1125 |
1051 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, | 1126 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
1052 const SkPMColor* SK_RESTRICT src, | 1127 const SkPMColor* SK_RESTRICT src, |
1053 int count, U8CPU alpha, int x, int y) { | 1128 int count, U8CPU alpha, int x, int y) { |
1054 SkASSERT(255 == alpha); | 1129 SkASSERT(255 == alpha); |
1055 | 1130 |
1056 #define UNROLL 8 | 1131 #define UNROLL 8 |
1057 | 1132 |
1058 if (count >= UNROLL) { | 1133 if (count >= UNROLL) { |
1059 | 1134 |
1060 #if defined(DEBUG_OPAQUE_DITHER) | 1135 #if defined(DEBUG_OPAQUE_DITHER) |
1061 uint16_t tmpbuf[UNROLL]; | 1136 uint16_t tmpbuf[UNROLL]; |
1062 int td[UNROLL]; | 1137 int td[UNROLL]; |
1063 int tdv[UNROLL]; | 1138 int tdv[UNROLL]; |
1064 int ta[UNROLL]; | 1139 int ta[UNROLL]; |
1065 int tap[UNROLL]; | 1140 int tap[UNROLL]; |
1066 uint16_t in_dst[UNROLL]; | 1141 uint16_t in_dst[UNROLL]; |
1067 int offset = 0; | 1142 int offset = 0; |
1068 int noisy = 0; | 1143 int noisy = 0; |
1069 #endif | 1144 #endif |
1070 | 1145 |
1071 uint8x8_t dbase; | 1146 uint8x8_t dbase; |
1072 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1147 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
1073 dbase = vld1_u8(dstart); | 1148 dbase = vld1_u8(dstart); |
1074 | 1149 |
1075 do { | 1150 do { |
1151 uint8x8x4_t vsrc; | |
1076 uint8x8_t sr, sg, sb, sa, d; | 1152 uint8x8_t sr, sg, sb, sa, d; |
1077 uint16x8_t dst8, scale8, alpha8; | 1153 uint16x8_t dst8, scale8, alpha8; |
1078 uint16x8_t dst_r, dst_g, dst_b; | 1154 uint16x8_t dst_r, dst_g, dst_b; |
1079 | 1155 |
1080 #if defined(DEBUG_OPAQUE_DITHER) | 1156 #if defined(DEBUG_OPAQUE_DITHER) |
1081 // calculate 8 elements worth into a temp buffer | 1157 // calculate 8 elements worth into a temp buffer |
1082 { | 1158 { |
1083 int my_y = y; | 1159 int my_y = y; |
1084 int my_x = x; | 1160 int my_x = x; |
1085 SkPMColor* my_src = (SkPMColor*)src; | 1161 SkPMColor* my_src = (SkPMColor*)src; |
(...skipping 30 matching lines...) Expand all Loading... | |
1116 tmpbuf[i] = *my_dst; | 1192 tmpbuf[i] = *my_dst; |
1117 ta[i] = tdv[i] = td[i] = 0xbeef; | 1193 ta[i] = tdv[i] = td[i] = 0xbeef; |
1118 } | 1194 } |
1119 in_dst[i] = *my_dst; | 1195 in_dst[i] = *my_dst; |
1120 my_dst += 1; | 1196 my_dst += 1; |
1121 DITHER_INC_X(my_x); | 1197 DITHER_INC_X(my_x); |
1122 } | 1198 } |
1123 } | 1199 } |
1124 #endif | 1200 #endif |
1125 | 1201 |
1202 #ifdef SK_CPU_ARM64 | |
1203 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3; | |
1204 asm ( | |
1205 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
1206 "mov %[vsrc0].8b, v0.8b \t\n" | |
1207 "mov %[vsrc1].8b, v1.8b \t\n" | |
1208 "mov %[vsrc2].8b, v2.8b \t\n" | |
1209 "mov %[vsrc3].8b, v3.8b \t\n" | |
1210 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
1211 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3) | |
1212 : [src] "r" (src) | |
1213 : "v0", "v1", "v2", "v3" | |
1214 ); | |
1126 | 1215 |
1216 src += 8; | |
1217 vsrc.val[0] = vsrc_0; | |
1218 vsrc.val[1] = vsrc_1; | |
1219 vsrc.val[2] = vsrc_2; | |
1220 vsrc.val[3] = vsrc_3; | |
1221 #else | |
1127 { | 1222 { |
1128 register uint8x8_t d0 asm("d0"); | 1223 register uint8x8_t d0 asm("d0"); |
1129 register uint8x8_t d1 asm("d1"); | 1224 register uint8x8_t d1 asm("d1"); |
1130 register uint8x8_t d2 asm("d2"); | 1225 register uint8x8_t d2 asm("d2"); |
1131 register uint8x8_t d3 asm("d3"); | 1226 register uint8x8_t d3 asm("d3"); |
1132 | 1227 |
1133 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1228 asm ("vld4.8 {d0-d3},[%[src]]! " |
1134 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) | 1229 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) |
1135 : | 1230 : |
1136 ); | 1231 ); |
1137 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 1232 vsrc.val[0] = d0; |
1138 sr = d2; sg = d1; sb = d0; sa = d3; | 1233 vsrc.val[1] = d1; |
1139 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 1234 vsrc.val[2] = d2; |
1140 sr = d0; sg = d1; sb = d2; sa = d3; | 1235 vsrc.val[3] = d3; |
1236 } | |
1141 #endif | 1237 #endif |
1142 } | 1238 sa = vsrc.val[NEON_A]; |
1239 sr = vsrc.val[NEON_R]; | |
1240 sg = vsrc.val[NEON_G]; | |
1241 sb = vsrc.val[NEON_B]; | |
1143 | 1242 |
1144 /* calculate 'd', which will be 0..7 | 1243 /* calculate 'd', which will be 0..7 |
1145 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice | 1244 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice |
1146 */ | 1245 */ |
1147 alpha8 = vmovl_u8(dbase); | 1246 alpha8 = vmovl_u8(dbase); |
1148 alpha8 = vmlal_u8(alpha8, sa, dbase); | 1247 alpha8 = vmlal_u8(alpha8, sa, dbase); |
1149 d = vshrn_n_u16(alpha8, 8); // narrowing too | 1248 d = vshrn_n_u16(alpha8, 8); // narrowing too |
1150 | 1249 |
1151 // sr = sr - (sr>>5) + d | 1250 // sr = sr - (sr>>5) + d |
1152 /* watching for 8-bit overflow. d is 0..7; risky range of | 1251 /* watching for 8-bit overflow. d is 0..7; risky range of |
(...skipping 123 matching lines...) | |
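The truncated overflow comment just above the elided region is easy to sanity-check: d is 0..7, and computing (sr - (sr >> 5)) + d instead of sr + d keeps the sum inside 8 bits even at sr = 255 (248 + 7 = 255, where sr + d alone would reach 262). A tiny exhaustive check, as a sketch:

    #include <assert.h>

    /* sr + d alone could wrap: 255 + 7 = 262. Folding in -(sr >> 5)
     * first cancels the dither in the risky range: 255 - 7 + 7 = 255. */
    static void check_dither_no_overflow(void) {
        for (unsigned sr = 0; sr < 256; sr++) {
            for (unsigned d = 0; d <= 7; d++) {
                assert(sr - (sr >> 5) + d <= 255);  /* fits in 8 bits */
            }
        }
    }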
1276 #define UNROLL 8 | 1375 #define UNROLL 8 |
1277 if (count >= UNROLL) { | 1376 if (count >= UNROLL) { |
1278 uint8x8_t d; | 1377 uint8x8_t d; |
1279 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1378 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
1280 d = vld1_u8(dstart); | 1379 d = vld1_u8(dstart); |
1281 | 1380 |
1282 while (count >= UNROLL) { | 1381 while (count >= UNROLL) { |
1283 uint8x8_t sr, sg, sb; | 1382 uint8x8_t sr, sg, sb; |
1284 uint16x8_t dr, dg, db; | 1383 uint16x8_t dr, dg, db; |
1285 uint16x8_t dst8; | 1384 uint16x8_t dst8; |
1385 uint8x8x4_t vsrc; | |
1286 | 1386 |
1387 #ifdef SK_CPU_ARM64 | |
1388 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
1389 asm ( | |
1390 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n" | |
1391 "mov %[vsrc0].8b, v0.8b \t\n" | |
1392 "mov %[vsrc1].8b, v1.8b \t\n" | |
1393 "mov %[vsrc2].8b, v2.8b \t\n" | |
1394 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
1395 [vsrc2] "=w" (vsrc_2) | |
1396 : [src] "r" (src) | |
1397 : "v0", "v1", "v2", "v3" | |
1398 ); | |
1399 src += 8; | |
1400 | |
1401 vsrc.val[0] = vsrc_0; | |
1402 vsrc.val[1] = vsrc_1; | |
1403 vsrc.val[2] = vsrc_2; | |
1404 #else | |
1287 { | 1405 { |
1288 register uint8x8_t d0 asm("d0"); | 1406 register uint8x8_t d0 asm("d0"); |
1289 register uint8x8_t d1 asm("d1"); | 1407 register uint8x8_t d1 asm("d1"); |
1290 register uint8x8_t d2 asm("d2"); | 1408 register uint8x8_t d2 asm("d2"); |
1291 register uint8x8_t d3 asm("d3"); | 1409 register uint8x8_t d3 asm("d3"); |
1292 | 1410 |
1293 asm ( | 1411 asm ( |
1294 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1412 "vld4.8 {d0-d3},[%[src]]! " |
1295 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 1413 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
1296 : | 1414 : |
1297 ); | 1415 ); |
1298 sg = d1; | 1416 vsrc.val[0] = d0; |
1299 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 1417 vsrc.val[1] = d1; |
1300 sr = d2; sb = d0; | 1418 vsrc.val[2] = d2; |
1301 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 1419 } |
1302 sr = d0; sb = d2; | |
1303 #endif | 1420 #endif |
1304 } | 1421 sr = vsrc.val[NEON_R]; |
1422 sg = vsrc.val[NEON_G]; | |
1423 sb = vsrc.val[NEON_B]; | |
1424 | |
1305 /* XXX: if we want to prefetch, hide it in the above asm() | 1425 /* XXX: if we want to prefetch, hide it in the above asm() |
1306 * using the gcc __builtin_prefetch(), the prefetch will | 1426 * using the gcc __builtin_prefetch(), the prefetch will |
1307 * fall to the bottom of the loop -- it won't stick up | 1427 * fall to the bottom of the loop -- it won't stick up |
1308 * at the top of the loop, just after the vld4. | 1428 * at the top of the loop, just after the vld4. |
1309 */ | 1429 */ |
1310 | 1430 |
1311 // sr = sr - (sr>>5) + d | 1431 // sr = sr - (sr>>5) + d |
1312 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); | 1432 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
1313 dr = vaddl_u8(sr, d); | 1433 dr = vaddl_u8(sr, d); |
1314 | 1434 |
(...skipping 47 matching lines...) | |
1362 SkPMColor c = *src++; | 1482 SkPMColor c = *src++; |
1363 SkPMColorAssert(c); | 1483 SkPMColorAssert(c); |
1364 SkASSERT(SkGetPackedA32(c) == 255); | 1484 SkASSERT(SkGetPackedA32(c) == 255); |
1365 | 1485 |
1366 unsigned dither = DITHER_VALUE(x); | 1486 unsigned dither = DITHER_VALUE(x); |
1367 *dst++ = SkDitherRGB32To565(c, dither); | 1487 *dst++ = SkDitherRGB32To565(c, dither); |
1368 DITHER_INC_X(x); | 1488 DITHER_INC_X(x); |
1369 } while (--count != 0); | 1489 } while (--count != 0); |
1370 } | 1490 } |
1371 } | 1491 } |
1372 #endif | |
1373 | 1492 |
1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, | 1493 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, |
1375 SkPMColor color) { | 1494 SkPMColor color) { |
1376 if (count <= 0) { | 1495 if (count <= 0) { |
1377 return; | 1496 return; |
1378 } | 1497 } |
1379 | 1498 |
1380 if (0 == color) { | 1499 if (0 == color) { |
1381 if (src != dst) { | 1500 if (src != dst) { |
1382 memcpy(dst, src, count * sizeof(SkPMColor)); | 1501 memcpy(dst, src, count * sizeof(SkPMColor)); |
(...skipping 85 matching lines...) | |
1468 *dst = color + SkAlphaMulQ(*src, scale); | 1587 *dst = color + SkAlphaMulQ(*src, scale); |
1469 src += 1; | 1588 src += 1; |
1470 dst += 1; | 1589 dst += 1; |
1471 count--; | 1590 count--; |
1472 } | 1591 } |
1473 } | 1592 } |
1474 | 1593 |
1475 /////////////////////////////////////////////////////////////////////////////// | 1594 /////////////////////////////////////////////////////////////////////////////// |
1476 | 1595 |
1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1596 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
1478 #ifdef SK_CPU_ARM32 | |
1479 // no dither | 1597 // no dither |
1480 S32_D565_Opaque_neon, | 1598 S32_D565_Opaque_neon, |
1481 S32_D565_Blend_neon, | 1599 S32_D565_Blend_neon, |
1600 #ifdef SK_CPU_ARM32 | |
1482 S32A_D565_Opaque_neon, | 1601 S32A_D565_Opaque_neon, |
1602 #else | |
1603 NULL, | |
1604 #endif | |
1483 S32A_D565_Blend_neon, | 1605 S32A_D565_Blend_neon, |
1484 | 1606 |
1485 // dither | 1607 // dither |
1486 S32_D565_Opaque_Dither_neon, | 1608 S32_D565_Opaque_Dither_neon, |
1487 S32_D565_Blend_Dither_neon, | 1609 S32_D565_Blend_Dither_neon, |
1488 S32A_D565_Opaque_Dither_neon, | 1610 S32A_D565_Opaque_Dither_neon, |
1489 NULL, // S32A_D565_Blend_Dither | 1611 NULL, // S32A_D565_Blend_Dither |
1490 #else | |
1491 NULL, NULL, NULL, NULL, | |
1492 NULL, NULL, NULL, NULL | |
1493 #endif | |
1494 }; | 1612 }; |
1495 | 1613 |
1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1614 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
1497 NULL, // S32_Opaque, | 1615 NULL, // S32_Opaque, |
1498 S32_Blend_BlitRow32_neon, // S32_Blend, | 1616 S32_Blend_BlitRow32_neon, // S32_Blend, |
1499 /* | 1617 /* |
1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha | 1618 * We have two choices for S32A_Opaque procs. The one reads the src alpha |
1501 * value and attempts to optimize accordingly. The optimization is | 1619 * value and attempts to optimize accordingly. The optimization is |
1502 * sensitive to the source content and is not a win in all cases. For | 1620 * sensitive to the source content and is not a win in all cases. For |
1503 * example, if there are a lot of transitions between the alpha states, | 1621 * example, if there are a lot of transitions between the alpha states, |
1504 * the performance will almost certainly be worse. However, for many | 1622 * the performance will almost certainly be worse. However, for many |
1505 * common cases the performance is equivalent or better than the standard | 1623 * common cases the performance is equivalent or better than the standard |
1506 * case where we do not inspect the src alpha. | 1624 * case where we do not inspect the src alpha. |
1507 */ | 1625 */ |
1508 #if SK_A32_SHIFT == 24 | 1626 #if SK_A32_SHIFT == 24 |
1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1627 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1628 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1511 #else | 1629 #else |
1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1630 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1513 #endif | 1631 #endif |
1514 #ifdef SK_CPU_ARM32 | 1632 #ifdef SK_CPU_ARM32 |
1515 S32A_Blend_BlitRow32_neon // S32A_Blend | 1633 S32A_Blend_BlitRow32_neon // S32A_Blend |
1516 #else | 1634 #else |
1517 NULL | 1635 NULL |
1518 #endif | 1636 #endif |
1519 }; | 1637 }; |
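The comment above on the two S32A_Opaque choices describes inspecting source alpha per pixel: skip fully transparent pixels, copy fully opaque ones, and fall back to the full src-over blend only in between. A scalar sketch of that shape, not the NEON implementation, assuming SK_A32_SHIFT == 24 (as in the #if above) and premultiplied color as elsewhere in this file:

    #include <stdint.h>

    typedef uint32_t SkPMColor;  /* local stand-in for the Skia type */

    static void s32a_opaque_src_alpha_sketch(SkPMColor *dst,
                                             const SkPMColor *src, int count) {
        for (int i = 0; i < count; i++) {
            uint32_t a = src[i] >> 24;       /* assumes SK_A32_SHIFT == 24 */
            if (a == 0) {
                /* fully transparent: leave dst untouched */
            } else if (a == 255) {
                dst[i] = src[i];             /* fully opaque: plain copy */
            } else {
                /* general src-over: dst = src + dst * (256 - a) >> 8,
                 * applied per byte-pair as in SkAlphaMulQ */
                uint32_t scale = 256 - a;
                uint32_t rb = (((dst[i] & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF;
                uint32_t ag = (((dst[i] >> 8) & 0x00FF00FF) * scale) & 0xFF00FF00;
                dst[i] = src[i] + (rb | ag);
            }
        }
    }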