OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
11 #include "SkBlitRow.h" | 11 #include "SkBlitRow.h" |
12 #include "SkColorPriv.h" | 12 #include "SkColorPriv.h" |
13 #include "SkDither.h" | 13 #include "SkDither.h" |
14 #include "SkMathPriv.h" | 14 #include "SkMathPriv.h" |
15 #include "SkUtils.h" | 15 #include "SkUtils.h" |
16 | 16 |
17 #include "SkColor_opts_neon.h" | 17 #include "SkColor_opts_neon.h" |
18 #include <arm_neon.h> | 18 #include <arm_neon.h> |
19 | 19 |
20 #ifdef SK_CPU_ARM32 | 20 #ifdef SK_CPU_ARM64 |
21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) { | |
22 uint8x8x4_t vsrc; | |
23 uint8x8_t vsrc_0, vsrc_1, vsrc_2; | |
24 | |
25 asm ( | |
26 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" | |
27 "mov %[vsrc0].8b, v0.8b \t\n" | |
28 "mov %[vsrc1].8b, v1.8b \t\n" | |
29 "mov %[vsrc2].8b, v2.8b \t\n" | |
30 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
31 [vsrc2] "=w" (vsrc_2), [src] "+&r" (src) | |
32 : : "v0", "v1", "v2", "v3" | |
33 ); | |
34 | |
35 vsrc.val[0] = vsrc_0; | |
36 vsrc.val[1] = vsrc_1; | |
37 vsrc.val[2] = vsrc_2; | |
38 | |
39 return vsrc; | |
40 } | |
41 | |
42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) { | |
43 uint8x8x4_t vsrc; | |
44 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3; | |
45 | |
46 asm ( | |
47 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" | |
48 "mov %[vsrc0].8b, v0.8b \t\n" | |
49 "mov %[vsrc1].8b, v1.8b \t\n" | |
50 "mov %[vsrc2].8b, v2.8b \t\n" | |
51 "mov %[vsrc3].8b, v3.8b \t\n" | |
52 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), | |
53 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3), | |
54 [src] "+&r" (src) | |
55 : : "v0", "v1", "v2", "v3" | |
56 ); | |
57 | |
58 vsrc.val[0] = vsrc_0; | |
59 vsrc.val[1] = vsrc_1; | |
60 vsrc.val[2] = vsrc_2; | |
61 vsrc.val[3] = vsrc_3; | |
62 | |
63 return vsrc; | |
64 } | |
65 #endif | |
66 | |
21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
22 const SkPMColor* SK_RESTRICT src, int count, | 68 const SkPMColor* SK_RESTRICT src, int count, |
23 U8CPU alpha, int /*x*/, int /*y*/) { | 69 U8CPU alpha, int /*x*/, int /*y*/) { |
24 SkASSERT(255 == alpha); | 70 SkASSERT(255 == alpha); |
25 | 71 |
26 while (count >= 8) { | 72 while (count >= 8) { |
27 uint8x8x4_t vsrc; | 73 uint8x8x4_t vsrc; |
28 uint16x8_t vdst; | 74 uint16x8_t vdst; |
29 | 75 |
30 // Load | 76 // Load |
77 #ifdef SK_CPU_ARM64 | |
78 vsrc = sk_vld4_u8_arm64_3(src); | |
79 #else | |
31 vsrc = vld4_u8((uint8_t*)src); | 80 vsrc = vld4_u8((uint8_t*)src); |
81 src += 8; | |
82 #endif | |
32 | 83 |
33 // Convert src to 565 | 84 // Convert src to 565 |
34 vdst = SkPixel32ToPixel16_neon8(vsrc); | 85 vdst = SkPixel32ToPixel16_neon8(vsrc); |
35 | 86 |
36 // Store | 87 // Store |
37 vst1q_u16(dst, vdst); | 88 vst1q_u16(dst, vdst); |
38 | 89 |
39 // Prepare next iteration | 90 // Prepare next iteration |
40 dst += 8; | 91 dst += 8; |
41 src += 8; | |
42 count -= 8; | 92 count -= 8; |
43 }; | 93 }; |
44 | 94 |
45 // Leftovers | 95 // Leftovers |
46 while (count > 0) { | 96 while (count > 0) { |
47 SkPMColor c = *src++; | 97 SkPMColor c = *src++; |
48 SkPMColorAssert(c); | 98 SkPMColorAssert(c); |
49 *dst = SkPixel32ToPixel16_ToU16(c); | 99 *dst = SkPixel32ToPixel16_ToU16(c); |
50 dst++; | 100 dst++; |
51 count--; | 101 count--; |
52 }; | 102 }; |
53 } | 103 } |
54 | 104 |
55 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
56 const SkPMColor* SK_RESTRICT src, int count, | 106 const SkPMColor* SK_RESTRICT src, int count, |
57 U8CPU alpha, int /*x*/, int /*y*/) { | 107 U8CPU alpha, int /*x*/, int /*y*/) { |
58 SkASSERT(255 > alpha); | 108 SkASSERT(255 > alpha); |
59 | 109 |
60 uint16x8_t vmask_blue, vscale; | 110 uint16x8_t vmask_blue, vscale; |
61 | 111 |
62 // prepare constants | 112 // prepare constants |
63 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); | 113 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); |
64 vmask_blue = vmovq_n_u16(0x1F); | 114 vmask_blue = vmovq_n_u16(0x1F); |
65 | 115 |
66 while (count >= 8) { | 116 while (count >= 8) { |
117 uint8x8x4_t vsrc; | |
67 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 118 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
68 uint16x8_t vres_r, vres_g, vres_b; | 119 uint16x8_t vres_r, vres_g, vres_b; |
69 uint8x8_t vsrc_r, vsrc_g, vsrc_b; | |
70 | 120 |
71 // Load src | 121 // Load src |
122 #ifdef SK_CPU_ARM64 | |
123 vsrc = sk_vld4_u8_arm64_3(src); | |
124 #else | |
72 { | 125 { |
73 register uint8x8_t d0 asm("d0"); | 126 register uint8x8_t d0 asm("d0"); |
74 register uint8x8_t d1 asm("d1"); | 127 register uint8x8_t d1 asm("d1"); |
75 register uint8x8_t d2 asm("d2"); | 128 register uint8x8_t d2 asm("d2"); |
76 register uint8x8_t d3 asm("d3"); | 129 register uint8x8_t d3 asm("d3"); |
77 | 130 |
78 asm ( | 131 asm ( |
79 "vld4.8 {d0-d3},[%[src]]!" | 132 "vld4.8 {d0-d3},[%[src]]!" |
80 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 133 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
81 : | 134 : |
82 ); | 135 ); |
83 vsrc_g = d1; | 136 vsrc.val[0] = d0; |
84 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 137 vsrc.val[1] = d1; |
85 vsrc_r = d2; vsrc_b = d0; | 138 vsrc.val[2] = d2; |
86 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 139 } |
87 vsrc_r = d0; vsrc_b = d2; | |
88 #endif | 140 #endif |
89 } | |
90 | 141 |
91 // Load and unpack dst | 142 // Load and unpack dst |
92 vdst = vld1q_u16(dst); | 143 vdst = vld1q_u16(dst); |
93 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes | 144 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes |
94 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue | 145 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue |
95 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red | 146 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red |
96 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green | 147 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green |
97 | 148 |
98 // Shift src to 565 | 149 // Shift src to 565 range |
99 vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range | 150 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3); |
100 vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range | 151 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2); |
101 vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range | 152 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3); |
102 | 153 |
103 // Scale src - dst | 154 // Scale src - dst |
104 vres_r = vmovl_u8(vsrc_r) - vdst_r; | 155 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r; |
105 vres_g = vmovl_u8(vsrc_g) - vdst_g; | 156 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g; |
106 vres_b = vmovl_u8(vsrc_b) - vdst_b; | 157 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b; |
107 | 158 |
108 vres_r = vshrq_n_u16(vres_r * vscale, 8); | 159 vres_r = vshrq_n_u16(vres_r * vscale, 8); |
109 vres_g = vshrq_n_u16(vres_g * vscale, 8); | 160 vres_g = vshrq_n_u16(vres_g * vscale, 8); |
110 vres_b = vshrq_n_u16(vres_b * vscale, 8); | 161 vres_b = vshrq_n_u16(vres_b * vscale, 8); |
111 | 162 |
112 vres_r += vdst_r; | 163 vres_r += vdst_r; |
113 vres_g += vdst_g; | 164 vres_g += vdst_g; |
114 vres_b += vdst_b; | 165 vres_b += vdst_b; |
115 | 166 |
116 // Combine | 167 // Combine |
(...skipping 12 matching lines...) Expand all Loading... | |
129 SkPMColorAssert(c); | 180 SkPMColorAssert(c); |
130 uint16_t d = *dst; | 181 uint16_t d = *dst; |
131 *dst++ = SkPackRGB16( | 182 *dst++ = SkPackRGB16( |
132 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), | 183 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), |
133 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), | 184 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), |
134 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); | 185 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); |
135 } while (--count != 0); | 186 } while (--count != 0); |
136 } | 187 } |
137 } | 188 } |
138 | 189 |
190 #ifdef SK_CPU_ARM32 | |
139 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
140 const SkPMColor* SK_RESTRICT src, int count, | 192 const SkPMColor* SK_RESTRICT src, int count, |
141 U8CPU alpha, int /*x*/, int /*y*/) { | 193 U8CPU alpha, int /*x*/, int /*y*/) { |
142 SkASSERT(255 == alpha); | 194 SkASSERT(255 == alpha); |
143 | 195 |
144 if (count >= 8) { | 196 if (count >= 8) { |
145 uint16_t* SK_RESTRICT keep_dst = 0; | 197 uint16_t* SK_RESTRICT keep_dst = 0; |
146 | 198 |
147 asm volatile ( | 199 asm volatile ( |
148 "ands ip, %[count], #7 \n\t" | 200 "ands ip, %[count], #7 \n\t" |
(...skipping 157 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
306 | 358 |
307 "21: \n\t" | 359 "21: \n\t" |
308 : [count] "+r" (count) | 360 : [count] "+r" (count) |
309 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s rc) | 361 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s rc) |
310 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7", | 362 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7", |
311 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29", | 363 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29", |
312 "d30","d31" | 364 "d30","d31" |
313 ); | 365 ); |
314 } | 366 } |
315 } | 367 } |
368 #endif | |
316 | 369 |
317 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { | 370 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { |
318 prod += vdupq_n_u16(128); | 371 prod += vdupq_n_u16(128); |
319 prod += vshrq_n_u16(prod, 8); | 372 prod += vshrq_n_u16(prod, 8); |
320 return vshrq_n_u16(prod, 8); | 373 return vshrq_n_u16(prod, 8); |
321 } | 374 } |
322 | 375 |
323 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 376 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
324 const SkPMColor* SK_RESTRICT src, int count, | 377 const SkPMColor* SK_RESTRICT src, int count, |
325 U8CPU alpha, int /*x*/, int /*y*/) { | 378 U8CPU alpha, int /*x*/, int /*y*/) { |
(...skipping 13 matching lines...) Expand all Loading... | |
339 valpha = vdup_n_u8(alpha); | 392 valpha = vdup_n_u8(alpha); |
340 vmask_blue = vmovq_n_u16(SK_B16_MASK); | 393 vmask_blue = vmovq_n_u16(SK_B16_MASK); |
341 | 394 |
342 do { | 395 do { |
343 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; | 396 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
344 uint16x8_t vres_a, vres_r, vres_g, vres_b; | 397 uint16x8_t vres_a, vres_r, vres_g, vres_b; |
345 uint8x8x4_t vsrc; | 398 uint8x8x4_t vsrc; |
346 | 399 |
347 // load pixels | 400 // load pixels |
348 vdst = vld1q_u16(dst); | 401 vdst = vld1q_u16(dst); |
402 #ifdef SK_CPU_ARM64 | |
403 vsrc = sk_vld4_u8_arm64_4(src); | |
404 #else | |
349 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 405 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
mtklein
2014/06/06 14:41:17
Think it makes sense to follow up and do the same
| |
350 asm ( | 406 asm ( |
351 "vld4.u8 %h[vsrc], [%[src]]!" | 407 "vld4.u8 %h[vsrc], [%[src]]!" |
352 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 408 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
353 : : | 409 : : |
354 ); | 410 ); |
355 #else | 411 #else |
356 register uint8x8_t d0 asm("d0"); | 412 register uint8x8_t d0 asm("d0"); |
357 register uint8x8_t d1 asm("d1"); | 413 register uint8x8_t d1 asm("d1"); |
358 register uint8x8_t d2 asm("d2"); | 414 register uint8x8_t d2 asm("d2"); |
359 register uint8x8_t d3 asm("d3"); | 415 register uint8x8_t d3 asm("d3"); |
360 | 416 |
361 asm volatile ( | 417 asm volatile ( |
362 "vld4.u8 {d0-d3},[%[src]]!;" | 418 "vld4.u8 {d0-d3},[%[src]]!;" |
363 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 419 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
364 [src] "+&r" (src) | 420 [src] "+&r" (src) |
365 : : | 421 : : |
366 ); | 422 ); |
367 vsrc.val[0] = d0; | 423 vsrc.val[0] = d0; |
368 vsrc.val[1] = d1; | 424 vsrc.val[1] = d1; |
369 vsrc.val[2] = d2; | 425 vsrc.val[2] = d2; |
370 vsrc.val[3] = d3; | 426 vsrc.val[3] = d3; |
371 #endif | 427 #endif |
428 #endif // #ifdef SK_CPU_ARM64 | |
372 | 429 |
373 | 430 |
374 // deinterleave dst | 431 // deinterleave dst |
375 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to p of lanes | 432 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to p of lanes |
376 vdst_b = vdst & vmask_blue; // extract blue | 433 vdst_b = vdst & vmask_blue; // extract blue |
377 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red | 434 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red |
378 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green | 435 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green |
379 | 436 |
380 // shift src to 565 | 437 // shift src to 565 |
381 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); | 438 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); |
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
461 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 518 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
462 | 519 |
463 uint8x8_t vdither = vld1_u8(dstart); // load dither values | 520 uint8x8_t vdither = vld1_u8(dstart); // load dither values |
464 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither value s | 521 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither value s |
465 | 522 |
466 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into ne on reg | 523 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into ne on reg |
467 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask | 524 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask |
468 | 525 |
469 do { | 526 do { |
470 | 527 |
528 uint8x8x4_t vsrc; | |
471 uint8x8_t vsrc_r, vsrc_g, vsrc_b; | 529 uint8x8_t vsrc_r, vsrc_g, vsrc_b; |
472 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; | 530 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; |
473 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; | 531 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; |
474 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; | 532 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; |
475 uint16x8_t vdst; | 533 uint16x8_t vdst; |
476 uint16x8_t vdst_r, vdst_g, vdst_b; | 534 uint16x8_t vdst_r, vdst_g, vdst_b; |
477 int16x8_t vres_r, vres_g, vres_b; | 535 int16x8_t vres_r, vres_g, vres_b; |
478 int8x8_t vres8_r, vres8_g, vres8_b; | 536 int8x8_t vres8_r, vres8_g, vres8_b; |
479 | 537 |
480 // Load source and add dither | 538 // Load source and add dither |
539 #ifdef SK_CPU_ARM64 | |
540 vsrc = sk_vld4_u8_arm64_3(src); | |
541 #else | |
481 { | 542 { |
482 register uint8x8_t d0 asm("d0"); | 543 register uint8x8_t d0 asm("d0"); |
483 register uint8x8_t d1 asm("d1"); | 544 register uint8x8_t d1 asm("d1"); |
484 register uint8x8_t d2 asm("d2"); | 545 register uint8x8_t d2 asm("d2"); |
485 register uint8x8_t d3 asm("d3"); | 546 register uint8x8_t d3 asm("d3"); |
486 | 547 |
487 asm ( | 548 asm ( |
488 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 549 "vld4.8 {d0-d3},[%[src]]! " |
489 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 550 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
490 : | 551 : |
491 ); | 552 ); |
492 vsrc_g = d1; | 553 vsrc.val[0] = d0; |
493 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 554 vsrc.val[1] = d1; |
494 vsrc_r = d2; vsrc_b = d0; | 555 vsrc.val[2] = d2; |
495 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 556 } |
496 vsrc_r = d0; vsrc_b = d2; | |
497 #endif | 557 #endif |
498 } | 558 vsrc_r = vsrc.val[NEON_R]; |
559 vsrc_g = vsrc.val[NEON_G]; | |
560 vsrc_b = vsrc.val[NEON_B]; | |
499 | 561 |
500 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 | 562 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 |
501 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 | 563 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 |
502 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 | 564 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 |
503 | 565 |
504 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen | 566 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen |
505 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red an d widen | 567 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red an d widen |
506 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue a nd widen | 568 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue a nd widen |
507 | 569 |
508 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red fr om result | 570 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red fr om result |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
569 sb = SkDITHER_B32To565(sb, dither); | 631 sb = SkDITHER_B32To565(sb, dither); |
570 | 632 |
571 uint16_t d = *dst; | 633 uint16_t d = *dst; |
572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 634 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
573 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 635 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
574 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 636 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
575 DITHER_INC_X(x); | 637 DITHER_INC_X(x); |
576 } while (--count != 0); | 638 } while (--count != 0); |
577 } | 639 } |
578 } | 640 } |
579 #endif | |
580 | 641 |
581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 642 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
582 const SkPMColor* SK_RESTRICT src, | 643 const SkPMColor* SK_RESTRICT src, |
583 int count, U8CPU alpha) { | 644 int count, U8CPU alpha) { |
584 | 645 |
585 SkASSERT(255 == alpha); | 646 SkASSERT(255 == alpha); |
586 if (count > 0) { | 647 if (count > 0) { |
587 | 648 |
588 | 649 |
589 uint8x8_t alpha_mask; | 650 uint8x8_t alpha_mask; |
(...skipping 450 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1040 uint16_t *pc = (uint16_t*) p; | 1101 uint16_t *pc = (uint16_t*) p; |
1041 sprintf(buf,"%8s:", str); | 1102 sprintf(buf,"%8s:", str); |
1042 len = (len / sizeof(uint16_t)); /* passed as bytes */ | 1103 len = (len / sizeof(uint16_t)); /* passed as bytes */ |
1043 for(i=0;i<len;i++) { | 1104 for(i=0;i<len;i++) { |
1044 sprintf(tbuf, " %04x", pc[i]); | 1105 sprintf(tbuf, " %04x", pc[i]); |
1045 strcat(buf, tbuf); | 1106 strcat(buf, tbuf); |
1046 } | 1107 } |
1047 SkDebugf("%s\n", buf); | 1108 SkDebugf("%s\n", buf); |
1048 } | 1109 } |
1049 #endif | 1110 #endif |
1111 #endif // #ifdef SK_CPU_ARM32 | |
1050 | 1112 |
1051 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, | 1113 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
1052 const SkPMColor* SK_RESTRICT src, | 1114 const SkPMColor* SK_RESTRICT src, |
1053 int count, U8CPU alpha, int x, int y) { | 1115 int count, U8CPU alpha, int x, int y) { |
1054 SkASSERT(255 == alpha); | 1116 SkASSERT(255 == alpha); |
1055 | 1117 |
1056 #define UNROLL 8 | 1118 #define UNROLL 8 |
1057 | 1119 |
1058 if (count >= UNROLL) { | 1120 if (count >= UNROLL) { |
1059 | 1121 |
1060 #if defined(DEBUG_OPAQUE_DITHER) | 1122 #if defined(DEBUG_OPAQUE_DITHER) |
1061 uint16_t tmpbuf[UNROLL]; | 1123 uint16_t tmpbuf[UNROLL]; |
1062 int td[UNROLL]; | 1124 int td[UNROLL]; |
1063 int tdv[UNROLL]; | 1125 int tdv[UNROLL]; |
1064 int ta[UNROLL]; | 1126 int ta[UNROLL]; |
1065 int tap[UNROLL]; | 1127 int tap[UNROLL]; |
1066 uint16_t in_dst[UNROLL]; | 1128 uint16_t in_dst[UNROLL]; |
1067 int offset = 0; | 1129 int offset = 0; |
1068 int noisy = 0; | 1130 int noisy = 0; |
1069 #endif | 1131 #endif |
1070 | 1132 |
1071 uint8x8_t dbase; | 1133 uint8x8_t dbase; |
1072 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1134 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
1073 dbase = vld1_u8(dstart); | 1135 dbase = vld1_u8(dstart); |
1074 | 1136 |
1075 do { | 1137 do { |
1138 uint8x8x4_t vsrc; | |
1076 uint8x8_t sr, sg, sb, sa, d; | 1139 uint8x8_t sr, sg, sb, sa, d; |
1077 uint16x8_t dst8, scale8, alpha8; | 1140 uint16x8_t dst8, scale8, alpha8; |
1078 uint16x8_t dst_r, dst_g, dst_b; | 1141 uint16x8_t dst_r, dst_g, dst_b; |
1079 | 1142 |
1080 #if defined(DEBUG_OPAQUE_DITHER) | 1143 #if defined(DEBUG_OPAQUE_DITHER) |
1081 // calculate 8 elements worth into a temp buffer | 1144 // calculate 8 elements worth into a temp buffer |
1082 { | 1145 { |
1083 int my_y = y; | 1146 int my_y = y; |
1084 int my_x = x; | 1147 int my_x = x; |
1085 SkPMColor* my_src = (SkPMColor*)src; | 1148 SkPMColor* my_src = (SkPMColor*)src; |
(...skipping 30 matching lines...) Expand all Loading... | |
1116 tmpbuf[i] = *my_dst; | 1179 tmpbuf[i] = *my_dst; |
1117 ta[i] = tdv[i] = td[i] = 0xbeef; | 1180 ta[i] = tdv[i] = td[i] = 0xbeef; |
1118 } | 1181 } |
1119 in_dst[i] = *my_dst; | 1182 in_dst[i] = *my_dst; |
1120 my_dst += 1; | 1183 my_dst += 1; |
1121 DITHER_INC_X(my_x); | 1184 DITHER_INC_X(my_x); |
1122 } | 1185 } |
1123 } | 1186 } |
1124 #endif | 1187 #endif |
1125 | 1188 |
1126 | 1189 #ifdef SK_CPU_ARM64 |
1190 vsrc = sk_vld4_u8_arm64_4(src); | |
1191 #else | |
1127 { | 1192 { |
1128 register uint8x8_t d0 asm("d0"); | 1193 register uint8x8_t d0 asm("d0"); |
1129 register uint8x8_t d1 asm("d1"); | 1194 register uint8x8_t d1 asm("d1"); |
1130 register uint8x8_t d2 asm("d2"); | 1195 register uint8x8_t d2 asm("d2"); |
1131 register uint8x8_t d3 asm("d3"); | 1196 register uint8x8_t d3 asm("d3"); |
1132 | 1197 |
1133 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1198 asm ("vld4.8 {d0-d3},[%[src]]! " |
1134 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) | 1199 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) |
1135 : | 1200 : |
1136 ); | 1201 ); |
1137 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 1202 vsrc.val[0] = d0; |
1138 sr = d2; sg = d1; sb = d0; sa = d3; | 1203 vsrc.val[1] = d1; |
1139 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 1204 vsrc.val[2] = d2; |
1140 sr = d0; sg = d1; sb = d2; sa = d3; | 1205 vsrc.val[3] = d3; |
1206 } | |
1141 #endif | 1207 #endif |
1142 } | 1208 sa = vsrc.val[NEON_A]; |
1209 sr = vsrc.val[NEON_R]; | |
1210 sg = vsrc.val[NEON_G]; | |
1211 sb = vsrc.val[NEON_B]; | |
1143 | 1212 |
1144 /* calculate 'd', which will be 0..7 | 1213 /* calculate 'd', which will be 0..7 |
1145 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice | 1214 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice |
1146 */ | 1215 */ |
1147 alpha8 = vmovl_u8(dbase); | 1216 alpha8 = vmovl_u8(dbase); |
1148 alpha8 = vmlal_u8(alpha8, sa, dbase); | 1217 alpha8 = vmlal_u8(alpha8, sa, dbase); |
1149 d = vshrn_n_u16(alpha8, 8); // narrowing too | 1218 d = vshrn_n_u16(alpha8, 8); // narrowing too |
1150 | 1219 |
1151 // sr = sr - (sr>>5) + d | 1220 // sr = sr - (sr>>5) + d |
1152 /* watching for 8-bit overflow. d is 0..7; risky range of | 1221 /* watching for 8-bit overflow. d is 0..7; risky range of |
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1276 #define UNROLL 8 | 1345 #define UNROLL 8 |
1277 if (count >= UNROLL) { | 1346 if (count >= UNROLL) { |
1278 uint8x8_t d; | 1347 uint8x8_t d; |
1279 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1348 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
1280 d = vld1_u8(dstart); | 1349 d = vld1_u8(dstart); |
1281 | 1350 |
1282 while (count >= UNROLL) { | 1351 while (count >= UNROLL) { |
1283 uint8x8_t sr, sg, sb; | 1352 uint8x8_t sr, sg, sb; |
1284 uint16x8_t dr, dg, db; | 1353 uint16x8_t dr, dg, db; |
1285 uint16x8_t dst8; | 1354 uint16x8_t dst8; |
1355 uint8x8x4_t vsrc; | |
1286 | 1356 |
1357 #ifdef SK_CPU_ARM64 | |
1358 vsrc = sk_vld4_u8_arm64_3(src); | |
1359 #else | |
1287 { | 1360 { |
1288 register uint8x8_t d0 asm("d0"); | 1361 register uint8x8_t d0 asm("d0"); |
1289 register uint8x8_t d1 asm("d1"); | 1362 register uint8x8_t d1 asm("d1"); |
1290 register uint8x8_t d2 asm("d2"); | 1363 register uint8x8_t d2 asm("d2"); |
1291 register uint8x8_t d3 asm("d3"); | 1364 register uint8x8_t d3 asm("d3"); |
1292 | 1365 |
1293 asm ( | 1366 asm ( |
1294 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1367 "vld4.8 {d0-d3},[%[src]]! " |
1295 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) | 1368 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
1296 : | 1369 : |
1297 ); | 1370 ); |
1298 sg = d1; | 1371 vsrc.val[0] = d0; |
1299 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) | 1372 vsrc.val[1] = d1; |
1300 sr = d2; sb = d0; | 1373 vsrc.val[2] = d2; |
1301 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) | 1374 } |
1302 sr = d0; sb = d2; | |
1303 #endif | 1375 #endif |
1304 } | 1376 sr = vsrc.val[NEON_R]; |
1377 sg = vsrc.val[NEON_G]; | |
1378 sb = vsrc.val[NEON_B]; | |
1379 | |
1305 /* XXX: if we want to prefetch, hide it in the above asm() | 1380 /* XXX: if we want to prefetch, hide it in the above asm() |
1306 * using the gcc __builtin_prefetch(), the prefetch will | 1381 * using the gcc __builtin_prefetch(), the prefetch will |
1307 * fall to the bottom of the loop -- it won't stick up | 1382 * fall to the bottom of the loop -- it won't stick up |
1308 * at the top of the loop, just after the vld4. | 1383 * at the top of the loop, just after the vld4. |
1309 */ | 1384 */ |
1310 | 1385 |
1311 // sr = sr - (sr>>5) + d | 1386 // sr = sr - (sr>>5) + d |
1312 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); | 1387 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
1313 dr = vaddl_u8(sr, d); | 1388 dr = vaddl_u8(sr, d); |
1314 | 1389 |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1362 SkPMColor c = *src++; | 1437 SkPMColor c = *src++; |
1363 SkPMColorAssert(c); | 1438 SkPMColorAssert(c); |
1364 SkASSERT(SkGetPackedA32(c) == 255); | 1439 SkASSERT(SkGetPackedA32(c) == 255); |
1365 | 1440 |
1366 unsigned dither = DITHER_VALUE(x); | 1441 unsigned dither = DITHER_VALUE(x); |
1367 *dst++ = SkDitherRGB32To565(c, dither); | 1442 *dst++ = SkDitherRGB32To565(c, dither); |
1368 DITHER_INC_X(x); | 1443 DITHER_INC_X(x); |
1369 } while (--count != 0); | 1444 } while (--count != 0); |
1370 } | 1445 } |
1371 } | 1446 } |
1372 #endif | |
1373 | 1447 |
1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, | 1448 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, |
1375 SkPMColor color) { | 1449 SkPMColor color) { |
1376 if (count <= 0) { | 1450 if (count <= 0) { |
1377 return; | 1451 return; |
1378 } | 1452 } |
1379 | 1453 |
1380 if (0 == color) { | 1454 if (0 == color) { |
1381 if (src != dst) { | 1455 if (src != dst) { |
1382 memcpy(dst, src, count * sizeof(SkPMColor)); | 1456 memcpy(dst, src, count * sizeof(SkPMColor)); |
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1468 *dst = color + SkAlphaMulQ(*src, scale); | 1542 *dst = color + SkAlphaMulQ(*src, scale); |
1469 src += 1; | 1543 src += 1; |
1470 dst += 1; | 1544 dst += 1; |
1471 count--; | 1545 count--; |
1472 } | 1546 } |
1473 } | 1547 } |
1474 | 1548 |
1475 /////////////////////////////////////////////////////////////////////////////// | 1549 /////////////////////////////////////////////////////////////////////////////// |
1476 | 1550 |
1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1551 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
1478 #ifdef SK_CPU_ARM32 | |
1479 // no dither | 1552 // no dither |
1480 S32_D565_Opaque_neon, | 1553 S32_D565_Opaque_neon, |
1481 S32_D565_Blend_neon, | 1554 S32_D565_Blend_neon, |
1555 #ifdef SK_CPU_ARM32 | |
1482 S32A_D565_Opaque_neon, | 1556 S32A_D565_Opaque_neon, |
1557 #else | |
1558 NULL, | |
1559 #endif | |
1483 S32A_D565_Blend_neon, | 1560 S32A_D565_Blend_neon, |
1484 | 1561 |
1485 // dither | 1562 // dither |
1486 S32_D565_Opaque_Dither_neon, | 1563 S32_D565_Opaque_Dither_neon, |
1487 S32_D565_Blend_Dither_neon, | 1564 S32_D565_Blend_Dither_neon, |
1488 S32A_D565_Opaque_Dither_neon, | 1565 S32A_D565_Opaque_Dither_neon, |
1489 NULL, // S32A_D565_Blend_Dither | 1566 NULL, // S32A_D565_Blend_Dither |
1490 #else | |
1491 NULL, NULL, NULL, NULL, | |
1492 NULL, NULL, NULL, NULL | |
1493 #endif | |
1494 }; | 1567 }; |
1495 | 1568 |
1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1569 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
1497 NULL, // S32_Opaque, | 1570 NULL, // S32_Opaque, |
1498 S32_Blend_BlitRow32_neon, // S32_Blend, | 1571 S32_Blend_BlitRow32_neon, // S32_Blend, |
1499 /* | 1572 /* |
1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha | 1573 * We have two choices for S32A_Opaque procs. The one reads the src alpha |
1501 * value and attempts to optimize accordingly. The optimization is | 1574 * value and attempts to optimize accordingly. The optimization is |
1502 * sensitive to the source content and is not a win in all cases. For | 1575 * sensitive to the source content and is not a win in all cases. For |
1503 * example, if there are a lot of transitions between the alpha states, | 1576 * example, if there are a lot of transitions between the alpha states, |
1504 * the performance will almost certainly be worse. However, for many | 1577 * the performance will almost certainly be worse. However, for many |
1505 * common cases the performance is equivalent or better than the standard | 1578 * common cases the performance is equivalent or better than the standard |
1506 * case where we do not inspect the src alpha. | 1579 * case where we do not inspect the src alpha. |
1507 */ | 1580 */ |
1508 #if SK_A32_SHIFT == 24 | 1581 #if SK_A32_SHIFT == 24 |
1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1582 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1583 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1511 #else | 1584 #else |
1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1585 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1513 #endif | 1586 #endif |
1514 #ifdef SK_CPU_ARM32 | 1587 #ifdef SK_CPU_ARM32 |
1515 S32A_Blend_BlitRow32_neon // S32A_Blend | 1588 S32A_Blend_BlitRow32_neon // S32A_Blend |
1516 #else | 1589 #else |
1517 NULL | 1590 NULL |
1518 #endif | 1591 #endif |
1519 }; | 1592 }; |
OLD | NEW |