Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(9)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 317193003: ARM Skia NEON patches - 39 - arm64 565 blitters (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Review comments / small perf improvement Created 6 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h" 11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h" 12 #include "SkColorPriv.h"
13 #include "SkDither.h" 13 #include "SkDither.h"
14 #include "SkMathPriv.h" 14 #include "SkMathPriv.h"
15 #include "SkUtils.h" 15 #include "SkUtils.h"
16 16
17 #include "SkColor_opts_neon.h" 17 #include "SkColor_opts_neon.h"
18 #include <arm_neon.h> 18 #include <arm_neon.h>
19 19
20 #ifdef SK_CPU_ARM32 20 #ifdef SK_CPU_ARM64
21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
22 uint8x8x4_t vsrc;
23 uint8x8_t vsrc_0, vsrc_1, vsrc_2;
24
25 asm (
26 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
27 "mov %[vsrc0].8b, v0.8b \t\n"
28 "mov %[vsrc1].8b, v1.8b \t\n"
29 "mov %[vsrc2].8b, v2.8b \t\n"
30 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
31 [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
32 : : "v0", "v1", "v2", "v3"
33 );
34
35 vsrc.val[0] = vsrc_0;
36 vsrc.val[1] = vsrc_1;
37 vsrc.val[2] = vsrc_2;
38
39 return vsrc;
40 }
41
42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
43 uint8x8x4_t vsrc;
44 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
45
46 asm (
47 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
48 "mov %[vsrc0].8b, v0.8b \t\n"
49 "mov %[vsrc1].8b, v1.8b \t\n"
50 "mov %[vsrc2].8b, v2.8b \t\n"
51 "mov %[vsrc3].8b, v3.8b \t\n"
52 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
53 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
54 [src] "+&r" (src)
55 : : "v0", "v1", "v2", "v3"
56 );
57
58 vsrc.val[0] = vsrc_0;
59 vsrc.val[1] = vsrc_1;
60 vsrc.val[2] = vsrc_2;
61 vsrc.val[3] = vsrc_3;
62
63 return vsrc;
64 }
65 #endif
66
21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
22 const SkPMColor* SK_RESTRICT src, int count, 68 const SkPMColor* SK_RESTRICT src, int count,
23 U8CPU alpha, int /*x*/, int /*y*/) { 69 U8CPU alpha, int /*x*/, int /*y*/) {
24 SkASSERT(255 == alpha); 70 SkASSERT(255 == alpha);
25 71
26 while (count >= 8) { 72 while (count >= 8) {
27 uint8x8x4_t vsrc; 73 uint8x8x4_t vsrc;
28 uint16x8_t vdst; 74 uint16x8_t vdst;
29 75
30 // Load 76 // Load
77 #ifdef SK_CPU_ARM64
78 vsrc = sk_vld4_u8_arm64_3(src);
79 #else
31 vsrc = vld4_u8((uint8_t*)src); 80 vsrc = vld4_u8((uint8_t*)src);
81 src += 8;
82 #endif
32 83
33 // Convert src to 565 84 // Convert src to 565
34 vdst = SkPixel32ToPixel16_neon8(vsrc); 85 vdst = SkPixel32ToPixel16_neon8(vsrc);
35 86
36 // Store 87 // Store
37 vst1q_u16(dst, vdst); 88 vst1q_u16(dst, vdst);
38 89
39 // Prepare next iteration 90 // Prepare next iteration
40 dst += 8; 91 dst += 8;
41 src += 8;
42 count -= 8; 92 count -= 8;
43 }; 93 };
44 94
45 // Leftovers 95 // Leftovers
46 while (count > 0) { 96 while (count > 0) {
47 SkPMColor c = *src++; 97 SkPMColor c = *src++;
48 SkPMColorAssert(c); 98 SkPMColorAssert(c);
49 *dst = SkPixel32ToPixel16_ToU16(c); 99 *dst = SkPixel32ToPixel16_ToU16(c);
50 dst++; 100 dst++;
51 count--; 101 count--;
52 }; 102 };
53 } 103 }
54 104
55 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
56 const SkPMColor* SK_RESTRICT src, int count, 106 const SkPMColor* SK_RESTRICT src, int count,
57 U8CPU alpha, int /*x*/, int /*y*/) { 107 U8CPU alpha, int /*x*/, int /*y*/) {
58 SkASSERT(255 > alpha); 108 SkASSERT(255 > alpha);
59 109
60 uint16x8_t vmask_blue, vscale; 110 uint16x8_t vmask_blue, vscale;
61 111
62 // prepare constants 112 // prepare constants
63 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); 113 vscale = vdupq_n_u16(SkAlpha255To256(alpha));
64 vmask_blue = vmovq_n_u16(0x1F); 114 vmask_blue = vmovq_n_u16(0x1F);
65 115
66 while (count >= 8) { 116 while (count >= 8) {
117 uint8x8x4_t vsrc;
67 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; 118 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
68 uint16x8_t vres_r, vres_g, vres_b; 119 uint16x8_t vres_r, vres_g, vres_b;
69 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
70 120
71 // Load src 121 // Load src
122 #ifdef SK_CPU_ARM64
123 vsrc = sk_vld4_u8_arm64_3(src);
124 #else
72 { 125 {
73 register uint8x8_t d0 asm("d0"); 126 register uint8x8_t d0 asm("d0");
74 register uint8x8_t d1 asm("d1"); 127 register uint8x8_t d1 asm("d1");
75 register uint8x8_t d2 asm("d2"); 128 register uint8x8_t d2 asm("d2");
76 register uint8x8_t d3 asm("d3"); 129 register uint8x8_t d3 asm("d3");
77 130
78 asm ( 131 asm (
79 "vld4.8 {d0-d3},[%[src]]!" 132 "vld4.8 {d0-d3},[%[src]]!"
80 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 133 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
81 : 134 :
82 ); 135 );
83 vsrc_g = d1; 136 vsrc.val[0] = d0;
84 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 137 vsrc.val[1] = d1;
85 vsrc_r = d2; vsrc_b = d0; 138 vsrc.val[2] = d2;
86 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 139 }
87 vsrc_r = d0; vsrc_b = d2;
88 #endif 140 #endif
89 }
90 141
91 // Load and unpack dst 142 // Load and unpack dst
92 vdst = vld1q_u16(dst); 143 vdst = vld1q_u16(dst);
93 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes 144 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes
94 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue 145 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
95 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red 146 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
96 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green 147 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
97 148
98 // Shift src to 565 149 // Shift src to 565 range
99 vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range 150 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
100 vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range 151 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
101 vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range 152 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
102 153
103 // Scale src - dst 154 // Scale src - dst
104 vres_r = vmovl_u8(vsrc_r) - vdst_r; 155 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
105 vres_g = vmovl_u8(vsrc_g) - vdst_g; 156 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
106 vres_b = vmovl_u8(vsrc_b) - vdst_b; 157 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
107 158
108 vres_r = vshrq_n_u16(vres_r * vscale, 8); 159 vres_r = vshrq_n_u16(vres_r * vscale, 8);
109 vres_g = vshrq_n_u16(vres_g * vscale, 8); 160 vres_g = vshrq_n_u16(vres_g * vscale, 8);
110 vres_b = vshrq_n_u16(vres_b * vscale, 8); 161 vres_b = vshrq_n_u16(vres_b * vscale, 8);
111 162
112 vres_r += vdst_r; 163 vres_r += vdst_r;
113 vres_g += vdst_g; 164 vres_g += vdst_g;
114 vres_b += vdst_b; 165 vres_b += vdst_b;
115 166
116 // Combine 167 // Combine
(...skipping 12 matching lines...) Expand all
129 SkPMColorAssert(c); 180 SkPMColorAssert(c);
130 uint16_t d = *dst; 181 uint16_t d = *dst;
131 *dst++ = SkPackRGB16( 182 *dst++ = SkPackRGB16(
132 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), 183 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
133 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), 184 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
134 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); 185 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
135 } while (--count != 0); 186 } while (--count != 0);
136 } 187 }
137 } 188 }
138 189
190 #ifdef SK_CPU_ARM32
139 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
140 const SkPMColor* SK_RESTRICT src, int count, 192 const SkPMColor* SK_RESTRICT src, int count,
141 U8CPU alpha, int /*x*/, int /*y*/) { 193 U8CPU alpha, int /*x*/, int /*y*/) {
142 SkASSERT(255 == alpha); 194 SkASSERT(255 == alpha);
143 195
144 if (count >= 8) { 196 if (count >= 8) {
145 uint16_t* SK_RESTRICT keep_dst = 0; 197 uint16_t* SK_RESTRICT keep_dst = 0;
146 198
147 asm volatile ( 199 asm volatile (
148 "ands ip, %[count], #7 \n\t" 200 "ands ip, %[count], #7 \n\t"
(...skipping 157 matching lines...) Expand 10 before | Expand all | Expand 10 after
306 358
307 "21: \n\t" 359 "21: \n\t"
308 : [count] "+r" (count) 360 : [count] "+r" (count)
309 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 361 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
310 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 362 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
311 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 363 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
312 "d30","d31" 364 "d30","d31"
313 ); 365 );
314 } 366 }
315 } 367 }
368 #endif
316 369
317 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { 370 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
318 prod += vdupq_n_u16(128); 371 prod += vdupq_n_u16(128);
319 prod += vshrq_n_u16(prod, 8); 372 prod += vshrq_n_u16(prod, 8);
320 return vshrq_n_u16(prod, 8); 373 return vshrq_n_u16(prod, 8);
321 } 374 }
322 375
323 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 376 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
324 const SkPMColor* SK_RESTRICT src, int count, 377 const SkPMColor* SK_RESTRICT src, int count,
325 U8CPU alpha, int /*x*/, int /*y*/) { 378 U8CPU alpha, int /*x*/, int /*y*/) {
(...skipping 13 matching lines...) Expand all
339 valpha = vdup_n_u8(alpha); 392 valpha = vdup_n_u8(alpha);
340 vmask_blue = vmovq_n_u16(SK_B16_MASK); 393 vmask_blue = vmovq_n_u16(SK_B16_MASK);
341 394
342 do { 395 do {
343 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; 396 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
344 uint16x8_t vres_a, vres_r, vres_g, vres_b; 397 uint16x8_t vres_a, vres_r, vres_g, vres_b;
345 uint8x8x4_t vsrc; 398 uint8x8x4_t vsrc;
346 399
347 // load pixels 400 // load pixels
348 vdst = vld1q_u16(dst); 401 vdst = vld1q_u16(dst);
402 #ifdef SK_CPU_ARM64
403 vsrc = sk_vld4_u8_arm64_4(src);
404 #else
349 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 405 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
mtklein 2014/06/06 14:41:17 Think it makes sense to follow up and do the same
350 asm ( 406 asm (
351 "vld4.u8 %h[vsrc], [%[src]]!" 407 "vld4.u8 %h[vsrc], [%[src]]!"
352 : [vsrc] "=w" (vsrc), [src] "+&r" (src) 408 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
353 : : 409 : :
354 ); 410 );
355 #else 411 #else
356 register uint8x8_t d0 asm("d0"); 412 register uint8x8_t d0 asm("d0");
357 register uint8x8_t d1 asm("d1"); 413 register uint8x8_t d1 asm("d1");
358 register uint8x8_t d2 asm("d2"); 414 register uint8x8_t d2 asm("d2");
359 register uint8x8_t d3 asm("d3"); 415 register uint8x8_t d3 asm("d3");
360 416
361 asm volatile ( 417 asm volatile (
362 "vld4.u8 {d0-d3},[%[src]]!;" 418 "vld4.u8 {d0-d3},[%[src]]!;"
363 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), 419 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
364 [src] "+&r" (src) 420 [src] "+&r" (src)
365 : : 421 : :
366 ); 422 );
367 vsrc.val[0] = d0; 423 vsrc.val[0] = d0;
368 vsrc.val[1] = d1; 424 vsrc.val[1] = d1;
369 vsrc.val[2] = d2; 425 vsrc.val[2] = d2;
370 vsrc.val[3] = d3; 426 vsrc.val[3] = d3;
371 #endif 427 #endif
428 #endif // #ifdef SK_CPU_ARM64
372 429
373 430
374 // deinterleave dst 431 // deinterleave dst
375 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes 432 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes
376 vdst_b = vdst & vmask_blue; // extract blue 433 vdst_b = vdst & vmask_blue; // extract blue
377 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red 434 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
378 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green 435 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
379 436
380 // shift src to 565 437 // shift src to 565
381 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); 438 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after
461 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 518 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
462 519
463 uint8x8_t vdither = vld1_u8(dstart); // load dither values 520 uint8x8_t vdither = vld1_u8(dstart); // load dither values
464 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values 521 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
465 522
466 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg 523 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg
467 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask 524 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
468 525
469 do { 526 do {
470 527
528 uint8x8x4_t vsrc;
471 uint8x8_t vsrc_r, vsrc_g, vsrc_b; 529 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
472 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; 530 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
473 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; 531 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
474 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; 532 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
475 uint16x8_t vdst; 533 uint16x8_t vdst;
476 uint16x8_t vdst_r, vdst_g, vdst_b; 534 uint16x8_t vdst_r, vdst_g, vdst_b;
477 int16x8_t vres_r, vres_g, vres_b; 535 int16x8_t vres_r, vres_g, vres_b;
478 int8x8_t vres8_r, vres8_g, vres8_b; 536 int8x8_t vres8_r, vres8_g, vres8_b;
479 537
480 // Load source and add dither 538 // Load source and add dither
539 #ifdef SK_CPU_ARM64
540 vsrc = sk_vld4_u8_arm64_3(src);
541 #else
481 { 542 {
482 register uint8x8_t d0 asm("d0"); 543 register uint8x8_t d0 asm("d0");
483 register uint8x8_t d1 asm("d1"); 544 register uint8x8_t d1 asm("d1");
484 register uint8x8_t d2 asm("d2"); 545 register uint8x8_t d2 asm("d2");
485 register uint8x8_t d3 asm("d3"); 546 register uint8x8_t d3 asm("d3");
486 547
487 asm ( 548 asm (
488 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 549 "vld4.8 {d0-d3},[%[src]]! "
489 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 550 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
490 : 551 :
491 ); 552 );
492 vsrc_g = d1; 553 vsrc.val[0] = d0;
493 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 554 vsrc.val[1] = d1;
494 vsrc_r = d2; vsrc_b = d0; 555 vsrc.val[2] = d2;
495 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 556 }
496 vsrc_r = d0; vsrc_b = d2;
497 #endif 557 #endif
498 } 558 vsrc_r = vsrc.val[NEON_R];
559 vsrc_g = vsrc.val[NEON_G];
560 vsrc_b = vsrc.val[NEON_B];
499 561
500 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 562 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
501 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 563 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
502 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 564 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
503 565
504 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen 566 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
505 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen 567
506 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen 568
507 569
508 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result 570
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
569 sb = SkDITHER_B32To565(sb, dither); 631 sb = SkDITHER_B32To565(sb, dither);
570 632
571 uint16_t d = *dst; 633 uint16_t d = *dst;
572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 634 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
573 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 635 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
574 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 636 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
575 DITHER_INC_X(x); 637 DITHER_INC_X(x);
576 } while (--count != 0); 638 } while (--count != 0);
577 } 639 }
578 } 640 }
579 #endif
580 641
581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 642 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
582 const SkPMColor* SK_RESTRICT src, 643 const SkPMColor* SK_RESTRICT src,
583 int count, U8CPU alpha) { 644 int count, U8CPU alpha) {
584 645
585 SkASSERT(255 == alpha); 646 SkASSERT(255 == alpha);
586 if (count > 0) { 647 if (count > 0) {
587 648
588 649
589 uint8x8_t alpha_mask; 650 uint8x8_t alpha_mask;
(...skipping 450 matching lines...) Expand 10 before | Expand all | Expand 10 after
1040 uint16_t *pc = (uint16_t*) p; 1101 uint16_t *pc = (uint16_t*) p;
1041 sprintf(buf,"%8s:", str); 1102 sprintf(buf,"%8s:", str);
1042 len = (len / sizeof(uint16_t)); /* passed as bytes */ 1103 len = (len / sizeof(uint16_t)); /* passed as bytes */
1043 for(i=0;i<len;i++) { 1104 for(i=0;i<len;i++) {
1044 sprintf(tbuf, " %04x", pc[i]); 1105 sprintf(tbuf, " %04x", pc[i]);
1045 strcat(buf, tbuf); 1106 strcat(buf, tbuf);
1046 } 1107 }
1047 SkDebugf("%s\n", buf); 1108 SkDebugf("%s\n", buf);
1048 } 1109 }
1049 #endif 1110 #endif
1111 #endif // #ifdef SK_CPU_ARM32
1050 1112
1051 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 1113 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1052 const SkPMColor* SK_RESTRICT src, 1114 const SkPMColor* SK_RESTRICT src,
1053 int count, U8CPU alpha, int x, int y) { 1115 int count, U8CPU alpha, int x, int y) {
1054 SkASSERT(255 == alpha); 1116 SkASSERT(255 == alpha);
1055 1117
1056 #define UNROLL 8 1118 #define UNROLL 8
1057 1119
1058 if (count >= UNROLL) { 1120 if (count >= UNROLL) {
1059 1121
1060 #if defined(DEBUG_OPAQUE_DITHER) 1122 #if defined(DEBUG_OPAQUE_DITHER)
1061 uint16_t tmpbuf[UNROLL]; 1123 uint16_t tmpbuf[UNROLL];
1062 int td[UNROLL]; 1124 int td[UNROLL];
1063 int tdv[UNROLL]; 1125 int tdv[UNROLL];
1064 int ta[UNROLL]; 1126 int ta[UNROLL];
1065 int tap[UNROLL]; 1127 int tap[UNROLL];
1066 uint16_t in_dst[UNROLL]; 1128 uint16_t in_dst[UNROLL];
1067 int offset = 0; 1129 int offset = 0;
1068 int noisy = 0; 1130 int noisy = 0;
1069 #endif 1131 #endif
1070 1132
1071 uint8x8_t dbase; 1133 uint8x8_t dbase;
1072 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1134 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1073 dbase = vld1_u8(dstart); 1135 dbase = vld1_u8(dstart);
1074 1136
1075 do { 1137 do {
1138 uint8x8x4_t vsrc;
1076 uint8x8_t sr, sg, sb, sa, d; 1139 uint8x8_t sr, sg, sb, sa, d;
1077 uint16x8_t dst8, scale8, alpha8; 1140 uint16x8_t dst8, scale8, alpha8;
1078 uint16x8_t dst_r, dst_g, dst_b; 1141 uint16x8_t dst_r, dst_g, dst_b;
1079 1142
1080 #if defined(DEBUG_OPAQUE_DITHER) 1143 #if defined(DEBUG_OPAQUE_DITHER)
1081 // calculate 8 elements worth into a temp buffer 1144 // calculate 8 elements worth into a temp buffer
1082 { 1145 {
1083 int my_y = y; 1146 int my_y = y;
1084 int my_x = x; 1147 int my_x = x;
1085 SkPMColor* my_src = (SkPMColor*)src; 1148 SkPMColor* my_src = (SkPMColor*)src;
(...skipping 30 matching lines...) Expand all
1116 tmpbuf[i] = *my_dst; 1179 tmpbuf[i] = *my_dst;
1117 ta[i] = tdv[i] = td[i] = 0xbeef; 1180 ta[i] = tdv[i] = td[i] = 0xbeef;
1118 } 1181 }
1119 in_dst[i] = *my_dst; 1182 in_dst[i] = *my_dst;
1120 my_dst += 1; 1183 my_dst += 1;
1121 DITHER_INC_X(my_x); 1184 DITHER_INC_X(my_x);
1122 } 1185 }
1123 } 1186 }
1124 #endif 1187 #endif
1125 1188
1126 1189 #ifdef SK_CPU_ARM64
1190 vsrc = sk_vld4_u8_arm64_4(src);
1191 #else
1127 { 1192 {
1128 register uint8x8_t d0 asm("d0"); 1193 register uint8x8_t d0 asm("d0");
1129 register uint8x8_t d1 asm("d1"); 1194 register uint8x8_t d1 asm("d1");
1130 register uint8x8_t d2 asm("d2"); 1195 register uint8x8_t d2 asm("d2");
1131 register uint8x8_t d3 asm("d3"); 1196 register uint8x8_t d3 asm("d3");
1132 1197
1133 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1198 asm ("vld4.8 {d0-d3},[%[src]]! "
1134 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) 1199 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1135 : 1200 :
1136 ); 1201 );
1137 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 1202 vsrc.val[0] = d0;
1138 sr = d2; sg = d1; sb = d0; sa = d3; 1203 vsrc.val[1] = d1;
1139 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 1204 vsrc.val[2] = d2;
1140 sr = d0; sg = d1; sb = d2; sa = d3; 1205 vsrc.val[3] = d3;
1206 }
1141 #endif 1207 #endif
1142 } 1208 sa = vsrc.val[NEON_A];
1209 sr = vsrc.val[NEON_R];
1210 sg = vsrc.val[NEON_G];
1211 sb = vsrc.val[NEON_B];
1143 1212
1144 /* calculate 'd', which will be 0..7 1213 /* calculate 'd', which will be 0..7
1145 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice 1214 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1146 */ 1215 */
1147 alpha8 = vmovl_u8(dbase); 1216 alpha8 = vmovl_u8(dbase);
1148 alpha8 = vmlal_u8(alpha8, sa, dbase); 1217 alpha8 = vmlal_u8(alpha8, sa, dbase);
1149 d = vshrn_n_u16(alpha8, 8); // narrowing too 1218 d = vshrn_n_u16(alpha8, 8); // narrowing too
1150 1219
1151 // sr = sr - (sr>>5) + d 1220 // sr = sr - (sr>>5) + d
1152 /* watching for 8-bit overflow. d is 0..7; risky range of 1221 /* watching for 8-bit overflow. d is 0..7; risky range of
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after
1276 #define UNROLL 8 1345 #define UNROLL 8
1277 if (count >= UNROLL) { 1346 if (count >= UNROLL) {
1278 uint8x8_t d; 1347 uint8x8_t d;
1279 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1348 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1280 d = vld1_u8(dstart); 1349 d = vld1_u8(dstart);
1281 1350
1282 while (count >= UNROLL) { 1351 while (count >= UNROLL) {
1283 uint8x8_t sr, sg, sb; 1352 uint8x8_t sr, sg, sb;
1284 uint16x8_t dr, dg, db; 1353 uint16x8_t dr, dg, db;
1285 uint16x8_t dst8; 1354 uint16x8_t dst8;
1355 uint8x8x4_t vsrc;
1286 1356
1357 #ifdef SK_CPU_ARM64
1358 vsrc = sk_vld4_u8_arm64_3(src);
1359 #else
1287 { 1360 {
1288 register uint8x8_t d0 asm("d0"); 1361 register uint8x8_t d0 asm("d0");
1289 register uint8x8_t d1 asm("d1"); 1362 register uint8x8_t d1 asm("d1");
1290 register uint8x8_t d2 asm("d2"); 1363 register uint8x8_t d2 asm("d2");
1291 register uint8x8_t d3 asm("d3"); 1364 register uint8x8_t d3 asm("d3");
1292 1365
1293 asm ( 1366 asm (
1294 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1367 "vld4.8 {d0-d3},[%[src]]! "
1295 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 1368 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1296 : 1369 :
1297 ); 1370 );
1298 sg = d1; 1371 vsrc.val[0] = d0;
1299 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 1372 vsrc.val[1] = d1;
1300 sr = d2; sb = d0; 1373 vsrc.val[2] = d2;
1301 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 1374 }
1302 sr = d0; sb = d2;
1303 #endif 1375 #endif
1304 } 1376 sr = vsrc.val[NEON_R];
1377 sg = vsrc.val[NEON_G];
1378 sb = vsrc.val[NEON_B];
1379
1305 /* XXX: if we want to prefetch, hide it in the above asm() 1380 /* XXX: if we want to prefetch, hide it in the above asm()
1306 * using the gcc __builtin_prefetch(), the prefetch will 1381 * using the gcc __builtin_prefetch(), the prefetch will
1307 * fall to the bottom of the loop -- it won't stick up 1382 * fall to the bottom of the loop -- it won't stick up
1308 * at the top of the loop, just after the vld4. 1383 * at the top of the loop, just after the vld4.
1309 */ 1384 */
1310 1385
1311 // sr = sr - (sr>>5) + d 1386 // sr = sr - (sr>>5) + d
1312 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1387 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1313 dr = vaddl_u8(sr, d); 1388 dr = vaddl_u8(sr, d);
1314 1389
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
1362 SkPMColor c = *src++; 1437 SkPMColor c = *src++;
1363 SkPMColorAssert(c); 1438 SkPMColorAssert(c);
1364 SkASSERT(SkGetPackedA32(c) == 255); 1439 SkASSERT(SkGetPackedA32(c) == 255);
1365 1440
1366 unsigned dither = DITHER_VALUE(x); 1441 unsigned dither = DITHER_VALUE(x);
1367 *dst++ = SkDitherRGB32To565(c, dither); 1442 *dst++ = SkDitherRGB32To565(c, dither);
1368 DITHER_INC_X(x); 1443 DITHER_INC_X(x);
1369 } while (--count != 0); 1444 } while (--count != 0);
1370 } 1445 }
1371 } 1446 }
1372 #endif
1373 1447
1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, 1448 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1375 SkPMColor color) { 1449 SkPMColor color) {
1376 if (count <= 0) { 1450 if (count <= 0) {
1377 return; 1451 return;
1378 } 1452 }
1379 1453
1380 if (0 == color) { 1454 if (0 == color) {
1381 if (src != dst) { 1455 if (src != dst) {
1382 memcpy(dst, src, count * sizeof(SkPMColor)); 1456 memcpy(dst, src, count * sizeof(SkPMColor));
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after
1468 *dst = color + SkAlphaMulQ(*src, scale); 1542 *dst = color + SkAlphaMulQ(*src, scale);
1469 src += 1; 1543 src += 1;
1470 dst += 1; 1544 dst += 1;
1471 count--; 1545 count--;
1472 } 1546 }
1473 } 1547 }
1474 1548
1475 /////////////////////////////////////////////////////////////////////////////// 1549 ///////////////////////////////////////////////////////////////////////////////
1476 1550
1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { 1551 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1478 #ifdef SK_CPU_ARM32
1479 // no dither 1552 // no dither
1480 S32_D565_Opaque_neon, 1553 S32_D565_Opaque_neon,
1481 S32_D565_Blend_neon, 1554 S32_D565_Blend_neon,
1555 #ifdef SK_CPU_ARM32
1482 S32A_D565_Opaque_neon, 1556 S32A_D565_Opaque_neon,
1557 #else
1558 NULL,
1559 #endif
1483 S32A_D565_Blend_neon, 1560 S32A_D565_Blend_neon,
1484 1561
1485 // dither 1562 // dither
1486 S32_D565_Opaque_Dither_neon, 1563 S32_D565_Opaque_Dither_neon,
1487 S32_D565_Blend_Dither_neon, 1564 S32_D565_Blend_Dither_neon,
1488 S32A_D565_Opaque_Dither_neon, 1565 S32A_D565_Opaque_Dither_neon,
1489 NULL, // S32A_D565_Blend_Dither 1566 NULL, // S32A_D565_Blend_Dither
1490 #else
1491 NULL, NULL, NULL, NULL,
1492 NULL, NULL, NULL, NULL
1493 #endif
1494 }; 1567 };
1495 1568
1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1569 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1497 NULL, // S32_Opaque, 1570 NULL, // S32_Opaque,
1498 S32_Blend_BlitRow32_neon, // S32_Blend, 1571 S32_Blend_BlitRow32_neon, // S32_Blend,
1499 /* 1572 /*
1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha 1573 * We have two choices for S32A_Opaque procs. The one reads the src alpha
1501 * value and attempts to optimize accordingly. The optimization is 1574 * value and attempts to optimize accordingly. The optimization is
1502 * sensitive to the source content and is not a win in all cases. For 1575 * sensitive to the source content and is not a win in all cases. For
1503 * example, if there are a lot of transitions between the alpha states, 1576 * example, if there are a lot of transitions between the alpha states,
1504 * the performance will almost certainly be worse. However, for many 1577 * the performance will almost certainly be worse. However, for many
1505 * common cases the performance is equivalent or better than the standard 1578 * common cases the performance is equivalent or better than the standard
1506 * case where we do not inspect the src alpha. 1579 * case where we do not inspect the src alpha.
1507 */ 1580 */
1508 #if SK_A32_SHIFT == 24 1581 #if SK_A32_SHIFT == 24
1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1582 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1583 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1511 #else 1584 #else
1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1585 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1513 #endif 1586 #endif
1514 #ifdef SK_CPU_ARM32 1587 #ifdef SK_CPU_ARM32
1515 S32A_Blend_BlitRow32_neon // S32A_Blend 1588 S32A_Blend_BlitRow32_neon // S32A_Blend
1516 #else 1589 #else
1517 NULL 1590 NULL
1518 #endif 1591 #endif
1519 }; 1592 };
OLDNEW
« no previous file with comments | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698