Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(111)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 317193003: ARM Skia NEON patches - 39 - arm64 565 blitters (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: ignored-tests.txt Created 6 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h" 11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h" 12 #include "SkColorPriv.h"
13 #include "SkDither.h" 13 #include "SkDither.h"
14 #include "SkMathPriv.h" 14 #include "SkMathPriv.h"
15 #include "SkUtils.h" 15 #include "SkUtils.h"
16 16
17 #include "SkColor_opts_neon.h" 17 #include "SkColor_opts_neon.h"
18 #include <arm_neon.h> 18 #include <arm_neon.h>
19 19
20 #ifdef SK_CPU_ARM32
21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 20 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
22 const SkPMColor* SK_RESTRICT src, int count, 21 const SkPMColor* SK_RESTRICT src, int count,
23 U8CPU alpha, int /*x*/, int /*y*/) { 22 U8CPU alpha, int /*x*/, int /*y*/) {
24 SkASSERT(255 == alpha); 23 SkASSERT(255 == alpha);
25 24
26 while (count >= 8) { 25 while (count >= 8) {
27 uint8x8x4_t vsrc; 26 uint8x8x4_t vsrc;
28 uint16x8_t vdst; 27 uint16x8_t vdst;
29 28
30 // Load 29 // Load
30 #ifdef SK_CPU_ARM64
31 uint8x8_t vsrc_0, vsrc_1, vsrc_2;
32 asm (
33 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n"
mtklein 2014/06/05 16:28:36 Looks like we've got the same load-32-bytes thing
kevin.petit 2014/06/05 17:00:02 There are actually a few more differences. Some us
34 "mov %[vsrc0].8b, v0.8b \t\n"
35 "mov %[vsrc1].8b, v1.8b \t\n"
36 "mov %[vsrc2].8b, v2.8b \t\n"
37 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
38 [vsrc2] "=w" (vsrc_2)
39 : [src] "r" (src)
40 : "v0", "v1", "v2", "v3"
41 );
42 vsrc.val[0] = vsrc_0;
43 vsrc.val[1] = vsrc_1;
44 vsrc.val[2] = vsrc_2;
45
46 #else
31 vsrc = vld4_u8((uint8_t*)src); 47 vsrc = vld4_u8((uint8_t*)src);
48 #endif
32 49
33 // Convert src to 565 50 // Convert src to 565
34 vdst = SkPixel32ToPixel16_neon8(vsrc); 51 vdst = SkPixel32ToPixel16_neon8(vsrc);
35 52
36 // Store 53 // Store
37 vst1q_u16(dst, vdst); 54 vst1q_u16(dst, vdst);
38 55
39 // Prepare next iteration 56 // Prepare next iteration
40 dst += 8; 57 dst += 8;
41 src += 8; 58 src += 8;
(...skipping 15 matching lines...) Expand all
57 U8CPU alpha, int /*x*/, int /*y*/) { 74 U8CPU alpha, int /*x*/, int /*y*/) {
58 SkASSERT(255 > alpha); 75 SkASSERT(255 > alpha);
59 76
60 uint16x8_t vmask_blue, vscale; 77 uint16x8_t vmask_blue, vscale;
61 78
62 // prepare constants 79 // prepare constants
63 vscale = vdupq_n_u16(SkAlpha255To256(alpha)); 80 vscale = vdupq_n_u16(SkAlpha255To256(alpha));
64 vmask_blue = vmovq_n_u16(0x1F); 81 vmask_blue = vmovq_n_u16(0x1F);
65 82
66 while (count >= 8) { 83 while (count >= 8) {
84 uint8x8x4_t vsrc;
67 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; 85 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
68 uint16x8_t vres_r, vres_g, vres_b; 86 uint16x8_t vres_r, vres_g, vres_b;
69 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
70 87
71 // Load src 88 // Load src
89 #ifdef SK_CPU_ARM64
90 uint8x8_t vsrc_0, vsrc_1, vsrc_2;
91 asm (
92 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n"
93 "mov %[vsrc0].8b, v0.8b \t\n"
94 "mov %[vsrc1].8b, v1.8b \t\n"
95 "mov %[vsrc2].8b, v2.8b \t\n"
96 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
97 [vsrc2] "=w" (vsrc_2)
98 : [src] "r" (src)
99 : "v0", "v1", "v2", "v3"
100 );
101 vsrc.val[0] = vsrc_0;
102 vsrc.val[1] = vsrc_1;
103 vsrc.val[2] = vsrc_2;
104 src += 8;
105 #else
72 { 106 {
73 register uint8x8_t d0 asm("d0"); 107 register uint8x8_t d0 asm("d0");
74 register uint8x8_t d1 asm("d1"); 108 register uint8x8_t d1 asm("d1");
75 register uint8x8_t d2 asm("d2"); 109 register uint8x8_t d2 asm("d2");
76 register uint8x8_t d3 asm("d3"); 110 register uint8x8_t d3 asm("d3");
77 111
78 asm ( 112 asm (
79 "vld4.8 {d0-d3},[%[src]]!" 113 "vld4.8 {d0-d3},[%[src]]!"
80 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 114 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
81 : 115 :
82 ); 116 );
83 vsrc_g = d1; 117 vsrc.val[0] = d0;
84 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 118 vsrc.val[1] = d1;
85 vsrc_r = d2; vsrc_b = d0; 119 vsrc.val[2] = d2;
86 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 120 }
87 vsrc_r = d0; vsrc_b = d2;
88 #endif 121 #endif
89 }
90 122
91 // Load and unpack dst 123 // Load and unpack dst
92 vdst = vld1q_u16(dst); 124 vdst = vld1q_u16(dst);
93 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes 125 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes
94 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue 126 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
95 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red 127 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
96 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green 128 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
97 129
98 // Shift src to 565 130 // Shift src to 565 range
99 vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range 131 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
100 vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range 132 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
101 vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range 133 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
102 134
103 // Scale src - dst 135 // Scale src - dst
104 vres_r = vmovl_u8(vsrc_r) - vdst_r; 136 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
105 vres_g = vmovl_u8(vsrc_g) - vdst_g; 137 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
106 vres_b = vmovl_u8(vsrc_b) - vdst_b; 138 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
107 139
108 vres_r = vshrq_n_u16(vres_r * vscale, 8); 140 vres_r = vshrq_n_u16(vres_r * vscale, 8);
109 vres_g = vshrq_n_u16(vres_g * vscale, 8); 141 vres_g = vshrq_n_u16(vres_g * vscale, 8);
110 vres_b = vshrq_n_u16(vres_b * vscale, 8); 142 vres_b = vshrq_n_u16(vres_b * vscale, 8);
111 143
112 vres_r += vdst_r; 144 vres_r += vdst_r;
113 vres_g += vdst_g; 145 vres_g += vdst_g;
114 vres_b += vdst_b; 146 vres_b += vdst_b;
115 147
116 // Combine 148 // Combine
(...skipping 12 matching lines...) Expand all
129 SkPMColorAssert(c); 161 SkPMColorAssert(c);
130 uint16_t d = *dst; 162 uint16_t d = *dst;
131 *dst++ = SkPackRGB16( 163 *dst++ = SkPackRGB16(
132 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), 164 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
133 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), 165 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
134 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); 166 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
135 } while (--count != 0); 167 } while (--count != 0);
136 } 168 }
137 } 169 }
138 170
171 #ifdef SK_CPU_ARM32
139 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 172 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
140 const SkPMColor* SK_RESTRICT src, int count, 173 const SkPMColor* SK_RESTRICT src, int count,
141 U8CPU alpha, int /*x*/, int /*y*/) { 174 U8CPU alpha, int /*x*/, int /*y*/) {
142 SkASSERT(255 == alpha); 175 SkASSERT(255 == alpha);
143 176
144 if (count >= 8) { 177 if (count >= 8) {
145 uint16_t* SK_RESTRICT keep_dst = 0; 178 uint16_t* SK_RESTRICT keep_dst = 0;
146 179
147 asm volatile ( 180 asm volatile (
148 "ands ip, %[count], #7 \n\t" 181 "ands ip, %[count], #7 \n\t"
(...skipping 157 matching lines...) Expand 10 before | Expand all | Expand 10 after
306 339
307 "21: \n\t" 340 "21: \n\t"
308 : [count] "+r" (count) 341 : [count] "+r" (count)
309 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s rc) 342 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s rc)
310 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7", 343 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7",
311 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29", 344 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29",
312 "d30","d31" 345 "d30","d31"
313 ); 346 );
314 } 347 }
315 } 348 }
349 #endif
316 350
317 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { 351 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
318 prod += vdupq_n_u16(128); 352 prod += vdupq_n_u16(128);
319 prod += vshrq_n_u16(prod, 8); 353 prod += vshrq_n_u16(prod, 8);
320 return vshrq_n_u16(prod, 8); 354 return vshrq_n_u16(prod, 8);
321 } 355 }
322 356
323 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 357 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
324 const SkPMColor* SK_RESTRICT src, int count, 358 const SkPMColor* SK_RESTRICT src, int count,
325 U8CPU alpha, int /*x*/, int /*y*/) { 359 U8CPU alpha, int /*x*/, int /*y*/) {
(...skipping 13 matching lines...) Expand all
339 valpha = vdup_n_u8(alpha); 373 valpha = vdup_n_u8(alpha);
340 vmask_blue = vmovq_n_u16(SK_B16_MASK); 374 vmask_blue = vmovq_n_u16(SK_B16_MASK);
341 375
342 do { 376 do {
343 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; 377 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
344 uint16x8_t vres_a, vres_r, vres_g, vres_b; 378 uint16x8_t vres_a, vres_r, vres_g, vres_b;
345 uint8x8x4_t vsrc; 379 uint8x8x4_t vsrc;
346 380
347 // load pixels 381 // load pixels
348 vdst = vld1q_u16(dst); 382 vdst = vld1q_u16(dst);
383 #ifdef SK_CPU_ARM64
384 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
385 asm (
386 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n"
387 "mov %[vsrc0].8b, v0.8b \t\n"
388 "mov %[vsrc1].8b, v1.8b \t\n"
389 "mov %[vsrc2].8b, v2.8b \t\n"
390 "mov %[vsrc3].8b, v3.8b \t\n"
391 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
392 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3)
393 : [src] "r" (src)
394 : "v0", "v1", "v2", "v3"
395 );
396 src += 8;
397 vsrc.val[0] = vsrc_0;
398 vsrc.val[1] = vsrc_1;
399 vsrc.val[2] = vsrc_2;
400 vsrc.val[3] = vsrc_3;
401 #else
349 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 402 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
350 asm ( 403 asm (
351 "vld4.u8 %h[vsrc], [%[src]]!" 404 "vld4.u8 %h[vsrc], [%[src]]!"
352 : [vsrc] "=w" (vsrc), [src] "+&r" (src) 405 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
353 : : 406 : :
354 ); 407 );
355 #else 408 #else
356 register uint8x8_t d0 asm("d0"); 409 register uint8x8_t d0 asm("d0");
357 register uint8x8_t d1 asm("d1"); 410 register uint8x8_t d1 asm("d1");
358 register uint8x8_t d2 asm("d2"); 411 register uint8x8_t d2 asm("d2");
359 register uint8x8_t d3 asm("d3"); 412 register uint8x8_t d3 asm("d3");
360 413
361 asm volatile ( 414 asm volatile (
362 "vld4.u8 {d0-d3},[%[src]]!;" 415 "vld4.u8 {d0-d3},[%[src]]!;"
363 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), 416 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
364 [src] "+&r" (src) 417 [src] "+&r" (src)
365 : : 418 : :
366 ); 419 );
367 vsrc.val[0] = d0; 420 vsrc.val[0] = d0;
368 vsrc.val[1] = d1; 421 vsrc.val[1] = d1;
369 vsrc.val[2] = d2; 422 vsrc.val[2] = d2;
370 vsrc.val[3] = d3; 423 vsrc.val[3] = d3;
371 #endif 424 #endif
425 #endif // #ifdef SK_CPU_ARM64
372 426
373 427
374 // deinterleave dst 428 // deinterleave dst
375 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to p of lanes 429 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to p of lanes
376 vdst_b = vdst & vmask_blue; // extract blue 430 vdst_b = vdst & vmask_blue; // extract blue
377 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red 431 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
378 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green 432 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
379 433
380 // shift src to 565 434 // shift src to 565
381 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); 435 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after
461 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 515 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
462 516
463 uint8x8_t vdither = vld1_u8(dstart); // load dither values 517 uint8x8_t vdither = vld1_u8(dstart); // load dither values
464 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither value s 518 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither value s
465 519
466 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into ne on reg 520 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into ne on reg
467 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask 521 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
468 522
469 do { 523 do {
470 524
525 uint8x8x4_t vsrc;
471 uint8x8_t vsrc_r, vsrc_g, vsrc_b; 526 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
472 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; 527 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
473 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; 528 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
474 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; 529 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
475 uint16x8_t vdst; 530 uint16x8_t vdst;
476 uint16x8_t vdst_r, vdst_g, vdst_b; 531 uint16x8_t vdst_r, vdst_g, vdst_b;
477 int16x8_t vres_r, vres_g, vres_b; 532 int16x8_t vres_r, vres_g, vres_b;
478 int8x8_t vres8_r, vres8_g, vres8_b; 533 int8x8_t vres8_r, vres8_g, vres8_b;
479 534
480 // Load source and add dither 535 // Load source and add dither
536 #ifdef SK_CPU_ARM64
537 uint8x8_t vsrc_0, vsrc_1, vsrc_2;
538 asm (
539 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n"
540 "mov %[vsrc0].8b, v0.8b \t\n"
541 "mov %[vsrc1].8b, v1.8b \t\n"
542 "mov %[vsrc2].8b, v2.8b \t\n"
543 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
544 [vsrc2] "=w" (vsrc_2)
545 : [src] "r" (src)
546 : "v0", "v1", "v2", "v3"
547 );
548
549 src += 8;
550
551 vsrc.val[0] = vsrc_0;
552 vsrc.val[1] = vsrc_1;
553 vsrc.val[2] = vsrc_2;
554 #else
481 { 555 {
482 register uint8x8_t d0 asm("d0"); 556 register uint8x8_t d0 asm("d0");
483 register uint8x8_t d1 asm("d1"); 557 register uint8x8_t d1 asm("d1");
484 register uint8x8_t d2 asm("d2"); 558 register uint8x8_t d2 asm("d2");
485 register uint8x8_t d3 asm("d3"); 559 register uint8x8_t d3 asm("d3");
486 560
487 asm ( 561 asm (
488 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 562 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
489 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 563 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
490 : 564 :
491 ); 565 );
492 vsrc_g = d1; 566 vsrc.val[0] = d0;
493 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 567 vsrc.val[1] = d1;
494 vsrc_r = d2; vsrc_b = d0; 568 vsrc.val[2] = d2;
495 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 569 }
496 vsrc_r = d0; vsrc_b = d2;
497 #endif 570 #endif
498 } 571 vsrc_r = vsrc.val[NEON_R];
572 vsrc_g = vsrc.val[NEON_G];
573 vsrc_b = vsrc.val[NEON_B];
499 574
500 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 575 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
501 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 576 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
502 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 577 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
503 578
504 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen 579 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
505 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red an d widen 580 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red an d widen
506 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue a nd widen 581 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue a nd widen
507 582
508 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red fr om result 583 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red fr om result
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
569 sb = SkDITHER_B32To565(sb, dither); 644 sb = SkDITHER_B32To565(sb, dither);
570 645
571 uint16_t d = *dst; 646 uint16_t d = *dst;
572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 647 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
573 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 648 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
574 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 649 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
575 DITHER_INC_X(x); 650 DITHER_INC_X(x);
576 } while (--count != 0); 651 } while (--count != 0);
577 } 652 }
578 } 653 }
579 #endif
580 654
581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 655 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
582 const SkPMColor* SK_RESTRICT src, 656 const SkPMColor* SK_RESTRICT src,
583 int count, U8CPU alpha) { 657 int count, U8CPU alpha) {
584 658
585 SkASSERT(255 == alpha); 659 SkASSERT(255 == alpha);
586 if (count > 0) { 660 if (count > 0) {
587 661
588 662
589 uint8x8_t alpha_mask; 663 uint8x8_t alpha_mask;
(...skipping 450 matching lines...) Expand 10 before | Expand all | Expand 10 after
1040 uint16_t *pc = (uint16_t*) p; 1114 uint16_t *pc = (uint16_t*) p;
1041 sprintf(buf,"%8s:", str); 1115 sprintf(buf,"%8s:", str);
1042 len = (len / sizeof(uint16_t)); /* passed as bytes */ 1116 len = (len / sizeof(uint16_t)); /* passed as bytes */
1043 for(i=0;i<len;i++) { 1117 for(i=0;i<len;i++) {
1044 sprintf(tbuf, " %04x", pc[i]); 1118 sprintf(tbuf, " %04x", pc[i]);
1045 strcat(buf, tbuf); 1119 strcat(buf, tbuf);
1046 } 1120 }
1047 SkDebugf("%s\n", buf); 1121 SkDebugf("%s\n", buf);
1048 } 1122 }
1049 #endif 1123 #endif
1124 #endif // #ifdef SK_CPU_ARM32
1050 1125
1051 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 1126 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1052 const SkPMColor* SK_RESTRICT src, 1127 const SkPMColor* SK_RESTRICT src,
1053 int count, U8CPU alpha, int x, int y) { 1128 int count, U8CPU alpha, int x, int y) {
1054 SkASSERT(255 == alpha); 1129 SkASSERT(255 == alpha);
1055 1130
1056 #define UNROLL 8 1131 #define UNROLL 8
1057 1132
1058 if (count >= UNROLL) { 1133 if (count >= UNROLL) {
1059 1134
1060 #if defined(DEBUG_OPAQUE_DITHER) 1135 #if defined(DEBUG_OPAQUE_DITHER)
1061 uint16_t tmpbuf[UNROLL]; 1136 uint16_t tmpbuf[UNROLL];
1062 int td[UNROLL]; 1137 int td[UNROLL];
1063 int tdv[UNROLL]; 1138 int tdv[UNROLL];
1064 int ta[UNROLL]; 1139 int ta[UNROLL];
1065 int tap[UNROLL]; 1140 int tap[UNROLL];
1066 uint16_t in_dst[UNROLL]; 1141 uint16_t in_dst[UNROLL];
1067 int offset = 0; 1142 int offset = 0;
1068 int noisy = 0; 1143 int noisy = 0;
1069 #endif 1144 #endif
1070 1145
1071 uint8x8_t dbase; 1146 uint8x8_t dbase;
1072 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1147 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1073 dbase = vld1_u8(dstart); 1148 dbase = vld1_u8(dstart);
1074 1149
1075 do { 1150 do {
1151 uint8x8x4_t vsrc;
1076 uint8x8_t sr, sg, sb, sa, d; 1152 uint8x8_t sr, sg, sb, sa, d;
1077 uint16x8_t dst8, scale8, alpha8; 1153 uint16x8_t dst8, scale8, alpha8;
1078 uint16x8_t dst_r, dst_g, dst_b; 1154 uint16x8_t dst_r, dst_g, dst_b;
1079 1155
1080 #if defined(DEBUG_OPAQUE_DITHER) 1156 #if defined(DEBUG_OPAQUE_DITHER)
1081 // calculate 8 elements worth into a temp buffer 1157 // calculate 8 elements worth into a temp buffer
1082 { 1158 {
1083 int my_y = y; 1159 int my_y = y;
1084 int my_x = x; 1160 int my_x = x;
1085 SkPMColor* my_src = (SkPMColor*)src; 1161 SkPMColor* my_src = (SkPMColor*)src;
(...skipping 30 matching lines...) Expand all
1116 tmpbuf[i] = *my_dst; 1192 tmpbuf[i] = *my_dst;
1117 ta[i] = tdv[i] = td[i] = 0xbeef; 1193 ta[i] = tdv[i] = td[i] = 0xbeef;
1118 } 1194 }
1119 in_dst[i] = *my_dst; 1195 in_dst[i] = *my_dst;
1120 my_dst += 1; 1196 my_dst += 1;
1121 DITHER_INC_X(my_x); 1197 DITHER_INC_X(my_x);
1122 } 1198 }
1123 } 1199 }
1124 #endif 1200 #endif
1125 1201
1202 #ifdef SK_CPU_ARM64
1203 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
1204 asm (
1205 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n"
1206 "mov %[vsrc0].8b, v0.8b \t\n"
1207 "mov %[vsrc1].8b, v1.8b \t\n"
1208 "mov %[vsrc2].8b, v2.8b \t\n"
1209 "mov %[vsrc3].8b, v3.8b \t\n"
1210 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
1211 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3)
1212 : [src] "r" (src)
1213 : "v0", "v1", "v2", "v3"
1214 );
1126 1215
1216 src += 8;
1217 vsrc.val[0] = vsrc_0;
1218 vsrc.val[1] = vsrc_1;
1219 vsrc.val[2] = vsrc_2;
1220 vsrc.val[3] = vsrc_3;
1221 #else
1127 { 1222 {
1128 register uint8x8_t d0 asm("d0"); 1223 register uint8x8_t d0 asm("d0");
1129 register uint8x8_t d1 asm("d1"); 1224 register uint8x8_t d1 asm("d1");
1130 register uint8x8_t d2 asm("d2"); 1225 register uint8x8_t d2 asm("d2");
1131 register uint8x8_t d3 asm("d3"); 1226 register uint8x8_t d3 asm("d3");
1132 1227
1133 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1228 asm ("vld4.8 {d0-d3},[%[src]]! "
1134 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) 1229 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1135 : 1230 :
1136 ); 1231 );
1137 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 1232 vsrc.val[0] = d0;
1138 sr = d2; sg = d1; sb = d0; sa = d3; 1233 vsrc.val[1] = d1;
1139 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 1234 vsrc.val[2] = d2;
1140 sr = d0; sg = d1; sb = d2; sa = d3; 1235 vsrc.val[3] = d3;
1236 }
1141 #endif 1237 #endif
1142 } 1238 sa = vsrc.val[NEON_A];
1239 sr = vsrc.val[NEON_R];
1240 sg = vsrc.val[NEON_G];
1241 sb = vsrc.val[NEON_B];
1143 1242
1144 /* calculate 'd', which will be 0..7 1243 /* calculate 'd', which will be 0..7
1145 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice 1244 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1146 */ 1245 */
1147 alpha8 = vmovl_u8(dbase); 1246 alpha8 = vmovl_u8(dbase);
1148 alpha8 = vmlal_u8(alpha8, sa, dbase); 1247 alpha8 = vmlal_u8(alpha8, sa, dbase);
1149 d = vshrn_n_u16(alpha8, 8); // narrowing too 1248 d = vshrn_n_u16(alpha8, 8); // narrowing too
1150 1249
1151 // sr = sr - (sr>>5) + d 1250 // sr = sr - (sr>>5) + d
1152 /* watching for 8-bit overflow. d is 0..7; risky range of 1251 /* watching for 8-bit overflow. d is 0..7; risky range of
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after
1276 #define UNROLL 8 1375 #define UNROLL 8
1277 if (count >= UNROLL) { 1376 if (count >= UNROLL) {
1278 uint8x8_t d; 1377 uint8x8_t d;
1279 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1378 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1280 d = vld1_u8(dstart); 1379 d = vld1_u8(dstart);
1281 1380
1282 while (count >= UNROLL) { 1381 while (count >= UNROLL) {
1283 uint8x8_t sr, sg, sb; 1382 uint8x8_t sr, sg, sb;
1284 uint16x8_t dr, dg, db; 1383 uint16x8_t dr, dg, db;
1285 uint16x8_t dst8; 1384 uint16x8_t dst8;
1385 uint8x8x4_t vsrc;
1286 1386
1387 #ifdef SK_CPU_ARM64
1388 uint8x8_t vsrc_0, vsrc_1, vsrc_2;
1389 asm (
1390 "ld4 {v0.8b - v3.8b}, [%[src]] \t\n"
1391 "mov %[vsrc0].8b, v0.8b \t\n"
1392 "mov %[vsrc1].8b, v1.8b \t\n"
1393 "mov %[vsrc2].8b, v2.8b \t\n"
1394 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
1395 [vsrc2] "=w" (vsrc_2)
1396 : [src] "r" (src)
1397 : "v0", "v1", "v2", "v3"
1398 );
1399 src += 8;
1400
1401 vsrc.val[0] = vsrc_0;
1402 vsrc.val[1] = vsrc_1;
1403 vsrc.val[2] = vsrc_2;
1404 #else
1287 { 1405 {
1288 register uint8x8_t d0 asm("d0"); 1406 register uint8x8_t d0 asm("d0");
1289 register uint8x8_t d1 asm("d1"); 1407 register uint8x8_t d1 asm("d1");
1290 register uint8x8_t d2 asm("d2"); 1408 register uint8x8_t d2 asm("d2");
1291 register uint8x8_t d3 asm("d3"); 1409 register uint8x8_t d3 asm("d3");
1292 1410
1293 asm ( 1411 asm (
1294 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1412 "vld4.8 {d0-d3},[%[src]]! "
1295 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 1413 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1296 : 1414 :
1297 ); 1415 );
1298 sg = d1; 1416 vsrc.val[0] = d0;
1299 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 1417 vsrc.val[1] = d1;
1300 sr = d2; sb = d0; 1418 vsrc.val[2] = d2;
1301 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 1419 }
1302 sr = d0; sb = d2;
1303 #endif 1420 #endif
1304 } 1421 sr = vsrc.val[NEON_R];
1422 sg = vsrc.val[NEON_G];
1423 sb = vsrc.val[NEON_B];
1424
1305 /* XXX: if we want to prefetch, hide it in the above asm() 1425 /* XXX: if we want to prefetch, hide it in the above asm()
1306 * using the gcc __builtin_prefetch(), the prefetch will 1426 * using the gcc __builtin_prefetch(), the prefetch will
1307 * fall to the bottom of the loop -- it won't stick up 1427 * fall to the bottom of the loop -- it won't stick up
1308 * at the top of the loop, just after the vld4. 1428 * at the top of the loop, just after the vld4.
1309 */ 1429 */
1310 1430
1311 // sr = sr - (sr>>5) + d 1431 // sr = sr - (sr>>5) + d
1312 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1432 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1313 dr = vaddl_u8(sr, d); 1433 dr = vaddl_u8(sr, d);
1314 1434
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
1362 SkPMColor c = *src++; 1482 SkPMColor c = *src++;
1363 SkPMColorAssert(c); 1483 SkPMColorAssert(c);
1364 SkASSERT(SkGetPackedA32(c) == 255); 1484 SkASSERT(SkGetPackedA32(c) == 255);
1365 1485
1366 unsigned dither = DITHER_VALUE(x); 1486 unsigned dither = DITHER_VALUE(x);
1367 *dst++ = SkDitherRGB32To565(c, dither); 1487 *dst++ = SkDitherRGB32To565(c, dither);
1368 DITHER_INC_X(x); 1488 DITHER_INC_X(x);
1369 } while (--count != 0); 1489 } while (--count != 0);
1370 } 1490 }
1371 } 1491 }
1372 #endif
1373 1492
1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, 1493 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1375 SkPMColor color) { 1494 SkPMColor color) {
1376 if (count <= 0) { 1495 if (count <= 0) {
1377 return; 1496 return;
1378 } 1497 }
1379 1498
1380 if (0 == color) { 1499 if (0 == color) {
1381 if (src != dst) { 1500 if (src != dst) {
1382 memcpy(dst, src, count * sizeof(SkPMColor)); 1501 memcpy(dst, src, count * sizeof(SkPMColor));
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after
1468 *dst = color + SkAlphaMulQ(*src, scale); 1587 *dst = color + SkAlphaMulQ(*src, scale);
1469 src += 1; 1588 src += 1;
1470 dst += 1; 1589 dst += 1;
1471 count--; 1590 count--;
1472 } 1591 }
1473 } 1592 }
1474 1593
1475 /////////////////////////////////////////////////////////////////////////////// 1594 ///////////////////////////////////////////////////////////////////////////////
1476 1595
1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { 1596 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1478 #ifdef SK_CPU_ARM32
1479 // no dither 1597 // no dither
1480 S32_D565_Opaque_neon, 1598 S32_D565_Opaque_neon,
1481 S32_D565_Blend_neon, 1599 S32_D565_Blend_neon,
1600 #ifdef SK_CPU_ARM32
1482 S32A_D565_Opaque_neon, 1601 S32A_D565_Opaque_neon,
1602 #else
1603 NULL,
1604 #endif
1483 S32A_D565_Blend_neon, 1605 S32A_D565_Blend_neon,
1484 1606
1485 // dither 1607 // dither
1486 S32_D565_Opaque_Dither_neon, 1608 S32_D565_Opaque_Dither_neon,
1487 S32_D565_Blend_Dither_neon, 1609 S32_D565_Blend_Dither_neon,
1488 S32A_D565_Opaque_Dither_neon, 1610 S32A_D565_Opaque_Dither_neon,
1489 NULL, // S32A_D565_Blend_Dither 1611 NULL, // S32A_D565_Blend_Dither
1490 #else
1491 NULL, NULL, NULL, NULL,
1492 NULL, NULL, NULL, NULL
1493 #endif
1494 }; 1612 };
1495 1613
1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1614 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1497 NULL, // S32_Opaque, 1615 NULL, // S32_Opaque,
1498 S32_Blend_BlitRow32_neon, // S32_Blend, 1616 S32_Blend_BlitRow32_neon, // S32_Blend,
1499 /* 1617 /*
1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha 1618 * We have two choices for S32A_Opaque procs. The one reads the src alpha
1501 * value and attempts to optimize accordingly. The optimization is 1619 * value and attempts to optimize accordingly. The optimization is
1502 * sensitive to the source content and is not a win in all cases. For 1620 * sensitive to the source content and is not a win in all cases. For
1503 * example, if there are a lot of transitions between the alpha states, 1621 * example, if there are a lot of transitions between the alpha states,
1504 * the performance will almost certainly be worse. However, for many 1622 * the performance will almost certainly be worse. However, for many
1505 * common cases the performance is equivalent or better than the standard 1623 * common cases the performance is equivalent or better than the standard
1506 * case where we do not inspect the src alpha. 1624 * case where we do not inspect the src alpha.
1507 */ 1625 */
1508 #if SK_A32_SHIFT == 24 1626 #if SK_A32_SHIFT == 24
1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1627 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1628 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1511 #else 1629 #else
1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1630 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1513 #endif 1631 #endif
1514 #ifdef SK_CPU_ARM32 1632 #ifdef SK_CPU_ARM32
1515 S32A_Blend_BlitRow32_neon // S32A_Blend 1633 S32A_Blend_BlitRow32_neon // S32A_Blend
1516 #else 1634 #else
1517 NULL 1635 NULL
1518 #endif 1636 #endif
1519 }; 1637 };
OLDNEW
« no previous file with comments | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698