Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(593)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 302283003: ARM Skia NEON patches - 38 - arm64 8888 blitters (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Comments Created 6 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « gyp/opts.gyp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h" 11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h" 12 #include "SkColorPriv.h"
13 #include "SkDither.h" 13 #include "SkDither.h"
14 #include "SkMathPriv.h" 14 #include "SkMathPriv.h"
15 #include "SkUtils.h" 15 #include "SkUtils.h"
16 16
17 #include "SkColor_opts_neon.h" 17 #include "SkColor_opts_neon.h"
18 #include <arm_neon.h> 18 #include <arm_neon.h>
19 19
20 #ifdef SK_CPU_ARM
20 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
21 const SkPMColor* SK_RESTRICT src, int count, 22 const SkPMColor* SK_RESTRICT src, int count,
22 U8CPU alpha, int /*x*/, int /*y*/) { 23 U8CPU alpha, int /*x*/, int /*y*/) {
23 SkASSERT(255 == alpha); 24 SkASSERT(255 == alpha);
24 25
25 while (count >= 8) { 26 while (count >= 8) {
26 uint8x8x4_t vsrc; 27 uint8x8x4_t vsrc;
27 uint16x8_t vdst; 28 uint16x8_t vdst;
28 29
29 // Load 30 // Load
(...skipping 538 matching lines...) Expand 10 before | Expand all | Expand 10 after
568 sb = SkDITHER_B32To565(sb, dither); 569 sb = SkDITHER_B32To565(sb, dither);
569 570
570 uint16_t d = *dst; 571 uint16_t d = *dst;
571 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 572 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
572 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 573 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
573 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 574 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
574 DITHER_INC_X(x); 575 DITHER_INC_X(x);
575 } while (--count != 0); 576 } while (--count != 0);
576 } 577 }
577 } 578 }
579 #endif
578 580
579 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 581 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
580 const SkPMColor* SK_RESTRICT src, 582 const SkPMColor* SK_RESTRICT src,
581 int count, U8CPU alpha) { 583 int count, U8CPU alpha) {
582 584
583 SkASSERT(255 == alpha); 585 SkASSERT(255 == alpha);
584 if (count > 0) { 586 if (count > 0) {
585 587
586 588
587 uint8x8_t alpha_mask; 589 uint8x8_t alpha_mask;
(...skipping 324 matching lines...) Expand 10 before | Expand all | Expand 10 after
912 vsrc_wide = vmovl_u8(vsrc); 914 vsrc_wide = vmovl_u8(vsrc);
913 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 915 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
914 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 916 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
915 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 917 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
916 918
917 // Store 919 // Store
918 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 920 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
919 } 921 }
920 } 922 }
921 923
924 #ifdef SK_CPU_ARM
922 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 925 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
923 const SkPMColor* SK_RESTRICT src, 926 const SkPMColor* SK_RESTRICT src,
924 int count, U8CPU alpha) { 927 int count, U8CPU alpha) {
925 928
926 SkASSERT(255 >= alpha); 929 SkASSERT(255 >= alpha);
927 930
928 if (count <= 0) { 931 if (count <= 0) {
929 return; 932 return;
930 } 933 }
931 934
(...skipping 427 matching lines...) Expand 10 before | Expand all | Expand 10 after
1359 SkPMColor c = *src++; 1362 SkPMColor c = *src++;
1360 SkPMColorAssert(c); 1363 SkPMColorAssert(c);
1361 SkASSERT(SkGetPackedA32(c) == 255); 1364 SkASSERT(SkGetPackedA32(c) == 255);
1362 1365
1363 unsigned dither = DITHER_VALUE(x); 1366 unsigned dither = DITHER_VALUE(x);
1364 *dst++ = SkDitherRGB32To565(c, dither); 1367 *dst++ = SkDitherRGB32To565(c, dither);
1365 DITHER_INC_X(x); 1368 DITHER_INC_X(x);
1366 } while (--count != 0); 1369 } while (--count != 0);
1367 } 1370 }
1368 } 1371 }
1372 #endif
1369 1373
1370 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, 1374 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1371 SkPMColor color) { 1375 SkPMColor color) {
1372 if (count <= 0) { 1376 if (count <= 0) {
1373 return; 1377 return;
1374 } 1378 }
1375 1379
1376 if (0 == color) { 1380 if (0 == color) {
1377 if (src != dst) { 1381 if (src != dst) {
1378 memcpy(dst, src, count * sizeof(SkPMColor)); 1382 memcpy(dst, src, count * sizeof(SkPMColor));
(...skipping 15 matching lines...) Expand all
1394 1398
1395 vcolor = vdupq_n_u32(color); 1399 vcolor = vdupq_n_u32(color);
1396 1400
1397 // scale numerical interval [0-255], so load as 8 bits 1401 // scale numerical interval [0-255], so load as 8 bits
1398 vscale = vdup_n_u8(scale); 1402 vscale = vdup_n_u8(scale);
1399 1403
1400 do { 1404 do {
1401 // load src color, 8 pixels, 4 64 bit registers 1405 // load src color, 8 pixels, 4 64 bit registers
1402 // (and increment src). 1406 // (and increment src).
1403 uint32x2x4_t vsrc; 1407 uint32x2x4_t vsrc;
1404 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 1408 #if defined(SK_CPU_ARM) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR_ _ > 6)))
1405 asm ( 1409 asm (
1406 "vld1.32 %h[vsrc], [%[src]]!" 1410 "vld1.32 %h[vsrc], [%[src]]!"
1407 : [vsrc] "=w" (vsrc), [src] "+r" (src) 1411 : [vsrc] "=w" (vsrc), [src] "+r" (src)
1408 : : 1412 : :
1409 ); 1413 );
1410 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 1414 #else // 64bit targets and Clang
1411 vsrc.val[0] = vld1_u32(src); 1415 vsrc.val[0] = vld1_u32(src);
1412 vsrc.val[1] = vld1_u32(src+2); 1416 vsrc.val[1] = vld1_u32(src+2);
1413 vsrc.val[2] = vld1_u32(src+4); 1417 vsrc.val[2] = vld1_u32(src+4);
1414 vsrc.val[3] = vld1_u32(src+6); 1418 vsrc.val[3] = vld1_u32(src+6);
1415 src += 8; 1419 src += 8;
1416 #endif 1420 #endif
1417 1421
1418 // multiply long by scale, 64 bits at a time, 1422 // multiply long by scale, 64 bits at a time,
1419 // destination into a 128 bit register. 1423 // destination into a 128 bit register.
1420 uint16x8x4_t vtmp; 1424 uint16x8x4_t vtmp;
(...skipping 15 matching lines...) Expand all
1436 1440
1437 // adding back the color, using 128 bit registers. 1441 // adding back the color, using 128 bit registers.
1438 uint32x4x2_t vdst; 1442 uint32x4x2_t vdst;
1439 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] + 1443 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
1440 vreinterpretq_u8_u32(vcolor)); 1444 vreinterpretq_u8_u32(vcolor));
1441 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] + 1445 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
1442 vreinterpretq_u8_u32(vcolor)); 1446 vreinterpretq_u8_u32(vcolor));
1443 1447
1444 // store back the 8 calculated pixels (2 128 bit 1448 // store back the 8 calculated pixels (2 128 bit
1445 // registers), and increment dst. 1449 // registers), and increment dst.
1446 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 1450 #if defined(SK_CPU_ARM) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR_ _ > 6)))
1447 asm ( 1451 asm (
1448 "vst1.32 %h[vdst], [%[dst]]!" 1452 "vst1.32 %h[vdst], [%[dst]]!"
1449 : [dst] "+r" (dst) 1453 : [dst] "+r" (dst)
1450 : [vdst] "w" (vdst) 1454 : [vdst] "w" (vdst)
1451 : "memory" 1455 : "memory"
1452 ); 1456 );
1453 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 1457 #else // 64bit targets and Clang
1454 vst1q_u32(dst, vdst.val[0]); 1458 vst1q_u32(dst, vdst.val[0]);
1455 vst1q_u32(dst+4, vdst.val[1]); 1459 vst1q_u32(dst+4, vdst.val[1]);
1456 dst += 8; 1460 dst += 8;
1457 #endif 1461 #endif
1458 count -= 8; 1462 count -= 8;
1459 1463
1460 } while (count >= 8); 1464 } while (count >= 8);
1461 } 1465 }
1462 1466
1463 while (count > 0) { 1467 while (count > 0) {
1464 *dst = color + SkAlphaMulQ(*src, scale); 1468 *dst = color + SkAlphaMulQ(*src, scale);
1465 src += 1; 1469 src += 1;
1466 dst += 1; 1470 dst += 1;
1467 count--; 1471 count--;
1468 } 1472 }
1469 } 1473 }
1470 1474
1471 /////////////////////////////////////////////////////////////////////////////// 1475 ///////////////////////////////////////////////////////////////////////////////
1472 1476
1473 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { 1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1478 #ifdef SK_CPU_ARM
1474 // no dither 1479 // no dither
1475 S32_D565_Opaque_neon, 1480 S32_D565_Opaque_neon,
1476 S32_D565_Blend_neon, 1481 S32_D565_Blend_neon,
1477 S32A_D565_Opaque_neon, 1482 S32A_D565_Opaque_neon,
1478 S32A_D565_Blend_neon, 1483 S32A_D565_Blend_neon,
1479 1484
1480 // dither 1485 // dither
1481 S32_D565_Opaque_Dither_neon, 1486 S32_D565_Opaque_Dither_neon,
1482 S32_D565_Blend_Dither_neon, 1487 S32_D565_Blend_Dither_neon,
1483 S32A_D565_Opaque_Dither_neon, 1488 S32A_D565_Opaque_Dither_neon,
1484 NULL, // S32A_D565_Blend_Dither 1489 NULL, // S32A_D565_Blend_Dither
1490 #else
1491 NULL, NULL, NULL, NULL,
1492 NULL, NULL, NULL, NULL
1493 #endif
1485 }; 1494 };
1486 1495
1487 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1496 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1488 NULL, // S32_Opaque, 1497 NULL, // S32_Opaque,
1489 S32_Blend_BlitRow32_neon, // S32_Blend, 1498 S32_Blend_BlitRow32_neon, // S32_Blend,
1490 /* 1499 /*
1491 * We have two choices for S32A_Opaque procs. The one reads the src alpha 1500 * We have two choices for S32A_Opaque procs. The one reads the src alpha
1492 * value and attempts to optimize accordingly. The optimization is 1501 * value and attempts to optimize accordingly. The optimization is
1493 * sensitive to the source content and is not a win in all cases. For 1502 * sensitive to the source content and is not a win in all cases. For
1494 * example, if there are a lot of transitions between the alpha states, 1503 * example, if there are a lot of transitions between the alpha states,
1495 * the performance will almost certainly be worse. However, for many 1504 * the performance will almost certainly be worse. However, for many
1496 * common cases the performance is equivalent or better than the standard 1505 * common cases the performance is equivalent or better than the standard
1497 * case where we do not inspect the src alpha. 1506 * case where we do not inspect the src alpha.
1498 */ 1507 */
1499 #if SK_A32_SHIFT == 24 1508 #if SK_A32_SHIFT == 24
1500 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1501 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1502 #else 1511 #else
1503 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1504 #endif 1513 #endif
1514 #ifdef SK_CPU_ARM
1505 S32A_Blend_BlitRow32_neon // S32A_Blend 1515 S32A_Blend_BlitRow32_neon // S32A_Blend
1516 #else
1517 NULL
1518 #endif
1506 }; 1519 };
OLDNEW
« no previous file with comments | « gyp/opts.gyp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698