OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
11 #include "SkBlitRow.h" | 11 #include "SkBlitRow.h" |
12 #include "SkColorPriv.h" | 12 #include "SkColorPriv.h" |
13 #include "SkDither.h" | 13 #include "SkDither.h" |
14 #include "SkMathPriv.h" | 14 #include "SkMathPriv.h" |
15 #include "SkUtils.h" | 15 #include "SkUtils.h" |
16 | 16 |
17 #include "SkColor_opts_neon.h" | 17 #include "SkColor_opts_neon.h" |
18 #include <arm_neon.h> | 18 #include <arm_neon.h> |
19 | 19 |
20 #ifdef SK_CPU_ARM | 20 #ifdef SK_CPU_ARM32 |
21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
22 const SkPMColor* SK_RESTRICT src, int count, | 22 const SkPMColor* SK_RESTRICT src, int count, |
23 U8CPU alpha, int /*x*/, int /*y*/) { | 23 U8CPU alpha, int /*x*/, int /*y*/) { |
24 SkASSERT(255 == alpha); | 24 SkASSERT(255 == alpha); |
25 | 25 |
26 while (count >= 8) { | 26 while (count >= 8) { |
27 uint8x8x4_t vsrc; | 27 uint8x8x4_t vsrc; |
28 uint16x8_t vdst; | 28 uint16x8_t vdst; |
29 | 29 |
30 // Load | 30 // Load |
(...skipping 883 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
914 vsrc_wide = vmovl_u8(vsrc); | 914 vsrc_wide = vmovl_u8(vsrc); |
915 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); | 915 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); |
916 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); | 916 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); |
917 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); | 917 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
918 | 918 |
919 // Store | 919 // Store |
920 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); | 920 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); |
921 } | 921 } |
922 } | 922 } |
923 | 923 |
924 #ifdef SK_CPU_ARM | 924 #ifdef SK_CPU_ARM32 |
925 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 925 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
926 const SkPMColor* SK_RESTRICT src, | 926 const SkPMColor* SK_RESTRICT src, |
927 int count, U8CPU alpha) { | 927 int count, U8CPU alpha) { |
928 | 928 |
929 SkASSERT(255 >= alpha); | 929 SkASSERT(255 >= alpha); |
930 | 930 |
931 if (count <= 0) { | 931 if (count <= 0) { |
932 return; | 932 return; |
933 } | 933 } |
934 | 934 |
(...skipping 463 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1398 | 1398 |
1399 vcolor = vdupq_n_u32(color); | 1399 vcolor = vdupq_n_u32(color); |
1400 | 1400 |
1401 // scale numerical interval [0-255], so load as 8 bits | 1401 // scale numerical interval [0-255], so load as 8 bits |
1402 vscale = vdup_n_u8(scale); | 1402 vscale = vdup_n_u8(scale); |
1403 | 1403 |
1404 do { | 1404 do { |
1405 // load src color, 8 pixels, 4 64 bit registers | 1405 // load src color, 8 pixels, 4 64 bit registers |
1406 // (and increment src). | 1406 // (and increment src). |
1407 uint32x2x4_t vsrc; | 1407 uint32x2x4_t vsrc; |
1408 #if defined(SK_CPU_ARM) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR_
_ > 6))) | 1408 #if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINO
R__ > 6))) |
1409 asm ( | 1409 asm ( |
1410 "vld1.32 %h[vsrc], [%[src]]!" | 1410 "vld1.32 %h[vsrc], [%[src]]!" |
1411 : [vsrc] "=w" (vsrc), [src] "+r" (src) | 1411 : [vsrc] "=w" (vsrc), [src] "+r" (src) |
1412 : : | 1412 : : |
1413 ); | 1413 ); |
1414 #else // 64bit targets and Clang | 1414 #else // 64bit targets and Clang |
1415 vsrc.val[0] = vld1_u32(src); | 1415 vsrc.val[0] = vld1_u32(src); |
1416 vsrc.val[1] = vld1_u32(src+2); | 1416 vsrc.val[1] = vld1_u32(src+2); |
1417 vsrc.val[2] = vld1_u32(src+4); | 1417 vsrc.val[2] = vld1_u32(src+4); |
1418 vsrc.val[3] = vld1_u32(src+6); | 1418 vsrc.val[3] = vld1_u32(src+6); |
(...skipping 21 matching lines...) Expand all Loading... |
1440 | 1440 |
1441 // adding back the color, using 128 bit registers. | 1441 // adding back the color, using 128 bit registers. |
1442 uint32x4x2_t vdst; | 1442 uint32x4x2_t vdst; |
1443 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] + | 1443 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] + |
1444 vreinterpretq_u8_u32(vcolor)); | 1444 vreinterpretq_u8_u32(vcolor)); |
1445 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] + | 1445 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] + |
1446 vreinterpretq_u8_u32(vcolor)); | 1446 vreinterpretq_u8_u32(vcolor)); |
1447 | 1447 |
1448 // store back the 8 calculated pixels (2 128 bit | 1448 // store back the 8 calculated pixels (2 128 bit |
1449 // registers), and increment dst. | 1449 // registers), and increment dst. |
1450 #if defined(SK_CPU_ARM) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR_
_ > 6))) | 1450 #if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINO
R__ > 6))) |
1451 asm ( | 1451 asm ( |
1452 "vst1.32 %h[vdst], [%[dst]]!" | 1452 "vst1.32 %h[vdst], [%[dst]]!" |
1453 : [dst] "+r" (dst) | 1453 : [dst] "+r" (dst) |
1454 : [vdst] "w" (vdst) | 1454 : [vdst] "w" (vdst) |
1455 : "memory" | 1455 : "memory" |
1456 ); | 1456 ); |
1457 #else // 64bit targets and Clang | 1457 #else // 64bit targets and Clang |
1458 vst1q_u32(dst, vdst.val[0]); | 1458 vst1q_u32(dst, vdst.val[0]); |
1459 vst1q_u32(dst+4, vdst.val[1]); | 1459 vst1q_u32(dst+4, vdst.val[1]); |
1460 dst += 8; | 1460 dst += 8; |
1461 #endif | 1461 #endif |
1462 count -= 8; | 1462 count -= 8; |
1463 | 1463 |
1464 } while (count >= 8); | 1464 } while (count >= 8); |
1465 } | 1465 } |
1466 | 1466 |
1467 while (count > 0) { | 1467 while (count > 0) { |
1468 *dst = color + SkAlphaMulQ(*src, scale); | 1468 *dst = color + SkAlphaMulQ(*src, scale); |
1469 src += 1; | 1469 src += 1; |
1470 dst += 1; | 1470 dst += 1; |
1471 count--; | 1471 count--; |
1472 } | 1472 } |
1473 } | 1473 } |
1474 | 1474 |
1475 /////////////////////////////////////////////////////////////////////////////// | 1475 /////////////////////////////////////////////////////////////////////////////// |
1476 | 1476 |
1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1477 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
1478 #ifdef SK_CPU_ARM | 1478 #ifdef SK_CPU_ARM32 |
1479 // no dither | 1479 // no dither |
1480 S32_D565_Opaque_neon, | 1480 S32_D565_Opaque_neon, |
1481 S32_D565_Blend_neon, | 1481 S32_D565_Blend_neon, |
1482 S32A_D565_Opaque_neon, | 1482 S32A_D565_Opaque_neon, |
1483 S32A_D565_Blend_neon, | 1483 S32A_D565_Blend_neon, |
1484 | 1484 |
1485 // dither | 1485 // dither |
1486 S32_D565_Opaque_Dither_neon, | 1486 S32_D565_Opaque_Dither_neon, |
1487 S32_D565_Blend_Dither_neon, | 1487 S32_D565_Blend_Dither_neon, |
1488 S32A_D565_Opaque_Dither_neon, | 1488 S32A_D565_Opaque_Dither_neon, |
(...skipping 15 matching lines...) Expand all Loading... |
1504 * the performance will almost certainly be worse. However, for many | 1504 * the performance will almost certainly be worse. However, for many |
1505 * common cases the performance is equivalent or better than the standard | 1505 * common cases the performance is equivalent or better than the standard |
1506 * case where we do not inspect the src alpha. | 1506 * case where we do not inspect the src alpha. |
1507 */ | 1507 */ |
1508 #if SK_A32_SHIFT == 24 | 1508 #if SK_A32_SHIFT == 24 |
1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1509 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1510 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1511 #else | 1511 #else |
1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1512 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1513 #endif | 1513 #endif |
1514 #ifdef SK_CPU_ARM | 1514 #ifdef SK_CPU_ARM32 |
1515 S32A_Blend_BlitRow32_neon // S32A_Blend | 1515 S32A_Blend_BlitRow32_neon // S32A_Blend |
1516 #else | 1516 #else |
1517 NULL | 1517 NULL |
1518 #endif | 1518 #endif |
1519 }; | 1519 }; |
OLD | NEW |