OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 1366 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1377 if (0 == color) { | 1377 if (0 == color) { |
1378 if (src != dst) { | 1378 if (src != dst) { |
1379 memcpy(dst, src, count * sizeof(SkPMColor)); | 1379 memcpy(dst, src, count * sizeof(SkPMColor)); |
1380 } | 1380 } |
1381 return; | 1381 return; |
1382 } | 1382 } |
1383 | 1383 |
1384 unsigned colorA = SkGetPackedA32(color); | 1384 unsigned colorA = SkGetPackedA32(color); |
1385 if (255 == colorA) { | 1385 if (255 == colorA) { |
1386 sk_memset32(dst, color, count); | 1386 sk_memset32(dst, color, count); |
1387 } else { | 1387 return; |
1388 unsigned scale = 256 - SkAlpha255To256(colorA); | 1388 } |
1389 | 1389 |
1390 if (count >= 8) { | 1390 unsigned scale = 256 - SkAlpha255To256(colorA); |
1391 // at the end of this assembly, count will have been decremented | |
1392 // to a negative value. That is, if count mod 8 = x, it will be | |
1393 // -8 +x coming out. | |
1394 asm volatile ( | |
1395 PLD128(src, 0) | |
1396 | 1391 |
1397 "vdup.32 q0, %[color] \n\t" | 1392 if (count >= 8) { |
| 1393 uint32x4_t vcolor; |
| 1394 uint8x8_t vscale; |
1398 | 1395 |
1399 PLD128(src, 128) | 1396 vcolor = vdupq_n_u32(color); |
1400 | 1397 |
1401 // scale numerical interval [0-255], so load as 8 bits | 1398 // scale numerical interval [0-255], so load as 8 bits |
1402 "vdup.8 d2, %[scale] \n\t" | 1399 vscale = vdup_n_u8(scale); |
1403 | 1400 |
1404 PLD128(src, 256) | 1401 do { |
| 1402 // load src color, 8 pixels, 4 64 bit registers |
| 1403 // (and increment src). |
| 1404 uint32x2x4_t vsrc; |
| 1405 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 1406 asm ( |
| 1407 "vld1.32 %h[vsrc], [%[src]]!" |
| 1408 : [vsrc] "=w" (vsrc), [src] "+r" (src) |
| 1409 : : |
| 1410 ); |
| 1411 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 1412 vsrc.val[0] = vld1_u32(src); |
| 1413 vsrc.val[1] = vld1_u32(src+2); |
| 1414 vsrc.val[2] = vld1_u32(src+4); |
| 1415 vsrc.val[3] = vld1_u32(src+6); |
| 1416 src += 8; |
| 1417 #endif |
1405 | 1418 |
1406 "subs %[count], %[count], #8 \n\t" | 1419 // multiply long by scale, 64 bits at a time, |
| 1420 // destination into a 128 bit register. |
| 1421 uint16x8x4_t vtmp; |
| 1422 vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale); |
| 1423 vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale); |
| 1424 vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale); |
| 1425 vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale); |
1407 | 1426 |
1408 PLD128(src, 384) | 1427 // shift the 128 bit registers, containing the 16 |
| 1428 // bit scaled values back to 8 bits, narrowing the |
| 1429 // results to 64 bit registers. |
| 1430 uint8x16x2_t vres; |
| 1431 vres.val[0] = vcombine_u8( |
| 1432 vshrn_n_u16(vtmp.val[0], 8), |
| 1433 vshrn_n_u16(vtmp.val[1], 8)); |
| 1434 vres.val[1] = vcombine_u8( |
| 1435 vshrn_n_u16(vtmp.val[2], 8), |
| 1436 vshrn_n_u16(vtmp.val[3], 8)); |
1409 | 1437 |
1410 "Loop_Color32: \n\t" | 1438 // adding back the color, using 128 bit registers. |
| 1439 uint32x4x2_t vdst; |
| 1440 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] + |
| 1441 vreinterpretq_u8_u32(vcolor)); |
| 1442 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] + |
| 1443 vreinterpretq_u8_u32(vcolor)); |
1411 | 1444 |
1412 // load src color, 8 pixels, 4 64 bit registers | 1445 // store back the 8 calculated pixels (2 128 bit |
1413 // (and increment src). | 1446 // registers), and increment dst. |
1414 "vld1.32 {d4-d7}, [%[src]]! \n\t" | 1447 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 1448 asm ( |
| 1449 "vst1.32 %h[vdst], [%[dst]]!" |
| 1450 : [dst] "+r" (dst) |
| 1451 : [vdst] "w" (vdst) |
| 1452 : "memory" |
| 1453 ); |
| 1454 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 1455 vst1q_u32(dst, vdst.val[0]); |
| 1456 vst1q_u32(dst+4, vdst.val[1]); |
| 1457 dst += 8; |
| 1458 #endif |
| 1459 count -= 8; |
1415 | 1460 |
1416 PLD128(src, 384) | 1461 } while (count >= 8); |
| 1462 } |
1417 | 1463 |
1418 // multiply long by scale, 64 bits at a time, | 1464 while (count > 0) { |
1419 // destination into a 128 bit register. | 1465 *dst = color + SkAlphaMulQ(*src, scale); |
1420 "vmull.u8 q4, d4, d2 \n\t" | 1466 src += 1; |
1421 "vmull.u8 q5, d5, d2 \n\t" | 1467 dst += 1; |
1422 "vmull.u8 q6, d6, d2 \n\t" | 1468 count--; |
1423 "vmull.u8 q7, d7, d2 \n\t" | |
1424 | |
1425 // shift the 128 bit registers, containing the 16 | |
1426 // bit scaled values back to 8 bits, narrowing the | |
1427 // results to 64 bit registers. | |
1428 "vshrn.i16 d8, q4, #8 \n\t" | |
1429 "vshrn.i16 d9, q5, #8 \n\t" | |
1430 "vshrn.i16 d10, q6, #8 \n\t" | |
1431 "vshrn.i16 d11, q7, #8 \n\t" | |
1432 | |
1433 // adding back the color, using 128 bit registers. | |
1434 "vadd.i8 q6, q4, q0 \n\t" | |
1435 "vadd.i8 q7, q5, q0 \n\t" | |
1436 | |
1437 // store back the 8 calculated pixels (2 128 bit | |
1438 // registers), and increment dst. | |
1439 "vst1.32 {d12-d15}, [%[dst]]! \n\t" | |
1440 | |
1441 "subs %[count], %[count], #8 \n\t" | |
1442 "bge Loop_Color32 \n\t" | |
1443 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) | |
1444 : [color] "r" (color), [scale] "r" (scale) | |
1445 : "cc", "memory", | |
1446 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", | |
1447 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15" | |
1448 ); | |
1449 // At this point, if we went through the inline assembly, count is | |
1450 // a negative value: | |
1451 // if the value is -8, there is no pixel left to process. | |
1452 // if the value is -7, there is one pixel left to process | |
1453 // ... | |
1454 // And'ing it with 7 will give us the number of pixels | |
1455 // left to process. | |
1456 count = count & 0x7; | |
1457 } | |
1458 | |
1459 while (count > 0) { | |
1460 *dst = color + SkAlphaMulQ(*src, scale); | |
1461 src += 1; | |
1462 dst += 1; | |
1463 count--; | |
1464 } | |
1465 } | 1469 } |
1466 } | 1470 } |
1467 | 1471 |
1468 /////////////////////////////////////////////////////////////////////////////// | 1472 /////////////////////////////////////////////////////////////////////////////// |
1469 | 1473 |
1470 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1474 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
1471 // no dither | 1475 // no dither |
1472 S32_D565_Opaque_neon, | 1476 S32_D565_Opaque_neon, |
1473 S32_D565_Blend_neon, | 1477 S32_D565_Blend_neon, |
1474 S32A_D565_Opaque_neon, | 1478 S32A_D565_Opaque_neon, |
(...skipping 19 matching lines...) Expand all Loading... |
1494 * case where we do not inspect the src alpha. | 1498 * case where we do not inspect the src alpha. |
1495 */ | 1499 */ |
1496 #if SK_A32_SHIFT == 24 | 1500 #if SK_A32_SHIFT == 24 |
1497 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1501 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1498 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1502 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1499 #else | 1503 #else |
1500 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1504 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1501 #endif | 1505 #endif |
1502 S32A_Blend_BlitRow32_neon // S32A_Blend | 1506 S32A_Blend_BlitRow32_neon // S32A_Blend |
1503 }; | 1507 }; |
OLD | NEW |