OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 1366 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1377 if (0 == color) { | 1377 if (0 == color) { |
1378 if (src != dst) { | 1378 if (src != dst) { |
1379 memcpy(dst, src, count * sizeof(SkPMColor)); | 1379 memcpy(dst, src, count * sizeof(SkPMColor)); |
1380 } | 1380 } |
1381 return; | 1381 return; |
1382 } | 1382 } |
1383 | 1383 |
1384 unsigned colorA = SkGetPackedA32(color); | 1384 unsigned colorA = SkGetPackedA32(color); |
1385 if (255 == colorA) { | 1385 if (255 == colorA) { |
1386 sk_memset32(dst, color, count); | 1386 sk_memset32(dst, color, count); |
1387 } else { | 1387 return; |
1388 unsigned scale = 256 - SkAlpha255To256(colorA); | 1388 } |
1389 | 1389 |
1390 if (count >= 8) { | 1390 unsigned scale = 256 - SkAlpha255To256(colorA); |
1391 // at the end of this assembly, count will have been decremented | |
1392 // to a negative value. That is, if count mod 8 = x, it will be | |
1393 // -8 +x coming out. | |
1394 asm volatile ( | |
1395 PLD128(src, 0) | |
mtklein
2014/04/29 13:50:30
Was it that these preloads that were detrimental t
kevin.petit
2014/04/29 14:08:56
Nope, these preloads are disabled in all builds as
| |
1396 | 1391 |
1397 "vdup.32 q0, %[color] \n\t" | 1392 if (count >= 8) { |
1393 uint32x4_t vcolor; | |
1394 uint8x8_t vscale; | |
1398 | 1395 |
1399 PLD128(src, 128) | 1396 vcolor = vdupq_n_u32(color); |
1400 | 1397 |
1401 // scale numerical interval [0-255], so load as 8 bits | 1398 // scale numerical interval [0-255], so load as 8 bits |
1402 "vdup.8 d2, %[scale] \n\t" | 1399 vscale = vdup_n_u8(scale); |
1403 | 1400 |
1404 PLD128(src, 256) | 1401 do { |
1402 uint32x2x4_t vsrc; | |
1403 uint16x8x4_t vtmp; | |
mtklein
2014/04/29 13:50:30
Can you move the declarations of vtmp, vres, and v
kevin.petit
2014/04/29 14:08:56
Done.
| |
1404 uint8x16x2_t vres; | |
1405 uint32x4x2_t vdst; | |
1405 | 1406 |
1406 "subs %[count], %[count], #8 \n\t" | 1407 // load src color, 8 pixels, 4 64 bit registers |
1408 // (and increment src). | |
1409 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | |
1410 asm ( | |
1411 "vld1.32 %h[vsrc], [%[src]]!" | |
1412 : [vsrc] "=w" (vsrc), [src] "+r" (src) | |
1413 : : | |
1414 ); | |
1415 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | |
1416 vsrc.val[0] = vld1_u32(src); | |
1417 vsrc.val[1] = vld1_u32(src+2); | |
1418 vsrc.val[2] = vld1_u32(src+4); | |
1419 vsrc.val[3] = vld1_u32(src+6); | |
1420 src += 8; | |
1421 #endif | |
1407 | 1422 |
1408 PLD128(src, 384) | 1423 // multiply long by scale, 64 bits at a time, |
1424 // destination into a 128 bit register. | |
1425 vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale); | |
1426 vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale); | |
1427 vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale); | |
1428 vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale); | |
1409 | 1429 |
1410 "Loop_Color32: \n\t" | 1430 // shift the 128 bit registers, containing the 16 |
1431 // bit scaled values back to 8 bits, narrowing the | |
1432 // results to 64 bit registers. | |
1433 vres.val[0] = vcombine_u8( | |
1434 vshrn_n_u16(vtmp.val[0], 8), | |
1435 vshrn_n_u16(vtmp.val[1], 8)); | |
1436 vres.val[1] = vcombine_u8( | |
1437 vshrn_n_u16(vtmp.val[2], 8), | |
1438 vshrn_n_u16(vtmp.val[3], 8)); | |
1411 | 1439 |
1412 // load src color, 8 pixels, 4 64 bit registers | 1440 // adding back the color, using 128 bit registers. |
1413 // (and increment src). | 1441 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] + |
1414 "vld1.32 {d4-d7}, [%[src]]! \n\t" | 1442 vreinterpretq_u8_u32(vcolor)); |
1443 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] + | |
1444 vreinterpretq_u8_u32(vcolor)); | |
1415 | 1445 |
1416 PLD128(src, 384) | 1446 // store back the 8 calculated pixels (2 128 bit |
1447 // registers), and increment dst. | |
1448 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | |
1449 asm ( | |
1450 "vst1.32 %h[vdst], [%[dst]]!" | |
1451 : [dst] "+r" (dst) | |
1452 : [vdst] "w" (vdst) | |
1453 : "memory" | |
1454 ); | |
1455 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | |
1456 vst1q_u32(dst, vdst.val[0]); | |
1457 vst1q_u32(dst+4, vdst.val[1]); | |
1458 dst += 8; | |
1459 #endif | |
1460 count -= 8; | |
1417 | 1461 |
1418 // multiply long by scale, 64 bits at a time, | 1462 } while (count >= 8); |
1419 // destination into a 128 bit register. | 1463 } |
1420 "vmull.u8 q4, d4, d2 \n\t" | |
1421 "vmull.u8 q5, d5, d2 \n\t" | |
1422 "vmull.u8 q6, d6, d2 \n\t" | |
1423 "vmull.u8 q7, d7, d2 \n\t" | |
1424 | 1464 |
1425 // shift the 128 bit registers, containing the 16 | 1465 while (count > 0) { |
1426 // bit scaled values back to 8 bits, narrowing the | 1466 *dst = color + SkAlphaMulQ(*src, scale); |
1427 // results to 64 bit registers. | 1467 src += 1; |
1428 "vshrn.i16 d8, q4, #8 \n\t" | 1468 dst += 1; |
1429 "vshrn.i16 d9, q5, #8 \n\t" | 1469 count--; |
1430 "vshrn.i16 d10, q6, #8 \n\t" | |
1431 "vshrn.i16 d11, q7, #8 \n\t" | |
1432 | |
1433 // adding back the color, using 128 bit registers. | |
1434 "vadd.i8 q6, q4, q0 \n\t" | |
1435 "vadd.i8 q7, q5, q0 \n\t" | |
1436 | |
1437 // store back the 8 calculated pixels (2 128 bit | |
1438 // registers), and increment dst. | |
1439 "vst1.32 {d12-d15}, [%[dst]]! \n\t" | |
1440 | |
1441 "subs %[count], %[count], #8 \n\t" | |
1442 "bge Loop_Color32 \n\t" | |
1443 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) | |
1444 : [color] "r" (color), [scale] "r" (scale) | |
1445 : "cc", "memory", | |
1446 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", | |
1447 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15" | |
1448 ); | |
1449 // At this point, if we went through the inline assembly, count is | |
1450 // a negative value: | |
1451 // if the value is -8, there is no pixel left to process. | |
1452 // if the value is -7, there is one pixel left to process | |
1453 // ... | |
1454 // And'ing it with 7 will give us the number of pixels | |
1455 // left to process. | |
1456 count = count & 0x7; | |
1457 } | |
1458 | |
1459 while (count > 0) { | |
1460 *dst = color + SkAlphaMulQ(*src, scale); | |
1461 src += 1; | |
1462 dst += 1; | |
1463 count--; | |
1464 } | |
1465 } | 1470 } |
1466 } | 1471 } |
1467 | 1472 |
1468 /////////////////////////////////////////////////////////////////////////////// | 1473 /////////////////////////////////////////////////////////////////////////////// |
1469 | 1474 |
1470 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 1475 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
1471 // no dither | 1476 // no dither |
1472 S32_D565_Opaque_neon, | 1477 S32_D565_Opaque_neon, |
1473 S32_D565_Blend_neon, | 1478 S32_D565_Blend_neon, |
1474 S32A_D565_Opaque_neon, | 1479 S32A_D565_Opaque_neon, |
(...skipping 19 matching lines...) Expand all Loading... | |
1494 * case where we do not inspect the src alpha. | 1499 * case where we do not inspect the src alpha. |
1495 */ | 1500 */ |
1496 #if SK_A32_SHIFT == 24 | 1501 #if SK_A32_SHIFT == 24 |
1497 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1502 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1498 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1503 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1499 #else | 1504 #else |
1500 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1505 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1501 #endif | 1506 #endif |
1502 S32A_Blend_BlitRow32_neon // S32A_Blend | 1507 S32A_Blend_BlitRow32_neon // S32A_Blend |
1503 }; | 1508 }; |
OLD | NEW |