src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 258173005: ARM Skia NEON patches - 36 - Color32

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 258173005: ARM Skia NEON patches - 36 - Color32 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Address review comments Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 1366 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1377 if (0 == color) {	1377 if (0 == color) {

1378 if (src != dst) {	1378 if (src != dst) {

1379 memcpy(dst, src, count * sizeof(SkPMColor));	1379 memcpy(dst, src, count * sizeof(SkPMColor));

1380 }	1380 }

1381 return;	1381 return;

1382 }	1382 }

1383	1383

1384 unsigned colorA = SkGetPackedA32(color);	1384 unsigned colorA = SkGetPackedA32(color);

1385 if (255 == colorA) {	1385 if (255 == colorA) {

1386 sk_memset32(dst, color, count);	1386 sk_memset32(dst, color, count);

1387 } else {	1387 return;

1388 unsigned scale = 256 - SkAlpha255To256(colorA);	1388 }

1389	1389

1390 if (count >= 8) {	1390 unsigned scale = 256 - SkAlpha255To256(colorA);

1391 // at the end of this assembly, count will have been decremented

1392 // to a negative value. That is, if count mod 8 = x, it will be

1393 // -8 +x coming out.

1394 asm volatile (

1395 PLD128(src, 0)

1396	1391

1397 "vdup.32 q0, %[color] \n\t"	1392 if (count >= 8) {

	1393 uint32x4_t vcolor;

	1394 uint8x8_t vscale;

1398	1395

1399 PLD128(src, 128)	1396 vcolor = vdupq_n_u32(color);

1400	1397

1401 // scale numerical interval [0-255], so load as 8 bits	1398 // scale numerical interval [0-255], so load as 8 bits

1402 "vdup.8 d2, %[scale] \n\t"	1399 vscale = vdup_n_u8(scale);

1403	1400

1404 PLD128(src, 256)	1401 do {

	1402 // load src color, 8 pixels, 4 64 bit registers

	1403 // (and increment src).

	1404 uint32x2x4_t vsrc;

	1405 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

	1406 asm (

	1407 "vld1.32 %h[vsrc], [%[src]]!"

	1408 : [vsrc] "=w" (vsrc), [src] "+r" (src)

	1409 : :

	1410 );

	1411 #else // (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

	1412 vsrc.val[0] = vld1_u32(src);

	1413 vsrc.val[1] = vld1_u32(src+2);

	1414 vsrc.val[2] = vld1_u32(src+4);

	1415 vsrc.val[3] = vld1_u32(src+6);

	1416 src += 8;

	1417 #endif

1405	1418

1406 "subs %[count], %[count], #8 \n\t"	1419 // multiply long by scale, 64 bits at a time,

	1420 // destination into a 128 bit register.

	1421 uint16x8x4_t vtmp;

	1422 vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);

	1423 vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);

	1424 vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);

	1425 vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);

1407	1426

1408 PLD128(src, 384)	1427 // shift the 128 bit registers, containing the 16

	1428 // bit scaled values back to 8 bits, narrowing the

	1429 // results to 64 bit registers.

	1430 uint8x16x2_t vres;

	1431 vres.val[0] = vcombine_u8(

	1432 vshrn_n_u16(vtmp.val[0], 8),

	1433 vshrn_n_u16(vtmp.val[1], 8));

	1434 vres.val[1] = vcombine_u8(

	1435 vshrn_n_u16(vtmp.val[2], 8),

	1436 vshrn_n_u16(vtmp.val[3], 8));

1409	1437

1410 "Loop_Color32: \n\t"	1438 // adding back the color, using 128 bit registers.

	1439 uint32x4x2_t vdst;

	1440 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +

	1441 vreinterpretq_u8_u32(vcolor));

	1442 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +

	1443 vreinterpretq_u8_u32(vcolor));

1411	1444

1412 // load src color, 8 pixels, 4 64 bit registers	1445 // store back the 8 calculated pixels (2 128 bit

1413 // (and increment src).	1446 // registers), and increment dst.

1414 "vld1.32 {d4-d7}, [%[src]]! \n\t"	1447 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

	1448 asm (

	1449 "vst1.32 %h[vdst], [%[dst]]!"

	1450 : [dst] "+r" (dst)

	1451 : [vdst] "w" (vdst)

	1452 : "memory"

	1453 );

	1454 #else // (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

	1455 vst1q_u32(dst, vdst.val[0]);

	1456 vst1q_u32(dst+4, vdst.val[1]);

	1457 dst += 8;

	1458 #endif

	1459 count -= 8;

1415	1460

1416 PLD128(src, 384)	1461 } while (count >= 8);

	1462 }

1417	1463

1418 // multiply long by scale, 64 bits at a time,	1464 while (count > 0) {

1419 // destination into a 128 bit register.	1465 dst = color + SkAlphaMulQ(src, scale);

1420 "vmull.u8 q4, d4, d2 \n\t"	1466 src += 1;

1421 "vmull.u8 q5, d5, d2 \n\t"	1467 dst += 1;

1422 "vmull.u8 q6, d6, d2 \n\t"	1468 count--;

1423 "vmull.u8 q7, d7, d2 \n\t"

1424

1425 // shift the 128 bit registers, containing the 16

1426 // bit scaled values back to 8 bits, narrowing the

1427 // results to 64 bit registers.

1428 "vshrn.i16 d8, q4, #8 \n\t"

1429 "vshrn.i16 d9, q5, #8 \n\t"

1430 "vshrn.i16 d10, q6, #8 \n\t"

1431 "vshrn.i16 d11, q7, #8 \n\t"

1432

1433 // adding back the color, using 128 bit registers.

1434 "vadd.i8 q6, q4, q0 \n\t"

1435 "vadd.i8 q7, q5, q0 \n\t"

1436

1437 // store back the 8 calculated pixels (2 128 bit

1438 // registers), and increment dst.

1439 "vst1.32 {d12-d15}, [%[dst]]! \n\t"

1440

1441 "subs %[count], %[count], #8 \n\t"

1442 "bge Loop_Color32 \n\t"

1443 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)

1444 : [color] "r" (color), [scale] "r" (scale)

1445 : "cc", "memory",

1446 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",

1447 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"

1448 );

1449 // At this point, if we went through the inline assembly, count is

1450 // a negative value:

1451 // if the value is -8, there is no pixel left to process.

1452 // if the value is -7, there is one pixel left to process

1453 // ...

1454 // And'ing it with 7 will give us the number of pixels

1455 // left to process.

1456 count = count & 0x7;

1457 }

1458

1459 while (count > 0) {

1460 dst = color + SkAlphaMulQ(src, scale);

1461 src += 1;

1462 dst += 1;

1463 count--;

1464 }

1465 }	1469 }

1466 }	1470 }

1467	1471

1468 ///////////////////////////////////////////////////////////////////////////////	1472 ///////////////////////////////////////////////////////////////////////////////

1469	1473

1470 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {	1474 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {

1471 // no dither	1475 // no dither

1472 S32_D565_Opaque_neon,	1476 S32_D565_Opaque_neon,

1473 S32_D565_Blend_neon,	1477 S32_D565_Blend_neon,

1474 S32A_D565_Opaque_neon,	1478 S32A_D565_Opaque_neon,

(...skipping 19 matching lines...) Expand all Loading...
1494 * case where we do not inspect the src alpha.	1498 * case where we do not inspect the src alpha.

1495 */	1499 */

1496 #if SK_A32_SHIFT == 24	1500 #if SK_A32_SHIFT == 24

1497 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1501 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1498 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1502 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1499 #else	1503 #else

1500 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1504 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1501 #endif	1505 #endif

1502 S32A_Blend_BlitRow32_neon // S32A_Blend	1506 S32A_Blend_BlitRow32_neon // S32A_Blend

1503 };	1507 };

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »