src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 258173005: ARM Skia NEON patches - 36 - Color32

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 258173005: ARM Skia NEON patches - 36 - Color32 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 1366 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1377 if (0 == color) {	1377 if (0 == color) {

1378 if (src != dst) {	1378 if (src != dst) {

1379 memcpy(dst, src, count * sizeof(SkPMColor));	1379 memcpy(dst, src, count * sizeof(SkPMColor));

1380 }	1380 }

1381 return;	1381 return;

1382 }	1382 }

1383	1383

1384 unsigned colorA = SkGetPackedA32(color);	1384 unsigned colorA = SkGetPackedA32(color);

1385 if (255 == colorA) {	1385 if (255 == colorA) {

1386 sk_memset32(dst, color, count);	1386 sk_memset32(dst, color, count);

1387 } else {	1387 return;

1388 unsigned scale = 256 - SkAlpha255To256(colorA);	1388 }

1389	1389

1390 if (count >= 8) {	1390 unsigned scale = 256 - SkAlpha255To256(colorA);

1391 // at the end of this assembly, count will have been decremented

1392 // to a negative value. That is, if count mod 8 = x, it will be

1393 // -8 +x coming out.

1394 asm volatile (

1395 PLD128(src, 0)
mtklein 2014/04/29 13:50:30 Was it that these preloads that were detrimental t Was it that these preloads that were detrimental to small-N performance, and are predicted well anyway for large N? kevin.petit 2014/04/29 14:08:56 Nope, these preloads are disabled in all builds as Show quoted text On 2014/04/29 13:50:30, mtklein wrote: > Was it that these preloads that were detrimental to small-N performance, and are > predicted well anyway for large N? Nope, these preloads are disabled in all builds as far as I can tell. My next proposed patch removes all that useless logic (Color32 was the only potential "user"). Usually, the more asm you have and the less the compiler is able to re-order code (even surrounding code). I would say the perf improvement comes from reducing the amount of asm and handling count in a slightly saner way but I haven't done any detailed analysis to back that.
1396	1391

1397 "vdup.32 q0, %[color] \n\t"	1392 if (count >= 8) {

	1393 uint32x4_t vcolor;

	1394 uint8x8_t vscale;

1398	1395

1399 PLD128(src, 128)	1396 vcolor = vdupq_n_u32(color);

1400	1397

1401 // scale numerical interval [0-255], so load as 8 bits	1398 // scale numerical interval [0-255], so load as 8 bits

1402 "vdup.8 d2, %[scale] \n\t"	1399 vscale = vdup_n_u8(scale);

1403	1400

1404 PLD128(src, 256)	1401 do {

	1402 uint32x2x4_t vsrc;

	1403 uint16x8x4_t vtmp;
	mtklein 2014/04/29 13:50:30 Can you move the declarations of vtmp, vres, and v Can you move the declarations of vtmp, vres, and vdst down to just before they're written into (just under the comments)? It'd help me follow the bouncing between 64 and 128 bit registers a bit to see the types right there. kevin.petit 2014/04/29 14:08:56 Done. Show quoted text On 2014/04/29 13:50:30, mtklein wrote: > Can you move the declarations of vtmp, vres, and vdst down to just before > they're written into (just under the comments)? It'd help me follow the > bouncing between 64 and 128 bit registers a bit to see the types right there. Done.
	1404 uint8x16x2_t vres;

	1405 uint32x4x2_t vdst;

1405	1406

1406 "subs %[count], %[count], #8 \n\t"	1407 // load src color, 8 pixels, 4 64 bit registers

	1408 // (and increment src).

	1409 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

	1410 asm (

	1411 "vld1.32 %h[vsrc], [%[src]]!"

	1412 : [vsrc] "=w" (vsrc), [src] "+r" (src)

	1413 : :

	1414 );

	1415 #else // (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

	1416 vsrc.val[0] = vld1_u32(src);

	1417 vsrc.val[1] = vld1_u32(src+2);

	1418 vsrc.val[2] = vld1_u32(src+4);

	1419 vsrc.val[3] = vld1_u32(src+6);

	1420 src += 8;

	1421 #endif

1407	1422

1408 PLD128(src, 384)	1423 // multiply long by scale, 64 bits at a time,

	1424 // destination into a 128 bit register.

	1425 vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);

	1426 vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);

	1427 vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);

	1428 vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);

1409	1429

1410 "Loop_Color32: \n\t"	1430 // shift the 128 bit registers, containing the 16

	1431 // bit scaled values back to 8 bits, narrowing the

	1432 // results to 64 bit registers.

	1433 vres.val[0] = vcombine_u8(

	1434 vshrn_n_u16(vtmp.val[0], 8),

	1435 vshrn_n_u16(vtmp.val[1], 8));

	1436 vres.val[1] = vcombine_u8(

	1437 vshrn_n_u16(vtmp.val[2], 8),

	1438 vshrn_n_u16(vtmp.val[3], 8));

1411	1439

1412 // load src color, 8 pixels, 4 64 bit registers	1440 // adding back the color, using 128 bit registers.

1413 // (and increment src).	1441 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +

1414 "vld1.32 {d4-d7}, [%[src]]! \n\t"	1442 vreinterpretq_u8_u32(vcolor));

	1443 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +

	1444 vreinterpretq_u8_u32(vcolor));

1415	1445

1416 PLD128(src, 384)	1446 // store back the 8 calculated pixels (2 128 bit

	1447 // registers), and increment dst.

	1448 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

	1449 asm (

	1450 "vst1.32 %h[vdst], [%[dst]]!"

	1451 : [dst] "+r" (dst)

	1452 : [vdst] "w" (vdst)

	1453 : "memory"

	1454 );

	1455 #else // (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

	1456 vst1q_u32(dst, vdst.val[0]);

	1457 vst1q_u32(dst+4, vdst.val[1]);

	1458 dst += 8;

	1459 #endif

	1460 count -= 8;

1417	1461

1418 // multiply long by scale, 64 bits at a time,	1462 } while (count >= 8);

1419 // destination into a 128 bit register.	1463 }

1420 "vmull.u8 q4, d4, d2 \n\t"

1421 "vmull.u8 q5, d5, d2 \n\t"

1422 "vmull.u8 q6, d6, d2 \n\t"

1423 "vmull.u8 q7, d7, d2 \n\t"

1424	1464

1425 // shift the 128 bit registers, containing the 16	1465 while (count > 0) {

1426 // bit scaled values back to 8 bits, narrowing the	1466 dst = color + SkAlphaMulQ(src, scale);

1427 // results to 64 bit registers.	1467 src += 1;

1428 "vshrn.i16 d8, q4, #8 \n\t"	1468 dst += 1;

1429 "vshrn.i16 d9, q5, #8 \n\t"	1469 count--;

1430 "vshrn.i16 d10, q6, #8 \n\t"

1431 "vshrn.i16 d11, q7, #8 \n\t"

1432

1433 // adding back the color, using 128 bit registers.

1434 "vadd.i8 q6, q4, q0 \n\t"

1435 "vadd.i8 q7, q5, q0 \n\t"

1436

1437 // store back the 8 calculated pixels (2 128 bit

1438 // registers), and increment dst.

1439 "vst1.32 {d12-d15}, [%[dst]]! \n\t"

1440

1441 "subs %[count], %[count], #8 \n\t"

1442 "bge Loop_Color32 \n\t"

1443 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)

1444 : [color] "r" (color), [scale] "r" (scale)

1445 : "cc", "memory",

1446 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",

1447 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"

1448 );

1449 // At this point, if we went through the inline assembly, count is

1450 // a negative value:

1451 // if the value is -8, there is no pixel left to process.

1452 // if the value is -7, there is one pixel left to process

1453 // ...

1454 // And'ing it with 7 will give us the number of pixels

1455 // left to process.

1456 count = count & 0x7;

1457 }

1458

1459 while (count > 0) {

1460 dst = color + SkAlphaMulQ(src, scale);

1461 src += 1;

1462 dst += 1;

1463 count--;

1464 }

1465 }	1470 }

1466 }	1471 }

1467	1472

1468 ///////////////////////////////////////////////////////////////////////////////	1473 ///////////////////////////////////////////////////////////////////////////////

1469	1474

1470 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {	1475 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {

1471 // no dither	1476 // no dither

1472 S32_D565_Opaque_neon,	1477 S32_D565_Opaque_neon,

1473 S32_D565_Blend_neon,	1478 S32_D565_Blend_neon,

1474 S32A_D565_Opaque_neon,	1479 S32A_D565_Opaque_neon,

(...skipping 19 matching lines...) Expand all Loading...
1494 * case where we do not inspect the src alpha.	1499 * case where we do not inspect the src alpha.

1495 */	1500 */

1496 #if SK_A32_SHIFT == 24	1501 #if SK_A32_SHIFT == 24

1497 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1502 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1498 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1503 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1499 #else	1504 #else

1500 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1505 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1501 #endif	1506 #endif

1502 S32A_Blend_BlitRow32_neon // S32A_Blend	1507 S32A_Blend_BlitRow32_neon // S32A_Blend

1503 };	1508 };

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »