Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 258173005: ARM Skia NEON patches - 36 - Color32 (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 1366 matching lines...) Expand 10 before | Expand all | Expand 10 after
1377 if (0 == color) { 1377 if (0 == color) {
1378 if (src != dst) { 1378 if (src != dst) {
1379 memcpy(dst, src, count * sizeof(SkPMColor)); 1379 memcpy(dst, src, count * sizeof(SkPMColor));
1380 } 1380 }
1381 return; 1381 return;
1382 } 1382 }
1383 1383
1384 unsigned colorA = SkGetPackedA32(color); 1384 unsigned colorA = SkGetPackedA32(color);
1385 if (255 == colorA) { 1385 if (255 == colorA) {
1386 sk_memset32(dst, color, count); 1386 sk_memset32(dst, color, count);
1387 } else { 1387 return;
1388 unsigned scale = 256 - SkAlpha255To256(colorA); 1388 }
1389 1389
1390 if (count >= 8) { 1390 unsigned scale = 256 - SkAlpha255To256(colorA);
1391 // at the end of this assembly, count will have been decremented
1392 // to a negative value. That is, if count mod 8 = x, it will be
1393 // -8 +x coming out.
1394 asm volatile (
1395 PLD128(src, 0)
mtklein 2014/04/29 13:50:30 Was it that these preloads that were detrimental t
kevin.petit 2014/04/29 14:08:56 Nope, these preloads are disabled in all builds as
1396 1391
1397 "vdup.32 q0, %[color] \n\t" 1392 if (count >= 8) {
1393 uint32x4_t vcolor;
1394 uint8x8_t vscale;
1398 1395
1399 PLD128(src, 128) 1396 vcolor = vdupq_n_u32(color);
1400 1397
1401 // scale numerical interval [0-255], so load as 8 bits 1398 // scale numerical interval [0-255], so load as 8 bits
1402 "vdup.8 d2, %[scale] \n\t" 1399 vscale = vdup_n_u8(scale);
1403 1400
1404 PLD128(src, 256) 1401 do {
1402 uint32x2x4_t vsrc;
1403 uint16x8x4_t vtmp;
mtklein 2014/04/29 13:50:30 Can you move the declarations of vtmp, vres, and v
kevin.petit 2014/04/29 14:08:56 Done.
1404 uint8x16x2_t vres;
1405 uint32x4x2_t vdst;
1405 1406
1406 "subs %[count], %[count], #8 \n\t" 1407 // load src color, 8 pixels, 4 64 bit registers
1408 // (and increment src).
1409 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1410 asm (
1411 "vld1.32 %h[vsrc], [%[src]]!"
1412 : [vsrc] "=w" (vsrc), [src] "+r" (src)
1413 : :
1414 );
1415 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1416 vsrc.val[0] = vld1_u32(src);
1417 vsrc.val[1] = vld1_u32(src+2);
1418 vsrc.val[2] = vld1_u32(src+4);
1419 vsrc.val[3] = vld1_u32(src+6);
1420 src += 8;
1421 #endif
1407 1422
1408 PLD128(src, 384) 1423 // multiply long by scale, 64 bits at a time,
1424 // destination into a 128 bit register.
1425 vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
1426 vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
1427 vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
1428 vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
1409 1429
1410 "Loop_Color32: \n\t" 1430 // shift the 128 bit registers, containing the 16
1431 // bit scaled values back to 8 bits, narrowing the
1432 // results to 64 bit registers.
1433 vres.val[0] = vcombine_u8(
1434 vshrn_n_u16(vtmp.val[0], 8),
1435 vshrn_n_u16(vtmp.val[1], 8));
1436 vres.val[1] = vcombine_u8(
1437 vshrn_n_u16(vtmp.val[2], 8),
1438 vshrn_n_u16(vtmp.val[3], 8));
1411 1439
1412 // load src color, 8 pixels, 4 64 bit registers 1440 // adding back the color, using 128 bit registers.
1413 // (and increment src). 1441 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
1414 "vld1.32 {d4-d7}, [%[src]]! \n\t" 1442 vreinterpretq_u8_u32(vcolor));
1443 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
1444 vreinterpretq_u8_u32(vcolor));
1415 1445
1416 PLD128(src, 384) 1446 // store back the 8 calculated pixels (2 128 bit
1447 // registers), and increment dst.
1448 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1449 asm (
1450 "vst1.32 %h[vdst], [%[dst]]!"
1451 : [dst] "+r" (dst)
1452 : [vdst] "w" (vdst)
1453 : "memory"
1454 );
1455 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1456 vst1q_u32(dst, vdst.val[0]);
1457 vst1q_u32(dst+4, vdst.val[1]);
1458 dst += 8;
1459 #endif
1460 count -= 8;
1417 1461
1418 // multiply long by scale, 64 bits at a time, 1462 } while (count >= 8);
1419 // destination into a 128 bit register. 1463 }
1420 "vmull.u8 q4, d4, d2 \n\t"
1421 "vmull.u8 q5, d5, d2 \n\t"
1422 "vmull.u8 q6, d6, d2 \n\t"
1423 "vmull.u8 q7, d7, d2 \n\t"
1424 1464
1425 // shift the 128 bit registers, containing the 16 1465 while (count > 0) {
1426 // bit scaled values back to 8 bits, narrowing the 1466 *dst = color + SkAlphaMulQ(*src, scale);
1427 // results to 64 bit registers. 1467 src += 1;
1428 "vshrn.i16 d8, q4, #8 \n\t" 1468 dst += 1;
1429 "vshrn.i16 d9, q5, #8 \n\t" 1469 count--;
1430 "vshrn.i16 d10, q6, #8 \n\t"
1431 "vshrn.i16 d11, q7, #8 \n\t"
1432
1433 // adding back the color, using 128 bit registers.
1434 "vadd.i8 q6, q4, q0 \n\t"
1435 "vadd.i8 q7, q5, q0 \n\t"
1436
1437 // store back the 8 calculated pixels (2 128 bit
1438 // registers), and increment dst.
1439 "vst1.32 {d12-d15}, [%[dst]]! \n\t"
1440
1441 "subs %[count], %[count], #8 \n\t"
1442 "bge Loop_Color32 \n\t"
1443 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
1444 : [color] "r" (color), [scale] "r" (scale)
1445 : "cc", "memory",
1446 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1447 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
1448 );
1449 // At this point, if we went through the inline assembly, count is
1450 // a negative value:
1451 // if the value is -8, there is no pixel left to process.
1452 // if the value is -7, there is one pixel left to process
1453 // ...
1454 // And'ing it with 7 will give us the number of pixels
1455 // left to process.
1456 count = count & 0x7;
1457 }
1458
1459 while (count > 0) {
1460 *dst = color + SkAlphaMulQ(*src, scale);
1461 src += 1;
1462 dst += 1;
1463 count--;
1464 }
1465 } 1470 }
1466 } 1471 }
1467 1472
1468 /////////////////////////////////////////////////////////////////////////////// 1473 ///////////////////////////////////////////////////////////////////////////////
1469 1474
1470 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { 1475 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1471 // no dither 1476 // no dither
1472 S32_D565_Opaque_neon, 1477 S32_D565_Opaque_neon,
1473 S32_D565_Blend_neon, 1478 S32_D565_Blend_neon,
1474 S32A_D565_Opaque_neon, 1479 S32A_D565_Opaque_neon,
(...skipping 19 matching lines...) Expand all
1494 * case where we do not inspect the src alpha. 1499 * case where we do not inspect the src alpha.
1495 */ 1500 */
1496 #if SK_A32_SHIFT == 24 1501 #if SK_A32_SHIFT == 24
1497 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1502 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1498 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1503 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1499 #else 1504 #else
1500 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1505 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1501 #endif 1506 #endif
1502 S32A_Blend_BlitRow32_neon // S32A_Blend 1507 S32A_Blend_BlitRow32_neon // S32A_Blend
1503 }; 1508 };
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698