Chromium Code Reviews| Index: src/opts/SkSwizzler_neon.h |
| diff --git a/src/opts/SkSwizzler_neon.h b/src/opts/SkSwizzler_neon.h |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..88915aaa5a9f0f429671163d12d1a023b1c33ac8 |
| --- /dev/null |
| +++ b/src/opts/SkSwizzler_neon.h |
| @@ -0,0 +1,145 @@ |
| +/* |
| + * Copyright 2016 Google Inc. |
| + * |
| + * Use of this source code is governed by a BSD-style license that can be |
| + * found in the LICENSE file. |
| + */ |
| + |
| +#ifndef SkSwizzler_neon_DEFINED |
| +#define SkSwizzler_neon_DEFINED |
| + |
| +#include "SkColorPriv.h" |
| + |
| +#include <arm_neon.h> |
| + |
| +namespace sk_neon { |
| + |
| +// Premultiplies the first three channels of each 32-bit pixel in src by the |
| +// pixel's fourth channel (alpha) and writes the result to dst, keeping the |
| +// channel order and the alpha value unchanged. |
| +// |
| +// dst   - output pixels; must have room for count pixels. |
| +// src   - input pixels; count pixels are read. |
| +// count - number of pixels to process. |
| +// |
| +// Whole groups of 8 pixels go through NEON; the remaining (count % 8) pixels |
| +// are handled one at a time. |
| +static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
|
mtklein
2016/01/13 15:20:01
I think some of the clarity of this code is gettin
msarett
2016/01/13 16:17:15
Acknowledged.
|
| + int i = 0; |
| + uint8_t* dst8 = (uint8_t*) dst; |
| + const uint8_t* src8 = (const uint8_t*) src; |
| + // Only take the vector path while 8 whole pixels remain: each iteration |
| + // loads and stores 32 bytes, so running it on a partial group would read |
| + // and write past the end of the buffers. |
| + for (; count - i >= 8; i += 8) { |
| + // Load 8 pixels. This load instruction will deinterleave the pixels, |
| + // producing reds, greens, blues, and alphas each in its own vector. |
| + uint8x8x4_t rgba = vld4_u8(src8); |
|
mtklein
2016/01/13 15:20:01
Load 8 pixels.
msarett
2016/01/13 16:17:15
Done.
|
| + |
| + // These should be free operations. |
|
mtklein
2016/01/13 15:20:01
I don't think you need any comment here. I'd sugg
msarett
2016/01/13 16:17:15
Done.
|
| + uint8x8_t reds = rgba.val[0]; |
| + uint8x8_t greens = rgba.val[1]; |
| + uint8x8_t blues = rgba.val[2]; |
| + uint8x8_t alphas = rgba.val[3]; |
| + |
| + // Premultiplication Step 1: Multiply each color component by the |
| + // alpha component. Note that this also widens each of the components |
| + // to 16-bits. |
| + uint16x8_t product_reds = vmull_u8(reds, alphas); |
|
mtklein
2016/01/13 15:20:01
I'd suggest:
// Premultiply.
r = scale(r, a);
g =
msarett
2016/01/13 16:17:16
I think the refactoring makes things clearer.
My
|
| + uint16x8_t product_greens = vmull_u8(greens, alphas); |
| + uint16x8_t product_blues = vmull_u8(blues, alphas); |
| + |
| + // Premultiplication Step 2: Perform a rounded divide by 255. |
| + // result = (x + 128) / 255 |
| + // result = (x + 128) / 256 + error1 |
| + // |
| + // error1 = (x + 128) / (255 * 256) |
| + // error1 = (x + 128) / (256 * 256) + error2 |
| + // |
| + // error2 = (x + 128) / (255 * 256 * 256) |
| + // |
| + // The maximum value of error2 is too small to matter. Thus: |
| + // result = (x + 128) / 256 + (x + 128) / (256 * 256) |
| + // result = ((x + 128) / 256 + x + 128) / 256 |
| + // result = (((x + 128) >> 8) + x + 128) >> 8 |
| + // |
| + // Use >>> to represent "rounded right shift" which, conveniently, |
| + // NEON supports. |
| + // result = ((x >>> 8) + x) >>> 8 |
| + // |
| + // Note that the second right shift is actually performed as an |
| + // "add, round, and narrow back to 8-bits" instruction. |
| + uint8x8_t result_reds = vraddhn_u16(product_reds, vrshrq_n_u16(product_reds, 8)); |
| + uint8x8_t result_greens = vraddhn_u16(product_greens, vrshrq_n_u16(product_greens, 8)); |
| + uint8x8_t result_blues = vraddhn_u16(product_blues, vrshrq_n_u16(product_blues, 8)); |
| + |
| + // Store 8 premultiplied pixels. |
|
mtklein
2016/01/13 15:20:01
Good from here on.
msarett
2016/01/13 16:17:15
Acknowledged.
|
| + rgba.val[0] = result_reds; |
| + rgba.val[1] = result_greens; |
| + rgba.val[2] = result_blues; |
| + vst4_u8(dst8, rgba); |
| + src8 += 32; |
| + dst8 += 32; |
| + } |
| + |
| + // Finish the remaining (count % 8) pixels one at a time. dst is re-based |
| + // to the first unprocessed pixel, so it is written through directly and |
| + // advanced; indexing it by i as well would skip past the right slot. |
| + dst = (uint32_t*) dst8; |
| + for (; i < count; i++) { |
| + *dst = SkPremultiplyARGBInline(src8[3], src8[0], src8[1], src8[2]); |
| + src8 += 4; |
| + dst++; |
| + } |
| +} |
| + |
| +// Premultiplies the first three channels of each 32-bit pixel in src by the |
| +// pixel's fourth channel (alpha) and writes the result to dst with the first |
| +// and third channels (red and blue) exchanged. Alpha is left unchanged. |
| +// |
| +// dst   - output pixels; must have room for count pixels. |
| +// src   - input pixels; count pixels are read. |
| +// count - number of pixels to process. |
| +// |
| +// Whole groups of 8 pixels go through NEON; the remaining (count % 8) pixels |
| +// are handled one at a time. |
| +static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| + int i = 0; |
| + uint8_t* dst8 = (uint8_t*) dst; |
| + const uint8_t* src8 = (const uint8_t*) src; |
| + // Only take the vector path while 8 whole pixels remain: each iteration |
| + // loads and stores 32 bytes, so running it on a partial group would read |
| + // and write past the end of the buffers. |
| + for (; count - i >= 8; i += 8) { |
| + // Load 8 pixels. This load instruction will deinterleave the pixels, |
| + // producing reds, greens, blues, and alphas each in its own vector. |
| + uint8x8x4_t rgba = vld4_u8(src8); |
| + |
| + // These should be free operations. |
| + uint8x8_t reds = rgba.val[0]; |
| + uint8x8_t greens = rgba.val[1]; |
| + uint8x8_t blues = rgba.val[2]; |
| + uint8x8_t alphas = rgba.val[3]; |
| + |
| + // Premultiplication Step 1: Multiply each color component by the |
| + // alpha component. Note that this also widens each of the components |
| + // to 16-bits. |
| + uint16x8_t product_reds = vmull_u8(reds, alphas); |
| + uint16x8_t product_greens = vmull_u8(greens, alphas); |
| + uint16x8_t product_blues = vmull_u8(blues, alphas); |
| + |
| + // Premultiplication Step 2: Perform a rounded divide by 255. |
| + // result = (x + 128) / 255 |
| + // result = (x + 128) / 256 + error1 |
| + // |
| + // error1 = (x + 128) / (255 * 256) |
| + // error1 = (x + 128) / (256 * 256) + error2 |
| + // |
| + // error2 = (x + 128) / (255 * 256 * 256) |
| + // |
| + // The maximum value of error2 is too small to matter. Thus: |
| + // result = (x + 128) / 256 + (x + 128) / (256 * 256) |
| + // result = ((x + 128) / 256 + x + 128) / 256 |
| + // result = (((x + 128) >> 8) + x + 128) >> 8 |
| + // |
| + // Use >>> to represent "rounded right shift" which, conveniently, |
| + // NEON supports. |
| + // result = ((x >>> 8) + x) >>> 8 |
| + // |
| + // Note that the second right shift is actually performed as an |
| + // "add, round, and narrow back to 8-bits" instruction. |
| + uint8x8_t result_reds = vraddhn_u16(product_reds, vrshrq_n_u16(product_reds, 8)); |
| + uint8x8_t result_greens = vraddhn_u16(product_greens, vrshrq_n_u16(product_greens, 8)); |
| + uint8x8_t result_blues = vraddhn_u16(product_blues, vrshrq_n_u16(product_blues, 8)); |
| + |
| + // Store 8 premultiplied pixels, with red and blue exchanged. |
| + rgba.val[0] = result_blues; |
|
msarett
2016/01/13 14:05:38
This is identical to premul_xxxa with the exceptio
mtklein
2016/01/13 14:33:17
Yes, template <bool kSwapRB> on the function, then
msarett
2016/01/13 16:17:16
Done.
|
| + rgba.val[1] = result_greens; |
| + rgba.val[2] = result_reds; |
| + vst4_u8(dst8, rgba); |
| + src8 += 32; |
| + dst8 += 32; |
| + } |
| + |
| + // Finish the remaining (count % 8) pixels one at a time. dst is re-based |
| + // to the first unprocessed pixel, so it is written through directly and |
| + // advanced; indexing it by i as well would skip past the right slot. |
| + // |
| + // The second and fourth arguments are exchanged relative to the call in |
| + // premul_xxxa so the tail swaps red and blue like the vector path does. |
| + // NOTE(review): assumes SkPremultiplyARGBInline packs its arguments in |
| + // the same byte order premul_xxxa's tail relies on -- confirm. |
| + dst = (uint32_t*) dst8; |
| + for (; i < count; i++) { |
| + *dst = SkPremultiplyARGBInline(src8[3], src8[2], src8[1], src8[0]); |
| + src8 += 4; |
| + dst++; |
| + } |
| +} |
| + |
| +} |
| + |
| +#endif // SkSwizzler_neon_DEFINED |