Index: src/opts/SkSwizzler_neon.h |
diff --git a/src/opts/SkSwizzler_neon.h b/src/opts/SkSwizzler_neon.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..88915aaa5a9f0f429671163d12d1a023b1c33ac8 |
--- /dev/null |
+++ b/src/opts/SkSwizzler_neon.h |
@@ -0,0 +1,145 @@ |
+/* |
+ * Copyright 2016 Google Inc. |
+ * |
+ * Use of this source code is governed by a BSD-style license that can be |
+ * found in the LICENSE file. |
+ */ |
+ |
+#ifndef SkSwizzler_neon_DEFINED |
+#define SkSwizzler_neon_DEFINED |
+ |
+#include "SkColorPriv.h" |
+ |
+#include <arm_neon.h> |
+ |
namespace sk_neon {

// Rounded divide by 255 of eight 16-bit products, narrowed back to 8 bits.
//
// With x = color * alpha (at most 255 * 255), the value computed is
//     (x + ((x + 128) >> 8) + 128) >> 8
// which approximates (x + 128) / 255 using only shifts and adds:
//     (x + 128) / 255 = (x + 128) / 256 + (x + 128) / (255 * 256)
// and the second term is in turn approximated by (x + 128) / (256 * 256);
// the remaining error is too small to change the 8-bit result.
//
// vrshrq_n_u16 is the "rounded right shift" ((x + 128) >> 8) and
// vraddhn_u16 is "add, round, and narrow to the high 8 bits".
static inline uint8x8_t div255_round(uint16x8_t x) {
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}

// Scalar twin of the vector path: multiplies one color byte by alpha and
// divides by 255 with the same rounding expression as div255_round(), so
// tail pixels get exactly the same values as vectorized pixels.
static inline uint8_t mul_div255_round(uint8_t color, uint8_t alpha) {
    unsigned x = (unsigned) color * alpha + 128;
    return (uint8_t) ((x + (x >> 8)) >> 8);
}

// Shared implementation of premul_xxxa() and premul_swaprb_xxxa().
//
// Each pixel is treated as four bytes in memory: three color bytes followed
// by an alpha byte.  Every color byte is multiplied by alpha / 255 (rounded);
// alpha is copied through unchanged.  When kSwapRB is true, the first and
// third color bytes trade places in the output.
//
// dst and src must each hold at least `count` pixels.  A non-positive count
// does nothing.  Each pixel (or group of 8 pixels) is fully read before it is
// written.
template <bool kSwapRB>
static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
    uint8_t* dst8 = (uint8_t*) dst;
    const uint8_t* src8 = (const uint8_t*) src;

    // Vector body.  Only entered while a full group of 8 pixels remains, so
    // the 32-byte load and store never run past the end of either buffer.
    while (count >= 8) {
        // Load 8 pixels.  This load deinterleaves them, producing one vector
        // per channel.
        uint8x8x4_t rgba = vld4_u8(src8);
        uint8x8_t a = rgba.val[3];

        // Premultiply.  vmull_u8 widens each product to 16 bits.
        uint8x8_t r = div255_round(vmull_u8(rgba.val[0], a));
        uint8x8_t g = div255_round(vmull_u8(rgba.val[1], a));
        uint8x8_t b = div255_round(vmull_u8(rgba.val[2], a));

        // Store 8 premultiplied pixels; val[3] still holds the alphas.
        rgba.val[0] = kSwapRB ? b : r;
        rgba.val[1] = g;
        rgba.val[2] = kSwapRB ? r : b;
        vst4_u8(dst8, rgba);

        src8 += 32;
        dst8 += 32;
        count -= 8;
    }

    // Scalar tail for the remaining [0, 8) pixels.  It works directly on
    // bytes so the output layout is the same as what vst4_u8 writes above.
    for (; count > 0; count--) {
        uint8_t a = src8[3];
        uint8_t r = mul_div255_round(src8[0], a);
        uint8_t g = mul_div255_round(src8[1], a);
        uint8_t b = mul_div255_round(src8[2], a);

        dst8[0] = kSwapRB ? b : r;
        dst8[1] = g;
        dst8[2] = kSwapRB ? r : b;
        dst8[3] = a;

        src8 += 4;
        dst8 += 4;
    }
}

// Premultiplies `count` pixels from src into dst, keeping the channel order.
static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
    premul_xxxa_should_swaprb<false>(dst, src, count);
}

// Premultiplies `count` pixels from src into dst and swaps the first and
// third color channels (red and blue) of every pixel.
static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
    premul_xxxa_should_swaprb<true>(dst, src, count);
}

}  // namespace sk_neon
+ |
+#endif // SkSwizzler_neon_DEFINED |