Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1131)

Side by Side Diff: src/opts/SkSwizzler_neon.h

Issue 1577703006: Optimized premultiplying swizzles for NEON (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« src/opts/SkOpts_neon.cpp ('K') | « src/opts/SkOpts_neon.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkSwizzler_neon_DEFINED
9 #define SkSwizzler_neon_DEFINED
10
11 #include "SkColorPriv.h"
12
13 #include <arm_neon.h>
14
15 namespace sk_neon {
16
17 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
mtklein 2016/01/13 15:20:01 I think some of the clarity of this code is gettin
msarett 2016/01/13 16:17:15 Acknowledged.
18 int i = 0;
19 uint8_t* dst8 = (uint8_t*) dst;
20 const uint8_t* src8 = (const uint8_t*) src;
21 for (; i < count; i += 8) {
22 // Load 8 pixels. This load instruction will deinterleave the pixels,
23 // producing alphas, reds, greens, and blues each in its own vector.
24 uint8x8x4_t rgba = vld4_u8(src8);
mtklein 2016/01/13 15:20:01 Load 8 pixels.
msarett 2016/01/13 16:17:15 Done.
25
26 // These should be free operations.
mtklein 2016/01/13 15:20:01 I don't think you need any comment here. I'd sugg
msarett 2016/01/13 16:17:15 Done.
27 uint8x8_t reds = rgba.val[0];
28 uint8x8_t greens = rgba.val[1];
29 uint8x8_t blues = rgba.val[2];
30 uint8x8_t alphas = rgba.val[3];
31
32 // Premultiplication Step 1: Multiply each color component by the
33 // alpha component. Note that this also widens each of the components
34 // to 16-bits.
35 uint16x8_t product_reds = vmull_u8(reds, alphas);
mtklein 2016/01/13 15:20:01 I'd suggest: // Premultiply. r = scale(r, a); g =
msarett 2016/01/13 16:17:16 I think the refactoring makes things clearer. My
36 uint16x8_t product_greens = vmull_u8(greens, alphas);
37 uint16x8_t product_blues = vmull_u8(blues, alphas);
38
39 // Premultiplication Step 2: Perform a rounded divide by 255.
40 // result = (x + 128) / 255
41 // result = (x + 128) / 256 + error1
42 //
43 // error1 = (x + 128) / (255 * 256)
44 // error1 = (x + 128) / (256 * 256) + error2
45 //
46 // error2 = (x + 128) / (255 * 256 * 256)
47 //
48 // The maximum value of error2 is too small to matter. Thus:
49 // result = (x + 128) / 256 + (x + 128) / (256 * 256)
50 // result = ((x + 128) / 256 + x + 128) / 256
51 // result = ((x + 128) >> 8 + x + 128) >> 8
52 //
53 // Use >>> to represent "rounded right shift" which, conveniently,
54 // NEON supports.
55 // result = ((x >>> 8) + x) >>> 8
56 //
57 // Note that the second right shift is actually performed as an
58 // "add, round, and narrow back to 8-bits" instruction.
59 uint8x8_t result_reds = vraddhn_u16(product_reds, vrshrq_n_u16(product_reds, 8));
60 uint8x8_t result_greens = vraddhn_u16(product_greens, vrshrq_n_u16(product_greens, 8));
61 uint8x8_t result_blues = vraddhn_u16(product_blues, vrshrq_n_u16(product_blues, 8));
62
63 // Store 8 premultiplied pixels.
mtklein 2016/01/13 15:20:01 Good from here on.
msarett 2016/01/13 16:17:15 Acknowledged.
64 rgba.val[0] = result_reds;
65 rgba.val[1] = result_greens;
66 rgba.val[2] = result_blues;
67 vst4_u8(dst8, rgba);
68 src8 += 32;
69 dst8 += 32;
70 }
71
72 dst = (uint32_t*) dst8;
73 for (; i < count; i++) {
74 dst[i] = SkPremultiplyARGBInline(src8[3], src8[0], src8[1], src8[2]);
75 src8 += 4;
76 dst++;
77 }
78 }
79
80 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
81 int i = 0;
82 uint8_t* dst8 = (uint8_t*) dst;
83 const uint8_t* src8 = (const uint8_t*) src;
84 for (; i < count; i += 8) {
85 // Load 8 pixels. This load instruction will deinterleave the pixels,
86 // producing alphas, reds, greens, and blues each in its own vector.
87 uint8x8x4_t rgba = vld4_u8(src8);
88
89 // These should be free operations.
90 uint8x8_t reds = rgba.val[0];
91 uint8x8_t greens = rgba.val[1];
92 uint8x8_t blues = rgba.val[2];
93 uint8x8_t alphas = rgba.val[3];
94
95 // Premultiplication Step 1: Multiply each color component by the
96 // alpha component. Note that this also widens each of the components
97 // to 16-bits.
98 uint16x8_t product_reds = vmull_u8(reds, alphas);
99 uint16x8_t product_greens = vmull_u8(greens, alphas);
100 uint16x8_t product_blues = vmull_u8(blues, alphas);
101
102 // Premultiplication Step 2: Perform a rounded divide by 255.
103 // result = (x + 128) / 255
104 // result = (x + 128) / 256 + error1
105 //
106 // error1 = (x + 128) / (255 * 256)
107 // error1 = (x + 128) / (256 * 256) + error2
108 //
109 // error2 = (x + 128) / (255 * 256 * 256)
110 //
111 // The maximum value of error2 is too small to matter. Thus:
112 // result = (x + 128) / 256 + (x + 128) / (256 * 256)
113 // result = ((x + 128) / 256 + x + 128) / 256
114 // result = ((x + 128) >> 8 + x + 128) >> 8
115 //
116 // Use >>> to represent "rounded right shift" which, conveniently,
117 // NEON supports.
118 // result = ((x >>> 8) + x) >>> 8
119 //
120 // Note that the second right shift is actually performed as an
121 // "add, round, and narrow back to 8-bits" instruction.
122 uint8x8_t result_reds = vraddhn_u16(product_reds, vrshrq_n_u16(product_reds, 8));
123 uint8x8_t result_greens = vraddhn_u16(product_greens, vrshrq_n_u16(product_greens, 8));
124 uint8x8_t result_blues = vraddhn_u16(product_blues, vrshrq_n_u16(product_blues, 8));
125
126 // Store 8 premultiplied pixels.
127 rgba.val[0] = result_blues;
msarett 2016/01/13 14:05:38 This is identical to premul_xxxa with the exceptio
mtklein 2016/01/13 14:33:17 Yes, template <bool kSwapRB> on the function, then
msarett 2016/01/13 16:17:16 Done.
128 rgba.val[1] = result_greens;
129 rgba.val[2] = result_red;
130 vst4_u8(dst8, rgba);
131 src8 += 32;
132 dst8 += 32;
133 }
134
135 dst = (uint32_t*) dst8;
136 for (; i < count; i++) {
137 dst[i] = SkPremultiplyARGBInline(src8[3], src8[0], src8[1], src8[2]);
138 src8 += 4;
139 dst++;
140 }
141 }
142
143 }
144
145 #endif // SkSwizzler_neon_DEFINED
OLDNEW
« src/opts/SkOpts_neon.cpp ('K') | « src/opts/SkOpts_neon.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698