OLD | NEW |
---|---|
(Empty) | |
1 /* | |
2 * Copyright 2016 Google Inc. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license that can be | |
5 * found in the LICENSE file. | |
6 */ | |
7 | |
8 #ifndef SkBitmapFilter_opts_DEFINED | |
9 #define SkBitmapFilter_opts_DEFINED | |
10 | |
11 #include "SkConvolver.h" | |
12 | |
13 namespace SK_OPTS_NS { | |
14 | |
15 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
16 | |
17 #include <emmintrin.h> | |
mtklein_C
2016/11/16 14:24:47
This will #include the SSE intrinsics into the SK_OPTS_NS namespace. It's probably best to put the #include at the top of the file, outside the namespace.
xiangze.zhang
2016/11/17 02:33:07
Done.
| |
18 | |
19 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, | |
20 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) { | |
21 int remainder[4] = {0}; | |
22 for (int i = 0; i < r; i++) { | |
23 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; | |
24 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; | |
25 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; | |
26 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; | |
27 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; | |
28 } | |
29 __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], rem ainder[3]); | |
30 accum = _mm_add_epi32(accum, t); | |
31 } | |
32 | |
33 // Convolves horizontally along a single row. The row data is given in | |
34 // |srcData| and continues for the numValues() of the filter. | |
35 void convolve_horizontally(const unsigned char* srcData, | |
36 const SkConvolutionFilter1D& filter, | |
37 unsigned char* outRow, | |
38 bool /*hasAlpha*/) { | |
39 // Output one pixel each iteration, calculating all channels (RGBA) toge ther. | |
40 int numValues = filter.numValues(); | |
41 for (int outX = 0; outX < numValues; outX++) { | |
42 // Get the filter that determines the current output pixel. | |
43 int filterOffset, filterLength; | |
44 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
45 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
46 | |
47 // Compute the first pixel in this row that the filter affects. It w ill | |
48 // touch |filterLength| pixels (4 bytes each) after this. | |
49 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; | |
50 | |
51 __m128i zero = _mm_setzero_si128(); | |
52 __m128i accum = _mm_setzero_si128(); | |
53 | |
54 // We will load and accumulate with four coefficients per iteration. | |
55 for (int filterX = 0; filterX < filterLength >> 2; filterX++) { | |
56 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. | |
57 __m128i coeff, coeff16; | |
58 // [16] xx xx xx xx c3 c2 c1 c0 | |
59 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV alues)); | |
60 // [16] xx xx xx xx c1 c1 c0 c0 | |
61 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
62 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
63 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
64 | |
65 // Load four pixels => unpack the first two pixels to 16 bits => | |
66 // multiply with coefficients => accumulate the convolution resu lt. | |
67 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
68 __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>( rowToFilter)); | |
69 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
70 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
71 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
72 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
73 // [32] a0*c0 b0*c0 g0*c0 r0*c0 | |
74 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
75 accum = _mm_add_epi32(accum, t); | |
76 // [32] a1*c1 b1*c1 g1*c1 r1*c1 | |
77 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
78 accum = _mm_add_epi32(accum, t); | |
79 | |
80 // Duplicate 3rd and 4th coefficients for all channels => | |
81 // unpack the 3rd and 4th pixels to 16 bits => multiply with coe fficients | |
82 // => accumulate the convolution results. | |
83 // [16] xx xx xx xx c3 c3 c2 c2 | |
84 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
85 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
86 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
87 // [16] a3 g3 b3 r3 a2 g2 b2 r2 | |
88 src16 = _mm_unpackhi_epi8(src8, zero); | |
89 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
90 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
91 // [32] a2*c2 b2*c2 g2*c2 r2*c2 | |
92 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
93 accum = _mm_add_epi32(accum, t); | |
94 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | |
95 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
96 accum = _mm_add_epi32(accum, t); | |
97 | |
98 // Advance the pixel and coefficients pointers. | |
99 rowToFilter += 16; | |
100 filterValues += 4; | |
101 } | |
102 | |
103 // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3 | |
104 // coefficients one at a time. | |
105 int r = filterLength & 3; | |
106 if (r) { | |
107 int remainderOffset = (filterOffset + filterLength - r) * 4; | |
108 AccumRemainder(srcData + remainderOffset, filterValues, accum, r ); | |
109 } | |
110 | |
111 // Shift right for fixed point implementation. | |
112 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); | |
113 | |
114 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
115 accum = _mm_packs_epi32(accum, zero); | |
116 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n). | |
117 accum = _mm_packus_epi16(accum, zero); | |
118 | |
119 // Store the pixel value of 32 bits. | |
120 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum); | |
121 outRow += 4; | |
122 } | |
123 } | |
124 | |
125 // Convolves horizontally along four rows. The row data is given in | |
126 // |srcData| and continues for the numValues() of the filter. | |
127 // The algorithm is almost same as |convolve_horizontally|. Please | |
128 // refer to that function for detailed comments. | |
129 void convolve_4_rows_horizontally(const unsigned char* srcData[4], | |
130 const SkConvolutionFilter1D& filter, | |
131 unsigned char* outRow[4], | |
132 size_t outRowBytes) { | |
133 SkDEBUGCODE(const unsigned char* out_row_0_start = outRow[0];) | |
134 | |
135 // Output one pixel each iteration, calculating all channels (RGBA) toge ther. | |
136 int numValues = filter.numValues(); | |
137 for (int outX = 0; outX < numValues; outX++) { | |
138 int filterOffset, filterLength; | |
139 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
140 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
141 | |
142 __m128i zero = _mm_setzero_si128(); | |
143 | |
144 // four pixels in a column per iteration. | |
145 __m128i accum0 = _mm_setzero_si128(); | |
146 __m128i accum1 = _mm_setzero_si128(); | |
147 __m128i accum2 = _mm_setzero_si128(); | |
148 __m128i accum3 = _mm_setzero_si128(); | |
149 | |
150 int start = filterOffset * 4; | |
151 // We will load and accumulate with four coefficients per iteration. | |
152 for (int filterX = 0; filterX < (filterLength >> 2); filterX++) { | |
153 __m128i coeff, coeff16lo, coeff16hi; | |
154 // [16] xx xx xx xx c3 c2 c1 c0 | |
155 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV alues)); | |
156 // [16] xx xx xx xx c1 c1 c0 c0 | |
157 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
158 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
159 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
160 // [16] xx xx xx xx c3 c3 c2 c2 | |
161 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
162 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
163 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
164 | |
165 __m128i src8, src16, mul_hi, mul_lo, t; | |
166 | |
167 #define ITERATION(src, accum) \ | |
168 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ | |
169 src16 = _mm_unpacklo_epi8(src8, zero); \ | |
170 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ | |
171 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ | |
172 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
173 accum = _mm_add_epi32(accum, t); \ | |
174 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
175 accum = _mm_add_epi32(accum, t); \ | |
176 src16 = _mm_unpackhi_epi8(src8, zero); \ | |
177 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ | |
178 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ | |
179 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
180 accum = _mm_add_epi32(accum, t); \ | |
181 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
182 accum = _mm_add_epi32(accum, t) | |
183 | |
184 ITERATION(srcData[0] + start, accum0); | |
185 ITERATION(srcData[1] + start, accum1); | |
186 ITERATION(srcData[2] + start, accum2); | |
187 ITERATION(srcData[3] + start, accum3); | |
188 | |
189 start += 16; | |
190 filterValues += 4; | |
191 } | |
192 | |
193 int r = filterLength & 3; | |
194 if (r) { | |
195 int remainderOffset = (filterOffset + filterLength - r) * 4; | |
196 AccumRemainder(srcData[0] + remainderOffset, filterValues, accum 0, r); | |
197 AccumRemainder(srcData[1] + remainderOffset, filterValues, accum 1, r); | |
198 AccumRemainder(srcData[2] + remainderOffset, filterValues, accum 2, r); | |
199 AccumRemainder(srcData[3] + remainderOffset, filterValues, accum 3, r); | |
200 } | |
201 | |
202 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
203 accum0 = _mm_packs_epi32(accum0, zero); | |
204 accum0 = _mm_packus_epi16(accum0, zero); | |
205 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
206 accum1 = _mm_packs_epi32(accum1, zero); | |
207 accum1 = _mm_packus_epi16(accum1, zero); | |
208 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
209 accum2 = _mm_packs_epi32(accum2, zero); | |
210 accum2 = _mm_packus_epi16(accum2, zero); | |
211 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | |
212 accum3 = _mm_packs_epi32(accum3, zero); | |
213 accum3 = _mm_packus_epi16(accum3, zero); | |
214 | |
215 // We seem to be running off the edge here (chromium:491660). | |
216 SkASSERT(((size_t)outRow[0] - (size_t)out_row_0_start) < outRowBytes ); | |
217 | |
218 *(reinterpret_cast<int*>(outRow[0])) = _mm_cvtsi128_si32(accum0); | |
219 *(reinterpret_cast<int*>(outRow[1])) = _mm_cvtsi128_si32(accum1); | |
220 *(reinterpret_cast<int*>(outRow[2])) = _mm_cvtsi128_si32(accum2); | |
221 *(reinterpret_cast<int*>(outRow[3])) = _mm_cvtsi128_si32(accum3); | |
222 | |
223 outRow[0] += 4; | |
224 outRow[1] += 4; | |
225 outRow[2] += 4; | |
226 outRow[3] += 4; | |
227 } | |
228 } | |
229 | |
230 // Does vertical convolution to produce one output row. The filter values an d | |
231 // length are given in the first two parameters. These are applied to each | |
232 // of the rows pointed to in the |sourceDataRows| array, with each row | |
233 // being |pixelWidth| wide. | |
234 // | |
235 // The output must have room for |pixelWidth * 4| bytes. | |
236 template<bool hasAlpha> | |
237 void convolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues, | |
238 int filterLength, | |
239 unsigned char* const* sourceDataRows, | |
240 int pixelWidth, | |
241 unsigned char* outRow) { | |
242 // Output four pixels per iteration (16 bytes). | |
243 int width = pixelWidth & ~3; | |
244 __m128i zero = _mm_setzero_si128(); | |
245 for (int outX = 0; outX < width; outX += 4) { | |
246 // Accumulated result for each pixel. 32 bits per RGBA channel. | |
247 __m128i accum0 = _mm_setzero_si128(); | |
248 __m128i accum1 = _mm_setzero_si128(); | |
249 __m128i accum2 = _mm_setzero_si128(); | |
250 __m128i accum3 = _mm_setzero_si128(); | |
251 | |
252 // Convolve with one filter coefficient per iteration. | |
253 for (int filterY = 0; filterY < filterLength; filterY++) { | |
254 | |
255 // Duplicate the filter coefficient 8 times. | |
256 // [16] cj cj cj cj cj cj cj cj | |
257 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); | |
258 | |
259 // Load four pixels (16 bytes) together. | |
260 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
261 const __m128i* src = reinterpret_cast<const __m128i*>( | |
262 &sourceDataRows[filterY][outX << 2]); | |
263 __m128i src8 = _mm_loadu_si128(src); | |
264 | |
265 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each cha nnels => | |
266 // multiply with current coefficient => accumulate the result. | |
267 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
268 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
269 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
270 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
271 // [32] a0 b0 g0 r0 | |
272 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
273 accum0 = _mm_add_epi32(accum0, t); | |
274 // [32] a1 b1 g1 r1 | |
275 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
276 accum1 = _mm_add_epi32(accum1, t); | |
277 | |
278 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each cha nnels => | |
279 // multiply with current coefficient => accumulate the result. | |
280 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
281 src16 = _mm_unpackhi_epi8(src8, zero); | |
282 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
283 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
284 // [32] a2 b2 g2 r2 | |
285 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
286 accum2 = _mm_add_epi32(accum2, t); | |
287 // [32] a3 b3 g3 r3 | |
288 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
289 accum3 = _mm_add_epi32(accum3, t); | |
290 } | |
291 | |
292 // Shift right for fixed point implementation. | |
293 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
294 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
295 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
296 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | |
297 | |
298 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
299 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
300 accum0 = _mm_packs_epi32(accum0, accum1); | |
301 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
302 accum2 = _mm_packs_epi32(accum2, accum3); | |
303 | |
304 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n). | |
305 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
306 accum0 = _mm_packus_epi16(accum0, accum2); | |
307 | |
308 if (hasAlpha) { | |
309 // Compute the max(ri, gi, bi) for each pixel. | |
310 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
311 __m128i a = _mm_srli_epi32(accum0, 8); | |
312 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
313 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
314 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
315 a = _mm_srli_epi32(accum0, 16); | |
316 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
317 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
318 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
319 b = _mm_slli_epi32(b, 24); | |
320 | |
321 // Make sure the value of alpha channel is always larger than ma ximum | |
322 // value of color channels. | |
323 accum0 = _mm_max_epu8(b, accum0); | |
324 } else { | |
325 // Set value of alpha channels to 0xFF. | |
326 __m128i mask = _mm_set1_epi32(0xff000000); | |
327 accum0 = _mm_or_si128(accum0, mask); | |
328 } | |
329 | |
330 // Store the convolution result (16 bytes) and advance the pixel poi nters. | |
331 _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0); | |
332 outRow += 16; | |
333 } | |
334 | |
335 // When the width of the output is not divisible by 4, We need to save o ne | |
336 // pixel (4 bytes) each time. And also the fourth pixel is always absent . | |
337 int r = pixelWidth & 3; | |
338 if (r) { | |
339 __m128i accum0 = _mm_setzero_si128(); | |
340 __m128i accum1 = _mm_setzero_si128(); | |
341 __m128i accum2 = _mm_setzero_si128(); | |
342 for (int filterY = 0; filterY < filterLength; ++filterY) { | |
343 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); | |
344 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
345 const __m128i* src = reinterpret_cast<const __m128i*>( | |
346 &sourceDataRows[filterY][width << 2]); | |
347 __m128i src8 = _mm_loadu_si128(src); | |
348 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
349 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
350 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
351 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
352 // [32] a0 b0 g0 r0 | |
353 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
354 accum0 = _mm_add_epi32(accum0, t); | |
355 // [32] a1 b1 g1 r1 | |
356 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
357 accum1 = _mm_add_epi32(accum1, t); | |
358 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
359 src16 = _mm_unpackhi_epi8(src8, zero); | |
360 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
361 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
362 // [32] a2 b2 g2 r2 | |
363 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
364 accum2 = _mm_add_epi32(accum2, t); | |
365 } | |
366 | |
367 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
368 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
369 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
370 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
371 accum0 = _mm_packs_epi32(accum0, accum1); | |
372 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
373 accum2 = _mm_packs_epi32(accum2, zero); | |
374 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
375 accum0 = _mm_packus_epi16(accum0, accum2); | |
376 if (hasAlpha) { | |
377 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
378 __m128i a = _mm_srli_epi32(accum0, 8); | |
379 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
380 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
381 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
382 a = _mm_srli_epi32(accum0, 16); | |
383 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
384 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
385 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
386 b = _mm_slli_epi32(b, 24); | |
387 accum0 = _mm_max_epu8(b, accum0); | |
388 } else { | |
389 __m128i mask = _mm_set1_epi32(0xff000000); | |
390 accum0 = _mm_or_si128(accum0, mask); | |
391 } | |
392 | |
393 for (int i = 0; i < r; i++) { | |
394 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0); | |
395 accum0 = _mm_srli_si128(accum0, 4); | |
396 outRow += 4; | |
397 } | |
398 } | |
399 } | |
400 | |
401 #elif defined(SK_ARM_HAS_NEON) | |
402 | |
403 #include <arm_neon.h> | |
mtklein_C
2016/11/16 14:24:47
Same deal with emmintrin.h: it's probably best to put this #include at the top of the file, outside the SK_OPTS_NS namespace.
xiangze.zhang
2016/11/17 02:33:07
Done.
| |
404 | |
// Accumulates the final 0-3 filter taps (the tail that does not fill a
// whole NEON register) into |accum| using plain scalar math.
// |pixelsLeft| points at the first remaining source pixel; |r| is the
// number of remaining taps.
static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues, int32x4_t& accum, int r) {
    int32_t sums[4] = {0, 0, 0, 0};
    for (int tap = 0; tap < r; ++tap) {
        const SkConvolutionFilter1D::ConvolutionFixed weight = filterValues[tap];
        const unsigned char* px = pixelsLeft + tap * 4;
        sums[0] += weight * px[0];
        sums[1] += weight * px[1];
        sums[2] += weight * px[2];
        sums[3] += weight * px[3];
    }
    accum = vaddq_s32(accum, vld1q_s32(sums));
}
418 | |
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter.
void convolve_horizontally(const unsigned char* srcData,
                           const SkConvolutionFilter1D& filter,
                           unsigned char* outRow,
                           bool /*hasAlpha*/) {
    // vtbl1_u8 masks that splat coefficient j of a loaded int16x4_t
    // across all four 16-bit lanes of the destination vector.
    const uint8x8_t coeffMask0 = vcreate_u8(0x0100010001000100);
    const uint8x8_t coeffMask1 = vcreate_u8(0x0302030203020302);
    const uint8x8_t coeffMask2 = vcreate_u8(0x0504050405040504);
    const uint8x8_t coeffMask3 = vcreate_u8(0x0706070607060706);

    // Loop over each pixel on this row in the output image.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        // Fetch the filter taps that produce this output pixel.
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // First source pixel this filter touches; it covers |filterLength|
        // pixels (4 bytes each) from here on.
        const unsigned char* rowToFilter = &srcData[filterOffset * 4];

        // Accumulate the destination pixel in 32 bits per channel.
        int32x4_t accum = vdupq_n_s32(0);
        for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
            // Load 4 coefficients and broadcast each across its own vector.
            int16x4_t coeffs = vld1_s16(filterValues);
            int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask0));
            int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask1));
            int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask2));
            int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask3));

            // Load 4 pixels and widen each to 16 bits per channel.
            uint8x16_t pixels = vld1q_u8(rowToFilter);
            int16x8_t p01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            int16x8_t p23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            // Multiply each pixel by its coefficient and accumulate.
            accum = vaddq_s32(accum, vmull_s16(vget_low_s16(p01), coeff0));
            accum = vaddq_s32(accum, vmull_s16(vget_high_s16(p01), coeff1));
            accum = vaddq_s32(accum, vmull_s16(vget_low_s16(p23), coeff2));
            accum = vaddq_s32(accum, vmull_s16(vget_high_s16(p23), coeff3));

            // Advance the pixel and coefficient pointers.
            rowToFilter += 16;
            filterValues += 4;
        }

        // Handle the last 1-3 taps when |filterLength| % 4 != 0.
        int r = filterLength & 3;
        if (r) {
            int remainderOffset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData + remainderOffset, filterValues, accum, r);
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of fractional part.
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

        // Narrow with saturation and store the 4-byte pixel.
        int16x4_t accum16 = vqmovn_s32(accum);
        uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
        outRow += 4;
    }
}
494 | |
// Convolves horizontally along four rows. The row data is given in
// |srcData| and continues for the numValues() of the filter.
// The per-register math mirrors |convolve_horizontally|; see that
// function for detailed commentary.
void convolve_4_rows_horizontally(const unsigned char* srcData[4],
                                  const SkConvolutionFilter1D& filter,
                                  unsigned char* outRow[4],
                                  size_t outRowBytes) {
    // vtbl1_u8 masks that splat coefficient j across a whole vector.
    const uint8x8_t coeffMask0 = vcreate_u8(0x0100010001000100);
    const uint8x8_t coeffMask1 = vcreate_u8(0x0302030203020302);
    const uint8x8_t coeffMask2 = vcreate_u8(0x0504050405040504);
    const uint8x8_t coeffMask3 = vcreate_u8(0x0706070607060706);

    // Emit one output pixel (per row) each iteration, all RGBA channels at once.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // One accumulator per source row.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        int start = filterOffset * 4;

        // Main loop: consume four coefficients (and four pixels per row) at a time.
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            int16x4_t coeffs = vld1_s16(filterValues);
            int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask0));
            int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask1));
            int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask2));
            int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeffMask3));

            uint8x16_t pixels;
            int16x8_t p01_16, p23_16;
            int32x4_t p0, p1, p2, p3;

            // Convolves 4 pixels of one row against coeff0..coeff3 into |accum|.
#define ITERATION(src, accum)                                      \
    pixels = vld1q_u8(src);                                        \
    p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); \
    p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));\
    p0 = vmull_s16(vget_low_s16(p01_16), coeff0);                  \
    p1 = vmull_s16(vget_high_s16(p01_16), coeff1);                 \
    p2 = vmull_s16(vget_low_s16(p23_16), coeff2);                  \
    p3 = vmull_s16(vget_high_s16(p23_16), coeff3);                 \
    accum += p0;                                                   \
    accum += p1;                                                   \
    accum += p2;                                                   \
    accum += p3

            ITERATION(srcData[0] + start, accum0);
            ITERATION(srcData[1] + start, accum1);
            ITERATION(srcData[2] + start, accum2);
            ITERATION(srcData[3] + start, accum3);
#undef ITERATION

            start += 16;
            filterValues += 4;
        }

        // Handle the last 1-3 taps when |filterLength| % 4 != 0.
        int r = filterLength & 3;
        if (r) {
            int remainderOffset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData[0] + remainderOffset, filterValues, accum0, r);
            AccumRemainder(srcData[1] + remainderOffset, filterValues, accum1, r);
            AccumRemainder(srcData[2] + remainderOffset, filterValues, accum2, r);
            AccumRemainder(srcData[3] + remainderOffset, filterValues, accum3, r);
        }

        int16x4_t accum16;
        uint8x8_t res0, res1, res2, res3;

        // Undo fixed-point scaling, then narrow 32 -> 16 -> 8 bits with saturation.
#define PACK_RESULT(accum, res)                                    \
    accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \
    accum16 = vqmovn_s32(accum);                                   \
    res = vqmovun_s16(vcombine_s16(accum16, accum16));

        PACK_RESULT(accum0, res0);
        PACK_RESULT(accum1, res1);
        PACK_RESULT(accum2, res2);
        PACK_RESULT(accum3, res3);
#undef PACK_RESULT

        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
        outRow[0] += 4;
        outRow[1] += 4;
        outRow[2] += 4;
        outRow[3] += 4;
    }
}
592 | |
593 | |
594 // Does vertical convolution to produce one output row. The filter values an d | |
595 // length are given in the first two parameters. These are applied to each | |
596 // of the rows pointed to in the |sourceDataRows| array, with each row | |
597 // being |pixelWidth| wide. | |
598 // | |
599 // The output must have room for |pixelWidth * 4| bytes. | |
600 template<bool hasAlpha> | |
601 void convolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues, | |
602 int filterLength, | |
603 unsigned char* const* sourceDataRows, | |
604 int pixelWidth, | |
605 unsigned char* outRow) { | |
606 int width = pixelWidth & ~3; | |
607 | |
608 // Output four pixels per iteration (16 bytes). | |
609 for (int outX = 0; outX < width; outX += 4) { | |
610 | |
611 // Accumulated result for each pixel. 32 bits per RGBA channel. | |
612 int32x4_t accum0 = vdupq_n_s32(0); | |
613 int32x4_t accum1 = vdupq_n_s32(0); | |
614 int32x4_t accum2 = vdupq_n_s32(0); | |
615 int32x4_t accum3 = vdupq_n_s32(0); | |
616 | |
617 // Convolve with one filter coefficient per iteration. | |
618 for (int filterY = 0; filterY < filterLength; filterY++) { | |
619 | |
620 // Duplicate the filter coefficient 4 times. | |
621 // [16] cj cj cj cj | |
622 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); | |
623 | |
624 // Load four pixels (16 bytes) together. | |
625 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
626 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]); | |
627 | |
628 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8( src8))); | |
629 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8 (src8))); | |
630 int16x4_t src16_0 = vget_low_s16(src16_01); | |
631 int16x4_t src16_1 = vget_high_s16(src16_01); | |
632 int16x4_t src16_2 = vget_low_s16(src16_23); | |
633 int16x4_t src16_3 = vget_high_s16(src16_23); | |
634 | |
635 accum0 += vmull_s16(src16_0, coeff16); | |
636 accum1 += vmull_s16(src16_1, coeff16); | |
637 accum2 += vmull_s16(src16_2, coeff16); | |
638 accum3 += vmull_s16(src16_3, coeff16); | |
639 } | |
640 | |
641 // Shift right for fixed point implementation. | |
642 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); | |
643 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); | |
644 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); | |
645 accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits); | |
646 | |
647 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
648 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
649 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(ac cum1)); | |
650 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
651 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(ac cum3)); | |
652 | |
653 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n). | |
654 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
655 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16( accum16_1)); | |
656 | |
657 if (hasAlpha) { | |
658 // Compute the max(ri, gi, bi) for each pixel. | |
659 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
660 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u3 2_u8(accum8), 8)); | |
661 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
662 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g | |
663 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
664 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8 ), 16)); | |
665 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
666 b = vmaxq_u8(a, b); // Max of r and g and b. | |
667 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
668 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24 )); | |
669 | |
670 // Make sure the value of alpha channel is always larger than ma ximum | |
671 // value of color channels. | |
672 accum8 = vmaxq_u8(b, accum8); | |
673 } else { | |
674 // Set value of alpha channels to 0xFF. | |
675 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdu pq_n_u32(0xFF000000)); | |
676 } | |
677 | |
678 // Store the convolution result (16 bytes) and advance the pixel poi nters. | |
679 vst1q_u8(outRow, accum8); | |
680 outRow += 16; | |
681 } | |
682 | |
683 // Process the leftovers when the width of the output is not divisible | |
684 // by 4, that is at most 3 pixels. | |
685 int r = pixelWidth & 3; | |
686 if (r) { | |
687 | |
688 int32x4_t accum0 = vdupq_n_s32(0); | |
689 int32x4_t accum1 = vdupq_n_s32(0); | |
690 int32x4_t accum2 = vdupq_n_s32(0); | |
691 | |
692 for (int filterY = 0; filterY < filterLength; ++filterY) { | |
693 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); | |
694 | |
695 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
696 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]) ; | |
697 | |
698 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8( src8))); | |
699 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8 (src8))); | |
700 int16x4_t src16_0 = vget_low_s16(src16_01); | |
701 int16x4_t src16_1 = vget_high_s16(src16_01); | |
702 int16x4_t src16_2 = vget_low_s16(src16_23); | |
703 | |
704 accum0 += vmull_s16(src16_0, coeff16); | |
705 accum1 += vmull_s16(src16_1, coeff16); | |
706 accum2 += vmull_s16(src16_2, coeff16); | |
707 } | |
708 | |
709 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); | |
710 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); | |
711 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); | |
712 | |
713 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(ac cum1)); | |
714 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(ac cum2)); | |
715 | |
716 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16( accum16_1)); | |
717 | |
718 if (hasAlpha) { | |
719 // Compute the max(ri, gi, bi) for each pixel. | |
720 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
721 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u3 2_u8(accum8), 8)); | |
722 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
723 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g | |
724 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
725 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8 ), 16)); | |
726 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
727 b = vmaxq_u8(a, b); // Max of r and g and b. | |
728 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
729 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24 )); | |
730 | |
731 // Make sure the value of alpha channel is always larger than ma ximum | |
732 // value of color channels. | |
733 accum8 = vmaxq_u8(b, accum8); | |
734 } else { | |
735 // Set value of alpha channels to 0xFF. | |
736 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdu pq_n_u32(0xFF000000)); | |
737 } | |
738 | |
739 switch(r) { | |
740 case 1: | |
741 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret q_u32_u8(accum8), 0); | |
742 break; | |
743 case 2: | |
744 vst1_u32(reinterpret_cast<uint32_t*>(outRow), | |
745 vreinterpret_u32_u8(vget_low_u8(accum8))); | |
746 break; | |
747 case 3: | |
748 vst1_u32(reinterpret_cast<uint32_t*>(outRow), | |
749 vreinterpret_u32_u8(vget_low_u8(accum8))); | |
750 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpr etq_u32_u8(accum8), 2); | |
751 break; | |
752 } | |
753 } | |
754 } | |
755 | |
756 #else | |
757 | |
// Converts the argument to an 8-bit unsigned value by clamping to the range
// 0-255.
inline unsigned char ClampTo8(int a) {
    // Clamp below first, then above; values already in range fall through.
    if (a < 0) {
        return 0;
    }
    return (a > 255) ? 255 : static_cast<unsigned char>(a);
}
769 | |
770 // Convolves horizontally along a single row. The row data is given in | |
771 // |srcData| and continues for the numValues() of the filter. | |
772 template<bool hasAlpha> | |
773 void ConvolveHorizontally(const unsigned char* srcData, | |
774 const SkConvolutionFilter1D& filter, | |
775 unsigned char* outRow) { | |
776 // Loop over each pixel on this row in the output image. | |
777 int numValues = filter.numValues(); | |
778 for (int outX = 0; outX < numValues; outX++) { | |
779 // Get the filter that determines the current output pixel. | |
780 int filterOffset, filterLength; | |
781 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
782 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
783 | |
784 // Compute the first pixel in this row that the filter affects. It w ill | |
785 // touch |filterLength| pixels (4 bytes each) after this. | |
786 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; | |
787 | |
788 // Apply the filter to the row to get the destination pixel in |accu m|. | |
789 int accum[4] = {0}; | |
790 for (int filterX = 0; filterX < filterLength; filterX++) { | |
791 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues [filterX]; | |
792 accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; | |
793 accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; | |
794 accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; | |
795 if (hasAlpha) { | |
796 accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; | |
797 } | |
798 } | |
799 | |
800 // Bring this value back in range. All of the filter scaling factors | |
801 // are in fixed point with kShiftBits bits of fractional part. | |
802 accum[0] >>= SkConvolutionFilter1D::kShiftBits; | |
803 accum[1] >>= SkConvolutionFilter1D::kShiftBits; | |
804 accum[2] >>= SkConvolutionFilter1D::kShiftBits; | |
805 if (hasAlpha) { | |
806 accum[3] >>= SkConvolutionFilter1D::kShiftBits; | |
807 } | |
808 | |
809 // Store the new pixel. | |
810 outRow[outX * 4 + 0] = ClampTo8(accum[0]); | |
811 outRow[outX * 4 + 1] = ClampTo8(accum[1]); | |
812 outRow[outX * 4 + 2] = ClampTo8(accum[2]); | |
813 if (hasAlpha) { | |
814 outRow[outX * 4 + 3] = ClampTo8(accum[3]); | |
815 } | |
816 } | |
817 } | |
818 | |
819 // Does vertical convolution to produce one output row. The filter values an d | |
820 // length are given in the first two parameters. These are applied to each | |
821 // of the rows pointed to in the |sourceDataRows| array, with each row | |
822 // being |pixelWidth| wide. | |
823 // | |
824 // The output must have room for |pixelWidth * 4| bytes. | |
825 template<bool hasAlpha> | |
826 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues, | |
827 int filterLength, | |
828 unsigned char* const* sourceDataRows, | |
829 int pixelWidth, | |
830 unsigned char* outRow) { | |
831 // We go through each column in the output and do a vertical convolution , | |
832 // generating one output pixel each time. | |
833 for (int outX = 0; outX < pixelWidth; outX++) { | |
834 // Compute the number of bytes over in each row that the current col umn | |
835 // we're convolving starts at. The pixel will cover the next 4 bytes . | |
836 int byteOffset = outX * 4; | |
837 | |
838 // Apply the filter to one column of pixels. | |
839 int accum[4] = {0}; | |
840 for (int filterY = 0; filterY < filterLength; filterY++) { | |
841 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues [filterY]; | |
842 accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; | |
843 accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; | |
844 accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; | |
845 if (hasAlpha) { | |
846 accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3]; | |
847 } | |
848 } | |
849 | |
850 // Bring this value back in range. All of the filter scaling factors | |
851 // are in fixed point with kShiftBits bits of precision. | |
852 accum[0] >>= SkConvolutionFilter1D::kShiftBits; | |
853 accum[1] >>= SkConvolutionFilter1D::kShiftBits; | |
854 accum[2] >>= SkConvolutionFilter1D::kShiftBits; | |
855 if (hasAlpha) { | |
856 accum[3] >>= SkConvolutionFilter1D::kShiftBits; | |
857 } | |
858 | |
859 // Store the new pixel. | |
860 outRow[byteOffset + 0] = ClampTo8(accum[0]); | |
861 outRow[byteOffset + 1] = ClampTo8(accum[1]); | |
862 outRow[byteOffset + 2] = ClampTo8(accum[2]); | |
863 if (hasAlpha) { | |
864 unsigned char alpha = ClampTo8(accum[3]); | |
865 | |
866 // Make sure the alpha channel doesn't come out smaller than any of the | |
867 // color channels. We use premultipled alpha channels, so this s hould | |
868 // never happen, but rounding errors will cause this from time t o time. | |
869 // These "impossible" colors will cause overflows (and hence ran dom pixel | |
870 // values) when the resulting bitmap is drawn to the screen. | |
871 // | |
872 // We only need to do this when generating the final output row (here). | |
873 int maxColorChannel = SkTMax(outRow[byteOffset + 0], | |
874 SkTMax(outRow[byteOffset + 1], | |
875 outRow[byteOffset + 2])); | |
876 if (alpha < maxColorChannel) { | |
877 outRow[byteOffset + 3] = maxColorChannel; | |
878 } else { | |
879 outRow[byteOffset + 3] = alpha; | |
880 } | |
881 } else { | |
882 // No alpha channel, the image is opaque. | |
883 outRow[byteOffset + 3] = 0xff; | |
884 } | |
885 } | |
886 } | |
887 | |
// There's a bug somewhere here with GCC autovectorization (-ftree-vectorize). We originally
// thought this was 32 bit only, but subsequent tests show that some 64 bit gcc compiles
// suffer here too.
//
// Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. https://bug.skia.org/2575
893 #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE) | |
894 #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), no inline)) | |
895 #else | |
896 #define SK_MAYBE_DISABLE_VECTORIZATION | |
897 #endif | |
898 | |
899 SK_MAYBE_DISABLE_VECTORIZATION | |
900 void convolve_horizontally(const unsigned char* srcData, | |
901 const SkConvolutionFilter1D& filter, | |
902 unsigned char* outRow, | |
903 bool has_alpha) { | |
904 if (has_alpha) { | |
905 ConvolveHorizontally<true>(srcData, filter, outRow); | |
906 } else { | |
907 ConvolveHorizontally<false>(srcData, filter, outRow); | |
908 } | |
909 } | |
910 #undef SK_MAYBE_DISABLE_VECTORIZATION | |
911 | |
912 void (*convolve_4_rows_horizontally)(const unsigned char* src_data[4], | |
mtklein_C
2016/11/16 14:24:47
Do you think it'd make the calling code clearer to
xiangze.zhang
2016/11/17 02:33:07
The calling code can check and allocate a smaller
| |
913 const SkConvolutionFilter1D& filter, | |
914 unsigned char* out_row[4], | |
915 size_t out_row_bytes) | |
916 = nullptr; | |
917 | |
918 | |
919 #endif | |
920 | |
921 void convolve_vertically(const SkConvolutionFilter1D::ConvolutionFixed* filt er_values, | |
922 int filter_length, | |
923 unsigned char* const* source_data_rows, | |
924 int pixel_width, | |
925 unsigned char* out_row, | |
926 bool has_alpha) { | |
927 if (has_alpha) { | |
928 convolveVertically<true>(filter_values, filter_length, source_data_r ows, | |
929 pixel_width, out_row); | |
930 } else { | |
931 convolveVertically<false>(filter_values, filter_length, source_data_ rows, | |
932 pixel_width, out_row); | |
933 } | |
934 } | |
935 | |
936 } // namespace SK_OPTS_NS | |
937 | |
938 #endif//SkBitmapFilter_opts_DEFINED | |
OLD | NEW |