Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4)

Side by Side Diff: src/opts/SkBitmapFilter_opts.h

Issue 2500113004: Port convolve functions to SkOpts (Closed)
Patch Set: Format Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/core/SkOpts.cpp ('k') | src/opts/SkBitmapFilter_opts_SSE2.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkBitmapFilter_opts_DEFINED
9 #define SkBitmapFilter_opts_DEFINED
10
11 #include "SkConvolver.h"
12
13 namespace SK_OPTS_NS {
14
15 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
16
17 #include <emmintrin.h>
mtklein_C 2016/11/16 14:24:47 This will #include the SSE intrinsics into the SK_
xiangze.zhang 2016/11/17 02:33:07 Done.
18
19 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
20 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) {
21 int remainder[4] = {0};
22 for (int i = 0; i < r; i++) {
23 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
24 remainder[0] += coeff * pixelsLeft[i * 4 + 0];
25 remainder[1] += coeff * pixelsLeft[i * 4 + 1];
26 remainder[2] += coeff * pixelsLeft[i * 4 + 2];
27 remainder[3] += coeff * pixelsLeft[i * 4 + 3];
28 }
29 __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], rem ainder[3]);
30 accum = _mm_add_epi32(accum, t);
31 }
32
33 // Convolves horizontally along a single row. The row data is given in
34 // |srcData| and continues for the numValues() of the filter.
35 void convolve_horizontally(const unsigned char* srcData,
36 const SkConvolutionFilter1D& filter,
37 unsigned char* outRow,
38 bool /*hasAlpha*/) {
39 // Output one pixel each iteration, calculating all channels (RGBA) toge ther.
40 int numValues = filter.numValues();
41 for (int outX = 0; outX < numValues; outX++) {
42 // Get the filter that determines the current output pixel.
43 int filterOffset, filterLength;
44 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
45 filter.FilterForValue(outX, &filterOffset, &filterLength);
46
47 // Compute the first pixel in this row that the filter affects. It w ill
48 // touch |filterLength| pixels (4 bytes each) after this.
49 const unsigned char* rowToFilter = &srcData[filterOffset * 4];
50
51 __m128i zero = _mm_setzero_si128();
52 __m128i accum = _mm_setzero_si128();
53
54 // We will load and accumulate with four coefficients per iteration.
55 for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
56 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
57 __m128i coeff, coeff16;
58 // [16] xx xx xx xx c3 c2 c1 c0
59 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV alues));
60 // [16] xx xx xx xx c1 c1 c0 c0
61 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
62 // [16] c1 c1 c1 c1 c0 c0 c0 c0
63 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
64
65 // Load four pixels => unpack the first two pixels to 16 bits =>
66 // multiply with coefficients => accumulate the convolution resu lt.
67 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
68 __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>( rowToFilter));
69 // [16] a1 b1 g1 r1 a0 b0 g0 r0
70 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
71 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
72 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
73 // [32] a0*c0 b0*c0 g0*c0 r0*c0
74 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
75 accum = _mm_add_epi32(accum, t);
76 // [32] a1*c1 b1*c1 g1*c1 r1*c1
77 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
78 accum = _mm_add_epi32(accum, t);
79
80 // Duplicate 3rd and 4th coefficients for all channels =>
81 // unpack the 3rd and 4th pixels to 16 bits => multiply with coe fficients
82 // => accumulate the convolution results.
83 // [16] xx xx xx xx c3 c3 c2 c2
84 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
85 // [16] c3 c3 c3 c3 c2 c2 c2 c2
86 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
87 // [16] a3 g3 b3 r3 a2 g2 b2 r2
88 src16 = _mm_unpackhi_epi8(src8, zero);
89 mul_hi = _mm_mulhi_epi16(src16, coeff16);
90 mul_lo = _mm_mullo_epi16(src16, coeff16);
91 // [32] a2*c2 b2*c2 g2*c2 r2*c2
92 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
93 accum = _mm_add_epi32(accum, t);
94 // [32] a3*c3 b3*c3 g3*c3 r3*c3
95 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
96 accum = _mm_add_epi32(accum, t);
97
98 // Advance the pixel and coefficients pointers.
99 rowToFilter += 16;
100 filterValues += 4;
101 }
102
103 // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3
104 // coefficients one at a time.
105 int r = filterLength & 3;
106 if (r) {
107 int remainderOffset = (filterOffset + filterLength - r) * 4;
108 AccumRemainder(srcData + remainderOffset, filterValues, accum, r );
109 }
110
111 // Shift right for fixed point implementation.
112 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
113
114 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ).
115 accum = _mm_packs_epi32(accum, zero);
116 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n).
117 accum = _mm_packus_epi16(accum, zero);
118
119 // Store the pixel value of 32 bits.
120 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum);
121 outRow += 4;
122 }
123 }
124
125 // Convolves horizontally along four rows. The row data is given in
126 // |srcData| and continues for the numValues() of the filter.
127 // The algorithm is almost same as |convolve_horizontally|. Please
128 // refer to that function for detailed comments.
129 void convolve_4_rows_horizontally(const unsigned char* srcData[4],
130 const SkConvolutionFilter1D& filter,
131 unsigned char* outRow[4],
132 size_t outRowBytes) {
133 SkDEBUGCODE(const unsigned char* out_row_0_start = outRow[0];)
134
135 // Output one pixel each iteration, calculating all channels (RGBA) toge ther.
136 int numValues = filter.numValues();
137 for (int outX = 0; outX < numValues; outX++) {
138 int filterOffset, filterLength;
139 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
140 filter.FilterForValue(outX, &filterOffset, &filterLength);
141
142 __m128i zero = _mm_setzero_si128();
143
144 // four pixels in a column per iteration.
145 __m128i accum0 = _mm_setzero_si128();
146 __m128i accum1 = _mm_setzero_si128();
147 __m128i accum2 = _mm_setzero_si128();
148 __m128i accum3 = _mm_setzero_si128();
149
150 int start = filterOffset * 4;
151 // We will load and accumulate with four coefficients per iteration.
152 for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
153 __m128i coeff, coeff16lo, coeff16hi;
154 // [16] xx xx xx xx c3 c2 c1 c0
155 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV alues));
156 // [16] xx xx xx xx c1 c1 c0 c0
157 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
158 // [16] c1 c1 c1 c1 c0 c0 c0 c0
159 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
160 // [16] xx xx xx xx c3 c3 c2 c2
161 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
162 // [16] c3 c3 c3 c3 c2 c2 c2 c2
163 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
164
165 __m128i src8, src16, mul_hi, mul_lo, t;
166
167 #define ITERATION(src, accum) \
168 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
169 src16 = _mm_unpacklo_epi8(src8, zero); \
170 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
171 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
172 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
173 accum = _mm_add_epi32(accum, t); \
174 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
175 accum = _mm_add_epi32(accum, t); \
176 src16 = _mm_unpackhi_epi8(src8, zero); \
177 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
178 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
179 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
180 accum = _mm_add_epi32(accum, t); \
181 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
182 accum = _mm_add_epi32(accum, t)
183
184 ITERATION(srcData[0] + start, accum0);
185 ITERATION(srcData[1] + start, accum1);
186 ITERATION(srcData[2] + start, accum2);
187 ITERATION(srcData[3] + start, accum3);
188
189 start += 16;
190 filterValues += 4;
191 }
192
193 int r = filterLength & 3;
194 if (r) {
195 int remainderOffset = (filterOffset + filterLength - r) * 4;
196 AccumRemainder(srcData[0] + remainderOffset, filterValues, accum 0, r);
197 AccumRemainder(srcData[1] + remainderOffset, filterValues, accum 1, r);
198 AccumRemainder(srcData[2] + remainderOffset, filterValues, accum 2, r);
199 AccumRemainder(srcData[3] + remainderOffset, filterValues, accum 3, r);
200 }
201
202 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
203 accum0 = _mm_packs_epi32(accum0, zero);
204 accum0 = _mm_packus_epi16(accum0, zero);
205 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
206 accum1 = _mm_packs_epi32(accum1, zero);
207 accum1 = _mm_packus_epi16(accum1, zero);
208 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
209 accum2 = _mm_packs_epi32(accum2, zero);
210 accum2 = _mm_packus_epi16(accum2, zero);
211 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
212 accum3 = _mm_packs_epi32(accum3, zero);
213 accum3 = _mm_packus_epi16(accum3, zero);
214
215 // We seem to be running off the edge here (chromium:491660).
216 SkASSERT(((size_t)outRow[0] - (size_t)out_row_0_start) < outRowBytes );
217
218 *(reinterpret_cast<int*>(outRow[0])) = _mm_cvtsi128_si32(accum0);
219 *(reinterpret_cast<int*>(outRow[1])) = _mm_cvtsi128_si32(accum1);
220 *(reinterpret_cast<int*>(outRow[2])) = _mm_cvtsi128_si32(accum2);
221 *(reinterpret_cast<int*>(outRow[3])) = _mm_cvtsi128_si32(accum3);
222
223 outRow[0] += 4;
224 outRow[1] += 4;
225 outRow[2] += 4;
226 outRow[3] += 4;
227 }
228 }
229
230 // Does vertical convolution to produce one output row. The filter values an d
231 // length are given in the first two parameters. These are applied to each
232 // of the rows pointed to in the |sourceDataRows| array, with each row
233 // being |pixelWidth| wide.
234 //
235 // The output must have room for |pixelWidth * 4| bytes.
236 template<bool hasAlpha>
237 void convolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues,
238 int filterLength,
239 unsigned char* const* sourceDataRows,
240 int pixelWidth,
241 unsigned char* outRow) {
242 // Output four pixels per iteration (16 bytes).
243 int width = pixelWidth & ~3;
244 __m128i zero = _mm_setzero_si128();
245 for (int outX = 0; outX < width; outX += 4) {
246 // Accumulated result for each pixel. 32 bits per RGBA channel.
247 __m128i accum0 = _mm_setzero_si128();
248 __m128i accum1 = _mm_setzero_si128();
249 __m128i accum2 = _mm_setzero_si128();
250 __m128i accum3 = _mm_setzero_si128();
251
252 // Convolve with one filter coefficient per iteration.
253 for (int filterY = 0; filterY < filterLength; filterY++) {
254
255 // Duplicate the filter coefficient 8 times.
256 // [16] cj cj cj cj cj cj cj cj
257 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
258
259 // Load four pixels (16 bytes) together.
260 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
261 const __m128i* src = reinterpret_cast<const __m128i*>(
262 &sourceDataRows[filterY][outX << 2]);
263 __m128i src8 = _mm_loadu_si128(src);
264
265 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each cha nnels =>
266 // multiply with current coefficient => accumulate the result.
267 // [16] a1 b1 g1 r1 a0 b0 g0 r0
268 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
269 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
270 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
271 // [32] a0 b0 g0 r0
272 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
273 accum0 = _mm_add_epi32(accum0, t);
274 // [32] a1 b1 g1 r1
275 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
276 accum1 = _mm_add_epi32(accum1, t);
277
278 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each cha nnels =>
279 // multiply with current coefficient => accumulate the result.
280 // [16] a3 b3 g3 r3 a2 b2 g2 r2
281 src16 = _mm_unpackhi_epi8(src8, zero);
282 mul_hi = _mm_mulhi_epi16(src16, coeff16);
283 mul_lo = _mm_mullo_epi16(src16, coeff16);
284 // [32] a2 b2 g2 r2
285 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
286 accum2 = _mm_add_epi32(accum2, t);
287 // [32] a3 b3 g3 r3
288 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
289 accum3 = _mm_add_epi32(accum3, t);
290 }
291
292 // Shift right for fixed point implementation.
293 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
294 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
295 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
296 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
297
298 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ).
299 // [16] a1 b1 g1 r1 a0 b0 g0 r0
300 accum0 = _mm_packs_epi32(accum0, accum1);
301 // [16] a3 b3 g3 r3 a2 b2 g2 r2
302 accum2 = _mm_packs_epi32(accum2, accum3);
303
304 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n).
305 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
306 accum0 = _mm_packus_epi16(accum0, accum2);
307
308 if (hasAlpha) {
309 // Compute the max(ri, gi, bi) for each pixel.
310 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
311 __m128i a = _mm_srli_epi32(accum0, 8);
312 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
313 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
314 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
315 a = _mm_srli_epi32(accum0, 16);
316 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
317 b = _mm_max_epu8(a, b); // Max of r and g and b.
318 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
319 b = _mm_slli_epi32(b, 24);
320
321 // Make sure the value of alpha channel is always larger than ma ximum
322 // value of color channels.
323 accum0 = _mm_max_epu8(b, accum0);
324 } else {
325 // Set value of alpha channels to 0xFF.
326 __m128i mask = _mm_set1_epi32(0xff000000);
327 accum0 = _mm_or_si128(accum0, mask);
328 }
329
330 // Store the convolution result (16 bytes) and advance the pixel poi nters.
331 _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0);
332 outRow += 16;
333 }
334
335 // When the width of the output is not divisible by 4, We need to save o ne
336 // pixel (4 bytes) each time. And also the fourth pixel is always absent .
337 int r = pixelWidth & 3;
338 if (r) {
339 __m128i accum0 = _mm_setzero_si128();
340 __m128i accum1 = _mm_setzero_si128();
341 __m128i accum2 = _mm_setzero_si128();
342 for (int filterY = 0; filterY < filterLength; ++filterY) {
343 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
344 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
345 const __m128i* src = reinterpret_cast<const __m128i*>(
346 &sourceDataRows[filterY][width << 2]);
347 __m128i src8 = _mm_loadu_si128(src);
348 // [16] a1 b1 g1 r1 a0 b0 g0 r0
349 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
350 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
351 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
352 // [32] a0 b0 g0 r0
353 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
354 accum0 = _mm_add_epi32(accum0, t);
355 // [32] a1 b1 g1 r1
356 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
357 accum1 = _mm_add_epi32(accum1, t);
358 // [16] a3 b3 g3 r3 a2 b2 g2 r2
359 src16 = _mm_unpackhi_epi8(src8, zero);
360 mul_hi = _mm_mulhi_epi16(src16, coeff16);
361 mul_lo = _mm_mullo_epi16(src16, coeff16);
362 // [32] a2 b2 g2 r2
363 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
364 accum2 = _mm_add_epi32(accum2, t);
365 }
366
367 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
368 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
369 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
370 // [16] a1 b1 g1 r1 a0 b0 g0 r0
371 accum0 = _mm_packs_epi32(accum0, accum1);
372 // [16] a3 b3 g3 r3 a2 b2 g2 r2
373 accum2 = _mm_packs_epi32(accum2, zero);
374 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
375 accum0 = _mm_packus_epi16(accum0, accum2);
376 if (hasAlpha) {
377 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
378 __m128i a = _mm_srli_epi32(accum0, 8);
379 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
380 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
381 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
382 a = _mm_srli_epi32(accum0, 16);
383 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
384 b = _mm_max_epu8(a, b); // Max of r and g and b.
385 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
386 b = _mm_slli_epi32(b, 24);
387 accum0 = _mm_max_epu8(b, accum0);
388 } else {
389 __m128i mask = _mm_set1_epi32(0xff000000);
390 accum0 = _mm_or_si128(accum0, mask);
391 }
392
393 for (int i = 0; i < r; i++) {
394 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0);
395 accum0 = _mm_srli_si128(accum0, 4);
396 outRow += 4;
397 }
398 }
399 }
400
401 #elif defined(SK_ARM_HAS_NEON)
402
403 #include <arm_neon.h>
mtklein_C 2016/11/16 14:24:47 Same deal with emmintrin. It's probably best to p
xiangze.zhang 2016/11/17 02:33:07 Done.
404
// Scalar tail shared by the horizontal convolvers. Multiplies each of the
// first |r| coefficients in |filterValues| by the matching RGBA pixel
// (4 bytes each) starting at |pixelsLeft|, and adds the four per-channel sums
// to the corresponding 32-bit lanes of |accum|.
static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues, int32x4_t& accum, int r) {
    int sums[4] = {0, 0, 0, 0};
    for (int tap = 0; tap < r; ++tap) {
        const SkConvolutionFilter1D::ConvolutionFixed weight = filterValues[tap];
        const unsigned char* pixel = pixelsLeft + tap * 4;
        for (int channel = 0; channel < 4; ++channel) {
            sums[channel] += weight * pixel[channel];
        }
    }
    // Lane order matches the byte order of a pixel: lane 0 is byte 0, and so on.
    const int32x4_t tail = {sums[0], sums[1], sums[2], sums[3]};
    accum = vaddq_s32(accum, tail);
}
418
// Convolves horizontally along a single row of 4-byte pixels. The row data is
// given in |srcData|; one output pixel is written to |outRow| for each of the
// numValues() entries of |filter|. The alpha flag is accepted but not used.
void convolve_horizontally(const unsigned char* srcData,
                           const SkConvolutionFilter1D& filter,
                           unsigned char* outRow,
                           bool /*hasAlpha*/) {
    // Loop over each pixel on this row in the output image.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        // Get the filter that determines the current output pixel.
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // Compute the first pixel in this row that the filter affects. It will
        // touch |filterLength| pixels (4 bytes each) after this.
        const unsigned char* rowToFilter = &srcData[filterOffset * 4];

        // Apply the filter to the row to get the destination pixel in |accum|.
        int32x4_t accum = vdupq_n_s32(0);
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            // Load 4 coefficients: c0 c1 c2 c3 (one per 16-bit lane).
            const int16x4_t coeffs = vld1_s16(filterValues);

            // Load 4 pixels and widen every channel from 8 to 16 bits.
            const uint8x16_t pixels = vld1q_u8(rowToFilter);
            const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            // accum += pixel_i * c_i. The by-lane multiply-accumulate applies
            // coefficient lane i to all four channels of pixel i, so the
            // coefficient does not have to be broadcast into its own vector.
            accum = vmlal_lane_s16(accum, vget_low_s16(p01_16), coeffs, 0);
            accum = vmlal_lane_s16(accum, vget_high_s16(p01_16), coeffs, 1);
            accum = vmlal_lane_s16(accum, vget_low_s16(p23_16), coeffs, 2);
            accum = vmlal_lane_s16(accum, vget_high_s16(p23_16), coeffs, 3);

            // Advance the pointers
            rowToFilter += 16;
            filterValues += 4;
        }

        // The last 1 - 3 coefficients are accumulated one at a time.
        int r = filterLength & 3;
        if (r) {
            int remainder_offset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData + remainder_offset, filterValues, accum, r);
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of fractional part.
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

        // Pack (32 -> 16 signed saturating, 16 -> 8 unsigned saturating) and
        // store the new pixel from the low 32 bits.
        int16x4_t accum16 = vqmovn_s32(accum);
        uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
        outRow += 4;
    }
}
494
// Helper for convolve_4_rows_horizontally: loads four consecutive 4-byte
// pixels from |src|, multiplies every channel of pixel i by lane i of
// |coeffs|, and adds the 32-bit products to |accum|.
static SK_ALWAYS_INLINE void AccumFourPixels(const unsigned char* src,
                                             int16x4_t coeffs,
                                             int32x4_t& accum) {
    const uint8x16_t pixels = vld1q_u8(src);
    // Widen every channel from 8 to 16 bits.
    const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
    const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
    accum = vmlal_lane_s16(accum, vget_low_s16(p01_16), coeffs, 0);
    accum = vmlal_lane_s16(accum, vget_high_s16(p01_16), coeffs, 1);
    accum = vmlal_lane_s16(accum, vget_low_s16(p23_16), coeffs, 2);
    accum = vmlal_lane_s16(accum, vget_high_s16(p23_16), coeffs, 3);
}

// Helper for convolve_4_rows_horizontally: shifts out the fixed-point fraction
// of |accum| and saturates it to 8 bits per channel. The finished pixel is in
// the low 32 bits of the result (the high half repeats it).
static SK_ALWAYS_INLINE uint8x8_t ShiftAndPackPixel(int32x4_t accum) {
    accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);
    const int16x4_t accum16 = vqmovn_s32(accum);
    return vqmovun_s16(vcombine_s16(accum16, accum16));
}

// Convolves horizontally along four rows at once. |srcData| holds the four
// source rows and |outRow| the four destination rows; each destination pointer
// is advanced by 4 bytes per output pixel, for numValues() pixels of |filter|.
// |outRowBytes| is not used by this implementation.
// The per-row arithmetic is the same as in |convolve_horizontally|.
void convolve_4_rows_horizontally(const unsigned char* srcData[4],
                                  const SkConvolutionFilter1D& filter,
                                  unsigned char* outRow[4],
                                  size_t outRowBytes) {
    // Output one pixel per row each iteration, calculating all channels (RGBA) together.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // One accumulator per row: four pixels in a column per iteration.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        // Byte offset into every source row of the first pixel the filter touches.
        int start = filterOffset * 4;

        // We will load and accumulate with four coefficients per iteration.
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            const int16x4_t coeffs = vld1_s16(filterValues);

            AccumFourPixels(srcData[0] + start, coeffs, accum0);
            AccumFourPixels(srcData[1] + start, coeffs, accum1);
            AccumFourPixels(srcData[2] + start, coeffs, accum2);
            AccumFourPixels(srcData[3] + start, coeffs, accum3);

            start += 16;
            filterValues += 4;
        }

        // The last 1 - 3 coefficients are accumulated one at a time.
        int r = filterLength & 3;
        if (r) {
            int remainder_offset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData[0] + remainder_offset, filterValues, accum0, r);
            AccumRemainder(srcData[1] + remainder_offset, filterValues, accum1, r);
            AccumRemainder(srcData[2] + remainder_offset, filterValues, accum2, r);
            AccumRemainder(srcData[3] + remainder_offset, filterValues, accum3, r);
        }

        const uint8x8_t res0 = ShiftAndPackPixel(accum0);
        const uint8x8_t res1 = ShiftAndPackPixel(accum1);
        const uint8x8_t res2 = ShiftAndPackPixel(accum2);
        const uint8x8_t res3 = ShiftAndPackPixel(accum3);

        // Store one 32-bit pixel per row.
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
        outRow[0] += 4;
        outRow[1] += 4;
        outRow[2] += 4;
        outRow[3] += 4;
    }
}
592
593
// Vertical convolution producing one output row. |filterValues| holds
// |filterLength| coefficients; coefficient i is applied to the row
// sourceDataRows[i]. Each row is |pixelWidth| pixels of 4 bytes.
//
// |outRow| must have room for |pixelWidth * 4| bytes.
//
// With hasAlpha the alpha byte of every output pixel is raised to at least the
// largest of its three colour bytes; without it alpha is forced to 0xFF.
template<bool hasAlpha>
void convolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                        int filterLength,
                        unsigned char* const* sourceDataRows,
                        int pixelWidth,
                        unsigned char* outRow) {
    // Whole groups of four pixels (16 bytes) are handled first.
    const int alignedWidth = pixelWidth & ~3;
    for (int x = 0; x < alignedWidth; x += 4) {
        // One accumulator per pixel, 32 bits per channel.
        int32x4_t px0 = vdupq_n_s32(0);
        int32x4_t px1 = vdupq_n_s32(0);
        int32x4_t px2 = vdupq_n_s32(0);
        int32x4_t px3 = vdupq_n_s32(0);

        // One source row (and one coefficient) per pass.
        for (int row = 0; row < filterLength; ++row) {
            // [16] cj cj cj cj
            const int16x4_t weight = vdup_n_s16(filterValues[row]);

            // [8]  a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            const uint8x16_t rgba4 = vld1q_u8(sourceDataRows[row] + (x << 2));

            // Widen every channel from 8 to 16 bits.
            const int16x8_t wide01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba4)));
            const int16x8_t wide23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(rgba4)));

            px0 = vaddq_s32(px0, vmull_s16(vget_low_s16(wide01), weight));
            px1 = vaddq_s32(px1, vmull_s16(vget_high_s16(wide01), weight));
            px2 = vaddq_s32(px2, vmull_s16(vget_low_s16(wide23), weight));
            px3 = vaddq_s32(px3, vmull_s16(vget_high_s16(wide23), weight));
        }

        // Drop the fixed-point fraction bits.
        px0 = vshrq_n_s32(px0, SkConvolutionFilter1D::kShiftBits);
        px1 = vshrq_n_s32(px1, SkConvolutionFilter1D::kShiftBits);
        px2 = vshrq_n_s32(px2, SkConvolutionFilter1D::kShiftBits);
        px3 = vshrq_n_s32(px3, SkConvolutionFilter1D::kShiftBits);

        // 32 -> 16 bits per channel with signed saturation.
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        const int16x8_t packed01 = vcombine_s16(vqmovn_s32(px0), vqmovn_s32(px1));
        // [16] a3 b3 g3 r3 a2 b2 g2 r2
        const int16x8_t packed23 = vcombine_s16(vqmovn_s32(px2), vqmovn_s32(px3));

        // 16 -> 8 bits per channel with unsigned saturation.
        // [8]  a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        uint8x16_t rgba = vcombine_u8(vqmovun_s16(packed01), vqmovun_s16(packed23));

        if (hasAlpha) {
            // Bring g, then b, down to the lowest byte of each pixel so a
            // byte-wise max leaves max(r, g, b) there.
            const uint32x4_t words = vreinterpretq_u32_u8(rgba);
            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
            const uint8x16_t gDown = vreinterpretq_u8_u32(vshrq_n_u32(words, 8));
            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
            const uint8x16_t bDown = vreinterpretq_u8_u32(vshrq_n_u32(words, 16));
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            uint8x16_t maxRgb = vmaxq_u8(bDown, vmaxq_u8(gDown, rgba));
            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
            maxRgb = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(maxRgb), 24));

            // Alpha must never be smaller than the largest colour channel.
            rgba = vmaxq_u8(maxRgb, rgba);
        } else {
            // Opaque output: force every alpha byte to 0xFF.
            rgba = vreinterpretq_u8_u32(
                    vorrq_u32(vreinterpretq_u32_u8(rgba), vdupq_n_u32(0xFF000000)));
        }

        // Write the four finished pixels and move on.
        vst1q_u8(outRow, rgba);
        outRow += 16;
    }

    // Tail: at most 3 pixels remain when |pixelWidth| is not a multiple of 4.
    const int leftover = pixelWidth & 3;
    if (leftover) {
        int32x4_t px0 = vdupq_n_s32(0);
        int32x4_t px1 = vdupq_n_s32(0);
        int32x4_t px2 = vdupq_n_s32(0);

        for (int row = 0; row < filterLength; ++row) {
            const int16x4_t weight = vdup_n_s16(filterValues[row]);

            // NOTE(review): this still loads a full 16 bytes, so it reads past
            // the remaining pixels of the row -- presumably source rows are
            // padded by the caller; confirm.
            // [8]  a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            const uint8x16_t rgba4 = vld1q_u8(sourceDataRows[row] + (alignedWidth << 2));

            const int16x8_t wide01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba4)));
            const int16x8_t wide23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(rgba4)));

            px0 = vaddq_s32(px0, vmull_s16(vget_low_s16(wide01), weight));
            px1 = vaddq_s32(px1, vmull_s16(vget_high_s16(wide01), weight));
            px2 = vaddq_s32(px2, vmull_s16(vget_low_s16(wide23), weight));
        }

        px0 = vshrq_n_s32(px0, SkConvolutionFilter1D::kShiftBits);
        px1 = vshrq_n_s32(px1, SkConvolutionFilter1D::kShiftBits);
        px2 = vshrq_n_s32(px2, SkConvolutionFilter1D::kShiftBits);

        const int16x8_t packed01 = vcombine_s16(vqmovn_s32(px0), vqmovn_s32(px1));
        // Pixel 2 fills both halves; the upper copy is never stored.
        const int16x8_t packed22 = vcombine_s16(vqmovn_s32(px2), vqmovn_s32(px2));

        uint8x16_t rgba = vcombine_u8(vqmovun_s16(packed01), vqmovun_s16(packed22));

        if (hasAlpha) {
            // Same alpha fix-up as in the main loop.
            const uint32x4_t words = vreinterpretq_u32_u8(rgba);
            const uint8x16_t gDown = vreinterpretq_u8_u32(vshrq_n_u32(words, 8));
            const uint8x16_t bDown = vreinterpretq_u8_u32(vshrq_n_u32(words, 16));
            uint8x16_t maxRgb = vmaxq_u8(bDown, vmaxq_u8(gDown, rgba));
            maxRgb = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(maxRgb), 24));
            rgba = vmaxq_u8(maxRgb, rgba);
        } else {
            // Set value of alpha channels to 0xFF.
            rgba = vreinterpretq_u8_u32(
                    vorrq_u32(vreinterpretq_u32_u8(rgba), vdupq_n_u32(0xFF000000)));
        }

        // Store exactly |leftover| pixels: one lane for a single pixel,
        // otherwise the low 8 bytes (two pixels) plus lane 2 when there are three.
        if (leftover == 1) {
            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u32_u8(rgba), 0);
        } else {
            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
                     vreinterpret_u32_u8(vget_low_u8(rgba)));
            if (leftover == 3) {
                vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow + 8),
                               vreinterpretq_u32_u8(rgba), 2);
            }
        }
    }
}
755
756 #else
757
// Converts the argument to an 8-bit unsigned value by clamping to the range
// 0-255: negative inputs yield 0, inputs above 255 yield 255, and in-range
// inputs are returned unchanged.
inline unsigned char ClampTo8(int a) {
    // A single unsigned comparison handles the common in-range case: negative
    // values convert to large unsigned numbers and so fail this test as well.
    if (static_cast<unsigned>(a) < 256) {
        return static_cast<unsigned char>(a);  // Avoid the extra check in the common case.
    }
    // Out of range: saturate towards whichever end was exceeded.
    return a < 0 ? 0 : 255;
}
769
770 // Convolves horizontally along a single row. The row data is given in
771 // |srcData| and continues for the numValues() of the filter.
772 template<bool hasAlpha>
773 void ConvolveHorizontally(const unsigned char* srcData,
774 const SkConvolutionFilter1D& filter,
775 unsigned char* outRow) {
776 // Loop over each pixel on this row in the output image.
777 int numValues = filter.numValues();
778 for (int outX = 0; outX < numValues; outX++) {
779 // Get the filter that determines the current output pixel.
780 int filterOffset, filterLength;
781 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
782 filter.FilterForValue(outX, &filterOffset, &filterLength);
783
784 // Compute the first pixel in this row that the filter affects. It w ill
785 // touch |filterLength| pixels (4 bytes each) after this.
786 const unsigned char* rowToFilter = &srcData[filterOffset * 4];
787
788 // Apply the filter to the row to get the destination pixel in |accu m|.
789 int accum[4] = {0};
790 for (int filterX = 0; filterX < filterLength; filterX++) {
791 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues [filterX];
792 accum[0] += curFilter * rowToFilter[filterX * 4 + 0];
793 accum[1] += curFilter * rowToFilter[filterX * 4 + 1];
794 accum[2] += curFilter * rowToFilter[filterX * 4 + 2];
795 if (hasAlpha) {
796 accum[3] += curFilter * rowToFilter[filterX * 4 + 3];
797 }
798 }
799
800 // Bring this value back in range. All of the filter scaling factors
801 // are in fixed point with kShiftBits bits of fractional part.
802 accum[0] >>= SkConvolutionFilter1D::kShiftBits;
803 accum[1] >>= SkConvolutionFilter1D::kShiftBits;
804 accum[2] >>= SkConvolutionFilter1D::kShiftBits;
805 if (hasAlpha) {
806 accum[3] >>= SkConvolutionFilter1D::kShiftBits;
807 }
808
809 // Store the new pixel.
810 outRow[outX * 4 + 0] = ClampTo8(accum[0]);
811 outRow[outX * 4 + 1] = ClampTo8(accum[1]);
812 outRow[outX * 4 + 2] = ClampTo8(accum[2]);
813 if (hasAlpha) {
814 outRow[outX * 4 + 3] = ClampTo8(accum[3]);
815 }
816 }
817 }
818
819 // Does vertical convolution to produce one output row. The filter values an d
820 // length are given in the first two parameters. These are applied to each
821 // of the rows pointed to in the |sourceDataRows| array, with each row
822 // being |pixelWidth| wide.
823 //
824 // The output must have room for |pixelWidth * 4| bytes.
825 template<bool hasAlpha>
826 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues,
827 int filterLength,
828 unsigned char* const* sourceDataRows,
829 int pixelWidth,
830 unsigned char* outRow) {
831 // We go through each column in the output and do a vertical convolution ,
832 // generating one output pixel each time.
833 for (int outX = 0; outX < pixelWidth; outX++) {
834 // Compute the number of bytes over in each row that the current col umn
835 // we're convolving starts at. The pixel will cover the next 4 bytes .
836 int byteOffset = outX * 4;
837
838 // Apply the filter to one column of pixels.
839 int accum[4] = {0};
840 for (int filterY = 0; filterY < filterLength; filterY++) {
841 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues [filterY];
842 accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0];
843 accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1];
844 accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2];
845 if (hasAlpha) {
846 accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3];
847 }
848 }
849
850 // Bring this value back in range. All of the filter scaling factors
851 // are in fixed point with kShiftBits bits of precision.
852 accum[0] >>= SkConvolutionFilter1D::kShiftBits;
853 accum[1] >>= SkConvolutionFilter1D::kShiftBits;
854 accum[2] >>= SkConvolutionFilter1D::kShiftBits;
855 if (hasAlpha) {
856 accum[3] >>= SkConvolutionFilter1D::kShiftBits;
857 }
858
859 // Store the new pixel.
860 outRow[byteOffset + 0] = ClampTo8(accum[0]);
861 outRow[byteOffset + 1] = ClampTo8(accum[1]);
862 outRow[byteOffset + 2] = ClampTo8(accum[2]);
863 if (hasAlpha) {
864 unsigned char alpha = ClampTo8(accum[3]);
865
866 // Make sure the alpha channel doesn't come out smaller than any of the
867 // color channels. We use premultipled alpha channels, so this s hould
868 // never happen, but rounding errors will cause this from time t o time.
869 // These "impossible" colors will cause overflows (and hence ran dom pixel
870 // values) when the resulting bitmap is drawn to the screen.
871 //
872 // We only need to do this when generating the final output row (here).
873 int maxColorChannel = SkTMax(outRow[byteOffset + 0],
874 SkTMax(outRow[byteOffset + 1],
875 outRow[byteOffset + 2]));
876 if (alpha < maxColorChannel) {
877 outRow[byteOffset + 3] = maxColorChannel;
878 } else {
879 outRow[byteOffset + 3] = alpha;
880 }
881 } else {
882 // No alpha channel, the image is opaque.
883 outRow[byteOffset + 3] = 0xff;
884 }
885 }
886 }
887
888 // There's a bug somewhere here with GCC autovectorization (-ftree-vectorize ). We originally
889 // thought this was 32 bit only, but subsequent tests show that some 64 bit gcc compiles
890 // suffer here too.
891 //
892 // Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. http s://bug.skia.org/2575
893 #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE)
894 #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), no inline))
895 #else
896 #define SK_MAYBE_DISABLE_VECTORIZATION
897 #endif
898
899 SK_MAYBE_DISABLE_VECTORIZATION
900 void convolve_horizontally(const unsigned char* srcData,
901 const SkConvolutionFilter1D& filter,
902 unsigned char* outRow,
903 bool has_alpha) {
904 if (has_alpha) {
905 ConvolveHorizontally<true>(srcData, filter, outRow);
906 } else {
907 ConvolveHorizontally<false>(srcData, filter, outRow);
908 }
909 }
910 #undef SK_MAYBE_DISABLE_VECTORIZATION
911
912 void (*convolve_4_rows_horizontally)(const unsigned char* src_data[4],
mtklein_C 2016/11/16 14:24:47 Do you think it'd make the calling code clearer to
xiangze.zhang 2016/11/17 02:33:07 The calling code can check and allocate a smaller
913 const SkConvolutionFilter1D& filter,
914 unsigned char* out_row[4],
915 size_t out_row_bytes)
916 = nullptr;
917
918
919 #endif
920
921 void convolve_vertically(const SkConvolutionFilter1D::ConvolutionFixed* filt er_values,
922 int filter_length,
923 unsigned char* const* source_data_rows,
924 int pixel_width,
925 unsigned char* out_row,
926 bool has_alpha) {
927 if (has_alpha) {
928 convolveVertically<true>(filter_values, filter_length, source_data_r ows,
929 pixel_width, out_row);
930 } else {
931 convolveVertically<false>(filter_values, filter_length, source_data_ rows,
932 pixel_width, out_row);
933 }
934 }
935
936 } // namespace SK_OPTS_NS
937
938 #endif//SkBitmapFilter_opts_DEFINED
OLDNEW
« no previous file with comments | « src/core/SkOpts.cpp ('k') | src/opts/SkBitmapFilter_opts_SSE2.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698