Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(14)

Side by Side Diff: src/opts/SkBitmapFilter_opts.h

Issue 2500113004: Port convolve functions to SkOpts (Closed)
Patch Set: Fix typo Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/core/SkOpts.cpp ('k') | src/opts/SkBitmapFilter_opts_SSE2.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkBitmapFilter_opts_DEFINED
9 #define SkBitmapFilter_opts_DEFINED
10
11 #include "SkConvolver.h"
12
13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
14 #include <emmintrin.h>
15 #elif defined(SK_ARM_HAS_NEON)
16 #include <arm_neon.h>
17 #endif
18
19 namespace SK_OPTS_NS {
20
21 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
22
23 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
24 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) {
25 int remainder[4] = {0};
26 for (int i = 0; i < r; i++) {
27 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
28 remainder[0] += coeff * pixelsLeft[i * 4 + 0];
29 remainder[1] += coeff * pixelsLeft[i * 4 + 1];
30 remainder[2] += coeff * pixelsLeft[i * 4 + 2];
31 remainder[3] += coeff * pixelsLeft[i * 4 + 3];
32 }
33 __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], rem ainder[3]);
34 accum = _mm_add_epi32(accum, t);
35 }
36
37 // Convolves horizontally along a single row. The row data is given in
38 // |srcData| and continues for the numValues() of the filter.
39 void convolve_horizontally(const unsigned char* srcData,
40 const SkConvolutionFilter1D& filter,
41 unsigned char* outRow,
42 bool /*hasAlpha*/) {
43 // Output one pixel each iteration, calculating all channels (RGBA) toge ther.
44 int numValues = filter.numValues();
45 for (int outX = 0; outX < numValues; outX++) {
46 // Get the filter that determines the current output pixel.
47 int filterOffset, filterLength;
48 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
49 filter.FilterForValue(outX, &filterOffset, &filterLength);
50
51 // Compute the first pixel in this row that the filter affects. It w ill
52 // touch |filterLength| pixels (4 bytes each) after this.
53 const unsigned char* rowToFilter = &srcData[filterOffset * 4];
54
55 __m128i zero = _mm_setzero_si128();
56 __m128i accum = _mm_setzero_si128();
57
58 // We will load and accumulate with four coefficients per iteration.
59 for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
60 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
61 __m128i coeff, coeff16;
62 // [16] xx xx xx xx c3 c2 c1 c0
63 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV alues));
64 // [16] xx xx xx xx c1 c1 c0 c0
65 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
66 // [16] c1 c1 c1 c1 c0 c0 c0 c0
67 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
68
69 // Load four pixels => unpack the first two pixels to 16 bits =>
70 // multiply with coefficients => accumulate the convolution resu lt.
71 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
72 __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>( rowToFilter));
73 // [16] a1 b1 g1 r1 a0 b0 g0 r0
74 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
75 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
76 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
77 // [32] a0*c0 b0*c0 g0*c0 r0*c0
78 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
79 accum = _mm_add_epi32(accum, t);
80 // [32] a1*c1 b1*c1 g1*c1 r1*c1
81 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
82 accum = _mm_add_epi32(accum, t);
83
84 // Duplicate 3rd and 4th coefficients for all channels =>
85 // unpack the 3rd and 4th pixels to 16 bits => multiply with coe fficients
86 // => accumulate the convolution results.
87 // [16] xx xx xx xx c3 c3 c2 c2
88 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
89 // [16] c3 c3 c3 c3 c2 c2 c2 c2
90 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
91 // [16] a3 g3 b3 r3 a2 g2 b2 r2
92 src16 = _mm_unpackhi_epi8(src8, zero);
93 mul_hi = _mm_mulhi_epi16(src16, coeff16);
94 mul_lo = _mm_mullo_epi16(src16, coeff16);
95 // [32] a2*c2 b2*c2 g2*c2 r2*c2
96 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
97 accum = _mm_add_epi32(accum, t);
98 // [32] a3*c3 b3*c3 g3*c3 r3*c3
99 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
100 accum = _mm_add_epi32(accum, t);
101
102 // Advance the pixel and coefficients pointers.
103 rowToFilter += 16;
104 filterValues += 4;
105 }
106
107 // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3
108 // coefficients one at a time.
109 int r = filterLength & 3;
110 if (r) {
111 int remainderOffset = (filterOffset + filterLength - r) * 4;
112 AccumRemainder(srcData + remainderOffset, filterValues, accum, r );
113 }
114
115 // Shift right for fixed point implementation.
116 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
117
118 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ).
119 accum = _mm_packs_epi32(accum, zero);
120 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n).
121 accum = _mm_packus_epi16(accum, zero);
122
123 // Store the pixel value of 32 bits.
124 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum);
125 outRow += 4;
126 }
127 }
128
129 // Convolves horizontally along four rows. The row data is given in
130 // |srcData| and continues for the numValues() of the filter.
131 // The algorithm is almost same as |convolve_horizontally|. Please
132 // refer to that function for detailed comments.
133 void convolve_4_rows_horizontally(const unsigned char* srcData[4],
134 const SkConvolutionFilter1D& filter,
135 unsigned char* outRow[4],
136 size_t outRowBytes) {
137 SkDEBUGCODE(const unsigned char* out_row_0_start = outRow[0];)
138
139 // Output one pixel each iteration, calculating all channels (RGBA) toge ther.
140 int numValues = filter.numValues();
141 for (int outX = 0; outX < numValues; outX++) {
142 int filterOffset, filterLength;
143 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
144 filter.FilterForValue(outX, &filterOffset, &filterLength);
145
146 __m128i zero = _mm_setzero_si128();
147
148 // four pixels in a column per iteration.
149 __m128i accum0 = _mm_setzero_si128();
150 __m128i accum1 = _mm_setzero_si128();
151 __m128i accum2 = _mm_setzero_si128();
152 __m128i accum3 = _mm_setzero_si128();
153
154 int start = filterOffset * 4;
155 // We will load and accumulate with four coefficients per iteration.
156 for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
157 __m128i coeff, coeff16lo, coeff16hi;
158 // [16] xx xx xx xx c3 c2 c1 c0
159 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV alues));
160 // [16] xx xx xx xx c1 c1 c0 c0
161 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
162 // [16] c1 c1 c1 c1 c0 c0 c0 c0
163 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
164 // [16] xx xx xx xx c3 c3 c2 c2
165 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
166 // [16] c3 c3 c3 c3 c2 c2 c2 c2
167 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
168
169 __m128i src8, src16, mul_hi, mul_lo, t;
170
171 #define ITERATION(src, accum) \
172 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
173 src16 = _mm_unpacklo_epi8(src8, zero); \
174 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
175 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
176 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
177 accum = _mm_add_epi32(accum, t); \
178 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
179 accum = _mm_add_epi32(accum, t); \
180 src16 = _mm_unpackhi_epi8(src8, zero); \
181 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
182 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
183 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
184 accum = _mm_add_epi32(accum, t); \
185 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
186 accum = _mm_add_epi32(accum, t)
187
188 ITERATION(srcData[0] + start, accum0);
189 ITERATION(srcData[1] + start, accum1);
190 ITERATION(srcData[2] + start, accum2);
191 ITERATION(srcData[3] + start, accum3);
192
193 start += 16;
194 filterValues += 4;
195 }
196
197 int r = filterLength & 3;
198 if (r) {
199 int remainderOffset = (filterOffset + filterLength - r) * 4;
200 AccumRemainder(srcData[0] + remainderOffset, filterValues, accum 0, r);
201 AccumRemainder(srcData[1] + remainderOffset, filterValues, accum 1, r);
202 AccumRemainder(srcData[2] + remainderOffset, filterValues, accum 2, r);
203 AccumRemainder(srcData[3] + remainderOffset, filterValues, accum 3, r);
204 }
205
206 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
207 accum0 = _mm_packs_epi32(accum0, zero);
208 accum0 = _mm_packus_epi16(accum0, zero);
209 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
210 accum1 = _mm_packs_epi32(accum1, zero);
211 accum1 = _mm_packus_epi16(accum1, zero);
212 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
213 accum2 = _mm_packs_epi32(accum2, zero);
214 accum2 = _mm_packus_epi16(accum2, zero);
215 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
216 accum3 = _mm_packs_epi32(accum3, zero);
217 accum3 = _mm_packus_epi16(accum3, zero);
218
219 // We seem to be running off the edge here (chromium:491660).
220 SkASSERT(((size_t)outRow[0] - (size_t)out_row_0_start) < outRowBytes );
221
222 *(reinterpret_cast<int*>(outRow[0])) = _mm_cvtsi128_si32(accum0);
223 *(reinterpret_cast<int*>(outRow[1])) = _mm_cvtsi128_si32(accum1);
224 *(reinterpret_cast<int*>(outRow[2])) = _mm_cvtsi128_si32(accum2);
225 *(reinterpret_cast<int*>(outRow[3])) = _mm_cvtsi128_si32(accum3);
226
227 outRow[0] += 4;
228 outRow[1] += 4;
229 outRow[2] += 4;
230 outRow[3] += 4;
231 }
232 }
233
234 // Does vertical convolution to produce one output row. The filter values an d
235 // length are given in the first two parameters. These are applied to each
236 // of the rows pointed to in the |sourceDataRows| array, with each row
237 // being |pixelWidth| wide.
238 //
239 // The output must have room for |pixelWidth * 4| bytes.
240 template<bool hasAlpha>
241 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues,
242 int filterLength,
243 unsigned char* const* sourceDataRows,
244 int pixelWidth,
245 unsigned char* outRow) {
246 // Output four pixels per iteration (16 bytes).
247 int width = pixelWidth & ~3;
248 __m128i zero = _mm_setzero_si128();
249 for (int outX = 0; outX < width; outX += 4) {
250 // Accumulated result for each pixel. 32 bits per RGBA channel.
251 __m128i accum0 = _mm_setzero_si128();
252 __m128i accum1 = _mm_setzero_si128();
253 __m128i accum2 = _mm_setzero_si128();
254 __m128i accum3 = _mm_setzero_si128();
255
256 // Convolve with one filter coefficient per iteration.
257 for (int filterY = 0; filterY < filterLength; filterY++) {
258
259 // Duplicate the filter coefficient 8 times.
260 // [16] cj cj cj cj cj cj cj cj
261 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
262
263 // Load four pixels (16 bytes) together.
264 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
265 const __m128i* src = reinterpret_cast<const __m128i*>(
266 &sourceDataRows[filterY][outX << 2]);
267 __m128i src8 = _mm_loadu_si128(src);
268
269 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each cha nnels =>
270 // multiply with current coefficient => accumulate the result.
271 // [16] a1 b1 g1 r1 a0 b0 g0 r0
272 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
273 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
274 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
275 // [32] a0 b0 g0 r0
276 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
277 accum0 = _mm_add_epi32(accum0, t);
278 // [32] a1 b1 g1 r1
279 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
280 accum1 = _mm_add_epi32(accum1, t);
281
282 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each cha nnels =>
283 // multiply with current coefficient => accumulate the result.
284 // [16] a3 b3 g3 r3 a2 b2 g2 r2
285 src16 = _mm_unpackhi_epi8(src8, zero);
286 mul_hi = _mm_mulhi_epi16(src16, coeff16);
287 mul_lo = _mm_mullo_epi16(src16, coeff16);
288 // [32] a2 b2 g2 r2
289 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
290 accum2 = _mm_add_epi32(accum2, t);
291 // [32] a3 b3 g3 r3
292 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
293 accum3 = _mm_add_epi32(accum3, t);
294 }
295
296 // Shift right for fixed point implementation.
297 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
298 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
299 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
300 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
301
302 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ).
303 // [16] a1 b1 g1 r1 a0 b0 g0 r0
304 accum0 = _mm_packs_epi32(accum0, accum1);
305 // [16] a3 b3 g3 r3 a2 b2 g2 r2
306 accum2 = _mm_packs_epi32(accum2, accum3);
307
308 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n).
309 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
310 accum0 = _mm_packus_epi16(accum0, accum2);
311
312 if (hasAlpha) {
313 // Compute the max(ri, gi, bi) for each pixel.
314 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
315 __m128i a = _mm_srli_epi32(accum0, 8);
316 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
317 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
318 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
319 a = _mm_srli_epi32(accum0, 16);
320 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
321 b = _mm_max_epu8(a, b); // Max of r and g and b.
322 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
323 b = _mm_slli_epi32(b, 24);
324
325 // Make sure the value of alpha channel is always larger than ma ximum
326 // value of color channels.
327 accum0 = _mm_max_epu8(b, accum0);
328 } else {
329 // Set value of alpha channels to 0xFF.
330 __m128i mask = _mm_set1_epi32(0xff000000);
331 accum0 = _mm_or_si128(accum0, mask);
332 }
333
334 // Store the convolution result (16 bytes) and advance the pixel poi nters.
335 _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0);
336 outRow += 16;
337 }
338
339 // When the width of the output is not divisible by 4, We need to save o ne
340 // pixel (4 bytes) each time. And also the fourth pixel is always absent .
341 int r = pixelWidth & 3;
342 if (r) {
343 __m128i accum0 = _mm_setzero_si128();
344 __m128i accum1 = _mm_setzero_si128();
345 __m128i accum2 = _mm_setzero_si128();
346 for (int filterY = 0; filterY < filterLength; ++filterY) {
347 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
348 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
349 const __m128i* src = reinterpret_cast<const __m128i*>(
350 &sourceDataRows[filterY][width << 2]);
351 __m128i src8 = _mm_loadu_si128(src);
352 // [16] a1 b1 g1 r1 a0 b0 g0 r0
353 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
354 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
355 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
356 // [32] a0 b0 g0 r0
357 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
358 accum0 = _mm_add_epi32(accum0, t);
359 // [32] a1 b1 g1 r1
360 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
361 accum1 = _mm_add_epi32(accum1, t);
362 // [16] a3 b3 g3 r3 a2 b2 g2 r2
363 src16 = _mm_unpackhi_epi8(src8, zero);
364 mul_hi = _mm_mulhi_epi16(src16, coeff16);
365 mul_lo = _mm_mullo_epi16(src16, coeff16);
366 // [32] a2 b2 g2 r2
367 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
368 accum2 = _mm_add_epi32(accum2, t);
369 }
370
371 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
372 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
373 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
374 // [16] a1 b1 g1 r1 a0 b0 g0 r0
375 accum0 = _mm_packs_epi32(accum0, accum1);
376 // [16] a3 b3 g3 r3 a2 b2 g2 r2
377 accum2 = _mm_packs_epi32(accum2, zero);
378 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
379 accum0 = _mm_packus_epi16(accum0, accum2);
380 if (hasAlpha) {
381 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
382 __m128i a = _mm_srli_epi32(accum0, 8);
383 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
384 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
385 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
386 a = _mm_srli_epi32(accum0, 16);
387 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
388 b = _mm_max_epu8(a, b); // Max of r and g and b.
389 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
390 b = _mm_slli_epi32(b, 24);
391 accum0 = _mm_max_epu8(b, accum0);
392 } else {
393 __m128i mask = _mm_set1_epi32(0xff000000);
394 accum0 = _mm_or_si128(accum0, mask);
395 }
396
397 for (int i = 0; i < r; i++) {
398 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0);
399 accum0 = _mm_srli_si128(accum0, 4);
400 outRow += 4;
401 }
402 }
403 }
404
405 #elif defined(SK_ARM_HAS_NEON)
406
// Accumulates the trailing filter taps that do not fill a whole 4-coefficient
// SIMD iteration, using scalar arithmetic, and adds the result to |accum|.
//
//   pixelsLeft   - first byte of the first remaining source pixel; pixels are
//                  read as 4 consecutive bytes each.
//   filterValues - the |r| remaining fixed-point coefficients.
//   accum        - four 32-bit lanes, one per channel in memory order; updated
//                  in place.
//   r            - number of remaining taps (callers in this file pass
//                  |filterLength & 3|, i.e. 1 to 3).
static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues, int32x4_t& accum, int r) {
    int32_t remainder[4] = {0};
    for (int i = 0; i < r; i++) {
        SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
        remainder[0] += coeff * pixelsLeft[i * 4 + 0];
        remainder[1] += coeff * pixelsLeft[i * 4 + 1];
        remainder[2] += coeff * pixelsLeft[i * 4 + 2];
        remainder[3] += coeff * pixelsLeft[i * 4 + 3];
    }
    // Use the NEON intrinsics rather than compiler-specific vector
    // initialisation and operators.
    accum = vaddq_s32(accum, vld1q_s32(remainder));
}
420
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter.
//
// For every output pixel outX in [0, filter.numValues()), the taps returned by
// filter.FilterForValue() are applied to consecutive 4-byte source pixels
// starting at pixel |filterOffset|, all four channels at once. The fixed-point
// sum is shifted right by SkConvolutionFilter1D::kShiftBits, saturated to
// 8 bits per channel and written to |outRow| (4 bytes per output pixel).
// The hasAlpha argument is ignored by this implementation.
void convolve_horizontally(const unsigned char* srcData,
                           const SkConvolutionFilter1D& filter,
                           unsigned char* outRow,
                           bool /*hasAlpha*/) {
    // Byte-index tables for vtbl1_u8: table k replicates the two bytes of
    // coefficient k into all four 16-bit lanes.
    const uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
    const uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
    const uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
    const uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

    // Loop over each pixel on this row in the output image.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        // Get the filter that determines the current output pixel.
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // Compute the first pixel in this row that the filter affects. It will
        // touch |filterLength| pixels (4 bytes each) after this.
        const unsigned char* rowToFilter = &srcData[filterOffset * 4];

        // Apply the filter to the row to get the destination pixel in |accum|.
        int32x4_t accum = vdupq_n_s32(0);
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            // Load 4 coefficients and broadcast each one across a vector.
            const uint8x8_t coeffs = vreinterpret_u8_s16(vld1_s16(filterValues));
            const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
            const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
            const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));
            const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask3));

            // Load four pixels and widen every channel to 16 bits.
            const uint8x16_t pixels = vld1q_u8(rowToFilter);
            const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            // Widening multiply-accumulate: accum += pixel_k * coeff_k.
            accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
            accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
            accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
            accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3);

            // Advance the pointers
            rowToFilter += 16;
            filterValues += 4;
        }

        // Handle the last 1 - 3 coefficients one at a time.
        int r = filterLength & 3;
        if (r) {
            int remainder_offset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData + remainder_offset, filterValues, accum, r);
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of fractional part.
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

        // Pack and store the new pixel: saturate 32 -> 16 (signed) -> 8
        // (unsigned) bits, then write the low 4 bytes.
        int16x4_t accum16 = vqmovn_s32(accum);
        uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
        outRow += 4;
    }
}
496
// Convolves horizontally along four rows. The row data is given in
// |srcData| and continues for the numValues() of the filter.
// The algorithm is almost same as |convolve_horizontally|. Please
// refer to that function for detailed comments.
//
// Each output pixel's coefficients are loaded once and applied to the same
// column range of all four source rows. One 4-byte pixel is written to each of
// outRow[0..3] per iteration, and the four pointers in |outRow| are advanced in
// place. |outRowBytes| is not used by this implementation.
void convolve_4_rows_horizontally(const unsigned char* srcData[4],
                                  const SkConvolutionFilter1D& filter,
                                  unsigned char* outRow[4],
                                  size_t outRowBytes) {
    // Byte-index tables for vtbl1_u8: table k replicates the two bytes of
    // coefficient k into all four 16-bit lanes.
    const uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
    const uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
    const uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
    const uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

    // Output one pixel each iteration, calculating all channels (RGBA) together.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {

        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // four pixels in a column per iteration.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        // Byte offset, within every source row, of the next pixel to filter.
        int start = filterOffset * 4;

        // We will load and accumulate with four coefficients per iteration.
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            const uint8x8_t coeffs = vreinterpret_u8_s16(vld1_s16(filterValues));
            const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
            const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
            const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));
            const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask3));

            uint8x16_t pixels;
            int16x8_t p01_16, p23_16;

            // Applies the four coefficients above to four pixels of one row and
            // adds the four 32-bit-per-channel products into |accum|.
#define ITERATION(src, accum)                                              \
    pixels = vld1q_u8(src);                                                \
    p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));         \
    p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));        \
    accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);                \
    accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);               \
    accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);                \
    accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3)

            ITERATION(srcData[0] + start, accum0);
            ITERATION(srcData[1] + start, accum1);
            ITERATION(srcData[2] + start, accum2);
            ITERATION(srcData[3] + start, accum3);

            // Keep the helper macro from leaking out of this header.
#undef ITERATION

            start += 16;
            filterValues += 4;
        }

        // Handle the last 1 - 3 coefficients one at a time.
        int r = filterLength & 3;
        if (r) {
            int remainder_offset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData[0] + remainder_offset, filterValues, accum0, r);
            AccumRemainder(srcData[1] + remainder_offset, filterValues, accum1, r);
            AccumRemainder(srcData[2] + remainder_offset, filterValues, accum2, r);
            AccumRemainder(srcData[3] + remainder_offset, filterValues, accum3, r);
        }

        int16x4_t accum16;
        uint8x8_t res0, res1, res2, res3;

        // Shift out the fixed-point fraction, then saturate 32 -> 16 (signed)
        // -> 8 (unsigned) bits per channel.
#define PACK_RESULT(accum, res)                                            \
    accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);         \
    accum16 = vqmovn_s32(accum);                                           \
    res = vqmovun_s16(vcombine_s16(accum16, accum16))

        PACK_RESULT(accum0, res0);
        PACK_RESULT(accum1, res1);
        PACK_RESULT(accum2, res2);
        PACK_RESULT(accum3, res3);

#undef PACK_RESULT

        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
        outRow[0] += 4;
        outRow[1] += 4;
        outRow[2] += 4;
        outRow[3] += 4;
    }
}
594
595
// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |sourceDataRows| array, with each row
// being |pixelWidth| wide.
//
// The output must have room for |pixelWidth * 4| bytes.
//
// When |hasAlpha| is true, each output alpha is raised to at least the maximum
// of that pixel's three colour channels; otherwise alpha is forced to 0xFF.
//
// NOTE: when |pixelWidth| is not a multiple of 4, the tail pass still loads a
// full 16 bytes from every source row starting at byte |(pixelWidth & ~3) * 4|,
// so it reads up to 3 pixels past |pixelWidth|. Only the |pixelWidth & 3|
// valid results are stored.
template<bool hasAlpha>
void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                        int filterLength,
                        unsigned char* const* sourceDataRows,
                        int pixelWidth,
                        unsigned char* outRow) {
    int width = pixelWidth & ~3;

    // Output four pixels per iteration (16 bytes).
    for (int outX = 0; outX < width; outX += 4) {

        // Accumulated result for each pixel. 32 bits per RGBA channel.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        // Convolve with one filter coefficient per iteration.
        for (int filterY = 0; filterY < filterLength; filterY++) {

            // Duplicate the filter coefficient 4 times.
            // [16] cj cj cj cj
            int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);

            // Load four pixels (16 bytes) together.
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);

            // Widen every channel to 16 bits.
            int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
            int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

            // Widening multiply-accumulate: accumK += pixelK * coefficient.
            accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
            accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
            accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
            accum3 = vmlal_s16(accum3, vget_high_s16(src16_23), coeff16);
        }

        // Shift right for fixed point implementation.
        accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
        accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);

        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
        // [16] a3 b3 g3 r3 a2 b2 g2 r2
        int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3));

        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));

        if (hasAlpha) {
            // Compute the max(ri, gi, bi) for each pixel.
            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
            uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            uint8x16_t b = vmaxq_u8(a, accum8);  // Max of r and g
            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
            a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            b = vmaxq_u8(a, b);  // Max of r and g and b.
            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
            b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

            // Make sure the value of alpha channel is never smaller than the
            // maximum value of the color channels.
            accum8 = vmaxq_u8(b, accum8);
        } else {
            // Set value of alpha channels to 0xFF.
            accum8 = vreinterpretq_u8_u32(
                vorrq_u32(vreinterpretq_u32_u8(accum8), vdupq_n_u32(0xFF000000)));
        }

        // Store the convolution result (16 bytes) and advance the pixel pointers.
        vst1q_u8(outRow, accum8);
        outRow += 16;
    }

    // Process the leftovers when the width of the output is not divisible
    // by 4, that is at most 3 pixels.
    int r = pixelWidth & 3;
    if (r) {

        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);

        for (int filterY = 0; filterY < filterLength; ++filterY) {
            int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);

            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);

            int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
            int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

            accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
            accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
            accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
        }

        accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);

        int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
        // The fourth pixel is not computed; pixel 2 is duplicated into its slot
        // and that slot is never stored.
        int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2));

        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));

        if (hasAlpha) {
            // Compute the max(ri, gi, bi) for each pixel.
            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
            uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            uint8x16_t b = vmaxq_u8(a, accum8);  // Max of r and g
            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
            a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
            b = vmaxq_u8(a, b);  // Max of r and g and b.
            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
            b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

            // Make sure the value of alpha channel is never smaller than the
            // maximum value of the color channels.
            accum8 = vmaxq_u8(b, accum8);
        } else {
            // Set value of alpha channels to 0xFF.
            accum8 = vreinterpretq_u8_u32(
                vorrq_u32(vreinterpretq_u32_u8(accum8), vdupq_n_u32(0xFF000000)));
        }

        // Store exactly |r| pixels (4 bytes each).
        switch (r) {
        case 1:
            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow),
                           vreinterpretq_u32_u8(accum8), 0);
            break;
        case 2:
            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
                     vreinterpret_u32_u8(vget_low_u8(accum8)));
            break;
        case 3:
            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
                     vreinterpret_u32_u8(vget_low_u8(accum8)));
            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow + 8),
                           vreinterpretq_u32_u8(accum8), 2);
            break;
        }
    }
}
757
758 #else
759
// Converts |a| to an 8-bit unsigned value by clamping it to the range 0-255:
// negative inputs become 0, inputs above 255 become 255, and anything in
// between is returned unchanged.
inline unsigned char ClampTo8(int a) {
    // One unsigned comparison covers both bounds: a negative |a| converts to a
    // very large unsigned value and fails the test, so the common in-range
    // case costs a single branch.
    if (static_cast<unsigned>(a) < 256) {
        return static_cast<unsigned char>(a);
    }
    // Out of range: saturate toward whichever side was exceeded.
    return a < 0 ? 0 : 255;
}
771
772 // Convolves horizontally along a single row. The row data is given in
773 // |srcData| and continues for the numValues() of the filter.
774 template<bool hasAlpha>
775 void ConvolveHorizontally(const unsigned char* srcData,
776 const SkConvolutionFilter1D& filter,
777 unsigned char* outRow) {
778 // Loop over each pixel on this row in the output image.
779 int numValues = filter.numValues();
780 for (int outX = 0; outX < numValues; outX++) {
781 // Get the filter that determines the current output pixel.
782 int filterOffset, filterLength;
783 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
784 filter.FilterForValue(outX, &filterOffset, &filterLength);
785
786 // Compute the first pixel in this row that the filter affects. It w ill
787 // touch |filterLength| pixels (4 bytes each) after this.
788 const unsigned char* rowToFilter = &srcData[filterOffset * 4];
789
790 // Apply the filter to the row to get the destination pixel in |accu m|.
791 int accum[4] = {0};
792 for (int filterX = 0; filterX < filterLength; filterX++) {
793 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues [filterX];
794 accum[0] += curFilter * rowToFilter[filterX * 4 + 0];
795 accum[1] += curFilter * rowToFilter[filterX * 4 + 1];
796 accum[2] += curFilter * rowToFilter[filterX * 4 + 2];
797 if (hasAlpha) {
798 accum[3] += curFilter * rowToFilter[filterX * 4 + 3];
799 }
800 }
801
802 // Bring this value back in range. All of the filter scaling factors
803 // are in fixed point with kShiftBits bits of fractional part.
804 accum[0] >>= SkConvolutionFilter1D::kShiftBits;
805 accum[1] >>= SkConvolutionFilter1D::kShiftBits;
806 accum[2] >>= SkConvolutionFilter1D::kShiftBits;
807 if (hasAlpha) {
808 accum[3] >>= SkConvolutionFilter1D::kShiftBits;
809 }
810
811 // Store the new pixel.
812 outRow[outX * 4 + 0] = ClampTo8(accum[0]);
813 outRow[outX * 4 + 1] = ClampTo8(accum[1]);
814 outRow[outX * 4 + 2] = ClampTo8(accum[2]);
815 if (hasAlpha) {
816 outRow[outX * 4 + 3] = ClampTo8(accum[3]);
817 }
818 }
819 }
820
821 // Does vertical convolution to produce one output row. The filter values an d
822 // length are given in the first two parameters. These are applied to each
823 // of the rows pointed to in the |sourceDataRows| array, with each row
824 // being |pixelWidth| wide.
825 //
826 // The output must have room for |pixelWidth * 4| bytes.
827 template<bool hasAlpha>
828 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues,
829 int filterLength,
830 unsigned char* const* sourceDataRows,
831 int pixelWidth,
832 unsigned char* outRow) {
833 // We go through each column in the output and do a vertical convolution ,
834 // generating one output pixel each time.
835 for (int outX = 0; outX < pixelWidth; outX++) {
836 // Compute the number of bytes over in each row that the current col umn
837 // we're convolving starts at. The pixel will cover the next 4 bytes .
838 int byteOffset = outX * 4;
839
840 // Apply the filter to one column of pixels.
841 int accum[4] = {0};
842 for (int filterY = 0; filterY < filterLength; filterY++) {
843 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues [filterY];
844 accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0];
845 accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1];
846 accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2];
847 if (hasAlpha) {
848 accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3];
849 }
850 }
851
852 // Bring this value back in range. All of the filter scaling factors
853 // are in fixed point with kShiftBits bits of precision.
854 accum[0] >>= SkConvolutionFilter1D::kShiftBits;
855 accum[1] >>= SkConvolutionFilter1D::kShiftBits;
856 accum[2] >>= SkConvolutionFilter1D::kShiftBits;
857 if (hasAlpha) {
858 accum[3] >>= SkConvolutionFilter1D::kShiftBits;
859 }
860
861 // Store the new pixel.
862 outRow[byteOffset + 0] = ClampTo8(accum[0]);
863 outRow[byteOffset + 1] = ClampTo8(accum[1]);
864 outRow[byteOffset + 2] = ClampTo8(accum[2]);
865 if (hasAlpha) {
866 unsigned char alpha = ClampTo8(accum[3]);
867
868 // Make sure the alpha channel doesn't come out smaller than any of the
869 // color channels. We use premultipled alpha channels, so this s hould
870 // never happen, but rounding errors will cause this from time t o time.
871 // These "impossible" colors will cause overflows (and hence ran dom pixel
872 // values) when the resulting bitmap is drawn to the screen.
873 //
874 // We only need to do this when generating the final output row (here).
875 int maxColorChannel = SkTMax(outRow[byteOffset + 0],
876 SkTMax(outRow[byteOffset + 1],
877 outRow[byteOffset + 2]));
878 if (alpha < maxColorChannel) {
879 outRow[byteOffset + 3] = maxColorChannel;
880 } else {
881 outRow[byteOffset + 3] = alpha;
882 }
883 } else {
884 // No alpha channel, the image is opaque.
885 outRow[byteOffset + 3] = 0xff;
886 }
887 }
888 }
889
890 // There's a bug somewhere here with GCC autovectorization (-ftree-vectorize ). We originally
891 // thought this was 32 bit only, but subsequent tests show that some 64 bit gcc compiles
892 // suffer here too.
893 //
894 // Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. http s://bug.skia.org/2575
895 #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE)
896 #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), no inline))
897 #else
898 #define SK_MAYBE_DISABLE_VECTORIZATION
899 #endif
900
901 SK_MAYBE_DISABLE_VECTORIZATION
902 void convolve_horizontally(const unsigned char* srcData,
903 const SkConvolutionFilter1D& filter,
904 unsigned char* outRow,
905 bool hasAlpha) {
906 if (hasAlpha) {
907 ConvolveHorizontally<true>(srcData, filter, outRow);
908 } else {
909 ConvolveHorizontally<false>(srcData, filter, outRow);
910 }
911 }
912 #undef SK_MAYBE_DISABLE_VECTORIZATION
913
914 void (*convolve_4_rows_horizontally)(const unsigned char* srcData[4],
915 const SkConvolutionFilter1D& filter,
916 unsigned char* outRow[4],
917 size_t outRowBytes)
918 = nullptr;
919
920
921 #endif
922
923 void convolve_vertically(const SkConvolutionFilter1D::ConvolutionFixed* filt erValues,
924 int filterLength,
925 unsigned char* const* sourceDataRows,
926 int pixelWidth,
927 unsigned char* outRow,
928 bool hasAlpha) {
929 if (hasAlpha) {
930 ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
931 pixelWidth, outRow);
932 } else {
933 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows ,
934 pixelWidth, outRow);
935 }
936 }
937
938 } // namespace SK_OPTS_NS
939
940 #endif//SkBitmapFilter_opts_DEFINED
OLDNEW
« no previous file with comments | « src/core/SkOpts.cpp ('k') | src/opts/SkBitmapFilter_opts_SSE2.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698