OLD | NEW |
---|---|
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include <algorithm> | 5 #include <algorithm> |
6 | 6 |
7 #include "skia/ext/convolver.h" | 7 #include "skia/ext/convolver.h" |
8 #include "third_party/skia/include/core/SkTypes.h" | 8 #include "third_party/skia/include/core/SkTypes.h" |
9 | 9 |
10 #if defined(ARCH_CPU_X86_FAMILY) | |
11 #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h | |
12 #endif | |
13 | |
10 namespace skia { | 14 namespace skia { |
11 | 15 |
12 namespace { | 16 namespace { |
13 | 17 |
14 // Converts the argument to an 8-bit unsigned value by clamping to the range | 18 // Converts the argument to an 8-bit unsigned value by clamping to the range |
15 // 0-255. | 19 // 0-255. |
16 inline unsigned char ClampTo8(int a) { | 20 inline unsigned char ClampTo8(int a) { |
17 if (static_cast<unsigned>(a) < 256) | 21 if (static_cast<unsigned>(a) < 256) |
18 return a; // Avoid the extra check in the common case. | 22 return a; // Avoid the extra check in the common case. |
19 if (a < 0) | 23 if (a < 0) |
(...skipping 172 matching lines...) | |
192 if (has_alpha) | 196 if (has_alpha) |
193 accum[3] >>= ConvolutionFilter1D::kShiftBits; | 197 accum[3] >>= ConvolutionFilter1D::kShiftBits; |
194 | 198 |
195 // Store the new pixel. | 199 // Store the new pixel. |
196 out_row[byte_offset + 0] = ClampTo8(accum[0]); | 200 out_row[byte_offset + 0] = ClampTo8(accum[0]); |
197 out_row[byte_offset + 1] = ClampTo8(accum[1]); | 201 out_row[byte_offset + 1] = ClampTo8(accum[1]); |
198 out_row[byte_offset + 2] = ClampTo8(accum[2]); | 202 out_row[byte_offset + 2] = ClampTo8(accum[2]); |
199 if (has_alpha) { | 203 if (has_alpha) { |
200 unsigned char alpha = ClampTo8(accum[3]); | 204 unsigned char alpha = ClampTo8(accum[3]); |
201 | 205 |
202 // Make sure the alpha channel doesn't come out larger than any of the | 206 // Make sure the alpha channel doesn't come out smaller than any of the |
203 // color channels. We use premultipled alpha channels, so this should | 207 // color channels. We use premultipled alpha channels, so this should |
204 // never happen, but rounding errors will cause this from time to time. | 208 // never happen, but rounding errors will cause this from time to time. |
205 // These "impossible" colors will cause overflows (and hence random pixel | 209 // These "impossible" colors will cause overflows (and hence random pixel |
206 // values) when the resulting bitmap is drawn to the screen. | 210 // values) when the resulting bitmap is drawn to the screen. |
207 // | 211 // |
208 // We only need to do this when generating the final output row (here). | 212 // We only need to do this when generating the final output row (here). |
209 int max_color_channel = std::max(out_row[byte_offset + 0], | 213 int max_color_channel = std::max(out_row[byte_offset + 0], |
210 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2])); | 214 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2])); |
211 if (alpha < max_color_channel) | 215 if (alpha < max_color_channel) |
212 out_row[byte_offset + 3] = max_color_channel; | 216 out_row[byte_offset + 3] = max_color_channel; |
213 else | 217 else |
214 out_row[byte_offset + 3] = alpha; | 218 out_row[byte_offset + 3] = alpha; |
215 } else { | 219 } else { |
216 // No alpha channel, the image is opaque. | 220 // No alpha channel, the image is opaque. |
217 out_row[byte_offset + 3] = 0xff; | 221 out_row[byte_offset + 3] = 0xff; |
218 } | 222 } |
219 } | 223 } |
220 } | 224 } |
221 | 225 |
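For context on the `has_alpha` branch above: the output is premultiplied BGRA, so every color channel should already be at or below the alpha channel, but the fixed-point convolution can round a color channel slightly above alpha, producing "impossible" colors that overflow when composited. A minimal scalar sketch of the same fix-up (hypothetical helper, not part of this change; it assumes the alpha byte sits at offset 3 of the pixel, as in `out_row[byte_offset + 3]`):

```cpp
#include <algorithm>

// Force alpha >= max(color channels) for one premultiplied pixel.
inline void FixupPremultipliedAlpha(unsigned char* pixel) {
  unsigned char max_color =
      std::max(pixel[0], std::max(pixel[1], pixel[2]));
  if (pixel[3] < max_color)
    pixel[3] = max_color;  // same effect as the branch in the code above
}
```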
226 | |
227 // Convolves horizontally along a single row. The row data is given in | |
228 // |src_data| and continues for the num_values() of the filter. | |
229 void ConvolveHorizontally_SSE2(const unsigned char* src_data, | |
230 const ConvolutionFilter1D& filter, | |
231 unsigned char* out_row) { | |
232 #ifdef ARCH_CPU_X86_FAMILY | |
233 int num_values = filter.num_values(); | |
234 | |
235 int filter_offset, filter_length; | |
236 __m128i zero = _mm_setzero_si128(); | |
237 __m128i mask[4]; | |
brettw
2011/02/21 04:45:45
What's mask[0] for? Can you provide a comment for
jiesun
2011/02/22 21:37:03
yes, mask[0] is not used.
| |
238 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | |
239 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | |
240 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | |
241 | |
242 for (int out_x = 0; out_x < num_values; out_x += 1) { | |
243 const ConvolutionFilter1D::Fixed* filter_values = | |
244 filter.FilterForValue(out_x, &filter_offset, &filter_length); | |
245 | |
246 __m128i accum = _mm_setzero_si128(); | |
247 | |
248 const __m128i* row_to_filter = | |
brettw
2011/02/21 04:45:45
Can you comment what this means?
jiesun
2011/02/22 21:37:03
Done.
| |
249 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); | |
250 // Four filter taps per iteration. | |
251 for (int j = 0; j < filter_length >> 2; ++j) { | |
252 __m128i coeff, coeff16; | |
brettw
2011/02/21 04:45:45
For each of the "blocks" of SSE code you're writte
jiesun
2011/02/22 21:37:03
Done.
| |
253 // [16] xx xx xx xx c3 c2 c1 c0 | |
254 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
255 // [16] xx xx xx xx c1 c1 c0 c0 | |
256 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
257 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
258 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
259 | |
260 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
261 __m128i src8 = _mm_loadu_si128(row_to_filter); | |
262 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
263 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
264 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
265 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
266 // [32] a0*c0 b0*c0 g0*c0 r0*c0 | |
267 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
268 accum = _mm_add_epi32(accum, t); | |
269 // [32] a1*c1 b1*c1 g1*c1 r1*c1 | |
270 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
271 accum = _mm_add_epi32(accum, t); | |
272 | |
273 // [16] xx xx xx xx c3 c3 c2 c2 | |
274 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
275 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
276 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
277 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
278 src16 = _mm_unpackhi_epi8(src8, zero); | |
279 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
280 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
281 // [32] a2*c2 b2*c2 g2*c2 r2*c2 | |
282 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
283 accum = _mm_add_epi32(accum, t); | |
284 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | |
285 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
286 accum = _mm_add_epi32(accum, t); | |
287 | |
288 row_to_filter += 1; | |
289 filter_values += 4; | |
290 } | |
291 | |
292 // Handle the remaining 1 to 3 filter taps. | |
brettw
2011/02/21 04:45:45
Can you provide a better comment here?
jiesun
2011/02/22 21:37:03
Done.
| |
293 int r = filter_length&3; | |
294 if (r) { | |
295 // Note: filter_values must be padded to align_up(filter_offset, 8). | |
296 __m128i coeff, coeff16; | |
297 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
298 // Mask out extra filter taps. | |
299 coeff = _mm_and_si128(coeff, mask[r]); | |
300 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
301 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
302 | |
303 // Note: line buffer must be padded to align_up(filter_offset, 16). | |
304 // We resolve this by using the C version for the last horizontal line. | |
305 __m128i src8 = _mm_loadu_si128(row_to_filter); | |
306 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
307 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
308 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
309 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
310 accum = _mm_add_epi32(accum, t); | |
311 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
312 accum = _mm_add_epi32(accum, t); | |
313 | |
314 src16 = _mm_unpackhi_epi8(src8, zero); | |
315 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
316 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
317 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
318 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
319 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
320 accum = _mm_add_epi32(accum, t); | |
321 } | |
322 | |
323 // Shift right to remove the fixed-point fractional bits before saturating. | |
324 accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits); | |
325 accum = _mm_packs_epi32(accum, zero); | |
326 accum = _mm_packus_epi16(accum, zero); | |
327 | |
328 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); | |
329 out_row += 4; | |
330 } | |
331 #endif | |
332 } | |
333 | |
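A scalar restatement of what `ConvolveHorizontally_SSE2` computes may help when reading the intrinsics: each output pixel is the sum of `filter_length` source pixels weighted by 16-bit fixed-point coefficients, accumulated in 32 bits, shifted right by `ConvolutionFilter1D::kShiftBits`, and clamped to 0..255. The SSE2 loop does this for all four channels at once and four taps per iteration. Sketch only; the `Fixed` typedef and the value of `kShiftBits` below are assumptions standing in for the definitions in convolver.h:

```cpp
#include <algorithm>

typedef short Fixed;        // assumed: 16-bit fixed-point coefficient type
const int kShiftBits = 14;  // assumed: number of fractional bits

// Convolve one output pixel of a BGRA row with one filter.
void ConvolveOnePixelScalar(const unsigned char* src_row, int filter_offset,
                            int filter_length, const Fixed* coeffs,
                            unsigned char out[4]) {
  int accum[4] = {0, 0, 0, 0};
  for (int tap = 0; tap < filter_length; ++tap) {
    const unsigned char* px = &src_row[(filter_offset + tap) * 4];
    for (int c = 0; c < 4; ++c)
      accum[c] += coeffs[tap] * px[c];  // 8-bit sample * fixed-point weight
  }
  for (int c = 0; c < 4; ++c) {
    int v = accum[c] >> kShiftBits;     // drop the fractional bits
    out[c] = static_cast<unsigned char>(std::min(255, std::max(0, v)));
  }
}
```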
334 // Convolves horizontally along four rows. The row data is given in | |
335 // |src_data| and continues for the num_values() of the filter. | |
336 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4], | |
337 const ConvolutionFilter1D& filter, | |
338 unsigned char* out_row[4]) { | |
339 #ifdef ARCH_CPU_X86_FAMILY | |
340 int width = filter.num_values(); | |
341 | |
342 int filter_offset, filter_length; | |
343 __m128i zero = _mm_setzero_si128(); | |
344 __m128i mask[4]; | |
345 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | |
346 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | |
347 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | |
348 | |
349 for (int i = 0; i < width; ++i) { | |
350 const ConvolutionFilter1D::Fixed* filter_values = | |
351 filter.FilterForValue(i, &filter_offset, &filter_length); | |
352 | |
353 // Compute a column of four output pixels, one from each row, per iteration. | |
354 __m128i accum0 = _mm_setzero_si128(); | |
355 __m128i accum1 = _mm_setzero_si128(); | |
356 __m128i accum2 = _mm_setzero_si128(); | |
357 __m128i accum3 = _mm_setzero_si128(); | |
358 int start = (filter_offset<<2); | |
359 for (int j = 0; j < (filter_length >> 2); ++j) { | |
360 __m128i coeff, coeff16lo, coeff16hi; | |
361 // [16] xx xx xx xx c3 c2 c1 c0 | |
362 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
363 // [16] xx xx xx xx c1 c1 c0 c0 | |
364 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
365 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
366 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
367 // [16] xx xx xx xx c3 c3 c2 c2 | |
368 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
369 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
370 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
371 | |
372 __m128i src8, src16, mul_hi, mul_lo, t; | |
373 | |
374 #define ITERATION(src, accum) \ | |
375 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ | |
376 src16 = _mm_unpacklo_epi8(src8, zero); \ | |
377 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ | |
378 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ | |
379 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
380 accum = _mm_add_epi32(accum, t); \ | |
381 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
382 accum = _mm_add_epi32(accum, t); \ | |
383 src16 = _mm_unpackhi_epi8(src8, zero); \ | |
384 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ | |
385 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ | |
386 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
387 accum = _mm_add_epi32(accum, t); \ | |
388 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
389 accum = _mm_add_epi32(accum, t) | |
390 | |
391 ITERATION(src_data[0]+start, accum0); | |
392 ITERATION(src_data[1]+start, accum1); | |
393 ITERATION(src_data[2]+start, accum2); | |
394 ITERATION(src_data[3]+start, accum3); | |
395 | |
396 start += 16; | |
397 filter_values += 4; | |
398 } | |
399 | |
400 int r = filter_length&3; | |
401 if (r) { | |
402 // Note: filter_values must be padded to align_up(filter_offset, 8); | |
403 __m128i coeff; | |
404 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
405 // Mask out extra filter taps. | |
406 coeff = _mm_and_si128(coeff, mask[r]); | |
407 | |
408 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
409 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
410 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
411 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
412 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
413 | |
414 __m128i src8, src16, mul_hi, mul_lo, t; | |
415 | |
416 ITERATION(src_data[0]+start, accum0); | |
417 ITERATION(src_data[1]+start, accum1); | |
418 ITERATION(src_data[2]+start, accum2); | |
419 ITERATION(src_data[3]+start, accum3); | |
420 } | |
421 | |
422 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); | |
423 accum0 = _mm_packs_epi32(accum0, zero); | |
424 accum0 = _mm_packus_epi16(accum0, zero); | |
425 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); | |
426 accum1 = _mm_packs_epi32(accum1, zero); | |
427 accum1 = _mm_packus_epi16(accum1, zero); | |
428 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); | |
429 accum2 = _mm_packs_epi32(accum2, zero); | |
430 accum2 = _mm_packus_epi16(accum2, zero); | |
431 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits); | |
432 accum3 = _mm_packs_epi32(accum3, zero); | |
433 accum3 = _mm_packus_epi16(accum3, zero); | |
434 | |
435 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); | |
436 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); | |
437 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); | |
438 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); | |
439 | |
440 out_row[0] += 4; | |
441 out_row[1] += 4; | |
442 out_row[2] += 4; | |
443 out_row[3] += 4; | |
444 } | |
445 #endif | |
446 } | |
447 | |
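`ConvolveHorizontally4_SSE2` uses the same per-pixel math as the single-row version but convolves the same x position on four consecutive rows, so each coefficient load and shuffle is amortized over four outputs. The `mask[r]` table exists because the coefficient load always fetches four 16-bit taps; when only 1 to 3 taps remain, the extra ones must contribute nothing. A scalar sketch of that masking step (hypothetical helper, illustrative only):

```cpp
// Zero the coefficients past the last real tap, mirroring
// _mm_and_si128(coeff, mask[remaining]) in the code above.
void MaskTailTaps(short coeff4[4], int remaining /* 1..3 */) {
  for (int i = remaining; i < 4; ++i)
    coeff4[i] = 0;
}
```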
448 template<bool has_alpha> | |
449 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values, | |
450 int filter_length, | |
brettw
2011/02/21 04:45:45
Check indentation
jiesun
2011/02/22 21:37:03
Done.
| |
451 unsigned char* const* source_data_rows, | |
452 int pixel_width, | |
453 unsigned char* out_row) { | |
454 #ifdef ARCH_CPU_X86_FAMILY | |
455 int width = pixel_width & ~3; | |
456 | |
457 __m128i zero = _mm_setzero_si128(); | |
458 __m128i accum0, accum1, accum2, accum3, coeff16; | |
459 const __m128i* src; | |
460 for (int i = 0; i < width; i += 4) { // Four pixels per iteration. | |
461 accum0 = _mm_setzero_si128(); | |
462 accum1 = _mm_setzero_si128(); | |
463 accum2 = _mm_setzero_si128(); | |
464 accum3 = _mm_setzero_si128(); | |
465 for (int j = 0; j < filter_length; ++j) { | |
466 coeff16 = _mm_set1_epi16(filter_values[j]); | |
467 | |
468 // row_buffer is 16-byte aligned, so an aligned load could be used here. | |
469 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
470 src = reinterpret_cast<const __m128i*>(&source_data_rows[j][i<<2]); | |
471 __m128i src8 = _mm_loadu_si128(src); | |
472 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
473 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
474 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
475 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
476 // [32] a0 b0 g0 r0 | |
477 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
478 accum0 = _mm_add_epi32(accum0, t); | |
479 // [32] a1 b1 g1 r1 | |
480 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
481 accum1 = _mm_add_epi32(accum1, t); | |
482 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
483 src16 = _mm_unpackhi_epi8(src8, zero); | |
484 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
485 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
486 // [32] a2 b2 g2 r2 | |
487 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
488 accum2 = _mm_add_epi32(accum2, t); | |
489 // [32] a3 b3 g3 r3 | |
490 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
491 accum3 = _mm_add_epi32(accum3, t); | |
492 } | |
493 | |
494 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); | |
495 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); | |
496 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); | |
497 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits); | |
498 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
499 accum0 = _mm_packs_epi32(accum0, accum1); | |
500 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
501 accum2 = _mm_packs_epi32(accum2, accum3); | |
502 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
503 accum0 = _mm_packus_epi16(accum0, accum2); | |
504 if (has_alpha) { | |
505 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
506 __m128i a = _mm_srli_epi32(accum0, 8); | |
507 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
508 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
509 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
510 a = _mm_srli_epi32(accum0, 16); | |
511 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
512 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
513 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
514 b = _mm_slli_epi32(b, 24); | |
515 accum0 = _mm_max_epu8(b, accum0); | |
516 } else { | |
517 __m128i mask = _mm_set1_epi32(0xff000000); | |
518 accum0 = _mm_or_si128(accum0, mask); | |
519 } | |
520 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); | |
521 out_row += 16; | |
522 } | |
523 | |
524 if (pixel_width & 3) { | |
525 accum0 = _mm_setzero_si128(); | |
526 accum1 = _mm_setzero_si128(); | |
527 accum2 = _mm_setzero_si128(); | |
528 for (int j = 0; j < filter_length; ++j) { | |
529 coeff16 = _mm_set1_epi16(filter_values[j]); | |
530 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
531 src = reinterpret_cast<const __m128i*>(&source_data_rows[j][width<<2]); | |
532 __m128i src8 = _mm_loadu_si128(src); | |
533 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
534 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
535 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
536 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
537 // [32] a0 b0 g0 r0 | |
538 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
539 accum0 = _mm_add_epi32(accum0, t); | |
540 // [32] a1 b1 g1 r1 | |
541 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
542 accum1 = _mm_add_epi32(accum1, t); | |
543 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
544 src16 = _mm_unpackhi_epi8(src8, zero); | |
545 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
546 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
547 // [32] a2 b2 g2 r2 | |
548 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
549 accum2 = _mm_add_epi32(accum2, t); | |
550 } | |
551 | |
552 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); | |
553 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); | |
554 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); | |
555 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
556 accum0 = _mm_packs_epi32(accum0, accum1); | |
557 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
558 accum2 = _mm_packs_epi32(accum2, zero); | |
559 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
560 accum0 = _mm_packus_epi16(accum0, accum2); | |
561 if (has_alpha) { | |
562 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
563 __m128i a = _mm_srli_epi32(accum0, 8); | |
564 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
565 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
566 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
567 a = _mm_srli_epi32(accum0, 16); | |
568 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
569 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
570 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
571 b = _mm_slli_epi32(b, 24); | |
572 accum0 = _mm_max_epu8(b, accum0); | |
573 } else { | |
574 __m128i mask = _mm_set1_epi32(0xff000000); | |
575 accum0 = _mm_or_si128(accum0, mask); | |
576 } | |
577 | |
578 for (int i = width; i < pixel_width; ++i) { | |
579 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); | |
580 accum0 = _mm_srli_si128(accum0, 4); | |
581 out_row += 4; | |
582 } | |
583 } | |
584 #endif | |
585 } | |
586 | |
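In the vertical pass the premultiplied-alpha fix-up is done in SIMD: after packing back to bytes, two per-lane right shifts plus byte-wise maxes leave max(r, g, b) in the low byte of each pixel, a left shift by 24 moves it into the alpha lane, and a final byte-wise max makes alpha at least that value. A single-pixel scalar model of the same bit manipulation (hypothetical sketch; the alpha byte is assumed to be the high byte of the packed 32-bit pixel):

```cpp
#include <cstdint>

// Per-byte unsigned max of two packed 32-bit values, like _mm_max_epu8.
static uint32_t ByteWiseMax(uint32_t x, uint32_t y) {
  uint32_t out = 0;
  for (int shift = 0; shift < 32; shift += 8) {
    uint32_t xb = (x >> shift) & 0xff;
    uint32_t yb = (y >> shift) & 0xff;
    out |= (xb > yb ? xb : yb) << shift;
  }
  return out;
}

uint32_t ClampAlphaToMaxColor(uint32_t pixel) {
  uint32_t m = ByteWiseMax(pixel >> 8, pixel);  // low byte: max of two channels
  m = ByteWiseMax(pixel >> 16, m);              // low byte: max of r, g, b
  m <<= 24;                                     // move the max into the alpha lane
  return ByteWiseMax(m, pixel);                 // alpha = max(alpha, r, g, b)
}
```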
222 } // namespace | 587 } // namespace |
223 | 588 |
224 // ConvolutionFilter1D --------------------------------------------------------- | 589 // ConvolutionFilter1D --------------------------------------------------------- |
225 | 590 |
226 ConvolutionFilter1D::ConvolutionFilter1D() | 591 ConvolutionFilter1D::ConvolutionFilter1D() |
227 : max_filter_(0) { | 592 : max_filter_(0) { |
228 } | 593 } |
229 | 594 |
230 ConvolutionFilter1D::~ConvolutionFilter1D() { | 595 ConvolutionFilter1D::~ConvolutionFilter1D() { |
231 } | 596 } |
(...skipping 45 matching lines...) | |
277 // We pushed filter_length elements onto filter_values_ | 642 // We pushed filter_length elements onto filter_values_ |
278 instance.data_location = (static_cast<int>(filter_values_.size()) - | 643 instance.data_location = (static_cast<int>(filter_values_.size()) - |
279 filter_length); | 644 filter_length); |
280 instance.offset = filter_offset; | 645 instance.offset = filter_offset; |
281 instance.length = filter_length; | 646 instance.length = filter_length; |
282 filters_.push_back(instance); | 647 filters_.push_back(instance); |
283 | 648 |
284 max_filter_ = std::max(max_filter_, filter_length); | 649 max_filter_ = std::max(max_filter_, filter_length); |
285 } | 650 } |
286 | 651 |
287 // BGRAConvolve2D ------------------------------------------------------------- | 652 void BGRAConvolve2D_C(const unsigned char* source_data, |
288 | 653 int source_byte_row_stride, |
289 void BGRAConvolve2D(const unsigned char* source_data, | 654 bool source_has_alpha, |
290 int source_byte_row_stride, | 655 const ConvolutionFilter1D& filter_x, |
291 bool source_has_alpha, | 656 const ConvolutionFilter1D& filter_y, |
292 const ConvolutionFilter1D& filter_x, | 657 int output_byte_row_stride, |
293 const ConvolutionFilter1D& filter_y, | 658 unsigned char* output) { |
294 int output_byte_row_stride, | |
295 unsigned char* output) { | |
296 int max_y_filter_size = filter_y.max_filter(); | 659 int max_y_filter_size = filter_y.max_filter(); |
297 | 660 |
298 // The next row in the input that we will generate a horizontally | 661 // The next row in the input that we will generate a horizontally |
299 // convolved row for. If the filter doesn't start at the beginning of the | 662 // convolved row for. If the filter doesn't start at the beginning of the |
300 // image (this is the case when we are only resizing a subset), then we | 663 // image (this is the case when we are only resizing a subset), then we |
301 // don't want to generate any output rows before that. Compute the starting | 664 // don't want to generate any output rows before that. Compute the starting |
302 // row for convolution as the first pixel for the first vertical filter. | 665 // row for convolution as the first pixel for the first vertical filter. |
303 int filter_offset, filter_length; | 666 int filter_offset, filter_length; |
304 const ConvolutionFilter1D::Fixed* filter_values = | 667 const ConvolutionFilter1D::Fixed* filter_values = |
305 filter_y.FilterForValue(0, &filter_offset, &filter_length); | 668 filter_y.FilterForValue(0, &filter_offset, &filter_length); |
306 int next_x_row = filter_offset; | 669 int next_x_row = filter_offset; |
307 | 670 |
308 // We loop over each row in the input doing a horizontal convolution. This | 671 // We loop over each row in the input doing a horizontal convolution. This |
309 // will result in a horizontally convolved image. We write the results into | 672 // will result in a horizontally convolved image. We write the results into |
310 // a circular buffer of convolved rows and do vertical convolution as rows | 673 // a circular buffer of convolved rows and do vertical convolution as rows |
311 // are available. This prevents us from having to store the entire | 674 // are available. This prevents us from having to store the entire |
312 // intermediate image and helps cache coherency. | 675 // intermediate image and helps cache coherency. |
313 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size, | 676 CircularRowBuffer row_buffer(filter_x.num_values(), max_y_filter_size, |
314 filter_offset); | 677 filter_offset); |
315 | 678 |
316 // Loop over every possible output row, processing just enough horizontal | 679 // Loop over every possible output row, processing just enough horizontal |
317 // convolutions to run each subsequent vertical convolution. | 680 // convolutions to run each subsequent vertical convolution. |
318 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); | 681 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); |
319 int num_output_rows = filter_y.num_values(); | 682 int num_output_rows = filter_y.num_values(); |
683 | |
684 int last_filter_offset, last_filter_length; | |
685 filter_y.FilterForValue(num_output_rows-1, &last_filter_offset, | |
686 &last_filter_length); | |
687 | |
320 for (int out_y = 0; out_y < num_output_rows; out_y++) { | 688 for (int out_y = 0; out_y < num_output_rows; out_y++) { |
321 filter_values = filter_y.FilterForValue(out_y, | 689 filter_values = filter_y.FilterForValue(out_y, |
322 &filter_offset, &filter_length); | 690 &filter_offset, &filter_length); |
323 | 691 |
324 // Generate output rows until we have enough to run the current filter. | 692 // Generate output rows until we have enough to run the current filter. |
325 while (next_x_row < filter_offset + filter_length) { | 693 while (next_x_row < filter_offset + filter_length) { |
326 if (source_has_alpha) { | 694 if (source_has_alpha) { |
327 ConvolveHorizontally<true>( | 695 ConvolveHorizontally<true>( |
328 &source_data[next_x_row * source_byte_row_stride], | 696 &source_data[next_x_row * source_byte_row_stride], |
329 filter_x, row_buffer.AdvanceRow()); | 697 filter_x, row_buffer.AdvanceRow()); |
(...skipping 17 matching lines...) | |
347 // needs. | 715 // needs. |
348 unsigned char* const* first_row_for_filter = | 716 unsigned char* const* first_row_for_filter = |
349 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; | 717 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; |
350 | 718 |
351 if (source_has_alpha) { | 719 if (source_has_alpha) { |
352 ConvolveVertically<true>(filter_values, filter_length, | 720 ConvolveVertically<true>(filter_values, filter_length, |
353 first_row_for_filter, | 721 first_row_for_filter, |
354 filter_x.num_values(), cur_output_row); | 722 filter_x.num_values(), cur_output_row); |
355 } else { | 723 } else { |
356 ConvolveVertically<false>(filter_values, filter_length, | 724 ConvolveVertically<false>(filter_values, filter_length, |
357 first_row_for_filter, | 725 first_row_for_filter, |
358 filter_x.num_values(), cur_output_row); | 726 filter_x.num_values(), cur_output_row); |
359 } | 727 } |
360 } | 728 } |
361 } | 729 } |
362 | 730 |
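The driver keeps only as many horizontally convolved rows as the tallest vertical filter needs, cycling through a fixed block of storage instead of materializing the whole intermediate image. A minimal sketch of that idea (hypothetical simplification; the real `CircularRowBuffer` additionally remembers which absolute image rows it holds so `GetRowAddresses()` can hand them back in order):

```cpp
#include <vector>

class RowRing {
 public:
  RowRing(int row_bytes, int window)
      : row_bytes_(row_bytes), window_(window), next_(0),
        storage_(row_bytes * window) {}

  // Returns storage for the next row to fill, recycling the oldest row.
  unsigned char* AdvanceRow() {
    unsigned char* row = &storage_[next_ * row_bytes_];
    next_ = (next_ + 1) % window_;
    return row;
  }

 private:
  int row_bytes_;
  int window_;
  int next_;
  std::vector<unsigned char> storage_;
};
```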
731 // BGRAConvolve2D ------------------------------------------------------------- | |
732 | |
733 void BGRAConvolve2D_SSE2(const unsigned char* source_data, | |
734 int source_byte_row_stride, | |
735 bool source_has_alpha, | |
736 const ConvolutionFilter1D& filter_x, | |
737 const ConvolutionFilter1D& filter_y, | |
738 int output_byte_row_stride, | |
739 unsigned char* output) { | |
740 int max_y_filter_size = filter_y.max_filter(); | |
741 | |
742 // The next row in the input that we will generate a horizontally | |
743 // convolved row for. If the filter doesn't start at the beginning of the | |
744 // image (this is the case when we are only resizing a subset), then we | |
745 // don't want to generate any output rows before that. Compute the starting | |
746 // row for convolution as the first pixel for the first vertical filter. | |
747 int filter_offset, filter_length; | |
748 const ConvolutionFilter1D::Fixed* filter_values = | |
749 filter_y.FilterForValue(0, &filter_offset, &filter_length); | |
750 int next_x_row = filter_offset; | |
751 | |
752 // We loop over each row in the input doing a horizontal convolution. This | |
753 // will result in a horizontally convolved image. We write the results into | |
754 // a circular buffer of convolved rows and do vertical convolution as rows | |
755 // are available. This prevents us from having to store the entire | |
756 // intermediate image and helps cache coherency. | |
757 // We will need four extra rows so that the horizontal convolution can be | |
758 // done on four rows simultaneously. | |
759 int row_buffer_width = (filter_x.num_values() + 15) & ~0xF; | |
760 int row_buffer_height = max_y_filter_size + 4; | |
761 CircularRowBuffer row_buffer(row_buffer_width, | |
762 row_buffer_height, | |
763 filter_offset); | |
764 | |
765 // Loop over every possible output row, processing just enough horizontal | |
766 // convolutions to run each subsequent vertical convolution. | |
767 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); | |
768 int num_output_rows = filter_y.num_values(); | |
769 | |
770 int last_filter_offset, last_filter_length; | |
771 filter_y.FilterForValue(num_output_rows-1, &last_filter_offset, | |
772 &last_filter_length); | |
773 | |
774 for (int out_y = 0; out_y < num_output_rows; out_y++) { | |
775 filter_values = filter_y.FilterForValue(out_y, | |
776 &filter_offset, &filter_length); | |
777 | |
778 // Generate output rows until we have enough to run the current filter. | |
779 while (next_x_row < filter_offset + filter_length) { | |
780 if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) { | |
781 const unsigned char* src[4]; | |
782 unsigned char* out_row[4]; | |
783 for (int i = 0; i < 4; ++i) { | |
784 src[i] = &source_data[(next_x_row+i) * source_byte_row_stride]; | |
785 out_row[i] = row_buffer.AdvanceRow(); | |
786 } | |
787 ConvolveHorizontally4_SSE2(src, filter_x, out_row); | |
788 next_x_row+=4; | |
789 } else { | |
790 // For the last row, the SSE2 loads may read data beyond the image | |
791 // area, so we use the C version here instead. Hacking line padding | |
792 // into Skia is not something we want to do. | |
793 if (next_x_row == last_filter_offset + last_filter_length - 1) { | |
794 if (source_has_alpha) | |
795 ConvolveHorizontally<true>( | |
796 &source_data[next_x_row * source_byte_row_stride], | |
797 filter_x, row_buffer.AdvanceRow()); | |
798 else | |
799 ConvolveHorizontally<false>( | |
800 &source_data[next_x_row * source_byte_row_stride], | |
801 filter_x, row_buffer.AdvanceRow()); | |
802 } else { | |
803 ConvolveHorizontally_SSE2( | |
804 &source_data[next_x_row * source_byte_row_stride], | |
805 filter_x, row_buffer.AdvanceRow()); | |
806 } | |
807 next_x_row++; | |
808 } | |
809 } | |
810 | |
811 // Compute where in the output image this row of final data will go. | |
812 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride]; | |
813 | |
814 // Get the list of rows that the circular buffer has, in order. | |
815 int first_row_in_circular_buffer; | |
816 unsigned char* const* rows_to_convolve = | |
817 row_buffer.GetRowAddresses(&first_row_in_circular_buffer); | |
818 | |
819 // Now compute the start of the subset of those rows that the filter | |
820 // needs. | |
821 unsigned char* const* first_row_for_filter = | |
822 &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; | |
823 | |
824 if (source_has_alpha) { | |
825 ConvolveVertically_SSE2<true>(filter_values, filter_length, | |
826 first_row_for_filter, | |
827 filter_x.num_values(), cur_output_row); | |
828 } else { | |
829 ConvolveVertically_SSE2<false>(filter_values, filter_length, | |
830 first_row_for_filter, | |
831 filter_x.num_values(), cur_output_row); | |
832 } | |
833 } | |
834 } | |
835 | |
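The SSE2 driver above sizes its row buffer as `(filter_x.num_values() + 15) & ~0xF` pixels wide plus four extra rows, presumably so that 16-byte loads and the four-rows-at-a-time horizontal pass never touch memory outside the buffer. A quick check of the align-up expression (`AlignUp16` is a hypothetical helper for illustration):

```cpp
#include <cassert>

int AlignUp16(int n) { return (n + 15) & ~0xF; }  // round up to a multiple of 16

void AlignUp16Examples() {
  assert(AlignUp16(1) == 16);
  assert(AlignUp16(16) == 16);
  assert(AlignUp16(17) == 32);
}
```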
836 void BGRAConvolve2D(const unsigned char* source_data, | |
837 int source_byte_row_stride, | |
838 bool source_has_alpha, | |
839 const ConvolutionFilter1D& filter_x, | |
840 const ConvolutionFilter1D& filter_y, | |
841 int output_byte_row_stride, | |
842 unsigned char* output) { | |
843 base::CPU cpu; | |
844 if (cpu.has_sse2()) { | |
845 BGRAConvolve2D_SSE2(source_data, source_byte_row_stride, source_has_alpha, | |
846 filter_x, filter_y, output_byte_row_stride, output); | |
847 } else { | |
848 BGRAConvolve2D_C(source_data, source_byte_row_stride, source_has_alpha, | |
849 filter_x, filter_y, output_byte_row_stride, output); | |
850 } | |
851 } | |
852 | |
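The new `BGRAConvolve2D` wrapper constructs a `base::CPU` and queries `has_sse2()` on every call. If that ever matters for performance, one option is to resolve the choice once through a function pointer. Sketch only, under the assumptions that `base::CPU::has_sse2()` comes from base/cpu.h (as used above) and that a cached dispatch would be acceptable under Chromium's static-initializer rules:

```cpp
#include "base/cpu.h"

typedef void (*Convolve2DProc)(const unsigned char* source_data,
                               int source_byte_row_stride,
                               bool source_has_alpha,
                               const ConvolutionFilter1D& filter_x,
                               const ConvolutionFilter1D& filter_y,
                               int output_byte_row_stride,
                               unsigned char* output);

Convolve2DProc ChooseConvolve2DProc() {
  base::CPU cpu;
  return cpu.has_sse2() ? &BGRAConvolve2D_SSE2 : &BGRAConvolve2D_C;
}

// Usage (resolve once, reuse afterwards):
//   static const Convolve2DProc kProc = ChooseConvolve2DProc();
//   kProc(source_data, source_byte_row_stride, source_has_alpha,
//         filter_x, filter_y, output_byte_row_stride, output);
```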
363 } // namespace skia | 853 } // namespace skia |