OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 Google Inc. | 2 * Copyright 2013 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBitmapProcState.h" | 8 #include "SkBitmapProcState.h" |
9 #include "SkBitmap.h" | 9 #include "SkBitmap.h" |
10 #include "SkColor.h" | 10 #include "SkColor.h" |
11 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
12 #include "SkUnPreMultiply.h" | 12 #include "SkUnPreMultiply.h" |
13 #include "SkShader.h" | 13 #include "SkShader.h" |
| 14 #include "SkConvolver.h" |
14 | 15 |
15 #include "SkBitmapFilter_opts_SSE2.h" | 16 #include "SkBitmapFilter_opts_SSE2.h" |
16 | 17 |
17 #include <emmintrin.h> | 18 #include <emmintrin.h> |
18 | 19 |
19 #if 0 | 20 #if 0 |
// Debug-only helper (it follows the "#if 0" above, so it appears to be compiled out): prints |value| as four ints via printf.
20 static inline void print128i(__m128i value) { | 21 static inline void print128i(__m128i value) { |
21 int *v = (int*) &value; | 22 int *v = (int*) &value; |
22 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); | 23 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); |
23 } | 24 } |
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
173 | 174 |
174 *colors++ = SkPackARGB32(a, r, g, b); | 175 *colors++ = SkPackARGB32(a, r, g, b); |
175 | 176 |
176 x++; | 177 x++; |
177 | 178 |
178 s.fInvProc(s.fInvMatrix, SkIntToScalar(x), | 179 s.fInvProc(s.fInvMatrix, SkIntToScalar(x), |
179 SkIntToScalar(y), &srcPt); | 180 SkIntToScalar(y), &srcPt); |
180 | 181 |
181 } | 182 } |
182 } | 183 } |
| 184 |
| 185 // Convolves horizontally along a single row. For each of the filter.numValues() |
| 186 // outputs, one 4-byte pixel computed from |src_data| is written to |out_row|. The has_alpha flag is unused here. |
| 187 void convolveHorizontally_SSE2(const unsigned char* src_data, |
| 188 const SkConvolutionFilter1D& filter, |
| 189 unsigned char* out_row, |
| 190 bool /*has_alpha*/) { |
| 191 int num_values = filter.numValues(); |
| 192 |
| 193 int filter_offset, filter_length; |
| 194 __m128i zero = _mm_setzero_si128(); |
| 195 __m128i mask[4]; |
| 196 // |mask| will be used to decimate all extra filter coefficients that are |
| 197 // loaded by SIMD when |filter_length| is not divisible by 4. |
| 198 // mask[0] is never read (and is left uninitialized): the masked path below only runs when r is 1, 2 or 3. |
| 199 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
| 200 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
| 201 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |
| 202 |
| 203 // Output one pixel each iteration, calculating all channels (RGBA) together. |
| 204 for (int out_x = 0; out_x < num_values; out_x++) { |
| 205 const SkConvolutionFilter1D::Fixed* filter_values = |
| 206 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
| 207 |
| 208 __m128i accum = _mm_setzero_si128(); |
| 209 |
| 210 // Compute the first pixel in this row that the filter affects. It will |
| 211 // touch |filter_length| pixels (4 bytes each) after this. |
| 212 const __m128i* row_to_filter = |
| 213 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); |
| 214 |
| 215 // We will load and accumulate with four coefficients per iteration. |
| 216 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { |
| 217 |
| 218 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. |
| 219 __m128i coeff, coeff16; |
| 220 // [16] xx xx xx xx c3 c2 c1 c0 |
| 221 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); |
| 222 // [16] xx xx xx xx c1 c1 c0 c0 |
| 223 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 224 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
| 225 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 226 |
| 227 // Load four pixels => unpack the first two pixels to 16 bits => |
| 228 // multiply with coefficients => accumulate the convolution result. |
| 229 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 230 __m128i src8 = _mm_loadu_si128(row_to_filter); |
| 231 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 232 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 233 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 234 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 235 // [32] a0*c0 b0*c0 g0*c0 r0*c0 |
| 236 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 237 accum = _mm_add_epi32(accum, t); |
| 238 // [32] a1*c1 b1*c1 g1*c1 r1*c1 |
| 239 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 240 accum = _mm_add_epi32(accum, t); |
| 241 |
| 242 // Duplicate 3rd and 4th coefficients for all channels => |
| 243 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients |
| 244 // => accumulate the convolution results. |
| 245 // [16] xx xx xx xx c3 c3 c2 c2 |
| 246 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 247 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
| 248 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 249 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 250 src16 = _mm_unpackhi_epi8(src8, zero); |
| 251 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 252 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 253 // [32] a2*c2 b2*c2 g2*c2 r2*c2 |
| 254 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 255 accum = _mm_add_epi32(accum, t); |
| 256 // [32] a3*c3 b3*c3 g3*c3 r3*c3 |
| 257 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 258 accum = _mm_add_epi32(accum, t); |
| 259 |
| 260 // Advance the pixel and coefficients pointers. |
| 261 row_to_filter += 1; |
| 262 filter_values += 4; |
| 263 } |
| 264 |
| 265 // When |filter_length| is not divisible by 4, the extra filter coefficients |
| 266 // picked up by the 8-byte load must be masked to zero. Other than that, |
| 267 // the algorithm is the same as above, except that the 4th pixel is |
| 268 // always absent. |
| 269 int r = filter_length&3; |
| 270 if (r) { |
| 271 // Note: the 8-byte load below can read past the last valid coefficient, so the coefficient storage must be padded (see applySIMDPadding_SSE2). |
| 272 __m128i coeff, coeff16; |
| 273 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); |
| 274 // Mask out extra filter taps. |
| 275 coeff = _mm_and_si128(coeff, mask[r]); |
| 276 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 277 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 278 |
| 279 // Note: line buffer must be padded to align_up(filter_offset, 16). |
| 280 // We resolve this by using the C version for the last horizontal line. |
| 281 __m128i src8 = _mm_loadu_si128(row_to_filter); |
| 282 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 283 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 284 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 285 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 286 accum = _mm_add_epi32(accum, t); |
| 287 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 288 accum = _mm_add_epi32(accum, t); |
| 289 |
| 290 src16 = _mm_unpackhi_epi8(src8, zero); |
| 291 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 292 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 293 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 294 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 295 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 296 accum = _mm_add_epi32(accum, t); |
| 297 } |
| 298 |
| 299 // Shift right for fixed point implementation. |
| 300 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); |
| 301 |
| 302 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
| 303 accum = _mm_packs_epi32(accum, zero); |
| 304 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
| 305 accum = _mm_packus_epi16(accum, zero); |
| 306 |
| 307 // Store the pixel value of 32 bits. |
| 308 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); |
| 309 out_row += 4; |
| 310 } |
| 311 } |
| 312 |
| 313 // Convolves horizontally along four rows. The row data is given in |
| 314 // |src_data| and continues for the numValues() of the filter. |
| 315 // The algorithm is almost the same as |convolveHorizontally_SSE2|. Please |
| 316 // refer to that function for detailed comments. |
| 317 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], |
| 318 const SkConvolutionFilter1D& filter, |
| 319 unsigned char* out_row[4]) { |
| 320 int num_values = filter.numValues(); |
| 321 |
| 322 int filter_offset, filter_length; |
| 323 __m128i zero = _mm_setzero_si128(); |
| 324 __m128i mask[4]; |
| 325 // |mask| will be used to decimate all extra filter coefficients that are |
| 326 // loaded by SIMD when |filter_length| is not divisible by 4. |
| 327 // mask[0] is never read (and is left uninitialized): the masked path below only runs when r is 1, 2 or 3. |
| 328 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
| 329 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
| 330 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |
| 331 |
| 332 // Output one pixel per row (four pixels in total) each iteration, calculating all channels (RGBA) together. |
| 333 for (int out_x = 0; out_x < num_values; out_x++) { |
| 334 const SkConvolutionFilter1D::Fixed* filter_values = |
| 335 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
| 336 |
| 337 // One accumulator per row: four pixels in a column per iteration. |
| 338 __m128i accum0 = _mm_setzero_si128(); |
| 339 __m128i accum1 = _mm_setzero_si128(); |
| 340 __m128i accum2 = _mm_setzero_si128(); |
| 341 __m128i accum3 = _mm_setzero_si128(); |
| 342 int start = (filter_offset<<2); |
| 343 // We will load and accumulate with four coefficients per iteration. |
| 344 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { |
| 345 __m128i coeff, coeff16lo, coeff16hi; |
| 346 // [16] xx xx xx xx c3 c2 c1 c0 |
| 347 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); |
| 348 // [16] xx xx xx xx c1 c1 c0 c0 |
| 349 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 350 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
| 351 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
| 352 // [16] xx xx xx xx c3 c3 c2 c2 |
| 353 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 354 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
| 355 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
| 356 |
| 357 __m128i src8, src16, mul_hi, mul_lo, t; |
| 358 |
| 359 #define ITERATION(src, accum) \ |
| 360 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ |
| 361 src16 = _mm_unpacklo_epi8(src8, zero); \ |
| 362 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ |
| 363 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ |
| 364 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ |
| 365 accum = _mm_add_epi32(accum, t); \ |
| 366 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ |
| 367 accum = _mm_add_epi32(accum, t); \ |
| 368 src16 = _mm_unpackhi_epi8(src8, zero); \ |
| 369 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ |
| 370 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ |
| 371 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ |
| 372 accum = _mm_add_epi32(accum, t); \ |
| 373 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ |
| 374 accum = _mm_add_epi32(accum, t) |
| 375 |
| 376 ITERATION(src_data[0] + start, accum0); |
| 377 ITERATION(src_data[1] + start, accum1); |
| 378 ITERATION(src_data[2] + start, accum2); |
| 379 ITERATION(src_data[3] + start, accum3); |
| 380 |
| 381 start += 16; |
| 382 filter_values += 4; |
| 383 } |
| 384 |
| 385 int r = filter_length & 3; |
| 386 if (r) { |
| 387 // Note: the 8-byte load below can read past the last valid coefficient, so the coefficient storage must be padded (see applySIMDPadding_SSE2). |
| 388 __m128i coeff; |
| 389 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); |
| 390 // Mask out extra filter taps. ITERATION below still multiplies four pixels, but the masked taps are zero and add nothing. |
| 391 coeff = _mm_and_si128(coeff, mask[r]); |
| 392 |
| 393 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 394 /* c1 c1 c1 c1 c0 c0 c0 c0 */ |
| 395 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
| 396 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 397 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
| 398 |
| 399 __m128i src8, src16, mul_hi, mul_lo, t; |
| 400 |
| 401 ITERATION(src_data[0] + start, accum0); |
| 402 ITERATION(src_data[1] + start, accum1); |
| 403 ITERATION(src_data[2] + start, accum2); |
| 404 ITERATION(src_data[3] + start, accum3); |
| 405 } |
| 406 |
| 407 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 408 accum0 = _mm_packs_epi32(accum0, zero); |
| 409 accum0 = _mm_packus_epi16(accum0, zero); |
| 410 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 411 accum1 = _mm_packs_epi32(accum1, zero); |
| 412 accum1 = _mm_packus_epi16(accum1, zero); |
| 413 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 414 accum2 = _mm_packs_epi32(accum2, zero); |
| 415 accum2 = _mm_packus_epi16(accum2, zero); |
| 416 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 417 accum3 = _mm_packs_epi32(accum3, zero); |
| 418 accum3 = _mm_packus_epi16(accum3, zero); |
| 419 |
| 420 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); |
| 421 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); |
| 422 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); |
| 423 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); |
| 424 |
| 425 out_row[0] += 4; |
| 426 out_row[1] += 4; |
| 427 out_row[2] += 4; |
| 428 out_row[3] += 4; |
| 429 } |
| 430 } |
| 431 |
| 432 // Does vertical convolution to produce one output row. The filter values and |
| 433 // length are given in the first two parameters. These are applied to each |
| 434 // of the rows pointed to in the |source_data_rows| array, with each row |
| 435 // being |pixel_width| wide. |
| 436 // If has_alpha is true, each output alpha is raised to at least max(r, g, b); otherwise alpha is forced to 0xFF. |
| 437 // The output must have room for |pixel_width * 4| bytes. |
| 438 template<bool has_alpha> |
| 439 void convolveVertically_SSE2(const SkConvolutionFilter1D::Fixed* filter_values, |
| 440 int filter_length, |
| 441 unsigned char* const* source_data_rows, |
| 442 int pixel_width, |
| 443 unsigned char* out_row) { |
| 444 int width = pixel_width & ~3; |
| 445 |
| 446 __m128i zero = _mm_setzero_si128(); |
| 447 __m128i accum0, accum1, accum2, accum3, coeff16; |
| 448 const __m128i* src; |
| 449 // Output four pixels per iteration (16 bytes). |
| 450 for (int out_x = 0; out_x < width; out_x += 4) { |
| 451 |
| 452 // Accumulated result for each pixel. 32 bits per RGBA channel. |
| 453 accum0 = _mm_setzero_si128(); |
| 454 accum1 = _mm_setzero_si128(); |
| 455 accum2 = _mm_setzero_si128(); |
| 456 accum3 = _mm_setzero_si128(); |
| 457 |
| 458 // Convolve with one filter coefficient per iteration. |
| 459 for (int filter_y = 0; filter_y < filter_length; filter_y++) { |
| 460 |
| 461 // Duplicate the filter coefficient 8 times. |
| 462 // [16] cj cj cj cj cj cj cj cj |
| 463 coeff16 = _mm_set1_epi16(filter_values[filter_y]); |
| 464 |
| 465 // Load four pixels (16 bytes) together. |
| 466 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 467 src = reinterpret_cast<const __m128i*>( |
| 468 &source_data_rows[filter_y][out_x << 2]); |
| 469 __m128i src8 = _mm_loadu_si128(src); |
| 470 |
| 471 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels => |
| 472 // multiply with current coefficient => accumulate the result. |
| 473 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 474 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 475 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 476 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 477 // [32] a0 b0 g0 r0 |
| 478 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 479 accum0 = _mm_add_epi32(accum0, t); |
| 480 // [32] a1 b1 g1 r1 |
| 481 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 482 accum1 = _mm_add_epi32(accum1, t); |
| 483 |
| 484 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels => |
| 485 // multiply with current coefficient => accumulate the result. |
| 486 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 487 src16 = _mm_unpackhi_epi8(src8, zero); |
| 488 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 489 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 490 // [32] a2 b2 g2 r2 |
| 491 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 492 accum2 = _mm_add_epi32(accum2, t); |
| 493 // [32] a3 b3 g3 r3 |
| 494 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 495 accum3 = _mm_add_epi32(accum3, t); |
| 496 } |
| 497 |
| 498 // Shift right for fixed point implementation. |
| 499 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 500 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 501 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 502 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 503 |
| 504 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
| 505 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 506 accum0 = _mm_packs_epi32(accum0, accum1); |
| 507 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 508 accum2 = _mm_packs_epi32(accum2, accum3); |
| 509 |
| 510 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
| 511 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 512 accum0 = _mm_packus_epi16(accum0, accum2); |
| 513 |
| 514 if (has_alpha) { |
| 515 // Compute the max(ri, gi, bi) for each pixel. |
| 516 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 517 __m128i a = _mm_srli_epi32(accum0, 8); |
| 518 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 519 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 520 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 521 a = _mm_srli_epi32(accum0, 16); |
| 522 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 523 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 524 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 525 b = _mm_slli_epi32(b, 24); |
| 526 |
| 527 // Make sure the value of alpha channel is always larger than maximum |
| 528 // value of color channels. |
| 529 accum0 = _mm_max_epu8(b, accum0); |
| 530 } else { |
| 531 // Set value of alpha channels to 0xFF. |
| 532 __m128i mask = _mm_set1_epi32(0xff000000); |
| 533 accum0 = _mm_or_si128(accum0, mask); |
| 534 } |
| 535 |
| 536 // Store the convolution result (16 bytes) and advance the pixel pointers. |
| 537 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); |
| 538 out_row += 16; |
| 539 } |
| 540 |
| 541 // When the width of the output is not divisible by 4, the remaining one to three |
| 542 // pixels are stored one at a time (4 bytes each); the fourth pixel is always absent. NOTE(review): the 16-byte load below reads past those pixels - presumably callers pad each source row; confirm. |
| 543 if (pixel_width & 3) { |
| 544 accum0 = _mm_setzero_si128(); |
| 545 accum1 = _mm_setzero_si128(); |
| 546 accum2 = _mm_setzero_si128(); |
| 547 for (int filter_y = 0; filter_y < filter_length; ++filter_y) { |
| 548 coeff16 = _mm_set1_epi16(filter_values[filter_y]); |
| 549 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 550 src = reinterpret_cast<const __m128i*>( |
| 551 &source_data_rows[filter_y][width<<2]); |
| 552 __m128i src8 = _mm_loadu_si128(src); |
| 553 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 554 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 555 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 556 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 557 // [32] a0 b0 g0 r0 |
| 558 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 559 accum0 = _mm_add_epi32(accum0, t); |
| 560 // [32] a1 b1 g1 r1 |
| 561 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 562 accum1 = _mm_add_epi32(accum1, t); |
| 563 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 564 src16 = _mm_unpackhi_epi8(src8, zero); |
| 565 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 566 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 567 // [32] a2 b2 g2 r2 |
| 568 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 569 accum2 = _mm_add_epi32(accum2, t); |
| 570 } |
| 571 |
| 572 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 573 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 574 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 575 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 576 accum0 = _mm_packs_epi32(accum0, accum1); |
| 577 // [16] 00 00 00 00 a2 b2 g2 r2 |
| 578 accum2 = _mm_packs_epi32(accum2, zero); |
| 579 // [8] 00 00 00 00 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 580 accum0 = _mm_packus_epi16(accum0, accum2); |
| 581 if (has_alpha) { |
| 582 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 583 __m128i a = _mm_srli_epi32(accum0, 8); |
| 584 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 585 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 586 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 587 a = _mm_srli_epi32(accum0, 16); |
| 588 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 589 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 590 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 591 b = _mm_slli_epi32(b, 24); |
| 592 accum0 = _mm_max_epu8(b, accum0); |
| 593 } else { |
| 594 __m128i mask = _mm_set1_epi32(0xff000000); |
| 595 accum0 = _mm_or_si128(accum0, mask); |
| 596 } |
| 597 |
| 598 for (int out_x = width; out_x < pixel_width; out_x++) { |
| 599 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); |
| 600 accum0 = _mm_srli_si128(accum0, 4); |
| 601 out_row += 4; |
| 602 } |
| 603 } |
| 604 } |
| 605 |
// Non-template entry point: forwards all arguments to the
// convolveVertically_SSE2<true> or <false> instantiation, chosen by the
// runtime |has_alpha| flag.
| 606 void convolveVertically_SSE2(const SkConvolutionFilter1D::Fixed* filter_values, |
| 607 int filter_length, |
| 608 unsigned char* const* source_data_rows, |
| 609 int pixel_width, |
| 610 unsigned char* out_row, |
| 611 bool has_alpha) { |
| 612 if (has_alpha) { |
| 613 convolveVertically_SSE2<true>(filter_values, |
| 614 filter_length, |
| 615 source_data_rows, |
| 616 pixel_width, |
| 617 out_row); |
| 618 } else { |
| 619 convolveVertically_SSE2<false>(filter_values, |
| 620 filter_length, |
| 621 source_data_rows, |
| 622 pixel_width, |
| 623 out_row); |
| 624 } |
| 625 } |
| 626 |
| 627 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { |
| 628 // Appends 8 zero-valued dummy coefficients after the coefficients |
| 629 // of the last filter to prevent SIMD instructions which load 8 or 16 bytes |
| 630 // together from accessing invalid memory areas. We are not trying to align the |
| 631 // coefficients right now due to the opaqueness of <vector> implementation. |
| 632 // This has to be done after all |AddFilter| calls. |
| 633 for (int i = 0; i < 8; ++i) { |
| 634 filter->addFilterValue(static_cast<SkConvolutionFilter1D::Fixed>(0)); |
| 635 } |
| 636 } |
OLD | NEW |