 Chromium Code Reviews
 Chromium Code Reviews Issue 1663623002:
  NEON optimizations for GrayAlpha -> RGBA/BGRA Premul/Unpremul  (Closed) 
  Base URL: https://skia.googlesource.com/skia.git@gralpha
    
  
    Issue 1663623002:
  NEON optimizations for GrayAlpha -> RGBA/BGRA Premul/Unpremul  (Closed) 
  Base URL: https://skia.googlesource.com/skia.git@gralpha

| OLD | NEW |
|---|---|
| 1 /* | 1 /* | 
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. | 
| 3 * | 3 * | 
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be | 
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. | 
| 6 */ | 6 */ | 
| 7 | 7 | 
| 8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED | 
| 9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED | 
| 10 | 10 | 
| (...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 91 static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) { | 91 static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) { | 
| 92 const uint8_t* src = (const uint8_t*)vsrc; | 92 const uint8_t* src = (const uint8_t*)vsrc; | 
| 93 for (int i = 0; i < count; i++) { | 93 for (int i = 0; i < count; i++) { | 
| 94 dst[i] = (uint32_t)0xFF << 24 | 94 dst[i] = (uint32_t)0xFF << 24 | 
| 95 | (uint32_t)src[i] << 16 | 95 | (uint32_t)src[i] << 16 | 
| 96 | (uint32_t)src[i] << 8 | 96 | (uint32_t)src[i] << 8 | 
| 97 | (uint32_t)src[i] << 0; | 97 | (uint32_t)src[i] << 0; | 
| 98 } | 98 } | 
| 99 } | 99 } | 
| 100 | 100 | 
| 101 static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) { | |
| 102 const uint8_t* src = (const uint8_t*)vsrc; | |
| 103 for (int i = 0; i < count; i++) { | |
| 104 uint8_t g = src[0], | |
| 105 a = src[1]; | |
| 106 src += 2; | |
| 107 dst[i] = (uint32_t)a << 24 | |
| 108 | (uint32_t)g << 16 | |
| 109 | (uint32_t)g << 8 | |
| 110 | (uint32_t)g << 0; | |
| 111 } | |
| 112 } | |
| 113 | |
| 114 static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) { | |
| 115 const uint8_t* src = (const uint8_t*)vsrc; | |
| 116 for (int i = 0; i < count; i++) { | |
| 117 uint8_t g = src[0], | |
| 118 a = src[1]; | |
| 119 src += 2; | |
| 120 g = (g*a+127)/255; | |
| 121 dst[i] = (uint32_t)a << 24 | |
| 122 | (uint32_t)g << 16 | |
| 123 | (uint32_t)g << 8 | |
| 124 | (uint32_t)g << 0; | |
| 125 } | |
| 126 } | |
| 127 | |
| 101 #if defined(SK_ARM_HAS_NEON) | 128 #if defined(SK_ARM_HAS_NEON) | 
| 102 | 129 | 
| 103 // Rounded divide by 255, (x + 127) / 255 | 130 // Rounded divide by 255, (x + 127) / 255 | 
| 104 static uint8x8_t div255_round(uint16x8_t x) { | 131 static uint8x8_t div255_round(uint16x8_t x) { | 
| 105 // result = (x + 127) / 255 | 132 // result = (x + 127) / 255 | 
| 106 // result = (x + 127) / 256 + error1 | 133 // result = (x + 127) / 256 + error1 | 
| 107 // | 134 // | 
| 108 // error1 = (x + 127) / (255 * 256) | 135 // error1 = (x + 127) / (255 * 256) | 
| 109 // error1 = (x + 127) / (256 * 256) + error2 | 136 // error1 = (x + 127) / (256 * 256) + error2 | 
| 110 // | 137 // | 
| (...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 304 // Store 8 pixels. | 331 // Store 8 pixels. | 
| 305 vst4_u8((uint8_t*) dst, rgba); | 332 vst4_u8((uint8_t*) dst, rgba); | 
| 306 src += 8; | 333 src += 8; | 
| 307 dst += 8; | 334 dst += 8; | 
| 308 count -= 8; | 335 count -= 8; | 
| 309 } | 336 } | 
| 310 | 337 | 
| 311 gray_to_RGB1_portable(dst, src, count); | 338 gray_to_RGB1_portable(dst, src, count); | 
| 312 } | 339 } | 
| 313 | 340 | 
| 341 template <bool kPremul> | |
| 342 static void expand_grayA(uint32_t dst[], const void* vsrc, int count) { | |
| 343 const uint8_t* src = (const uint8_t*) vsrc; | |
| 344 while (count >= 16) { | |
| 345 // Load 16 pixels. | |
| 346 uint8x16x2_t ga = vld2q_u8(src); | |
| **mtklein** (2016/02/03 01:08:15): We sure are getting to use all the vldN / vstN, eh… <br> **msarett** (2016/02/03 14:48:51): Yeah it's fun to have good uses for all the instru… | |
| 347 | |
| 348 // Premultiply if requested. | |
| 349 if (kPremul) { | |
| 350 ga.val[0] = vcombine_u8( | |
| 351 scale(vget_low_u8(ga.val[0]), vget_low_u8(ga.val[1])), | |
| 352 scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1]))); | |
| 353 } | |
| 354 | |
| 355 // Set each of the color channels. | |
| 356 uint8x16x4_t rgba; | |
| 357 rgba.val[0] = ga.val[0]; | |
| 358 rgba.val[1] = ga.val[0]; | |
| 359 rgba.val[2] = ga.val[0]; | |
| 360 rgba.val[3] = ga.val[1]; | |
| 361 | |
| 362 // Store 16 pixels. | |
| 363 vst4q_u8((uint8_t*) dst, rgba); | |
| 364 src += 16*2; | |
| 365 dst += 16; | |
| 366 count -= 16; | |
| 367 } | |
| 368 | |
| 369 if (count >= 8) { | |
| 370 // Load 8 pixels. | |
| 371 uint8x8x2_t ga = vld2_u8(src); | |
| 372 | |
| 373 // Premultiply if requested. | |
| 374 if (kPremul) { | |
| 375 ga.val[0] = scale(ga.val[0], ga.val[1]); | |
| 376 } | |
| 377 | |
| 378 // Set each of the color channels. | |
| 379 uint8x8x4_t rgba; | |
| 380 rgba.val[0] = ga.val[0]; | |
| 381 rgba.val[1] = ga.val[0]; | |
| 382 rgba.val[2] = ga.val[0]; | |
| 383 rgba.val[3] = ga.val[1]; | |
| 384 | |
| 385 // Store 8 pixels. | |
| 386 vst4_u8((uint8_t*) dst, rgba); | |
| 387 src += 8*2; | |
| 388 dst += 8; | |
| 389 count -= 8; | |
| 390 } | |
| 391 | |
| 392 auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable; | |
| 393 proc(dst, src, count); | |
| 394 } | |
| 395 | |
| 396 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { | |
| 397 expand_grayA<false>(dst, src, count); | |
| 398 } | |
| 399 | |
| 400 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { | |
| 401 expand_grayA<true>(dst, src, count); | |
| 402 } | |
| 403 | |
| 314 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 404 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 
| 315 | 405 | 
| 316 template <bool kSwapRB> | 406 template <bool kSwapRB> | 
| 317 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { | 407 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { | 
| 318 auto src = (const uint32_t*)vsrc; | 408 auto src = (const uint32_t*)vsrc; | 
| 319 | 409 | 
| 320 auto premul8 = [](__m128i* lo, __m128i* hi) { | 410 auto premul8 = [](__m128i* lo, __m128i* hi) { | 
| 321 const __m128i zeros = _mm_setzero_si128(); | 411 const __m128i zeros = _mm_setzero_si128(); | 
| 322 const __m128i _128 = _mm_set1_epi16(128); | 412 const __m128i _128 = _mm_set1_epi16(128); | 
| 323 const __m128i _257 = _mm_set1_epi16(257); | 413 const __m128i _257 = _mm_set1_epi16(257); | 
| (...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 475 _mm_storeu_si128((__m128i*) (dst + 12), ggga3); | 565 _mm_storeu_si128((__m128i*) (dst + 12), ggga3); | 
| 476 | 566 | 
| 477 src += 16; | 567 src += 16; | 
| 478 dst += 16; | 568 dst += 16; | 
| 479 count -= 16; | 569 count -= 16; | 
| 480 } | 570 } | 
| 481 | 571 | 
| 482 gray_to_RGB1_portable(dst, src, count); | 572 gray_to_RGB1_portable(dst, src, count); | 
| 483 } | 573 } | 
| 484 | 574 | 
| 575 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { | |
| 576 grayA_to_RGBA_portable(dst, src, count); | |
| 577 } | |
| 578 | |
| 579 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { | |
| 580 grayA_to_rgbA_portable(dst, src, count); | |
| 581 } | |
| 582 | |
| 485 #else | 583 #else | 
| 486 | 584 | 
| 487 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 585 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 
| 488 RGBA_to_rgbA_portable(dst, src, count); | 586 RGBA_to_rgbA_portable(dst, src, count); | 
| 489 } | 587 } | 
| 490 | 588 | 
| 491 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 589 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 
| 492 RGBA_to_bgrA_portable(dst, src, count); | 590 RGBA_to_bgrA_portable(dst, src, count); | 
| 493 } | 591 } | 
| 494 | 592 | 
| 495 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { | 593 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { | 
| 496 RGBA_to_BGRA_portable(dst, src, count); | 594 RGBA_to_BGRA_portable(dst, src, count); | 
| 497 } | 595 } | 
| 498 | 596 | 
| 499 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { | 597 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { | 
| 500 RGB_to_RGB1_portable(dst, src, count); | 598 RGB_to_RGB1_portable(dst, src, count); | 
| 501 } | 599 } | 
| 502 | 600 | 
| 503 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) { | 601 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) { | 
| 504 RGB_to_BGR1_portable(dst, src, count); | 602 RGB_to_BGR1_portable(dst, src, count); | 
| 505 } | 603 } | 
| 506 | 604 | 
| 507 static void gray_to_RGB1(uint32_t dst[], const void* src, int count) { | 605 static void gray_to_RGB1(uint32_t dst[], const void* src, int count) { | 
| 508 gray_to_RGB1_portable(dst, src, count); | 606 gray_to_RGB1_portable(dst, src, count); | 
| 509 } | 607 } | 
| 510 | 608 | 
| 609 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { | |
| 610 grayA_to_RGBA_portable(dst, src, count); | |
| 611 } | |
| 612 | |
| 613 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { | |
| 614 grayA_to_rgbA_portable(dst, src, count); | |
| 615 } | |
| 616 | |
| 511 #endif | 617 #endif | 
| 512 | 618 | 
| 513 } | 619 } | 
| 514 | 620 | 
| 515 #endif // SkSwizzler_opts_DEFINED | 621 #endif // SkSwizzler_opts_DEFINED | 
| OLD | NEW |