Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 #include "SkBlitRow_opts_SSE4.h" | 1 #include "SkBlitRow_opts_SSE4.h" |
| 2 | 2 |
| 3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. | 3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. |
| 4 // The stubs should never be called, so we make them crash just to confirm that. | 4 // The stubs should never be called, so we make them crash just to confirm that. |
| 5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 | 5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 |
| 6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { | 6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { |
| 7 sk_throw(); | 7 sk_throw(); |
| 8 } | 8 } |
| 9 | 9 |
| 10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { | |
| 11 sk_throw(); | |
| 12 } | |
| 13 | |
| 10 #else | 14 #else |
| 11 | 15 |
| 12 #include <emmintrin.h> // SSE2: Most _mm_foo() in this file. | 16 #include <emmintrin.h> // SSE2: Most _mm_foo() in this file. |
| 13 #include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128. | 17 #include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128. |
| 14 | 18 |
| 15 #include "SkColorPriv.h" | 19 #include "SkColorPriv.h" |
| 16 #include "SkColor_opts_SSE2.h" | 20 #include "SkColor_opts_SSE2.h" |
| 17 | 21 |
| 18 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, | 22 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, |
| 19 const SkPMColor* SK_RESTRICT src, | 23 const SkPMColor* SK_RESTRICT src, |
| (...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 57 // Wrap up the last <= 15 pixels. | 61 // Wrap up the last <= 15 pixels. |
| 58 SkASSERT(count - (count16*16) <= 15); | 62 SkASSERT(count - (count16*16) <= 15); |
| 59 for (int i = count16*16; i < count; i++) { | 63 for (int i = count16*16; i < count; i++) { |
| 60 // This check is not really necessarily, but it prevents pointless autovectorization. | 64 // This check is not really necessarily, but it prevents pointless autovectorization. |
| 61 if (src[i] & 0xFF000000) { | 65 if (src[i] & 0xFF000000) { |
| 62 dst[i] = SkPMSrcOver(src[i], dst[i]); | 66 dst[i] = SkPMSrcOver(src[i], dst[i]); |
| 63 } | 67 } |
| 64 } | 68 } |
| 65 } | 69 } |
| 66 | 70 |
| 71 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { | |
| 72 SkASSERT(count > 0); | |
| 73 // dst must be 2-byte aligned. | |
| 74 SkASSERT((((size_t)dst) & 0x01) == 0); | |
|
mtklein
2015/01/30 18:17:37
Seems paranoid? If we need to keep this, let's wr
henrik.smiding
2015/02/10 15:11:51
I thought so too, but I actually found it in the n
| |
| 75 | |
| 76 uint32_t src_expand = (SkGetPackedG32(src) << 24) | | |
| 77 (SkGetPackedR32(src) << 13) | | |
| 78 (SkGetPackedB32(src) << 2); | |
| 79 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; | |
| 80 | |
| 81 // Check if we have enough pixels to run SIMD, at all | |
| 82 if (count >= 8) { | |
|
mtklein
2015/01/30 18:17:37
This logic seems complex, and I'm worried we're ov
henrik.smiding
2015/02/10 15:11:52
I did this optimization based on reports that it w
| |
| 83 __m128i* dst_wide; | |
| 84 const __m128i src_expand_wide = _mm_set1_epi32(src_expand); | |
| 85 const __m128i scale_wide = _mm_set1_epi32(scale); | |
| 86 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | | |
| 87 SK_B16_MASK_IN_PLACE | | |
| 88 (SK_G16_MASK_IN_PLACE << 16)); | |
| 89 | |
| 90 // Check if we should run the aligned SIMD loop | |
| 91 if ((count >= 64) || ((((size_t)dst) & 0x0F) == 0)) { | |
|
mtklein
2015/01/30 18:17:37
How'd you pick 64?
henrik.smiding
2015/02/10 15:11:52
I measured different widths with skia bench, using
| |
| 92 // Align dst to an even 16 byte address (0-7 pixels) | |
| 93 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { | |
| 94 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; | |
|
mtklein
2015/01/30 18:17:37
You've got a lot of repeated code here. If it nee
henrik.smiding
2015/02/10 15:11:51
I'll re-factor the code a bit. When you're satisfi
| |
| 95 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); | |
| 96 dst += 1; | |
| 97 count--; | |
| 98 } | |
| 99 | |
| 100 dst_wide = reinterpret_cast<__m128i*>(dst); | |
| 101 do { | |
| 102 // Load 8 RGB565 pixels | |
| 103 __m128i pixels = _mm_load_si128(dst_wide); | |
| 104 | |
| 105 // Duplicate and mask | |
| 106 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); | |
| 107 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
| 108 pixels = _mm_unpacklo_epi16(pixels, pixels); | |
| 109 pixels = _mm_and_si128(mask_green, pixels); | |
| 110 | |
| 111 // Scale with alpha | |
| 112 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); | |
|
mtklein
2015/01/30 18:17:37
It doesn't seem like SSE4 is essential to the spee
henrik.smiding
2015/02/10 15:11:51
I added the _mm_hadd instruction at the last minut
| |
| 113 pixels = _mm_mullo_epi32(pixels, scale_wide); | |
| 114 | |
| 115 // Add src_expand_wide and shift down again | |
| 116 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); | |
| 117 pixels_high = _mm_srli_epi32(pixels_high, 5); | |
| 118 pixels = _mm_add_epi32(pixels, src_expand_wide); | |
| 119 pixels = _mm_srli_epi32(pixels, 5); | |
| 120 | |
| 121 // Mask | |
| 122 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
| 123 pixels = _mm_and_si128(mask_green, pixels); | |
| 124 | |
| 125 // Combine into RGB565 and store | |
| 126 pixels = _mm_hadd_epi16(pixels, pixels_high); | |
| 127 _mm_store_si128(dst_wide, pixels); | |
| 128 count -= 8; | |
| 129 dst_wide++; | |
| 130 } while (count >= 8); | |
| 131 } | |
| 132 else { // Unaligned loop to handle medium widths | |
| 133 dst_wide = reinterpret_cast<__m128i*>(dst); | |
| 134 | |
| 135 do { | |
| 136 // Load 8 RGB565 pixels | |
| 137 __m128i pixels = _mm_loadu_si128(dst_wide); | |
| 138 | |
| 139 // Duplicate and mask | |
| 140 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); | |
| 141 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
| 142 pixels = _mm_unpacklo_epi16(pixels, pixels); | |
| 143 pixels = _mm_and_si128(mask_green, pixels); | |
| 144 | |
| 145 // Scale with alpha | |
| 146 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); | |
| 147 pixels = _mm_mullo_epi32(pixels, scale_wide); | |
| 148 | |
| 149 // Add src_expand_wide and shift down again | |
| 150 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); | |
| 151 pixels_high = _mm_srli_epi32(pixels_high, 5); | |
| 152 pixels = _mm_add_epi32(pixels, src_expand_wide); | |
| 153 pixels = _mm_srli_epi32(pixels, 5); | |
| 154 | |
| 155 // Mask | |
| 156 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
| 157 pixels = _mm_and_si128(mask_green, pixels); | |
| 158 | |
| 159 // Combine into RGB565 and store | |
| 160 pixels = _mm_hadd_epi16(pixels, pixels_high); | |
| 161 _mm_storeu_si128(dst_wide, pixels); | |
| 162 count -= 8; | |
| 163 dst_wide++; | |
| 164 } while (count >= 8); | |
| 165 } | |
| 166 | |
| 167 dst = reinterpret_cast<uint16_t*>(dst_wide); | |
| 168 } | |
| 169 | |
| 170 // Small loop to handle remaining 0-7 pixels. | |
| 171 while (count > 0) { | |
| 172 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; | |
| 173 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); | |
| 174 dst += 1; | |
| 175 count--; | |
| 176 } | |
| 177 } | |
| 178 | |
| 67 #endif | 179 #endif |
| OLD | NEW |