OLD | NEW |
1 #include "SkBlitRow_opts_SSE4.h" | 1 #include "SkBlitRow_opts_SSE4.h" |
2 | 2 |
3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub met
hods. | 3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub met
hods. |
4 // The stubs should never be called, so we make them crash just to confirm that. | 4 // The stubs should never be called, so we make them crash just to confirm that. |
5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 | 5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 |
// Stub for S32A_Opaque_BlitRow32_SSE4, compiled only when SK_CPU_SSE_LEVEL is
// below SK_CPU_SSE_LEVEL_SSE41. All four parameters are unnamed and unused; the
// body only calls sk_throw(), since this stub is not expected to be reached.
6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST
RICT, int, U8CPU) { | 6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST
RICT, int, U8CPU) { |
7 sk_throw(); | 7 sk_throw(); |
8 } | 8 } |
9 | 9 |
10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y)
{ | 10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y)
{ |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
59 | 59 |
60 // Wrap up the last <= 15 pixels. | 60 // Wrap up the last <= 15 pixels. |
61 for (int i = count16*16; i < count; i++) { | 61 for (int i = count16*16; i < count; i++) { |
62 // This check is not really necessary, but it prevents pointless autov
ectorization. | 62 // This check is not really necessary, but it prevents pointless autov
ectorization. |
63 if (src[i] & 0xFF000000) { | 63 if (src[i] & 0xFF000000) { |
64 dst[i] = SkPMSrcOver(src[i], dst[i]); | 64 dst[i] = SkPMSrcOver(src[i], dst[i]); |
65 } | 65 } |
66 } | 66 } |
67 } | 67 } |
68 | 68 |
// Blends a single 16-bit destination pixel (OLD column only; the NEW column
// calls SkBlend32A_D565 at the former call sites instead).
//   dst        - current destination pixel value.
//   scale      - multiplier applied to the expanded destination value.
//   src_expand - pre-expanded source term added to the scaled destination.
// Returns SkCompact_rgb_16 of (src_expand + expanded dst * scale) >> 5.
69 static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t s
rc_expand) { | |
70 uint32_t dst_expand = SkExpand_rgb_16(dst) * scale; | |
71 return SkCompact_rgb_16((src_expand + dst_expand) >> 5); | |
72 } | |
73 | |
74 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y)
{ | 69 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y)
{ |
75 SkASSERT(count > 0); | 70 SkASSERT(count > 0); |
76 | 71 |
77 uint32_t src_expand = (SkGetPackedG32(src) << 24) | | 72 uint32_t src_expand = (SkGetPackedG32(src) << 24) | |
78 (SkGetPackedR32(src) << 13) | | 73 (SkGetPackedR32(src) << 13) | |
79 (SkGetPackedB32(src) << 2); | 74 (SkGetPackedB32(src) << 2); |
80 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; | 75 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; |
81 | 76 |
82 // Check if we have enough pixels to run SIMD | 77 // Check if we have enough pixels to run SIMD |
83 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { | 78 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { |
84 __m128i* dst_wide; | 79 __m128i* dst_wide; |
85 const __m128i src_expand_wide = _mm_set1_epi32(src_expand); | 80 const __m128i src_expand_wide = _mm_set1_epi32(src_expand); |
86 const __m128i scale_wide = _mm_set1_epi32(scale); | 81 const __m128i scale_wide = _mm_set1_epi32(scale); |
87 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | | 82 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | |
88 SK_B16_MASK_IN_PLACE | | 83 SK_B16_MASK_IN_PLACE | |
89 (SK_G16_MASK_IN_PLACE << 16)); | 84 (SK_G16_MASK_IN_PLACE << 16)); |
90 | 85 |
91 // Align dst to an even 16 byte address (0-7 pixels) | 86 // Align dst to an even 16 byte address (0-7 pixels) |
92 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { | 87 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { |
93 *dst = Color32A_D565_1x(*dst, scale, src_expand); | 88 *dst = SkBlend32A_D565(*dst, scale, src_expand); |
94 dst += 1; | 89 dst += 1; |
95 count--; | 90 count--; |
96 } | 91 } |
97 | 92 |
98 dst_wide = reinterpret_cast<__m128i*>(dst); | 93 dst_wide = reinterpret_cast<__m128i*>(dst); |
99 do { | 94 do { |
100 // Load 8 RGB565 pixels | 95 // Load 8 RGB565 pixels |
101 __m128i pixels = _mm_load_si128(dst_wide); | 96 __m128i pixels = _mm_load_si128(dst_wide); |
102 | 97 |
103 // Duplicate and mask | 98 // Duplicate and mask |
(...skipping 21 matching lines...) Expand all Loading... |
125 _mm_store_si128(dst_wide, pixels); | 120 _mm_store_si128(dst_wide, pixels); |
126 count -= 8; | 121 count -= 8; |
127 dst_wide++; | 122 dst_wide++; |
128 } while (count >= 8); | 123 } while (count >= 8); |
129 | 124 |
130 dst = reinterpret_cast<uint16_t*>(dst_wide); | 125 dst = reinterpret_cast<uint16_t*>(dst_wide); |
131 } | 126 } |
132 | 127 |
133 // Small loop to handle remaining pixels. | 128 // Small loop to handle remaining pixels. |
134 while (count > 0) { | 129 while (count > 0) { |
135 *dst = Color32A_D565_1x(*dst, scale, src_expand); | 130 *dst = SkBlend32A_D565(*dst, scale, src_expand); |
136 dst += 1; | 131 dst += 1; |
137 count--; | 132 count--; |
138 } | 133 } |
139 } | 134 } |
140 | 135 |
141 #endif | 136 #endif |
OLD | NEW |