Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(94)

Side by Side Diff: src/opts/SkBlitRow_opts_SSE4.cpp

Issue 892623002: Add SSE optimization of Color32A_D565 (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 #include "SkBlitRow_opts_SSE4.h" 1 #include "SkBlitRow_opts_SSE4.h"
2 2
3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. 3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods.
4 // The stubs should never be called, so we make them crash just to confirm that. 4 // The stubs should never be called, so we make them crash just to confirm that.
5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41
6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { 6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {
7 sk_throw(); 7 sk_throw();
8 } 8 }
9 9
10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
11 sk_throw();
12 }
13
10 #else 14 #else
11 15
12 #include <emmintrin.h> // SSE2: Most _mm_foo() in this file. 16 #include <emmintrin.h> // SSE2: Most _mm_foo() in this file.
13 #include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128. 17 #include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128.
14 18
15 #include "SkColorPriv.h" 19 #include "SkColorPriv.h"
16 #include "SkColor_opts_SSE2.h" 20 #include "SkColor_opts_SSE2.h"
17 21
18 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, 22 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
19 const SkPMColor* SK_RESTRICT src, 23 const SkPMColor* SK_RESTRICT src,
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
57 // Wrap up the last <= 15 pixels. 61 // Wrap up the last <= 15 pixels.
58 SkASSERT(count - (count16*16) <= 15); 62 SkASSERT(count - (count16*16) <= 15);
59 for (int i = count16*16; i < count; i++) { 63 for (int i = count16*16; i < count; i++) {
60 // This check is not really necessarily, but it prevents pointless autovectorization. 64 // This check is not really necessarily, but it prevents pointless autovectorization.
61 if (src[i] & 0xFF000000) { 65 if (src[i] & 0xFF000000) {
62 dst[i] = SkPMSrcOver(src[i], dst[i]); 66 dst[i] = SkPMSrcOver(src[i], dst[i]);
63 } 67 }
64 } 68 }
65 } 69 }
66 70
71 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
72 SkASSERT(count > 0);
73 // dst must be 2-byte aligned.
74 SkASSERT((((size_t)dst) & 0x01) == 0);
mtklein 2015/01/30 18:17:37 Seems paranoid? If we need to keep this, let's wr
henrik.smiding 2015/02/10 15:11:51 I thought so too, but I actually found it in the n
75
76 uint32_t src_expand = (SkGetPackedG32(src) << 24) |
77 (SkGetPackedR32(src) << 13) |
78 (SkGetPackedB32(src) << 2);
79 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
80
81 // Check if we have enough pixels to run SIMD, at all
82 if (count >= 8) {
mtklein 2015/01/30 18:17:37 This logic seems complex, and I'm worried we're ov
henrik.smiding 2015/02/10 15:11:52 I did this optimization based on reports that it w
83 __m128i* dst_wide;
84 const __m128i src_expand_wide = _mm_set1_epi32(src_expand);
85 const __m128i scale_wide = _mm_set1_epi32(scale);
86 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE |
87 SK_B16_MASK_IN_PLACE |
88 (SK_G16_MASK_IN_PLACE << 16));
89
90 // Check if we should run the aligned SIMD loop
91 if ((count >= 64) || ((((size_t)dst) & 0x0F) == 0)) {
mtklein 2015/01/30 18:17:37 How'd you pick 64?
henrik.smiding 2015/02/10 15:11:52 I measured different widths with skia bench, using
92 // Align dst to an even 16 byte address (0-7 pixels)
93 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
94 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
mtklein 2015/01/30 18:17:37 You've got a lot of repeated code here. If it nee
henrik.smiding 2015/02/10 15:11:51 I'll re-factor the code a bit. When you're satisfi
95 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
96 dst += 1;
97 count--;
98 }
99
100 dst_wide = reinterpret_cast<__m128i*>(dst);
101 do {
102 // Load 8 RGB565 pixels
103 __m128i pixels = _mm_load_si128(dst_wide);
104
105 // Duplicate and mask
106 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);
107 pixels_high = _mm_and_si128(mask_green, pixels_high);
108 pixels = _mm_unpacklo_epi16(pixels, pixels);
109 pixels = _mm_and_si128(mask_green, pixels);
110
111 // Scale with alpha
112 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);
mtklein 2015/01/30 18:17:37 It doesn't seem like SSE4 is essential to the spee
henrik.smiding 2015/02/10 15:11:51 I added the _mm_hadd instruction at the last minut
113 pixels = _mm_mullo_epi32(pixels, scale_wide);
114
115 // Add src_expand_wide and shift down again
116 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide);
117 pixels_high = _mm_srli_epi32(pixels_high, 5);
118 pixels = _mm_add_epi32(pixels, src_expand_wide);
119 pixels = _mm_srli_epi32(pixels, 5);
120
121 // Mask
122 pixels_high = _mm_and_si128(mask_green, pixels_high);
123 pixels = _mm_and_si128(mask_green, pixels);
124
125 // Combine into RGB565 and store
126 pixels = _mm_hadd_epi16(pixels, pixels_high);
127 _mm_store_si128(dst_wide, pixels);
128 count -= 8;
129 dst_wide++;
130 } while (count >= 8);
131 }
132 else { // Unaligned loop to handle medium widths
133 dst_wide = reinterpret_cast<__m128i*>(dst);
134
135 do {
136 // Load 8 RGB565 pixels
137 __m128i pixels = _mm_loadu_si128(dst_wide);
138
139 // Duplicate and mask
140 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);
141 pixels_high = _mm_and_si128(mask_green, pixels_high);
142 pixels = _mm_unpacklo_epi16(pixels, pixels);
143 pixels = _mm_and_si128(mask_green, pixels);
144
145 // Scale with alpha
146 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);
147 pixels = _mm_mullo_epi32(pixels, scale_wide);
148
149 // Add src_expand_wide and shift down again
150 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide);
151 pixels_high = _mm_srli_epi32(pixels_high, 5);
152 pixels = _mm_add_epi32(pixels, src_expand_wide);
153 pixels = _mm_srli_epi32(pixels, 5);
154
155 // Mask
156 pixels_high = _mm_and_si128(mask_green, pixels_high);
157 pixels = _mm_and_si128(mask_green, pixels);
158
159 // Combine into RGB565 and store
160 pixels = _mm_hadd_epi16(pixels, pixels_high);
161 _mm_storeu_si128(dst_wide, pixels);
162 count -= 8;
163 dst_wide++;
164 } while (count >= 8);
165 }
166
167 dst = reinterpret_cast<uint16_t*>(dst_wide);
168 }
169
170 // Small loop to handle remaining 0-7 pixels.
171 while (count > 0) {
172 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
173 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
174 dst += 1;
175 count--;
176 }
177 }
178
67 #endif 179 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698