| OLD | NEW |
| 1 // Copyright 2014 Google Inc. All Rights Reserved. | 1 // Copyright 2014 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // Utilities for processing transparent channel. | 10 // Utilities for processing transparent channel. |
| (...skipping 132 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 143 return (alpha_and == 0xff); | 143 return (alpha_and == 0xff); |
| 144 } | 144 } |
| 145 | 145 |
| 146 //------------------------------------------------------------------------------ | 146 //------------------------------------------------------------------------------ |
| 147 // Non-dither premultiplied modes | 147 // Non-dither premultiplied modes |
| 148 | 148 |
| 149 #define MULTIPLIER(a) ((a) * 0x8081) | 149 #define MULTIPLIER(a) ((a) * 0x8081) |
| 150 #define PREMULTIPLY(x, m) (((x) * (m)) >> 23) | 150 #define PREMULTIPLY(x, m) (((x) * (m)) >> 23) |
| 151 | 151 |
| 152 // We can't use a 'const int' for the SHUFFLE value, because it has to be an | 152 // We can't use a 'const int' for the SHUFFLE value, because it has to be an |
| 153 // immediate in the _mm_shufflexx_epi16() instruction. We really a macro here. | 153 // immediate in the _mm_shufflexx_epi16() instruction. We really need a macro. |
| 154 #define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do { \ | 154 // We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit |
| 155 const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX)); \ | 155 // value. |
| 156 const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero); \ | 156 #define APPLY_ALPHA(RGBX, SHUFFLE) do { \ |
| 157 const __m128i alpha0 = _mm_and_si128(argb1, MASK); \ | 157 const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX)); \ |
| 158 const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE); \ | 158 const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero); \ |
| 159 const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE); \ | 159 const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero); \ |
| 160 /* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */ \ | 160 const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask); \ |
| 161 const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT); \ | 161 const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask); \ |
| 162 const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT); \ | 162 const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \ |
| 163 const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0); \ | 163 const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \ |
| 164 const __m128i argb3 = _mm_mullo_epi16(argb1, scale1); \ | 164 const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \ |
| 165 const __m128i argb4 = _mm_adds_epu16(argb2, argb3); \ | 165 const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \ |
| 166 const __m128i argb5 = _mm_srli_epi16(argb4, 7); \ | 166 /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */ \ |
| 167 const __m128i argb6 = _mm_or_si128(argb5, alpha0); \ | 167 const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo); \ |
| 168 const __m128i argb7 = _mm_packus_epi16(argb6, zero); \ | 168 const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi); \ |
| 169 _mm_storel_epi64((__m128i*)&(RGBX), argb7); \ | 169 const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult); \ |
| 170 const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult); \ |
| 171 const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7); \ |
| 172 const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7); \ |
| 173 const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi); \ |
| 174 _mm_storeu_si128((__m128i*)&(RGBX), A3); \ |
| 170 } while (0) | 175 } while (0) |
| 171 | 176 |
| 172 static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first, | 177 static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first, |
| 173 int w, int h, int stride) { | 178 int w, int h, int stride) { |
| 174 const __m128i zero = _mm_setzero_si128(); | 179 const __m128i zero = _mm_setzero_si128(); |
| 175 const int kSpan = 2; | 180 const __m128i kMult = _mm_set1_epi16(0x8081u); |
| 176 const int w2 = w & ~(kSpan - 1); | 181 const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0); |
| 182 const int kSpan = 4; |
| 177 while (h-- > 0) { | 183 while (h-- > 0) { |
| 178 uint32_t* const rgbx = (uint32_t*)rgba; | 184 uint32_t* const rgbx = (uint32_t*)rgba; |
| 179 int i; | 185 int i; |
| 180 if (!alpha_first) { | 186 if (!alpha_first) { |
| 181 const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0); | 187 for (i = 0; i + kSpan <= w; i += kSpan) { |
| 182 const __m128i kMult = | 188 APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3)); |
| 183 _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081); | |
| 184 for (i = 0; i < w2; i += kSpan) { | |
| 185 APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult); | |
| 186 } | 189 } |
| 187 } else { | 190 } else { |
| 188 const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff); | 191 for (i = 0; i + kSpan <= w; i += kSpan) { |
| 189 const __m128i kMult = | 192 APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1)); |
| 190 _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0); | |
| 191 for (i = 0; i < w2; i += kSpan) { | |
| 192 APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult); | |
| 193 } | 193 } |
| 194 } | 194 } |
| 195 // Finish with left-overs. | 195 // Finish with left-overs. |
| 196 for (; i < w; ++i) { | 196 for (; i < w; ++i) { |
| 197 uint8_t* const rgb = rgba + (alpha_first ? 1 : 0); | 197 uint8_t* const rgb = rgba + (alpha_first ? 1 : 0); |
| 198 const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3); | 198 const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3); |
| 199 const uint32_t a = alpha[4 * i]; | 199 const uint32_t a = alpha[4 * i]; |
| 200 if (a != 0xff) { | 200 if (a != 0xff) { |
| 201 const uint32_t mult = MULTIPLIER(a); | 201 const uint32_t mult = MULTIPLIER(a); |
| 202 rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult); | 202 rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult); |
| 203 rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult); | 203 rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult); |
| 204 rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult); | 204 rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult); |
| 205 } | 205 } |
| 206 } | 206 } |
| 207 rgba += stride; | 207 rgba += stride; |
| 208 } | 208 } |
| 209 } | 209 } |
| 210 #undef MULTIPLIER | 210 #undef MULTIPLIER |
| 211 #undef PREMULTIPLY | 211 #undef PREMULTIPLY |
| 212 | 212 |
| 213 // ----------------------------------------------------------------------------- | 213 // ----------------------------------------------------------------------------- |
| 214 // Apply alpha value to rows | 214 // Apply alpha value to rows |
| 215 | 215 |
| 216 // We use: kINV255 = (1 << 24) / 255 = 0x010101 | 216 static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) { |
| 217 // So: a * kINV255 = (a << 16) | [(a << 8) | a] | |
| 218 // -> _mm_mulhi_epu16() takes care of the (a<<16) part, | |
| 219 // and _mm_mullo_epu16(a * 0x0101,...) takes care of the "(a << 8) | a" one. | |
| 220 | |
| 221 static void MultARGBRow(uint32_t* const ptr, int width, int inverse) { | |
| 222 int x = 0; | 217 int x = 0; |
| 223 if (!inverse) { | 218 if (!inverse) { |
| 224 const int kSpan = 2; | 219 const int kSpan = 2; |
| 225 const __m128i zero = _mm_setzero_si128(); | 220 const __m128i zero = _mm_setzero_si128(); |
| 226 const __m128i kRound = | 221 const __m128i k128 = _mm_set1_epi16(128); |
| 227 _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7); | 222 const __m128i kMult = _mm_set1_epi16(0x0101); |
| 228 const __m128i kMult = | 223 const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0); |
| 229 _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101); | 224 for (x = 0; x + kSpan <= width; x += kSpan) { |
| 230 const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0); | 225 // To compute 'result = (int)(a * v / 255. + .5)', we use: |
| 231 const int w2 = width & ~(kSpan - 1); | 226 // tmp = a * v + 128, result = (tmp * 0x0101u) >> 16 |
| 232 for (x = 0; x < w2; x += kSpan) { | 227 const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]); |
| 233 const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]); | 228 const __m128i A1 = _mm_unpacklo_epi8(A0, zero); |
| 234 const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero); | 229 const __m128i A2 = _mm_or_si128(A1, kMask); |
| 235 const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3)); | 230 const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3)); |
| 236 const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3)); | 231 const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3)); |
| 237 const __m128i tmp2 = _mm_srli_epi64(tmp1, 16); | 232 // here, A4 = [ff a0 a0 a0][ff a1 a1 a1] |
| 238 const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult); | 233 const __m128i A5 = _mm_mullo_epi16(A4, A1); |
| 239 const __m128i scale1 = _mm_or_si128(tmp2, kOne64); | 234 const __m128i A6 = _mm_add_epi16(A5, k128); |
| 240 const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0); | 235 const __m128i A7 = _mm_mulhi_epu16(A6, kMult); |
| 241 const __m128i argb3 = _mm_mullo_epi16(argb1, scale1); | 236 const __m128i A10 = _mm_packus_epi16(A7, zero); |
| 242 const __m128i argb4 = _mm_adds_epu16(argb2, argb3); | 237 _mm_storel_epi64((__m128i*)&ptr[x], A10); |
| 243 const __m128i argb5 = _mm_adds_epu16(argb4, kRound); | |
| 244 const __m128i argb6 = _mm_srli_epi16(argb5, 8); | |
| 245 const __m128i argb7 = _mm_packus_epi16(argb6, zero); | |
| 246 _mm_storel_epi64((__m128i*)&ptr[x], argb7); | |
| 247 } | 238 } |
| 248 } | 239 } |
| 249 width -= x; | 240 width -= x; |
| 250 if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse); | 241 if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse); |
| 251 } | 242 } |
| 252 | 243 |
| 253 static void MultRow(uint8_t* const ptr, const uint8_t* const alpha, | 244 static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha, |
| 254 int width, int inverse) { | 245 int width, int inverse) { |
| 255 int x = 0; | 246 int x = 0; |
| 256 if (!inverse) { | 247 if (!inverse) { |
| 257 const int kSpan = 8; | |
| 258 const __m128i zero = _mm_setzero_si128(); | 248 const __m128i zero = _mm_setzero_si128(); |
| 259 const __m128i kRound = _mm_set1_epi16(1 << 7); | 249 const __m128i k128 = _mm_set1_epi16(128); |
| 260 const int w2 = width & ~(kSpan - 1); | 250 const __m128i kMult = _mm_set1_epi16(0x0101); |
| 261 for (x = 0; x < w2; x += kSpan) { | 251 for (x = 0; x + 8 <= width; x += 8) { |
| 262 const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]); | 252 const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]); |
| 253 const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]); |
| 263 const __m128i v1 = _mm_unpacklo_epi8(v0, zero); | 254 const __m128i v1 = _mm_unpacklo_epi8(v0, zero); |
| 264 const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]); | 255 const __m128i a1 = _mm_unpacklo_epi8(a0, zero); |
| 265 const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero); | 256 const __m128i v2 = _mm_mullo_epi16(v1, a1); |
| 266 const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0); | 257 const __m128i v3 = _mm_add_epi16(v2, k128); |
| 267 const __m128i v2 = _mm_mulhi_epu16(v1, alpha2); | 258 const __m128i v4 = _mm_mulhi_epu16(v3, kMult); |
| 268 const __m128i v3 = _mm_mullo_epi16(v1, alpha1); | 259 const __m128i v5 = _mm_packus_epi16(v4, zero); |
| 269 const __m128i v4 = _mm_adds_epu16(v2, v3); | 260 _mm_storel_epi64((__m128i*)&ptr[x], v5); |
| 270 const __m128i v5 = _mm_adds_epu16(v4, kRound); | |
| 271 const __m128i v6 = _mm_srli_epi16(v5, 8); | |
| 272 const __m128i v7 = _mm_packus_epi16(v6, zero); | |
| 273 _mm_storel_epi64((__m128i*)&ptr[x], v7); | |
| 274 } | 261 } |
| 275 } | 262 } |
| 276 width -= x; | 263 width -= x; |
| 277 if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse); | 264 if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse); |
| 278 } | 265 } |
| 279 | 266 |
| 280 //------------------------------------------------------------------------------ | 267 //------------------------------------------------------------------------------ |
| 281 // Entry point | 268 // Entry point |
| 282 | 269 |
| 283 extern void WebPInitAlphaProcessingSSE2(void); | 270 extern void WebPInitAlphaProcessingSSE2(void); |
| 284 | 271 |
| 285 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { | 272 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { |
| 286 WebPMultARGBRow = MultARGBRow; | 273 WebPMultARGBRow = MultARGBRow_SSE2; |
| 287 WebPMultRow = MultRow; | 274 WebPMultRow = MultRow_SSE2; |
| 288 WebPApplyAlphaMultiply = ApplyAlphaMultiply; | 275 WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2; |
| 289 WebPDispatchAlpha = DispatchAlpha; | 276 WebPDispatchAlpha = DispatchAlpha; |
| 290 WebPDispatchAlphaToGreen = DispatchAlphaToGreen; | 277 WebPDispatchAlphaToGreen = DispatchAlphaToGreen; |
| 291 WebPExtractAlpha = ExtractAlpha; | 278 WebPExtractAlpha = ExtractAlpha; |
| 292 } | 279 } |
| 293 | 280 |
| 294 #else // !WEBP_USE_SSE2 | 281 #else // !WEBP_USE_SSE2 |
| 295 | 282 |
| 296 WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2) | 283 WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2) |
| 297 | 284 |
| 298 #endif // WEBP_USE_SSE2 | 285 #endif // WEBP_USE_SSE2 |
| OLD | NEW |