OLD | NEW |
1 // Copyright 2014 Google Inc. All Rights Reserved. | 1 // Copyright 2014 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
9 // | 9 // |
10 // Utilities for processing transparent channel. | 10 // Utilities for processing transparent channel. |
11 // | 11 // |
12 // Author: Skal (pascal.massimino@gmail.com) | 12 // Author: Skal (pascal.massimino@gmail.com) |
13 | 13 |
14 #include "./dsp.h" | 14 #include "./dsp.h" |
15 | 15 |
16 #if defined(WEBP_USE_SSE2) | 16 #if defined(WEBP_USE_SSE2) |
17 #include <emmintrin.h> | 17 #include <emmintrin.h> |
18 | 18 |
19 //------------------------------------------------------------------------------ | 19 //------------------------------------------------------------------------------ |
20 | 20 |
// Copies a 'width' x 'height' plane of 8-bit alpha values into an interleaved
// buffer made of 4 bytes per pixel: for each row, alpha[i] is written to
// dst[4 * i] and the three following bytes of that pixel are left untouched.
// 'alpha_stride' and 'dst_stride' are the distances, in bytes, between two
// consecutive rows of 'alpha' and 'dst'.
// Returns non-zero if at least one of the alpha values is not 0xff.
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
                         int width, int height,
                         uint8_t* dst, int dst_stride) {
  const __m128i kZero = _mm_setzero_si128();
  // Clears the byte that receives the alpha value, keeps the three others.
  const __m128i kKeepOthers = _mm_set1_epi32(0xffffff00u);
  // 0xff in the eight low bytes (where the alpha values get loaded), 0 above.
  const __m128i kAllOpaque = _mm_set_epi32(0, 0, ~0u, ~0u);
  // The vector loop reads and rewrites whole 4-byte pixels, whereas the scalar
  // loop only touches dst[4 * i]. The vector loop is therefore limited to at
  // most 'width - 1' pixels (rounded down to a multiple of 8), so that the
  // bytes following the last written one, 'dst[4 * width - 4]', are never
  // accessed.
  const int vector_width = (width - 1) & ~7;
  // Running 'and' of the alpha values: eight parallel lanes for the vector
  // loop, a single byte for the scalar one. Both remain at 0xff only as long
  // as every value seen is 0xff.
  __m128i vector_and = kAllOpaque;
  uint32_t scalar_and = 0xff;
  int y;

  for (y = 0; y < height; ++y) {
    int x = 0;
    while (x < vector_width) {
      __m128i* const pixels = (__m128i*)(dst + 4 * x);
      // Eight alpha bytes, each widened to the low byte of a 32-bit lane.
      const __m128i a8 = _mm_loadl_epi64((const __m128i*)(alpha + x));
      const __m128i a16 = _mm_unpacklo_epi8(a8, kZero);
      const __m128i a32_lo = _mm_unpacklo_epi16(a16, kZero);
      const __m128i a32_hi = _mm_unpackhi_epi16(a16, kZero);
      // Eight destination pixels (32 bytes).
      const __m128i old_lo = _mm_loadu_si128(pixels + 0);
      const __m128i old_hi = _mm_loadu_si128(pixels + 1);
      // Replace the alpha byte of each pixel.
      const __m128i new_lo =
          _mm_or_si128(_mm_and_si128(old_lo, kKeepOthers), a32_lo);
      const __m128i new_hi =
          _mm_or_si128(_mm_and_si128(old_hi, kKeepOthers), a32_hi);
      _mm_storeu_si128(pixels + 0, new_lo);
      _mm_storeu_si128(pixels + 1, new_hi);
      vector_and = _mm_and_si128(vector_and, a8);
      x += 8;
    }
    // Remaining pixels of the row, one alpha byte at a time.
    while (x < width) {
      const uint32_t value = alpha[x];
      dst[4 * x] = value;
      scalar_and &= value;
      ++x;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Fold the eight vector lanes into the 8 low bits of the scalar mask: bit
  // 'k' stays set only if lane 'k' is still equal to 0xff.
  scalar_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(vector_and, kAllOpaque));
  return (scalar_and != 0xff);
}
| 74 |
// Expands a 'width' x 'height' plane of 8-bit alpha values into 32-bit
// pixels: each output value is 'alpha << 8', all other bits being zero.
// 'alpha_stride' is counted in bytes, 'dst_stride' in 32-bit pixels.
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
                                 int width, int height,
                                 uint32_t* dst, int dst_stride) {
  const __m128i kZero = _mm_setzero_si128();
  const int vector_width = width & ~15;   // largest multiple of 16 <= width
  int y;
  for (y = 0; y < height; ++y) {
    int x = 0;
    while (x < vector_width) {   // 16 alpha bytes -> 16 pixels per iteration
      const __m128i in = _mm_loadu_si128((const __m128i*)(alpha + x));
      __m128i* const out = (__m128i*)(dst + x);
      // 'kZero' comes first so that alpha lands in the *high* byte of each
      // 16-bit lane, i.e. each lane holds 'alpha << 8'.
      const __m128i shifted_lo = _mm_unpacklo_epi8(kZero, in);
      const __m128i shifted_hi = _mm_unpackhi_epi8(kZero, in);
      // Zero-extend the 16-bit lanes to 32 bits and store four pixels apiece.
      _mm_storeu_si128(out + 0, _mm_unpacklo_epi16(shifted_lo, kZero));
      _mm_storeu_si128(out + 1, _mm_unpackhi_epi16(shifted_lo, kZero));
      _mm_storeu_si128(out + 2, _mm_unpacklo_epi16(shifted_hi, kZero));
      _mm_storeu_si128(out + 3, _mm_unpackhi_epi16(shifted_hi, kZero));
      x += 16;
    }
    // Left-over pixels of the row.
    while (x < width) {
      dst[x] = alpha[x] << 8;
      ++x;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
}
| 100 |
21 static int ExtractAlpha(const uint8_t* argb, int argb_stride, | 101 static int ExtractAlpha(const uint8_t* argb, int argb_stride, |
22 int width, int height, | 102 int width, int height, |
23 uint8_t* alpha, int alpha_stride) { | 103 uint8_t* alpha, int alpha_stride) { |
24 // alpha_and stores an 'and' operation of all the alpha[] values. The final | 104 // alpha_and stores an 'and' operation of all the alpha[] values. The final |
25 // value is not 0xff if any of the alpha[] is not equal to 0xff. | 105 // value is not 0xff if any of the alpha[] is not equal to 0xff. |
26 uint32_t alpha_and = 0xff; | 106 uint32_t alpha_and = 0xff; |
27 int i, j; | 107 int i, j; |
28 const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha | 108 const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha |
29 const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); | 109 const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); |
30 __m128i all_alphas = all_0xff; | 110 __m128i all_alphas = all_0xff; |
(...skipping 25 matching lines...) Expand all Loading... |
56 alpha_and &= alpha_value; | 136 alpha_and &= alpha_value; |
57 } | 137 } |
58 argb += argb_stride; | 138 argb += argb_stride; |
59 alpha += alpha_stride; | 139 alpha += alpha_stride; |
60 } | 140 } |
61 // Combine the eight alpha 'and' into a 8-bit mask. | 141 // Combine the eight alpha 'and' into a 8-bit mask. |
62 alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff)); | 142 alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff)); |
63 return (alpha_and == 0xff); | 143 return (alpha_and == 0xff); |
64 } | 144 } |
65 | 145 |
66 #endif // WEBP_USE_SSE2 | 146 //------------------------------------------------------------------------------ |
| 147 // Non-dither premultiplied modes |
| 148 |
// Fixed-point premultiplication: PREMULTIPLY(x, MULTIPLIER(a)) evaluates
// (x * a * 0x8081) >> 23.
#define MULTIPLIER(a)   ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)

// We can't use a 'const int' for the SHUFFLE value, because it has to be an
// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro
// here.
// APPLY_ALPHA premultiplies, in place, the two 4-byte pixels stored at RGBX:
//  - MASK selects the 16-bit lane holding the alpha value of each pixel,
//  - SHUFFLE broadcasts that alpha to the three color lanes of the pixel and
//    puts a zero in the alpha lane,
//  - MULT holds 0x8081 in the color lanes and 0 in the alpha lane.
// The alpha byte itself is written back unchanged. A variable named 'zero'
// (an all-zero __m128i) must be in scope at the point of use.
#define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do {                           \
  const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX));                   \
  const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);                       \
  const __m128i alpha0 = _mm_and_si128(argb1, MASK);                          \
  const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE);                \
  const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE);                \
  /* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */                                     \
  /* scale0 / scale1 = low / high 16 bits of 'alpha * 0x8081' */              \
  const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT);                       \
  const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT);                       \
  /* argb4 = (x * alpha * 0x8081) >> 16, then 7 more bits to reach >> 23 */   \
  const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);                       \
  const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);                       \
  const __m128i argb4 = _mm_adds_epu16(argb2, argb3);                         \
  const __m128i argb5 = _mm_srli_epi16(argb4, 7);                             \
  /* the alpha lane was multiplied by zero: put the original alpha back */    \
  const __m128i argb6 = _mm_or_si128(argb5, alpha0);                          \
  const __m128i argb7 = _mm_packus_epi16(argb6, zero);                        \
  _mm_storel_epi64((__m128i*)&(RGBX), argb7);                                 \
} while (0)

// Premultiplies, in place, the three color bytes of each 4-byte pixel by the
// pixel's alpha byte, over a 'w' x 'h' area whose rows are 'stride' bytes
// apart. The alpha byte is the first byte of the pixel if 'alpha_first' is
// non-zero, the last one otherwise. Alpha values are left unchanged.
static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
                               int w, int h, int stride) {
  const __m128i zero = _mm_setzero_si128();
  // _mm_set_epi16() takes 'short' arguments and 0x8081 does not fit in a
  // signed 16-bit value: make the narrowing explicit. Only the bit pattern
  // matters, since the products below are taken modulo 2^16 or unsigned.
  const short kMult16 = (short)0x8081;
  const int kSpan = 2;                    // pixels handled per APPLY_ALPHA
  const int w2 = w & ~(kSpan - 1);
  while (h-- > 0) {
    uint32_t* const rgbx = (uint32_t*)rgba;
    int i;
    if (!alpha_first) {
      // Alpha is in the highest 16-bit lane of each unpacked pixel.
      const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);
      const __m128i kMult =
          _mm_set_epi16(0, kMult16, kMult16, kMult16,
                        0, kMult16, kMult16, kMult16);
      for (i = 0; i < w2; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);
      }
    } else {
      // Alpha is in the lowest 16-bit lane of each unpacked pixel.
      const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);
      const __m128i kMult =
          _mm_set_epi16(kMult16, kMult16, kMult16, 0,
                        kMult16, kMult16, kMult16, 0);
      for (i = 0; i < w2; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);
      }
    }
    // Finish with left-overs.
    for (; i < w; ++i) {
      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
      const uint32_t a = alpha[4 * i];
      if (a != 0xff) {   // multiplying by 0xff leaves the colors unchanged
        const uint32_t mult = MULTIPLIER(a);
        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
      }
    }
    rgba += stride;
  }
}
#undef APPLY_ALPHA
#undef MULTIPLIER
#undef PREMULTIPLY
| 212 |
| 213 // ----------------------------------------------------------------------------- |
| 214 // Apply alpha value to rows |
| 215 |
| 216 // We use: kINV255 = (1 << 24) / 255 = 0x010101 |
| 217 // So: a * kINV255 = (a << 16) | [(a << 8) | a] |
| 218 // -> _mm_mullo_epi16(x, a) takes care of the (a << 16) part, |
| 219 // and _mm_mulhi_epu16(x, a * 0x0101) takes care of the "(a << 8) | a" one. |
| 220 |
// Multiplies, in place, the three low bytes of each of the 'width' 32-bit
// pixels in 'ptr' by the pixel's top byte (the alpha value), the top byte
// being kept as is. Pixels are processed two at a time with SSE2 when
// 'inverse' is zero; whatever is left (an odd trailing pixel, or the whole
// row when 'inverse' is non-zero) is handed over to WebPMultARGBRowC().
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int done = 0;   // number of pixels already handled by the SSE2 loop
  if (!inverse) {
    const __m128i kZero = _mm_setzero_si128();
    // Rounding term for the three color lanes; nothing for the alpha lane.
    const __m128i kHalf =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    // Turns 'a' into 'a * 0x0101' in the color lanes, 0 in the alpha lane.
    const __m128i kRepeat =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    // Multiplier (1 << 8) for the alpha lane: it cancels the final '>> 8'.
    const __m128i kAlphaScale =
        _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int even_width = width & ~1;
    while (done < even_width) {
      __m128i* const pair = (__m128i*)(ptr + done);
      // Two pixels, each byte widened to a 16-bit lane.
      const __m128i px = _mm_unpacklo_epi8(_mm_loadl_epi64(pair), kZero);
      // Broadcast the alpha lane (lane 3) over the four lanes of each pixel.
      const __m128i a_lo = _mm_shufflelo_epi16(px, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i a_all = _mm_shufflehi_epi16(a_lo, _MM_SHUFFLE(3, 3, 3, 3));
      // [a a a 0] per pixel, then [a a a 0x100] once the alpha scale is set.
      const __m128i a_colors = _mm_srli_epi64(a_all, 16);
      const __m128i mult_hi = _mm_or_si128(a_colors, kAlphaScale);
      const __m128i mult_lo = _mm_mullo_epi16(a_all, kRepeat);
      // sum = x * a + ((x * a * 0x0101) >> 16) for colors, a << 8 for alpha.
      const __m128i part_lo = _mm_mulhi_epu16(px, mult_lo);
      const __m128i part_hi = _mm_mullo_epi16(px, mult_hi);
      const __m128i sum = _mm_adds_epu16(part_lo, part_hi);
      const __m128i rounded = _mm_adds_epu16(sum, kHalf);
      const __m128i result = _mm_srli_epi16(rounded, 8);
      _mm_storel_epi64(pair, _mm_packus_epi16(result, kZero));
      done += 2;
    }
  }
  if (done < width) WebPMultARGBRowC(ptr + done, width - done, inverse);
}
| 252 |
// Multiplies, in place, each of the 'width' bytes of 'ptr' by the matching
// byte of 'alpha'. Values are processed eight at a time with SSE2 when
// 'inverse' is zero; whatever is left (the trailing 'width % 8' values, or
// the whole row when 'inverse' is non-zero) is handed over to WebPMultRowC().
static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
                    int width, int inverse) {
  int done = 0;   // number of values already handled by the SSE2 loop
  if (!inverse) {
    const __m128i kZero = _mm_setzero_si128();
    const __m128i kHalf = _mm_set1_epi16(1 << 7);   // rounding term
    const int vector_width = width & ~7;
    while (done < vector_width) {
      __m128i* const out = (__m128i*)(ptr + done);
      const __m128i a8 = _mm_loadl_epi64((const __m128i*)(alpha + done));
      // Eight values widened to 16-bit lanes.
      const __m128i v16 = _mm_unpacklo_epi8(_mm_loadl_epi64(out), kZero);
      // 'a' and 'a * 0x0101' (the byte repeated twice) in 16-bit lanes.
      const __m128i a16 = _mm_unpacklo_epi8(a8, kZero);
      const __m128i a_twice = _mm_unpacklo_epi8(a8, a8);
      // sum = v * a + ((v * a * 0x0101) >> 16)
      const __m128i part_lo = _mm_mulhi_epu16(v16, a_twice);
      const __m128i part_hi = _mm_mullo_epi16(v16, a16);
      const __m128i sum = _mm_adds_epu16(part_lo, part_hi);
      const __m128i rounded = _mm_adds_epu16(sum, kHalf);
      const __m128i result = _mm_srli_epi16(rounded, 8);
      _mm_storel_epi64(out, _mm_packus_epi16(result, kZero));
      done += 8;
    }
  }
  if (done < width) {
    WebPMultRowC(ptr + done, alpha + done, width - done, inverse);
  }
}
67 | 279 |
68 //------------------------------------------------------------------------------ | 280 //------------------------------------------------------------------------------ |
69 // Init function | 281 // Entry point |
70 | 282 |
71 extern void WebPInitAlphaProcessingSSE2(void); | 283 extern void WebPInitAlphaProcessingSSE2(void); |
72 | 284 |
73 void WebPInitAlphaProcessingSSE2(void) { | 285 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { |
74 #if defined(WEBP_USE_SSE2) | 286 WebPMultARGBRow = MultARGBRow; |
| 287 WebPMultRow = MultRow; |
| 288 WebPApplyAlphaMultiply = ApplyAlphaMultiply; |
| 289 WebPDispatchAlpha = DispatchAlpha; |
| 290 WebPDispatchAlphaToGreen = DispatchAlphaToGreen; |
75 WebPExtractAlpha = ExtractAlpha; | 291 WebPExtractAlpha = ExtractAlpha; |
76 #endif | |
77 } | 292 } |
| 293 |
| 294 #else // !WEBP_USE_SSE2 |
| 295 |
| 296 WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2) |
| 297 |
| 298 #endif // WEBP_USE_SSE2 |
OLD | NEW |