| OLD | NEW |
| 1 // Copyright 2014 Google Inc. All Rights Reserved. | 1 // Copyright 2014 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // SSE2 variant of methods for lossless decoder | 10 // SSE2 variant of methods for lossless decoder |
| 11 // | 11 // |
| 12 // Author: Skal (pascal.massimino@gmail.com) | 12 // Author: Skal (pascal.massimino@gmail.com) |
| 13 | 13 |
| 14 #include "./dsp.h" | 14 #include "./dsp.h" |
| 15 | 15 |
| 16 #if defined(WEBP_USE_SSE2) | 16 #if defined(WEBP_USE_SSE2) |
| 17 |
| 18 #include "./common_sse2.h" |
| 19 #include "./lossless.h" |
| 20 #include "./lossless_common.h" |
| 17 #include <assert.h> | 21 #include <assert.h> |
| 18 #include <emmintrin.h> | 22 #include <emmintrin.h> |
| 19 #include "./lossless.h" | |
| 20 | 23 |
| 21 //------------------------------------------------------------------------------ | 24 //------------------------------------------------------------------------------ |
| 22 // Predictor Transform | 25 // Predictor Transform |
| 23 | 26 |
| 24 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, | 27 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, |
| 25 uint32_t c2) { | 28 uint32_t c2) { |
| 26 const __m128i zero = _mm_setzero_si128(); | 29 const __m128i zero = _mm_setzero_si128(); |
| 27 const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); | 30 const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); |
| 28 const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); | 31 const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); |
| 29 const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero); | 32 const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero); |
| (...skipping 38 matching lines...) |
| 68 const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c| | 71 const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c| |
| 69 const __m128i diff = _mm_sub_epi16(pb, pa); | 72 const __m128i diff = _mm_sub_epi16(pb, pa); |
| 70 { | 73 { |
| 71 int16_t out[8]; | 74 int16_t out[8]; |
| 72 _mm_storeu_si128((__m128i*)out, diff); | 75 _mm_storeu_si128((__m128i*)out, diff); |
| 73 pa_minus_pb = out[0] + out[1] + out[2] + out[3]; | 76 pa_minus_pb = out[0] + out[1] + out[2] + out[3]; |
| 74 } | 77 } |
| 75 return (pa_minus_pb <= 0) ? a : b; | 78 return (pa_minus_pb <= 0) ? a : b; |
| 76 } | 79 } |
| 77 | 80 |
| 78 static WEBP_INLINE __m128i Average2_128i(uint32_t a0, uint32_t a1) { | 81 static WEBP_INLINE void Average2_m128i(const __m128i* const a0, |
| 82 const __m128i* const a1, |
| 83 __m128i* const avg) { |
| 84 // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1) |
| 85 const __m128i ones = _mm_set1_epi8(1); |
| 86 const __m128i avg1 = _mm_avg_epu8(*a0, *a1); |
| 87 const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones); |
| 88 *avg = _mm_sub_epi8(avg1, one); |
| 89 } |
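Note: the identity in the comment works because _mm_avg_epu8 computes the rounded-up average ((a + b + 1) >> 1), and ((a ^ b) & 1) is 1 exactly when a + b is odd. A minimal scalar sketch checking it over all byte pairs (illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of one byte lane of Average2_m128i:
     * (a + b) >> 1 == ((a + b + 1) >> 1) - ((a ^ b) & 1). */
    static uint8_t AverageFloor(uint8_t a, uint8_t b) {
      const int avg_up = (a + b + 1) >> 1;  /* what _mm_avg_epu8 computes */
      return (uint8_t)(avg_up - ((a ^ b) & 1));
    }

    int main(void) {
      int a, b;
      for (a = 0; a < 256; ++a)
        for (b = 0; b < 256; ++b)
          assert(AverageFloor((uint8_t)a, (uint8_t)b) == ((a + b) >> 1));
      return 0;
    }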
| 90 |
| 91 static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1, |
| 92 __m128i* const avg) { |
| 93 // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1) |
| 94 const __m128i ones = _mm_set1_epi8(1); |
| 95 const __m128i A0 = _mm_cvtsi32_si128(a0); |
| 96 const __m128i A1 = _mm_cvtsi32_si128(a1); |
| 97 const __m128i avg1 = _mm_avg_epu8(A0, A1); |
| 98 const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones); |
| 99 *avg = _mm_sub_epi8(avg1, one); |
| 100 } |
| 101 |
| 102 static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) { |
| 79 const __m128i zero = _mm_setzero_si128(); | 103 const __m128i zero = _mm_setzero_si128(); |
| 80 const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero); | 104 const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero); |
| 81 const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); | 105 const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); |
| 82 const __m128i sum = _mm_add_epi16(A1, A0); | 106 const __m128i sum = _mm_add_epi16(A1, A0); |
| 83 const __m128i avg = _mm_srli_epi16(sum, 1); | 107 return _mm_srli_epi16(sum, 1); |
| 84 return avg; | |
| 85 } | 108 } |
| 86 | 109 |
| 87 static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) { | 110 static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) { |
| 88 const __m128i avg = Average2_128i(a0, a1); | 111 __m128i output; |
| 89 const __m128i A2 = _mm_packus_epi16(avg, avg); | 112 Average2_uint32(a0, a1, &output); |
| 90 const uint32_t output = _mm_cvtsi128_si32(A2); | 113 return _mm_cvtsi128_si32(output); |
| 91 return output; | |
| 92 } | 114 } |
| 93 | 115 |
| 94 static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) { | 116 static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) { |
| 95 const __m128i zero = _mm_setzero_si128(); | 117 const __m128i zero = _mm_setzero_si128(); |
| 96 const __m128i avg1 = Average2_128i(a0, a2); | 118 const __m128i avg1 = Average2_uint32_16(a0, a2); |
| 97 const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); | 119 const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); |
| 98 const __m128i sum = _mm_add_epi16(avg1, A1); | 120 const __m128i sum = _mm_add_epi16(avg1, A1); |
| 99 const __m128i avg2 = _mm_srli_epi16(sum, 1); | 121 const __m128i avg2 = _mm_srli_epi16(sum, 1); |
| 100 const __m128i A2 = _mm_packus_epi16(avg2, avg2); | 122 const __m128i A2 = _mm_packus_epi16(avg2, avg2); |
| 101 const uint32_t output = _mm_cvtsi128_si32(A2); | 123 const uint32_t output = _mm_cvtsi128_si32(A2); |
| 102 return output; | 124 return output; |
| 103 } | 125 } |
| 104 | 126 |
| 105 static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, | 127 static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, |
| 106 uint32_t a2, uint32_t a3) { | 128 uint32_t a2, uint32_t a3) { |
| 107 const __m128i avg1 = Average2_128i(a0, a1); | 129 const __m128i avg1 = Average2_uint32_16(a0, a1); |
| 108 const __m128i avg2 = Average2_128i(a2, a3); | 130 const __m128i avg2 = Average2_uint32_16(a2, a3); |
| 109 const __m128i sum = _mm_add_epi16(avg2, avg1); | 131 const __m128i sum = _mm_add_epi16(avg2, avg1); |
| 110 const __m128i avg3 = _mm_srli_epi16(sum, 1); | 132 const __m128i avg3 = _mm_srli_epi16(sum, 1); |
| 111 const __m128i A0 = _mm_packus_epi16(avg3, avg3); | 133 const __m128i A0 = _mm_packus_epi16(avg3, avg3); |
| 112 const uint32_t output = _mm_cvtsi128_si32(A0); | 134 const uint32_t output = _mm_cvtsi128_si32(A0); |
| 113 return output; | 135 return output; |
| 114 } | 136 } |
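Note: per 8-bit channel, these helpers compute nested truncated averages; widening to 16-bit lanes keeps the intermediate sums exact. A scalar model of the arithmetic (illustrative):

    /* Per-channel model of Average3/Average4 (each value is one 8-bit
     * channel widened to int). */
    static int Avg2(int a, int b) { return (a + b) >> 1; }
    static int Avg3(int a0, int a1, int a2) { return Avg2(Avg2(a0, a2), a1); }
    static int Avg4(int a0, int a1, int a2, int a3) {
      return Avg2(Avg2(a0, a1), Avg2(a2, a3));
    }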
| 115 | 137 |
| 116 static uint32_t Predictor5(uint32_t left, const uint32_t* const top) { | 138 static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) { |
| 117 const uint32_t pred = Average3(left, top[0], top[1]); | 139 const uint32_t pred = Average3(left, top[0], top[1]); |
| 118 return pred; | 140 return pred; |
| 119 } | 141 } |
| 120 static uint32_t Predictor6(uint32_t left, const uint32_t* const top) { | 142 static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) { |
| 121 const uint32_t pred = Average2(left, top[-1]); | 143 const uint32_t pred = Average2(left, top[-1]); |
| 122 return pred; | 144 return pred; |
| 123 } | 145 } |
| 124 static uint32_t Predictor7(uint32_t left, const uint32_t* const top) { | 146 static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) { |
| 125 const uint32_t pred = Average2(left, top[0]); | 147 const uint32_t pred = Average2(left, top[0]); |
| 126 return pred; | 148 return pred; |
| 127 } | 149 } |
| 128 static uint32_t Predictor8(uint32_t left, const uint32_t* const top) { | 150 static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) { |
| 129 const uint32_t pred = Average2(top[-1], top[0]); | 151 const uint32_t pred = Average2(top[-1], top[0]); |
| 130 (void)left; | 152 (void)left; |
| 131 return pred; | 153 return pred; |
| 132 } | 154 } |
| 133 static uint32_t Predictor9(uint32_t left, const uint32_t* const top) { | 155 static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) { |
| 134 const uint32_t pred = Average2(top[0], top[1]); | 156 const uint32_t pred = Average2(top[0], top[1]); |
| 135 (void)left; | 157 (void)left; |
| 136 return pred; | 158 return pred; |
| 137 } | 159 } |
| 138 static uint32_t Predictor10(uint32_t left, const uint32_t* const top) { | 160 static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) { |
| 139 const uint32_t pred = Average4(left, top[-1], top[0], top[1]); | 161 const uint32_t pred = Average4(left, top[-1], top[0], top[1]); |
| 140 return pred; | 162 return pred; |
| 141 } | 163 } |
| 142 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) { | 164 static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) { |
| 143 const uint32_t pred = Select(top[0], left, top[-1]); | 165 const uint32_t pred = Select(top[0], left, top[-1]); |
| 144 return pred; | 166 return pred; |
| 145 } | 167 } |
| 146 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) { | 168 static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) { |
| 147 const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); | 169 const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); |
| 148 return pred; | 170 return pred; |
| 149 } | 171 } |
| 150 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { | 172 static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) { |
| 151 const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); | 173 const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); |
| 152 return pred; | 174 return pred; |
| 153 } | 175 } |
| 154 | 176 |
| 177 // Batch versions of the above predictor functions. |
| 178 |
| 179 // Predictor0: ARGB_BLACK. |
| 180 static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, |
| 181 int num_pixels, uint32_t* out) { |
| 182 int i; |
| 183 const __m128i black = _mm_set1_epi32(ARGB_BLACK); |
| 184 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 185 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
| 186 const __m128i res = _mm_add_epi8(src, black); |
| 187 _mm_storeu_si128((__m128i*)&out[i], res); |
| 188 } |
| 189 if (i != num_pixels) { |
| 190 VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i); |
| 191 } |
| 192 } |
| 193 |
| 194 // Predictor1: left. |
| 195 static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper, |
| 196 int num_pixels, uint32_t* out) { |
| 197 int i; |
| 198 __m128i prev = _mm_set1_epi32(out[-1]); |
| 199 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 200 // a | b | c | d |
| 201 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
| 202 // 0 | a | b | c |
| 203 const __m128i shift0 = _mm_slli_si128(src, 4); |
| 204 // a | a + b | b + c | c + d |
| 205 const __m128i sum0 = _mm_add_epi8(src, shift0); |
| 206 // 0 | 0 | a | a + b |
| 207 const __m128i shift1 = _mm_slli_si128(sum0, 8); |
| 208 // a | a + b | a + b + c | a + b + c + d |
| 209 const __m128i sum1 = _mm_add_epi8(sum0, shift1); |
| 210 const __m128i res = _mm_add_epi8(sum1, prev); |
| 211 _mm_storeu_si128((__m128i*)&out[i], res); |
| 212 // replicate prev output on the four lanes |
| 213 prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6)); |
| 214 } |
| 215 if (i != num_pixels) { |
| 216 VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i); |
| 217 } |
| 218 } |
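Note: the two shift-and-add steps compute a log-step inclusive prefix sum of the four residuals, per byte mod 256. A scalar sketch of what one 4-pixel iteration produces (the add_pixels helper below is illustrative):

    #include <stdint.h>

    /* Byte-wise mod-256 add of two packed pixels (what _mm_add_epi8 does
     * within each 32-bit lane), via a SIMD-within-a-register mask trick. */
    static uint32_t add_pixels(uint32_t a, uint32_t b) {
      return (((a & 0xff00ff00u) + (b & 0xff00ff00u)) & 0xff00ff00u) |
             (((a & 0x00ff00ffu) + (b & 0x00ff00ffu)) & 0x00ff00ffu);
    }

    /* One iteration of PredictorAdd1: out[j] = in[j] + out[j - 1]. */
    static void PredictorAdd1Model(const uint32_t in[4], uint32_t left,
                                   uint32_t out[4]) {
      int j;
      for (j = 0; j < 4; ++j) {
        left = add_pixels(left, in[j]);
        out[j] = left;
      }
    }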
| 219 |
| 220 // Macro generating a batch PredictorAdd function that adds the 32-bit |
| 221 // pixels IN to the residuals, using mod-256 arithmetic per 8-bit channel. |
| 222 #define GENERATE_PREDICTOR_1(X, IN) \ |
| 223 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ |
| 224 int num_pixels, uint32_t* out) { \ |
| 225 int i; \ |
| 226 for (i = 0; i + 4 <= num_pixels; i += 4) { \ |
| 227 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ |
| 228 const __m128i other = _mm_loadu_si128((const __m128i*)&(IN)); \ |
| 229 const __m128i res = _mm_add_epi8(src, other); \ |
| 230 _mm_storeu_si128((__m128i*)&out[i], res); \ |
| 231 } \ |
| 232 if (i != num_pixels) { \ |
| 233 VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ |
| 234 } \ |
| 235 } |
| 236 |
| 237 // Predictor2: Top. |
| 238 GENERATE_PREDICTOR_1(2, upper[i]) |
| 239 // Predictor3: Top-right. |
| 240 GENERATE_PREDICTOR_1(3, upper[i + 1]) |
| 241 // Predictor4: Top-left. |
| 242 GENERATE_PREDICTOR_1(4, upper[i - 1]) |
| 243 #undef GENERATE_PREDICTOR_1 |
| 244 |
| 245 // Because the averages are computed on integers (with truncation), values |
| 246 // cannot be accumulated in parallel for predictors 5 to 7. |
| 247 GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2) |
| 248 GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2) |
| 249 GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2) |
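Note: GENERATE_PREDICTOR_ADD comes from ./lossless_common.h; since the running left pixel is a serial dependency, it presumably expands to a plain per-pixel loop. A sketch of the expected shape (an assumption, the real macro may differ; VP8LAddPixels is the per-channel mod-256 add from lossless_common.h):

    /* Hypothetical expansion of
     * GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2). */
    static void PredictorAdd5_Sketch(const uint32_t* in, const uint32_t* upper,
                                     int num_pixels, uint32_t* out) {
      int x;
      for (x = 0; x < num_pixels; ++x) {
        const uint32_t pred = Predictor5_SSE2(out[x - 1], upper + x);
        out[x] = VP8LAddPixels(in[x], pred);
      }
    }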
| 250 |
| 251 #define GENERATE_PREDICTOR_2(X, IN) \ |
| 252 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ |
| 253 int num_pixels, uint32_t* out) { \ |
| 254 int i; \ |
| 255 for (i = 0; i + 4 <= num_pixels; i += 4) { \ |
| 256 const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \ |
| 257 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); \ |
| 258 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ |
| 259 __m128i avg, res; \ |
| 260 Average2_m128i(&T, &Tother, &avg); \ |
| 261 res = _mm_add_epi8(avg, src); \ |
| 262 _mm_storeu_si128((__m128i*)&out[i], res); \ |
| 263 } \ |
| 264 if (i != num_pixels) { \ |
| 265 VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ |
| 266 } \ |
| 267 } |
| 268 // Predictor8: average TL T. |
| 269 GENERATE_PREDICTOR_2(8, upper[i - 1]) |
| 270 // Predictor9: average T TR. |
| 271 GENERATE_PREDICTOR_2(9, upper[i + 1]) |
| 272 #undef GENERATE_PREDICTOR_2 |
| 273 |
| 274 // Predictor10: average of (average of (L,TL), average of (T, TR)). |
| 275 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, |
| 276 int num_pixels, uint32_t* out) { |
| 277 int i, j; |
| 278 __m128i L = _mm_cvtsi32_si128(out[-1]); |
| 279 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 280 __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
| 281 __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); |
| 282 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); |
| 283 const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]); |
| 284 __m128i avgTTR; |
| 285 Average2_m128i(&T, &TR, &avgTTR); |
| 286 for (j = 0; j < 4; ++j) { |
| 287 __m128i avgLTL, avg; |
| 288 Average2_m128i(&L, &TL, &avgLTL); |
| 289 Average2_m128i(&avgTTR, &avgLTL, &avg); |
| 290 L = _mm_add_epi8(avg, src); |
| 291 out[i + j] = _mm_cvtsi128_si32(L); |
| 292 // Shift the pre-computed values for the next iteration. |
| 293 avgTTR = _mm_srli_si128(avgTTR, 4); |
| 294 TL = _mm_srli_si128(TL, 4); |
| 295 src = _mm_srli_si128(src, 4); |
| 296 } |
| 297 } |
| 298 if (i != num_pixels) { |
| 299 VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i); |
| 300 } |
| 301 } |
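Note: only the avg2(T, TR) half is independent of the output row, so it is precomputed for four pixels at once; the L-dependent half is walked serially via the 4-byte shifts. A scalar model of the predictor itself (avg2_pix is an illustrative SWAR floor-average):

    #include <stdint.h>

    /* Per-byte truncated average of two packed pixels; works because
     * a + b == (a ^ b) + 2 * (a & b). */
    static uint32_t avg2_pix(uint32_t a, uint32_t b) {
      return (a & b) + (((a ^ b) & 0xfefefefeu) >> 1);
    }

    /* Predictor10 for one pixel. */
    static uint32_t Predictor10Model(uint32_t L, uint32_t TL,
                                     uint32_t T, uint32_t TR) {
      return avg2_pix(avg2_pix(L, TL), avg2_pix(T, TR));
    }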
| 302 |
| 303 // Predictor11: select. |
| 304 static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B, |
| 305 __m128i* const out) { |
| 306 // We can unpack with any value in the upper 32 bits, provided it is the same |
| 307 // on both operands (so that their sum of abs diffs is zero). Here we use *A. |
| 308 const __m128i A_lo = _mm_unpacklo_epi32(*A, *A); |
| 309 const __m128i B_lo = _mm_unpacklo_epi32(*B, *A); |
| 310 const __m128i A_hi = _mm_unpackhi_epi32(*A, *A); |
| 311 const __m128i B_hi = _mm_unpackhi_epi32(*B, *A); |
| 312 const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo); |
| 313 const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi); |
| 314 *out = _mm_packs_epi32(s_lo, s_hi); |
| 315 } |
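Note: _mm_sad_epu8 sums absolute byte differences over each 8-byte half; pairing every pixel with a copy of *A in the otherwise-unused 4 bytes makes those bytes cancel, so each half yields the SAD of exactly one pixel pair. The scalar quantity being computed (illustrative):

    #include <stdint.h>

    /* Sum of absolute byte differences between two packed pixels. */
    static int Sad32(uint32_t a, uint32_t b) {
      int sum = 0, k;
      for (k = 0; k < 4; ++k) {
        const int da = (int)((a >> (8 * k)) & 0xff);
        const int db = (int)((b >> (8 * k)) & 0xff);
        sum += (da >= db) ? (da - db) : (db - da);
      }
      return sum;
    }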
| 316 |
| 317 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, |
| 318 int num_pixels, uint32_t* out) { |
| 319 int i, j; |
| 320 __m128i L = _mm_cvtsi32_si128(out[-1]); |
| 321 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 322 __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); |
| 323 __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); |
| 324 __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
| 325 __m128i pa; |
| 326 GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL| |
| 327 for (j = 0; j < 4; ++j) { |
| 328 const __m128i L_lo = _mm_unpacklo_epi32(L, L); |
| 329 const __m128i TL_lo = _mm_unpacklo_epi32(TL, L); |
| 330 const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); // pb = sum |L-TL| |
| 331 const __m128i mask = _mm_cmpgt_epi32(pb, pa); |
| 332 const __m128i A = _mm_and_si128(mask, L); |
| 333 const __m128i B = _mm_andnot_si128(mask, T); |
| 334 const __m128i pred = _mm_or_si128(A, B); // pred = (pb > pa) ? L : T |
| 335 L = _mm_add_epi8(src, pred); |
| 336 out[i + j] = _mm_cvtsi128_si32(L); |
| 337 // Shift the pre-computed value for the next iteration. |
| 338 T = _mm_srli_si128(T, 4); |
| 339 TL = _mm_srli_si128(TL, 4); |
| 340 src = _mm_srli_si128(src, 4); |
| 341 pa = _mm_srli_si128(pa, 4); |
| 342 } |
| 343 } |
| 344 if (i != num_pixels) { |
| 345 VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i); |
| 346 } |
| 347 } |
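Note: the selection criterion in scalar form, reusing Sad32 from the sketch above (illustrative): if the vertical change |L - TL| exceeds the horizontal change |T - TL|, the neighborhood is assumed to vary vertically and the left neighbor is the better predictor.

    /* Predictor11 (select). */
    static uint32_t Predictor11Model(uint32_t L, uint32_t T, uint32_t TL) {
      const int pa = Sad32(T, TL);  /* horizontal change along the top row */
      const int pb = Sad32(L, TL);  /* vertical change down the left column */
      return (pb > pa) ? L : T;
    }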
| 348 |
| 349 // Predictor12: ClampedAddSubtractFull. |
| 350 #define DO_PRED12(DIFF, LANE, OUT) \ |
| 351 do { \ |
| 352 const __m128i all = _mm_add_epi16(L, (DIFF)); \ |
| 353 const __m128i alls = _mm_packus_epi16(all, all); \ |
| 354 const __m128i res = _mm_add_epi8(src, alls); \ |
| 355 out[i + (OUT)] = _mm_cvtsi128_si32(res); \ |
| 356 L = _mm_unpacklo_epi8(res, zero); \ |
| 357 /* Shift the pre-computed value for the next iteration. */ \ |
| 358 if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \ |
| 359 src = _mm_srli_si128(src, 4); \ |
| 360 } while (0) |
| 361 |
| 362 static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, |
| 363 int num_pixels, uint32_t* out) { |
| 364 int i; |
| 365 const __m128i zero = _mm_setzero_si128(); |
| 366 const __m128i L8 = _mm_cvtsi32_si128(out[-1]); |
| 367 __m128i L = _mm_unpacklo_epi8(L8, zero); |
| 368 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 369 // Load 4 pixels at a time. |
| 370 __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); |
| 371 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); |
| 372 const __m128i T_lo = _mm_unpacklo_epi8(T, zero); |
| 373 const __m128i T_hi = _mm_unpackhi_epi8(T, zero); |
| 374 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); |
| 375 const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); |
| 376 const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero); |
| 377 __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo); |
| 378 __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi); |
| 379 DO_PRED12(diff_lo, 0, 0); |
| 380 DO_PRED12(diff_lo, 1, 1); |
| 381 DO_PRED12(diff_hi, 0, 2); |
| 382 DO_PRED12(diff_hi, 1, 3); |
| 383 } |
| 384 if (i != num_pixels) { |
| 385 VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i); |
| 386 } |
| 387 } |
| 388 #undef DO_PRED12 |
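Note: the per-channel arithmetic of Predictor12 in scalar form; the SSE2 path gets the [0, 255] clamping for free from the saturating _mm_packus_epi16 (illustrative):

    #include <stdint.h>

    /* ClampedAddSubtractFull: per channel, clamp(L + T - TL) to [0, 255]. */
    static uint32_t Predictor12Model(uint32_t L, uint32_t T, uint32_t TL) {
      uint32_t out = 0;
      int k;
      for (k = 0; k < 4; ++k) {
        int v = (int)((L >> (8 * k)) & 0xff) +
                (int)((T >> (8 * k)) & 0xff) -
                (int)((TL >> (8 * k)) & 0xff);
        if (v < 0) v = 0;
        if (v > 255) v = 255;
        out |= (uint32_t)v << (8 * k);
      }
      return out;
    }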
| 389 |
| 390 // Because the averages are computed on integers (with truncation), values |
| 391 // cannot be accumulated in parallel for predictor 13. |
| 392 GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2) |
| 393 |
| 155 //------------------------------------------------------------------------------ | 394 //------------------------------------------------------------------------------ |
| 156 // Subtract-Green Transform | 395 // Subtract-Green Transform |
| 157 | 396 |
| 158 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { | 397 static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels, |
| 159 int i; | 398 uint32_t* dst) { |
| 160 for (i = 0; i + 4 <= num_pixels; i += 4) { | 399 int i; |
| 161 const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb | 400 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 401 const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb |
| 162 const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g | 402 const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g |
| 163 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); | 403 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); |
| 164 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g | 404 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g |
| 165 const __m128i out = _mm_add_epi8(in, C); | 405 const __m128i out = _mm_add_epi8(in, C); |
| 166 _mm_storeu_si128((__m128i*)&argb_data[i], out); | 406 _mm_storeu_si128((__m128i*)&dst[i], out); |
| 167 } | 407 } |
| 168 // fallthrough and finish off with plain-C | 408 // fallthrough and finish off with plain-C |
| 169 VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i); | 409 if (i != num_pixels) { |
| 410 VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i); |
| 411 } |
| 170 } | 412 } |
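Note: the scalar equivalent of the transform, adding green back into red and blue mod 256 (illustrative):

    #include <stdint.h>

    /* Inverse subtract-green for one ARGB pixel. */
    static uint32_t AddGreenModel(uint32_t argb) {
      const uint32_t green = (argb >> 8) & 0xffu;
      uint32_t red_blue = argb & 0x00ff00ffu;
      red_blue += (green << 16) | green;  /* add g to both r and b */
      red_blue &= 0x00ff00ffu;            /* keep each channel mod 256 */
      return (argb & 0xff00ff00u) | red_blue;
    }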
| 171 | 413 |
| 172 //------------------------------------------------------------------------------ | 414 //------------------------------------------------------------------------------ |
| 173 // Color Transform | 415 // Color Transform |
| 174 | 416 |
| 175 static void TransformColorInverse(const VP8LMultipliers* const m, | 417 static void TransformColorInverse(const VP8LMultipliers* const m, |
| 176 uint32_t* argb_data, int num_pixels) { | 418 const uint32_t* const src, int num_pixels, |
| 177 // sign-extended multiplying constants, pre-shifted by 5. | 419 uint32_t* dst) { |
| 420 // sign-extended multiplying constants, pre-shifted by 5. |
| 178 #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend | 421 #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend |
| 179 const __m128i mults_rb = _mm_set_epi16( | 422 const __m128i mults_rb = _mm_set_epi16( |
| 180 CST(green_to_red_), CST(green_to_blue_), | 423 CST(green_to_red_), CST(green_to_blue_), |
| 181 CST(green_to_red_), CST(green_to_blue_), | 424 CST(green_to_red_), CST(green_to_blue_), |
| 182 CST(green_to_red_), CST(green_to_blue_), | 425 CST(green_to_red_), CST(green_to_blue_), |
| 183 CST(green_to_red_), CST(green_to_blue_)); | 426 CST(green_to_red_), CST(green_to_blue_)); |
| 184 const __m128i mults_b2 = _mm_set_epi16( | 427 const __m128i mults_b2 = _mm_set_epi16( |
| 185 CST(red_to_blue_), 0, CST(red_to_blue_), 0, | 428 CST(red_to_blue_), 0, CST(red_to_blue_), 0, |
| 186 CST(red_to_blue_), 0, CST(red_to_blue_), 0); | 429 CST(red_to_blue_), 0, CST(red_to_blue_), 0); |
| 187 #undef CST | 430 #undef CST |
| 188 const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks | 431 const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks |
| 189 int i; | 432 int i; |
| 190 for (i = 0; i + 4 <= num_pixels; i += 4) { | 433 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 191 const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb | 434 const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb |
| 192 const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0 | 435 const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0 |
| 193 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); | 436 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); |
| 194 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0 | 437 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0 |
| 195 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1 | 438 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1 |
| 196 const __m128i E = _mm_add_epi8(in, D); // x r' x b' | 439 const __m128i E = _mm_add_epi8(in, D); // x r' x b' |
| 197 const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0 | 440 const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0 |
| 198 const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0 | 441 const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0 |
| 199 const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0 | 442 const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0 |
| 200 const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0 | 443 const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0 |
| 201 const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b'' | 444 const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b'' |
| 202 const __m128i out = _mm_or_si128(J, A); | 445 const __m128i out = _mm_or_si128(J, A); |
| 203 _mm_storeu_si128((__m128i*)&argb_data[i], out); | 446 _mm_storeu_si128((__m128i*)&dst[i], out); |
| 204 } | 447 } |
| 205 // Fall-back to C-version for left-overs. | 448 // Fall-back to C-version for left-overs. |
| 206 VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i); | 449 if (i != num_pixels) { |
| 450 VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); |
| 451 } |
| 207 } | 452 } |
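Note: per the lossless bitstream, each delta is a signed 8-bit by signed 8-bit multiply, arithmetic-shifted right by 5; the CST pre-shift positions the multiplier so that _mm_mulhi_epi16 lands the product in the same bits. A scalar sketch (illustrative; assumes arithmetic right shift of negative values):

    #include <stdint.h>

    static int ColorDelta(uint8_t mul, uint8_t ch) {
      return ((int8_t)mul * (int8_t)ch) >> 5;
    }

    /* Inverse color transform for one ARGB pixel. */
    static uint32_t TransformColorInverseModel(uint32_t argb, uint8_t g2r,
                                               uint8_t g2b, uint8_t r2b) {
      const uint8_t g = (uint8_t)(argb >> 8);
      uint8_t r = (uint8_t)(argb >> 16);
      uint8_t b = (uint8_t)argb;
      r = (uint8_t)(r + ColorDelta(g2r, g));
      b = (uint8_t)(b + ColorDelta(g2b, g));
      b = (uint8_t)(b + ColorDelta(r2b, r));  /* uses the updated red */
      return (argb & 0xff00ff00u) | ((uint32_t)r << 16) | b;
    }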
| 208 | 453 |
| 209 //------------------------------------------------------------------------------ | 454 //------------------------------------------------------------------------------ |
| 210 // Color-space conversion functions | 455 // Color-space conversion functions |
| 211 | 456 |
| 457 static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels, |
| 458 uint8_t* dst) { |
| 459 const __m128i* in = (const __m128i*)src; |
| 460 __m128i* out = (__m128i*)dst; |
| 461 |
| 462 while (num_pixels >= 32) { |
| 463 // Load the BGRA buffers. |
| 464 __m128i in0 = _mm_loadu_si128(in + 0); |
| 465 __m128i in1 = _mm_loadu_si128(in + 1); |
| 466 __m128i in2 = _mm_loadu_si128(in + 2); |
| 467 __m128i in3 = _mm_loadu_si128(in + 3); |
| 468 __m128i in4 = _mm_loadu_si128(in + 4); |
| 469 __m128i in5 = _mm_loadu_si128(in + 5); |
| 470 __m128i in6 = _mm_loadu_si128(in + 6); |
| 471 __m128i in7 = _mm_loadu_si128(in + 7); |
| 472 VP8L32bToPlanar(&in0, &in1, &in2, &in3); |
| 473 VP8L32bToPlanar(&in4, &in5, &in6, &in7); |
| 474 // At this point, in1/in5 contain red only, in2/in6 green only ... |
| 475 // Pack the colors in 24b RGB. |
| 476 VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7); |
| 477 _mm_storeu_si128(out + 0, in1); |
| 478 _mm_storeu_si128(out + 1, in5); |
| 479 _mm_storeu_si128(out + 2, in2); |
| 480 _mm_storeu_si128(out + 3, in6); |
| 481 _mm_storeu_si128(out + 4, in3); |
| 482 _mm_storeu_si128(out + 5, in7); |
| 483 in += 8; |
| 484 out += 6; |
| 485 num_pixels -= 32; |
| 486 } |
| 487 // left-overs |
| 488 if (num_pixels > 0) { |
| 489 VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out); |
| 490 } |
| 491 } |
| 492 |
| 212 static void ConvertBGRAToRGBA(const uint32_t* src, | 493 static void ConvertBGRAToRGBA(const uint32_t* src, |
| 213 int num_pixels, uint8_t* dst) { | 494 int num_pixels, uint8_t* dst) { |
| 214 const __m128i* in = (const __m128i*)src; | 495 const __m128i* in = (const __m128i*)src; |
| 215 __m128i* out = (__m128i*)dst; | 496 __m128i* out = (__m128i*)dst; |
| 216 while (num_pixels >= 8) { | 497 while (num_pixels >= 8) { |
| 217 const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 | 498 const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 |
| 218 const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7 | 499 const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7 |
| 219 const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4... | 500 const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4... |
| 220 const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6... | 501 const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6... |
| 221 const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6... | 502 const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6... |
| 222 const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7... | 503 const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7... |
| 223 const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7 | 504 const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7 |
| 224 const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7 | 505 const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7 |
| 225 const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7 | 506 const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7 |
| 226 const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7 | 507 const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7 |
| 227 const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7 | 508 const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7 |
| 228 const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7 | 509 const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7 |
| 229 const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1... | 510 const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1... |
| 230 const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5... | 511 const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5... |
| 231 _mm_storeu_si128(out++, rgba0); | 512 _mm_storeu_si128(out++, rgba0); |
| 232 _mm_storeu_si128(out++, rgba4); | 513 _mm_storeu_si128(out++, rgba4); |
| 233 num_pixels -= 8; | 514 num_pixels -= 8; |
| 234 } | 515 } |
| 235 // left-overs | 516 // left-overs |
| 236 VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out); | 517 if (num_pixels > 0) { |
| 518 VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out); |
| 519 } |
| 237 } | 520 } |
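Note: the four unpack levels amount to a byte transpose; the scalar effect is a per-pixel channel reorder (illustrative):

    #include <stdint.h>

    /* Each uint32_t holds A,R,G,B in its high-to-low bytes (B,G,R,A in
     * little-endian memory order, hence "BGRA"); emit bytes as R,G,B,A. */
    static void BgraToRgbaModel(const uint32_t* src, int num_pixels,
                                uint8_t* dst) {
      int i;
      for (i = 0; i < num_pixels; ++i) {
        const uint32_t argb = src[i];
        *dst++ = (uint8_t)(argb >> 16);  /* R */
        *dst++ = (uint8_t)(argb >> 8);   /* G */
        *dst++ = (uint8_t)(argb >> 0);   /* B */
        *dst++ = (uint8_t)(argb >> 24);  /* A */
      }
    }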
| 238 | 521 |
| 239 static void ConvertBGRAToRGBA4444(const uint32_t* src, | 522 static void ConvertBGRAToRGBA4444(const uint32_t* src, |
| 240 int num_pixels, uint8_t* dst) { | 523 int num_pixels, uint8_t* dst) { |
| 241 const __m128i mask_0x0f = _mm_set1_epi8(0x0f); | 524 const __m128i mask_0x0f = _mm_set1_epi8(0x0f); |
| 242 const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); | 525 const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); |
| 243 const __m128i* in = (const __m128i*)src; | 526 const __m128i* in = (const __m128i*)src; |
| 244 __m128i* out = (__m128i*)dst; | 527 __m128i* out = (__m128i*)dst; |
| 245 while (num_pixels >= 8) { | 528 while (num_pixels >= 8) { |
| 246 const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 | 529 const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 |
| (...skipping 13 matching lines...) |
| 260 const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0 | 543 const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0 |
| 261 #ifdef WEBP_SWAP_16BIT_CSP | 544 #ifdef WEBP_SWAP_16BIT_CSP |
| 262 const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7 | 545 const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7 |
| 263 #else | 546 #else |
| 264 const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7 | 547 const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7 |
| 265 #endif | 548 #endif |
| 266 _mm_storeu_si128(out++, rgba); | 549 _mm_storeu_si128(out++, rgba); |
| 267 num_pixels -= 8; | 550 num_pixels -= 8; |
| 268 } | 551 } |
| 269 // left-overs | 552 // left-overs |
| 270 VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out); | 553 if (num_pixels > 0) { |
| 554 VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out); |
| 555 } |
| 271 } | 556 } |
| 272 | 557 |
| 273 static void ConvertBGRAToRGB565(const uint32_t* src, | 558 static void ConvertBGRAToRGB565(const uint32_t* src, |
| 274 int num_pixels, uint8_t* dst) { | 559 int num_pixels, uint8_t* dst) { |
| 275 const __m128i mask_0xe0 = _mm_set1_epi8(0xe0); | 560 const __m128i mask_0xe0 = _mm_set1_epi8(0xe0); |
| 276 const __m128i mask_0xf8 = _mm_set1_epi8(0xf8); | 561 const __m128i mask_0xf8 = _mm_set1_epi8(0xf8); |
| 277 const __m128i mask_0x07 = _mm_set1_epi8(0x07); | 562 const __m128i mask_0x07 = _mm_set1_epi8(0x07); |
| 278 const __m128i* in = (const __m128i*)src; | 563 const __m128i* in = (const __m128i*)src; |
| 279 __m128i* out = (__m128i*)dst; | 564 __m128i* out = (__m128i*)dst; |
| 280 while (num_pixels >= 8) { | 565 while (num_pixels >= 8) { |
| (...skipping 18 matching lines...) |
| 299 const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx | 584 const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx |
| 300 #ifdef WEBP_SWAP_16BIT_CSP | 585 #ifdef WEBP_SWAP_16BIT_CSP |
| 301 const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7 | 586 const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7 |
| 302 #else | 587 #else |
| 303 const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7 | 588 const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7 |
| 304 #endif | 589 #endif |
| 305 _mm_storeu_si128(out++, rgba); | 590 _mm_storeu_si128(out++, rgba); |
| 306 num_pixels -= 8; | 591 num_pixels -= 8; |
| 307 } | 592 } |
| 308 // left-overs | 593 // left-overs |
| 309 VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out); | 594 if (num_pixels > 0) { |
| 595 VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out); |
| 596 } |
| 310 } | 597 } |
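Note: RGB565 keeps 5-6-5 bits and splits green across the two output bytes. The per-pixel packing in scalar form (illustrative; shown for the default byte order, without WEBP_SWAP_16BIT_CSP):

    #include <stdint.h>

    /* Pack one ARGB pixel into RGB565: rrrrrggg gggbbbbb. */
    static void Rgb565Model(uint32_t argb, uint8_t out[2]) {
      const uint8_t r = (uint8_t)(argb >> 16);
      const uint8_t g = (uint8_t)(argb >> 8);
      const uint8_t b = (uint8_t)argb;
      out[0] = (uint8_t)((r & 0xf8) | (g >> 5));         /* r[7:3] g[7:5] */
      out[1] = (uint8_t)(((g << 3) & 0xe0) | (b >> 3));  /* g[4:2] b[7:3] */
    }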
| 311 | 598 |
| 312 static void ConvertBGRAToBGR(const uint32_t* src, | 599 static void ConvertBGRAToBGR(const uint32_t* src, |
| 313 int num_pixels, uint8_t* dst) { | 600 int num_pixels, uint8_t* dst) { |
| 314 const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff); | 601 const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff); |
| 315 const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0); | 602 const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0); |
| 316 const __m128i* in = (const __m128i*)src; | 603 const __m128i* in = (const __m128i*)src; |
| 317 const uint8_t* const end = dst + num_pixels * 3; | 604 const uint8_t* const end = dst + num_pixels * 3; |
| 318 // the last storel_epi64 below writes 8 bytes starting at offset 18 | 605 // the last storel_epi64 below writes 8 bytes starting at offset 18 |
| 319 while (dst + 26 <= end) { | 606 while (dst + 26 <= end) { |
| (...skipping 10 matching lines...) Expand all Loading... |
| 330 const __m128i c2 = _mm_srli_si128(c0, 8); | 617 const __m128i c2 = _mm_srli_si128(c0, 8); |
| 331 const __m128i c6 = _mm_srli_si128(c4, 8); | 618 const __m128i c6 = _mm_srli_si128(c4, 8); |
| 332 _mm_storel_epi64((__m128i*)(dst + 0), c0); | 619 _mm_storel_epi64((__m128i*)(dst + 0), c0); |
| 333 _mm_storel_epi64((__m128i*)(dst + 6), c2); | 620 _mm_storel_epi64((__m128i*)(dst + 6), c2); |
| 334 _mm_storel_epi64((__m128i*)(dst + 12), c4); | 621 _mm_storel_epi64((__m128i*)(dst + 12), c4); |
| 335 _mm_storel_epi64((__m128i*)(dst + 18), c6); | 622 _mm_storel_epi64((__m128i*)(dst + 18), c6); |
| 336 dst += 24; | 623 dst += 24; |
| 337 num_pixels -= 8; | 624 num_pixels -= 8; |
| 338 } | 625 } |
| 339 // left-overs | 626 // left-overs |
| 340 VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst); | 627 if (num_pixels > 0) { |
| 628 VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst); |
| 629 } |
| 341 } | 630 } |
| 342 | 631 |
| 343 //------------------------------------------------------------------------------ | 632 //------------------------------------------------------------------------------ |
| 344 // Entry point | 633 // Entry point |
| 345 | 634 |
| 346 extern void VP8LDspInitSSE2(void); | 635 extern void VP8LDspInitSSE2(void); |
| 347 | 636 |
| 348 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) { | 637 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) { |
| 349 VP8LPredictors[5] = Predictor5; | 638 VP8LPredictors[5] = Predictor5_SSE2; |
| 350 VP8LPredictors[6] = Predictor6; | 639 VP8LPredictors[6] = Predictor6_SSE2; |
| 351 VP8LPredictors[7] = Predictor7; | 640 VP8LPredictors[7] = Predictor7_SSE2; |
| 352 VP8LPredictors[8] = Predictor8; | 641 VP8LPredictors[8] = Predictor8_SSE2; |
| 353 VP8LPredictors[9] = Predictor9; | 642 VP8LPredictors[9] = Predictor9_SSE2; |
| 354 VP8LPredictors[10] = Predictor10; | 643 VP8LPredictors[10] = Predictor10_SSE2; |
| 355 VP8LPredictors[11] = Predictor11; | 644 VP8LPredictors[11] = Predictor11_SSE2; |
| 356 VP8LPredictors[12] = Predictor12; | 645 VP8LPredictors[12] = Predictor12_SSE2; |
| 357 VP8LPredictors[13] = Predictor13; | 646 VP8LPredictors[13] = Predictor13_SSE2; |
| 647 |
| 648 VP8LPredictorsAdd[0] = PredictorAdd0_SSE2; |
| 649 VP8LPredictorsAdd[1] = PredictorAdd1_SSE2; |
| 650 VP8LPredictorsAdd[2] = PredictorAdd2_SSE2; |
| 651 VP8LPredictorsAdd[3] = PredictorAdd3_SSE2; |
| 652 VP8LPredictorsAdd[4] = PredictorAdd4_SSE2; |
| 653 VP8LPredictorsAdd[5] = PredictorAdd5_SSE2; |
| 654 VP8LPredictorsAdd[6] = PredictorAdd6_SSE2; |
| 655 VP8LPredictorsAdd[7] = PredictorAdd7_SSE2; |
| 656 VP8LPredictorsAdd[8] = PredictorAdd8_SSE2; |
| 657 VP8LPredictorsAdd[9] = PredictorAdd9_SSE2; |
| 658 VP8LPredictorsAdd[10] = PredictorAdd10_SSE2; |
| 659 VP8LPredictorsAdd[11] = PredictorAdd11_SSE2; |
| 660 VP8LPredictorsAdd[12] = PredictorAdd12_SSE2; |
| 661 VP8LPredictorsAdd[13] = PredictorAdd13_SSE2; |
| 358 | 662 |
| 359 VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; | 663 VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; |
| 360 VP8LTransformColorInverse = TransformColorInverse; | 664 VP8LTransformColorInverse = TransformColorInverse; |
| 361 | 665 |
| 666 VP8LConvertBGRAToRGB = ConvertBGRAToRGB; |
| 362 VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; | 667 VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; |
| 363 VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444; | 668 VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444; |
| 364 VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565; | 669 VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565; |
| 365 VP8LConvertBGRAToBGR = ConvertBGRAToBGR; | 670 VP8LConvertBGRAToBGR = ConvertBGRAToBGR; |
| 366 } | 671 } |
| 367 | 672 |
| 368 #else // !WEBP_USE_SSE2 | 673 #else // !WEBP_USE_SSE2 |
| 369 | 674 |
| 370 WEBP_DSP_INIT_STUB(VP8LDspInitSSE2) | 675 WEBP_DSP_INIT_STUB(VP8LDspInitSSE2) |
| 371 | 676 |
| 372 #endif // WEBP_USE_SSE2 | 677 #endif // WEBP_USE_SSE2 |