// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)

#include "./dsp.h"

#include <assert.h>

#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
#include "./lossless.h"

//------------------------------------------------------------------------------
// Predictor Transform

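// Per-channel Clip255(c0 + c1 - c2): each byte is widened to 16 bits, the
// add/subtract is done there, and the unsigned saturation of
// _mm_packus_epi16 provides the clamp back to [0, 255].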
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  const uint32_t output = _mm_cvtsi128_si32(b);
  return output;
}

static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i avg = _mm_add_epi16(C1, C0);
  const __m128i A0 = _mm_srli_epi16(avg, 1);
  const __m128i A1 = _mm_sub_epi16(A0, B0);
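  // (A0 - B0) can be negative. Adding 1 (via the all-ones compare mask) when
  // B0 > A0 before the arithmetic shift makes the division by 2 truncate
  // toward zero, as plain integer division would.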
  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  const __m128i A3 = _mm_srai_epi16(A2, 1);
  const __m128i A4 = _mm_add_epi16(A0, A3);
  const __m128i A5 = _mm_packus_epi16(A4, A4);
  const uint32_t output = _mm_cvtsi128_si32(A5);
  return output;
}

static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128(a);
  const __m128i B0 = _mm_cvtsi32_si128(b);
  const __m128i C0 = _mm_cvtsi32_si128(c);
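  // |x - y| per byte: with unsigned saturating subtraction one of the two
  // differences is zero, so OR-ing them gives the absolute difference.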
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
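  // diff holds |b - c| - |a - c| per channel. Since |(a + b - c) - a| equals
  // |b - c| and |(a + b - c) - b| equals |a - c|, summing diff over the four
  // channels picks whichever of 'a' and 'b' is closer to the gradient
  // estimate a + b - c.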
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}

static WEBP_INLINE __m128i Average2_128i(uint32_t a0, uint32_t a1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(A1, A0);
  const __m128i avg = _mm_srli_epi16(sum, 1);
  return avg;
}

static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
  const __m128i avg = Average2_128i(a0, a1);
  const __m128i A2 = _mm_packus_epi16(avg, avg);
  const uint32_t output = _mm_cvtsi128_si32(A2);
  return output;
}

static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_128i(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  const uint32_t output = _mm_cvtsi128_si32(A2);
  return output;
}

static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
                                     uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_128i(a0, a1);
  const __m128i avg2 = Average2_128i(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  const uint32_t output = _mm_cvtsi128_si32(A0);
  return output;
}

static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average3(left, top[0], top[1]);
  return pred;
}
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(left, top[-1]);
  return pred;
}
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(left, top[0]);
  return pred;
}
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(top[-1], top[0]);
  (void)left;
  return pred;
}
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(top[0], top[1]);
  (void)left;
  return pred;
}
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
  return pred;
}
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Select(top[0], left, top[-1]);
  return pred;
}
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
  return pred;
}
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
  return pred;
}

//------------------------------------------------------------------------------
// Subtract-Green Transform

static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const __m128i mask = _mm_set1_epi32(0x0000ff00);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
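    // green is now replicated into both the red and blue byte positions, so
    // a single byte-wise subtraction removes it from both channels at once.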
    const __m128i out = _mm_sub_epi8(in, in_0g0g);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}

static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const __m128i mask = _mm_set1_epi32(0x0000ff00);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
    const __m128i out = _mm_add_epi8(in, in_0g0g);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
}

//------------------------------------------------------------------------------
// Color Transform

static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
                                               __m128i color) {
  // We simulate signed 8-bit multiplication as:
  // * Left shift the two (8-bit) numbers by 8 bits,
  // * Perform a 16-bit signed multiplication and retain the higher 16-bits.
  const __m128i color_pred_shifted = _mm_slli_epi32(color_pred, 8);
  const __m128i color_shifted = _mm_slli_epi32(color, 8);
  // Note: This performs multiplication on 8 packed 16-bit numbers, 4 of which
  // happen to be zeroes.
  const __m128i signed_mult =
      _mm_mulhi_epi16(color_pred_shifted, color_shifted);
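  // The product sits in the low 16 bits of each 32-bit lane. A logical shift
  // (rather than an arithmetic one) is sufficient here because the callers
  // keep only the low 8 bits of the delta, so the missing sign extension
  // (a multiple of 256 after the shift) has no effect.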
  return _mm_srli_epi32(signed_mult, 5);
}

static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
                                       uint32_t* argb_data,
                                       int num_pixels) {
  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);

  int i;

  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
    const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
    const __m128i ag = _mm_and_si128(in, alpha_green_mask);  // alpha, green
    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
    const __m128i b = in;

    const __m128i r_delta = ColorTransformDelta(g_to_r, g);  // red
    const __m128i r_new =
        _mm_and_si128(_mm_sub_epi32(r, r_delta), lower_8bit_mask);
    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);

    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);  // blue
    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r);
    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
    const __m128i b_new =
        _mm_and_si128(_mm_sub_epi32(b, b_delta), lower_8bit_mask);

    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }

  // Fall-back to C-version for left-overs.
  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}

static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
                                              uint32_t* argb_data,
                                              int num_pixels) {
  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);

  int i;

  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
    const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
    const __m128i ag = _mm_and_si128(in, alpha_green_mask);  // alpha, green
    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
    const __m128i b = in;

    const __m128i r_delta = ColorTransformDelta(g_to_r, g);  // red
    const __m128i r_new =
        _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);

    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);  // blue
    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
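    // Note: unlike the forward transform, which predicts blue from the
    // original red, the inverse uses the already-restored red value (r_new).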
    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
    const __m128i b_new =
        _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);

    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }

  // Fall-back to C-version for left-overs.
  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}

//------------------------------------------------------------------------------
// Color-space conversion functions

static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
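  // Process 8 pixels per iteration: three rounds of byte interleaving
  // de-interleave the B, G, R and A planes, then R and B are swapped and the
  // pixels re-interleaved in RGBA order.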
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);   // r0g0r1g1 ... r6g6r7g7
    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);   // b0a0b1a1 ... b6a6b7a7
    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
    _mm_storeu_si128(out++, rgba0);
    _mm_storeu_si128(out++, rgba4);
    num_pixels -= 8;
  }
  // left-overs
  VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}

static void ConvertBGRAToRGBA4444(const uint32_t* src,
                                  int num_pixels, uint8_t* dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);         // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);  // -r0|-r1|...|-b6|-b7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
#ifdef WEBP_SWAP_16BIT_CSP
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}

static void ConvertBGRAToRGB565(const uint32_t* src,
                                int num_pixels, uint8_t* dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);      // -r0..-r7|-b0..-b7
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);          // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);       // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);        // bg0...bg7|xx
#ifdef WEBP_SWAP_16BIT_CSP
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);   // rggb0...rggb7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);   // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}

static void ConvertBGRAToBGR(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  const uint8_t* const end = dst + num_pixels * 3;
  // the last storel_epi64 below writes 8 bytes starting at offset 18
  while (dst + 26 <= end) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);   // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);   // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);   // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);   // 0|bgr0|0|bgr0
    const __m128i b0h = _mm_srli_epi64(a0h, 8);         // 000b|gr00|000b|gr00
    const __m128i b4h = _mm_srli_epi64(a4h, 8);         // 000b|gr00|000b|gr00
    const __m128i c0 = _mm_or_si128(a0l, b0h);          // bgrbgr00|bgrbgr00
    const __m128i c4 = _mm_or_si128(a4l, b4h);          // bgrbgr00|bgrbgr00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
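    // Each 8-byte store below carries 6 payload bytes; the 2 extra (zero)
    // bytes are overwritten by the next store, the next iteration, or the
    // plain-C tail. The 'dst + 26 <= end' guard keeps the final store in
    // bounds.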
    _mm_storel_epi64((__m128i*)(dst + 0), c0);
    _mm_storel_epi64((__m128i*)(dst + 6), c2);
    _mm_storel_epi64((__m128i*)(dst + 12), c4);
    _mm_storel_epi64((__m128i*)(dst + 18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
}

//------------------------------------------------------------------------------

#define LINE_SIZE 16    // 8 or 16
static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
                      int size) {
  int i;
  assert(size % LINE_SIZE == 0);
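  // HistogramAdd() below always passes NUM_LITERAL_CODES, which is a multiple
  // of both supported LINE_SIZE values, so the assert holds.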
  for (i = 0; i < size; i += LINE_SIZE) {
    const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i + 0]);
    const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
    const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i + 8]);
    const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]);
#endif
    const __m128i b0 = _mm_loadu_si128((__m128i*)&b[i + 0]);
    const __m128i b1 = _mm_loadu_si128((__m128i*)&b[i + 4]);
#if (LINE_SIZE == 16)
    const __m128i b2 = _mm_loadu_si128((__m128i*)&b[i + 8]);
    const __m128i b3 = _mm_loadu_si128((__m128i*)&b[i + 12]);
#endif
    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
    _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
  }
}

static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
  int i;
  assert(size % LINE_SIZE == 0);
  for (i = 0; i < size; i += LINE_SIZE) {
    const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i + 0]);
    const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
    const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i + 8]);
    const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]);
#endif
    const __m128i b0 = _mm_loadu_si128((__m128i*)&out[i + 0]);
    const __m128i b1 = _mm_loadu_si128((__m128i*)&out[i + 4]);
#if (LINE_SIZE == 16)
    const __m128i b2 = _mm_loadu_si128((__m128i*)&out[i + 8]);
    const __m128i b3 = _mm_loadu_si128((__m128i*)&out[i + 12]);
#endif
    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
    _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
  }
}
#undef LINE_SIZE

// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
// that's ok since the histogram values are less than 1<<28 (max picture size).
static void HistogramAdd(const VP8LHistogram* const a,
                         const VP8LHistogram* const b,
                         VP8LHistogram* const out) {
  int i;
  const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
  assert(a->palette_code_bits_ == b->palette_code_bits_);
  if (b != out) {
    AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
    AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
    AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
    AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
  } else {
    AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
    AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
    AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
    AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
  }
  for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
    out->literal_[i] = a->literal_[i] + b->literal_[i];
  }
  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
    out->distance_[i] = a->distance_[i] + b->distance_[i];
  }
}

#endif  // WEBP_USE_SSE2

//------------------------------------------------------------------------------

extern void VP8LDspInitSSE2(void);

void VP8LDspInitSSE2(void) {
#if defined(WEBP_USE_SSE2)
  VP8LPredictors[5] = Predictor5;
  VP8LPredictors[6] = Predictor6;
  VP8LPredictors[7] = Predictor7;
  VP8LPredictors[8] = Predictor8;
  VP8LPredictors[9] = Predictor9;
  VP8LPredictors[10] = Predictor10;
  VP8LPredictors[11] = Predictor11;
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;

  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;

  VP8LTransformColor = TransformColor;
  VP8LTransformColorInverse = TransformColorInverse;

  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;

  VP8LHistogramAdd = HistogramAdd;
#endif  // WEBP_USE_SSE2
}

//------------------------------------------------------------------------------