| OLD | NEW |
| 1 // Copyright 2014 Google Inc. All Rights Reserved. | 1 // Copyright 2014 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // YUV->RGB conversion functions | 10 // YUV->RGB conversion functions |
| 11 // | 11 // |
| 12 // Author: Skal (pascal.massimino@gmail.com) | 12 // Author: Skal (pascal.massimino@gmail.com) |
| 13 | 13 |
| 14 #include "./yuv.h" | 14 #include "./yuv.h" |
| 15 | 15 |
| 16 #if defined(WEBP_USE_SSE2) | 16 #if defined(WEBP_USE_SSE2) |
| 17 | 17 |
| 18 #include "./common_sse2.h" |
| 19 #include <stdlib.h> |
| 18 #include <emmintrin.h> | 20 #include <emmintrin.h> |
| 19 | 21 |
| 20 //----------------------------------------------------------------------------- | 22 //----------------------------------------------------------------------------- |
| 21 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. | 23 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. |
| 22 | 24 |
| 23 // These constants are 14b fixed-point version of ITU-R BT.601 constants. | 25 // These constants are 14b fixed-point version of ITU-R BT.601 constants. |
| 24 // R = (19077 * y + 26149 * v - 14234) >> 6 | 26 // R = (19077 * y + 26149 * v - 14234) >> 6 |
| 25 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 | 27 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 |
| 26 // B = (19077 * y + 33050 * u - 17685) >> 6 | 28 // B = (19077 * y + 33050 * u - 17685) >> 6 |
| 27 static void ConvertYUV444ToRGB(const __m128i* const Y0, | 29 static void ConvertYUV444ToRGB(const __m128i* const Y0, |
| (...skipping 120 matching lines...) |
| 148 const __m128i rg = _mm_or_si128(r1, g1); | 150 const __m128i rg = _mm_or_si128(r1, g1); |
| 149 const __m128i gb = _mm_or_si128(g2, b1); | 151 const __m128i gb = _mm_or_si128(g2, b1); |
| 150 #if !defined(WEBP_SWAP_16BIT_CSP) | 152 #if !defined(WEBP_SWAP_16BIT_CSP) |
| 151 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb); | 153 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb); |
| 152 #else | 154 #else |
| 153 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg); | 155 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg); |
| 154 #endif | 156 #endif |
| 155 _mm_storeu_si128((__m128i*)dst, rgb565); | 157 _mm_storeu_si128((__m128i*)dst, rgb565); |
| 156 } | 158 } |
| 157 | 159 |
| 158 // Function used several times in PlanarTo24b. | |
| 159 // It samples the in buffer as follows: one every two unsigned char is stored | |
| 160 // at the beginning of the buffer, while the other half is stored at the end. | |
| 161 static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/, | |
| 162 __m128i* const out /*out[6]*/) { | |
| 163 const __m128i v_mask = _mm_set1_epi16(0x00ff); | |
| 164 | |
| 165 // Take the low byte of each 16b lane (the even-indexed bytes). | |
| 166 out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask), | |
| 167 _mm_and_si128(in[1], v_mask)); | |
| 168 out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask), | |
| 169 _mm_and_si128(in[3], v_mask)); | |
| 170 out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask), | |
| 171 _mm_and_si128(in[5], v_mask)); | |
| 172 // Take the high byte of each 16b lane (the odd-indexed bytes). | |
| 173 out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8)); | |
| 174 out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8)); | |
| 175 out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8)); | |
| 176 } | |
| 177 | |
| 178 // Pack the planar buffers | 160 // Pack the planar buffers |
| 179 // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... | 161 // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... |
| 180 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... | 162 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... |
| 181 static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) { | 163 static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1, |
| 164 __m128i* const in2, __m128i* const in3, |
| 165 __m128i* const in4, __m128i* const in5, |
| 166 uint8_t* const rgb) { |
| 182 // The input is 6 registers of sixteen 8b but for the sake of explanation, | 167 // The input is 6 registers of sixteen 8b but for the sake of explanation, |
| 183 // let's take 6 registers of four 8b values. | 168 // let's take 6 registers of four 8b values. |
| 184 // To pack, we will keep taking every other 8b integer and moving it | 169 // To pack, we will keep taking every other 8b integer and moving it |
| 185 // around as follows: | 170 // around as follows: |
| 186 // Input: | 171 // Input: |
| 187 // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7 | 172 // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7 |
| 188 // Split the 6 registers in two sets of 3 registers: the first set as the even | 173 // Split the 6 registers in two sets of 3 registers: the first set as the even |
| 189 // 8b bytes, the second the odd ones: | 174 // 8b bytes, the second the odd ones: |
| 190 // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7 | 175 // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7 |
| 191 // Repeat the same permutations twice more: | 176 // Repeat the same permutations twice more: |
| 192 // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 | 177 // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 |
| 193 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 | 178 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 |
| 194 __m128i tmp[6]; | 179 VP8PlanarTo24b(in0, in1, in2, in3, in4, in5); |
| 195 PlanarTo24bHelper(in, tmp); | |
| 196 PlanarTo24bHelper(tmp, in); | |
| 197 PlanarTo24bHelper(in, tmp); | |
| 198 // We need to do it two more times than the example as we have sixteen bytes. | |
| 199 PlanarTo24bHelper(tmp, in); | |
| 200 PlanarTo24bHelper(in, tmp); | |
| 201 | 180 |
| 202 _mm_storeu_si128((__m128i*)(rgb + 0), tmp[0]); | 181 _mm_storeu_si128((__m128i*)(rgb + 0), *in0); |
| 203 _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]); | 182 _mm_storeu_si128((__m128i*)(rgb + 16), *in1); |
| 204 _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]); | 183 _mm_storeu_si128((__m128i*)(rgb + 32), *in2); |
| 205 _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]); | 184 _mm_storeu_si128((__m128i*)(rgb + 48), *in3); |
| 206 _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]); | 185 _mm_storeu_si128((__m128i*)(rgb + 64), *in4); |
| 207 _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]); | 186 _mm_storeu_si128((__m128i*)(rgb + 80), *in5); |
| 208 } | 187 } |
| 209 #undef MK_UINT32 | |
| 210 | 188 |
| 211 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, | 189 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 212 uint8_t* dst) { | 190 uint8_t* dst) { |
| 213 const __m128i kAlpha = _mm_set1_epi16(255); | 191 const __m128i kAlpha = _mm_set1_epi16(255); |
| 214 int n; | 192 int n; |
| 215 for (n = 0; n < 32; n += 8, dst += 32) { | 193 for (n = 0; n < 32; n += 8, dst += 32) { |
| 216 __m128i R, G, B; | 194 __m128i R, G, B; |
| 217 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); | 195 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); |
| 218 PackAndStore4(&R, &G, &B, &kAlpha, dst); | 196 PackAndStore4(&R, &G, &B, &kAlpha, dst); |
| 219 } | 197 } |
| (...skipping 38 matching lines...) |
| 258 for (n = 0; n < 32; n += 8, dst += 16) { | 236 for (n = 0; n < 32; n += 8, dst += 16) { |
| 259 __m128i R, G, B; | 237 __m128i R, G, B; |
| 260 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); | 238 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); |
| 261 PackAndStore565(&R, &G, &B, dst); | 239 PackAndStore565(&R, &G, &B, dst); |
| 262 } | 240 } |
| 263 } | 241 } |
| 264 | 242 |
| 265 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, | 243 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 266 uint8_t* dst) { | 244 uint8_t* dst) { |
| 267 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; | 245 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; |
| 268 __m128i rgb[6]; | 246 __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; |
| 269 | 247 |
| 270 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); | 248 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); |
| 271 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); | 249 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); |
| 272 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); | 250 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); |
| 273 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); | 251 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); |
| 274 | 252 |
| 275 // Cast to 8b and store as RRRRGGGGBBBB. | 253 // Cast to 8b and store as RRRRGGGGBBBB. |
| 276 rgb[0] = _mm_packus_epi16(R0, R1); | 254 rgb0 = _mm_packus_epi16(R0, R1); |
| 277 rgb[1] = _mm_packus_epi16(R2, R3); | 255 rgb1 = _mm_packus_epi16(R2, R3); |
| 278 rgb[2] = _mm_packus_epi16(G0, G1); | 256 rgb2 = _mm_packus_epi16(G0, G1); |
| 279 rgb[3] = _mm_packus_epi16(G2, G3); | 257 rgb3 = _mm_packus_epi16(G2, G3); |
| 280 rgb[4] = _mm_packus_epi16(B0, B1); | 258 rgb4 = _mm_packus_epi16(B0, B1); |
| 281 rgb[5] = _mm_packus_epi16(B2, B3); | 259 rgb5 = _mm_packus_epi16(B2, B3); |
| 282 | 260 |
| 283 // Pack as RGBRGBRGBRGB. | 261 // Pack as RGBRGBRGBRGB. |
| 284 PlanarTo24b(rgb, dst); | 262 PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); |
| 285 } | 263 } |
| 286 | 264 |
| 287 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, | 265 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 288 uint8_t* dst) { | 266 uint8_t* dst) { |
| 289 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; | 267 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; |
| 290 __m128i bgr[6]; | 268 __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; |
| 291 | 269 |
| 292 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); | 270 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); |
| 293 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); | 271 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); |
| 294 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); | 272 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); |
| 295 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); | 273 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); |
| 296 | 274 |
| 297 // Cast to 8b and store as BBBBGGGGRRRR. | 275 // Cast to 8b and store as BBBBGGGGRRRR. |
| 298 bgr[0] = _mm_packus_epi16(B0, B1); | 276 bgr0 = _mm_packus_epi16(B0, B1); |
| 299 bgr[1] = _mm_packus_epi16(B2, B3); | 277 bgr1 = _mm_packus_epi16(B2, B3); |
| 300 bgr[2] = _mm_packus_epi16(G0, G1); | 278 bgr2 = _mm_packus_epi16(G0, G1); |
| 301 bgr[3] = _mm_packus_epi16(G2, G3); | 279 bgr3 = _mm_packus_epi16(G2, G3); |
| 302 bgr[4] = _mm_packus_epi16(R0, R1); | 280 bgr4 = _mm_packus_epi16(R0, R1); |
| 303 bgr[5] = _mm_packus_epi16(R2, R3); | 281 bgr5= _mm_packus_epi16(R2, R3); |
| 304 | 282 |
| 305 // Pack as BGRBGRBGRBGR. | 283 // Pack as BGRBGRBGRBGR. |
| 306 PlanarTo24b(bgr, dst); | 284 PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); |
| 307 } | 285 } |
| 308 | 286 |
| 309 //----------------------------------------------------------------------------- | 287 //----------------------------------------------------------------------------- |
| 310 // Arbitrary-length row conversion functions | 288 // Arbitrary-length row conversion functions |
| 311 | 289 |
| 312 static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, | 290 static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 313 uint8_t* dst, int len) { | 291 uint8_t* dst, int len) { |
| 314 const __m128i kAlpha = _mm_set1_epi16(255); | 292 const __m128i kAlpha = _mm_set1_epi16(255); |
| 315 int n; | 293 int n; |
| 316 for (n = 0; n + 8 <= len; n += 8, dst += 32) { | 294 for (n = 0; n + 8 <= len; n += 8, dst += 32) { |
| (...skipping 53 matching lines...) |
| 370 u += (n & 1); | 348 u += (n & 1); |
| 371 v += (n & 1); | 349 v += (n & 1); |
| 372 } | 350 } |
| 373 } | 351 } |
| 374 | 352 |
| 375 static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, | 353 static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 376 uint8_t* dst, int len) { | 354 uint8_t* dst, int len) { |
| 377 int n; | 355 int n; |
| 378 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { | 356 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { |
| 379 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; | 357 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; |
| 380 __m128i rgb[6]; | 358 __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; |
| 381 | 359 |
| 382 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); | 360 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); |
| 383 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); | 361 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); |
| 384 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); | 362 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); |
| 385 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); | 363 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); |
| 386 | 364 |
| 387 // Cast to 8b and store as RRRRGGGGBBBB. | 365 // Cast to 8b and store as RRRRGGGGBBBB. |
| 388 rgb[0] = _mm_packus_epi16(R0, R1); | 366 rgb0 = _mm_packus_epi16(R0, R1); |
| 389 rgb[1] = _mm_packus_epi16(R2, R3); | 367 rgb1 = _mm_packus_epi16(R2, R3); |
| 390 rgb[2] = _mm_packus_epi16(G0, G1); | 368 rgb2 = _mm_packus_epi16(G0, G1); |
| 391 rgb[3] = _mm_packus_epi16(G2, G3); | 369 rgb3 = _mm_packus_epi16(G2, G3); |
| 392 rgb[4] = _mm_packus_epi16(B0, B1); | 370 rgb4 = _mm_packus_epi16(B0, B1); |
| 393 rgb[5] = _mm_packus_epi16(B2, B3); | 371 rgb5 = _mm_packus_epi16(B2, B3); |
| 394 | 372 |
| 395 // Pack as RGBRGBRGBRGB. | 373 // Pack as RGBRGBRGBRGB. |
| 396 PlanarTo24b(rgb, dst); | 374 PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); |
| 397 | 375 |
| 398 y += 32; | 376 y += 32; |
| 399 u += 16; | 377 u += 16; |
| 400 v += 16; | 378 v += 16; |
| 401 } | 379 } |
| 402 for (; n < len; ++n) { // Finish off | 380 for (; n < len; ++n) { // Finish off |
| 403 VP8YuvToRgb(y[0], u[0], v[0], dst); | 381 VP8YuvToRgb(y[0], u[0], v[0], dst); |
| 404 dst += 3; | 382 dst += 3; |
| 405 y += 1; | 383 y += 1; |
| 406 u += (n & 1); | 384 u += (n & 1); |
| 407 v += (n & 1); | 385 v += (n & 1); |
| 408 } | 386 } |
| 409 } | 387 } |
| 410 | 388 |
| 411 static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, | 389 static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 412 uint8_t* dst, int len) { | 390 uint8_t* dst, int len) { |
| 413 int n; | 391 int n; |
| 414 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { | 392 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { |
| 415 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; | 393 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; |
| 416 __m128i bgr[6]; | 394 __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; |
| 417 | 395 |
| 418 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); | 396 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); |
| 419 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); | 397 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); |
| 420 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); | 398 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); |
| 421 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); | 399 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); |
| 422 | 400 |
| 423 // Cast to 8b and store as BBBBGGGGRRRR. | 401 // Cast to 8b and store as BBBBGGGGRRRR. |
| 424 bgr[0] = _mm_packus_epi16(B0, B1); | 402 bgr0 = _mm_packus_epi16(B0, B1); |
| 425 bgr[1] = _mm_packus_epi16(B2, B3); | 403 bgr1 = _mm_packus_epi16(B2, B3); |
| 426 bgr[2] = _mm_packus_epi16(G0, G1); | 404 bgr2 = _mm_packus_epi16(G0, G1); |
| 427 bgr[3] = _mm_packus_epi16(G2, G3); | 405 bgr3 = _mm_packus_epi16(G2, G3); |
| 428 bgr[4] = _mm_packus_epi16(R0, R1); | 406 bgr4 = _mm_packus_epi16(R0, R1); |
| 429 bgr[5] = _mm_packus_epi16(R2, R3); | 407 bgr5 = _mm_packus_epi16(R2, R3); |
| 430 | 408 |
| 431 // Pack as BGRBGRBGRBGR. | 409 // Pack as BGRBGRBGRBGR. |
| 432 PlanarTo24b(bgr, dst); | 410 PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); |
| 433 | 411 |
| 434 y += 32; | 412 y += 32; |
| 435 u += 16; | 413 u += 16; |
| 436 v += 16; | 414 v += 16; |
| 437 } | 415 } |
| 438 for (; n < len; ++n) { // Finish off | 416 for (; n < len; ++n) { // Finish off |
| 439 VP8YuvToBgr(y[0], u[0], v[0], dst); | 417 VP8YuvToBgr(y[0], u[0], v[0], dst); |
| 440 dst += 3; | 418 dst += 3; |
| 441 y += 1; | 419 y += 1; |
| 442 u += (n & 1); | 420 u += (n & 1); |
| (...skipping 49 matching lines...) |
| 492 | 470 |
| 493 RGB24PackedToPlanarHelper(tmp, out); | 471 RGB24PackedToPlanarHelper(tmp, out); |
| 494 RGB24PackedToPlanarHelper(out, tmp); | 472 RGB24PackedToPlanarHelper(out, tmp); |
| 495 RGB24PackedToPlanarHelper(tmp, out); | 473 RGB24PackedToPlanarHelper(tmp, out); |
| 496 RGB24PackedToPlanarHelper(out, tmp); | 474 RGB24PackedToPlanarHelper(out, tmp); |
| 497 RGB24PackedToPlanarHelper(tmp, out); | 475 RGB24PackedToPlanarHelper(tmp, out); |
| 498 } | 476 } |
| 499 | 477 |
| 500 // Convert 8 packed ARGB to r[], g[], b[] | 478 // Convert 16 packed ARGB to 16b planar r, g, b: rgb[0..1] = r, rgb[2..3] = g, rgb[4..5] = b |
| 501 static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb, | 479 static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb, |
| 502 __m128i* const r, | 480 __m128i* const rgb /*in[6]*/) { |
| 503 __m128i* const g, | |
| 504 __m128i* const b) { | |
| 505 const __m128i zero = _mm_setzero_si128(); | 481 const __m128i zero = _mm_setzero_si128(); |
| 506 const __m128i in0 = LOAD_16(argb + 0); // argb3 | argb2 | argb1 | argb0 | 482 __m128i a0 = LOAD_16(argb + 0); |
| 507 const __m128i in1 = LOAD_16(argb + 4); // argb7 | argb6 | argb5 | argb4 | 483 __m128i a1 = LOAD_16(argb + 4); |
| 508 // column-wise transpose | 484 __m128i a2 = LOAD_16(argb + 8); |
| 509 const __m128i A0 = _mm_unpacklo_epi8(in0, in1); | 485 __m128i a3 = LOAD_16(argb + 12); |
| 510 const __m128i A1 = _mm_unpackhi_epi8(in0, in1); | 486 VP8L32bToPlanar(&a0, &a1, &a2, &a3); |
| 511 const __m128i B0 = _mm_unpacklo_epi8(A0, A1); | 487 rgb[0] = _mm_unpacklo_epi8(a1, zero); |
| 512 const __m128i B1 = _mm_unpackhi_epi8(A0, A1); | 488 rgb[1] = _mm_unpackhi_epi8(a1, zero); |
| 513 // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0 | 489 rgb[2] = _mm_unpacklo_epi8(a2, zero); |
| 514 // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0 | 490 rgb[3] = _mm_unpackhi_epi8(a2, zero); |
| 515 const __m128i C0 = _mm_unpacklo_epi8(B0, B1); | 491 rgb[4] = _mm_unpacklo_epi8(a3, zero); |
| 516 const __m128i C1 = _mm_unpackhi_epi8(B0, B1); | 492 rgb[5] = _mm_unpackhi_epi8(a3, zero); |
| 517 // store 16b | |
| 518 *r = _mm_unpacklo_epi8(C1, zero); | |
| 519 *g = _mm_unpackhi_epi8(C0, zero); | |
| 520 *b = _mm_unpacklo_epi8(C0, zero); | |
| 521 } | 493 } |
| 522 | 494 |
| 523 // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX | 495 // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX |
| 524 // It's a macro and not a function because we need to use immediate values with | 496 // It's a macro and not a function because we need to use immediate values with |
| 525 // srai_epi32, e.g. | 497 // srai_epi32, e.g. |
| 526 #define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \ | 498 #define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \ |
| 527 ROUNDER, DESCALE_FIX, OUT) do { \ | 499 ROUNDER, DESCALE_FIX, OUT) do { \ |
| 528 const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \ | 500 const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \ |
| 529 const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \ | 501 const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \ |
| 530 const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \ | 502 const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \ |
| (...skipping 111 matching lines...) |
| 642 } | 614 } |
| 643 for (; i < width; ++i, bgr += 3) { // left-over | 615 for (; i < width; ++i, bgr += 3) { // left-over |
| 644 y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); | 616 y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); |
| 645 } | 617 } |
| 646 } | 618 } |
| 647 | 619 |
| 648 static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) { | 620 static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) { |
| 649 const int max_width = width & ~15; | 621 const int max_width = width & ~15; |
| 650 int i; | 622 int i; |
| 651 for (i = 0; i < max_width; i += 16) { | 623 for (i = 0; i < max_width; i += 16) { |
| 652 __m128i r, g, b, Y0, Y1; | 624 __m128i Y0, Y1, rgb[6]; |
| 653 RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b); | 625 RGB32PackedToPlanar(&argb[i], rgb); |
| 654 ConvertRGBToY(&r, &g, &b, &Y0); | 626 ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0); |
| 655 RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b); | 627 ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1); |
| 656 ConvertRGBToY(&r, &g, &b, &Y1); | |
| 657 STORE_16(_mm_packus_epi16(Y0, Y1), y + i); | 628 STORE_16(_mm_packus_epi16(Y0, Y1), y + i); |
| 658 } | 629 } |
| 659 for (; i < width; ++i) { // left-over | 630 for (; i < width; ++i) { // left-over |
| 660 const uint32_t p = argb[i]; | 631 const uint32_t p = argb[i]; |
| 661 y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff, | 632 y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff, |
| 662 YUV_HALF); | 633 YUV_HALF); |
| 663 } | 634 } |
| 664 } | 635 } |
| 665 | 636 |
| 666 // Horizontal add (doubled) of two 16b values, result is 16b. | 637 // Horizontal add (doubled) of two 16b values, result is 16b. |
| 667 // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ... | 638 // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ... |
| 668 static void HorizontalAddPack(const __m128i* const A, const __m128i* const B, | 639 static void HorizontalAddPack(const __m128i* const A, const __m128i* const B, |
| 669 __m128i* const out) { | 640 __m128i* const out) { |
| 670 const __m128i k2 = _mm_set1_epi16(2); | 641 const __m128i k2 = _mm_set1_epi16(2); |
| 671 const __m128i C = _mm_madd_epi16(*A, k2); | 642 const __m128i C = _mm_madd_epi16(*A, k2); |
| 672 const __m128i D = _mm_madd_epi16(*B, k2); | 643 const __m128i D = _mm_madd_epi16(*B, k2); |
| 673 *out = _mm_packs_epi32(C, D); | 644 *out = _mm_packs_epi32(C, D); |
| 674 } | 645 } |
| 675 | 646 |
| 676 static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v, | 647 static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v, |
| 677 int src_width, int do_store) { | 648 int src_width, int do_store) { |
| 678 const int max_width = src_width & ~31; | 649 const int max_width = src_width & ~31; |
| 679 int i; | 650 int i; |
| 680 for (i = 0; i < max_width; i += 32, u += 16, v += 16) { | 651 for (i = 0; i < max_width; i += 32, u += 16, v += 16) { |
| 681 __m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1; | 652 __m128i rgb[6], U0, V0, U1, V1; |
| 682 RGB32PackedToPlanar(&argb[i + 0], &r0, &g0, &b0); | 653 RGB32PackedToPlanar(&argb[i], rgb); |
| 683 RGB32PackedToPlanar(&argb[i + 8], &r1, &g1, &b1); | 654 HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]); |
| 684 HorizontalAddPack(&r0, &r1, &r0); | 655 HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]); |
| 685 HorizontalAddPack(&g0, &g1, &g0); | 656 HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]); |
| 686 HorizontalAddPack(&b0, &b1, &b0); | 657 ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); |
| 687 ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0); | |
| 688 | 658 |
| 689 RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0); | 659 RGB32PackedToPlanar(&argb[i + 16], rgb); |
| 690 RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1); | 660 HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]); |
| 691 HorizontalAddPack(&r0, &r1, &r0); | 661 HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]); |
| 692 HorizontalAddPack(&g0, &g1, &g0); | 662 HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]); |
| 693 HorizontalAddPack(&b0, &b1, &b0); | 663 ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1); |
| 694 ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1); | |
| 695 | 664 |
| 696 U0 = _mm_packus_epi16(U0, U1); | 665 U0 = _mm_packus_epi16(U0, U1); |
| 697 V0 = _mm_packus_epi16(V0, V1); | 666 V0 = _mm_packus_epi16(V0, V1); |
| 698 if (!do_store) { | 667 if (!do_store) { |
| 699 const __m128i prev_u = LOAD_16(u); | 668 const __m128i prev_u = LOAD_16(u); |
| 700 const __m128i prev_v = LOAD_16(v); | 669 const __m128i prev_v = LOAD_16(v); |
| 701 U0 = _mm_avg_epu8(U0, prev_u); | 670 U0 = _mm_avg_epu8(U0, prev_u); |
| 702 V0 = _mm_avg_epu8(V0, prev_v); | 671 V0 = _mm_avg_epu8(V0, prev_v); |
| 703 } | 672 } |
| 704 STORE_16(U0, u); | 673 STORE_16(U0, u); |
| (...skipping 55 matching lines...) |
| 760 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) { | 729 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) { |
| 761 WebPConvertARGBToY = ConvertARGBToY; | 730 WebPConvertARGBToY = ConvertARGBToY; |
| 762 WebPConvertARGBToUV = ConvertARGBToUV; | 731 WebPConvertARGBToUV = ConvertARGBToUV; |
| 763 | 732 |
| 764 WebPConvertRGB24ToY = ConvertRGB24ToY; | 733 WebPConvertRGB24ToY = ConvertRGB24ToY; |
| 765 WebPConvertBGR24ToY = ConvertBGR24ToY; | 734 WebPConvertBGR24ToY = ConvertBGR24ToY; |
| 766 | 735 |
| 767 WebPConvertRGBA32ToUV = ConvertRGBA32ToUV; | 736 WebPConvertRGBA32ToUV = ConvertRGBA32ToUV; |
| 768 } | 737 } |
| 769 | 738 |
| 739 //------------------------------------------------------------------------------ |
| 740 |
| 741 #define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic |
| 742 static uint16_t clip_y(int v) { |
| 743 return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; |
| 744 } |
| 745 |
| 746 static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src, |
| 747 uint16_t* dst, int len) { |
| 748 uint64_t diff = 0; |
| 749 uint32_t tmp[4]; |
| 750 int i; |
| 751 const __m128i zero = _mm_setzero_si128(); |
| 752 const __m128i max = _mm_set1_epi16(MAX_Y); |
| 753 const __m128i one = _mm_set1_epi16(1); |
| 754 __m128i sum = zero; |
| 755 |
| 756 for (i = 0; i + 8 <= len; i += 8) { |
| 757 const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); |
| 758 const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); |
| 759 const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); |
| 760 const __m128i D = _mm_sub_epi16(A, B); // diff_y |
| 761 const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0) |
| 762 const __m128i F = _mm_add_epi16(C, D); // new_y |
| 763 const __m128i G = _mm_or_si128(E, one); // -1 or 1 |
| 764 const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero); |
| 765 const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...)) |
| 766 _mm_storeu_si128((__m128i*)(dst + i), H); |
| 767 sum = _mm_add_epi32(sum, I); |
| 768 } |
| 769 _mm_storeu_si128((__m128i*)tmp, sum); |
| 770 diff = tmp[3] + tmp[2] + tmp[1] + tmp[0]; |
| 771 for (; i < len; ++i) { |
| 772 const int diff_y = ref[i] - src[i]; |
| 773 const int new_y = (int)dst[i] + diff_y; |
| 774 dst[i] = clip_y(new_y); |
| 775 diff += (uint64_t)abs(diff_y); |
| 776 } |
| 777 return diff; |
| 778 } |
| 779 |
| 780 static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src, |
| 781 int16_t* dst, int len) { |
| 782 int i = 0; |
| 783 for (i = 0; i + 8 <= len; i += 8) { |
| 784 const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); |
| 785 const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); |
| 786 const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); |
| 787 const __m128i D = _mm_sub_epi16(A, B); // diff_uv |
| 788 const __m128i E = _mm_add_epi16(C, D); // new_uv |
| 789 _mm_storeu_si128((__m128i*)(dst + i), E); |
| 790 } |
| 791 for (; i < len; ++i) { |
| 792 const int diff_uv = ref[i] - src[i]; |
| 793 dst[i] += diff_uv; |
| 794 } |
| 795 } |
| 796 |
| 797 static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, |
| 798 const uint16_t* best_y, uint16_t* out) { |
| 799 int i; |
| 800 const __m128i kCst8 = _mm_set1_epi16(8); |
| 801 const __m128i max = _mm_set1_epi16(MAX_Y); |
| 802 const __m128i zero = _mm_setzero_si128(); |
| 803 for (i = 0; i + 8 <= len; i += 8) { |
| 804 const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0)); |
| 805 const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1)); |
| 806 const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0)); |
| 807 const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1)); |
| 808 const __m128i a0b1 = _mm_add_epi16(a0, b1); |
| 809 const __m128i a1b0 = _mm_add_epi16(a1, b0); |
| 810 const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1 |
| 811 const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8); |
| 812 const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1) |
| 813 const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0) |
| 814 const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3); |
| 815 const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3); |
| 816 const __m128i d0 = _mm_add_epi16(c1, a0); |
| 817 const __m128i d1 = _mm_add_epi16(c0, a1); |
| 818 const __m128i e0 = _mm_srai_epi16(d0, 1); |
| 819 const __m128i e1 = _mm_srai_epi16(d1, 1); |
| 820 const __m128i f0 = _mm_unpacklo_epi16(e0, e1); |
| 821 const __m128i f1 = _mm_unpackhi_epi16(e0, e1); |
| 822 const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0)); |
| 823 const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8)); |
| 824 const __m128i h0 = _mm_add_epi16(g0, f0); |
| 825 const __m128i h1 = _mm_add_epi16(g1, f1); |
| 826 const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero); |
| 827 const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero); |
| 828 _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0); |
| 829 _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1); |
| 830 } |
| 831 for (; i < len; ++i) { |
| 832 // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 = |
| 833 // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4 |
| 834 // We reuse the common sub-expressions. |
| 835 const int a0b1 = A[i + 0] + B[i + 1]; |
| 836 const int a1b0 = A[i + 1] + B[i + 0]; |
| 837 const int a0a1b0b1 = a0b1 + a1b0 + 8; |
| 838 const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; |
| 839 const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; |
| 840 out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); |
| 841 out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); |
| 842 } |
| 843 } |
| 844 |
| 845 #undef MAX_Y |
| 846 |
| 847 //------------------------------------------------------------------------------ |
| 848 |
| 849 extern void WebPInitSharpYUVSSE2(void); |
| 850 |
| 851 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) { |
| 852 WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2; |
| 853 WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2; |
| 854 WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2; |
| 855 } |
| 856 |
| 770 #else // !WEBP_USE_SSE2 | 857 #else // !WEBP_USE_SSE2 |
| 771 | 858 |
| 772 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2) | 859 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2) |
| 773 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2) | 860 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2) |
| 861 WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2) |
| 774 | 862 |
| 775 #endif // WEBP_USE_SSE2 | 863 #endif // WEBP_USE_SSE2 |
| OLD | NEW |