Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(83)

Side by Side Diff: third_party/libwebp/dsp/yuv_sse2.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libwebp/dsp/yuv.c ('k') | third_party/libwebp/enc/alpha.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 Google Inc. All Rights Reserved. 1 // Copyright 2014 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // YUV->RGB conversion functions 10 // YUV->RGB conversion functions
11 // 11 //
12 // Author: Skal (pascal.massimino@gmail.com) 12 // Author: Skal (pascal.massimino@gmail.com)
13 13
14 #include "./yuv.h" 14 #include "./yuv.h"
15 15
16 #if defined(WEBP_USE_SSE2) 16 #if defined(WEBP_USE_SSE2)
17 17
18 #include "./common_sse2.h"
19 #include <stdlib.h>
18 #include <emmintrin.h> 20 #include <emmintrin.h>
19 21
20 //----------------------------------------------------------------------------- 22 //-----------------------------------------------------------------------------
21 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. 23 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
22 24
23 // These constants are 14b fixed-point version of ITU-R BT.601 constants. 25 // These constants are 14b fixed-point version of ITU-R BT.601 constants.
24 // R = (19077 * y + 26149 * v - 14234) >> 6 26 // R = (19077 * y + 26149 * v - 14234) >> 6
25 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 27 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
26 // B = (19077 * y + 33050 * u - 17685) >> 6 28 // B = (19077 * y + 33050 * u - 17685) >> 6
27 static void ConvertYUV444ToRGB(const __m128i* const Y0, 29 static void ConvertYUV444ToRGB(const __m128i* const Y0,
(...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after
148 const __m128i rg = _mm_or_si128(r1, g1); 150 const __m128i rg = _mm_or_si128(r1, g1);
149 const __m128i gb = _mm_or_si128(g2, b1); 151 const __m128i gb = _mm_or_si128(g2, b1);
150 #if !defined(WEBP_SWAP_16BIT_CSP) 152 #if !defined(WEBP_SWAP_16BIT_CSP)
151 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb); 153 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
152 #else 154 #else
153 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg); 155 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
154 #endif 156 #endif
155 _mm_storeu_si128((__m128i*)dst, rgb565); 157 _mm_storeu_si128((__m128i*)dst, rgb565);
156 } 158 }
157 159
158 // Helper applied several times in a row by PlanarTo24b.
159 // It de-interleaves the six input registers byte-wise: the even-indexed bytes
160 // (low byte of each 16b lane) go to out[0..2], the odd-indexed ones to out[3..5].
// 'in' and 'out' each point to six __m128i registers; 'in' is only read.
161 static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/,
162 __m128i* const out /*out[6]*/) {
163 const __m128i v_mask = _mm_set1_epi16(0x00ff);
164
165 // Keep the low 8b of every 16b lane (mask with 0x00ff), packed pairwise.
166 out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask),
167 _mm_and_si128(in[1], v_mask));
168 out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask),
169 _mm_and_si128(in[3], v_mask));
170 out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask),
171 _mm_and_si128(in[5], v_mask));
172 // Keep the high 8b of every 16b lane (logical shift right by 8), packed pairwise.
173 out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8));
174 out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8));
175 out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8));
176 }
177
178 // Pack the planar buffers 160 // Pack the planar buffers
179 // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... 161 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
180 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... 162 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
181 static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) { 163 static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
164 __m128i* const in2, __m128i* const in3,
165 __m128i* const in4, __m128i* const in5,
166 uint8_t* const rgb) {
182 // The input is 6 registers of sixteen 8b but for the sake of explanation, 167 // The input is 6 registers of sixteen 8b but for the sake of explanation,
183 // let's take 6 registers of four 8b values. 168 // let's take 6 registers of four 8b values.
184 // To pack, we will keep taking one every two 8b integer and move it 169 // To pack, we will keep taking one every two 8b integer and move it
185 // around as follows: 170 // around as follows:
186 // Input: 171 // Input:
187 // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7 172 // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
188 // Split the 6 registers in two sets of 3 registers: the first set as the even 173 // Split the 6 registers in two sets of 3 registers: the first set as the even
189 // 8b bytes, the second the odd ones: 174 // 8b bytes, the second the odd ones:
190 // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7 175 // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
191 // Repeat the same permutations twice more: 176 // Repeat the same permutations twice more:
192 // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 177 // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
193 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 178 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
194 __m128i tmp[6]; 179 VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);
195 PlanarTo24bHelper(in, tmp);
196 PlanarTo24bHelper(tmp, in);
197 PlanarTo24bHelper(in, tmp);
198 // We need to do it two more times than the example as we have sixteen bytes.
199 PlanarTo24bHelper(tmp, in);
200 PlanarTo24bHelper(in, tmp);
201 180
202 _mm_storeu_si128((__m128i*)(rgb + 0), tmp[0]); 181 _mm_storeu_si128((__m128i*)(rgb + 0), *in0);
203 _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]); 182 _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
204 _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]); 183 _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
205 _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]); 184 _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
206 _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]); 185 _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
207 _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]); 186 _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
208 } 187 }
209 #undef MK_UINT32
210 188
211 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, 189 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
212 uint8_t* dst) { 190 uint8_t* dst) {
213 const __m128i kAlpha = _mm_set1_epi16(255); 191 const __m128i kAlpha = _mm_set1_epi16(255);
214 int n; 192 int n;
215 for (n = 0; n < 32; n += 8, dst += 32) { 193 for (n = 0; n < 32; n += 8, dst += 32) {
216 __m128i R, G, B; 194 __m128i R, G, B;
217 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); 195 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
218 PackAndStore4(&R, &G, &B, &kAlpha, dst); 196 PackAndStore4(&R, &G, &B, &kAlpha, dst);
219 } 197 }
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
258 for (n = 0; n < 32; n += 8, dst += 16) { 236 for (n = 0; n < 32; n += 8, dst += 16) {
259 __m128i R, G, B; 237 __m128i R, G, B;
260 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); 238 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
261 PackAndStore565(&R, &G, &B, dst); 239 PackAndStore565(&R, &G, &B, dst);
262 } 240 }
263 } 241 }
264 242
265 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, 243 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
266 uint8_t* dst) { 244 uint8_t* dst) {
267 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; 245 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
268 __m128i rgb[6]; 246 __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
269 247
270 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); 248 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
271 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); 249 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
272 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); 250 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
273 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); 251 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
274 252
275 // Cast to 8b and store as RRRRGGGGBBBB. 253 // Cast to 8b and store as RRRRGGGGBBBB.
276 rgb[0] = _mm_packus_epi16(R0, R1); 254 rgb0 = _mm_packus_epi16(R0, R1);
277 rgb[1] = _mm_packus_epi16(R2, R3); 255 rgb1 = _mm_packus_epi16(R2, R3);
278 rgb[2] = _mm_packus_epi16(G0, G1); 256 rgb2 = _mm_packus_epi16(G0, G1);
279 rgb[3] = _mm_packus_epi16(G2, G3); 257 rgb3 = _mm_packus_epi16(G2, G3);
280 rgb[4] = _mm_packus_epi16(B0, B1); 258 rgb4 = _mm_packus_epi16(B0, B1);
281 rgb[5] = _mm_packus_epi16(B2, B3); 259 rgb5 = _mm_packus_epi16(B2, B3);
282 260
283 // Pack as RGBRGBRGBRGB. 261 // Pack as RGBRGBRGBRGB.
284 PlanarTo24b(rgb, dst); 262 PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
285 } 263 }
286 264
287 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, 265 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
288 uint8_t* dst) { 266 uint8_t* dst) {
289 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; 267 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
290 __m128i bgr[6]; 268 __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
291 269
292 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); 270 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
293 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); 271 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
294 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); 272 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
295 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); 273 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
296 274
297 // Cast to 8b and store as BBBBGGGGRRRR. 275 // Cast to 8b and store as BBBBGGGGRRRR.
298 bgr[0] = _mm_packus_epi16(B0, B1); 276 bgr0 = _mm_packus_epi16(B0, B1);
299 bgr[1] = _mm_packus_epi16(B2, B3); 277 bgr1 = _mm_packus_epi16(B2, B3);
300 bgr[2] = _mm_packus_epi16(G0, G1); 278 bgr2 = _mm_packus_epi16(G0, G1);
301 bgr[3] = _mm_packus_epi16(G2, G3); 279 bgr3 = _mm_packus_epi16(G2, G3);
302 bgr[4] = _mm_packus_epi16(R0, R1); 280 bgr4 = _mm_packus_epi16(R0, R1);
303 bgr[5] = _mm_packus_epi16(R2, R3); 281 bgr5= _mm_packus_epi16(R2, R3);
304 282
305 // Pack as BGRBGRBGRBGR. 283 // Pack as BGRBGRBGRBGR.
306 PlanarTo24b(bgr, dst); 284 PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
307 } 285 }
308 286
309 //----------------------------------------------------------------------------- 287 //-----------------------------------------------------------------------------
310 // Arbitrary-length row conversion functions 288 // Arbitrary-length row conversion functions
311 289
312 static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, 290 static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
313 uint8_t* dst, int len) { 291 uint8_t* dst, int len) {
314 const __m128i kAlpha = _mm_set1_epi16(255); 292 const __m128i kAlpha = _mm_set1_epi16(255);
315 int n; 293 int n;
316 for (n = 0; n + 8 <= len; n += 8, dst += 32) { 294 for (n = 0; n + 8 <= len; n += 8, dst += 32) {
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
370 u += (n & 1); 348 u += (n & 1);
371 v += (n & 1); 349 v += (n & 1);
372 } 350 }
373 } 351 }
374 352
375 static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, 353 static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
376 uint8_t* dst, int len) { 354 uint8_t* dst, int len) {
377 int n; 355 int n;
378 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { 356 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
379 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; 357 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
380 __m128i rgb[6]; 358 __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
381 359
382 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); 360 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
383 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); 361 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
384 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); 362 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);
385 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); 363 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
386 364
387 // Cast to 8b and store as RRRRGGGGBBBB. 365 // Cast to 8b and store as RRRRGGGGBBBB.
388 rgb[0] = _mm_packus_epi16(R0, R1); 366 rgb0 = _mm_packus_epi16(R0, R1);
389 rgb[1] = _mm_packus_epi16(R2, R3); 367 rgb1 = _mm_packus_epi16(R2, R3);
390 rgb[2] = _mm_packus_epi16(G0, G1); 368 rgb2 = _mm_packus_epi16(G0, G1);
391 rgb[3] = _mm_packus_epi16(G2, G3); 369 rgb3 = _mm_packus_epi16(G2, G3);
392 rgb[4] = _mm_packus_epi16(B0, B1); 370 rgb4 = _mm_packus_epi16(B0, B1);
393 rgb[5] = _mm_packus_epi16(B2, B3); 371 rgb5 = _mm_packus_epi16(B2, B3);
394 372
395 // Pack as RGBRGBRGBRGB. 373 // Pack as RGBRGBRGBRGB.
396 PlanarTo24b(rgb, dst); 374 PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
397 375
398 y += 32; 376 y += 32;
399 u += 16; 377 u += 16;
400 v += 16; 378 v += 16;
401 } 379 }
402 for (; n < len; ++n) { // Finish off 380 for (; n < len; ++n) { // Finish off
403 VP8YuvToRgb(y[0], u[0], v[0], dst); 381 VP8YuvToRgb(y[0], u[0], v[0], dst);
404 dst += 3; 382 dst += 3;
405 y += 1; 383 y += 1;
406 u += (n & 1); 384 u += (n & 1);
407 v += (n & 1); 385 v += (n & 1);
408 } 386 }
409 } 387 }
410 388
411 static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, 389 static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
412 uint8_t* dst, int len) { 390 uint8_t* dst, int len) {
413 int n; 391 int n;
414 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { 392 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
415 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; 393 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
416 __m128i bgr[6]; 394 __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
417 395
418 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); 396 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
419 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); 397 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
420 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); 398 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);
421 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); 399 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
422 400
423 // Cast to 8b and store as BBBBGGGGRRRR. 401 // Cast to 8b and store as BBBBGGGGRRRR.
424 bgr[0] = _mm_packus_epi16(B0, B1); 402 bgr0 = _mm_packus_epi16(B0, B1);
425 bgr[1] = _mm_packus_epi16(B2, B3); 403 bgr1 = _mm_packus_epi16(B2, B3);
426 bgr[2] = _mm_packus_epi16(G0, G1); 404 bgr2 = _mm_packus_epi16(G0, G1);
427 bgr[3] = _mm_packus_epi16(G2, G3); 405 bgr3 = _mm_packus_epi16(G2, G3);
428 bgr[4] = _mm_packus_epi16(R0, R1); 406 bgr4 = _mm_packus_epi16(R0, R1);
429 bgr[5] = _mm_packus_epi16(R2, R3); 407 bgr5 = _mm_packus_epi16(R2, R3);
430 408
431 // Pack as BGRBGRBGRBGR. 409 // Pack as BGRBGRBGRBGR.
432 PlanarTo24b(bgr, dst); 410 PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
433 411
434 y += 32; 412 y += 32;
435 u += 16; 413 u += 16;
436 v += 16; 414 v += 16;
437 } 415 }
438 for (; n < len; ++n) { // Finish off 416 for (; n < len; ++n) { // Finish off
439 VP8YuvToBgr(y[0], u[0], v[0], dst); 417 VP8YuvToBgr(y[0], u[0], v[0], dst);
440 dst += 3; 418 dst += 3;
441 y += 1; 419 y += 1;
442 u += (n & 1); 420 u += (n & 1);
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
492 470
493 RGB24PackedToPlanarHelper(tmp, out); 471 RGB24PackedToPlanarHelper(tmp, out);
494 RGB24PackedToPlanarHelper(out, tmp); 472 RGB24PackedToPlanarHelper(out, tmp);
495 RGB24PackedToPlanarHelper(tmp, out); 473 RGB24PackedToPlanarHelper(tmp, out);
496 RGB24PackedToPlanarHelper(out, tmp); 474 RGB24PackedToPlanarHelper(out, tmp);
497 RGB24PackedToPlanarHelper(tmp, out); 475 RGB24PackedToPlanarHelper(tmp, out);
498 } 476 }
499 477
500 // Convert 8 packed ARGB to r[], g[], b[] 478 // Convert 8 packed ARGB to r[], g[], b[]
501 static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb, 479 static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
502 __m128i* const r, 480 __m128i* const rgb /*in[6]*/) {
503 __m128i* const g,
504 __m128i* const b) {
505 const __m128i zero = _mm_setzero_si128(); 481 const __m128i zero = _mm_setzero_si128();
506 const __m128i in0 = LOAD_16(argb + 0); // argb3 | argb2 | argb1 | argb0 482 __m128i a0 = LOAD_16(argb + 0);
507 const __m128i in1 = LOAD_16(argb + 4); // argb7 | argb6 | argb5 | argb4 483 __m128i a1 = LOAD_16(argb + 4);
508 // column-wise transpose 484 __m128i a2 = LOAD_16(argb + 8);
509 const __m128i A0 = _mm_unpacklo_epi8(in0, in1); 485 __m128i a3 = LOAD_16(argb + 12);
510 const __m128i A1 = _mm_unpackhi_epi8(in0, in1); 486 VP8L32bToPlanar(&a0, &a1, &a2, &a3);
511 const __m128i B0 = _mm_unpacklo_epi8(A0, A1); 487 rgb[0] = _mm_unpacklo_epi8(a1, zero);
512 const __m128i B1 = _mm_unpackhi_epi8(A0, A1); 488 rgb[1] = _mm_unpackhi_epi8(a1, zero);
513 // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0 489 rgb[2] = _mm_unpacklo_epi8(a2, zero);
514 // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0 490 rgb[3] = _mm_unpackhi_epi8(a2, zero);
515 const __m128i C0 = _mm_unpacklo_epi8(B0, B1); 491 rgb[4] = _mm_unpacklo_epi8(a3, zero);
516 const __m128i C1 = _mm_unpackhi_epi8(B0, B1); 492 rgb[5] = _mm_unpackhi_epi8(a3, zero);
517 // store 16b
518 *r = _mm_unpacklo_epi8(C1, zero);
519 *g = _mm_unpackhi_epi8(C0, zero);
520 *b = _mm_unpacklo_epi8(C0, zero);
521 } 493 }
522 494
523 // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX 495 // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
524 // It's a macro and not a function because we need to use immediate values with 496 // It's a macro and not a function because we need to use immediate values with
525 // srai_epi32, e.g. 497 // srai_epi32, e.g.
526 #define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \ 498 #define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
527 ROUNDER, DESCALE_FIX, OUT) do { \ 499 ROUNDER, DESCALE_FIX, OUT) do { \
528 const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \ 500 const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
529 const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \ 501 const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
530 const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \ 502 const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
(...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after
642 } 614 }
643 for (; i < width; ++i, bgr += 3) { // left-over 615 for (; i < width; ++i, bgr += 3) { // left-over
644 y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); 616 y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
645 } 617 }
646 } 618 }
647 619
648 static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) { 620 static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
649 const int max_width = width & ~15; 621 const int max_width = width & ~15;
650 int i; 622 int i;
651 for (i = 0; i < max_width; i += 16) { 623 for (i = 0; i < max_width; i += 16) {
652 __m128i r, g, b, Y0, Y1; 624 __m128i Y0, Y1, rgb[6];
653 RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b); 625 RGB32PackedToPlanar(&argb[i], rgb);
654 ConvertRGBToY(&r, &g, &b, &Y0); 626 ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
655 RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b); 627 ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
656 ConvertRGBToY(&r, &g, &b, &Y1);
657 STORE_16(_mm_packus_epi16(Y0, Y1), y + i); 628 STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
658 } 629 }
659 for (; i < width; ++i) { // left-over 630 for (; i < width; ++i) { // left-over
660 const uint32_t p = argb[i]; 631 const uint32_t p = argb[i];
661 y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff, 632 y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
662 YUV_HALF); 633 YUV_HALF);
663 } 634 }
664 } 635 }
665 636
666 // Horizontal add (doubled) of two 16b values, result is 16b. 637 // Horizontal add (doubled) of two 16b values, result is 16b.
667 // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ... 638 // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
668 static void HorizontalAddPack(const __m128i* const A, const __m128i* const B, 639 static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
669 __m128i* const out) { 640 __m128i* const out) {
670 const __m128i k2 = _mm_set1_epi16(2); 641 const __m128i k2 = _mm_set1_epi16(2);
671 const __m128i C = _mm_madd_epi16(*A, k2); 642 const __m128i C = _mm_madd_epi16(*A, k2);
672 const __m128i D = _mm_madd_epi16(*B, k2); 643 const __m128i D = _mm_madd_epi16(*B, k2);
673 *out = _mm_packs_epi32(C, D); 644 *out = _mm_packs_epi32(C, D);
674 } 645 }
675 646
676 static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v, 647 static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
677 int src_width, int do_store) { 648 int src_width, int do_store) {
678 const int max_width = src_width & ~31; 649 const int max_width = src_width & ~31;
679 int i; 650 int i;
680 for (i = 0; i < max_width; i += 32, u += 16, v += 16) { 651 for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
681 __m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1; 652 __m128i rgb[6], U0, V0, U1, V1;
682 RGB32PackedToPlanar(&argb[i + 0], &r0, &g0, &b0); 653 RGB32PackedToPlanar(&argb[i], rgb);
683 RGB32PackedToPlanar(&argb[i + 8], &r1, &g1, &b1); 654 HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
684 HorizontalAddPack(&r0, &r1, &r0); 655 HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
685 HorizontalAddPack(&g0, &g1, &g0); 656 HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
686 HorizontalAddPack(&b0, &b1, &b0); 657 ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
687 ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0);
688 658
689 RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0); 659 RGB32PackedToPlanar(&argb[i + 16], rgb);
690 RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1); 660 HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
691 HorizontalAddPack(&r0, &r1, &r0); 661 HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
692 HorizontalAddPack(&g0, &g1, &g0); 662 HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
693 HorizontalAddPack(&b0, &b1, &b0); 663 ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
694 ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1);
695 664
696 U0 = _mm_packus_epi16(U0, U1); 665 U0 = _mm_packus_epi16(U0, U1);
697 V0 = _mm_packus_epi16(V0, V1); 666 V0 = _mm_packus_epi16(V0, V1);
698 if (!do_store) { 667 if (!do_store) {
699 const __m128i prev_u = LOAD_16(u); 668 const __m128i prev_u = LOAD_16(u);
700 const __m128i prev_v = LOAD_16(v); 669 const __m128i prev_v = LOAD_16(v);
701 U0 = _mm_avg_epu8(U0, prev_u); 670 U0 = _mm_avg_epu8(U0, prev_u);
702 V0 = _mm_avg_epu8(V0, prev_v); 671 V0 = _mm_avg_epu8(V0, prev_v);
703 } 672 }
704 STORE_16(U0, u); 673 STORE_16(U0, u);
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
760 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) { 729 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
761 WebPConvertARGBToY = ConvertARGBToY; 730 WebPConvertARGBToY = ConvertARGBToY;
762 WebPConvertARGBToUV = ConvertARGBToUV; 731 WebPConvertARGBToUV = ConvertARGBToUV;
763 732
764 WebPConvertRGB24ToY = ConvertRGB24ToY; 733 WebPConvertRGB24ToY = ConvertRGB24ToY;
765 WebPConvertBGR24ToY = ConvertBGR24ToY; 734 WebPConvertBGR24ToY = ConvertBGR24ToY;
766 735
767 WebPConvertRGBA32ToUV = ConvertRGBA32ToUV; 736 WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
768 } 737 }
769 738
739 //------------------------------------------------------------------------------
740
#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic

// Clamps 'v' to the [0, MAX_Y] range.
static uint16_t clip_y(int v) {
  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}

// Adds the correction (ref[i] - src[i]) to dst[i] for each of the 'len'
// samples, clamping every stored value to [0, MAX_Y].
// Returns the sum of |ref[i] - src[i]| over the whole span.
static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len) {
  uint64_t diff = 0;
  uint32_t tmp[4];
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16(MAX_Y);
  const __m128i one = _mm_set1_epi16(1);
  __m128i sum = zero;

  // SSE2 path: 8 samples per iteration, partial sums kept in four 32b lanes.
  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
    const __m128i F = _mm_add_epi16(C, D);       // new_y
    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
    _mm_storeu_si128((__m128i*)(dst + i), H);
    sum = _mm_add_epi32(sum, I);
  }
  _mm_storeu_si128((__m128i*)tmp, sum);
  // Widen before adding: the four lanes are 32b each, and adding them in 32b
  // arithmetic could wrap around before the result reaches the 64b total.
  diff = (uint64_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
  for (; i < len; ++i) {   // left-over samples
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)dst[i] + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)abs(diff_y);
  }
  return diff;
}
779
// Applies the correction dst[i] += ref[i] - src[i] to each of the 'len'
// samples. No clamping is performed.
static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
                                   int16_t* dst, int len) {
  int pos = 0;
  // Vector part: eight 16b samples per iteration.
  while (pos + 8 <= len) {
    const __m128i ref8 = _mm_loadu_si128((const __m128i*)&ref[pos]);
    const __m128i src8 = _mm_loadu_si128((const __m128i*)&src[pos]);
    const __m128i cur8 = _mm_loadu_si128((const __m128i*)&dst[pos]);
    const __m128i delta8 = _mm_sub_epi16(ref8, src8);   // diff_uv
    _mm_storeu_si128((__m128i*)&dst[pos], _mm_add_epi16(cur8, delta8));
    pos += 8;
  }
  // Scalar part: whatever is left after the last full group of eight.
  while (pos < len) {
    const int delta = ref[pos] - src[pos];
    dst[pos] += delta;
    ++pos;
  }
}
796
797 static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
798 const uint16_t* best_y, uint16_t* out) {
799 int i;
800 const __m128i kCst8 = _mm_set1_epi16(8);
801 const __m128i max = _mm_set1_epi16(MAX_Y);
802 const __m128i zero = _mm_setzero_si128();
803 for (i = 0; i + 8 <= len; i += 8) {
804 const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
805 const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
806 const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
807 const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
808 const __m128i a0b1 = _mm_add_epi16(a0, b1);
809 const __m128i a1b0 = _mm_add_epi16(a1, b0);
810 const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
811 const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
812 const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
813 const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
814 const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
815 const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
816 const __m128i d0 = _mm_add_epi16(c1, a0);
817 const __m128i d1 = _mm_add_epi16(c0, a1);
818 const __m128i e0 = _mm_srai_epi16(d0, 1);
819 const __m128i e1 = _mm_srai_epi16(d1, 1);
820 const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
821 const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
822 const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
823 const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
824 const __m128i h0 = _mm_add_epi16(g0, f0);
825 const __m128i h1 = _mm_add_epi16(g1, f1);
826 const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
827 const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
828 _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
829 _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
830 }
831 for (; i < len; ++i) {
832 // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
833 // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
834 // We reuse the common sub-expressions.
835 const int a0b1 = A[i + 0] + B[i + 1];
836 const int a1b0 = A[i + 1] + B[i + 0];
837 const int a0a1b0b1 = a0b1 + a1b0 + 8;
838 const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
839 const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
840 out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
841 out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
842 }
843 }
844
845 #undef MAX_Y
846
847 //------------------------------------------------------------------------------
848
// Prototype for the init entry point defined right below.
849 extern void WebPInitSharpYUVSSE2(void);
850
// Points the WebPSharpYUVUpdateY / WebPSharpYUVUpdateRGB / WebPSharpYUVFilterRow
// hooks at the SSE2 implementations defined in this file.
851 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
852 WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
853 WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
854 WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
855 }
856
770 #else // !WEBP_USE_SSE2 857 #else // !WEBP_USE_SSE2
771 858
772 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2) 859 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
773 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2) 860 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
861 WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
774 862
775 #endif // WEBP_USE_SSE2 863 #endif // WEBP_USE_SSE2
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/yuv.c ('k') | third_party/libwebp/enc/alpha.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698