Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1370)

Side by Side Diff: third_party/libwebp/dsp/lossless_enc_sse2.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)
Patch Set: Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 Google Inc. All Rights Reserved. 1 // Copyright 2015 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // SSE2 variant of methods for lossless encoder 10 // SSE2 variant of methods for lossless encoder
11 // 11 //
12 // Author: Skal (pascal.massimino@gmail.com) 12 // Author: Skal (pascal.massimino@gmail.com)
13 13
14 #include "./dsp.h" 14 #include "./dsp.h"
15 15
16 #if defined(WEBP_USE_SSE2) 16 #if defined(WEBP_USE_SSE2)
17 #include <assert.h> 17 #include <assert.h>
18 #include <emmintrin.h> 18 #include <emmintrin.h>
19 #include "./lossless.h" 19 #include "./lossless.h"
20 #include "./common_sse2.h"
21 #include "./lossless_common.h"
20 22
21 // For sign-extended multiplying constants, pre-shifted by 5: 23 // For sign-extended multiplying constants, pre-shifted by 5:
22 #define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5) 24 #define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5)
23 25
24 //------------------------------------------------------------------------------ 26 //------------------------------------------------------------------------------
25 // Subtract-Green Transform 27 // Subtract-Green Transform
26 28
27 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { 29 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
28 int i; 30 int i;
29 for (i = 0; i + 4 <= num_pixels; i += 4) { 31 for (i = 0; i + 4 <= num_pixels; i += 4) {
30 const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb 32 const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
31 const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g 33 const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
32 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); 34 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
33 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g 35 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
34 const __m128i out = _mm_sub_epi8(in, C); 36 const __m128i out = _mm_sub_epi8(in, C);
35 _mm_storeu_si128((__m128i*)&argb_data[i], out); 37 _mm_storeu_si128((__m128i*)&argb_data[i], out);
36 } 38 }
37 // fallthrough and finish off with plain-C 39 // fallthrough and finish off with plain-C
38 VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); 40 if (i != num_pixels) {
41 VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
42 }
39 } 43 }
40 44
41 //------------------------------------------------------------------------------ 45 //------------------------------------------------------------------------------
42 // Color Transform 46 // Color Transform
43 47
44 static void TransformColor(const VP8LMultipliers* const m, 48 static void TransformColor(const VP8LMultipliers* const m,
45 uint32_t* argb_data, int num_pixels) { 49 uint32_t* argb_data, int num_pixels) {
46 const __m128i mults_rb = _mm_set_epi16( 50 const __m128i mults_rb = _mm_set_epi16(
47 CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), 51 CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
48 CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), 52 CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
(...skipping 13 matching lines...) Expand all
62 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1 66 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
63 const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0 67 const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0
64 const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0 68 const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0
65 const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2 69 const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2
66 const __m128i H = _mm_add_epi8(G, D); // x dr x db 70 const __m128i H = _mm_add_epi8(G, D); // x dr x db
67 const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db 71 const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db
68 const __m128i out = _mm_sub_epi8(in, I); 72 const __m128i out = _mm_sub_epi8(in, I);
69 _mm_storeu_si128((__m128i*)&argb_data[i], out); 73 _mm_storeu_si128((__m128i*)&argb_data[i], out);
70 } 74 }
71 // fallthrough and finish off with plain-C 75 // fallthrough and finish off with plain-C
72 VP8LTransformColor_C(m, argb_data + i, num_pixels - i); 76 if (i != num_pixels) {
77 VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
78 }
73 } 79 }
74 80
75 //------------------------------------------------------------------------------ 81 //------------------------------------------------------------------------------
76 #define SPAN 8 82 #define SPAN 8
77 static void CollectColorBlueTransforms(const uint32_t* argb, int stride, 83 static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
78 int tile_width, int tile_height, 84 int tile_width, int tile_height,
79 int green_to_blue, int red_to_blue, 85 int green_to_blue, int red_to_blue,
80 int histo[]) { 86 int histo[]) {
81 const __m128i mults_r = _mm_set_epi16( 87 const __m128i mults_r = _mm_set_epi16(
82 CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0, 88 CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
(...skipping 274 matching lines...) Expand 10 before | Expand all | Expand 10 after
357 match_len = 0; 363 match_len = 0;
358 // Unroll the potential first two loops. 364 // Unroll the potential first two loops.
359 if (length >= 4 && 365 if (length >= 4 &&
360 _mm_movemask_epi8(_mm_cmpeq_epi32( 366 _mm_movemask_epi8(_mm_cmpeq_epi32(
361 _mm_loadu_si128((const __m128i*)&array1[0]), 367 _mm_loadu_si128((const __m128i*)&array1[0]),
362 _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) { 368 _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {
363 match_len = 4; 369 match_len = 4;
364 if (length >= 8 && 370 if (length >= 8 &&
365 _mm_movemask_epi8(_mm_cmpeq_epi32( 371 _mm_movemask_epi8(_mm_cmpeq_epi32(
366 _mm_loadu_si128((const __m128i*)&array1[4]), 372 _mm_loadu_si128((const __m128i*)&array1[4]),
367 _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) 373 _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
368 match_len = 8; 374 match_len = 8;
375 }
369 } 376 }
370 } 377 }
371 378
372 while (match_len < length && array1[match_len] == array2[match_len]) { 379 while (match_len < length && array1[match_len] == array2[match_len]) {
373 ++match_len; 380 ++match_len;
374 } 381 }
375 return match_len; 382 return match_len;
376 } 383 }
377 384
385 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
386 static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
387 uint32_t* dst) {
388 int x;
389 assert(xbits >= 0);
390 assert(xbits <= 3);
391 switch (xbits) {
392 case 0: {
393 const __m128i ff = _mm_set1_epi16(0xff00);
394 const __m128i zero = _mm_setzero_si128();
395 // Store 0xff000000 | (row[x] << 8).
396 for (x = 0; x + 16 <= width; x += 16, dst += 16) {
397 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
398 const __m128i in_lo = _mm_unpacklo_epi8(zero, in);
399 const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);
400 const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);
401 const __m128i in_hi = _mm_unpackhi_epi8(zero, in);
402 const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);
403 const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);
404 _mm_storeu_si128((__m128i*)&dst[0], dst0);
405 _mm_storeu_si128((__m128i*)&dst[4], dst1);
406 _mm_storeu_si128((__m128i*)&dst[8], dst2);
407 _mm_storeu_si128((__m128i*)&dst[12], dst3);
408 }
409 break;
410 }
411 case 1: {
412 const __m128i ff = _mm_set1_epi16(0xff00);
413 const __m128i mul = _mm_set1_epi16(0x110);
414 for (x = 0; x + 16 <= width; x += 16, dst += 8) {
415 // 0a0b | (where a/b are 4 bits).
416 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
417 const __m128i tmp = _mm_mullo_epi16(in, mul); // abb0
418 const __m128i pack = _mm_and_si128(tmp, ff); // ab00
419 const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);
420 const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);
421 _mm_storeu_si128((__m128i*)&dst[0], dst0);
422 _mm_storeu_si128((__m128i*)&dst[4], dst1);
423 }
424 break;
425 }
426 case 2: {
427 const __m128i mask_or = _mm_set1_epi32(0xff000000);
428 const __m128i mul_cst = _mm_set1_epi16(0x0104);
429 const __m128i mask_mul = _mm_set1_epi16(0x0f00);
430 for (x = 0; x + 16 <= width; x += 16, dst += 4) {
431 // 000a000b000c000d | (where a/b/c/d are 2 bits).
432 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
433 const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0
434 const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000
435 const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000
436 const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000
437 // Convert to 0xff00**00.
438 const __m128i res = _mm_or_si128(pack, mask_or);
439 _mm_storeu_si128((__m128i*)dst, res);
440 }
441 break;
442 }
443 default: {
444 assert(xbits == 3);
445 for (x = 0; x + 16 <= width; x += 16, dst += 2) {
446 // 0000000a0000000b... | (where a/b are 1 bit).
447 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
448 const __m128i shift = _mm_slli_epi64(in, 7);
449 const uint32_t move = _mm_movemask_epi8(shift);
450 dst[0] = 0xff000000 | ((move & 0xff) << 8);
451 dst[1] = 0xff000000 | (move & 0xff00);
452 }
453 break;
454 }
455 }
456 if (x != width) {
457 VP8LBundleColorMap_C(row + x, width - x, xbits, dst);
458 }
459 }
460
461 //------------------------------------------------------------------------------
462 // Batch version of Predictor Transform subtraction
463
464 static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
465 const __m128i* const a1,
466 __m128i* const avg) {
467 // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
468 const __m128i ones = _mm_set1_epi8(1);
469 const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
470 const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
471 *avg = _mm_sub_epi8(avg1, one);
472 }
473
474 // Predictor0: ARGB_BLACK.
475 static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
476 int num_pixels, uint32_t* out) {
477 int i;
478 const __m128i black = _mm_set1_epi32(ARGB_BLACK);
479 for (i = 0; i + 4 <= num_pixels; i += 4) {
480 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
481 const __m128i res = _mm_sub_epi8(src, black);
482 _mm_storeu_si128((__m128i*)&out[i], res);
483 }
484 if (i != num_pixels) {
485 VP8LPredictorsSub_C[0](in + i, upper + i, num_pixels - i, out + i);
486 }
487 }
488
489 #define GENERATE_PREDICTOR_1(X, IN) \
490 static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
491 int num_pixels, uint32_t* out) { \
492 int i; \
493 for (i = 0; i + 4 <= num_pixels; i += 4) { \
494 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
495 const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
496 const __m128i res = _mm_sub_epi8(src, pred); \
497 _mm_storeu_si128((__m128i*)&out[i], res); \
498 } \
499 if (i != num_pixels) { \
500 VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
501 } \
502 }
503
504 GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L
505 GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T
506 GENERATE_PREDICTOR_1(3, upper[i + 1]) // Predictor3: TR
507 GENERATE_PREDICTOR_1(4, upper[i - 1]) // Predictor4: TL
508 #undef GENERATE_PREDICTOR_1
509
510 // Predictor5: avg2(avg2(L, TR), T)
511 static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
512 int num_pixels, uint32_t* out) {
513 int i;
514 for (i = 0; i + 4 <= num_pixels; i += 4) {
515 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
516 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
517 const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
518 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
519 __m128i avg, pred, res;
520 Average2_m128i(&L, &TR, &avg);
521 Average2_m128i(&avg, &T, &pred);
522 res = _mm_sub_epi8(src, pred);
523 _mm_storeu_si128((__m128i*)&out[i], res);
524 }
525 if (i != num_pixels) {
526 VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i);
527 }
528 }
529
530 #define GENERATE_PREDICTOR_2(X, A, B) \
531 static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
532 int num_pixels, uint32_t* out) { \
533 int i; \
534 for (i = 0; i + 4 <= num_pixels; i += 4) { \
535 const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \
536 const __m128i tB = _mm_loadu_si128((const __m128i*)&(B)); \
537 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
538 __m128i pred, res; \
539 Average2_m128i(&tA, &tB, &pred); \
540 res = _mm_sub_epi8(src, pred); \
541 _mm_storeu_si128((__m128i*)&out[i], res); \
542 } \
543 if (i != num_pixels) { \
544 VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
545 } \
546 }
547
548 GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1]) // Predictor6: avg(L, TL)
549 GENERATE_PREDICTOR_2(7, in[i - 1], upper[i]) // Predictor7: avg(L, T)
550 GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i]) // Predictor8: avg(TL, T)
551 GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1]) // Predictor9: average(T, TR)
552 #undef GENERATE_PREDICTOR_2
553
554 // Predictor10: avg(avg(L,TL), avg(T, TR)).
555 static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
556 int num_pixels, uint32_t* out) {
557 int i;
558 for (i = 0; i + 4 <= num_pixels; i += 4) {
559 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
560 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
561 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
562 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
563 const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
564 __m128i avgTTR, avgLTL, avg, res;
565 Average2_m128i(&T, &TR, &avgTTR);
566 Average2_m128i(&L, &TL, &avgLTL);
567 Average2_m128i(&avgTTR, &avgLTL, &avg);
568 res = _mm_sub_epi8(src, avg);
569 _mm_storeu_si128((__m128i*)&out[i], res);
570 }
571 if (i != num_pixels) {
572 VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i);
573 }
574 }
575
576 // Predictor11: select.
577 static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
578 __m128i* const out) {
579 // We can unpack with any value on the upper 32 bits, provided it's the same
580 // on both operands (so that their sum of abs diff is zero). Here we use *A.
581 const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
582 const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
583 const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
584 const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
585 const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
586 const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
587 *out = _mm_packs_epi32(s_lo, s_hi);
588 }
589
590 static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
591 int num_pixels, uint32_t* out) {
592 int i;
593 for (i = 0; i + 4 <= num_pixels; i += 4) {
594 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
595 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
596 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
597 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
598 __m128i pa, pb;
599 GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
600 GetSumAbsDiff32(&L, &TL, &pb); // pb = sum |L-TL|
601 {
602 const __m128i mask = _mm_cmpgt_epi32(pb, pa);
603 const __m128i A = _mm_and_si128(mask, L);
604 const __m128i B = _mm_andnot_si128(mask, T);
605 const __m128i pred = _mm_or_si128(A, B); // pred = (pb > pa) ? L : T
606 const __m128i res = _mm_sub_epi8(src, pred);
607 _mm_storeu_si128((__m128i*)&out[i], res);
608 }
609 }
610 if (i != num_pixels) {
611 VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
612 }
613 }
614
615 // Predictor12: ClampedAddSubtractFull.
616 static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
617 int num_pixels, uint32_t* out) {
618 int i;
619 const __m128i zero = _mm_setzero_si128();
620 for (i = 0; i + 4 <= num_pixels; i += 4) {
621 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
622 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
623 const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
624 const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
625 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
626 const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
627 const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
628 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
629 const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
630 const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
631 const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
632 const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
633 const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo);
634 const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi);
635 const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
636 const __m128i res = _mm_sub_epi8(src, pred);
637 _mm_storeu_si128((__m128i*)&out[i], res);
638 }
639 if (i != num_pixels) {
640 VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i);
641 }
642 }
643
644 // Predictor13: ClampedAddSubtractHalf.
645 static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
646 int num_pixels, uint32_t* out) {
647 int i;
648 const __m128i zero = _mm_setzero_si128();
649 for (i = 0; i + 2 <= num_pixels; i += 2) {
650 // we can only process two pixels at a time
651 const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
652 const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
653 const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
654 const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
655 const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
656 const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
657 const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
658 const __m128i sum = _mm_add_epi16(T_lo, L_lo);
659 const __m128i avg = _mm_srli_epi16(sum, 1);
660 const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
661 const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
662 const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
663 const __m128i A3 = _mm_srai_epi16(A2, 1);
664 const __m128i A4 = _mm_add_epi16(avg, A3);
665 const __m128i pred = _mm_packus_epi16(A4, A4);
666 const __m128i res = _mm_sub_epi8(src, pred);
667 _mm_storel_epi64((__m128i*)&out[i], res);
668 }
669 if (i != num_pixels) {
670 VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
671 }
672 }
673
378 //------------------------------------------------------------------------------ 674 //------------------------------------------------------------------------------
379 // Entry point 675 // Entry point
380 676
381 extern void VP8LEncDspInitSSE2(void); 677 extern void VP8LEncDspInitSSE2(void);
382 678
383 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { 679 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
384 VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; 680 VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
385 VP8LTransformColor = TransformColor; 681 VP8LTransformColor = TransformColor;
386 VP8LCollectColorBlueTransforms = CollectColorBlueTransforms; 682 VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
387 VP8LCollectColorRedTransforms = CollectColorRedTransforms; 683 VP8LCollectColorRedTransforms = CollectColorRedTransforms;
388 VP8LHistogramAdd = HistogramAdd; 684 VP8LHistogramAdd = HistogramAdd;
389 VP8LCombinedShannonEntropy = CombinedShannonEntropy; 685 VP8LCombinedShannonEntropy = CombinedShannonEntropy;
390 VP8LVectorMismatch = VectorMismatch; 686 VP8LVectorMismatch = VectorMismatch;
687 VP8LBundleColorMap = BundleColorMap_SSE2;
688
689 VP8LPredictorsSub[0] = PredictorSub0_SSE2;
690 VP8LPredictorsSub[1] = PredictorSub1_SSE2;
691 VP8LPredictorsSub[2] = PredictorSub2_SSE2;
692 VP8LPredictorsSub[3] = PredictorSub3_SSE2;
693 VP8LPredictorsSub[4] = PredictorSub4_SSE2;
694 VP8LPredictorsSub[5] = PredictorSub5_SSE2;
695 VP8LPredictorsSub[6] = PredictorSub6_SSE2;
696 VP8LPredictorsSub[7] = PredictorSub7_SSE2;
697 VP8LPredictorsSub[8] = PredictorSub8_SSE2;
698 VP8LPredictorsSub[9] = PredictorSub9_SSE2;
699 VP8LPredictorsSub[10] = PredictorSub10_SSE2;
700 VP8LPredictorsSub[11] = PredictorSub11_SSE2;
701 VP8LPredictorsSub[12] = PredictorSub12_SSE2;
702 VP8LPredictorsSub[13] = PredictorSub13_SSE2;
703 VP8LPredictorsSub[14] = PredictorSub0_SSE2; // <- padding security sentinels
704 VP8LPredictorsSub[15] = PredictorSub0_SSE2;
391 } 705 }
392 706
393 #else // !WEBP_USE_SSE2 707 #else // !WEBP_USE_SSE2
394 708
395 WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2) 709 WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)
396 710
397 #endif // WEBP_USE_SSE2 711 #endif // WEBP_USE_SSE2
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/lossless_enc_msa.c ('k') | third_party/libwebp/dsp/lossless_enc_sse41.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698