Chromium Code Reviews

Side by Side Diff: third_party/libwebp/dsp/enc_sse2.c

Issue 116213006: Update libwebp to 0.4.0 (Closed)
Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 6 years, 11 months ago
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of speed-critical encoding functions.
 //
 // Author: Christian Duvivier (cduvivier@google.com)
 
 #include "./dsp.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #if defined(WEBP_USE_SSE2)
 #include <stdlib.h> // for abs()
 #include <emmintrin.h>
 
 #include "../enc/vp8enci.h"
 
 //------------------------------------------------------------------------------
 // Quite useful macro for debugging. Left here for convenience.
 
 #if 0
(...skipping 419 matching lines...)
     const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
 
     _mm_storel_epi64((__m128i*)&out[ 0], d0);
     _mm_storel_epi64((__m128i*)&out[ 4], g1);
     _mm_storel_epi64((__m128i*)&out[ 8], d2);
     _mm_storel_epi64((__m128i*)&out[12], f3);
   }
 }
 
 static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
-  int16_t tmp[16];
+  int32_t tmp[16];
   int i;
   for (i = 0; i < 4; ++i, in += 64) {
     const int a0 = (in[0 * 16] + in[2 * 16]);
     const int a1 = (in[1 * 16] + in[3 * 16]);
     const int a2 = (in[1 * 16] - in[3 * 16]);
     const int a3 = (in[0 * 16] - in[2 * 16]);
     tmp[0 + i * 4] = a0 + a1;
     tmp[1 + i * 4] = a3 + a2;
     tmp[2 + i * 4] = a3 - a2;
     tmp[3 + i * 4] = a0 - a1;
   }
   {
-    const __m128i src0 = _mm_loadl_epi64((__m128i*)&tmp[0]);
-    const __m128i src1 = _mm_loadl_epi64((__m128i*)&tmp[4]);
-    const __m128i src2 = _mm_loadl_epi64((__m128i*)&tmp[8]);
-    const __m128i src3 = _mm_loadl_epi64((__m128i*)&tmp[12]);
-    const __m128i a0 = _mm_add_epi16(src0, src2);
-    const __m128i a1 = _mm_add_epi16(src1, src3);
-    const __m128i a2 = _mm_sub_epi16(src1, src3);
-    const __m128i a3 = _mm_sub_epi16(src0, src2);
-    const __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(a0, a1), 1);
-    const __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(a3, a2), 1);
-    const __m128i b2 = _mm_srai_epi16(_mm_subs_epi16(a3, a2), 1);
-    const __m128i b3 = _mm_srai_epi16(_mm_subs_epi16(a0, a1), 1);
-    _mm_storel_epi64((__m128i*)&out[ 0], b0);
-    _mm_storel_epi64((__m128i*)&out[ 4], b1);
-    _mm_storel_epi64((__m128i*)&out[ 8], b2);
-    _mm_storel_epi64((__m128i*)&out[12], b3);
+    const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
+    const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
+    const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
+    const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
+    const __m128i a0 = _mm_add_epi32(src0, src2);
+    const __m128i a1 = _mm_add_epi32(src1, src3);
+    const __m128i a2 = _mm_sub_epi32(src1, src3);
+    const __m128i a3 = _mm_sub_epi32(src0, src2);
+    const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
+    const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
+    const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
+    const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
+    const __m128i out0 = _mm_packs_epi32(b0, b1);
+    const __m128i out1 = _mm_packs_epi32(b2, b3);
+    _mm_storeu_si128((__m128i*)&out[0], out0);
+    _mm_storeu_si128((__m128i*)&out[8], out1);
   }
 }
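
A note on the hunk above: the old code did the vertical pass in 16 bits with saturating adds, while the new code widens tmp[] to int32_t, does that pass in 32 bits, and saturates only once via _mm_packs_epi32 when narrowing back to int16_t. For reference, a scalar sketch of the same computation (an illustrative rewrite, not libwebp's reference C implementation; a plain cast truncates where the pack saturates):

static void FTransformWHT_Sketch(const int16_t* in, int16_t* out) {
  int32_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {   // horizontal pass, same as the C loop above
    const int a0 = in[0 * 16] + in[2 * 16];
    const int a1 = in[1 * 16] + in[3 * 16];
    const int a2 = in[1 * 16] - in[3 * 16];
    const int a3 = in[0 * 16] - in[2 * 16];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i) {             // vertical pass; the __m128i block above
    const int a0 = tmp[0 + i] + tmp[ 8 + i];   // handles all four columns at once
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[ 8 + i];
    out[ 0 + i] = (int16_t)((a0 + a1) >> 1);   // cast truncates; the SSE2 pack saturates
    out[ 4 + i] = (int16_t)((a3 + a2) >> 1);
    out[ 8 + i] = (int16_t)((a3 - a2) >> 1);
    out[12 + i] = (int16_t)((a0 - a1) >> 1);
  }
}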
 
 //------------------------------------------------------------------------------
 // Metric
 
 static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
                        int num_quads, int do_16) {
   const __m128i zero = _mm_setzero_si128();
   __m128i sum1 = zero;
(...skipping 139 matching lines...)
 
 // Hadamard transform
 // Returns the difference between the weighted sum of the absolute value of
 // transformed coefficients.
 static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
                           const uint16_t* const w) {
   int32_t sum[4];
   __m128i tmp_0, tmp_1, tmp_2, tmp_3;
   const __m128i zero = _mm_setzero_si128();
 
-  // Load, combine and tranpose inputs.
+  // Load, combine and transpose inputs.
   {
     const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
     const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
     const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
     const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
     const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
     const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
     const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
     const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);
 
(...skipping 165 matching lines...)
   __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
   __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
   const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
   const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
   const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
   const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
   const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
   const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
   const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
   const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
-  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
-  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);
 
   // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative)
   const __m128i sign0 = _mm_srai_epi16(in0, 15);
   const __m128i sign8 = _mm_srai_epi16(in8, 15);
 
   // coeff = abs(in) = (in ^ sign) - sign
   coeff0 = _mm_xor_si128(in0, sign0);
   coeff8 = _mm_xor_si128(in8, sign8);
   coeff0 = _mm_sub_epi16(coeff0, sign0);
   coeff8 = _mm_sub_epi16(coeff8, sign8);
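
The comment above is the standard branchless absolute value; written out for a single 16-bit lane it reads (illustrative only):

static int16_t Abs16_Sketch(int16_t v) {
  const int16_t sign = (int16_t)(v >> 15);   // 0x0000 if v >= 0, 0xffff if v < 0
  return (int16_t)((v ^ sign) - sign);       // v >= 0: unchanged; v < 0: ~v + 1 == -v
}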
(...skipping 42 matching lines...)
   // get sign back (if (sign[j]) out_n = -out_n)
   out0 = _mm_xor_si128(out0, sign0);
   out8 = _mm_xor_si128(out8, sign8);
   out0 = _mm_sub_epi16(out0, sign0);
   out8 = _mm_sub_epi16(out8, sign8);
 
   // in = out * Q
   in0 = _mm_mullo_epi16(out0, q0);
   in8 = _mm_mullo_epi16(out8, q8);
 
-  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
-  {
-    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
-    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
-    in0 = _mm_and_si128(in0, cmp0);
-    in8 = _mm_and_si128(in8, cmp8);
-    _mm_storeu_si128((__m128i*)&in[0], in0);
-    _mm_storeu_si128((__m128i*)&in[8], in8);
-    out0 = _mm_and_si128(out0, cmp0);
-    out8 = _mm_and_si128(out8, cmp8);
-  }
+  _mm_storeu_si128((__m128i*)&in[0], in0);
+  _mm_storeu_si128((__m128i*)&in[8], in8);
 
   // zigzag the output before storing it.
   //
   // The zigzag pattern can almost be reproduced with a small sequence of
   // shuffles. After it, we only need to swap the 7th (ending up in third
   // position instead of twelfth) and 8th values.
   {
     __m128i outZ0, outZ8;
     outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0));
     outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
(...skipping 16 matching lines...)
   {
     int32_t tmp[4];
     _mm_storeu_si128((__m128i*)tmp, packed_out);
     if (n) {
       tmp[0] &= ~0xff;
     }
     return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
   }
 }
 
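Taking QuantizeBlockSSE2 as a whole, the per-coefficient math reduces to the sketch below. This is a scalar illustration only: the multiply/shift step sits in the 42 elided lines above and is assumed here to be the usual (coeff * iq + bias) >> QFIX with a clamp to 2047, and the field names come from VP8Matrix in vp8enci.h. Note that with the zthresh_ test gone, a coefficient is zeroed only when the shifted product itself is zero.

static int QuantizeCoeff_Sketch(int16_t* in, int16_t* out, int j, int zigzag_j,
                                const VP8Matrix* const mtx) {
  const int sign = (in[j] < 0);
  const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
  int level = (int)((coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX);
  if (level > 2047) level = 2047;          // clamp (assumed; MAX_LEVEL in the C code)
  if (sign) level = -level;
  in[j] = (int16_t)(level * mtx->q_[j]);   // dequantized value written back
  out[zigzag_j] = (int16_t)level;          // output stored in zigzag order
  return (level != 0);                     // feeds the final "any non-zero?" test
}
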
+static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16],
+                                const VP8Matrix* const mtx) {
+  return QuantizeBlockSSE2(in, out, 0, mtx);
+}
+
 #endif // WEBP_USE_SSE2
 
 //------------------------------------------------------------------------------
 // Entry point
 
 extern void VP8EncDspInitSSE2(void);
 
 void VP8EncDspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
   VP8CollectHistogram = CollectHistogramSSE2;
   VP8EncQuantizeBlock = QuantizeBlockSSE2;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2;
   VP8ITransform = ITransformSSE2;
   VP8FTransform = FTransformSSE2;
   VP8FTransformWHT = FTransformWHTSSE2;
   VP8SSE16x16 = SSE16x16SSE2;
   VP8SSE16x8 = SSE16x8SSE2;
   VP8SSE8x8 = SSE8x8SSE2;
   VP8SSE4x4 = SSE4x4SSE2;
   VP8TDisto4x4 = Disto4x4SSE2;
   VP8TDisto16x16 = Disto16x16SSE2;
 #endif // WEBP_USE_SSE2
 }
 
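For context, these pointers override the plain-C defaults when the generic initializer detects SSE2. A minimal sketch of that dispatch, assuming the VP8GetCPUInfo / kSSE2 API from dsp.h (the real code lives in dsp/enc.c):

void VP8EncDspInit_Sketch(void) {
  // ... the portable C implementations are assigned first ...
  if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSSE2)) {
    VP8EncDspInitSSE2();   // swap in the SSE2 versions defined in this file
  }
}
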
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
-#endif