| OLD | NEW |
| 1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // SSE2 version of speed-critical encoding functions. | 10 // SSE2 version of speed-critical encoding functions. |
| 11 // | 11 // |
| 12 // Author: Christian Duvivier (cduvivier@google.com) | 12 // Author: Christian Duvivier (cduvivier@google.com) |
| 13 | 13 |
| 14 #include "./dsp.h" | 14 #include "./dsp.h" |
| 15 | 15 |
| 16 #if defined(__cplusplus) || defined(c_plusplus) | |
| 17 extern "C" { | |
| 18 #endif | |
| 19 | |
| 20 #if defined(WEBP_USE_SSE2) | 16 #if defined(WEBP_USE_SSE2) |
| 21 #include <stdlib.h> // for abs() | 17 #include <stdlib.h> // for abs() |
| 22 #include <emmintrin.h> | 18 #include <emmintrin.h> |
| 23 | 19 |
| 24 #include "../enc/vp8enci.h" | 20 #include "../enc/vp8enci.h" |
| 25 | 21 |
| 26 //------------------------------------------------------------------------------ | 22 //------------------------------------------------------------------------------ |
| 27 // Quite useful macro for debugging. Left here for convenience. | 23 // Quite useful macro for debugging. Left here for convenience. |
| 28 | 24 |
| 29 #if 0 | 25 #if 0 |
| (...skipping 419 matching lines...) |
| 449 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); | 445 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); |
| 450 | 446 |
| 451 _mm_storel_epi64((__m128i*)&out[ 0], d0); | 447 _mm_storel_epi64((__m128i*)&out[ 0], d0); |
| 452 _mm_storel_epi64((__m128i*)&out[ 4], g1); | 448 _mm_storel_epi64((__m128i*)&out[ 4], g1); |
| 453 _mm_storel_epi64((__m128i*)&out[ 8], d2); | 449 _mm_storel_epi64((__m128i*)&out[ 8], d2); |
| 454 _mm_storel_epi64((__m128i*)&out[12], f3); | 450 _mm_storel_epi64((__m128i*)&out[12], f3); |
| 455 } | 451 } |
| 456 } | 452 } |
| 457 | 453 |
| 458 static void FTransformWHTSSE2(const int16_t* in, int16_t* out) { | 454 static void FTransformWHTSSE2(const int16_t* in, int16_t* out) { |
| 459 int16_t tmp[16]; | 455 int32_t tmp[16]; |
| 460 int i; | 456 int i; |
| 461 for (i = 0; i < 4; ++i, in += 64) { | 457 for (i = 0; i < 4; ++i, in += 64) { |
| 462 const int a0 = (in[0 * 16] + in[2 * 16]); | 458 const int a0 = (in[0 * 16] + in[2 * 16]); |
| 463 const int a1 = (in[1 * 16] + in[3 * 16]); | 459 const int a1 = (in[1 * 16] + in[3 * 16]); |
| 464 const int a2 = (in[1 * 16] - in[3 * 16]); | 460 const int a2 = (in[1 * 16] - in[3 * 16]); |
| 465 const int a3 = (in[0 * 16] - in[2 * 16]); | 461 const int a3 = (in[0 * 16] - in[2 * 16]); |
| 466 tmp[0 + i * 4] = a0 + a1; | 462 tmp[0 + i * 4] = a0 + a1; |
| 467 tmp[1 + i * 4] = a3 + a2; | 463 tmp[1 + i * 4] = a3 + a2; |
| 468 tmp[2 + i * 4] = a3 - a2; | 464 tmp[2 + i * 4] = a3 - a2; |
| 469 tmp[3 + i * 4] = a0 - a1; | 465 tmp[3 + i * 4] = a0 - a1; |
| 470 } | 466 } |
| 471 { | 467 { |
| 472 const __m128i src0 = _mm_loadl_epi64((__m128i*)&tmp[0]); | 468 const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]); |
| 473 const __m128i src1 = _mm_loadl_epi64((__m128i*)&tmp[4]); | 469 const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]); |
| 474 const __m128i src2 = _mm_loadl_epi64((__m128i*)&tmp[8]); | 470 const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]); |
| 475 const __m128i src3 = _mm_loadl_epi64((__m128i*)&tmp[12]); | 471 const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]); |
| 476 const __m128i a0 = _mm_add_epi16(src0, src2); | 472 const __m128i a0 = _mm_add_epi32(src0, src2); |
| 477 const __m128i a1 = _mm_add_epi16(src1, src3); | 473 const __m128i a1 = _mm_add_epi32(src1, src3); |
| 478 const __m128i a2 = _mm_sub_epi16(src1, src3); | 474 const __m128i a2 = _mm_sub_epi32(src1, src3); |
| 479 const __m128i a3 = _mm_sub_epi16(src0, src2); | 475 const __m128i a3 = _mm_sub_epi32(src0, src2); |
| 480 const __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(a0, a1), 1); | 476 const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1); |
| 481 const __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(a3, a2), 1); | 477 const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1); |
| 482 const __m128i b2 = _mm_srai_epi16(_mm_subs_epi16(a3, a2), 1); | 478 const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1); |
| 483 const __m128i b3 = _mm_srai_epi16(_mm_subs_epi16(a0, a1), 1); | 479 const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1); |
| 484 _mm_storel_epi64((__m128i*)&out[ 0], b0); | 480 const __m128i out0 = _mm_packs_epi32(b0, b1); |
| 485 _mm_storel_epi64((__m128i*)&out[ 4], b1); | 481 const __m128i out1 = _mm_packs_epi32(b2, b3); |
| 486 _mm_storel_epi64((__m128i*)&out[ 8], b2); | 482 _mm_storeu_si128((__m128i*)&out[0], out0); |
| 487 _mm_storel_epi64((__m128i*)&out[12], b3); | 483 _mm_storeu_si128((__m128i*)&out[8], out1); |
| 488 } | 484 } |
| 489 } | 485 } |
| 490 | 486 |
| 491 //------------------------------------------------------------------------------ | 487 //------------------------------------------------------------------------------ |
| 492 // Metric | 488 // Metric |
| 493 | 489 |
| 494 static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b, | 490 static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b, |
| 495 int num_quads, int do_16) { | 491 int num_quads, int do_16) { |
| 496 const __m128i zero = _mm_setzero_si128(); | 492 const __m128i zero = _mm_setzero_si128(); |
| 497 __m128i sum1 = zero; | 493 __m128i sum1 = zero; |
| (...skipping 139 matching lines...) |
| 637 | 633 |
| 638 // Hadamard transform | 634 // Hadamard transform |
| 639 // Returns the difference between the weighted sum of the absolute value of | 635 // Returns the difference between the weighted sum of the absolute value of |
| 640 // transformed coefficients. | 636 // transformed coefficients. |
| 641 static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, | 637 static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, |
| 642 const uint16_t* const w) { | 638 const uint16_t* const w) { |
| 643 int32_t sum[4]; | 639 int32_t sum[4]; |
| 644 __m128i tmp_0, tmp_1, tmp_2, tmp_3; | 640 __m128i tmp_0, tmp_1, tmp_2, tmp_3; |
| 645 const __m128i zero = _mm_setzero_si128(); | 641 const __m128i zero = _mm_setzero_si128(); |
| 646 | 642 |
| 647 // Load, combine and tranpose inputs. | 643 // Load, combine and transpose inputs. |
| 648 { | 644 { |
| 649 const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]); | 645 const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]); |
| 650 const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]); | 646 const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]); |
| 651 const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]); | 647 const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]); |
| 652 const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]); | 648 const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]); |
| 653 const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]); | 649 const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]); |
| 654 const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]); | 650 const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]); |
| 655 const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]); | 651 const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]); |
| 656 const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]); | 652 const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]); |
| 657 | 653 |
| (...skipping 165 matching lines...) |
| 823 __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); | 819 __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); |
| 824 __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); | 820 __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); |
| 825 const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); | 821 const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); |
| 826 const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); | 822 const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); |
| 827 const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); | 823 const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); |
| 828 const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); | 824 const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); |
| 829 const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); | 825 const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); |
| 830 const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); | 826 const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); |
| 831 const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); | 827 const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); |
| 832 const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); | 828 const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); |
| 833 const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]); | |
| 834 const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]); | |
| 835 | 829 |
| 836 // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) | 830 // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) |
| 837 const __m128i sign0 = _mm_srai_epi16(in0, 15); | 831 const __m128i sign0 = _mm_srai_epi16(in0, 15); |
| 838 const __m128i sign8 = _mm_srai_epi16(in8, 15); | 832 const __m128i sign8 = _mm_srai_epi16(in8, 15); |
| 839 | 833 |
| 840 // coeff = abs(in) = (in ^ sign) - sign | 834 // coeff = abs(in) = (in ^ sign) - sign |
| 841 coeff0 = _mm_xor_si128(in0, sign0); | 835 coeff0 = _mm_xor_si128(in0, sign0); |
| 842 coeff8 = _mm_xor_si128(in8, sign8); | 836 coeff8 = _mm_xor_si128(in8, sign8); |
| 843 coeff0 = _mm_sub_epi16(coeff0, sign0); | 837 coeff0 = _mm_sub_epi16(coeff0, sign0); |
| 844 coeff8 = _mm_sub_epi16(coeff8, sign8); | 838 coeff8 = _mm_sub_epi16(coeff8, sign8); |
| (...skipping 42 matching lines...) |
| 887 // get sign back (if (sign[j]) out_n = -out_n) | 881 // get sign back (if (sign[j]) out_n = -out_n) |
| 888 out0 = _mm_xor_si128(out0, sign0); | 882 out0 = _mm_xor_si128(out0, sign0); |
| 889 out8 = _mm_xor_si128(out8, sign8); | 883 out8 = _mm_xor_si128(out8, sign8); |
| 890 out0 = _mm_sub_epi16(out0, sign0); | 884 out0 = _mm_sub_epi16(out0, sign0); |
| 891 out8 = _mm_sub_epi16(out8, sign8); | 885 out8 = _mm_sub_epi16(out8, sign8); |
| 892 | 886 |
| 893 // in = out * Q | 887 // in = out * Q |
| 894 in0 = _mm_mullo_epi16(out0, q0); | 888 in0 = _mm_mullo_epi16(out0, q0); |
| 895 in8 = _mm_mullo_epi16(out8, q8); | 889 in8 = _mm_mullo_epi16(out8, q8); |
| 896 | 890 |
| 897 // if (coeff <= mtx->zthresh_) {in=0; out=0;} | 891 _mm_storeu_si128((__m128i*)&in[0], in0); |
| 898 { | 892 _mm_storeu_si128((__m128i*)&in[8], in8); |
| 899 __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0); | |
| 900 __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8); | |
| 901 in0 = _mm_and_si128(in0, cmp0); | |
| 902 in8 = _mm_and_si128(in8, cmp8); | |
| 903 _mm_storeu_si128((__m128i*)&in[0], in0); | |
| 904 _mm_storeu_si128((__m128i*)&in[8], in8); | |
| 905 out0 = _mm_and_si128(out0, cmp0); | |
| 906 out8 = _mm_and_si128(out8, cmp8); | |
| 907 } | |
| 908 | 893 |
| 909 // zigzag the output before storing it. | 894 // zigzag the output before storing it. |
| 910 // | 895 // |
| 911 // The zigzag pattern can almost be reproduced with a small sequence of | 896 // The zigzag pattern can almost be reproduced with a small sequence of |
| 912 // shuffles. After it, we only need to swap the 7th (ending up in third | 897 // shuffles. After it, we only need to swap the 7th (ending up in third |
| 913 // position instead of twelfth) and 8th values. | 898 // position instead of twelfth) and 8th values. |
| 914 { | 899 { |
| 915 __m128i outZ0, outZ8; | 900 __m128i outZ0, outZ8; |
| 916 outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); | 901 outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); |
| 917 outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); | 902 outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); |
| (...skipping 16 matching lines...) |
| 934 { | 919 { |
| 935 int32_t tmp[4]; | 920 int32_t tmp[4]; |
| 936 _mm_storeu_si128((__m128i*)tmp, packed_out); | 921 _mm_storeu_si128((__m128i*)tmp, packed_out); |
| 937 if (n) { | 922 if (n) { |
| 938 tmp[0] &= ~0xff; | 923 tmp[0] &= ~0xff; |
| 939 } | 924 } |
| 940 return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); | 925 return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); |
| 941 } | 926 } |
| 942 } | 927 } |
| 943 | 928 |
| 929 static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16], |
| 930 const VP8Matrix* const mtx) { |
| 931 return QuantizeBlockSSE2(in, out, 0, mtx); |
| 932 } |
| 933 |
| 944 #endif // WEBP_USE_SSE2 | 934 #endif // WEBP_USE_SSE2 |
| 945 | 935 |
| 946 //------------------------------------------------------------------------------ | 936 //------------------------------------------------------------------------------ |
| 947 // Entry point | 937 // Entry point |
| 948 | 938 |
| 949 extern void VP8EncDspInitSSE2(void); | 939 extern void VP8EncDspInitSSE2(void); |
| 950 | 940 |
| 951 void VP8EncDspInitSSE2(void) { | 941 void VP8EncDspInitSSE2(void) { |
| 952 #if defined(WEBP_USE_SSE2) | 942 #if defined(WEBP_USE_SSE2) |
| 953 VP8CollectHistogram = CollectHistogramSSE2; | 943 VP8CollectHistogram = CollectHistogramSSE2; |
| 954 VP8EncQuantizeBlock = QuantizeBlockSSE2; | 944 VP8EncQuantizeBlock = QuantizeBlockSSE2; |
| 945 VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2; |
| 955 VP8ITransform = ITransformSSE2; | 946 VP8ITransform = ITransformSSE2; |
| 956 VP8FTransform = FTransformSSE2; | 947 VP8FTransform = FTransformSSE2; |
| 957 VP8FTransformWHT = FTransformWHTSSE2; | 948 VP8FTransformWHT = FTransformWHTSSE2; |
| 958 VP8SSE16x16 = SSE16x16SSE2; | 949 VP8SSE16x16 = SSE16x16SSE2; |
| 959 VP8SSE16x8 = SSE16x8SSE2; | 950 VP8SSE16x8 = SSE16x8SSE2; |
| 960 VP8SSE8x8 = SSE8x8SSE2; | 951 VP8SSE8x8 = SSE8x8SSE2; |
| 961 VP8SSE4x4 = SSE4x4SSE2; | 952 VP8SSE4x4 = SSE4x4SSE2; |
| 962 VP8TDisto4x4 = Disto4x4SSE2; | 953 VP8TDisto4x4 = Disto4x4SSE2; |
| 963 VP8TDisto16x16 = Disto16x16SSE2; | 954 VP8TDisto16x16 = Disto16x16SSE2; |
| 964 #endif // WEBP_USE_SSE2 | 955 #endif // WEBP_USE_SSE2 |
| 965 } | 956 } |
| 966 | 957 |
| 967 #if defined(__cplusplus) || defined(c_plusplus) | |
| 968 } // extern "C" | |
| 969 #endif | |
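
The FTransformWHTSSE2 hunk widens the intermediate buffer from int16_t to int32_t and switches every butterfly to 32-bit arithmetic, narrowing only on the final store: the OLD code used saturating 16-bit adds (_mm_adds_epi16) mid-transform, silently clamping intermediates, while the NEW code defers all saturation to the closing _mm_packs_epi32(). A scalar model of the reworked transform, reconstructed from the NEW column as a sketch (FTransformWHT_C and ClipTo16 are illustrative names, not code from this change):

#include <stdint.h>

static int16_t ClipTo16(int32_t v) {  // mirrors the _mm_packs_epi32() saturation
  return (v < -32768) ? -32768 : (v > 32767) ? 32767 : (int16_t)v;
}

static void FTransformWHT_C(const int16_t* in, int16_t* out) {
  int32_t tmp[16];  // 32-bit intermediates: the butterfly sums can exceed int16_t
  int i;
  // Horizontal pass over the DC coefficients of the sixteen 4x4 sub-blocks
  // (stride 16 within a row of sub-blocks, +64 from one row to the next).
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = in[0 * 16] + in[2 * 16];
    const int a1 = in[1 * 16] + in[3 * 16];
    const int a2 = in[1 * 16] - in[3 * 16];
    const int a3 = in[0 * 16] - in[2 * 16];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // Vertical pass, kept in 32 bits; saturate only when writing out, exactly
  // where the SSE2 version packs b0..b3 back down to 16 bits. The >> 1 is an
  // arithmetic shift, matching _mm_srai_epi32().
  for (i = 0; i < 4; ++i) {
    const int32_t a0 = tmp[0 + i] + tmp[8 + i];
    const int32_t a1 = tmp[4 + i] + tmp[12 + i];
    const int32_t a2 = tmp[4 + i] - tmp[12 + i];
    const int32_t a3 = tmp[0 + i] - tmp[8 + i];
    out[ 0 + i] = ClipTo16((a0 + a1) >> 1);
    out[ 4 + i] = ClipTo16((a3 + a2) >> 1);
    out[ 8 + i] = ClipTo16((a3 - a2) >> 1);
    out[12 + i] = ClipTo16((a0 - a1) >> 1);
  }
}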
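
The QuantizeBlockSSE2 hunk drops the explicit "if (coeff <= mtx->zthresh_) {in=0; out=0;}" masking along with the two zthresh_ loads. My reading, hedged: in the scalar encoder, zthresh_ is derived as the largest biased coefficient that still quantizes to level 0, so the fixed-point multiply-and-shift already yields 0 for every coefficient the mask used to clear, and "in = out * Q" then writes back 0 as well, making the compare redundant work. A scalar sketch of that equivalence, assuming QFIX = 17 and the zthresh_ derivation from the C quantizer in the encoder sources (QuantizeLevelSketch is an illustrative name):

#include <stdint.h>

#define QFIX 17  // fixed-point precision of the scalar quantizer (assumed)

static int QuantizeLevelSketch(uint32_t coeff /* |in| + sharpen */,
                               uint32_t iQ, uint32_t bias) {
  // level == 0  <=>  coeff * iQ + bias < (1 << QFIX)
  //             <=>  coeff <= ((1 << QFIX) - 1 - bias) / iQ   (== zthresh_)
  // so masking "coeff <= zthresh_" to zero duplicated what the shift does.
  // (The real code additionally clamps the level and restores the sign.)
  return (int)((coeff * iQ + bias) >> QFIX);
}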
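
Finally, the new QuantizeBlockWHTSSE2 is a thin wrapper that pins the start index n to 0, and the entry point now registers it as VP8EncQuantizeBlockWHT, giving the WHT (DC) block an SSE2 quantizer of its own. A sketch of the caller-side shape; the typedef below is my assumption about the hook's declaration, not text quoted from dsp.h:

#include <stdint.h>
#include "../enc/vp8enci.h"  // for VP8Matrix

/* Assumed shape of the new hook; the real declaration lives in the
 * encoder's headers. */
typedef int (*VP8QuantizeBlockWHTFn)(int16_t in[16], int16_t out[16],
                                     const VP8Matrix* const mtx);
extern VP8QuantizeBlockWHTFn VP8EncQuantizeBlockWHT;

/* Hypothetical call site: quantize the 4x4 block of DC coefficients that
 * VP8FTransformWHT produced, using the segment's Y2 matrix, and collect
 * the non-zero flag:
 *   const int nz = VP8EncQuantizeBlockWHT(dc_in, dc_out, &dqm->y2_);
 */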