| OLD | NEW |
| 1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // SSE2 version of speed-critical encoding functions. | 10 // SSE2 version of speed-critical encoding functions. |
| 11 // | 11 // |
| 12 // Author: Christian Duvivier (cduvivier@google.com) | 12 // Author: Christian Duvivier (cduvivier@google.com) |
| 13 | 13 |
| 14 #include "./dsp.h" | 14 #include "./dsp.h" |
| 15 | 15 |
| 16 #if defined(__cplusplus) || defined(c_plusplus) | |
| 17 extern "C" { | |
| 18 #endif | |
| 19 | |
| 20 #if defined(WEBP_USE_SSE2) | 16 #if defined(WEBP_USE_SSE2) |
| 21 #include <stdlib.h> // for abs() | 17 #include <stdlib.h> // for abs() |
| 22 #include <emmintrin.h> | 18 #include <emmintrin.h> |
| 23 | 19 |
| 24 #include "../enc/vp8enci.h" | 20 #include "../enc/vp8enci.h" |
| 25 | 21 |
| 26 //------------------------------------------------------------------------------ | 22 //------------------------------------------------------------------------------ |
| 27 // Quite useful macro for debugging. Left here for convenience. | 23 // Quite useful macro for debugging. Left here for convenience. |
| 28 | 24 |
| 29 #if 0 | 25 #if 0 |
| (...skipping 419 matching lines...) |
| 449 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); | 445 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); |
| 450 | 446 |
| 451 _mm_storel_epi64((__m128i*)&out[ 0], d0); | 447 _mm_storel_epi64((__m128i*)&out[ 0], d0); |
| 452 _mm_storel_epi64((__m128i*)&out[ 4], g1); | 448 _mm_storel_epi64((__m128i*)&out[ 4], g1); |
| 453 _mm_storel_epi64((__m128i*)&out[ 8], d2); | 449 _mm_storel_epi64((__m128i*)&out[ 8], d2); |
| 454 _mm_storel_epi64((__m128i*)&out[12], f3); | 450 _mm_storel_epi64((__m128i*)&out[12], f3); |
| 455 } | 451 } |
| 456 } | 452 } |
| 457 | 453 |
| 458 static void FTransformWHTSSE2(const int16_t* in, int16_t* out) { | 454 static void FTransformWHTSSE2(const int16_t* in, int16_t* out) { |
| 459 int16_t tmp[16]; | 455 int32_t tmp[16]; |
| 460 int i; | 456 int i; |
| 461 for (i = 0; i < 4; ++i, in += 64) { | 457 for (i = 0; i < 4; ++i, in += 64) { |
| 462 const int a0 = (in[0 * 16] + in[2 * 16]); | 458 const int a0 = (in[0 * 16] + in[2 * 16]); |
| 463 const int a1 = (in[1 * 16] + in[3 * 16]); | 459 const int a1 = (in[1 * 16] + in[3 * 16]); |
| 464 const int a2 = (in[1 * 16] - in[3 * 16]); | 460 const int a2 = (in[1 * 16] - in[3 * 16]); |
| 465 const int a3 = (in[0 * 16] - in[2 * 16]); | 461 const int a3 = (in[0 * 16] - in[2 * 16]); |
| 466 tmp[0 + i * 4] = a0 + a1; | 462 tmp[0 + i * 4] = a0 + a1; |
| 467 tmp[1 + i * 4] = a3 + a2; | 463 tmp[1 + i * 4] = a3 + a2; |
| 468 tmp[2 + i * 4] = a3 - a2; | 464 tmp[2 + i * 4] = a3 - a2; |
| 469 tmp[3 + i * 4] = a0 - a1; | 465 tmp[3 + i * 4] = a0 - a1; |
| 470 } | 466 } |
| 471 { | 467 { |
| 472 const __m128i src0 = _mm_loadl_epi64((__m128i*)&tmp[0]); | 468 const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]); |
| 473 const __m128i src1 = _mm_loadl_epi64((__m128i*)&tmp[4]); | 469 const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]); |
| 474 const __m128i src2 = _mm_loadl_epi64((__m128i*)&tmp[8]); | 470 const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]); |
| 475 const __m128i src3 = _mm_loadl_epi64((__m128i*)&tmp[12]); | 471 const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]); |
| 476 const __m128i a0 = _mm_add_epi16(src0, src2); | 472 const __m128i a0 = _mm_add_epi32(src0, src2); |
| 477 const __m128i a1 = _mm_add_epi16(src1, src3); | 473 const __m128i a1 = _mm_add_epi32(src1, src3); |
| 478 const __m128i a2 = _mm_sub_epi16(src1, src3); | 474 const __m128i a2 = _mm_sub_epi32(src1, src3); |
| 479 const __m128i a3 = _mm_sub_epi16(src0, src2); | 475 const __m128i a3 = _mm_sub_epi32(src0, src2); |
| 480 const __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(a0, a1), 1); | 476 const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1); |
| 481 const __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(a3, a2), 1); | 477 const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1); |
| 482 const __m128i b2 = _mm_srai_epi16(_mm_subs_epi16(a3, a2), 1); | 478 const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1); |
| 483 const __m128i b3 = _mm_srai_epi16(_mm_subs_epi16(a0, a1), 1); | 479 const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1); |
| 484 _mm_storel_epi64((__m128i*)&out[ 0], b0); | 480 const __m128i out0 = _mm_packs_epi32(b0, b1); |
| 485 _mm_storel_epi64((__m128i*)&out[ 4], b1); | 481 const __m128i out1 = _mm_packs_epi32(b2, b3); |
| 486 _mm_storel_epi64((__m128i*)&out[ 8], b2); | 482 _mm_storeu_si128((__m128i*)&out[0], out0); |
| 487 _mm_storel_epi64((__m128i*)&out[12], b3); | 483 _mm_storeu_si128((__m128i*)&out[8], out1); |
| 488 } | 484 } |
| 489 } | 485 } |
| 490 | 486 |
| 491 //------------------------------------------------------------------------------ | 487 //------------------------------------------------------------------------------ |
| 492 // Metric | 488 // Metric |
| 493 | 489 |
| 494 static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b, | 490 static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b, |
| 495 int num_quads, int do_16) { | 491 int num_quads, int do_16) { |
| 496 const __m128i zero = _mm_setzero_si128(); | 492 const __m128i zero = _mm_setzero_si128(); |
| 497 __m128i sum1 = zero; | 493 __m128i sum1 = zero; |
| (...skipping 139 matching lines...) |
| 637 | 633 |
| 638 // Hadamard transform | 634 // Hadamard transform |
| 639 // Returns the difference between the weighted sum of the absolute value of | 635 // Returns the difference between the weighted sum of the absolute value of |
| 640 // transformed coefficients. | 636 // transformed coefficients. |
| 641 static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, | 637 static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, |
| 642 const uint16_t* const w) { | 638 const uint16_t* const w) { |
| 643 int32_t sum[4]; | 639 int32_t sum[4]; |
| 644 __m128i tmp_0, tmp_1, tmp_2, tmp_3; | 640 __m128i tmp_0, tmp_1, tmp_2, tmp_3; |
| 645 const __m128i zero = _mm_setzero_si128(); | 641 const __m128i zero = _mm_setzero_si128(); |
| 646 | 642 |
| 647 // Load, combine and tranpose inputs. | 643 // Load, combine and transpose inputs. |
| 648 { | 644 { |
| 649 const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]); | 645 const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]); |
| 650 const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]); | 646 const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]); |
| 651 const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]); | 647 const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]); |
| 652 const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]); | 648 const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]); |
| 653 const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]); | 649 const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]); |
| 654 const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]); | 650 const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]); |
| 655 const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]); | 651 const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]); |
| 656 const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]); | 652 const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]); |
| 657 | 653 |
| (...skipping 165 matching lines...) |
| 823 __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); | 819 __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); |
| 824 __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); | 820 __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); |
| 825 const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); | 821 const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); |
| 826 const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); | 822 const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); |
| 827 const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); | 823 const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); |
| 828 const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); | 824 const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); |
| 829 const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); | 825 const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); |
| 830 const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); | 826 const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); |
| 831 const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); | 827 const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); |
| 832 const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); | 828 const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); |
| 833 const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]); | |
| 834 const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]); | |
| 835 | 829 |
| 836 // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) | 830 // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) |
| 837 const __m128i sign0 = _mm_srai_epi16(in0, 15); | 831 const __m128i sign0 = _mm_srai_epi16(in0, 15); |
| 838 const __m128i sign8 = _mm_srai_epi16(in8, 15); | 832 const __m128i sign8 = _mm_srai_epi16(in8, 15); |
| 839 | 833 |
| 840 // coeff = abs(in) = (in ^ sign) - sign | 834 // coeff = abs(in) = (in ^ sign) - sign |
| 841 coeff0 = _mm_xor_si128(in0, sign0); | 835 coeff0 = _mm_xor_si128(in0, sign0); |
| 842 coeff8 = _mm_xor_si128(in8, sign8); | 836 coeff8 = _mm_xor_si128(in8, sign8); |
| 843 coeff0 = _mm_sub_epi16(coeff0, sign0); | 837 coeff0 = _mm_sub_epi16(coeff0, sign0); |
| 844 coeff8 = _mm_sub_epi16(coeff8, sign8); | 838 coeff8 = _mm_sub_epi16(coeff8, sign8); |
| (...skipping 42 matching lines...) |
| 887 // get sign back (if (sign[j]) out_n = -out_n) | 881 // get sign back (if (sign[j]) out_n = -out_n) |
| 888 out0 = _mm_xor_si128(out0, sign0); | 882 out0 = _mm_xor_si128(out0, sign0); |
| 889 out8 = _mm_xor_si128(out8, sign8); | 883 out8 = _mm_xor_si128(out8, sign8); |
| 890 out0 = _mm_sub_epi16(out0, sign0); | 884 out0 = _mm_sub_epi16(out0, sign0); |
| 891 out8 = _mm_sub_epi16(out8, sign8); | 885 out8 = _mm_sub_epi16(out8, sign8); |
| 892 | 886 |
| 893 // in = out * Q | 887 // in = out * Q |
| 894 in0 = _mm_mullo_epi16(out0, q0); | 888 in0 = _mm_mullo_epi16(out0, q0); |
| 895 in8 = _mm_mullo_epi16(out8, q8); | 889 in8 = _mm_mullo_epi16(out8, q8); |
| 896 | 890 |
| 897 // if (coeff <= mtx->zthresh_) {in=0; out=0;} | 891 _mm_storeu_si128((__m128i*)&in[0], in0); |
| 898 { | 892 _mm_storeu_si128((__m128i*)&in[8], in8); |
| 899 __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0); | |
| 900 __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8); | |
| 901 in0 = _mm_and_si128(in0, cmp0); | |
| 902 in8 = _mm_and_si128(in8, cmp8); | |
| 903 _mm_storeu_si128((__m128i*)&in[0], in0); | |
| 904 _mm_storeu_si128((__m128i*)&in[8], in8); | |
| 905 out0 = _mm_and_si128(out0, cmp0); | |
| 906 out8 = _mm_and_si128(out8, cmp8); | |
| 907 } | |
| 908 | 893 |
| 909 // zigzag the output before storing it. | 894 // zigzag the output before storing it. |
| 910 // | 895 // |
| 911 // The zigzag pattern can almost be reproduced with a small sequence of | 896 // The zigzag pattern can almost be reproduced with a small sequence of |
| 912 // shuffles. After it, we only need to swap the 7th (ending up in third | 897 // shuffles. After it, we only need to swap the 7th (ending up in third |
| 913 // position instead of twelfth) and 8th values. | 898 // position instead of twelfth) and 8th values. |
| 914 { | 899 { |
| 915 __m128i outZ0, outZ8; | 900 __m128i outZ0, outZ8; |
| 916 outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); | 901 outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); |
| 917 outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); | 902 outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); |
| (...skipping 16 matching lines...) |
| 934 { | 919 { |
| 935 int32_t tmp[4]; | 920 int32_t tmp[4]; |
| 936 _mm_storeu_si128((__m128i*)tmp, packed_out); | 921 _mm_storeu_si128((__m128i*)tmp, packed_out); |
| 937 if (n) { | 922 if (n) { |
| 938 tmp[0] &= ~0xff; | 923 tmp[0] &= ~0xff; |
| 939 } | 924 } |
| 940 return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); | 925 return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); |
| 941 } | 926 } |
| 942 } | 927 } |
| 943 | 928 |
| 929 static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16], |
| 930 const VP8Matrix* const mtx) { |
| 931 return QuantizeBlockSSE2(in, out, 0, mtx); |
| 932 } |
| 933 |
| 944 #endif // WEBP_USE_SSE2 | 934 #endif // WEBP_USE_SSE2 |
| 945 | 935 |
| 946 //------------------------------------------------------------------------------ | 936 //------------------------------------------------------------------------------ |
| 947 // Entry point | 937 // Entry point |
| 948 | 938 |
| 949 extern void VP8EncDspInitSSE2(void); | 939 extern void VP8EncDspInitSSE2(void); |
| 950 | 940 |
| 951 void VP8EncDspInitSSE2(void) { | 941 void VP8EncDspInitSSE2(void) { |
| 952 #if defined(WEBP_USE_SSE2) | 942 #if defined(WEBP_USE_SSE2) |
| 953 VP8CollectHistogram = CollectHistogramSSE2; | 943 VP8CollectHistogram = CollectHistogramSSE2; |
| 954 VP8EncQuantizeBlock = QuantizeBlockSSE2; | 944 VP8EncQuantizeBlock = QuantizeBlockSSE2; |
| 945 VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2; |
| 955 VP8ITransform = ITransformSSE2; | 946 VP8ITransform = ITransformSSE2; |
| 956 VP8FTransform = FTransformSSE2; | 947 VP8FTransform = FTransformSSE2; |
| 957 VP8FTransformWHT = FTransformWHTSSE2; | 948 VP8FTransformWHT = FTransformWHTSSE2; |
| 958 VP8SSE16x16 = SSE16x16SSE2; | 949 VP8SSE16x16 = SSE16x16SSE2; |
| 959 VP8SSE16x8 = SSE16x8SSE2; | 950 VP8SSE16x8 = SSE16x8SSE2; |
| 960 VP8SSE8x8 = SSE8x8SSE2; | 951 VP8SSE8x8 = SSE8x8SSE2; |
| 961 VP8SSE4x4 = SSE4x4SSE2; | 952 VP8SSE4x4 = SSE4x4SSE2; |
| 962 VP8TDisto4x4 = Disto4x4SSE2; | 953 VP8TDisto4x4 = Disto4x4SSE2; |
| 963 VP8TDisto16x16 = Disto16x16SSE2; | 954 VP8TDisto16x16 = Disto16x16SSE2; |
| 964 #endif // WEBP_USE_SSE2 | 955 #endif // WEBP_USE_SSE2 |
| 965 } | 956 } |
| 966 | 957 |
| 967 #if defined(__cplusplus) || defined(c_plusplus) | |
| 968 } // extern "C" | |
| 969 #endif | |
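
The FTransformWHTSSE2 hunk widens the intermediate buffer from int16_t to int32_t and switches every butterfly to 32-bit arithmetic, narrowing only on the final store: the OLD code used saturating 16-bit adds (_mm_adds_epi16) mid-transform, silently clamping intermediates, while the NEW code defers all saturation to the closing _mm_packs_epi32(). A scalar model of the reworked transform, reconstructed from the NEW column as a sketch (FTransformWHT_C and ClipTo16 are illustrative names, not code from this change):

#include <stdint.h>

static int16_t ClipTo16(int32_t v) {  // mirrors the _mm_packs_epi32() saturation
  return (v < -32768) ? -32768 : (v > 32767) ? 32767 : (int16_t)v;
}

static void FTransformWHT_C(const int16_t* in, int16_t* out) {
  int32_t tmp[16];  // 32-bit intermediates: the butterfly sums can exceed int16_t
  int i;
  // Horizontal pass over the DC coefficients of the sixteen 4x4 sub-blocks
  // (stride 16 within a row of sub-blocks, +64 from one row to the next).
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = in[0 * 16] + in[2 * 16];
    const int a1 = in[1 * 16] + in[3 * 16];
    const int a2 = in[1 * 16] - in[3 * 16];
    const int a3 = in[0 * 16] - in[2 * 16];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // Vertical pass, kept in 32 bits; saturate only when writing out, exactly
  // where the SSE2 version packs b0..b3 back down to 16 bits. The >> 1 is an
  // arithmetic shift, matching _mm_srai_epi32().
  for (i = 0; i < 4; ++i) {
    const int32_t a0 = tmp[0 + i] + tmp[8 + i];
    const int32_t a1 = tmp[4 + i] + tmp[12 + i];
    const int32_t a2 = tmp[4 + i] - tmp[12 + i];
    const int32_t a3 = tmp[0 + i] - tmp[8 + i];
    out[ 0 + i] = ClipTo16((a0 + a1) >> 1);
    out[ 4 + i] = ClipTo16((a3 + a2) >> 1);
    out[ 8 + i] = ClipTo16((a3 - a2) >> 1);
    out[12 + i] = ClipTo16((a0 - a1) >> 1);
  }
}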
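
The QuantizeBlockSSE2 hunk drops the explicit "if (coeff <= mtx->zthresh_) {in=0; out=0;}" masking along with the two zthresh_ loads. My reading, hedged: in the scalar encoder, zthresh_ is derived as the largest biased coefficient that still quantizes to level 0, so the fixed-point multiply-and-shift already yields 0 for every coefficient the mask used to clear, and "in = out * Q" then writes back 0 as well, making the compare redundant work. A scalar sketch of that equivalence, assuming QFIX = 17 and the zthresh_ derivation from the C quantizer in the encoder sources (QuantizeLevelSketch is an illustrative name):

#include <stdint.h>

#define QFIX 17  // fixed-point precision of the scalar quantizer (assumed)

static int QuantizeLevelSketch(uint32_t coeff /* |in| + sharpen */,
                               uint32_t iQ, uint32_t bias) {
  // level == 0  <=>  coeff * iQ + bias < (1 << QFIX)
  //             <=>  coeff <= ((1 << QFIX) - 1 - bias) / iQ   (== zthresh_)
  // so masking "coeff <= zthresh_" to zero duplicated what the shift does.
  // (The real code additionally clamps the level and restores the sign.)
  return (int)((coeff * iQ + bias) >> QFIX);
}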
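
Finally, the new QuantizeBlockWHTSSE2 is a thin wrapper that pins the start index n to 0, and the entry point now registers it as VP8EncQuantizeBlockWHT, giving the WHT (DC) block an SSE2 quantizer of its own. A sketch of the caller-side shape; the typedef below is my assumption about the hook's declaration, not text quoted from dsp.h:

#include <stdint.h>
#include "../enc/vp8enci.h"  // for VP8Matrix

/* Assumed shape of the new hook; the real declaration lives in the
 * encoder's headers. */
typedef int (*VP8QuantizeBlockWHTFn)(int16_t in[16], int16_t out[16],
                                     const VP8Matrix* const mtx);
extern VP8QuantizeBlockWHTFn VP8EncQuantizeBlockWHT;

/* Hypothetical call site: quantize the 4x4 block of DC coefficients that
 * VP8FTransformWHT produced, using the segment's Y2 matrix, and collect
 * the non-zero flag:
 *   const int nz = VP8EncQuantizeBlockWHT(dc_in, dc_out, &dqm->y2_);
 */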