| OLD | NEW |
| 1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // SSE2 version of some decoding functions (idct, loop filtering). | 10 // SSE2 version of some decoding functions (idct, loop filtering). |
| 11 // | 11 // |
| 12 // Author: somnath@google.com (Somnath Banerjee) | 12 // Author: somnath@google.com (Somnath Banerjee) |
| 13 // cduvivier@google.com (Christian Duvivier) | 13 // cduvivier@google.com (Christian Duvivier) |
| 14 | 14 |
| 15 #include "./dsp.h" | 15 #include "./dsp.h" |
| 16 | 16 |
| 17 #if defined(WEBP_USE_SSE2) | 17 #if defined(WEBP_USE_SSE2) |
| 18 | 18 |
| 19 // The 3-coeff sparse transform in SSE2 does not seem to be faster than the | 19 // The 3-coeff sparse transform in SSE2 does not seem to be faster than the |
| 20 // plain-C one, so it is disabled by default. Uncomment the following to enable: | 20 // plain-C one, so it is disabled by default. Uncomment the following to enable: |
| 21 // #define USE_TRANSFORM_AC3 | 21 // #define USE_TRANSFORM_AC3 |
| 22 | 22 |
| 23 #include <emmintrin.h> | 23 #include <emmintrin.h> |
| 24 #include "./common_sse2.h" | 24 #include "./common_sse2.h" |
| 25 #include "../dec/vp8i.h" | 25 #include "../dec/vp8i_dec.h" |
| 26 #include "../utils/utils.h" | 26 #include "../utils/utils.h" |
| 27 | 27 |
| 28 //------------------------------------------------------------------------------ | 28 //------------------------------------------------------------------------------ |
| 29 // Transforms (Paragraph 14.4) | 29 // Transforms (Paragraph 14.4) |
| 30 | 30 |
| 31 static void Transform(const int16_t* in, uint8_t* dst, int do_two) { | 31 static void Transform(const int16_t* in, uint8_t* dst, int do_two) { |
| 32 // This implementation makes use of 16-bit fixed point versions of two | 32 // This implementation makes use of 16-bit fixed point versions of two |
| 33 // multiply constants: | 33 // multiply constants: |
| 34 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 | 34 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 |
| 35 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 | 35 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 |
| (...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 133 const __m128i tmp1 = _mm_add_epi16(b, c); | 133 const __m128i tmp1 = _mm_add_epi16(b, c); |
| 134 const __m128i tmp2 = _mm_sub_epi16(b, c); | 134 const __m128i tmp2 = _mm_sub_epi16(b, c); |
| 135 const __m128i tmp3 = _mm_sub_epi16(a, d); | 135 const __m128i tmp3 = _mm_sub_epi16(a, d); |
| 136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); | 136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); |
| 137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); | 137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); |
| 138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); | 138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); |
| 139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); | 139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); |
| 140 | 140 |
| 141 // Transpose the two 4x4. | 141 // Transpose the two 4x4. |
| 142 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, | 142 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, |
| 143 &T2, &T3); | 143 &T2, &T3); |
| 144 } | 144 } |
| 145 | 145 |
| 146 // Add inverse transform to 'dst' and store. | 146 // Add inverse transform to 'dst' and store. |
| 147 { | 147 { |
| 148 const __m128i zero = _mm_setzero_si128(); | 148 const __m128i zero = _mm_setzero_si128(); |
| 149 // Load the reference(s). | 149 // Load the reference(s). |
| 150 __m128i dst0, dst1, dst2, dst3; | 150 __m128i dst0, dst1, dst2, dst3; |
| 151 if (do_two) { | 151 if (do_two) { |
| 152 // Load eight bytes/pixels per line. | 152 // Load eight bytes/pixels per line. |
| 153 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS)); | 153 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS)); |
| (...skipping 1068 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1222 VP8PredChroma8[4] = DC8uvNoTop; | 1222 VP8PredChroma8[4] = DC8uvNoTop; |
| 1223 VP8PredChroma8[5] = DC8uvNoLeft; | 1223 VP8PredChroma8[5] = DC8uvNoLeft; |
| 1224 VP8PredChroma8[6] = DC8uvNoTopLeft; | 1224 VP8PredChroma8[6] = DC8uvNoTopLeft; |
| 1225 } | 1225 } |
| 1226 | 1226 |
| 1227 #else // !WEBP_USE_SSE2 | 1227 #else // !WEBP_USE_SSE2 |
| 1228 | 1228 |
| 1229 WEBP_DSP_INIT_STUB(VP8DspInitSSE2) | 1229 WEBP_DSP_INIT_STUB(VP8DspInitSSE2) |
| 1230 | 1230 |
| 1231 #endif // WEBP_USE_SSE2 | 1231 #endif // WEBP_USE_SSE2 |
| OLD | NEW |