OLD | NEW |
1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
9 // | 9 // |
10 // SSE2 version of some decoding functions (idct, loop filtering). | 10 // SSE2 version of some decoding functions (idct, loop filtering). |
11 // | 11 // |
12 // Author: somnath@google.com (Somnath Banerjee) | 12 // Author: somnath@google.com (Somnath Banerjee) |
13 // cduvivier@google.com (Christian Duvivier) | 13 // cduvivier@google.com (Christian Duvivier) |
14 | 14 |
15 #include "./dsp.h" | 15 #include "./dsp.h" |
16 | 16 |
17 #if defined(WEBP_USE_SSE2) | 17 #if defined(WEBP_USE_SSE2) |
18 | 18 |
19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C | 19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C |
20 // one it seems => disable it by default. Uncomment the following to enable: | 20 // one it seems => disable it by default. Uncomment the following to enable: |
21 // #define USE_TRANSFORM_AC3 | 21 // #define USE_TRANSFORM_AC3 |
22 | 22 |
23 #include <emmintrin.h> | 23 #include <emmintrin.h> |
| 24 #include "./common_sse2.h" |
24 #include "../dec/vp8i.h" | 25 #include "../dec/vp8i.h" |
| 26 #include "../utils/utils.h" |
25 | 27 |
26 //------------------------------------------------------------------------------ | 28 //------------------------------------------------------------------------------ |
27 // Transforms (Paragraph 14.4) | 29 // Transforms (Paragraph 14.4) |
28 | 30 |
29 static void Transform(const int16_t* in, uint8_t* dst, int do_two) { | 31 static void Transform(const int16_t* in, uint8_t* dst, int do_two) { |
30 // This implementation makes use of 16-bit fixed point versions of two | 32 // This implementation makes use of 16-bit fixed point versions of two |
31 // multiply constants: | 33 // multiply constants: |
32 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 | 34 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 |
33 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 | 35 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 |
34 // | 36 // |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
95 const __m128i d4 = _mm_add_epi16(d1, d2); | 97 const __m128i d4 = _mm_add_epi16(d1, d2); |
96 const __m128i d = _mm_add_epi16(d3, d4); | 98 const __m128i d = _mm_add_epi16(d3, d4); |
97 | 99 |
98 // Second pass. | 100 // Second pass. |
99 const __m128i tmp0 = _mm_add_epi16(a, d); | 101 const __m128i tmp0 = _mm_add_epi16(a, d); |
100 const __m128i tmp1 = _mm_add_epi16(b, c); | 102 const __m128i tmp1 = _mm_add_epi16(b, c); |
101 const __m128i tmp2 = _mm_sub_epi16(b, c); | 103 const __m128i tmp2 = _mm_sub_epi16(b, c); |
102 const __m128i tmp3 = _mm_sub_epi16(a, d); | 104 const __m128i tmp3 = _mm_sub_epi16(a, d); |
103 | 105 |
104 // Transpose the two 4x4. | 106 // Transpose the two 4x4. |
105 // a00 a01 a02 a03 b00 b01 b02 b03 | 107 VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3); |
106 // a10 a11 a12 a13 b10 b11 b12 b13 | |
107 // a20 a21 a22 a23 b20 b21 b22 b23 | |
108 // a30 a31 a32 a33 b30 b31 b32 b33 | |
109 const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); | |
110 const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); | |
111 const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); | |
112 const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); | |
113 // a00 a10 a01 a11 a02 a12 a03 a13 | |
114 // a20 a30 a21 a31 a22 a32 a23 a33 | |
115 // b00 b10 b01 b11 b02 b12 b03 b13 | |
116 // b20 b30 b21 b31 b22 b32 b23 b33 | |
117 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); | |
118 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); | |
119 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); | |
120 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); | |
121 // a00 a10 a20 a30 a01 a11 a21 a31 | |
122 // b00 b10 b20 b30 b01 b11 b21 b31 | |
123 // a02 a12 a22 a32 a03 a13 a23 a33 | |
124 // b02 b12 b22 b32 b03 b13 b23 b33 | |
125 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); | |
126 T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); | |
127 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); | |
128 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); | |
129 // a00 a10 a20 a30 b00 b10 b20 b30 | |
130 // a01 a11 a21 a31 b01 b11 b21 b31 | |
131 // a02 a12 a22 a32 b02 b12 b22 b32 | |
132 // a03 a13 a23 a33 b03 b13 b23 b33 | |
133 } | 108 } |
134 | 109 |
135 // Horizontal pass and subsequent transpose. | 110 // Horizontal pass and subsequent transpose. |
136 { | 111 { |
137 // First pass, c and d calculations are longer because of the "trick" | 112 // First pass, c and d calculations are longer because of the "trick" |
138 // multiplications. | 113 // multiplications. |
139 const __m128i four = _mm_set1_epi16(4); | 114 const __m128i four = _mm_set1_epi16(4); |
140 const __m128i dc = _mm_add_epi16(T0, four); | 115 const __m128i dc = _mm_add_epi16(T0, four); |
141 const __m128i a = _mm_add_epi16(dc, T2); | 116 const __m128i a = _mm_add_epi16(dc, T2); |
142 const __m128i b = _mm_sub_epi16(dc, T2); | 117 const __m128i b = _mm_sub_epi16(dc, T2); |
(...skipping 14 matching lines...) Expand all Loading... |
157 const __m128i tmp0 = _mm_add_epi16(a, d); | 132 const __m128i tmp0 = _mm_add_epi16(a, d); |
158 const __m128i tmp1 = _mm_add_epi16(b, c); | 133 const __m128i tmp1 = _mm_add_epi16(b, c); |
159 const __m128i tmp2 = _mm_sub_epi16(b, c); | 134 const __m128i tmp2 = _mm_sub_epi16(b, c); |
160 const __m128i tmp3 = _mm_sub_epi16(a, d); | 135 const __m128i tmp3 = _mm_sub_epi16(a, d); |
161 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); | 136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); |
162 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); | 137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); |
163 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); | 138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); |
164 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); | 139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); |
165 | 140 |
166 // Transpose the two 4x4. | 141 // Transpose the two 4x4. |
167 // a00 a01 a02 a03 b00 b01 b02 b03 | 142 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, |
168 // a10 a11 a12 a13 b10 b11 b12 b13 | 143 &T2, &T3); |
169 // a20 a21 a22 a23 b20 b21 b22 b23 | |
170 // a30 a31 a32 a33 b30 b31 b32 b33 | |
171 const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); | |
172 const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); | |
173 const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); | |
174 const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); | |
175 // a00 a10 a01 a11 a02 a12 a03 a13 | |
176 // a20 a30 a21 a31 a22 a32 a23 a33 | |
177 // b00 b10 b01 b11 b02 b12 b03 b13 | |
178 // b20 b30 b21 b31 b22 b32 b23 b33 | |
179 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); | |
180 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); | |
181 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); | |
182 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); | |
183 // a00 a10 a20 a30 a01 a11 a21 a31 | |
184 // b00 b10 b20 b30 b01 b11 b21 b31 | |
185 // a02 a12 a22 a32 a03 a13 a23 a33 | |
186 // b02 b12 b22 b32 b03 b13 b23 b33 | |
187 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); | |
188 T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); | |
189 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); | |
190 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); | |
191 // a00 a10 a20 a30 b00 b10 b20 b30 | |
192 // a01 a11 a21 a31 b01 b11 b21 b31 | |
193 // a02 a12 a22 a32 b02 b12 b22 b32 | |
194 // a03 a13 a23 a33 b03 b13 b23 b33 | |
195 } | 144 } |
196 | 145 |
197 // Add inverse transform to 'dst' and store. | 146 // Add inverse transform to 'dst' and store. |
198 { | 147 { |
199 const __m128i zero = _mm_setzero_si128(); | 148 const __m128i zero = _mm_setzero_si128(); |
200 // Load the reference(s). | 149 // Load the reference(s). |
201 __m128i dst0, dst1, dst2, dst3; | 150 __m128i dst0, dst1, dst2, dst3; |
202 if (do_two) { | 151 if (do_two) { |
203 // Load eight bytes/pixels per line. | 152 // Load eight bytes/pixels per line. |
204 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS)); | 153 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS)); |
(...skipping 1068 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1273 VP8PredChroma8[4] = DC8uvNoTop; | 1222 VP8PredChroma8[4] = DC8uvNoTop; |
1274 VP8PredChroma8[5] = DC8uvNoLeft; | 1223 VP8PredChroma8[5] = DC8uvNoLeft; |
1275 VP8PredChroma8[6] = DC8uvNoTopLeft; | 1224 VP8PredChroma8[6] = DC8uvNoTopLeft; |
1276 } | 1225 } |
1277 | 1226 |
1278 #else // !WEBP_USE_SSE2 | 1227 #else // !WEBP_USE_SSE2 |
1279 | 1228 |
1280 WEBP_DSP_INIT_STUB(VP8DspInitSSE2) | 1229 WEBP_DSP_INIT_STUB(VP8DspInitSSE2) |
1281 | 1230 |
1282 #endif // WEBP_USE_SSE2 | 1231 #endif // WEBP_USE_SSE2 |
OLD | NEW |