OLD | NEW |
1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
9 // | 9 // |
10 // SSE2 version of some decoding functions (idct, loop filtering). | 10 // SSE2 version of some decoding functions (idct, loop filtering). |
11 // | 11 // |
12 // Author: somnath@google.com (Somnath Banerjee) | 12 // Author: somnath@google.com (Somnath Banerjee) |
13 // cduvivier@google.com (Christian Duvivier) | 13 // cduvivier@google.com (Christian Duvivier) |
14 | 14 |
15 #include "./dsp.h" | 15 #include "./dsp.h" |
16 | 16 |
17 #if defined(__cplusplus) || defined(c_plusplus) | 17 #if defined(WEBP_USE_SSE2) |
18 extern "C" { | |
19 #endif | |
20 | 18 |
21 #if defined(WEBP_USE_SSE2) | 19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C |
| 20 // one it seems => disable it by default. Uncomment the following to enable: |
| 21 // #define USE_TRANSFORM_AC3 |
22 | 22 |
#include <emmintrin.h>
#include <string.h>  // memcpy()
#include "../dec/vp8i.h"
25 | 25 |
26 //------------------------------------------------------------------------------ | 26 //------------------------------------------------------------------------------ |
27 // Transforms (Paragraph 14.4) | 27 // Transforms (Paragraph 14.4) |
28 | 28 |
29 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { | 29 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { |
30 // This implementation makes use of 16-bit fixed point versions of two | 30 // This implementation makes use of 16-bit fixed point versions of two |
31 // multiply constants: | 31 // multiply constants: |
(...skipping 162 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
194 // a03 a13 a23 a33 b03 b13 b23 b33 | 194 // a03 a13 a23 a33 b03 b13 b23 b33 |
195 } | 195 } |
196 | 196 |
197 // Add inverse transform to 'dst' and store. | 197 // Add inverse transform to 'dst' and store. |
198 { | 198 { |
199 const __m128i zero = _mm_setzero_si128(); | 199 const __m128i zero = _mm_setzero_si128(); |
200 // Load the reference(s). | 200 // Load the reference(s). |
201 __m128i dst0, dst1, dst2, dst3; | 201 __m128i dst0, dst1, dst2, dst3; |
202 if (do_two) { | 202 if (do_two) { |
203 // Load eight bytes/pixels per line. | 203 // Load eight bytes/pixels per line. |
204 dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); | 204 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS)); |
205 dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); | 205 dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS)); |
206 dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); | 206 dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS)); |
207 dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); | 207 dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS)); |
208 } else { | 208 } else { |
209 // Load four bytes/pixels per line. | 209 // Load four bytes/pixels per line. |
210 dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]); | 210 dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS)); |
211 dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]); | 211 dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS)); |
212 dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]); | 212 dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS)); |
213 dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]); | 213 dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS)); |
214 } | 214 } |
215 // Convert to 16b. | 215 // Convert to 16b. |
216 dst0 = _mm_unpacklo_epi8(dst0, zero); | 216 dst0 = _mm_unpacklo_epi8(dst0, zero); |
217 dst1 = _mm_unpacklo_epi8(dst1, zero); | 217 dst1 = _mm_unpacklo_epi8(dst1, zero); |
218 dst2 = _mm_unpacklo_epi8(dst2, zero); | 218 dst2 = _mm_unpacklo_epi8(dst2, zero); |
219 dst3 = _mm_unpacklo_epi8(dst3, zero); | 219 dst3 = _mm_unpacklo_epi8(dst3, zero); |
220 // Add the inverse transform(s). | 220 // Add the inverse transform(s). |
221 dst0 = _mm_add_epi16(dst0, T0); | 221 dst0 = _mm_add_epi16(dst0, T0); |
222 dst1 = _mm_add_epi16(dst1, T1); | 222 dst1 = _mm_add_epi16(dst1, T1); |
223 dst2 = _mm_add_epi16(dst2, T2); | 223 dst2 = _mm_add_epi16(dst2, T2); |
224 dst3 = _mm_add_epi16(dst3, T3); | 224 dst3 = _mm_add_epi16(dst3, T3); |
225 // Unsigned saturate to 8b. | 225 // Unsigned saturate to 8b. |
226 dst0 = _mm_packus_epi16(dst0, dst0); | 226 dst0 = _mm_packus_epi16(dst0, dst0); |
227 dst1 = _mm_packus_epi16(dst1, dst1); | 227 dst1 = _mm_packus_epi16(dst1, dst1); |
228 dst2 = _mm_packus_epi16(dst2, dst2); | 228 dst2 = _mm_packus_epi16(dst2, dst2); |
229 dst3 = _mm_packus_epi16(dst3, dst3); | 229 dst3 = _mm_packus_epi16(dst3, dst3); |
230 // Store the results. | 230 // Store the results. |
231 if (do_two) { | 231 if (do_two) { |
232 // Store eight bytes/pixels per line. | 232 // Store eight bytes/pixels per line. |
233 _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0); | 233 _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0); |
234 _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1); | 234 _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1); |
235 _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2); | 235 _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2); |
236 _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3); | 236 _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3); |
237 } else { | 237 } else { |
238 // Store four bytes/pixels per line. | 238 // Store four bytes/pixels per line. |
239 *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0); | 239 *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0); |
240 *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1); | 240 *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1); |
241 *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2); | 241 *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2); |
242 *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3); | 242 *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3); |
243 } | 243 } |
244 } | 244 } |
245 } | 245 } |
246 | 246 |
#if defined(USE_TRANSFORM_AC3)
// Fixed-point multiply: the constants used below carry 16 fractional bits, so
// the product is shifted right by 16 to keep only the integer part.
#define MUL(a, b) (((a) * (b)) >> 16)

// Reads sizeof(int) bytes starting at 'src' into the low lane of a register;
// the remaining lanes are zero. memcpy() is used rather than an int* cast so
// the access is valid for any alignment and respects the aliasing rules.
static __m128i LoadRowAC3(const uint8_t* src) {
  int bits;
  memcpy(&bits, src, sizeof(bits));
  return _mm_cvtsi32_si128(bits);
}

// Writes the low sizeof(int) bytes of 'row' to 'dst' (memcpy() for the same
// reason as in LoadRowAC3()).
static void StoreRowAC3(uint8_t* dst, __m128i row) {
  const int bits = _mm_cvtsi128_si32(row);
  memcpy(dst, &bits, sizeof(bits));
}

// Sparse inverse transform: only in[0], in[1] and in[4] are read. The result
// is added to the four bytes at the start of each of the four rows
// dst + {0,1,2,3} * BPS, and every sum is saturated to [0, 255] before being
// written back in place.
static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
  static const int kC2 = 35468;
  // in[0] plus the rounding bias for the final '>> 3', in every lane.
  const __m128i A = _mm_set1_epi16(in[0] + 4);
  // Per-row terms derived from in[4] (identical in every lane).
  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
  // Per-column terms derived from in[1]: lanes 0..3 hold +d1, +c1, -c1, -d1.
  // Lanes 4..7 are zero; they never reach memory since only four bytes per
  // row are stored.
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
  const __m128i B = _mm_adds_epi16(A, CD);
  // One residual vector per output row.
  const __m128i m0 = _mm_adds_epi16(B, d4);
  const __m128i m1 = _mm_adds_epi16(B, c4);
  const __m128i m2 = _mm_subs_epi16(B, c4);
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
  __m128i dst0 = LoadRowAC3(dst + 0 * BPS);
  __m128i dst1 = LoadRowAC3(dst + 1 * BPS);
  __m128i dst2 = LoadRowAC3(dst + 2 * BPS);
  __m128i dst3 = LoadRowAC3(dst + 3 * BPS);
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
  dst2 = _mm_unpacklo_epi8(dst2, zero);
  dst3 = _mm_unpacklo_epi8(dst3, zero);
  // Add the inverse transform (arithmetic shift by 3 completes the rounding).
  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
  // Unsigned saturate to 8b.
  dst0 = _mm_packus_epi16(dst0, dst0);
  dst1 = _mm_packus_epi16(dst1, dst1);
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
  StoreRowAC3(dst + 0 * BPS, dst0);
  StoreRowAC3(dst + 1 * BPS, dst1);
  StoreRowAC3(dst + 2 * BPS, dst2);
  StoreRowAC3(dst + 3 * BPS, dst3);
}
#undef MUL
#endif  // USE_TRANSFORM_AC3
| 292 |
247 //------------------------------------------------------------------------------ | 293 //------------------------------------------------------------------------------ |
248 // Loop Filter (Paragraph 15) | 294 // Loop Filter (Paragraph 15) |
249 | 295 |
// Per-byte |p - q| for unsigned bytes. Each saturating subtraction clamps at
// zero, so for every byte at most one of the two differences is non-zero and
// OR-ing them yields the absolute difference.
#define MM_ABS(p, q)  _mm_or_si128(                                            \
    _mm_subs_epu8((p), (q)),                                                   \
    _mm_subs_epu8((q), (p)))
254 | 300 |
255 // Shift each byte of "a" by N bits while preserving the sign bit. | 301 // Shift each byte of "a" by N bits while preserving the sign bit. |
256 // | 302 // |
(...skipping 624 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
881 #endif // WEBP_USE_SSE2 | 927 #endif // WEBP_USE_SSE2 |
882 | 928 |
883 //------------------------------------------------------------------------------ | 929 //------------------------------------------------------------------------------ |
884 // Entry point | 930 // Entry point |
885 | 931 |
886 extern void VP8DspInitSSE2(void); | 932 extern void VP8DspInitSSE2(void); |
887 | 933 |
// Points the decoder's DSP hooks at the SSE2 implementations defined in this
// file. When WEBP_USE_SSE2 is not defined the body is empty and no hook is
// modified.
void VP8DspInitSSE2(void) {
#if defined(WEBP_USE_SSE2)
  // Inverse transforms.
  VP8Transform = TransformSSE2;
#if defined(USE_TRANSFORM_AC3)
  // Only installed when the sparse transform is explicitly enabled.
  VP8TransformAC3 = TransformAC3SSE2;
#endif

  // Regular loop filters, vertical variants.
  VP8VFilter16 = VFilter16SSE2;
  VP8VFilter8 = VFilter8SSE2;
  VP8VFilter16i = VFilter16iSSE2;
  VP8VFilter8i = VFilter8iSSE2;

  // Regular loop filters, horizontal variants.
  VP8HFilter16 = HFilter16SSE2;
  VP8HFilter8 = HFilter8SSE2;
  VP8HFilter16i = HFilter16iSSE2;
  VP8HFilter8i = HFilter8iSSE2;

  // Simple loop filters.
  VP8SimpleVFilter16 = SimpleVFilter16SSE2;
  VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
  VP8SimpleHFilter16 = SimpleHFilter16SSE2;
  VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
#endif   // WEBP_USE_SSE2
}
907 | 956 |
908 #if defined(__cplusplus) || defined(c_plusplus) | |
909 } // extern "C" | |
910 #endif | |
OLD | NEW |