OLD | NEW |
1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // This code is licensed under the same terms as WebM: | 3 // This code is licensed under the same terms as WebM: |
4 // Software License Agreement: http://www.webmproject.org/license/software/ | 4 // Software License Agreement: http://www.webmproject.org/license/software/ |
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ |
6 // ----------------------------------------------------------------------------- | 6 // ----------------------------------------------------------------------------- |
7 // | 7 // |
8 // SSE2 version of some decoding functions (idct, loop filtering). | 8 // SSE2 version of some decoding functions (idct, loop filtering). |
9 // | 9 // |
10 // Author: somnath@google.com (Somnath Banerjee) | 10 // Author: somnath@google.com (Somnath Banerjee) |
11 // cduvivier@google.com (Christian Duvivier) | 11 // cduvivier@google.com (Christian Duvivier) |
12 | 12 |
13 #include "./dsp.h" | 13 #include "./dsp.h" |
14 | 14 |
| 15 #if defined(__cplusplus) || defined(c_plusplus) |
| 16 extern "C" { |
| 17 #endif |
| 18 |
15 #if defined(WEBP_USE_SSE2) | 19 #if defined(WEBP_USE_SSE2) |
16 | 20 |
17 #include <emmintrin.h> | 21 #include <emmintrin.h> |
18 #include "../dec/vp8i.h" | 22 #include "../dec/vp8i.h" |
19 | 23 |
20 #if defined(__cplusplus) || defined(c_plusplus) | |
21 extern "C" { | |
22 #endif | |
23 | |
24 //------------------------------------------------------------------------------ | 24 //------------------------------------------------------------------------------ |
25 // Transforms (Paragraph 14.4) | 25 // Transforms (Paragraph 14.4) |
26 | 26 |
27 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { | 27 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { |
28 // This implementation makes use of 16-bit fixed point versions of two | 28 // This implementation makes use of 16-bit fixed point versions of two |
29 // multiply constants: | 29 // multiply constants: |
30 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 | 30 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 |
31 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 | 31 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 |
32 // | 32 // |
33 // To be able to use signed 16-bit integers, we use the following trick to | 33 // To be able to use signed 16-bit integers, we use the following trick to |
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
187 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); | 187 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); |
188 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); | 188 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); |
189 // a00 a10 a20 a30 b00 b10 b20 b30 | 189 // a00 a10 a20 a30 b00 b10 b20 b30 |
190 // a01 a11 a21 a31 b01 b11 b21 b31 | 190 // a01 a11 a21 a31 b01 b11 b21 b31 |
191 // a02 a12 a22 a32 b02 b12 b22 b32 | 191 // a02 a12 a22 a32 b02 b12 b22 b32 |
192 // a03 a13 a23 a33 b03 b13 b23 b33 | 192 // a03 a13 a23 a33 b03 b13 b23 b33 |
193 } | 193 } |
194 | 194 |
195 // Add inverse transform to 'dst' and store. | 195 // Add inverse transform to 'dst' and store. |
196 { | 196 { |
197 const __m128i zero = _mm_set1_epi16(0); | 197 const __m128i zero = _mm_setzero_si128(); |
198 // Load the reference(s). | 198 // Load the reference(s). |
199 __m128i dst0, dst1, dst2, dst3; | 199 __m128i dst0, dst1, dst2, dst3; |
200 if (do_two) { | 200 if (do_two) { |
201 // Load eight bytes/pixels per line. | 201 // Load eight bytes/pixels per line. |
202 dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); | 202 dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); |
203 dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); | 203 dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); |
204 dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); | 204 dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); |
205 dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); | 205 dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); |
206 } else { | 206 } else { |
207 // Load four bytes/pixels per line. | 207 // Load four bytes/pixels per line. |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
271 b = _mm_xor_si128(b, sign_bit); \ | 271 b = _mm_xor_si128(b, sign_bit); \ |
272 } | 272 } |
273 | 273 |
274 #define FLIP_SIGN_BIT4(a, b, c, d) { \ | 274 #define FLIP_SIGN_BIT4(a, b, c, d) { \ |
275 FLIP_SIGN_BIT2(a, b); \ | 275 FLIP_SIGN_BIT2(a, b); \ |
276 FLIP_SIGN_BIT2(c, d); \ | 276 FLIP_SIGN_BIT2(c, d); \ |
277 } | 277 } |
278 | 278 |
279 #define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) { \ | 279 #define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) { \ |
280 const __m128i zero = _mm_setzero_si128(); \ | 280 const __m128i zero = _mm_setzero_si128(); \ |
281 const __m128i t1 = MM_ABS(p1, p0); \ | 281 const __m128i t_1 = MM_ABS(p1, p0); \ |
282 const __m128i t2 = MM_ABS(q1, q0); \ | 282 const __m128i t_2 = MM_ABS(q1, q0); \ |
283 \ | 283 \ |
284 const __m128i h = _mm_set1_epi8(hev_thresh); \ | 284 const __m128i h = _mm_set1_epi8(hev_thresh); \ |
285 const __m128i t3 = _mm_subs_epu8(t1, h); /* abs(p1 - p0) - hev_thresh */ \ | 285 const __m128i t_3 = _mm_subs_epu8(t_1, h); /* abs(p1 - p0) - hev_thresh */ \ |
286 const __m128i t4 = _mm_subs_epu8(t2, h); /* abs(q1 - q0) - hev_thresh */ \ | 286 const __m128i t_4 = _mm_subs_epu8(t_2, h); /* abs(q1 - q0) - hev_thresh */ \ |
287 \ | 287 \ |
288 not_hev = _mm_or_si128(t3, t4); \ | 288 not_hev = _mm_or_si128(t_3, t_4); \ |
289 not_hev = _mm_cmpeq_epi8(not_hev, zero); /* not_hev <= t1 && not_hev <= t2 */\ | 289 not_hev = _mm_cmpeq_epi8(not_hev, zero); /* not_hev <= t1 && not_hev <= t2 */\ |
290 } | 290 } |
291 | 291 |
292 #define GET_BASE_DELTA(p1, p0, q0, q1, o) { \ | 292 #define GET_BASE_DELTA(p1, p0, q0, q1, o) { \ |
293 const __m128i qp0 = _mm_subs_epi8(q0, p0); /* q0 - p0 */ \ | 293 const __m128i qp0 = _mm_subs_epi8(q0, p0); /* q0 - p0 */ \ |
294 o = _mm_subs_epi8(p1, q1); /* p1 - q1 */ \ | 294 o = _mm_subs_epi8(p1, q1); /* p1 - q1 */ \ |
295 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 1 * (q0 - p0) */ \ | 295 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 1 * (q0 - p0) */ \ |
296 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 2 * (q0 - p0) */ \ | 296 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 2 * (q0 - p0) */ \ |
297 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 3 * (q0 - p0) */ \ | 297 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 3 * (q0 - p0) */ \ |
298 } | 298 } |
299 | 299 |
300 #define DO_SIMPLE_FILTER(p0, q0, fl) { \ | 300 #define DO_SIMPLE_FILTER(p0, q0, fl) { \ |
301 const __m128i three = _mm_set1_epi8(3); \ | 301 const __m128i three = _mm_set1_epi8(3); \ |
302 const __m128i four = _mm_set1_epi8(4); \ | 302 const __m128i four = _mm_set1_epi8(4); \ |
303 __m128i v3 = _mm_adds_epi8(fl, three); \ | 303 __m128i v3 = _mm_adds_epi8(fl, three); \ |
304 __m128i v4 = _mm_adds_epi8(fl, four); \ | 304 __m128i v4 = _mm_adds_epi8(fl, four); \ |
305 \ | 305 \ |
306 /* Do +4 side */ \ | 306 /* Do +4 side */ \ |
307 SIGNED_SHIFT_N(v4, 3); /* v4 >> 3 */ \ | 307 SIGNED_SHIFT_N(v4, 3); /* v4 >> 3 */ \ |
308 q0 = _mm_subs_epi8(q0, v4); /* q0 -= v4 */ \ | 308 q0 = _mm_subs_epi8(q0, v4); /* q0 -= v4 */ \ |
309 \ | 309 \ |
310 /* Now do +3 side */ \ | 310 /* Now do +3 side */ \ |
311 SIGNED_SHIFT_N(v3, 3); /* v3 >> 3 */ \ | 311 SIGNED_SHIFT_N(v3, 3); /* v3 >> 3 */ \ |
312 p0 = _mm_adds_epi8(p0, v3); /* p0 += v3 */ \ | 312 p0 = _mm_adds_epi8(p0, v3); /* p0 += v3 */ \ |
313 } | 313 } |
314 | 314 |
315 // Updates values of 2 pixels at MB edge during complex filtering. | 315 // Updates values of 2 pixels at MB edge during complex filtering. |
316 // Update operations: | 316 // Update operations: |
317 // q = q - a and p = p + a; where a = [(a_hi >> 7), (a_lo >> 7)] | 317 // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)] |
318 #define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) { \ | 318 #define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) { \ |
319 const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7); \ | 319 const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7); \ |
320 const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7); \ | 320 const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7); \ |
321 const __m128i a = _mm_packs_epi16(a_lo7, a_hi7); \ | 321 const __m128i delta = _mm_packs_epi16(a_lo7, a_hi7); \ |
322 pi = _mm_adds_epi8(pi, a); \ | 322 pi = _mm_adds_epi8(pi, delta); \ |
323 qi = _mm_subs_epi8(qi, a); \ | 323 qi = _mm_subs_epi8(qi, delta); \ |
324 } | 324 } |
325 | 325 |
326 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0, | 326 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0, |
327 const __m128i* q1, int thresh, __m128i *mask) { | 327 const __m128i* q1, int thresh, __m128i *mask) { |
328 __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1) | 328 __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1) |
329 *mask = _mm_set1_epi8(0xFE); | 329 *mask = _mm_set1_epi8(0xFE); |
330 t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero | 330 t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero |
331 t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2 | 331 t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2 |
332 | 332 |
333 *mask = MM_ABS(*p0, *q0); // abs(p0 - q0) | 333 *mask = MM_ABS(*p0, *q0); // abs(p0 - q0) |
(...skipping 535 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
869 MAX_DIFF2(t2, t1, q1, q0, mask); | 869 MAX_DIFF2(t2, t1, q1, q0, mask); |
870 | 870 |
871 COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask); | 871 COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask); |
872 DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh); | 872 DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh); |
873 | 873 |
874 u -= 2; // beginning of p1 | 874 u -= 2; // beginning of p1 |
875 v -= 2; | 875 v -= 2; |
876 Store16x4(u, v, stride, &p1, &p0, &q0, &q1); | 876 Store16x4(u, v, stride, &p1, &p0, &q0, &q1); |
877 } | 877 } |
878 | 878 |
| 879 #endif // WEBP_USE_SSE2 |
| 880 |
| 881 //------------------------------------------------------------------------------ |
| 882 // Entry point |
| 883 |
879 extern void VP8DspInitSSE2(void); | 884 extern void VP8DspInitSSE2(void); |
880 | 885 |
881 void VP8DspInitSSE2(void) { | 886 void VP8DspInitSSE2(void) { |
| 887 #if defined(WEBP_USE_SSE2) |
882 VP8Transform = TransformSSE2; | 888 VP8Transform = TransformSSE2; |
883 | 889 |
884 VP8VFilter16 = VFilter16SSE2; | 890 VP8VFilter16 = VFilter16SSE2; |
885 VP8HFilter16 = HFilter16SSE2; | 891 VP8HFilter16 = HFilter16SSE2; |
886 VP8VFilter8 = VFilter8SSE2; | 892 VP8VFilter8 = VFilter8SSE2; |
887 VP8HFilter8 = HFilter8SSE2; | 893 VP8HFilter8 = HFilter8SSE2; |
888 VP8VFilter16i = VFilter16iSSE2; | 894 VP8VFilter16i = VFilter16iSSE2; |
889 VP8HFilter16i = HFilter16iSSE2; | 895 VP8HFilter16i = HFilter16iSSE2; |
890 VP8VFilter8i = VFilter8iSSE2; | 896 VP8VFilter8i = VFilter8iSSE2; |
891 VP8HFilter8i = HFilter8iSSE2; | 897 VP8HFilter8i = HFilter8iSSE2; |
892 | 898 |
893 VP8SimpleVFilter16 = SimpleVFilter16SSE2; | 899 VP8SimpleVFilter16 = SimpleVFilter16SSE2; |
894 VP8SimpleHFilter16 = SimpleHFilter16SSE2; | 900 VP8SimpleHFilter16 = SimpleHFilter16SSE2; |
895 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; | 901 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; |
896 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; | 902 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; |
| 903 #endif // WEBP_USE_SSE2 |
897 } | 904 } |
898 | 905 |
899 #if defined(__cplusplus) || defined(c_plusplus) | 906 #if defined(__cplusplus) || defined(c_plusplus) |
900 } // extern "C" | 907 } // extern "C" |
901 #endif | 908 #endif |
902 | |
903 #endif // WEBP_USE_SSE2 | |
OLD | NEW |