Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(257)

Side by Side Diff: third_party/libwebp/dsp/dec_sse2.c

Issue 12942006: libwebp: update snapshot to v0.3.0-rc6 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: local webkit layout expectations Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 Google Inc. All Rights Reserved. 1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // 2 //
3 // This code is licensed under the same terms as WebM: 3 // This code is licensed under the same terms as WebM:
4 // Software License Agreement: http://www.webmproject.org/license/software/ 4 // Software License Agreement: http://www.webmproject.org/license/software/
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/
6 // ----------------------------------------------------------------------------- 6 // -----------------------------------------------------------------------------
7 // 7 //
8 // SSE2 version of some decoding functions (idct, loop filtering). 8 // SSE2 version of some decoding functions (idct, loop filtering).
9 // 9 //
10 // Author: somnath@google.com (Somnath Banerjee) 10 // Author: somnath@google.com (Somnath Banerjee)
11 // cduvivier@google.com (Christian Duvivier) 11 // cduvivier@google.com (Christian Duvivier)
12 12
13 #include "./dsp.h" 13 #include "./dsp.h"
14 14
15 #if defined(__cplusplus) || defined(c_plusplus)
16 extern "C" {
17 #endif
18
15 #if defined(WEBP_USE_SSE2) 19 #if defined(WEBP_USE_SSE2)
16 20
17 #include <emmintrin.h> 21 #include <emmintrin.h>
18 #include "../dec/vp8i.h" 22 #include "../dec/vp8i.h"
19 23
20 #if defined(__cplusplus) || defined(c_plusplus)
21 extern "C" {
22 #endif
23
24 //------------------------------------------------------------------------------ 24 //------------------------------------------------------------------------------
25 // Transforms (Paragraph 14.4) 25 // Transforms (Paragraph 14.4)
26 26
27 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { 27 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
28 // This implementation makes use of 16-bit fixed point versions of two 28 // This implementation makes use of 16-bit fixed point versions of two
29 // multiply constants: 29 // multiply constants:
30 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 30 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
31 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 31 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
32 // 32 //
33 // To be able to use signed 16-bit integers, we use the following trick to 33 // To be able to use signed 16-bit integers, we use the following trick to
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after
187 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); 187 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
188 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); 188 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
189 // a00 a10 a20 a30 b00 b10 b20 b30 189 // a00 a10 a20 a30 b00 b10 b20 b30
190 // a01 a11 a21 a31 b01 b11 b21 b31 190 // a01 a11 a21 a31 b01 b11 b21 b31
191 // a02 a12 a22 a32 b02 b12 b22 b32 191 // a02 a12 a22 a32 b02 b12 b22 b32
192 // a03 a13 a23 a33 b03 b13 b23 b33 192 // a03 a13 a23 a33 b03 b13 b23 b33
193 } 193 }
194 194
195 // Add inverse transform to 'dst' and store. 195 // Add inverse transform to 'dst' and store.
196 { 196 {
197 const __m128i zero = _mm_set1_epi16(0); 197 const __m128i zero = _mm_setzero_si128();
198 // Load the reference(s). 198 // Load the reference(s).
199 __m128i dst0, dst1, dst2, dst3; 199 __m128i dst0, dst1, dst2, dst3;
200 if (do_two) { 200 if (do_two) {
201 // Load eight bytes/pixels per line. 201 // Load eight bytes/pixels per line.
202 dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); 202 dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
203 dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); 203 dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
204 dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); 204 dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
205 dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); 205 dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
206 } else { 206 } else {
207 // Load four bytes/pixels per line. 207 // Load four bytes/pixels per line.
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
271 b = _mm_xor_si128(b, sign_bit); \ 271 b = _mm_xor_si128(b, sign_bit); \
272 } 272 }
273 273
274 #define FLIP_SIGN_BIT4(a, b, c, d) { \ 274 #define FLIP_SIGN_BIT4(a, b, c, d) { \
275 FLIP_SIGN_BIT2(a, b); \ 275 FLIP_SIGN_BIT2(a, b); \
276 FLIP_SIGN_BIT2(c, d); \ 276 FLIP_SIGN_BIT2(c, d); \
277 } 277 }
278 278
279 #define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) { \ 279 #define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) { \
280 const __m128i zero = _mm_setzero_si128(); \ 280 const __m128i zero = _mm_setzero_si128(); \
281 const __m128i t1 = MM_ABS(p1, p0); \ 281 const __m128i t_1 = MM_ABS(p1, p0); \
282 const __m128i t2 = MM_ABS(q1, q0); \ 282 const __m128i t_2 = MM_ABS(q1, q0); \
283 \ 283 \
284 const __m128i h = _mm_set1_epi8(hev_thresh); \ 284 const __m128i h = _mm_set1_epi8(hev_thresh); \
285 const __m128i t3 = _mm_subs_epu8(t1, h); /* abs(p1 - p0) - hev_thresh */ \ 285 const __m128i t_3 = _mm_subs_epu8(t_1, h); /* abs(p1 - p0) - hev_thresh */ \
286 const __m128i t4 = _mm_subs_epu8(t2, h); /* abs(q1 - q0) - hev_thresh */ \ 286 const __m128i t_4 = _mm_subs_epu8(t_2, h); /* abs(q1 - q0) - hev_thresh */ \
287 \ 287 \
288 not_hev = _mm_or_si128(t3, t4); \ 288 not_hev = _mm_or_si128(t_3, t_4); \
289 not_hev = _mm_cmpeq_epi8(not_hev, zero); /* abs(p1 - p0) <= hev_thresh && abs(q1 - q0) <= hev_thresh */ \ 289 not_hev = _mm_cmpeq_epi8(not_hev, zero); /* abs(p1 - p0) <= hev_thresh && abs(q1 - q0) <= hev_thresh */ \
290 } 290 }
291 291
292 #define GET_BASE_DELTA(p1, p0, q0, q1, o) { \ 292 #define GET_BASE_DELTA(p1, p0, q0, q1, o) { \
293 const __m128i qp0 = _mm_subs_epi8(q0, p0); /* q0 - p0 */ \ 293 const __m128i qp0 = _mm_subs_epi8(q0, p0); /* q0 - p0 */ \
294 o = _mm_subs_epi8(p1, q1); /* p1 - q1 */ \ 294 o = _mm_subs_epi8(p1, q1); /* p1 - q1 */ \
295 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 1 * (q0 - p0) */ \ 295 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 1 * (q0 - p0) */ \
296 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 2 * (q0 - p0) */ \ 296 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 2 * (q0 - p0) */ \
297 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 3 * (q0 - p0) */ \ 297 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 3 * (q0 - p0) */ \
298 } 298 }
299 299
300 #define DO_SIMPLE_FILTER(p0, q0, fl) { \ 300 #define DO_SIMPLE_FILTER(p0, q0, fl) { \
301 const __m128i three = _mm_set1_epi8(3); \ 301 const __m128i three = _mm_set1_epi8(3); \
302 const __m128i four = _mm_set1_epi8(4); \ 302 const __m128i four = _mm_set1_epi8(4); \
303 __m128i v3 = _mm_adds_epi8(fl, three); \ 303 __m128i v3 = _mm_adds_epi8(fl, three); \
304 __m128i v4 = _mm_adds_epi8(fl, four); \ 304 __m128i v4 = _mm_adds_epi8(fl, four); \
305 \ 305 \
306 /* Do +4 side */ \ 306 /* Do +4 side */ \
307 SIGNED_SHIFT_N(v4, 3); /* v4 >> 3 */ \ 307 SIGNED_SHIFT_N(v4, 3); /* v4 >> 3 */ \
308 q0 = _mm_subs_epi8(q0, v4); /* q0 -= v4 */ \ 308 q0 = _mm_subs_epi8(q0, v4); /* q0 -= v4 */ \
309 \ 309 \
310 /* Now do +3 side */ \ 310 /* Now do +3 side */ \
311 SIGNED_SHIFT_N(v3, 3); /* v3 >> 3 */ \ 311 SIGNED_SHIFT_N(v3, 3); /* v3 >> 3 */ \
312 p0 = _mm_adds_epi8(p0, v3); /* p0 += v3 */ \ 312 p0 = _mm_adds_epi8(p0, v3); /* p0 += v3 */ \
313 } 313 }
314 314
315 // Updates values of 2 pixels at MB edge during complex filtering. 315 // Updates values of 2 pixels at MB edge during complex filtering.
316 // Update operations: 316 // Update operations:
317 // q = q - a and p = p + a; where a = [(a_hi >> 7), (a_lo >> 7)] 317 // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
318 #define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) { \ 318 #define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) { \
319 const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7); \ 319 const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7); \
320 const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7); \ 320 const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7); \
321 const __m128i a = _mm_packs_epi16(a_lo7, a_hi7); \ 321 const __m128i delta = _mm_packs_epi16(a_lo7, a_hi7); \
322 pi = _mm_adds_epi8(pi, a); \ 322 pi = _mm_adds_epi8(pi, delta); \
323 qi = _mm_subs_epi8(qi, a); \ 323 qi = _mm_subs_epi8(qi, delta); \
324 } 324 }
325 325
326 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0, 326 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
327 const __m128i* q1, int thresh, __m128i *mask) { 327 const __m128i* q1, int thresh, __m128i *mask) {
328 __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1) 328 __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
329 *mask = _mm_set1_epi8(0xFE); 329 *mask = _mm_set1_epi8(0xFE);
330 t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero 330 t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero
331 t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2 331 t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2
332 332
333 *mask = MM_ABS(*p0, *q0); // abs(p0 - q0) 333 *mask = MM_ABS(*p0, *q0); // abs(p0 - q0)
(...skipping 535 matching lines...) Expand 10 before | Expand all | Expand 10 after
869 MAX_DIFF2(t2, t1, q1, q0, mask); 869 MAX_DIFF2(t2, t1, q1, q0, mask);
870 870
871 COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask); 871 COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
872 DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh); 872 DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
873 873
874 u -= 2; // beginning of p1 874 u -= 2; // beginning of p1
875 v -= 2; 875 v -= 2;
876 Store16x4(u, v, stride, &p1, &p0, &q0, &q1); 876 Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
877 } 877 }
878 878
879 #endif // WEBP_USE_SSE2
880
881 //------------------------------------------------------------------------------
882 // Entry point
883
879 extern void VP8DspInitSSE2(void); 884 extern void VP8DspInitSSE2(void);
880 885
881 void VP8DspInitSSE2(void) { 886 void VP8DspInitSSE2(void) {
887 #if defined(WEBP_USE_SSE2)
882 VP8Transform = TransformSSE2; 888 VP8Transform = TransformSSE2;
883 889
884 VP8VFilter16 = VFilter16SSE2; 890 VP8VFilter16 = VFilter16SSE2;
885 VP8HFilter16 = HFilter16SSE2; 891 VP8HFilter16 = HFilter16SSE2;
886 VP8VFilter8 = VFilter8SSE2; 892 VP8VFilter8 = VFilter8SSE2;
887 VP8HFilter8 = HFilter8SSE2; 893 VP8HFilter8 = HFilter8SSE2;
888 VP8VFilter16i = VFilter16iSSE2; 894 VP8VFilter16i = VFilter16iSSE2;
889 VP8HFilter16i = HFilter16iSSE2; 895 VP8HFilter16i = HFilter16iSSE2;
890 VP8VFilter8i = VFilter8iSSE2; 896 VP8VFilter8i = VFilter8iSSE2;
891 VP8HFilter8i = HFilter8iSSE2; 897 VP8HFilter8i = HFilter8iSSE2;
892 898
893 VP8SimpleVFilter16 = SimpleVFilter16SSE2; 899 VP8SimpleVFilter16 = SimpleVFilter16SSE2;
894 VP8SimpleHFilter16 = SimpleHFilter16SSE2; 900 VP8SimpleHFilter16 = SimpleHFilter16SSE2;
895 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; 901 VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
896 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; 902 VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
903 #endif // WEBP_USE_SSE2
897 } 904 }
898 905
899 #if defined(__cplusplus) || defined(c_plusplus) 906 #if defined(__cplusplus) || defined(c_plusplus)
900 } // extern "C" 907 } // extern "C"
901 #endif 908 #endif
902
903 #endif // WEBP_USE_SSE2
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698