OLD | NEW |
1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // This code is licensed under the same terms as WebM: | 3 // This code is licensed under the same terms as WebM: |
4 // Software License Agreement: http://www.webmproject.org/license/software/ | 4 // Software License Agreement: http://www.webmproject.org/license/software/ |
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ |
6 // ----------------------------------------------------------------------------- | 6 // ----------------------------------------------------------------------------- |
7 // | 7 // |
8 // SSE2 version of some decoding functions (idct, loop filtering). | 8 // SSE2 version of some decoding functions (idct, loop filtering). |
9 // | 9 // |
10 // Author: somnath@google.com (Somnath Banerjee) | 10 // Author: somnath@google.com (Somnath Banerjee) |
11 // cduvivier@google.com (Christian Duvivier) | 11 // cduvivier@google.com (Christian Duvivier) |
12 | 12 |
13 #include "./dsp.h" | 13 #include "./dsp.h" |
14 | 14 |
| 15 #if defined(__cplusplus) || defined(c_plusplus) |
| 16 extern "C" { |
| 17 #endif |
| 18 |
15 #if defined(WEBP_USE_SSE2) | 19 #if defined(WEBP_USE_SSE2) |
16 | 20 |
17 #include <emmintrin.h> | 21 #include <emmintrin.h> |
18 #include "../dec/vp8i.h" | 22 #include "../dec/vp8i.h" |
19 | 23 |
20 #if defined(__cplusplus) || defined(c_plusplus) | |
21 extern "C" { | |
22 #endif | |
23 | |
24 //------------------------------------------------------------------------------ | 24 //------------------------------------------------------------------------------ |
25 // Transforms (Paragraph 14.4) | 25 // Transforms (Paragraph 14.4) |
26 | 26 |
27 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { | 27 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { |
28 // This implementation makes use of 16-bit fixed point versions of two | 28 // This implementation makes use of 16-bit fixed point versions of two |
29 // multiply constants: | 29 // multiply constants: |
30 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 | 30 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 |
31 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 | 31 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 |
32 // | 32 // |
33 // To be able to use signed 16-bit integers, we use the following trick to | 33 // To be able to use signed 16-bit integers, we use the following trick to |
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
187 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); | 187 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); |
188 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); | 188 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); |
189 // a00 a10 a20 a30 b00 b10 b20 b30 | 189 // a00 a10 a20 a30 b00 b10 b20 b30 |
190 // a01 a11 a21 a31 b01 b11 b21 b31 | 190 // a01 a11 a21 a31 b01 b11 b21 b31 |
191 // a02 a12 a22 a32 b02 b12 b22 b32 | 191 // a02 a12 a22 a32 b02 b12 b22 b32 |
192 // a03 a13 a23 a33 b03 b13 b23 b33 | 192 // a03 a13 a23 a33 b03 b13 b23 b33 |
193 } | 193 } |
194 | 194 |
195 // Add inverse transform to 'dst' and store. | 195 // Add inverse transform to 'dst' and store. |
196 { | 196 { |
197 const __m128i zero = _mm_set1_epi16(0); | 197 const __m128i zero = _mm_setzero_si128(); |
198 // Load the reference(s). | 198 // Load the reference(s). |
199 __m128i dst0, dst1, dst2, dst3; | 199 __m128i dst0, dst1, dst2, dst3; |
200 if (do_two) { | 200 if (do_two) { |
201 // Load eight bytes/pixels per line. | 201 // Load eight bytes/pixels per line. |
202 dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); | 202 dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); |
203 dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); | 203 dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); |
204 dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); | 204 dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); |
205 dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); | 205 dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); |
206 } else { | 206 } else { |
207 // Load four bytes/pixels per line. | 207 // Load four bytes/pixels per line. |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
271 b = _mm_xor_si128(b, sign_bit); \ | 271 b = _mm_xor_si128(b, sign_bit); \ |
272 } | 272 } |
273 | 273 |
274 #define FLIP_SIGN_BIT4(a, b, c, d) { \ | 274 #define FLIP_SIGN_BIT4(a, b, c, d) { \ |
275 FLIP_SIGN_BIT2(a, b); \ | 275 FLIP_SIGN_BIT2(a, b); \ |
276 FLIP_SIGN_BIT2(c, d); \ | 276 FLIP_SIGN_BIT2(c, d); \ |
277 } | 277 } |
278 | 278 |
279 #define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) { \ | 279 #define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) { \ |
280 const __m128i zero = _mm_setzero_si128(); \ | 280 const __m128i zero = _mm_setzero_si128(); \ |
281 const __m128i t1 = MM_ABS(p1, p0); \ | 281 const __m128i t_1 = MM_ABS(p1, p0); \ |
282 const __m128i t2 = MM_ABS(q1, q0); \ | 282 const __m128i t_2 = MM_ABS(q1, q0); \ |
283 \ | 283 \ |
284 const __m128i h = _mm_set1_epi8(hev_thresh); \ | 284 const __m128i h = _mm_set1_epi8(hev_thresh); \ |
285 const __m128i t3 = _mm_subs_epu8(t1, h); /* abs(p1 - p0) - hev_thresh */ \ | 285 const __m128i t_3 = _mm_subs_epu8(t_1, h); /* abs(p1 - p0) - hev_thresh */ \ |
286 const __m128i t4 = _mm_subs_epu8(t2, h); /* abs(q1 - q0) - hev_thresh */ \ | 286 const __m128i t_4 = _mm_subs_epu8(t_2, h); /* abs(q1 - q0) - hev_thresh */ \ |
287 \ | 287 \ |
288 not_hev = _mm_or_si128(t3, t4); \ | 288 not_hev = _mm_or_si128(t_3, t_4); \ |
289 not_hev = _mm_cmpeq_epi8(not_hev, zero); /* not_hev <= t1 && not_hev <= t2 */\ | 289 not_hev = _mm_cmpeq_epi8(not_hev, zero); /* not_hev <= t1 && not_hev <= t2 */\ |
290 } | 290 } |
291 | 291 |
292 #define GET_BASE_DELTA(p1, p0, q0, q1, o) { \ | 292 #define GET_BASE_DELTA(p1, p0, q0, q1, o) { \ |
293 const __m128i qp0 = _mm_subs_epi8(q0, p0); /* q0 - p0 */ \ | 293 const __m128i qp0 = _mm_subs_epi8(q0, p0); /* q0 - p0 */ \ |
294 o = _mm_subs_epi8(p1, q1); /* p1 - q1 */ \ | 294 o = _mm_subs_epi8(p1, q1); /* p1 - q1 */ \ |
295 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 1 * (q0 - p0) */ \ | 295 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 1 * (q0 - p0) */ \ |
296 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 2 * (q0 - p0) */ \ | 296 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 2 * (q0 - p0) */ \ |
297 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 3 * (q0 - p0) */ \ | 297 o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 3 * (q0 - p0) */ \ |
298 } | 298 } |
299 | 299 |
300 #define DO_SIMPLE_FILTER(p0, q0, fl) { \ | 300 #define DO_SIMPLE_FILTER(p0, q0, fl) { \ |
301 const __m128i three = _mm_set1_epi8(3); \ | 301 const __m128i three = _mm_set1_epi8(3); \ |
302 const __m128i four = _mm_set1_epi8(4); \ | 302 const __m128i four = _mm_set1_epi8(4); \ |
303 __m128i v3 = _mm_adds_epi8(fl, three); \ | 303 __m128i v3 = _mm_adds_epi8(fl, three); \ |
304 __m128i v4 = _mm_adds_epi8(fl, four); \ | 304 __m128i v4 = _mm_adds_epi8(fl, four); \ |
305 \ | 305 \ |
306 /* Do +4 side */ \ | 306 /* Do +4 side */ \ |
307 SIGNED_SHIFT_N(v4, 3); /* v4 >> 3 */ \ | 307 SIGNED_SHIFT_N(v4, 3); /* v4 >> 3 */ \ |
308 q0 = _mm_subs_epi8(q0, v4); /* q0 -= v4 */ \ | 308 q0 = _mm_subs_epi8(q0, v4); /* q0 -= v4 */ \ |
309 \ | 309 \ |
310 /* Now do +3 side */ \ | 310 /* Now do +3 side */ \ |
311 SIGNED_SHIFT_N(v3, 3); /* v3 >> 3 */ \ | 311 SIGNED_SHIFT_N(v3, 3); /* v3 >> 3 */ \ |
312 p0 = _mm_adds_epi8(p0, v3); /* p0 += v3 */ \ | 312 p0 = _mm_adds_epi8(p0, v3); /* p0 += v3 */ \ |
313 } | 313 } |
314 | 314 |
315 // Updates values of 2 pixels at MB edge during complex filtering. | 315 // Updates values of 2 pixels at MB edge during complex filtering. |
316 // Update operations: | 316 // Update operations: |
317 // q = q - a and p = p + a; where a = [(a_hi >> 7), (a_lo >> 7)] | 317 // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)] |
318 #define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) { \ | 318 #define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) { \ |
319 const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7); \ | 319 const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7); \ |
320 const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7); \ | 320 const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7); \ |
321 const __m128i a = _mm_packs_epi16(a_lo7, a_hi7); \ | 321 const __m128i delta = _mm_packs_epi16(a_lo7, a_hi7); \ |
322 pi = _mm_adds_epi8(pi, a); \ | 322 pi = _mm_adds_epi8(pi, delta); \ |
323 qi = _mm_subs_epi8(qi, a); \ | 323 qi = _mm_subs_epi8(qi, delta); \ |
324 } | 324 } |
325 | 325 |
326 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0, | 326 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0, |
327 const __m128i* q1, int thresh, __m128i *mask) { | 327 const __m128i* q1, int thresh, __m128i *mask) { |
328 __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1) | 328 __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1) |
329 *mask = _mm_set1_epi8(0xFE); | 329 *mask = _mm_set1_epi8(0xFE); |
330 t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero | 330 t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero |
331 t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2 | 331 t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2 |
332 | 332 |
333 *mask = MM_ABS(*p0, *q0); // abs(p0 - q0) | 333 *mask = MM_ABS(*p0, *q0); // abs(p0 - q0) |
(...skipping 535 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
869 MAX_DIFF2(t2, t1, q1, q0, mask); | 869 MAX_DIFF2(t2, t1, q1, q0, mask); |
870 | 870 |
871 COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask); | 871 COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask); |
872 DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh); | 872 DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh); |
873 | 873 |
874 u -= 2; // beginning of p1 | 874 u -= 2; // beginning of p1 |
875 v -= 2; | 875 v -= 2; |
876 Store16x4(u, v, stride, &p1, &p0, &q0, &q1); | 876 Store16x4(u, v, stride, &p1, &p0, &q0, &q1); |
877 } | 877 } |
878 | 878 |
| 879 #endif // WEBP_USE_SSE2 |
| 880 |
| 881 //------------------------------------------------------------------------------ |
| 882 // Entry point |
| 883 |
879 extern void VP8DspInitSSE2(void); | 884 extern void VP8DspInitSSE2(void); |
880 | 885 |
881 void VP8DspInitSSE2(void) { | 886 void VP8DspInitSSE2(void) { |
| 887 #if defined(WEBP_USE_SSE2) |
882 VP8Transform = TransformSSE2; | 888 VP8Transform = TransformSSE2; |
883 | 889 |
884 VP8VFilter16 = VFilter16SSE2; | 890 VP8VFilter16 = VFilter16SSE2; |
885 VP8HFilter16 = HFilter16SSE2; | 891 VP8HFilter16 = HFilter16SSE2; |
886 VP8VFilter8 = VFilter8SSE2; | 892 VP8VFilter8 = VFilter8SSE2; |
887 VP8HFilter8 = HFilter8SSE2; | 893 VP8HFilter8 = HFilter8SSE2; |
888 VP8VFilter16i = VFilter16iSSE2; | 894 VP8VFilter16i = VFilter16iSSE2; |
889 VP8HFilter16i = HFilter16iSSE2; | 895 VP8HFilter16i = HFilter16iSSE2; |
890 VP8VFilter8i = VFilter8iSSE2; | 896 VP8VFilter8i = VFilter8iSSE2; |
891 VP8HFilter8i = HFilter8iSSE2; | 897 VP8HFilter8i = HFilter8iSSE2; |
892 | 898 |
893 VP8SimpleVFilter16 = SimpleVFilter16SSE2; | 899 VP8SimpleVFilter16 = SimpleVFilter16SSE2; |
894 VP8SimpleHFilter16 = SimpleHFilter16SSE2; | 900 VP8SimpleHFilter16 = SimpleHFilter16SSE2; |
895 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; | 901 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; |
896 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; | 902 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; |
| 903 #endif // WEBP_USE_SSE2 |
897 } | 904 } |
898 | 905 |
899 #if defined(__cplusplus) || defined(c_plusplus) | 906 #if defined(__cplusplus) || defined(c_plusplus) |
900 } // extern "C" | 907 } // extern "C" |
901 #endif | 908 #endif |
902 | |
903 #endif // WEBP_USE_SSE2 | |
OLD | NEW |