third_party/libwebp/dsp/dec_sse2.c - Issue 116213006: Update libwebp to 0.4.0

Side by Side Diff: third_party/libwebp/dsp/dec_sse2.c

Issue 116213006: Update libwebp to 0.4.0 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: After Blink Roll Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 Google Inc. All Rights Reserved.	1 // Copyright 2011 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // SSE2 version of some decoding functions (idct, loop filtering).	10 // SSE2 version of some decoding functions (idct, loop filtering).

11 //	11 //

12 // Author: somnath@google.com (Somnath Banerjee)	12 // Author: somnath@google.com (Somnath Banerjee)

13 // cduvivier@google.com (Christian Duvivier)	13 // cduvivier@google.com (Christian Duvivier)

14	14

15 #include "./dsp.h"	15 #include "./dsp.h"

16	16

17 #if defined(__cplusplus) || defined(c_plusplus)	17 #if defined(WEBP_USE_SSE2)

18 extern "C" {

19 #endif

20	18

21 #if defined(WEBP_USE_SSE2)	19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C

	20 // one it seems => disable it by default. Uncomment the following to enable:

	21 // #define USE_TRANSFORM_AC3

22	22

23 #include <emmintrin.h>	23 #include <emmintrin.h>

24 #include "../dec/vp8i.h"	24 #include "../dec/vp8i.h"

25	25

26 //------------------------------------------------------------------------------	26 //------------------------------------------------------------------------------

27 // Transforms (Paragraph 14.4)	27 // Transforms (Paragraph 14.4)

28	28

29 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {	29 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {

30 // This implementation makes use of 16-bit fixed point versions of two	30 // This implementation makes use of 16-bit fixed point versions of two

31 // multiply constants:	31 // multiply constants:

(...skipping 162 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
194 // a03 a13 a23 a33 b03 b13 b23 b33	194 // a03 a13 a23 a33 b03 b13 b23 b33

195 }	195 }

196	196

197 // Add inverse transform to 'dst' and store.	197 // Add inverse transform to 'dst' and store.

198 {	198 {

199 const __m128i zero = _mm_setzero_si128();	199 const __m128i zero = _mm_setzero_si128();

200 // Load the reference(s).	200 // Load the reference(s).

201 __m128i dst0, dst1, dst2, dst3;	201 __m128i dst0, dst1, dst2, dst3;

202 if (do_two) {	202 if (do_two) {

203 // Load eight bytes/pixels per line.	203 // Load eight bytes/pixels per line.

204 dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);	204 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));

205 dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);	205 dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));

206 dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);	206 dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));

207 dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);	207 dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));

208 } else {	208 } else {

209 // Load four bytes/pixels per line.	209 // Load four bytes/pixels per line.

210 dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);	210 dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));

211 dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);	211 dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));

212 dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);	212 dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));

213 dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);	213 dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));

214 }	214 }

215 // Convert to 16b.	215 // Convert to 16b.

216 dst0 = _mm_unpacklo_epi8(dst0, zero);	216 dst0 = _mm_unpacklo_epi8(dst0, zero);

217 dst1 = _mm_unpacklo_epi8(dst1, zero);	217 dst1 = _mm_unpacklo_epi8(dst1, zero);

218 dst2 = _mm_unpacklo_epi8(dst2, zero);	218 dst2 = _mm_unpacklo_epi8(dst2, zero);

219 dst3 = _mm_unpacklo_epi8(dst3, zero);	219 dst3 = _mm_unpacklo_epi8(dst3, zero);

220 // Add the inverse transform(s).	220 // Add the inverse transform(s).

221 dst0 = _mm_add_epi16(dst0, T0);	221 dst0 = _mm_add_epi16(dst0, T0);

222 dst1 = _mm_add_epi16(dst1, T1);	222 dst1 = _mm_add_epi16(dst1, T1);

223 dst2 = _mm_add_epi16(dst2, T2);	223 dst2 = _mm_add_epi16(dst2, T2);

224 dst3 = _mm_add_epi16(dst3, T3);	224 dst3 = _mm_add_epi16(dst3, T3);

225 // Unsigned saturate to 8b.	225 // Unsigned saturate to 8b.

226 dst0 = _mm_packus_epi16(dst0, dst0);	226 dst0 = _mm_packus_epi16(dst0, dst0);

227 dst1 = _mm_packus_epi16(dst1, dst1);	227 dst1 = _mm_packus_epi16(dst1, dst1);

228 dst2 = _mm_packus_epi16(dst2, dst2);	228 dst2 = _mm_packus_epi16(dst2, dst2);

229 dst3 = _mm_packus_epi16(dst3, dst3);	229 dst3 = _mm_packus_epi16(dst3, dst3);

230 // Store the results.	230 // Store the results.

231 if (do_two) {	231 if (do_two) {

232 // Store eight bytes/pixels per line.	232 // Store eight bytes/pixels per line.

233 _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);	233 _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);

234 _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);	234 _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);

235 _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);	235 _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);

236 _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);	236 _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);

237 } else {	237 } else {

238 // Store four bytes/pixels per line.	238 // Store four bytes/pixels per line.

239 *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);	239 *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);

240 *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);	240 *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);

241 *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);	241 *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);

242 *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);	242 *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);

243 }	243 }

244 }	244 }

245 }	245 }

246	246

	247 #if defined(USE_TRANSFORM_AC3)

	248 #define MUL(a, b) (((a) * (b)) >> 16)

	249 static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {

	250 static const int kC1 = 20091 + (1 << 16);

	251 static const int kC2 = 35468;

	252 const __m128i A = _mm_set1_epi16(in[0] + 4);

	253 const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));

	254 const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));

	255 const int c1 = MUL(in[1], kC2);

	256 const int d1 = MUL(in[1], kC1);

	257 const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);

	258 const __m128i B = _mm_adds_epi16(A, CD);

	259 const __m128i m0 = _mm_adds_epi16(B, d4);

	260 const __m128i m1 = _mm_adds_epi16(B, c4);

	261 const __m128i m2 = _mm_subs_epi16(B, c4);

	262 const __m128i m3 = _mm_subs_epi16(B, d4);

	263 const __m128i zero = _mm_setzero_si128();

	264 // Load the source pixels.

	265 __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));

	266 __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));

	267 __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));

	268 __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));

	269 // Convert to 16b.

	270 dst0 = _mm_unpacklo_epi8(dst0, zero);

	271 dst1 = _mm_unpacklo_epi8(dst1, zero);

	272 dst2 = _mm_unpacklo_epi8(dst2, zero);

	273 dst3 = _mm_unpacklo_epi8(dst3, zero);

	274 // Add the inverse transform.

	275 dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));

	276 dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));

	277 dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));

	278 dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));

	279 // Unsigned saturate to 8b.

	280 dst0 = _mm_packus_epi16(dst0, dst0);

	281 dst1 = _mm_packus_epi16(dst1, dst1);

	282 dst2 = _mm_packus_epi16(dst2, dst2);

	283 dst3 = _mm_packus_epi16(dst3, dst3);

	284 // Store the results.

	285 *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);

	286 *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);

	287 *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);

	288 *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);

	289 }

	290 #undef MUL

	291 #endif // USE_TRANSFORM_AC3

	292

247 //------------------------------------------------------------------------------	293 //------------------------------------------------------------------------------

248 // Loop Filter (Paragraph 15)	294 // Loop Filter (Paragraph 15)

249	295

250 // Compute abs(p - q) = subs(p - q) OR subs(q - p)	296 // Compute abs(p - q) = subs(p - q) OR subs(q - p)

251 #define MM_ABS(p, q) _mm_or_si128( \	297 #define MM_ABS(p, q) _mm_or_si128( \

252 _mm_subs_epu8((q), (p)), \	298 _mm_subs_epu8((q), (p)), \

253 _mm_subs_epu8((p), (q)))	299 _mm_subs_epu8((p), (q)))

254	300

255 // Shift each byte of "a" by N bits while preserving by the sign bit.	301 // Shift each byte of "a" by N bits while preserving by the sign bit.

256 //	302 //

(...skipping 624 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
881 #endif // WEBP_USE_SSE2	927 #endif // WEBP_USE_SSE2

882	928

883 //------------------------------------------------------------------------------	929 //------------------------------------------------------------------------------

884 // Entry point	930 // Entry point

885	931

886 extern void VP8DspInitSSE2(void);	932 extern void VP8DspInitSSE2(void);

887	933

888 void VP8DspInitSSE2(void) {	934 void VP8DspInitSSE2(void) {

889 #if defined(WEBP_USE_SSE2)	935 #if defined(WEBP_USE_SSE2)

890 VP8Transform = TransformSSE2;	936 VP8Transform = TransformSSE2;

	937 #if defined(USE_TRANSFORM_AC3)

	938 VP8TransformAC3 = TransformAC3SSE2;

	939 #endif

891	940

892 VP8VFilter16 = VFilter16SSE2;	941 VP8VFilter16 = VFilter16SSE2;

893 VP8HFilter16 = HFilter16SSE2;	942 VP8HFilter16 = HFilter16SSE2;

894 VP8VFilter8 = VFilter8SSE2;	943 VP8VFilter8 = VFilter8SSE2;

895 VP8HFilter8 = HFilter8SSE2;	944 VP8HFilter8 = HFilter8SSE2;

896 VP8VFilter16i = VFilter16iSSE2;	945 VP8VFilter16i = VFilter16iSSE2;

897 VP8HFilter16i = HFilter16iSSE2;	946 VP8HFilter16i = HFilter16iSSE2;

898 VP8VFilter8i = VFilter8iSSE2;	947 VP8VFilter8i = VFilter8iSSE2;

899 VP8HFilter8i = HFilter8iSSE2;	948 VP8HFilter8i = HFilter8iSSE2;

900	949

901 VP8SimpleVFilter16 = SimpleVFilter16SSE2;	950 VP8SimpleVFilter16 = SimpleVFilter16SSE2;

902 VP8SimpleHFilter16 = SimpleHFilter16SSE2;	951 VP8SimpleHFilter16 = SimpleHFilter16SSE2;

903 VP8SimpleVFilter16i = SimpleVFilter16iSSE2;	952 VP8SimpleVFilter16i = SimpleVFilter16iSSE2;

904 VP8SimpleHFilter16i = SimpleHFilter16iSSE2;	953 VP8SimpleHFilter16i = SimpleHFilter16iSSE2;

905 #endif // WEBP_USE_SSE2	954 #endif // WEBP_USE_SSE2

906 }	955 }

907	956

908 #if defined(__cplusplus) || defined(c_plusplus)

909 } // extern "C"

910 #endif

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/dec_neon.c ('k') | third_party/libwebp/dsp/dsp.h » ('j') | no next file with comments »