Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(714)

Side by Side Diff: third_party/libwebp/dsp/dec_sse2.c

Issue 116213006: Update libwebp to 0.4.0 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: After Blink Roll Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « third_party/libwebp/dsp/dec_neon.c ('k') | third_party/libwebp/dsp/dsp.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 Google Inc. All Rights Reserved. 1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // SSE2 version of some decoding functions (idct, loop filtering). 10 // SSE2 version of some decoding functions (idct, loop filtering).
11 // 11 //
12 // Author: somnath@google.com (Somnath Banerjee) 12 // Author: somnath@google.com (Somnath Banerjee)
13 // cduvivier@google.com (Christian Duvivier) 13 // cduvivier@google.com (Christian Duvivier)
14 14
15 #include "./dsp.h" 15 #include "./dsp.h"
16 16
17 #if defined(__cplusplus) || defined(c_plusplus) 17 #if defined(WEBP_USE_SSE2)
18 extern "C" {
19 #endif
20 18
21 #if defined(WEBP_USE_SSE2) 19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
20 // one it seems => disable it by default. Uncomment the following to enable:
21 // #define USE_TRANSFORM_AC3
22 22
23 #include <emmintrin.h> 23 #include <emmintrin.h>
24 #include "../dec/vp8i.h" 24 #include "../dec/vp8i.h"
25 25
26 //------------------------------------------------------------------------------ 26 //------------------------------------------------------------------------------
27 // Transforms (Paragraph 14.4) 27 // Transforms (Paragraph 14.4)
28 28
29 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { 29 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
30 // This implementation makes use of 16-bit fixed point versions of two 30 // This implementation makes use of 16-bit fixed point versions of two
31 // multiply constants: 31 // multiply constants:
(...skipping 162 matching lines...) Expand 10 before | Expand all | Expand 10 after
194 // a03 a13 a23 a33 b03 b13 b23 b33 194 // a03 a13 a23 a33 b03 b13 b23 b33
195 } 195 }
196 196
197 // Add inverse transform to 'dst' and store. 197 // Add inverse transform to 'dst' and store.
198 { 198 {
199 const __m128i zero = _mm_setzero_si128(); 199 const __m128i zero = _mm_setzero_si128();
200 // Load the reference(s). 200 // Load the reference(s).
201 __m128i dst0, dst1, dst2, dst3; 201 __m128i dst0, dst1, dst2, dst3;
202 if (do_two) { 202 if (do_two) {
203 // Load eight bytes/pixels per line. 203 // Load eight bytes/pixels per line.
204 dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); 204 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
205 dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); 205 dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
206 dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); 206 dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
207 dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); 207 dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
208 } else { 208 } else {
209 // Load four bytes/pixels per line. 209 // Load four bytes/pixels per line.
210 dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]); 210 dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
211 dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]); 211 dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
212 dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]); 212 dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
213 dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]); 213 dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
214 } 214 }
215 // Convert to 16b. 215 // Convert to 16b.
216 dst0 = _mm_unpacklo_epi8(dst0, zero); 216 dst0 = _mm_unpacklo_epi8(dst0, zero);
217 dst1 = _mm_unpacklo_epi8(dst1, zero); 217 dst1 = _mm_unpacklo_epi8(dst1, zero);
218 dst2 = _mm_unpacklo_epi8(dst2, zero); 218 dst2 = _mm_unpacklo_epi8(dst2, zero);
219 dst3 = _mm_unpacklo_epi8(dst3, zero); 219 dst3 = _mm_unpacklo_epi8(dst3, zero);
220 // Add the inverse transform(s). 220 // Add the inverse transform(s).
221 dst0 = _mm_add_epi16(dst0, T0); 221 dst0 = _mm_add_epi16(dst0, T0);
222 dst1 = _mm_add_epi16(dst1, T1); 222 dst1 = _mm_add_epi16(dst1, T1);
223 dst2 = _mm_add_epi16(dst2, T2); 223 dst2 = _mm_add_epi16(dst2, T2);
224 dst3 = _mm_add_epi16(dst3, T3); 224 dst3 = _mm_add_epi16(dst3, T3);
225 // Unsigned saturate to 8b. 225 // Unsigned saturate to 8b.
226 dst0 = _mm_packus_epi16(dst0, dst0); 226 dst0 = _mm_packus_epi16(dst0, dst0);
227 dst1 = _mm_packus_epi16(dst1, dst1); 227 dst1 = _mm_packus_epi16(dst1, dst1);
228 dst2 = _mm_packus_epi16(dst2, dst2); 228 dst2 = _mm_packus_epi16(dst2, dst2);
229 dst3 = _mm_packus_epi16(dst3, dst3); 229 dst3 = _mm_packus_epi16(dst3, dst3);
230 // Store the results. 230 // Store the results.
231 if (do_two) { 231 if (do_two) {
232 // Store eight bytes/pixels per line. 232 // Store eight bytes/pixels per line.
233 _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0); 233 _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
234 _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1); 234 _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
235 _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2); 235 _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
236 _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3); 236 _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
237 } else { 237 } else {
238 // Store four bytes/pixels per line. 238 // Store four bytes/pixels per line.
239 *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0); 239 *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
240 *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1); 240 *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
241 *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2); 241 *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
242 *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3); 242 *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
243 } 243 }
244 } 244 }
245 } 245 }
246 246
247 #if defined(USE_TRANSFORM_AC3)
248 #define MUL(a, b) (((a) * (b)) >> 16)
249 static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {  // Sparse inverse transform: reads only in[0], in[1] and in[4] and adds the result to the 4x4 block at dst (rows are BPS apart).
250 static const int kC1 = 20091 + (1 << 16);  // Multiplier used through MUL(), which shifts the product right by 16.
251 static const int kC2 = 35468;  // Second multiplier, also applied through MUL().
252 const __m128i A = _mm_set1_epi16(in[0] + 4);  // in[0] plus 4 in every lane; the 4 acts as the rounding bias for the final >> 3.
253 const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
254 const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
255 const int c1 = MUL(in[1], kC2);
256 const int d1 = MUL(in[1], kC1);
257 const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);  // Low four lanes hold d1, c1, -c1, -d1: one offset per pixel column.
258 const __m128i B = _mm_adds_epi16(A, CD);
259 const __m128i m0 = _mm_adds_epi16(B, d4);  // m0..m3 are the values added to rows 0..3 below (before the >> 3).
260 const __m128i m1 = _mm_adds_epi16(B, c4);
261 const __m128i m2 = _mm_subs_epi16(B, c4);
262 const __m128i m3 = _mm_subs_epi16(B, d4);
263 const __m128i zero = _mm_setzero_si128();
264 // Load four destination pixels (one 32-bit word) from each of the four rows.
265 __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
266 __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
267 __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
268 __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
269 // Zero-extend the 8-bit pixels to 16-bit lanes.
270 dst0 = _mm_unpacklo_epi8(dst0, zero);
271 dst1 = _mm_unpacklo_epi8(dst1, zero);
272 dst2 = _mm_unpacklo_epi8(dst2, zero);
273 dst3 = _mm_unpacklo_epi8(dst3, zero);
274 // Add the transform output, arithmetically shifted right by 3, using signed saturating adds.
275 dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
276 dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
277 dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
278 dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
279 // Pack back to 8 bits with unsigned saturation.
280 dst0 = _mm_packus_epi16(dst0, dst0);
281 dst1 = _mm_packus_epi16(dst1, dst1);
282 dst2 = _mm_packus_epi16(dst2, dst2);
283 dst3 = _mm_packus_epi16(dst3, dst3);
284 // Write the low 32 bits (four pixels) of each row back to dst.
285 *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
286 *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
287 *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
288 *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
289 }
290 #undef MUL
291 #endif // USE_TRANSFORM_AC3
292
247 //------------------------------------------------------------------------------ 293 //------------------------------------------------------------------------------
248 // Loop Filter (Paragraph 15) 294 // Loop Filter (Paragraph 15)
249 295
250 // Compute abs(p - q) = subs(p - q) OR subs(q - p) 296 // Compute abs(p - q) = subs(p - q) OR subs(q - p)
251 #define MM_ABS(p, q) _mm_or_si128( \ 297 #define MM_ABS(p, q) _mm_or_si128( \
252 _mm_subs_epu8((q), (p)), \ 298 _mm_subs_epu8((q), (p)), \
253 _mm_subs_epu8((p), (q))) 299 _mm_subs_epu8((p), (q)))
254 300
255 // Shift each byte of "a" by N bits while preserving the sign bit. 301 // Shift each byte of "a" by N bits while preserving the sign bit.
256 // 302 //
(...skipping 624 matching lines...) Expand 10 before | Expand all | Expand 10 after
881 #endif // WEBP_USE_SSE2 927 #endif // WEBP_USE_SSE2
882 928
883 //------------------------------------------------------------------------------ 929 //------------------------------------------------------------------------------
884 // Entry point 930 // Entry point
885 931
886 extern void VP8DspInitSSE2(void); 932 extern void VP8DspInitSSE2(void);
887 933
888 void VP8DspInitSSE2(void) { 934 void VP8DspInitSSE2(void) {
889 #if defined(WEBP_USE_SSE2) 935 #if defined(WEBP_USE_SSE2)
890 VP8Transform = TransformSSE2; 936 VP8Transform = TransformSSE2;
937 #if defined(USE_TRANSFORM_AC3)
938 VP8TransformAC3 = TransformAC3SSE2;
939 #endif
891 940
892 VP8VFilter16 = VFilter16SSE2; 941 VP8VFilter16 = VFilter16SSE2;
893 VP8HFilter16 = HFilter16SSE2; 942 VP8HFilter16 = HFilter16SSE2;
894 VP8VFilter8 = VFilter8SSE2; 943 VP8VFilter8 = VFilter8SSE2;
895 VP8HFilter8 = HFilter8SSE2; 944 VP8HFilter8 = HFilter8SSE2;
896 VP8VFilter16i = VFilter16iSSE2; 945 VP8VFilter16i = VFilter16iSSE2;
897 VP8HFilter16i = HFilter16iSSE2; 946 VP8HFilter16i = HFilter16iSSE2;
898 VP8VFilter8i = VFilter8iSSE2; 947 VP8VFilter8i = VFilter8iSSE2;
899 VP8HFilter8i = HFilter8iSSE2; 948 VP8HFilter8i = HFilter8iSSE2;
900 949
901 VP8SimpleVFilter16 = SimpleVFilter16SSE2; 950 VP8SimpleVFilter16 = SimpleVFilter16SSE2;
902 VP8SimpleHFilter16 = SimpleHFilter16SSE2; 951 VP8SimpleHFilter16 = SimpleHFilter16SSE2;
903 VP8SimpleVFilter16i = SimpleVFilter16iSSE2; 952 VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
904 VP8SimpleHFilter16i = SimpleHFilter16iSSE2; 953 VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
905 #endif // WEBP_USE_SSE2 954 #endif // WEBP_USE_SSE2
906 } 955 }
907 956
908 #if defined(__cplusplus) || defined(c_plusplus)
909 } // extern "C"
910 #endif
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/dec_neon.c ('k') | third_party/libwebp/dsp/dsp.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698