third_party/libwebp/dsp/dec_sse2.c - Issue 2651883004: libwebp-0.6.0-rc1

Side by Side Diff: third_party/libwebp/dsp/dec_sse2.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)

Patch Set: Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/libwebp/dsp/dec_neon.c ('k') | third_party/libwebp/dsp/dec_sse41.c » ('j') | no next file with comments »

OLD	NEW
1 // Copyright 2011 Google Inc. All Rights Reserved.	1 // Copyright 2011 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // SSE2 version of some decoding functions (idct, loop filtering).	10 // SSE2 version of some decoding functions (idct, loop filtering).

11 //	11 //

12 // Author: somnath@google.com (Somnath Banerjee)	12 // Author: somnath@google.com (Somnath Banerjee)

13 // cduvivier@google.com (Christian Duvivier)	13 // cduvivier@google.com (Christian Duvivier)

14	14

15 #include "./dsp.h"	15 #include "./dsp.h"

16	16

17 #if defined(WEBP_USE_SSE2)	17 #if defined(WEBP_USE_SSE2)

18	18

19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C	19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C

20 // one it seems => disable it by default. Uncomment the following to enable:	20 // one it seems => disable it by default. Uncomment the following to enable:

21 // #define USE_TRANSFORM_AC3	21 // #define USE_TRANSFORM_AC3

22	22

23 #include <emmintrin.h>	23 #include <emmintrin.h>

24 #include "./common_sse2.h"	24 #include "./common_sse2.h"

25 #include "../dec/vp8i.h"	25 #include "../dec/vp8i_dec.h"

26 #include "../utils/utils.h"	26 #include "../utils/utils.h"

27	27

28 //------------------------------------------------------------------------------	28 //------------------------------------------------------------------------------

29 // Transforms (Paragraph 14.4)	29 // Transforms (Paragraph 14.4)

30	30

31 static void Transform(const int16_t* in, uint8_t* dst, int do_two) {	31 static void Transform(const int16_t* in, uint8_t* dst, int do_two) {

32 // This implementation makes use of 16-bit fixed point versions of two	32 // This implementation makes use of 16-bit fixed point versions of two

33 // multiply constants:	33 // multiply constants:

34 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16	34 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16

35 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16	35 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16

(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
133 const __m128i tmp1 = _mm_add_epi16(b, c);	133 const __m128i tmp1 = _mm_add_epi16(b, c);

134 const __m128i tmp2 = _mm_sub_epi16(b, c);	134 const __m128i tmp2 = _mm_sub_epi16(b, c);

135 const __m128i tmp3 = _mm_sub_epi16(a, d);	135 const __m128i tmp3 = _mm_sub_epi16(a, d);

136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);	136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);

137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);	137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);

138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);	138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);

139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);	139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

140	140

141 // Transpose the two 4x4.	141 // Transpose the two 4x4.

142 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,	142 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,

143 &T2, &T3);	143 &T2, &T3);

144 }	144 }

145	145

146 // Add inverse transform to 'dst' and store.	146 // Add inverse transform to 'dst' and store.

147 {	147 {

148 const __m128i zero = _mm_setzero_si128();	148 const __m128i zero = _mm_setzero_si128();

149 // Load the reference(s).	149 // Load the reference(s).

150 __m128i dst0, dst1, dst2, dst3;	150 __m128i dst0, dst1, dst2, dst3;

151 if (do_two) {	151 if (do_two) {

152 // Load eight bytes/pixels per line.	152 // Load eight bytes/pixels per line.

153 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));	153 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));

(...skipping 1068 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1222 VP8PredChroma8[4] = DC8uvNoTop;	1222 VP8PredChroma8[4] = DC8uvNoTop;

1223 VP8PredChroma8[5] = DC8uvNoLeft;	1223 VP8PredChroma8[5] = DC8uvNoLeft;

1224 VP8PredChroma8[6] = DC8uvNoTopLeft;	1224 VP8PredChroma8[6] = DC8uvNoTopLeft;

1225 }	1225 }

1226	1226

1227 #else // !WEBP_USE_SSE2	1227 #else // !WEBP_USE_SSE2

1228	1228

1229 WEBP_DSP_INIT_STUB(VP8DspInitSSE2)	1229 WEBP_DSP_INIT_STUB(VP8DspInitSSE2)

1230	1230

1231 #endif // WEBP_USE_SSE2	1231 #endif // WEBP_USE_SSE2

OLD	NEW