third_party/libwebp/dsp/dec_sse2.c - Issue 2149863002: libwebp: update to v0.5.1

Side by Side Diff: third_party/libwebp/dsp/dec_sse2.c

Issue 2149863002: libwebp: update to v0.5.1 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2011 Google Inc. All Rights Reserved.	1 // Copyright 2011 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // SSE2 version of some decoding functions (idct, loop filtering).	10 // SSE2 version of some decoding functions (idct, loop filtering).

11 //	11 //

12 // Author: somnath@google.com (Somnath Banerjee)	12 // Author: somnath@google.com (Somnath Banerjee)

13 // cduvivier@google.com (Christian Duvivier)	13 // cduvivier@google.com (Christian Duvivier)

14	14

15 #include "./dsp.h"	15 #include "./dsp.h"

16	16

17 #if defined(WEBP_USE_SSE2)	17 #if defined(WEBP_USE_SSE2)

18	18

19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C	19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C

20 // one it seems => disable it by default. Uncomment the following to enable:	20 // one it seems => disable it by default. Uncomment the following to enable:

21 // #define USE_TRANSFORM_AC3	21 // #define USE_TRANSFORM_AC3

22	22

23 #include <emmintrin.h>	23 #include <emmintrin.h>

	24 #include "./common_sse2.h"

24 #include "../dec/vp8i.h"	25 #include "../dec/vp8i.h"

	26 #include "../utils/utils.h"

25	27

26 //------------------------------------------------------------------------------	28 //------------------------------------------------------------------------------

27 // Transforms (Paragraph 14.4)	29 // Transforms (Paragraph 14.4)

28	30

29 static void Transform(const int16_t* in, uint8_t* dst, int do_two) {	31 static void Transform(const int16_t* in, uint8_t* dst, int do_two) {

30 // This implementation makes use of 16-bit fixed point versions of two	32 // This implementation makes use of 16-bit fixed point versions of two

31 // multiply constants:	33 // multiply constants:

32 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16	34 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16

33 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16	35 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16

34 //	36 //

(...skipping 60 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
95 const __m128i d4 = _mm_add_epi16(d1, d2);	97 const __m128i d4 = _mm_add_epi16(d1, d2);

96 const __m128i d = _mm_add_epi16(d3, d4);	98 const __m128i d = _mm_add_epi16(d3, d4);

97	99

98 // Second pass.	100 // Second pass.

99 const __m128i tmp0 = _mm_add_epi16(a, d);	101 const __m128i tmp0 = _mm_add_epi16(a, d);

100 const __m128i tmp1 = _mm_add_epi16(b, c);	102 const __m128i tmp1 = _mm_add_epi16(b, c);

101 const __m128i tmp2 = _mm_sub_epi16(b, c);	103 const __m128i tmp2 = _mm_sub_epi16(b, c);

102 const __m128i tmp3 = _mm_sub_epi16(a, d);	104 const __m128i tmp3 = _mm_sub_epi16(a, d);

103	105

104 // Transpose the two 4x4.	106 // Transpose the two 4x4.

105 // a00 a01 a02 a03 b00 b01 b02 b03	107 VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3);

106 // a10 a11 a12 a13 b10 b11 b12 b13

107 // a20 a21 a22 a23 b20 b21 b22 b23

108 // a30 a31 a32 a33 b30 b31 b32 b33

109 const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);

110 const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);

111 const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);

112 const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);

113 // a00 a10 a01 a11 a02 a12 a03 a13

114 // a20 a30 a21 a31 a22 a32 a23 a33

115 // b00 b10 b01 b11 b02 b12 b03 b13

116 // b20 b30 b21 b31 b22 b32 b23 b33

117 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);

118 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);

119 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);

120 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);

121 // a00 a10 a20 a30 a01 a11 a21 a31

122 // b00 b10 b20 b30 b01 b11 b21 b31

123 // a02 a12 a22 a32 a03 a13 a23 a33

124 // b02 b12 b22 b32 b03 b13 b23 b33

125 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);

126 T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);

127 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);

128 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);

129 // a00 a10 a20 a30 b00 b10 b20 b30

130 // a01 a11 a21 a31 b01 b11 b21 b31

131 // a02 a12 a22 a32 b02 b12 b22 b32

132 // a03 a13 a23 a33 b03 b13 b23 b33

133 }	108 }

134	109

135 // Horizontal pass and subsequent transpose.	110 // Horizontal pass and subsequent transpose.

136 {	111 {

137 // First pass, c and d calculations are longer because of the "trick"	112 // First pass, c and d calculations are longer because of the "trick"

138 // multiplications.	113 // multiplications.

139 const __m128i four = _mm_set1_epi16(4);	114 const __m128i four = _mm_set1_epi16(4);

140 const __m128i dc = _mm_add_epi16(T0, four);	115 const __m128i dc = _mm_add_epi16(T0, four);

141 const __m128i a = _mm_add_epi16(dc, T2);	116 const __m128i a = _mm_add_epi16(dc, T2);

142 const __m128i b = _mm_sub_epi16(dc, T2);	117 const __m128i b = _mm_sub_epi16(dc, T2);

(...skipping 14 matching lines...) Expand all Loading...
157 const __m128i tmp0 = _mm_add_epi16(a, d);	132 const __m128i tmp0 = _mm_add_epi16(a, d);

158 const __m128i tmp1 = _mm_add_epi16(b, c);	133 const __m128i tmp1 = _mm_add_epi16(b, c);

159 const __m128i tmp2 = _mm_sub_epi16(b, c);	134 const __m128i tmp2 = _mm_sub_epi16(b, c);

160 const __m128i tmp3 = _mm_sub_epi16(a, d);	135 const __m128i tmp3 = _mm_sub_epi16(a, d);

161 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);	136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);

162 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);	137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);

163 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);	138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);

164 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);	139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

165	140

166 // Transpose the two 4x4.	141 // Transpose the two 4x4.

167 // a00 a01 a02 a03 b00 b01 b02 b03	142 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,

168 // a10 a11 a12 a13 b10 b11 b12 b13	143 &T2, &T3);

169 // a20 a21 a22 a23 b20 b21 b22 b23

170 // a30 a31 a32 a33 b30 b31 b32 b33

171 const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);

172 const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);

173 const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);

174 const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);

175 // a00 a10 a01 a11 a02 a12 a03 a13

176 // a20 a30 a21 a31 a22 a32 a23 a33

177 // b00 b10 b01 b11 b02 b12 b03 b13

178 // b20 b30 b21 b31 b22 b32 b23 b33

179 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);

180 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);

181 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);

182 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);

183 // a00 a10 a20 a30 a01 a11 a21 a31

184 // b00 b10 b20 b30 b01 b11 b21 b31

185 // a02 a12 a22 a32 a03 a13 a23 a33

186 // b02 b12 b22 b32 b03 b13 b23 b33

187 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);

188 T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);

189 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);

190 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);

191 // a00 a10 a20 a30 b00 b10 b20 b30

192 // a01 a11 a21 a31 b01 b11 b21 b31

193 // a02 a12 a22 a32 b02 b12 b22 b32

194 // a03 a13 a23 a33 b03 b13 b23 b33

195 }	144 }

196	145

197 // Add inverse transform to 'dst' and store.	146 // Add inverse transform to 'dst' and store.

198 {	147 {

199 const __m128i zero = _mm_setzero_si128();	148 const __m128i zero = _mm_setzero_si128();

200 // Load the reference(s).	149 // Load the reference(s).

201 __m128i dst0, dst1, dst2, dst3;	150 __m128i dst0, dst1, dst2, dst3;

202 if (do_two) {	151 if (do_two) {

203 // Load eight bytes/pixels per line.	152 // Load eight bytes/pixels per line.

204 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));	153 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));

(...skipping 1068 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1273 VP8PredChroma8[4] = DC8uvNoTop;	1222 VP8PredChroma8[4] = DC8uvNoTop;

1274 VP8PredChroma8[5] = DC8uvNoLeft;	1223 VP8PredChroma8[5] = DC8uvNoLeft;

1275 VP8PredChroma8[6] = DC8uvNoTopLeft;	1224 VP8PredChroma8[6] = DC8uvNoTopLeft;

1276 }	1225 }

1277	1226

1278 #else // !WEBP_USE_SSE2	1227 #else // !WEBP_USE_SSE2

1279	1228

1280 WEBP_DSP_INIT_STUB(VP8DspInitSSE2)	1229 WEBP_DSP_INIT_STUB(VP8DspInitSSE2)

1281	1230

1282 #endif // WEBP_USE_SSE2	1231 #endif // WEBP_USE_SSE2

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/dec_msa.c ('k') | third_party/libwebp/dsp/dec_sse41.c » ('j') | no next file with comments »