Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(244)

Side by Side Diff: third_party/libwebp/dsp/dec_sse2.c

Issue 2149863002: libwebp: update to v0.5.1 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libwebp/dsp/dec_msa.c ('k') | third_party/libwebp/dsp/dec_sse41.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 Google Inc. All Rights Reserved. 1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // SSE2 version of some decoding functions (idct, loop filtering). 10 // SSE2 version of some decoding functions (idct, loop filtering).
11 // 11 //
12 // Author: somnath@google.com (Somnath Banerjee) 12 // Author: somnath@google.com (Somnath Banerjee)
13 // cduvivier@google.com (Christian Duvivier) 13 // cduvivier@google.com (Christian Duvivier)
14 14
15 #include "./dsp.h" 15 #include "./dsp.h"
16 16
17 #if defined(WEBP_USE_SSE2) 17 #if defined(WEBP_USE_SSE2)
18 18
19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C 19 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
20 // one it seems => disable it by default. Uncomment the following to enable: 20 // one it seems => disable it by default. Uncomment the following to enable:
21 // #define USE_TRANSFORM_AC3 21 // #define USE_TRANSFORM_AC3
22 22
23 #include <emmintrin.h> 23 #include <emmintrin.h>
24 #include "./common_sse2.h"
24 #include "../dec/vp8i.h" 25 #include "../dec/vp8i.h"
26 #include "../utils/utils.h"
25 27
26 //------------------------------------------------------------------------------ 28 //------------------------------------------------------------------------------
27 // Transforms (Paragraph 14.4) 29 // Transforms (Paragraph 14.4)
28 30
29 static void Transform(const int16_t* in, uint8_t* dst, int do_two) { 31 static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
30 // This implementation makes use of 16-bit fixed point versions of two 32 // This implementation makes use of 16-bit fixed point versions of two
31 // multiply constants: 33 // multiply constants:
32 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 34 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
33 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 35 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
34 // 36 //
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
95 const __m128i d4 = _mm_add_epi16(d1, d2); 97 const __m128i d4 = _mm_add_epi16(d1, d2);
96 const __m128i d = _mm_add_epi16(d3, d4); 98 const __m128i d = _mm_add_epi16(d3, d4);
97 99
98 // Second pass. 100 // Second pass.
99 const __m128i tmp0 = _mm_add_epi16(a, d); 101 const __m128i tmp0 = _mm_add_epi16(a, d);
100 const __m128i tmp1 = _mm_add_epi16(b, c); 102 const __m128i tmp1 = _mm_add_epi16(b, c);
101 const __m128i tmp2 = _mm_sub_epi16(b, c); 103 const __m128i tmp2 = _mm_sub_epi16(b, c);
102 const __m128i tmp3 = _mm_sub_epi16(a, d); 104 const __m128i tmp3 = _mm_sub_epi16(a, d);
103 105
104 // Transpose the two 4x4. 106 // Transpose the two 4x4.
105 // a00 a01 a02 a03 b00 b01 b02 b03 107 VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3);
106 // a10 a11 a12 a13 b10 b11 b12 b13
107 // a20 a21 a22 a23 b20 b21 b22 b23
108 // a30 a31 a32 a33 b30 b31 b32 b33
109 const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
110 const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
111 const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
112 const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
113 // a00 a10 a01 a11 a02 a12 a03 a13
114 // a20 a30 a21 a31 a22 a32 a23 a33
115 // b00 b10 b01 b11 b02 b12 b03 b13
116 // b20 b30 b21 b31 b22 b32 b23 b33
117 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
118 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
119 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
120 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
121 // a00 a10 a20 a30 a01 a11 a21 a31
122 // b00 b10 b20 b30 b01 b11 b21 b31
123 // a02 a12 a22 a32 a03 a13 a23 a33
124 // b02 b12 b22 b32 b03 b13 b23 b33
125 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
126 T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
127 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
128 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
129 // a00 a10 a20 a30 b00 b10 b20 b30
130 // a01 a11 a21 a31 b01 b11 b21 b31
131 // a02 a12 a22 a32 b02 b12 b22 b32
132 // a03 a13 a23 a33 b03 b13 b23 b33
133 } 108 }
134 109
135 // Horizontal pass and subsequent transpose. 110 // Horizontal pass and subsequent transpose.
136 { 111 {
137 // First pass, c and d calculations are longer because of the "trick" 112 // First pass, c and d calculations are longer because of the "trick"
138 // multiplications. 113 // multiplications.
139 const __m128i four = _mm_set1_epi16(4); 114 const __m128i four = _mm_set1_epi16(4);
140 const __m128i dc = _mm_add_epi16(T0, four); 115 const __m128i dc = _mm_add_epi16(T0, four);
141 const __m128i a = _mm_add_epi16(dc, T2); 116 const __m128i a = _mm_add_epi16(dc, T2);
142 const __m128i b = _mm_sub_epi16(dc, T2); 117 const __m128i b = _mm_sub_epi16(dc, T2);
(...skipping 14 matching lines...) Expand all
157 const __m128i tmp0 = _mm_add_epi16(a, d); 132 const __m128i tmp0 = _mm_add_epi16(a, d);
158 const __m128i tmp1 = _mm_add_epi16(b, c); 133 const __m128i tmp1 = _mm_add_epi16(b, c);
159 const __m128i tmp2 = _mm_sub_epi16(b, c); 134 const __m128i tmp2 = _mm_sub_epi16(b, c);
160 const __m128i tmp3 = _mm_sub_epi16(a, d); 135 const __m128i tmp3 = _mm_sub_epi16(a, d);
161 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); 136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
162 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); 137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
163 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); 138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
164 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); 139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
165 140
166 // Transpose the two 4x4. 141 // Transpose the two 4x4.
167 // a00 a01 a02 a03 b00 b01 b02 b03 142 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
168 // a10 a11 a12 a13 b10 b11 b12 b13 143 &T2, &T3);
169 // a20 a21 a22 a23 b20 b21 b22 b23
170 // a30 a31 a32 a33 b30 b31 b32 b33
171 const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
172 const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
173 const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
174 const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
175 // a00 a10 a01 a11 a02 a12 a03 a13
176 // a20 a30 a21 a31 a22 a32 a23 a33
177 // b00 b10 b01 b11 b02 b12 b03 b13
178 // b20 b30 b21 b31 b22 b32 b23 b33
179 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
180 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
181 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
182 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
183 // a00 a10 a20 a30 a01 a11 a21 a31
184 // b00 b10 b20 b30 b01 b11 b21 b31
185 // a02 a12 a22 a32 a03 a13 a23 a33
186 // b02 b12 b22 b32 b03 b13 b23 b33
187 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
188 T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
189 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
190 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
191 // a00 a10 a20 a30 b00 b10 b20 b30
192 // a01 a11 a21 a31 b01 b11 b21 b31
193 // a02 a12 a22 a32 b02 b12 b22 b32
194 // a03 a13 a23 a33 b03 b13 b23 b33
195 } 144 }
196 145
197 // Add inverse transform to 'dst' and store. 146 // Add inverse transform to 'dst' and store.
198 { 147 {
199 const __m128i zero = _mm_setzero_si128(); 148 const __m128i zero = _mm_setzero_si128();
200 // Load the reference(s). 149 // Load the reference(s).
201 __m128i dst0, dst1, dst2, dst3; 150 __m128i dst0, dst1, dst2, dst3;
202 if (do_two) { 151 if (do_two) {
203 // Load eight bytes/pixels per line. 152 // Load eight bytes/pixels per line.
204 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS)); 153 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
(...skipping 1068 matching lines...) Expand 10 before | Expand all | Expand 10 after
1273 VP8PredChroma8[4] = DC8uvNoTop; 1222 VP8PredChroma8[4] = DC8uvNoTop;
1274 VP8PredChroma8[5] = DC8uvNoLeft; 1223 VP8PredChroma8[5] = DC8uvNoLeft;
1275 VP8PredChroma8[6] = DC8uvNoTopLeft; 1224 VP8PredChroma8[6] = DC8uvNoTopLeft;
1276 } 1225 }
1277 1226
1278 #else // !WEBP_USE_SSE2 1227 #else // !WEBP_USE_SSE2
1279 1228
1280 WEBP_DSP_INIT_STUB(VP8DspInitSSE2) 1229 WEBP_DSP_INIT_STUB(VP8DspInitSSE2)
1281 1230
1282 #endif // WEBP_USE_SSE2 1231 #endif // WEBP_USE_SSE2
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/dec_msa.c ('k') | third_party/libwebp/dsp/dec_sse41.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698