Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(217)

Side by Side Diff: third_party/libwebp/dsp/enc_sse2.c

Issue 2149863002: libwebp: update to v0.5.1 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libwebp/dsp/enc_neon.c ('k') | third_party/libwebp/dsp/enc_sse41.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 Google Inc. All Rights Reserved. 1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // SSE2 version of speed-critical encoding functions. 10 // SSE2 version of speed-critical encoding functions.
11 // 11 //
12 // Author: Christian Duvivier (cduvivier@google.com) 12 // Author: Christian Duvivier (cduvivier@google.com)
13 13
14 #include "./dsp.h" 14 #include "./dsp.h"
15 15
16 #if defined(WEBP_USE_SSE2) 16 #if defined(WEBP_USE_SSE2)
17 #include <stdlib.h> // for abs() 17 #include <stdlib.h> // for abs()
18 #include <emmintrin.h> 18 #include <emmintrin.h>
19 19
20 #include "./common_sse2.h"
20 #include "../enc/cost.h" 21 #include "../enc/cost.h"
21 #include "../enc/vp8enci.h" 22 #include "../enc/vp8enci.h"
22 23
23 //------------------------------------------------------------------------------ 24 //------------------------------------------------------------------------------
24 // Quite useful macro for debugging. Left here for convenience.
25
26 #if 0
27 #include <stdio.h>
28 static void PrintReg(const __m128i r, const char* const name, int size) {
29 int n;
30 union {
31 __m128i r;
32 uint8_t i8[16];
33 uint16_t i16[8];
34 uint32_t i32[4];
35 uint64_t i64[2];
36 } tmp;
37 tmp.r = r;
38 fprintf(stderr, "%s\t: ", name);
39 if (size == 8) {
40 for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
41 } else if (size == 16) {
42 for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
43 } else if (size == 32) {
44 for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
45 } else {
46 for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]);
47 }
48 fprintf(stderr, "\n");
49 }
50 #endif
51
52 //------------------------------------------------------------------------------
53 // Transforms (Paragraph 14.4) 25 // Transforms (Paragraph 14.4)
54 26
55 // Does one or two inverse transforms. 27 // Does one or two inverse transforms.
56 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, 28 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
57 int do_two) { 29 int do_two) {
58 // This implementation makes use of 16-bit fixed point versions of two 30 // This implementation makes use of 16-bit fixed point versions of two
59 // multiply constants: 31 // multiply constants:
60 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 32 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
61 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 33 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
62 // 34 //
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after
124 const __m128i d4 = _mm_add_epi16(d1, d2); 96 const __m128i d4 = _mm_add_epi16(d1, d2);
125 const __m128i d = _mm_add_epi16(d3, d4); 97 const __m128i d = _mm_add_epi16(d3, d4);
126 98
127 // Second pass. 99 // Second pass.
128 const __m128i tmp0 = _mm_add_epi16(a, d); 100 const __m128i tmp0 = _mm_add_epi16(a, d);
129 const __m128i tmp1 = _mm_add_epi16(b, c); 101 const __m128i tmp1 = _mm_add_epi16(b, c);
130 const __m128i tmp2 = _mm_sub_epi16(b, c); 102 const __m128i tmp2 = _mm_sub_epi16(b, c);
131 const __m128i tmp3 = _mm_sub_epi16(a, d); 103 const __m128i tmp3 = _mm_sub_epi16(a, d);
132 104
133 // Transpose the two 4x4. 105 // Transpose the two 4x4.
134 // a00 a01 a02 a03 b00 b01 b02 b03 106 VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3);
135 // a10 a11 a12 a13 b10 b11 b12 b13
136 // a20 a21 a22 a23 b20 b21 b22 b23
137 // a30 a31 a32 a33 b30 b31 b32 b33
138 const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
139 const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
140 const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
141 const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
142 // a00 a10 a01 a11 a02 a12 a03 a13
143 // a20 a30 a21 a31 a22 a32 a23 a33
144 // b00 b10 b01 b11 b02 b12 b03 b13
145 // b20 b30 b21 b31 b22 b32 b23 b33
146 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
147 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
148 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
149 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
150 // a00 a10 a20 a30 a01 a11 a21 a31
151 // b00 b10 b20 b30 b01 b11 b21 b31
152 // a02 a12 a22 a32 a03 a13 a23 a33
 153 // b02 b12 b22 b32 b03 b13 b23 b33
154 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
155 T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
156 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
157 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
158 // a00 a10 a20 a30 b00 b10 b20 b30
159 // a01 a11 a21 a31 b01 b11 b21 b31
160 // a02 a12 a22 a32 b02 b12 b22 b32
161 // a03 a13 a23 a33 b03 b13 b23 b33
162 } 107 }
163 108
164 // Horizontal pass and subsequent transpose. 109 // Horizontal pass and subsequent transpose.
165 { 110 {
166 // First pass, c and d calculations are longer because of the "trick" 111 // First pass, c and d calculations are longer because of the "trick"
167 // multiplications. 112 // multiplications.
168 const __m128i four = _mm_set1_epi16(4); 113 const __m128i four = _mm_set1_epi16(4);
169 const __m128i dc = _mm_add_epi16(T0, four); 114 const __m128i dc = _mm_add_epi16(T0, four);
170 const __m128i a = _mm_add_epi16(dc, T2); 115 const __m128i a = _mm_add_epi16(dc, T2);
171 const __m128i b = _mm_sub_epi16(dc, T2); 116 const __m128i b = _mm_sub_epi16(dc, T2);
(...skipping 14 matching lines...) Expand all
186 const __m128i tmp0 = _mm_add_epi16(a, d); 131 const __m128i tmp0 = _mm_add_epi16(a, d);
187 const __m128i tmp1 = _mm_add_epi16(b, c); 132 const __m128i tmp1 = _mm_add_epi16(b, c);
188 const __m128i tmp2 = _mm_sub_epi16(b, c); 133 const __m128i tmp2 = _mm_sub_epi16(b, c);
189 const __m128i tmp3 = _mm_sub_epi16(a, d); 134 const __m128i tmp3 = _mm_sub_epi16(a, d);
190 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); 135 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
191 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); 136 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
192 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); 137 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
193 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); 138 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
194 139
195 // Transpose the two 4x4. 140 // Transpose the two 4x4.
196 // a00 a01 a02 a03 b00 b01 b02 b03 141 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
197 // a10 a11 a12 a13 b10 b11 b12 b13 142 &T2, &T3);
198 // a20 a21 a22 a23 b20 b21 b22 b23
199 // a30 a31 a32 a33 b30 b31 b32 b33
200 const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
201 const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
202 const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
203 const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
204 // a00 a10 a01 a11 a02 a12 a03 a13
205 // a20 a30 a21 a31 a22 a32 a23 a33
206 // b00 b10 b01 b11 b02 b12 b03 b13
207 // b20 b30 b21 b31 b22 b32 b23 b33
208 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
209 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
210 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
211 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
212 // a00 a10 a20 a30 a01 a11 a21 a31
213 // b00 b10 b20 b30 b01 b11 b21 b31
214 // a02 a12 a22 a32 a03 a13 a23 a33
 215 // b02 b12 b22 b32 b03 b13 b23 b33
216 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
217 T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
218 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
219 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
220 // a00 a10 a20 a30 b00 b10 b20 b30
221 // a01 a11 a21 a31 b01 b11 b21 b31
222 // a02 a12 a22 a32 b02 b12 b22 b32
223 // a03 a13 a23 a33 b03 b13 b23 b33
224 } 143 }
225 144
226 // Add inverse transform to 'ref' and store. 145 // Add inverse transform to 'ref' and store.
227 { 146 {
228 const __m128i zero = _mm_setzero_si128(); 147 const __m128i zero = _mm_setzero_si128();
229 // Load the reference(s). 148 // Load the reference(s).
230 __m128i ref0, ref1, ref2, ref3; 149 __m128i ref0, ref1, ref2, ref3;
231 if (do_two) { 150 if (do_two) {
232 // Load eight bytes/pixels per line. 151 // Load eight bytes/pixels per line.
233 ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); 152 ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
(...skipping 132 matching lines...) Expand 10 before | Expand all | Expand 10 after
366 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); 285 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
367 286
368 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); 287 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
369 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); 288 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
370 _mm_storeu_si128((__m128i*)&out[0], d0_g1); 289 _mm_storeu_si128((__m128i*)&out[0], d0_g1);
371 _mm_storeu_si128((__m128i*)&out[8], d2_f3); 290 _mm_storeu_si128((__m128i*)&out[8], d2_f3);
372 } 291 }
373 292
374 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { 293 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
375 const __m128i zero = _mm_setzero_si128(); 294 const __m128i zero = _mm_setzero_si128();
376 295 // Load src.
377 // Load src and convert to 16b.
378 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); 296 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
379 const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); 297 const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
380 const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); 298 const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
381 const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); 299 const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
382 const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); 300 // 00 01 02 03 *
383 const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); 301 // 10 11 12 13 *
384 const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); 302 // 20 21 22 23 *
385 const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); 303 // 30 31 32 33 *
386 // Load ref and convert to 16b. 304 // Shuffle.
305 const __m128i src_0 = _mm_unpacklo_epi16(src0, src1);
306 const __m128i src_1 = _mm_unpacklo_epi16(src2, src3);
307 // 00 01 10 11 02 03 12 13 * * ...
 308 // 20 21 30 31 22 23 32 33 * * ...
309
310 // Load ref.
387 const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); 311 const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
388 const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); 312 const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
389 const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); 313 const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
390 const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); 314 const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
391 const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); 315 const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1);
392 const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); 316 const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3);
393 const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
394 const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
395 // Compute difference. -> 00 01 02 03 00 00 00 00
396 const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
397 const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
398 const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
399 const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
400 317
401 // Unpack and shuffle 318 // Convert both to 16 bit.
402 // 00 01 02 03 0 0 0 0 319 const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero);
403 // 10 11 12 13 0 0 0 0 320 const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero);
404 // 20 21 22 23 0 0 0 0 321 const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero);
405 // 30 31 32 33 0 0 0 0 322 const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero);
406 const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); 323
407 const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); 324 // Compute the difference.
325 const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b);
326 const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b);
408 __m128i v01, v32; 327 __m128i v01, v32;
409 328
410 // First pass 329 // First pass
411 FTransformPass1(&shuf01, &shuf23, &v01, &v32); 330 FTransformPass1(&row01, &row23, &v01, &v32);
412 331
413 // Second pass 332 // Second pass
414 FTransformPass2(&v01, &v32, out); 333 FTransformPass2(&v01, &v32, out);
415 } 334 }
416 335
417 static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { 336 static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
418 const __m128i zero = _mm_setzero_si128(); 337 const __m128i zero = _mm_setzero_si128();
419 338
420 // Load src and convert to 16b. 339 // Load src and convert to 16b.
421 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); 340 const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
456 // First pass 375 // First pass
457 FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l); 376 FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
458 FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h); 377 FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
459 378
460 // Second pass 379 // Second pass
461 FTransformPass2(&v01l, &v32l, out + 0); 380 FTransformPass2(&v01l, &v32l, out + 0);
462 FTransformPass2(&v01h, &v32h, out + 16); 381 FTransformPass2(&v01h, &v32h, out + 16);
463 } 382 }
464 383
465 static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { 384 static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
466 const __m128i kMult1 = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); 385 const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
467 const __m128i kMult2 = _mm_set_epi16(0, 0, 0, 0, -1, 1, -1, 1);
468 const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]); 386 const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
469 const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]); 387 const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
470 const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]); 388 const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]);
471 const __m128i src3 = _mm_loadl_epi64((__m128i*)&in[3 * 16]); 389 const __m128i src3 = _mm_loadl_epi64((__m128i*)&in[3 * 16]);
472 const __m128i A01 = _mm_unpacklo_epi16(src0, src1); // A0 A1 | ... 390 const __m128i A01 = _mm_unpacklo_epi16(src0, src1); // A0 A1 | ...
473 const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ... 391 const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ...
474 const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ... 392 const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ...
475 const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ... 393 const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ...
476 const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 394 const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 | ...
477 const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 395 const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 | ...
478 const __m128i D0 = _mm_madd_epi16(C0, kMult1); // out0, out1 396 const __m128i D = _mm_unpacklo_epi64(C0, C1); // a0 a1 a3 a2 a3 a2 a0 a1
479 const __m128i D1 = _mm_madd_epi16(C1, kMult2); // out2, out3 397 *out = _mm_madd_epi16(D, kMult);
480 *out = _mm_unpacklo_epi64(D0, D1);
481 } 398 }
482 399
483 static void FTransformWHT(const int16_t* in, int16_t* out) { 400 static void FTransformWHT(const int16_t* in, int16_t* out) {
401 // Input is 12b signed.
484 __m128i row0, row1, row2, row3; 402 __m128i row0, row1, row2, row3;
403 // Rows are 14b signed.
485 FTransformWHTRow(in + 0 * 64, &row0); 404 FTransformWHTRow(in + 0 * 64, &row0);
486 FTransformWHTRow(in + 1 * 64, &row1); 405 FTransformWHTRow(in + 1 * 64, &row1);
487 FTransformWHTRow(in + 2 * 64, &row2); 406 FTransformWHTRow(in + 2 * 64, &row2);
488 FTransformWHTRow(in + 3 * 64, &row3); 407 FTransformWHTRow(in + 3 * 64, &row3);
489 408
490 { 409 {
410 // The a* are 15b signed.
491 const __m128i a0 = _mm_add_epi32(row0, row2); 411 const __m128i a0 = _mm_add_epi32(row0, row2);
492 const __m128i a1 = _mm_add_epi32(row1, row3); 412 const __m128i a1 = _mm_add_epi32(row1, row3);
493 const __m128i a2 = _mm_sub_epi32(row1, row3); 413 const __m128i a2 = _mm_sub_epi32(row1, row3);
494 const __m128i a3 = _mm_sub_epi32(row0, row2); 414 const __m128i a3 = _mm_sub_epi32(row0, row2);
495 const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1); 415 const __m128i a0a3 = _mm_packs_epi32(a0, a3);
496 const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1); 416 const __m128i a1a2 = _mm_packs_epi32(a1, a2);
497 const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1); 417
498 const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1); 418 // The b* are 16b signed.
499 const __m128i out0 = _mm_packs_epi32(b0, b1); 419 const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2);
500 const __m128i out1 = _mm_packs_epi32(b2, b3); 420 const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2);
501 _mm_storeu_si128((__m128i*)&out[0], out0); 421 const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2);
502 _mm_storeu_si128((__m128i*)&out[8], out1); 422 const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2);
423
424 _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1));
425 _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1));
503 } 426 }
504 } 427 }
505 428
506 //------------------------------------------------------------------------------ 429 //------------------------------------------------------------------------------
507 // Compute susceptibility based on DCT-coeff histograms: 430 // Compute susceptibility based on DCT-coeff histograms:
508 // the higher, the "easier" the macroblock is to compress. 431 // the higher, the "easier" the macroblock is to compress.
509 432
510 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, 433 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
511 int start_block, int end_block, 434 int start_block, int end_block,
512 VP8Histogram* const histo) { 435 VP8Histogram* const histo) {
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after
685 if (top != NULL) { 608 if (top != NULL) {
686 VerticalPred(dst, top, size); 609 VerticalPred(dst, top, size);
687 } else { 610 } else {
688 Fill(dst, 129, size); 611 Fill(dst, 129, size);
689 } 612 }
690 } 613 }
691 } 614 }
692 615
693 static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left, 616 static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
694 const uint8_t* top) { 617 const uint8_t* top) {
695 const __m128i zero = _mm_setzero_si128();
696 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); 618 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
697 const __m128i left_values = _mm_loadl_epi64((const __m128i*)left); 619 const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
698 const __m128i sum_top = _mm_sad_epu8(top_values, zero); 620 const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
699 const __m128i sum_left = _mm_sad_epu8(left_values, zero); 621 const int DC = VP8HorizontalAdd8b(&combined) + 8;
700 const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 8;
701 Put8x8uv(DC >> 4, dst); 622 Put8x8uv(DC >> 4, dst);
702 } 623 }
703 624
704 static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) { 625 static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
705 const __m128i zero = _mm_setzero_si128(); 626 const __m128i zero = _mm_setzero_si128();
706 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); 627 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
707 const __m128i sum = _mm_sad_epu8(top_values, zero); 628 const __m128i sum = _mm_sad_epu8(top_values, zero);
708 const int DC = _mm_cvtsi128_si32(sum) + 4; 629 const int DC = _mm_cvtsi128_si32(sum) + 4;
709 Put8x8uv(DC >> 3, dst); 630 Put8x8uv(DC >> 3, dst);
710 } 631 }
(...skipping 17 matching lines...) Expand all
728 } 649 }
729 } else if (left != NULL) { // left but no top 650 } else if (left != NULL) { // left but no top
730 DC8uvNoTop(dst, left); 651 DC8uvNoTop(dst, left);
731 } else { // no top, no left, nothing. 652 } else { // no top, no left, nothing.
732 DC8uvNoTopLeft(dst); 653 DC8uvNoTopLeft(dst);
733 } 654 }
734 } 655 }
735 656
736 static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left, 657 static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
737 const uint8_t* top) { 658 const uint8_t* top) {
738 const __m128i zero = _mm_setzero_si128();
739 const __m128i top_row = _mm_load_si128((const __m128i*)top); 659 const __m128i top_row = _mm_load_si128((const __m128i*)top);
740 const __m128i left_row = _mm_load_si128((const __m128i*)left); 660 const __m128i left_row = _mm_load_si128((const __m128i*)left);
741 const __m128i sad8x2 = _mm_sad_epu8(top_row, zero); 661 const int DC =
742 // sum the two sads: sad8x2[0:1] + sad8x2[8:9] 662 VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
743 const __m128i sum_top = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
744 const __m128i sad8x2_left = _mm_sad_epu8(left_row, zero);
745 // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
746 const __m128i sum_left =
747 _mm_add_epi16(sad8x2_left, _mm_shuffle_epi32(sad8x2_left, 2));
748 const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 16;
749 Put16(DC >> 5, dst); 663 Put16(DC >> 5, dst);
750 } 664 }
751 665
752 static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) { 666 static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
753 const __m128i zero = _mm_setzero_si128();
754 const __m128i top_row = _mm_load_si128((const __m128i*)top); 667 const __m128i top_row = _mm_load_si128((const __m128i*)top);
755 const __m128i sad8x2 = _mm_sad_epu8(top_row, zero); 668 const int DC = VP8HorizontalAdd8b(&top_row) + 8;
756 // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
757 const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
758 const int DC = _mm_cvtsi128_si32(sum) + 8;
759 Put16(DC >> 4, dst); 669 Put16(DC >> 4, dst);
760 } 670 }
761 671
762 static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) { 672 static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
763 // 'left' is contiguous so we can reuse the top summation. 673 // 'left' is contiguous so we can reuse the top summation.
764 DC16NoLeft(dst, left); 674 DC16NoLeft(dst, left);
765 } 675 }
766 676
767 static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) { 677 static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
768 Put16(0x80, dst); 678 Put16(0x80, dst);
(...skipping 366 matching lines...) Expand 10 before | Expand all | Expand 10 after
1135 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); 1045 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
1136 } 1046 }
1137 1047
1138 //------------------------------------------------------------------------------ 1048 //------------------------------------------------------------------------------
1139 // Texture distortion 1049 // Texture distortion
1140 // 1050 //
1141 // We try to match the spectral content (weighted) between source and 1051 // We try to match the spectral content (weighted) between source and
1142 // reconstructed samples. 1052 // reconstructed samples.
1143 1053
1144 // Hadamard transform 1054 // Hadamard transform
1145 // Returns the difference between the weighted sum of the absolute value of 1055 // Returns the weighted sum of the absolute value of transformed coefficients.
1146 // transformed coefficients. 1056 // w[] contains a row-major 4 by 4 symmetric matrix.
1147 static int TTransform(const uint8_t* inA, const uint8_t* inB, 1057 static int TTransform(const uint8_t* inA, const uint8_t* inB,
1148 const uint16_t* const w) { 1058 const uint16_t* const w) {
1149 int32_t sum[4]; 1059 int32_t sum[4];
1150 __m128i tmp_0, tmp_1, tmp_2, tmp_3; 1060 __m128i tmp_0, tmp_1, tmp_2, tmp_3;
1151 const __m128i zero = _mm_setzero_si128(); 1061 const __m128i zero = _mm_setzero_si128();
1152 1062
1153 // Load, combine and transpose inputs. 1063 // Load and combine inputs.
1154 { 1064 {
1155 const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); 1065 const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
1156 const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); 1066 const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
1157 const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); 1067 const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
1158 const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); 1068 const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
1159 const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); 1069 const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
1160 const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); 1070 const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
1161 const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); 1071 const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
1162 const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); 1072 const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
1163 1073
1164 // Combine inA and inB (we'll do two transforms in parallel). 1074 // Combine inA and inB (we'll do two transforms in parallel).
1165 const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); 1075 const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
1166 const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); 1076 const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
1167 const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); 1077 const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
1168 const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); 1078 const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
1169 // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 1079 tmp_0 = _mm_unpacklo_epi8(inAB_0, zero);
1170 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 1080 tmp_1 = _mm_unpacklo_epi8(inAB_1, zero);
1171 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 1081 tmp_2 = _mm_unpacklo_epi8(inAB_2, zero);
1172 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 1082 tmp_3 = _mm_unpacklo_epi8(inAB_3, zero);
1173 1083 // a00 a01 a02 a03 b00 b01 b02 b03
1174 // Transpose the two 4x4, discarding the filling zeroes. 1084 // a10 a11 a12 a13 b10 b11 b12 b13
1175 const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); 1085 // a20 a21 a22 a23 b20 b21 b22 b23
1176 const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); 1086 // a30 a31 a32 a33 b30 b31 b32 b33
1177 // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23
1178 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33
1179 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
1180 const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
1181 // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31
1182 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33
1183
1184 // Convert to 16b.
1185 tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
1186 tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
1187 tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
1188 tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
1189 // a00 a10 a20 a30 b00 b10 b20 b30
1190 // a01 a11 a21 a31 b01 b11 b21 b31
1191 // a02 a12 a22 a32 b02 b12 b22 b32
1192 // a03 a13 a23 a33 b03 b13 b23 b33
1193 } 1087 }
1194 1088
1195 // Horizontal pass and subsequent transpose. 1089 // Vertical pass first to avoid a transpose (vertical and horizontal passes
1090 // are commutative because w/kWeightY is symmetric) and subsequent transpose.
1196 { 1091 {
1197 // Calculate a and b (two 4x4 at once). 1092 // Calculate a and b (two 4x4 at once).
1198 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); 1093 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
1199 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); 1094 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
1200 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); 1095 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
1201 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); 1096 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
1202 const __m128i b0 = _mm_add_epi16(a0, a1); 1097 const __m128i b0 = _mm_add_epi16(a0, a1);
1203 const __m128i b1 = _mm_add_epi16(a3, a2); 1098 const __m128i b1 = _mm_add_epi16(a3, a2);
1204 const __m128i b2 = _mm_sub_epi16(a3, a2); 1099 const __m128i b2 = _mm_sub_epi16(a3, a2);
1205 const __m128i b3 = _mm_sub_epi16(a0, a1); 1100 const __m128i b3 = _mm_sub_epi16(a0, a1);
1206 // a00 a01 a02 a03 b00 b01 b02 b03 1101 // a00 a01 a02 a03 b00 b01 b02 b03
1207 // a10 a11 a12 a13 b10 b11 b12 b13 1102 // a10 a11 a12 a13 b10 b11 b12 b13
1208 // a20 a21 a22 a23 b20 b21 b22 b23 1103 // a20 a21 a22 a23 b20 b21 b22 b23
1209 // a30 a31 a32 a33 b30 b31 b32 b33 1104 // a30 a31 a32 a33 b30 b31 b32 b33
1210 1105
1211 // Transpose the two 4x4. 1106 // Transpose the two 4x4.
1212 const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); 1107 VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
1213 const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
1214 const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
1215 const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
1216 // a00 a10 a01 a11 a02 a12 a03 a13
1217 // a20 a30 a21 a31 a22 a32 a23 a33
1218 // b00 b10 b01 b11 b02 b12 b03 b13
1219 // b20 b30 b21 b31 b22 b32 b23 b33
1220 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
1221 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
1222 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
1223 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
1224 // a00 a10 a20 a30 a01 a11 a21 a31
1225 // b00 b10 b20 b30 b01 b11 b21 b31
1226 // a02 a12 a22 a32 a03 a13 a23 a33
1227 // b02 b12 b22 b32 b03 b13 b23 b33
1228 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
1229 tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
1230 tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
1231 tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
1232 // a00 a10 a20 a30 b00 b10 b20 b30
1233 // a01 a11 a21 a31 b01 b11 b21 b31
1234 // a02 a12 a22 a32 b02 b12 b22 b32
1235 // a03 a13 a23 a33 b03 b13 b23 b33
1236 } 1108 }
1237 1109
1238 // Vertical pass and difference of weighted sums. 1110 // Horizontal pass and difference of weighted sums.
1239 { 1111 {
1240 // Load all inputs. 1112 // Load all inputs.
1241 const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); 1113 const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
1242 const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); 1114 const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
1243 1115
1244 // Calculate a and b (two 4x4 at once). 1116 // Calculate a and b (two 4x4 at once).
1245 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); 1117 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
1246 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); 1118 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
1247 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); 1119 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
1248 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); 1120 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after
1459 VP8SSE4x4 = SSE4x4; 1331 VP8SSE4x4 = SSE4x4;
1460 VP8TDisto4x4 = Disto4x4; 1332 VP8TDisto4x4 = Disto4x4;
1461 VP8TDisto16x16 = Disto16x16; 1333 VP8TDisto16x16 = Disto16x16;
1462 } 1334 }
1463 1335
1464 #else // !WEBP_USE_SSE2 1336 #else // !WEBP_USE_SSE2
1465 1337
1466 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2) 1338 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
1467 1339
1468 #endif // WEBP_USE_SSE2 1340 #endif // WEBP_USE_SSE2
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/enc_neon.c ('k') | third_party/libwebp/dsp/enc_sse41.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698