third_party/libwebp/dsp/enc_sse2.c - Issue 2149863002: libwebp: update to v0.5.1

Side by Side Diff: third_party/libwebp/dsp/enc_sse2.c

Issue 2149863002: libwebp: update to v0.5.1 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2011 Google Inc. All Rights Reserved.	1 // Copyright 2011 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // SSE2 version of speed-critical encoding functions.	10 // SSE2 version of speed-critical encoding functions.

11 //	11 //

12 // Author: Christian Duvivier (cduvivier@google.com)	12 // Author: Christian Duvivier (cduvivier@google.com)

13	13

14 #include "./dsp.h"	14 #include "./dsp.h"

15	15

16 #if defined(WEBP_USE_SSE2)	16 #if defined(WEBP_USE_SSE2)

17 #include <stdlib.h> // for abs()	17 #include <stdlib.h> // for abs()

18 #include <emmintrin.h>	18 #include <emmintrin.h>

19	19

	20 #include "./common_sse2.h"

20 #include "../enc/cost.h"	21 #include "../enc/cost.h"

21 #include "../enc/vp8enci.h"	22 #include "../enc/vp8enci.h"

22	23

23 //------------------------------------------------------------------------------	24 //------------------------------------------------------------------------------

24 // Quite useful macro for debugging. Left here for convenience.

25

26 #if 0

27 #include <stdio.h>

28 static void PrintReg(const __m128i r, const char* const name, int size) {

29 int n;

30 union {

31 __m128i r;

32 uint8_t i8[16];

33 uint16_t i16[8];

34 uint32_t i32[4];

35 uint64_t i64[2];

36 } tmp;

37 tmp.r = r;

38 fprintf(stderr, "%s\t: ", name);

39 if (size == 8) {

40 for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);

41 } else if (size == 16) {

42 for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);

43 } else if (size == 32) {

44 for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);

45 } else {

46 for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]);

47 }

48 fprintf(stderr, "\n");

49 }

50 #endif

51

52 //------------------------------------------------------------------------------

53 // Transforms (Paragraph 14.4)	25 // Transforms (Paragraph 14.4)

54	26

55 // Does one or two inverse transforms.	27 // Does one or two inverse transforms.

56 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,	28 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,

57 int do_two) {	29 int do_two) {

58 // This implementation makes use of 16-bit fixed point versions of two	30 // This implementation makes use of 16-bit fixed point versions of two

59 // multiply constants:	31 // multiply constants:

60 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16	32 // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16

61 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16	33 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16

62 //	34 //

(...skipping 61 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
124 const __m128i d4 = _mm_add_epi16(d1, d2);	96 const __m128i d4 = _mm_add_epi16(d1, d2);

125 const __m128i d = _mm_add_epi16(d3, d4);	97 const __m128i d = _mm_add_epi16(d3, d4);

126	98

127 // Second pass.	99 // Second pass.

128 const __m128i tmp0 = _mm_add_epi16(a, d);	100 const __m128i tmp0 = _mm_add_epi16(a, d);

129 const __m128i tmp1 = _mm_add_epi16(b, c);	101 const __m128i tmp1 = _mm_add_epi16(b, c);

130 const __m128i tmp2 = _mm_sub_epi16(b, c);	102 const __m128i tmp2 = _mm_sub_epi16(b, c);

131 const __m128i tmp3 = _mm_sub_epi16(a, d);	103 const __m128i tmp3 = _mm_sub_epi16(a, d);

132	104

133 // Transpose the two 4x4.	105 // Transpose the two 4x4.

134 // a00 a01 a02 a03 b00 b01 b02 b03	106 VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3);

135 // a10 a11 a12 a13 b10 b11 b12 b13

136 // a20 a21 a22 a23 b20 b21 b22 b23

137 // a30 a31 a32 a33 b30 b31 b32 b33

138 const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);

139 const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);

140 const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);

141 const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);

142 // a00 a10 a01 a11 a02 a12 a03 a13

143 // a20 a30 a21 a31 a22 a32 a23 a33

144 // b00 b10 b01 b11 b02 b12 b03 b13

145 // b20 b30 b21 b31 b22 b32 b23 b33

146 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);

147 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);

148 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);

149 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);

150 // a00 a10 a20 a30 a01 a11 a21 a31

151 // b00 b10 b20 b30 b01 b11 b21 b31

152 // a02 a12 a22 a32 a03 a13 a23 a33

153 // b02 b12 b22 b32 b03 b13 b23 b33

154 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);

155 T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);

156 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);

157 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);

158 // a00 a10 a20 a30 b00 b10 b20 b30

159 // a01 a11 a21 a31 b01 b11 b21 b31

160 // a02 a12 a22 a32 b02 b12 b22 b32

161 // a03 a13 a23 a33 b03 b13 b23 b33

162 }	107 }

163	108

164 // Horizontal pass and subsequent transpose.	109 // Horizontal pass and subsequent transpose.

165 {	110 {

166 // First pass, c and d calculations are longer because of the "trick"	111 // First pass, c and d calculations are longer because of the "trick"

167 // multiplications.	112 // multiplications.

168 const __m128i four = _mm_set1_epi16(4);	113 const __m128i four = _mm_set1_epi16(4);

169 const __m128i dc = _mm_add_epi16(T0, four);	114 const __m128i dc = _mm_add_epi16(T0, four);

170 const __m128i a = _mm_add_epi16(dc, T2);	115 const __m128i a = _mm_add_epi16(dc, T2);

171 const __m128i b = _mm_sub_epi16(dc, T2);	116 const __m128i b = _mm_sub_epi16(dc, T2);

(...skipping 14 matching lines...) Expand all Loading...
186 const __m128i tmp0 = _mm_add_epi16(a, d);	131 const __m128i tmp0 = _mm_add_epi16(a, d);

187 const __m128i tmp1 = _mm_add_epi16(b, c);	132 const __m128i tmp1 = _mm_add_epi16(b, c);

188 const __m128i tmp2 = _mm_sub_epi16(b, c);	133 const __m128i tmp2 = _mm_sub_epi16(b, c);

189 const __m128i tmp3 = _mm_sub_epi16(a, d);	134 const __m128i tmp3 = _mm_sub_epi16(a, d);

190 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);	135 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);

191 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);	136 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);

192 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);	137 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);

193 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);	138 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

194	139

195 // Transpose the two 4x4.	140 // Transpose the two 4x4.

196 // a00 a01 a02 a03 b00 b01 b02 b03	141 VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,

197 // a10 a11 a12 a13 b10 b11 b12 b13	142 &T2, &T3);

198 // a20 a21 a22 a23 b20 b21 b22 b23

199 // a30 a31 a32 a33 b30 b31 b32 b33

200 const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);

201 const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);

202 const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);

203 const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);

204 // a00 a10 a01 a11 a02 a12 a03 a13

205 // a20 a30 a21 a31 a22 a32 a23 a33

206 // b00 b10 b01 b11 b02 b12 b03 b13

207 // b20 b30 b21 b31 b22 b32 b23 b33

208 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);

209 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);

210 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);

211 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);

212 // a00 a10 a20 a30 a01 a11 a21 a31

213 // b00 b10 b20 b30 b01 b11 b21 b31

214 // a02 a12 a22 a32 a03 a13 a23 a33

215 // b02 b12 b22 b32 b03 b13 b23 b33

216 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);

217 T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);

218 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);

219 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);

220 // a00 a10 a20 a30 b00 b10 b20 b30

221 // a01 a11 a21 a31 b01 b11 b21 b31

222 // a02 a12 a22 a32 b02 b12 b22 b32

223 // a03 a13 a23 a33 b03 b13 b23 b33

224 }	143 }

225	144

226 // Add inverse transform to 'ref' and store.	145 // Add inverse transform to 'ref' and store.

227 {	146 {

228 const __m128i zero = _mm_setzero_si128();	147 const __m128i zero = _mm_setzero_si128();

229 // Load the reference(s).	148 // Load the reference(s).

230 __m128i ref0, ref1, ref2, ref3;	149 __m128i ref0, ref1, ref2, ref3;

231 if (do_two) {	150 if (do_two) {

232 // Load eight bytes/pixels per line.	151 // Load eight bytes/pixels per line.

233 ref0 = _mm_loadl_epi64((const __m128i)&ref[0 BPS]);	152 ref0 = _mm_loadl_epi64((const __m128i)&ref[0 BPS]);

(...skipping 132 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
366 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));	285 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

367	286

368 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);	287 const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);

369 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);	288 const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);

370 _mm_storeu_si128((__m128i*)&out[0], d0_g1);	289 _mm_storeu_si128((__m128i*)&out[0], d0_g1);

371 _mm_storeu_si128((__m128i*)&out[8], d2_f3);	290 _mm_storeu_si128((__m128i*)&out[8], d2_f3);

372 }	291 }

373	292

374 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {	293 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {

375 const __m128i zero = _mm_setzero_si128();	294 const __m128i zero = _mm_setzero_si128();

376	295 // Load src.

377 // Load src and convert to 16b.

378 const __m128i src0 = _mm_loadl_epi64((const __m128i)&src[0 BPS]);	296 const __m128i src0 = _mm_loadl_epi64((const __m128i)&src[0 BPS]);

379 const __m128i src1 = _mm_loadl_epi64((const __m128i)&src[1 BPS]);	297 const __m128i src1 = _mm_loadl_epi64((const __m128i)&src[1 BPS]);

380 const __m128i src2 = _mm_loadl_epi64((const __m128i)&src[2 BPS]);	298 const __m128i src2 = _mm_loadl_epi64((const __m128i)&src[2 BPS]);

381 const __m128i src3 = _mm_loadl_epi64((const __m128i)&src[3 BPS]);	299 const __m128i src3 = _mm_loadl_epi64((const __m128i)&src[3 BPS]);

382 const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);	300 // 00 01 02 03 *

383 const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);	301 // 10 11 12 13 *

384 const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);	302 // 20 21 22 23 *

385 const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);	303 // 30 31 32 33 *

386 // Load ref and convert to 16b.	304 // Shuffle.

	305 const __m128i src_0 = _mm_unpacklo_epi16(src0, src1);

	306 const __m128i src_1 = _mm_unpacklo_epi16(src2, src3);

	307 // 00 01 10 11 02 03 12 13 * * ...

	308 // 20 21 30 31 22 23 32 33 * * ...

	309

	310 // Load ref.

387 const __m128i ref0 = _mm_loadl_epi64((const __m128i)&ref[0 BPS]);	311 const __m128i ref0 = _mm_loadl_epi64((const __m128i)&ref[0 BPS]);

388 const __m128i ref1 = _mm_loadl_epi64((const __m128i)&ref[1 BPS]);	312 const __m128i ref1 = _mm_loadl_epi64((const __m128i)&ref[1 BPS]);

389 const __m128i ref2 = _mm_loadl_epi64((const __m128i)&ref[2 BPS]);	313 const __m128i ref2 = _mm_loadl_epi64((const __m128i)&ref[2 BPS]);

390 const __m128i ref3 = _mm_loadl_epi64((const __m128i)&ref[3 BPS]);	314 const __m128i ref3 = _mm_loadl_epi64((const __m128i)&ref[3 BPS]);

391 const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);	315 const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1);

392 const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);	316 const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3);

393 const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);

394 const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);

395 // Compute difference. -> 00 01 02 03 00 00 00 00

396 const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);

397 const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);

398 const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);

399 const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

400	317

401 // Unpack and shuffle	318 // Convert both to 16 bit.

402 // 00 01 02 03 0 0 0 0	319 const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero);

403 // 10 11 12 13 0 0 0 0	320 const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero);

404 // 20 21 22 23 0 0 0 0	321 const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero);

405 // 30 31 32 33 0 0 0 0	322 const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero);

406 const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);	323

407 const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);	324 // Compute the difference.

	325 const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b);

	326 const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b);

408 __m128i v01, v32;	327 __m128i v01, v32;

409	328

410 // First pass	329 // First pass

411 FTransformPass1(&shuf01, &shuf23, &v01, &v32);	330 FTransformPass1(&row01, &row23, &v01, &v32);

412	331

413 // Second pass	332 // Second pass

414 FTransformPass2(&v01, &v32, out);	333 FTransformPass2(&v01, &v32, out);

415 }	334 }

416	335

417 static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {	336 static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {

418 const __m128i zero = _mm_setzero_si128();	337 const __m128i zero = _mm_setzero_si128();

419	338

420 // Load src and convert to 16b.	339 // Load src and convert to 16b.

421 const __m128i src0 = _mm_loadl_epi64((const __m128i)&src[0 BPS]);	340 const __m128i src0 = _mm_loadl_epi64((const __m128i)&src[0 BPS]);

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
456 // First pass	375 // First pass

457 FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);	376 FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);

458 FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);	377 FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);

459	378

460 // Second pass	379 // Second pass

461 FTransformPass2(&v01l, &v32l, out + 0);	380 FTransformPass2(&v01l, &v32l, out + 0);

462 FTransformPass2(&v01h, &v32h, out + 16);	381 FTransformPass2(&v01h, &v32h, out + 16);

463 }	382 }

464	383

465 static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {	384 static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {

466 const __m128i kMult1 = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1);	385 const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);

467 const __m128i kMult2 = _mm_set_epi16(0, 0, 0, 0, -1, 1, -1, 1);

468 const __m128i src0 = _mm_loadl_epi64((__m128i)&in[0 16]);	386 const __m128i src0 = _mm_loadl_epi64((__m128i)&in[0 16]);

469 const __m128i src1 = _mm_loadl_epi64((__m128i)&in[1 16]);	387 const __m128i src1 = _mm_loadl_epi64((__m128i)&in[1 16]);

470 const __m128i src2 = _mm_loadl_epi64((__m128i)&in[2 16]);	388 const __m128i src2 = _mm_loadl_epi64((__m128i)&in[2 16]);

471 const __m128i src3 = _mm_loadl_epi64((__m128i)&in[3 16]);	389 const __m128i src3 = _mm_loadl_epi64((__m128i)&in[3 16]);

472 const __m128i A01 = _mm_unpacklo_epi16(src0, src1); // A0 A1 | ...	390 const __m128i A01 = _mm_unpacklo_epi16(src0, src1); // A0 A1 | ...

473 const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ...	391 const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ...

474 const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ...	392 const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ...

475 const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ...	393 const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ...

476 const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2	394 const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 | ...

477 const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1	395 const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 | ...

478 const __m128i D0 = _mm_madd_epi16(C0, kMult1); // out0, out1	396 const __m128i D = _mm_unpacklo_epi64(C0, C1); // a0 a1 a3 a2 a3 a2 a0 a1

479 const __m128i D1 = _mm_madd_epi16(C1, kMult2); // out2, out3	397 *out = _mm_madd_epi16(D, kMult);

480 *out = _mm_unpacklo_epi64(D0, D1);

481 }	398 }

482	399

483 static void FTransformWHT(const int16_t* in, int16_t* out) {	400 static void FTransformWHT(const int16_t* in, int16_t* out) {

	401 // Input is 12b signed.

484 __m128i row0, row1, row2, row3;	402 __m128i row0, row1, row2, row3;

	403 // Rows are 14b signed.

485 FTransformWHTRow(in + 0 * 64, &row0);	404 FTransformWHTRow(in + 0 * 64, &row0);

486 FTransformWHTRow(in + 1 * 64, &row1);	405 FTransformWHTRow(in + 1 * 64, &row1);

487 FTransformWHTRow(in + 2 * 64, &row2);	406 FTransformWHTRow(in + 2 * 64, &row2);

488 FTransformWHTRow(in + 3 * 64, &row3);	407 FTransformWHTRow(in + 3 * 64, &row3);

489	408

490 {	409 {

	410 // The a* are 15b signed.

491 const __m128i a0 = _mm_add_epi32(row0, row2);	411 const __m128i a0 = _mm_add_epi32(row0, row2);

492 const __m128i a1 = _mm_add_epi32(row1, row3);	412 const __m128i a1 = _mm_add_epi32(row1, row3);

493 const __m128i a2 = _mm_sub_epi32(row1, row3);	413 const __m128i a2 = _mm_sub_epi32(row1, row3);

494 const __m128i a3 = _mm_sub_epi32(row0, row2);	414 const __m128i a3 = _mm_sub_epi32(row0, row2);

495 const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);	415 const __m128i a0a3 = _mm_packs_epi32(a0, a3);

496 const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);	416 const __m128i a1a2 = _mm_packs_epi32(a1, a2);

497 const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);	417

498 const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);	418 // The b* are 16b signed.

499 const __m128i out0 = _mm_packs_epi32(b0, b1);	419 const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2);

500 const __m128i out1 = _mm_packs_epi32(b2, b3);	420 const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2);

501 _mm_storeu_si128((__m128i*)&out[0], out0);	421 const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2);

502 _mm_storeu_si128((__m128i*)&out[8], out1);	422 const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2);

	423

	424 _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1));

	425 _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1));

503 }	426 }

504 }	427 }

505	428

506 //------------------------------------------------------------------------------	429 //------------------------------------------------------------------------------

507 // Compute susceptibility based on DCT-coeff histograms:	430 // Compute susceptibility based on DCT-coeff histograms:

508 // the higher, the "easier" the macroblock is to compress.	431 // the higher, the "easier" the macroblock is to compress.

509	432

510 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,	433 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,

511 int start_block, int end_block,	434 int start_block, int end_block,

512 VP8Histogram* const histo) {	435 VP8Histogram* const histo) {

(...skipping 172 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
685 if (top != NULL) {	608 if (top != NULL) {

686 VerticalPred(dst, top, size);	609 VerticalPred(dst, top, size);

687 } else {	610 } else {

688 Fill(dst, 129, size);	611 Fill(dst, 129, size);

689 }	612 }

690 }	613 }

691 }	614 }

692	615

693 static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,	616 static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,

694 const uint8_t* top) {	617 const uint8_t* top) {

695 const __m128i zero = _mm_setzero_si128();

696 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);	618 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);

697 const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);	619 const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);

698 const __m128i sum_top = _mm_sad_epu8(top_values, zero);	620 const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);

699 const __m128i sum_left = _mm_sad_epu8(left_values, zero);	621 const int DC = VP8HorizontalAdd8b(&combined) + 8;

700 const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 8;

701 Put8x8uv(DC >> 4, dst);	622 Put8x8uv(DC >> 4, dst);

702 }	623 }

703	624

704 static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {	625 static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {

705 const __m128i zero = _mm_setzero_si128();	626 const __m128i zero = _mm_setzero_si128();

706 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);	627 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);

707 const __m128i sum = _mm_sad_epu8(top_values, zero);	628 const __m128i sum = _mm_sad_epu8(top_values, zero);

708 const int DC = _mm_cvtsi128_si32(sum) + 4;	629 const int DC = _mm_cvtsi128_si32(sum) + 4;

709 Put8x8uv(DC >> 3, dst);	630 Put8x8uv(DC >> 3, dst);

710 }	631 }

(...skipping 17 matching lines...) Expand all Loading...
728 }	649 }

729 } else if (left != NULL) { // left but no top	650 } else if (left != NULL) { // left but no top

730 DC8uvNoTop(dst, left);	651 DC8uvNoTop(dst, left);

731 } else { // no top, no left, nothing.	652 } else { // no top, no left, nothing.

732 DC8uvNoTopLeft(dst);	653 DC8uvNoTopLeft(dst);

733 }	654 }

734 }	655 }

735	656

736 static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,	657 static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,

737 const uint8_t* top) {	658 const uint8_t* top) {

738 const __m128i zero = _mm_setzero_si128();

739 const __m128i top_row = _mm_load_si128((const __m128i*)top);	659 const __m128i top_row = _mm_load_si128((const __m128i*)top);

740 const __m128i left_row = _mm_load_si128((const __m128i*)left);	660 const __m128i left_row = _mm_load_si128((const __m128i*)left);

741 const __m128i sad8x2 = _mm_sad_epu8(top_row, zero);	661 const int DC =

742 // sum the two sads: sad8x2[0:1] + sad8x2[8:9]	662 VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;

743 const __m128i sum_top = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));

744 const __m128i sad8x2_left = _mm_sad_epu8(left_row, zero);

745 // sum the two sads: sad8x2[0:1] + sad8x2[8:9]

746 const __m128i sum_left =

747 _mm_add_epi16(sad8x2_left, _mm_shuffle_epi32(sad8x2_left, 2));

748 const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 16;

749 Put16(DC >> 5, dst);	663 Put16(DC >> 5, dst);

750 }	664 }

751	665

752 static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {	666 static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {

753 const __m128i zero = _mm_setzero_si128();

754 const __m128i top_row = _mm_load_si128((const __m128i*)top);	667 const __m128i top_row = _mm_load_si128((const __m128i*)top);

755 const __m128i sad8x2 = _mm_sad_epu8(top_row, zero);	668 const int DC = VP8HorizontalAdd8b(&top_row) + 8;

756 // sum the two sads: sad8x2[0:1] + sad8x2[8:9]

757 const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));

758 const int DC = _mm_cvtsi128_si32(sum) + 8;

759 Put16(DC >> 4, dst);	669 Put16(DC >> 4, dst);

760 }	670 }

761	671

762 static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {	672 static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {

763 // 'left' is contiguous so we can reuse the top summation.	673 // 'left' is contiguous so we can reuse the top summation.

764 DC16NoLeft(dst, left);	674 DC16NoLeft(dst, left);

765 }	675 }

766	676

767 static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {	677 static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {

768 Put16(0x80, dst);	678 Put16(0x80, dst);

(...skipping 366 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1135 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);	1045 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);

1136 }	1046 }

1137	1047

1138 //------------------------------------------------------------------------------	1048 //------------------------------------------------------------------------------

1139 // Texture distortion	1049 // Texture distortion

1140 //	1050 //

1141 // We try to match the spectral content (weighted) between source and	1051 // We try to match the spectral content (weighted) between source and

1142 // reconstructed samples.	1052 // reconstructed samples.

1143	1053

1144 // Hadamard transform	1054 // Hadamard transform

1145 // Returns the difference between the weighted sum of the absolute value of	1055 // Returns the weighted sum of the absolute value of transformed coefficients.

1146 // transformed coefficients.	1056 // w[] contains a row-major 4 by 4 symmetric matrix.

1147 static int TTransform(const uint8_t* inA, const uint8_t* inB,	1057 static int TTransform(const uint8_t* inA, const uint8_t* inB,

1148 const uint16_t* const w) {	1058 const uint16_t* const w) {

1149 int32_t sum[4];	1059 int32_t sum[4];

1150 __m128i tmp_0, tmp_1, tmp_2, tmp_3;	1060 __m128i tmp_0, tmp_1, tmp_2, tmp_3;

1151 const __m128i zero = _mm_setzero_si128();	1061 const __m128i zero = _mm_setzero_si128();

1152	1062

1153 // Load, combine and transpose inputs.	1063 // Load and combine inputs.

1154 {	1064 {

1155 const __m128i inA_0 = _mm_loadl_epi64((const __m128i)&inA[BPS 0]);	1065 const __m128i inA_0 = _mm_loadl_epi64((const __m128i)&inA[BPS 0]);

1156 const __m128i inA_1 = _mm_loadl_epi64((const __m128i)&inA[BPS 1]);	1066 const __m128i inA_1 = _mm_loadl_epi64((const __m128i)&inA[BPS 1]);

1157 const __m128i inA_2 = _mm_loadl_epi64((const __m128i)&inA[BPS 2]);	1067 const __m128i inA_2 = _mm_loadl_epi64((const __m128i)&inA[BPS 2]);

1158 const __m128i inA_3 = _mm_loadl_epi64((const __m128i)&inA[BPS 3]);	1068 const __m128i inA_3 = _mm_loadl_epi64((const __m128i)&inA[BPS 3]);

1159 const __m128i inB_0 = _mm_loadl_epi64((const __m128i)&inB[BPS 0]);	1069 const __m128i inB_0 = _mm_loadl_epi64((const __m128i)&inB[BPS 0]);

1160 const __m128i inB_1 = _mm_loadl_epi64((const __m128i)&inB[BPS 1]);	1070 const __m128i inB_1 = _mm_loadl_epi64((const __m128i)&inB[BPS 1]);

1161 const __m128i inB_2 = _mm_loadl_epi64((const __m128i)&inB[BPS 2]);	1071 const __m128i inB_2 = _mm_loadl_epi64((const __m128i)&inB[BPS 2]);

1162 const __m128i inB_3 = _mm_loadl_epi64((const __m128i)&inB[BPS 3]);	1072 const __m128i inB_3 = _mm_loadl_epi64((const __m128i)&inB[BPS 3]);

1163	1073

1164 // Combine inA and inB (we'll do two transforms in parallel).	1074 // Combine inA and inB (we'll do two transforms in parallel).

1165 const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);	1075 const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);

1166 const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);	1076 const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);

1167 const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);	1077 const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);

1168 const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);	1078 const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);

1169 // a00 b00 a01 b01 a02 b02 a03 b03 0 0 0 0 0 0 0 0	1079 tmp_0 = _mm_unpacklo_epi8(inAB_0, zero);

1170 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0	1080 tmp_1 = _mm_unpacklo_epi8(inAB_1, zero);

1171 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0	1081 tmp_2 = _mm_unpacklo_epi8(inAB_2, zero);

1172 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0	1082 tmp_3 = _mm_unpacklo_epi8(inAB_3, zero);

1173	1083 // a00 a01 a02 a03 b00 b01 b02 b03

1174 // Transpose the two 4x4, discarding the filling zeroes.	1084 // a10 a11 a12 a13 b10 b11 b12 b13

1175 const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);	1085 // a20 a21 a22 a23 b20 b21 b22 b23

1176 const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);	1086 // a30 a31 a32 a33 b30 b31 b32 b33

1177 // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23

1178 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33

1179 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);

1180 const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);

1181 // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31

1182 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33

1183

1184 // Convert to 16b.

1185 tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);

1186 tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);

1187 tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);

1188 tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);

1189 // a00 a10 a20 a30 b00 b10 b20 b30

1190 // a01 a11 a21 a31 b01 b11 b21 b31

1191 // a02 a12 a22 a32 b02 b12 b22 b32

1192 // a03 a13 a23 a33 b03 b13 b23 b33

1193 }	1087 }

1194	1088

1195 // Horizontal pass and subsequent transpose.	1089 // Vertical pass first to avoid a transpose (vertical and horizontal passes

	1090 // are commutative because w/kWeightY is symmetric) and subsequent transpose.

1196 {	1091 {

1197 // Calculate a and b (two 4x4 at once).	1092 // Calculate a and b (two 4x4 at once).

1198 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);	1093 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);

1199 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);	1094 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);

1200 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);	1095 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);

1201 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);	1096 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);

1202 const __m128i b0 = _mm_add_epi16(a0, a1);	1097 const __m128i b0 = _mm_add_epi16(a0, a1);

1203 const __m128i b1 = _mm_add_epi16(a3, a2);	1098 const __m128i b1 = _mm_add_epi16(a3, a2);

1204 const __m128i b2 = _mm_sub_epi16(a3, a2);	1099 const __m128i b2 = _mm_sub_epi16(a3, a2);

1205 const __m128i b3 = _mm_sub_epi16(a0, a1);	1100 const __m128i b3 = _mm_sub_epi16(a0, a1);

1206 // a00 a01 a02 a03 b00 b01 b02 b03	1101 // a00 a01 a02 a03 b00 b01 b02 b03

1207 // a10 a11 a12 a13 b10 b11 b12 b13	1102 // a10 a11 a12 a13 b10 b11 b12 b13

1208 // a20 a21 a22 a23 b20 b21 b22 b23	1103 // a20 a21 a22 a23 b20 b21 b22 b23

1209 // a30 a31 a32 a33 b30 b31 b32 b33	1104 // a30 a31 a32 a33 b30 b31 b32 b33

1210	1105

1211 // Transpose the two 4x4.	1106 // Transpose the two 4x4.

1212 const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);	1107 VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);

1213 const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);

1214 const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);

1215 const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);

1216 // a00 a10 a01 a11 a02 a12 a03 a13

1217 // a20 a30 a21 a31 a22 a32 a23 a33

1218 // b00 b10 b01 b11 b02 b12 b03 b13

1219 // b20 b30 b21 b31 b22 b32 b23 b33

1220 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);

1221 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);

1222 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);

1223 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);

1224 // a00 a10 a20 a30 a01 a11 a21 a31

1225 // b00 b10 b20 b30 b01 b11 b21 b31

1226 // a02 a12 a22 a32 a03 a13 a23 a33

1227 // b02 b12 b22 b32 b03 b13 b23 b33

1228 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);

1229 tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);

1230 tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);

1231 tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);

1232 // a00 a10 a20 a30 b00 b10 b20 b30

1233 // a01 a11 a21 a31 b01 b11 b21 b31

1234 // a02 a12 a22 a32 b02 b12 b22 b32

1235 // a03 a13 a23 a33 b03 b13 b23 b33

1236 }	1108 }

1237	1109

1238 // Vertical pass and difference of weighted sums.	1110 // Horizontal pass and difference of weighted sums.

1239 {	1111 {

1240 // Load all inputs.	1112 // Load all inputs.

1241 const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);	1113 const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);

1242 const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);	1114 const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

1243	1115

1244 // Calculate a and b (two 4x4 at once).	1116 // Calculate a and b (two 4x4 at once).

1245 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);	1117 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);

1246 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);	1118 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);

1247 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);	1119 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);

1248 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);	1120 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);

(...skipping 210 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1459 VP8SSE4x4 = SSE4x4;	1331 VP8SSE4x4 = SSE4x4;

1460 VP8TDisto4x4 = Disto4x4;	1332 VP8TDisto4x4 = Disto4x4;

1461 VP8TDisto16x16 = Disto16x16;	1333 VP8TDisto16x16 = Disto16x16;

1462 }	1334 }

1463	1335

1464 #else // !WEBP_USE_SSE2	1336 #else // !WEBP_USE_SSE2

1465	1337

1466 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)	1338 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)

1467	1339

1468 #endif // WEBP_USE_SSE2	1340 #endif // WEBP_USE_SSE2

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/enc_neon.c ('k') | third_party/libwebp/dsp/enc_sse41.c » ('j') | no next file with comments »