third_party/libwebp/dsp/enc_sse2.c - Issue 12942006: libwebp: update snapshot to v0.3.0-rc6

Side by Side Diff: third_party/libwebp/dsp/enc_sse2.c

Issue 12942006: libwebp: update snapshot to v0.3.0-rc6 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: local webkit layout expectations Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 Google Inc. All Rights Reserved.	1 // Copyright 2011 Google Inc. All Rights Reserved.

2 //	2 //

3 // This code is licensed under the same terms as WebM:	3 // This code is licensed under the same terms as WebM:

4 // Software License Agreement: http://www.webmproject.org/license/software/	4 // Software License Agreement: http://www.webmproject.org/license/software/

5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/	5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/

6 // -----------------------------------------------------------------------------	6 // -----------------------------------------------------------------------------

7 //	7 //

8 // SSE2 version of speed-critical encoding functions.	8 // SSE2 version of speed-critical encoding functions.

9 //	9 //

10 // Author: Christian Duvivier (cduvivier@google.com)	10 // Author: Christian Duvivier (cduvivier@google.com)

11	11

12 #include "./dsp.h"	12 #include "./dsp.h"

13	13

	14 #if defined(__cplusplus) || defined(c_plusplus)

	15 extern "C" {

	16 #endif

	17

14 #if defined(WEBP_USE_SSE2)	18 #if defined(WEBP_USE_SSE2)

15 #include <stdlib.h> // for abs()	19 #include <stdlib.h> // for abs()

16 #include <emmintrin.h>	20 #include <emmintrin.h>

17	21

18 #include "../enc/vp8enci.h"	22 #include "../enc/vp8enci.h"

19	23

20 #if defined(__cplusplus) || defined(c_plusplus)	24 //------------------------------------------------------------------------------

21 extern "C" {	25 // Quite useful macro for debugging. Left here for convenience.

	26

	27 #if 0

	28 #include <stdio.h>

	29 static void PrintReg(const __m128i r, const char* const name, int size) {

	30 int n;

	31 union {

	32 __m128i r;

	33 uint8_t i8[16];

	34 uint16_t i16[8];

	35 uint32_t i32[4];

	36 uint64_t i64[2];

	37 } tmp;

	38 tmp.r = r;

	39 printf("%s\t: ", name);

	40 if (size == 8) {

	41 for (n = 0; n < 16; ++n) printf("%.2x ", tmp.i8[n]);

	42 } else if (size == 16) {

	43 for (n = 0; n < 8; ++n) printf("%.4x ", tmp.i16[n]);

	44 } else if (size == 32) {

	45 for (n = 0; n < 4; ++n) printf("%.8x ", tmp.i32[n]);

	46 } else {

	47 for (n = 0; n < 2; ++n) printf("%.16lx ", tmp.i64[n]);

	48 }

	49 printf("\n");

	50 }

22 #endif	51 #endif

23	52

24 //------------------------------------------------------------------------------	53 //------------------------------------------------------------------------------

25 // Compute susceptibility based on DCT-coeff histograms:	54 // Compute susceptibility based on DCT-coeff histograms:

26 // the higher, the "easier" the macroblock is to compress.	55 // the higher, the "easier" the macroblock is to compress.

27	56

28 static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,	57 static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,

29 int start_block, int end_block) {	58 int start_block, int end_block,

30 int histo[MAX_COEFF_THRESH + 1] = { 0 };	59 VP8Histogram* const histo) {

31 int16_t out[16];

32 int j, k;

33 const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);	60 const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);

	61 int j;

34 for (j = start_block; j < end_block; ++j) {	62 for (j = start_block; j < end_block; ++j) {

	63 int16_t out[16];

	64 int k;

	65

35 VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);	66 VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

36	67

37 // Convert coefficients to bin (within out[]).	68 // Convert coefficients to bin (within out[]).

38 {	69 {

39 // Load.	70 // Load.

40 const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);	71 const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);

41 const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);	72 const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);

42 // sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative)	73 // sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative)

43 const __m128i sign0 = _mm_srai_epi16(out0, 15);	74 const __m128i sign0 = _mm_srai_epi16(out0, 15);

44 const __m128i sign1 = _mm_srai_epi16(out1, 15);	75 const __m128i sign1 = _mm_srai_epi16(out1, 15);

45 // abs(out) = (out ^ sign) - sign	76 // abs(out) = (out ^ sign) - sign

46 const __m128i xor0 = _mm_xor_si128(out0, sign0);	77 const __m128i xor0 = _mm_xor_si128(out0, sign0);

47 const __m128i xor1 = _mm_xor_si128(out1, sign1);	78 const __m128i xor1 = _mm_xor_si128(out1, sign1);

48 const __m128i abs0 = _mm_sub_epi16(xor0, sign0);	79 const __m128i abs0 = _mm_sub_epi16(xor0, sign0);

49 const __m128i abs1 = _mm_sub_epi16(xor1, sign1);	80 const __m128i abs1 = _mm_sub_epi16(xor1, sign1);

50 // v = abs(out) >> 2	81 // v = abs(out) >> 3

51 const __m128i v0 = _mm_srai_epi16(abs0, 2);	82 const __m128i v0 = _mm_srai_epi16(abs0, 3);

52 const __m128i v1 = _mm_srai_epi16(abs1, 2);	83 const __m128i v1 = _mm_srai_epi16(abs1, 3);

53 // bin = min(v, MAX_COEFF_THRESH)	84 // bin = min(v, MAX_COEFF_THRESH)

54 const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);	85 const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);

55 const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);	86 const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);

56 // Store.	87 // Store.

57 _mm_storeu_si128((__m128i*)&out[0], bin0);	88 _mm_storeu_si128((__m128i*)&out[0], bin0);

58 _mm_storeu_si128((__m128i*)&out[8], bin1);	89 _mm_storeu_si128((__m128i*)&out[8], bin1);

59 }	90 }

60	91

61 // Use bin to update histogram.	92 // Convert coefficients to bin.

62 for (k = 0; k < 16; ++k) {	93 for (k = 0; k < 16; ++k) {

63 histo[out[k]]++;	94 histo->distribution[out[k]]++;

64 }	95 }

65 }	96 }

66

67 return VP8GetAlpha(histo);

68 }	97 }

69	98

70 //------------------------------------------------------------------------------	99 //------------------------------------------------------------------------------

71 // Transforms (Paragraph 14.4)	100 // Transforms (Paragraph 14.4)

72	101

73 // Does one or two inverse transforms.	102 // Does one or two inverse transforms.

74 static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,	103 static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,

75 int do_two) {	104 int do_two) {

76 // This implementation makes use of 16-bit fixed point versions of two	105 // This implementation makes use of 16-bit fixed point versions of two

77 // multiply constants:	106 // multiply constants:

(...skipping 158 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
236 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);	265 T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);

237 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);	266 T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);

238 // a00 a10 a20 a30 b00 b10 b20 b30	267 // a00 a10 a20 a30 b00 b10 b20 b30

239 // a01 a11 a21 a31 b01 b11 b21 b31	268 // a01 a11 a21 a31 b01 b11 b21 b31

240 // a02 a12 a22 a32 b02 b12 b22 b32	269 // a02 a12 a22 a32 b02 b12 b22 b32

241 // a03 a13 a23 a33 b03 b13 b23 b33	270 // a03 a13 a23 a33 b03 b13 b23 b33

242 }	271 }

243	272

244 // Add inverse transform to 'ref' and store.	273 // Add inverse transform to 'ref' and store.

245 {	274 {

246 const __m128i zero = _mm_set1_epi16(0);	275 const __m128i zero = _mm_setzero_si128();

247 // Load the reference(s).	276 // Load the reference(s).

248 __m128i ref0, ref1, ref2, ref3;	277 __m128i ref0, ref1, ref2, ref3;

249 if (do_two) {	278 if (do_two) {

250 // Load eight bytes/pixels per line.	279 // Load eight bytes/pixels per line.

251 ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);	280 ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);

252 ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);	281 ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);

253 ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);	282 ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);

254 ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);	283 ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);

255 } else {	284 } else {

256 // Load four bytes/pixels per line.	285 // Load four bytes/pixels per line.

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
288 *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);	317 *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);

289 *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);	318 *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);

290 }	319 }

291 }	320 }

292 }	321 }

293	322

294 static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,	323 static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,

295 int16_t* out) {	324 int16_t* out) {

296 const __m128i zero = _mm_setzero_si128();	325 const __m128i zero = _mm_setzero_si128();

297 const __m128i seven = _mm_set1_epi16(7);	326 const __m128i seven = _mm_set1_epi16(7);

298 const __m128i k7500 = _mm_set1_epi32(7500);	327 const __m128i k937 = _mm_set1_epi32(937);

299 const __m128i k14500 = _mm_set1_epi32(14500);	328 const __m128i k1812 = _mm_set1_epi32(1812);

300 const __m128i k51000 = _mm_set1_epi32(51000);	329 const __m128i k51000 = _mm_set1_epi32(51000);

301 const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));	330 const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));

302 const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,	331 const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,

303 5352, 2217, 5352, 2217);	332 5352, 2217, 5352, 2217);

304 const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,	333 const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,

305 2217, -5352, 2217, -5352);	334 2217, -5352, 2217, -5352);

	335 const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);

	336 const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);

	337 const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,

	338 2217, 5352, 2217, 5352);

	339 const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,

	340 -5352, 2217, -5352, 2217);

	341 __m128i v01, v32;

306	342

307 __m128i v01, v32;

308	343

309 // Difference between src and ref and initial transpose.	344 // Difference between src and ref and initial transpose.

310 {	345 {

311 // Load src and convert to 16b.	346 // Load src and convert to 16b.

312 const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);	347 const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);

313 const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);	348 const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);

314 const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);	349 const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);

315 const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);	350 const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);

316 const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);	351 const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);

317 const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);	352 const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);

318 const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);	353 const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);

319 const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);	354 const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);

320 // Load ref and convert to 16b.	355 // Load ref and convert to 16b.

321 const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);	356 const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);

322 const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);	357 const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);

323 const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);	358 const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);

324 const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);	359 const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);

325 const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);	360 const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);

326 const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);	361 const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);

327 const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);	362 const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);

328 const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);	363 const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);

329 // Compute difference.	364 // Compute difference. -> 00 01 02 03 00 00 00 00

330 const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);	365 const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);

331 const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);	366 const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);

332 const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);	367 const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);

333 const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);	368 const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

334	369

335 // Transpose.	370

	371 // Unpack and shuffle

336 // 00 01 02 03 0 0 0 0	372 // 00 01 02 03 0 0 0 0

337 // 10 11 12 13 0 0 0 0	373 // 10 11 12 13 0 0 0 0

338 // 20 21 22 23 0 0 0 0	374 // 20 21 22 23 0 0 0 0

339 // 30 31 32 33 0 0 0 0	375 // 30 31 32 33 0 0 0 0

340 const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);	376 const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);

341 const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);	377 const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);

342 // 00 10 01 11 02 12 03 13	378 // 00 01 10 11 02 03 12 13

343 // 20 30 21 31 22 32 23 33	379 // 20 21 30 31 22 23 32 33

344 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);	380 const __m128i shuf01_p =

345 v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);	381 _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));

346 v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));	382 const __m128i shuf23_p =

347 // a02 a12 a22 a32 a03 a13 a23 a33	383 _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));

348 // a00 a10 a20 a30 a01 a11 a21 a31	384 // 00 01 10 11 03 02 13 12

349 // a03 a13 a23 a33 a02 a12 a22 a32	385 // 20 21 30 31 23 22 33 32

350 }	386 const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);

	387 const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);

	388 // 00 01 10 11 20 21 30 31

	389 // 03 02 13 12 23 22 33 32

	390 const __m128i a01 = _mm_add_epi16(s01, s32);

	391 const __m128i a32 = _mm_sub_epi16(s01, s32);

	392 // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]

	393 // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]

351	394

352 // First pass and subsequent transpose.	395 const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ]

353 {	396 const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ]

354 // Same operations are done on the (0,3) and (1,2) pairs.	397 const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);

355 // b0 = (a0 + a3) << 3	398 const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);

356 // b1 = (a1 + a2) << 3	399 const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);

357 // b3 = (a0 - a3) << 3	400 const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);

358 // b2 = (a1 - a2) << 3	401 const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9);

359 const __m128i a01 = _mm_add_epi16(v01, v32);	402 const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9);

360 const __m128i a32 = _mm_sub_epi16(v01, v32);	403 const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);

361 const __m128i b01 = _mm_slli_epi16(a01, 3);	404 const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);

362 const __m128i b32 = _mm_slli_epi16(a32, 3);	405 const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1...

363 const __m128i b11 = _mm_unpackhi_epi64(b01, b01);	406 const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3

364 const __m128i b22 = _mm_unpackhi_epi64(b32, b32);	407 const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);

365	408 v01 = _mm_unpacklo_epi32(s_lo, s_hi);

366 // e0 = b0 + b1	409 v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..

367 // e2 = b0 - b1

368 const __m128i e0 = _mm_add_epi16(b01, b11);

369 const __m128i e2 = _mm_sub_epi16(b01, b11);

370 const __m128i e02 = _mm_unpacklo_epi64(e0, e2);

371

372 // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12

373 // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12

374 const __m128i b23 = _mm_unpacklo_epi16(b22, b32);

375 const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);

376 const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);

377 const __m128i d1 = _mm_add_epi32(c1, k14500);

378 const __m128i d3 = _mm_add_epi32(c3, k7500);

379 const __m128i e1 = _mm_srai_epi32(d1, 12);

380 const __m128i e3 = _mm_srai_epi32(d3, 12);

381 const __m128i e13 = _mm_packs_epi32(e1, e3);

382

383 // Transpose.

384 // 00 01 02 03 20 21 22 23

385 // 10 11 12 13 30 31 32 33

386 const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);

387 const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);

388 // 00 10 01 11 02 12 03 13

389 // 20 30 21 31 22 32 23 33

390 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);

391 v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);

392 v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));

393 // 02 12 22 32 03 13 23 33

394 // 00 10 20 30 01 11 21 31

395 // 03 13 23 33 02 12 22 32

396 }	410 }

397	411

398 // Second pass	412 // Second pass

399 {	413 {

400 // Same operations are done on the (0,3) and (1,2) pairs.	414 // Same operations are done on the (0,3) and (1,2) pairs.

401 // a0 = v0 + v3	415 // a0 = v0 + v3

402 // a1 = v1 + v2	416 // a1 = v1 + v2

403 // a3 = v0 - v3	417 // a3 = v0 - v3

404 // a2 = v1 - v2	418 // a2 = v1 - v2

405 const __m128i a01 = _mm_add_epi16(v01, v32);	419 const __m128i a01 = _mm_add_epi16(v01, v32);

406 const __m128i a32 = _mm_sub_epi16(v01, v32);	420 const __m128i a32 = _mm_sub_epi16(v01, v32);

407 const __m128i a11 = _mm_unpackhi_epi64(a01, a01);	421 const __m128i a11 = _mm_unpackhi_epi64(a01, a01);

408 const __m128i a22 = _mm_unpackhi_epi64(a32, a32);	422 const __m128i a22 = _mm_unpackhi_epi64(a32, a32);

	423 const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);

409	424

410 // d0 = (a0 + a1 + 7) >> 4;	425 // d0 = (a0 + a1 + 7) >> 4;

411 // d2 = (a0 - a1 + 7) >> 4;	426 // d2 = (a0 - a1 + 7) >> 4;

412 const __m128i b0 = _mm_add_epi16(a01, a11);	427 const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);

413 const __m128i b2 = _mm_sub_epi16(a01, a11);	428 const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);

414 const __m128i c0 = _mm_add_epi16(b0, seven);

415 const __m128i c2 = _mm_add_epi16(b2, seven);

416 const __m128i d0 = _mm_srai_epi16(c0, 4);	429 const __m128i d0 = _mm_srai_epi16(c0, 4);

417 const __m128i d2 = _mm_srai_epi16(c2, 4);	430 const __m128i d2 = _mm_srai_epi16(c2, 4);

418	431

419 // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)	432 // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)

420 // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)	433 // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)

421 const __m128i b23 = _mm_unpacklo_epi16(a22, a32);	434 const __m128i b23 = _mm_unpacklo_epi16(a22, a32);

422 const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);	435 const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);

423 const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);	436 const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);

424 const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);	437 const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);

425 const __m128i d3 = _mm_add_epi32(c3, k51000);	438 const __m128i d3 = _mm_add_epi32(c3, k51000);

426 const __m128i e1 = _mm_srai_epi32(d1, 16);	439 const __m128i e1 = _mm_srai_epi32(d1, 16);

427 const __m128i e3 = _mm_srai_epi32(d3, 16);	440 const __m128i e3 = _mm_srai_epi32(d3, 16);

428 const __m128i f1 = _mm_packs_epi32(e1, e1);	441 const __m128i f1 = _mm_packs_epi32(e1, e1);

429 const __m128i f3 = _mm_packs_epi32(e3, e3);	442 const __m128i f3 = _mm_packs_epi32(e3, e3);

430 // f1 = f1 + (a3 != 0);	443 // f1 = f1 + (a3 != 0);

431 // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the	444 // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the

432 // desired (0, 1), we add one earlier through k12000_plus_one.	445 // desired (0, 1), we add one earlier through k12000_plus_one.

	446 // -> f1 = f1 + 1 - (a3 == 0)

433 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));	447 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

434	448

435 _mm_storel_epi64((__m128i*)&out[ 0], d0);	449 _mm_storel_epi64((__m128i*)&out[ 0], d0);

436 _mm_storel_epi64((__m128i*)&out[ 4], g1);	450 _mm_storel_epi64((__m128i*)&out[ 4], g1);

437 _mm_storel_epi64((__m128i*)&out[ 8], d2);	451 _mm_storel_epi64((__m128i*)&out[ 8], d2);

438 _mm_storel_epi64((__m128i*)&out[12], f3);	452 _mm_storel_epi64((__m128i*)&out[12], f3);

439 }	453 }

440 }	454 }

441	455

442 //------------------------------------------------------------------------------	456 //------------------------------------------------------------------------------

443 // Metric	457 // Metric

444	458

	459 static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,

	460 int num_quads, int do_16) {

	461 const __m128i zero = _mm_setzero_si128();

	462 __m128i sum1 = zero;

	463 __m128i sum2 = zero;

	464

	465 while (num_quads-- > 0) {

	466 // Note: for the !do_16 case, we read 16 pixels instead of 8 but that's ok,

	467 // thanks to buffer over-allocation to that effect.

	468 const __m128i a0 = _mm_loadu_si128((__m128i*)&a[BPS * 0]);

	469 const __m128i a1 = _mm_loadu_si128((__m128i*)&a[BPS * 1]);

	470 const __m128i a2 = _mm_loadu_si128((__m128i*)&a[BPS * 2]);

	471 const __m128i a3 = _mm_loadu_si128((__m128i*)&a[BPS * 3]);

	472 const __m128i b0 = _mm_loadu_si128((__m128i*)&b[BPS * 0]);

	473 const __m128i b1 = _mm_loadu_si128((__m128i*)&b[BPS * 1]);

	474 const __m128i b2 = _mm_loadu_si128((__m128i*)&b[BPS * 2]);

	475 const __m128i b3 = _mm_loadu_si128((__m128i*)&b[BPS * 3]);

	476

	477 // compute clip0(a-b) and clip0(b-a)

	478 const __m128i a0p = _mm_subs_epu8(a0, b0);

	479 const __m128i a0m = _mm_subs_epu8(b0, a0);

	480 const __m128i a1p = _mm_subs_epu8(a1, b1);

	481 const __m128i a1m = _mm_subs_epu8(b1, a1);

	482 const __m128i a2p = _mm_subs_epu8(a2, b2);

	483 const __m128i a2m = _mm_subs_epu8(b2, a2);

	484 const __m128i a3p = _mm_subs_epu8(a3, b3);

	485 const __m128i a3m = _mm_subs_epu8(b3, a3);

	486

	487 // compute |a-b| with 8b arithmetic as clip0(a-b) | clip0(b-a)

	488 const __m128i diff0 = _mm_or_si128(a0p, a0m);

	489 const __m128i diff1 = _mm_or_si128(a1p, a1m);

	490 const __m128i diff2 = _mm_or_si128(a2p, a2m);

	491 const __m128i diff3 = _mm_or_si128(a3p, a3m);

	492

	493 // unpack (only four operations, instead of eight)

	494 const __m128i low0 = _mm_unpacklo_epi8(diff0, zero);

	495 const __m128i low1 = _mm_unpacklo_epi8(diff1, zero);

	496 const __m128i low2 = _mm_unpacklo_epi8(diff2, zero);

	497 const __m128i low3 = _mm_unpacklo_epi8(diff3, zero);

	498

	499 // multiply with self

	500 const __m128i low_madd0 = _mm_madd_epi16(low0, low0);

	501 const __m128i low_madd1 = _mm_madd_epi16(low1, low1);

	502 const __m128i low_madd2 = _mm_madd_epi16(low2, low2);

	503 const __m128i low_madd3 = _mm_madd_epi16(low3, low3);

	504

	505 // collect in a cascading way

	506 const __m128i low_sum0 = _mm_add_epi32(low_madd0, low_madd1);

	507 const __m128i low_sum1 = _mm_add_epi32(low_madd2, low_madd3);

	508 sum1 = _mm_add_epi32(sum1, low_sum0);

	509 sum2 = _mm_add_epi32(sum2, low_sum1);

	510

	511 if (do_16) { // if necessary, process the higher 8 bytes similarly

	512 const __m128i hi0 = _mm_unpackhi_epi8(diff0, zero);

	513 const __m128i hi1 = _mm_unpackhi_epi8(diff1, zero);

	514 const __m128i hi2 = _mm_unpackhi_epi8(diff2, zero);

	515 const __m128i hi3 = _mm_unpackhi_epi8(diff3, zero);

	516

	517 const __m128i hi_madd0 = _mm_madd_epi16(hi0, hi0);

	518 const __m128i hi_madd1 = _mm_madd_epi16(hi1, hi1);

	519 const __m128i hi_madd2 = _mm_madd_epi16(hi2, hi2);

	520 const __m128i hi_madd3 = _mm_madd_epi16(hi3, hi3);

	521 const __m128i hi_sum0 = _mm_add_epi32(hi_madd0, hi_madd1);

	522 const __m128i hi_sum1 = _mm_add_epi32(hi_madd2, hi_madd3);

	523 sum1 = _mm_add_epi32(sum1, hi_sum0);

	524 sum2 = _mm_add_epi32(sum2, hi_sum1);

	525 }

	526 a += 4 * BPS;

	527 b += 4 * BPS;

	528 }

	529 {

	530 int32_t tmp[4];

	531 const __m128i sum = _mm_add_epi32(sum1, sum2);

	532 _mm_storeu_si128((__m128i*)tmp, sum);

	533 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);

	534 }

	535 }

	536

	537 static int SSE16x16SSE2(const uint8_t* a, const uint8_t* b) {

	538 return SSE_Nx4SSE2(a, b, 4, 1);

	539 }

	540

	541 static int SSE16x8SSE2(const uint8_t* a, const uint8_t* b) {

	542 return SSE_Nx4SSE2(a, b, 2, 1);

	543 }

	544

	545 static int SSE8x8SSE2(const uint8_t* a, const uint8_t* b) {

	546 return SSE_Nx4SSE2(a, b, 2, 0);

	547 }

	548

445 static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {	549 static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {

446 const __m128i zero = _mm_set1_epi16(0);	550 const __m128i zero = _mm_setzero_si128();

447	551

448 // Load values.	552 // Load values. Note that we read 8 pixels instead of 4,

	553 // but the a/b buffers are over-allocated to that effect.

449 const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);	554 const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);

450 const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);	555 const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);

451 const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);	556 const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);

452 const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);	557 const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);

453 const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);	558 const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);

454 const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);	559 const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);

455 const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);	560 const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);

456 const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);	561 const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);

457	562

458 // Combine pair of lines and convert to 16b.	563 // Combine pair of lines and convert to 16b.

(...skipping 17 matching lines...) Expand all Loading...
476 const __m128i d3 = _mm_subs_epu8(b23s, a23s);	581 const __m128i d3 = _mm_subs_epu8(b23s, a23s);

477	582

478 // Square and add them all together.	583 // Square and add them all together.

479 const __m128i madd0 = _mm_madd_epi16(d0, d0);	584 const __m128i madd0 = _mm_madd_epi16(d0, d0);

480 const __m128i madd1 = _mm_madd_epi16(d1, d1);	585 const __m128i madd1 = _mm_madd_epi16(d1, d1);

481 const __m128i madd2 = _mm_madd_epi16(d2, d2);	586 const __m128i madd2 = _mm_madd_epi16(d2, d2);

482 const __m128i madd3 = _mm_madd_epi16(d3, d3);	587 const __m128i madd3 = _mm_madd_epi16(d3, d3);

483 const __m128i sum0 = _mm_add_epi32(madd0, madd1);	588 const __m128i sum0 = _mm_add_epi32(madd0, madd1);

484 const __m128i sum1 = _mm_add_epi32(madd2, madd3);	589 const __m128i sum1 = _mm_add_epi32(madd2, madd3);

485 const __m128i sum2 = _mm_add_epi32(sum0, sum1);	590 const __m128i sum2 = _mm_add_epi32(sum0, sum1);

	591

486 int32_t tmp[4];	592 int32_t tmp[4];

487 _mm_storeu_si128((__m128i*)tmp, sum2);	593 _mm_storeu_si128((__m128i*)tmp, sum2);

488 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);	594 return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);

489 }	595 }

490	596

491 //------------------------------------------------------------------------------	597 //------------------------------------------------------------------------------

492 // Texture distortion	598 // Texture distortion

493 //	599 //

494 // We try to match the spectral content (weighted) between source and	600 // We try to match the spectral content (weighted) between source and

495 // reconstructed samples.	601 // reconstructed samples.

496	602

497 // Hadamard transform	603 // Hadamard transform

498 // Returns the difference between the weighted sum of the absolute value of	604 // Returns the difference between the weighted sum of the absolute value of

499 // transformed coefficients.	605 // transformed coefficients.

500 static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,	606 static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,

501 const uint16_t* const w) {	607 const uint16_t* const w) {

502 int32_t sum[4];	608 int32_t sum[4];

503 __m128i tmp_0, tmp_1, tmp_2, tmp_3;	609 __m128i tmp_0, tmp_1, tmp_2, tmp_3;

504 const __m128i zero = _mm_setzero_si128();	610 const __m128i zero = _mm_setzero_si128();

505 const __m128i one = _mm_set1_epi16(1);

506 const __m128i three = _mm_set1_epi16(3);

507	611

508 // Load, combine and tranpose inputs.	612 // Load, combine and tranpose inputs.

509 {	613 {

510 const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);	614 const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);

511 const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);	615 const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);

512 const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);	616 const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);

513 const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);	617 const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);

514 const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);	618 const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);

515 const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);	619 const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);

516 const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);	620 const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);

(...skipping 26 matching lines...) Expand all Loading...
543 tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);	647 tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);

544 // a00 a10 a20 a30 b00 b10 b20 b30	648 // a00 a10 a20 a30 b00 b10 b20 b30

545 // a01 a11 a21 a31 b01 b11 b21 b31	649 // a01 a11 a21 a31 b01 b11 b21 b31

546 // a02 a12 a22 a32 b02 b12 b22 b32	650 // a02 a12 a22 a32 b02 b12 b22 b32

547 // a03 a13 a23 a33 b03 b13 b23 b33	651 // a03 a13 a23 a33 b03 b13 b23 b33

548 }	652 }

549	653

550 // Horizontal pass and subsequent transpose.	654 // Horizontal pass and subsequent transpose.

551 {	655 {

552 // Calculate a and b (two 4x4 at once).	656 // Calculate a and b (two 4x4 at once).

553 const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);	657 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);

554 const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);	658 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);

555 const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);	659 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);

556 const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);	660 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);

557 // b0_extra = (a0 != 0);	661 const __m128i b0 = _mm_add_epi16(a0, a1);

558 const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);

559 const __m128i b0_base = _mm_add_epi16(a0, a1);

560 const __m128i b1 = _mm_add_epi16(a3, a2);	662 const __m128i b1 = _mm_add_epi16(a3, a2);

561 const __m128i b2 = _mm_sub_epi16(a3, a2);	663 const __m128i b2 = _mm_sub_epi16(a3, a2);

562 const __m128i b3 = _mm_sub_epi16(a0, a1);	664 const __m128i b3 = _mm_sub_epi16(a0, a1);

563 const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);

564 // a00 a01 a02 a03 b00 b01 b02 b03	665 // a00 a01 a02 a03 b00 b01 b02 b03

565 // a10 a11 a12 a13 b10 b11 b12 b13	666 // a10 a11 a12 a13 b10 b11 b12 b13

566 // a20 a21 a22 a23 b20 b21 b22 b23	667 // a20 a21 a22 a23 b20 b21 b22 b23

567 // a30 a31 a32 a33 b30 b31 b32 b33	668 // a30 a31 a32 a33 b30 b31 b32 b33

568	669

569 // Transpose the two 4x4.	670 // Transpose the two 4x4.

570 const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);	671 const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);

571 const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);	672 const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);

572 const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);	673 const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);

573 const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);	674 const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);

(...skipping 54 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
628 A_b0 = _mm_xor_si128(A_b0, sign_A_b0);	729 A_b0 = _mm_xor_si128(A_b0, sign_A_b0);

629 A_b2 = _mm_xor_si128(A_b2, sign_A_b2);	730 A_b2 = _mm_xor_si128(A_b2, sign_A_b2);

630 B_b0 = _mm_xor_si128(B_b0, sign_B_b0);	731 B_b0 = _mm_xor_si128(B_b0, sign_B_b0);

631 B_b2 = _mm_xor_si128(B_b2, sign_B_b2);	732 B_b2 = _mm_xor_si128(B_b2, sign_B_b2);

632 A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);	733 A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);

633 A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);	734 A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);

634 B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);	735 B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);

635 B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);	736 B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);

636 }	737 }

637	738

638 // b = abs(b) + 3

639 A_b0 = _mm_add_epi16(A_b0, three);

640 A_b2 = _mm_add_epi16(A_b2, three);

641 B_b0 = _mm_add_epi16(B_b0, three);

642 B_b2 = _mm_add_epi16(B_b2, three);

643

644 // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3

645 // b = (abs(b) + 3) >> 3

646 A_b0 = _mm_srai_epi16(A_b0, 3);

647 A_b2 = _mm_srai_epi16(A_b2, 3);

648 B_b0 = _mm_srai_epi16(B_b0, 3);

649 B_b2 = _mm_srai_epi16(B_b2, 3);

650

651 // weighted sums	739 // weighted sums

652 A_b0 = _mm_madd_epi16(A_b0, w_0);	740 A_b0 = _mm_madd_epi16(A_b0, w_0);

653 A_b2 = _mm_madd_epi16(A_b2, w_8);	741 A_b2 = _mm_madd_epi16(A_b2, w_8);

654 B_b0 = _mm_madd_epi16(B_b0, w_0);	742 B_b0 = _mm_madd_epi16(B_b0, w_0);

655 B_b2 = _mm_madd_epi16(B_b2, w_8);	743 B_b2 = _mm_madd_epi16(B_b2, w_8);

656 A_b0 = _mm_add_epi32(A_b0, A_b2);	744 A_b0 = _mm_add_epi32(A_b0, A_b2);

657 B_b0 = _mm_add_epi32(B_b0, B_b2);	745 B_b0 = _mm_add_epi32(B_b0, B_b2);

658	746

659 // difference of weighted sums	747 // difference of weighted sums

660 A_b0 = _mm_sub_epi32(A_b0, B_b0);	748 A_b0 = _mm_sub_epi32(A_b0, B_b0);

661 _mm_storeu_si128((__m128i*)&sum[0], A_b0);	749 _mm_storeu_si128((__m128i*)&sum[0], A_b0);

662 }	750 }

663 return sum[0] + sum[1] + sum[2] + sum[3];	751 return sum[0] + sum[1] + sum[2] + sum[3];

664 }	752 }

665	753

666 static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,	754 static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,

667 const uint16_t* const w) {	755 const uint16_t* const w) {

668 const int diff_sum = TTransformSSE2(a, b, w);	756 const int diff_sum = TTransformSSE2(a, b, w);

669 return (abs(diff_sum) + 8) >> 4;	757 return abs(diff_sum) >> 5;

670 }	758 }

671	759

672 static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,	760 static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,

673 const uint16_t* const w) {	761 const uint16_t* const w) {

674 int D = 0;	762 int D = 0;

675 int x, y;	763 int x, y;

676 for (y = 0; y < 16 * BPS; y += 4 * BPS) {	764 for (y = 0; y < 16 * BPS; y += 4 * BPS) {

677 for (x = 0; x < 16; x += 4) {	765 for (x = 0; x < 16; x += 4) {

678 D += Disto4x4SSE2(a + x + y, b + x + y, w);	766 D += Disto4x4SSE2(a + x + y, b + x + y, w);

679 }	767 }

680 }	768 }

681 return D;	769 return D;

682 }	770 }

683	771

684

685 //------------------------------------------------------------------------------	772 //------------------------------------------------------------------------------

686 // Quantization	773 // Quantization

687 //	774 //

688	775

689 // Simple quantization	776 // Simple quantization

690 static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],	777 static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],

691 int n, const VP8Matrix* const mtx) {	778 int n, const VP8Matrix* const mtx) {

692 const __m128i max_coeff_2047 = _mm_set1_epi16(2047);	779 const __m128i max_coeff_2047 = _mm_set1_epi16(2047);

693 const __m128i zero = _mm_set1_epi16(0);	780 const __m128i zero = _mm_setzero_si128();

694 __m128i sign0, sign8;

695 __m128i coeff0, coeff8;	781 __m128i coeff0, coeff8;

696 __m128i out0, out8;	782 __m128i out0, out8;

697 __m128i packed_out;	783 __m128i packed_out;

698	784

699 // Load all inputs.	785 // Load all inputs.

700 // TODO(cduvivier): Make variable declarations and allocations aligned so that	786 // TODO(cduvivier): Make variable declarations and allocations aligned so that

701 // we can use _mm_load_si128 instead of _mm_loadu_si128.	787 // we can use _mm_load_si128 instead of _mm_loadu_si128.

702 __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);	788 __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);

703 __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);	789 __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);

704 const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);	790 const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);

705 const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);	791 const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);

706 const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);	792 const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);

707 const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);	793 const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);

708 const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);	794 const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);

709 const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);	795 const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);

710 const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);	796 const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);

711 const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);	797 const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);

712 const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);	798 const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);

713 const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);	799 const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

714	800

715 // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative)	801 // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative)

716 sign0 = _mm_srai_epi16(in0, 15);	802 const __m128i sign0 = _mm_srai_epi16(in0, 15);

717 sign8 = _mm_srai_epi16(in8, 15);	803 const __m128i sign8 = _mm_srai_epi16(in8, 15);

718	804

719 // coeff = abs(in) = (in ^ sign) - sign	805 // coeff = abs(in) = (in ^ sign) - sign

720 coeff0 = _mm_xor_si128(in0, sign0);	806 coeff0 = _mm_xor_si128(in0, sign0);

721 coeff8 = _mm_xor_si128(in8, sign8);	807 coeff8 = _mm_xor_si128(in8, sign8);

722 coeff0 = _mm_sub_epi16(coeff0, sign0);	808 coeff0 = _mm_sub_epi16(coeff0, sign0);

723 coeff8 = _mm_sub_epi16(coeff8, sign8);	809 coeff8 = _mm_sub_epi16(coeff8, sign8);

724	810

725 // coeff = abs(in) + sharpen	811 // coeff = abs(in) + sharpen

726 coeff0 = _mm_add_epi16(coeff0, sharpen0);	812 coeff0 = _mm_add_epi16(coeff0, sharpen0);

727 coeff8 = _mm_add_epi16(coeff8, sharpen8);	813 coeff8 = _mm_add_epi16(coeff8, sharpen8);

(...skipping 84 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
812 {	898 {

813 int32_t tmp[4];	899 int32_t tmp[4];

814 _mm_storeu_si128((__m128i*)tmp, packed_out);	900 _mm_storeu_si128((__m128i*)tmp, packed_out);

815 if (n) {	901 if (n) {

816 tmp[0] &= ~0xff;	902 tmp[0] &= ~0xff;

817 }	903 }

818 return (tmp[3] \|\| tmp[2] \|\| tmp[1] \|\| tmp[0]);	904 return (tmp[3] \|\| tmp[2] \|\| tmp[1] \|\| tmp[0]);

819 }	905 }

820 }	906 }

821	907

	908 #endif // WEBP_USE_SSE2

	909

	910 //------------------------------------------------------------------------------

	911 // Entry point

	912

822 extern void VP8EncDspInitSSE2(void);	913 extern void VP8EncDspInitSSE2(void);

	914

823 void VP8EncDspInitSSE2(void) {	915 void VP8EncDspInitSSE2(void) {

	916 #if defined(WEBP_USE_SSE2)

824 VP8CollectHistogram = CollectHistogramSSE2;	917 VP8CollectHistogram = CollectHistogramSSE2;

825 VP8EncQuantizeBlock = QuantizeBlockSSE2;	918 VP8EncQuantizeBlock = QuantizeBlockSSE2;

826 VP8ITransform = ITransformSSE2;	919 VP8ITransform = ITransformSSE2;

827 VP8FTransform = FTransformSSE2;	920 VP8FTransform = FTransformSSE2;

	921 VP8SSE16x16 = SSE16x16SSE2;

	922 VP8SSE16x8 = SSE16x8SSE2;

	923 VP8SSE8x8 = SSE8x8SSE2;

828 VP8SSE4x4 = SSE4x4SSE2;	924 VP8SSE4x4 = SSE4x4SSE2;

829 VP8TDisto4x4 = Disto4x4SSE2;	925 VP8TDisto4x4 = Disto4x4SSE2;

830 VP8TDisto16x16 = Disto16x16SSE2;	926 VP8TDisto16x16 = Disto16x16SSE2;

	927 #endif // WEBP_USE_SSE2

831 }	928 }

832	929

833 #if defined(__cplusplus) \|\| defined(c_plusplus)	930 #if defined(__cplusplus) \|\| defined(c_plusplus)

834 } // extern "C"	931 } // extern "C"

835 #endif	932 #endif

836

837 #endif // WEBP_USE_SSE2

OLD	NEW

« third_party/libwebp/dsp/dec_neon.c ('K') | « third_party/libwebp/dsp/enc_neon.c ('k') | third_party/libwebp/dsp/lossless.h » ('j') | no next file with comments »