third_party/libwebp/dsp/lossless_enc_sse2.c - Issue 2651883004: libwebp-0.6.0-rc1

Side by Side Diff: third_party/libwebp/dsp/lossless_enc_sse2.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2015 Google Inc. All Rights Reserved.	1 // Copyright 2015 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // SSE2 variant of methods for lossless encoder	10 // SSE2 variant of methods for lossless encoder

11 //	11 //

12 // Author: Skal (pascal.massimino@gmail.com)	12 // Author: Skal (pascal.massimino@gmail.com)

13	13

14 #include "./dsp.h"	14 #include "./dsp.h"

15	15

16 #if defined(WEBP_USE_SSE2)	16 #if defined(WEBP_USE_SSE2)

17 #include <assert.h>	17 #include <assert.h>

18 #include <emmintrin.h>	18 #include <emmintrin.h>

19 #include "./lossless.h"	19 #include "./lossless.h"

	20 #include "./common_sse2.h"

	21 #include "./lossless_common.h"

20	22

21 // For sign-extended multiplying constants, pre-shifted by 5:	23 // For sign-extended multiplying constants, pre-shifted by 5:

22 #define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5)	24 #define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5)

23	25

24 //------------------------------------------------------------------------------	26 //------------------------------------------------------------------------------

25 // Subtract-Green Transform	27 // Subtract-Green Transform

26	28

27 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {	29 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {

28 int i;	30 int i;

29 for (i = 0; i + 4 <= num_pixels; i += 4) {	31 for (i = 0; i + 4 <= num_pixels; i += 4) {

30 const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb	32 const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb

31 const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g	33 const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g

32 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));	34 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));

33 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g	35 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g

34 const __m128i out = _mm_sub_epi8(in, C);	36 const __m128i out = _mm_sub_epi8(in, C);

35 _mm_storeu_si128((__m128i*)&argb_data[i], out);	37 _mm_storeu_si128((__m128i*)&argb_data[i], out);

36 }	38 }

37 // fallthrough and finish off with plain-C	39 // fallthrough and finish off with plain-C

38 VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);	40 if (i != num_pixels) {

	41 VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);

	42 }

39 }	43 }

40	44

41 //------------------------------------------------------------------------------	45 //------------------------------------------------------------------------------

42 // Color Transform	46 // Color Transform

43	47

44 static void TransformColor(const VP8LMultipliers* const m,	48 static void TransformColor(const VP8LMultipliers* const m,

45 uint32_t* argb_data, int num_pixels) {	49 uint32_t* argb_data, int num_pixels) {

46 const __m128i mults_rb = _mm_set_epi16(	50 const __m128i mults_rb = _mm_set_epi16(

47 CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),	51 CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),

48 CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),	52 CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),

(...skipping 13 matching lines...) Expand all Loading...
62 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1	66 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1

63 const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0	67 const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0

64 const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0	68 const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0

65 const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2	69 const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2

66 const __m128i H = _mm_add_epi8(G, D); // x dr x db	70 const __m128i H = _mm_add_epi8(G, D); // x dr x db

67 const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db	71 const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db

68 const __m128i out = _mm_sub_epi8(in, I);	72 const __m128i out = _mm_sub_epi8(in, I);

69 _mm_storeu_si128((__m128i*)&argb_data[i], out);	73 _mm_storeu_si128((__m128i*)&argb_data[i], out);

70 }	74 }

71 // fallthrough and finish off with plain-C	75 // fallthrough and finish off with plain-C

72 VP8LTransformColor_C(m, argb_data + i, num_pixels - i);	76 if (i != num_pixels) {

	77 VP8LTransformColor_C(m, argb_data + i, num_pixels - i);

	78 }

73 }	79 }

74	80

75 //------------------------------------------------------------------------------	81 //------------------------------------------------------------------------------

76 #define SPAN 8	82 #define SPAN 8

77 static void CollectColorBlueTransforms(const uint32_t* argb, int stride,	83 static void CollectColorBlueTransforms(const uint32_t* argb, int stride,

78 int tile_width, int tile_height,	84 int tile_width, int tile_height,

79 int green_to_blue, int red_to_blue,	85 int green_to_blue, int red_to_blue,

80 int histo[]) {	86 int histo[]) {

81 const __m128i mults_r = _mm_set_epi16(	87 const __m128i mults_r = _mm_set_epi16(

82 CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,	88 CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,

(...skipping 274 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
357 match_len = 0;	363 match_len = 0;

358 // Unroll the potential first two loops.	364 // Unroll the potential first two loops.

359 if (length >= 4 &&	365 if (length >= 4 &&

360 _mm_movemask_epi8(_mm_cmpeq_epi32(	366 _mm_movemask_epi8(_mm_cmpeq_epi32(

361 _mm_loadu_si128((const __m128i*)&array1[0]),	367 _mm_loadu_si128((const __m128i*)&array1[0]),

362 _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {	368 _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {

363 match_len = 4;	369 match_len = 4;

364 if (length >= 8 &&	370 if (length >= 8 &&

365 _mm_movemask_epi8(_mm_cmpeq_epi32(	371 _mm_movemask_epi8(_mm_cmpeq_epi32(

366 _mm_loadu_si128((const __m128i*)&array1[4]),	372 _mm_loadu_si128((const __m128i*)&array1[4]),

367 _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff)	373 _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {

368 match_len = 8;	374 match_len = 8;

	375 }

369 }	376 }

370 }	377 }

371	378

372 while (match_len < length && array1[match_len] == array2[match_len]) {	379 while (match_len < length && array1[match_len] == array2[match_len]) {

373 ++match_len;	380 ++match_len;

374 }	381 }

375 return match_len;	382 return match_len;

376 }	383 }

377	384

	385 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.

	386 static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,

	387 uint32_t* dst) {

	388 int x;

	389 assert(xbits >= 0);

	390 assert(xbits <= 3);

	391 switch (xbits) {

	392 case 0: {

	393 const __m128i ff = _mm_set1_epi16(0xff00);

	394 const __m128i zero = _mm_setzero_si128();

	395 // Store 0xff000000 | (row[x] << 8).

	396 for (x = 0; x + 16 <= width; x += 16, dst += 16) {

	397 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);

	398 const __m128i in_lo = _mm_unpacklo_epi8(zero, in);

	399 const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);

	400 const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);

	401 const __m128i in_hi = _mm_unpackhi_epi8(zero, in);

	402 const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);

	403 const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);

	404 _mm_storeu_si128((__m128i*)&dst[0], dst0);

	405 _mm_storeu_si128((__m128i*)&dst[4], dst1);

	406 _mm_storeu_si128((__m128i*)&dst[8], dst2);

	407 _mm_storeu_si128((__m128i*)&dst[12], dst3);

	408 }

	409 break;

	410 }

	411 case 1: {

	412 const __m128i ff = _mm_set1_epi16(0xff00);

	413 const __m128i mul = _mm_set1_epi16(0x110);

	414 for (x = 0; x + 16 <= width; x += 16, dst += 8) {

	415 // 0a0b | (where a/b are 4 bits).

	416 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);

	417 const __m128i tmp = _mm_mullo_epi16(in, mul); // aba0

	418 const __m128i pack = _mm_and_si128(tmp, ff); // ab00

	419 const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);

	420 const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);

	421 _mm_storeu_si128((__m128i*)&dst[0], dst0);

	422 _mm_storeu_si128((__m128i*)&dst[4], dst1);

	423 }

	424 break;

	425 }

	426 case 2: {

	427 const __m128i mask_or = _mm_set1_epi32(0xff000000);

	428 const __m128i mul_cst = _mm_set1_epi16(0x0104);

	429 const __m128i mask_mul = _mm_set1_epi16(0x0f00);

	430 for (x = 0; x + 16 <= width; x += 16, dst += 4) {

	431 // 000a000b000c000d | (where a/b/c/d are 2 bits).

	432 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);

	433 const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0

	434 const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000

	435 const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000

	436 const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000

	437 // Convert to 0xff00**00.

	438 const __m128i res = _mm_or_si128(pack, mask_or);

	439 _mm_storeu_si128((__m128i*)dst, res);

	440 }

	441 break;

	442 }

	443 default: {

	444 assert(xbits == 3);

	445 for (x = 0; x + 16 <= width; x += 16, dst += 2) {

	446 // 0000000a00000000b... | (where a/b are 1 bit).

	447 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);

	448 const __m128i shift = _mm_slli_epi64(in, 7);

	449 const uint32_t move = _mm_movemask_epi8(shift);

	450 dst[0] = 0xff000000 | ((move & 0xff) << 8);

	451 dst[1] = 0xff000000 | (move & 0xff00);

	452 }

	453 break;

	454 }

	455 }

	456 if (x != width) {

	457 VP8LBundleColorMap_C(row + x, width - x, xbits, dst);

	458 }

	459 }

	460

	461 //------------------------------------------------------------------------------

	462 // Batch version of Predictor Transform subtraction

	463

	464 static WEBP_INLINE void Average2_m128i(const __m128i* const a0,

	465 const __m128i* const a1,

	466 __m128i* const avg) {

	467 // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)

	468 const __m128i ones = _mm_set1_epi8(1);

	469 const __m128i avg1 = _mm_avg_epu8(*a0, *a1);

	470 const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);

	471 *avg = _mm_sub_epi8(avg1, one);

	472 }

	473

	474 // Predictor0: ARGB_BLACK.

	475 static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,

	476 int num_pixels, uint32_t* out) {

	477 int i;

	478 const __m128i black = _mm_set1_epi32(ARGB_BLACK);

	479 for (i = 0; i + 4 <= num_pixels; i += 4) {

	480 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);

	481 const __m128i res = _mm_sub_epi8(src, black);

	482 _mm_storeu_si128((__m128i*)&out[i], res);

	483 }

	484 if (i != num_pixels) {

	485 VP8LPredictorsSub_C[0](in + i, upper + i, num_pixels - i, out + i);

	486 }

	487 }

	488

	489 #define GENERATE_PREDICTOR_1(X, IN) \

	490 static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \

	491 int num_pixels, uint32_t* out) { \

	492 int i; \

	493 for (i = 0; i + 4 <= num_pixels; i += 4) { \

	494 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \

	495 const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \

	496 const __m128i res = _mm_sub_epi8(src, pred); \

	497 _mm_storeu_si128((__m128i*)&out[i], res); \

	498 } \

	499 if (i != num_pixels) { \

	500 VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \

	501 } \

	502 }

	503

	504 GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L

	505 GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T

	506 GENERATE_PREDICTOR_1(3, upper[i + 1]) // Predictor3: TR

	507 GENERATE_PREDICTOR_1(4, upper[i - 1]) // Predictor4: TL

	508 #undef GENERATE_PREDICTOR_1

	509

	510 // Predictor5: avg2(avg2(L, TR), T)

	511 static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,

	512 int num_pixels, uint32_t* out) {

	513 int i;

	514 for (i = 0; i + 4 <= num_pixels; i += 4) {

	515 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);

	516 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);

	517 const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);

	518 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);

	519 __m128i avg, pred, res;

	520 Average2_m128i(&L, &TR, &avg);

	521 Average2_m128i(&avg, &T, &pred);

	522 res = _mm_sub_epi8(src, pred);

	523 _mm_storeu_si128((__m128i*)&out[i], res);

	524 }

	525 if (i != num_pixels) {

	526 VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i);

	527 }

	528 }

	529

	530 #define GENERATE_PREDICTOR_2(X, A, B) \

	531 static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \

	532 int num_pixels, uint32_t* out) { \

	533 int i; \

	534 for (i = 0; i + 4 <= num_pixels; i += 4) { \

	535 const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \

	536 const __m128i tB = _mm_loadu_si128((const __m128i*)&(B)); \

	537 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \

	538 __m128i pred, res; \

	539 Average2_m128i(&tA, &tB, &pred); \

	540 res = _mm_sub_epi8(src, pred); \

	541 _mm_storeu_si128((__m128i*)&out[i], res); \

	542 } \

	543 if (i != num_pixels) { \

	544 VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \

	545 } \

	546 }

	547

	548 GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1]) // Predictor6: avg(L, TL)

	549 GENERATE_PREDICTOR_2(7, in[i - 1], upper[i]) // Predictor7: avg(L, T)

	550 GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i]) // Predictor8: avg(TL, T)

	551 GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1]) // Predictor9: average(T, TR)

	552 #undef GENERATE_PREDICTOR_2

	553

	554 // Predictor10: avg(avg(L,TL), avg(T, TR)).

	555 static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,

	556 int num_pixels, uint32_t* out) {

	557 int i;

	558 for (i = 0; i + 4 <= num_pixels; i += 4) {

	559 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);

	560 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);

	561 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);

	562 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);

	563 const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);

	564 __m128i avgTTR, avgLTL, avg, res;

	565 Average2_m128i(&T, &TR, &avgTTR);

	566 Average2_m128i(&L, &TL, &avgLTL);

	567 Average2_m128i(&avgTTR, &avgLTL, &avg);

	568 res = _mm_sub_epi8(src, avg);

	569 _mm_storeu_si128((__m128i*)&out[i], res);

	570 }

	571 if (i != num_pixels) {

	572 VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i);

	573 }

	574 }

	575

	576 // Predictor11: select.

	577 static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,

	578 __m128i* const out) {

	579 // We can unpack with any value on the upper 32 bits, provided it's the same

	580 // on both operands (so that their sum of abs diff is zero). Here we use *A.

	581 const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);

	582 const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);

	583 const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);

	584 const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);

	585 const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);

	586 const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);

	587 *out = _mm_packs_epi32(s_lo, s_hi);

	588 }

	589

	590 static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,

	591 int num_pixels, uint32_t* out) {

	592 int i;

	593 for (i = 0; i + 4 <= num_pixels; i += 4) {

	594 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);

	595 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);

	596 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);

	597 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);

	598 __m128i pa, pb;

	599 GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|

	600 GetSumAbsDiff32(&L, &TL, &pb); // pb = sum |L-TL|

	601 {

	602 const __m128i mask = _mm_cmpgt_epi32(pb, pa);

	603 const __m128i A = _mm_and_si128(mask, L);

	604 const __m128i B = _mm_andnot_si128(mask, T);

	605 const __m128i pred = _mm_or_si128(A, B); // pred = (pb > pa)? L : T

	606 const __m128i res = _mm_sub_epi8(src, pred);

	607 _mm_storeu_si128((__m128i*)&out[i], res);

	608 }

	609 }

	610 if (i != num_pixels) {

	611 VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);

	612 }

	613 }

	614

	615 // Predictor12: ClampedAddSubtractFull.

	616 static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,

	617 int num_pixels, uint32_t* out) {

	618 int i;

	619 const __m128i zero = _mm_setzero_si128();

	620 for (i = 0; i + 4 <= num_pixels; i += 4) {

	621 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);

	622 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);

	623 const __m128i L_lo = _mm_unpacklo_epi8(L, zero);

	624 const __m128i L_hi = _mm_unpackhi_epi8(L, zero);

	625 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);

	626 const __m128i T_lo = _mm_unpacklo_epi8(T, zero);

	627 const __m128i T_hi = _mm_unpackhi_epi8(T, zero);

	628 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);

	629 const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);

	630 const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);

	631 const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);

	632 const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);

	633 const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo);

	634 const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi);

	635 const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);

	636 const __m128i res = _mm_sub_epi8(src, pred);

	637 _mm_storeu_si128((__m128i*)&out[i], res);

	638 }

	639 if (i != num_pixels) {

	640 VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i);

	641 }

	642 }

	643

	644 // Predictor13: ClampedAddSubtractHalf

	645 static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,

	646 int num_pixels, uint32_t* out) {

	647 int i;

	648 const __m128i zero = _mm_setzero_si128();

	649 for (i = 0; i + 2 <= num_pixels; i += 2) {

	650 // we can only process two pixels at a time

	651 const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);

	652 const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);

	653 const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);

	654 const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);

	655 const __m128i L_lo = _mm_unpacklo_epi8(L, zero);

	656 const __m128i T_lo = _mm_unpacklo_epi8(T, zero);

	657 const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);

	658 const __m128i sum = _mm_add_epi16(T_lo, L_lo);

	659 const __m128i avg = _mm_srli_epi16(sum, 1);

	660 const __m128i A1 = _mm_sub_epi16(avg, TL_lo);

	661 const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);

	662 const __m128i A2 = _mm_sub_epi16(A1, bit_fix);

	663 const __m128i A3 = _mm_srai_epi16(A2, 1);

	664 const __m128i A4 = _mm_add_epi16(avg, A3);

	665 const __m128i pred = _mm_packus_epi16(A4, A4);

	666 const __m128i res = _mm_sub_epi8(src, pred);

	667 _mm_storel_epi64((__m128i*)&out[i], res);

	668 }

	669 if (i != num_pixels) {

	670 VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);

	671 }

	672 }

	673

378 //------------------------------------------------------------------------------	674 //------------------------------------------------------------------------------

379 // Entry point	675 // Entry point

380	676

381 extern void VP8LEncDspInitSSE2(void);	677 extern void VP8LEncDspInitSSE2(void);

382	678

383 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {	679 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {

384 VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;	680 VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;

385 VP8LTransformColor = TransformColor;	681 VP8LTransformColor = TransformColor;

386 VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;	682 VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;

387 VP8LCollectColorRedTransforms = CollectColorRedTransforms;	683 VP8LCollectColorRedTransforms = CollectColorRedTransforms;

388 VP8LHistogramAdd = HistogramAdd;	684 VP8LHistogramAdd = HistogramAdd;

389 VP8LCombinedShannonEntropy = CombinedShannonEntropy;	685 VP8LCombinedShannonEntropy = CombinedShannonEntropy;

390 VP8LVectorMismatch = VectorMismatch;	686 VP8LVectorMismatch = VectorMismatch;

	687 VP8LBundleColorMap = BundleColorMap_SSE2;

	688

	689 VP8LPredictorsSub[0] = PredictorSub0_SSE2;

	690 VP8LPredictorsSub[1] = PredictorSub1_SSE2;

	691 VP8LPredictorsSub[2] = PredictorSub2_SSE2;

	692 VP8LPredictorsSub[3] = PredictorSub3_SSE2;

	693 VP8LPredictorsSub[4] = PredictorSub4_SSE2;

	694 VP8LPredictorsSub[5] = PredictorSub5_SSE2;

	695 VP8LPredictorsSub[6] = PredictorSub6_SSE2;

	696 VP8LPredictorsSub[7] = PredictorSub7_SSE2;

	697 VP8LPredictorsSub[8] = PredictorSub8_SSE2;

	698 VP8LPredictorsSub[9] = PredictorSub9_SSE2;

	699 VP8LPredictorsSub[10] = PredictorSub10_SSE2;

	700 VP8LPredictorsSub[11] = PredictorSub11_SSE2;

	701 VP8LPredictorsSub[12] = PredictorSub12_SSE2;

	702 VP8LPredictorsSub[13] = PredictorSub13_SSE2;

	703 VP8LPredictorsSub[14] = PredictorSub0_SSE2; // <- padding security sentinels

	704 VP8LPredictorsSub[15] = PredictorSub0_SSE2;

391 }	705 }

392	706

393 #else // !WEBP_USE_SSE2	707 #else // !WEBP_USE_SSE2

394	708

395 WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)	709 WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)

396	710

397 #endif // WEBP_USE_SSE2	711 #endif // WEBP_USE_SSE2

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/lossless_enc_msa.c ('k') | third_party/libwebp/dsp/lossless_enc_sse41.c » ('j') | no next file with comments »