| Index: third_party/libwebp/dsp/enc_sse2.c
|
| diff --git a/third_party/libwebp/dsp/enc_sse2.c b/third_party/libwebp/dsp/enc_sse2.c
|
| index 540a3cb2dbdd22818cbb6cede31d4ca7605aef4d..9958d9f6ff6dfab7a73d3e8237d1629be96665de 100644
|
| --- a/third_party/libwebp/dsp/enc_sse2.c
|
| +++ b/third_party/libwebp/dsp/enc_sse2.c
|
| @@ -17,7 +17,9 @@
|
| #include <stdlib.h> // for abs()
|
| #include <emmintrin.h>
|
|
|
| +#include "../enc/cost.h"
|
| #include "../enc/vp8enci.h"
|
| +#include "../utils/utils.h"
|
|
|
| //------------------------------------------------------------------------------
|
| // Quite useful macro for debugging. Left here for convenience.
|
| @@ -52,9 +54,9 @@ static void PrintReg(const __m128i r, const char* const name, int size) {
|
| // Compute susceptibility based on DCT-coeff histograms:
|
| // the higher, the "easier" the macroblock is to compress.
|
|
|
| -static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
|
| - int start_block, int end_block,
|
| - VP8Histogram* const histo) {
|
| +static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
| + int start_block, int end_block,
|
| + VP8Histogram* const histo) {
|
| const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
|
| int j;
|
| for (j = start_block; j < end_block; ++j) {
|
| @@ -98,8 +100,8 @@ static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
|
| // Transforms (Paragraph 14.4)
|
|
|
| // Does one or two inverse transforms.
|
| -static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
| - int do_two) {
|
| +static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
| + int do_two) {
|
| // This implementation makes use of 16-bit fixed point versions of two
|
| // multiply constants:
|
| // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
|
| @@ -318,8 +320,7 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
| }
|
| }
|
|
|
| -static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
|
| - int16_t* out) {
|
| +static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
| const __m128i zero = _mm_setzero_si128();
|
| const __m128i seven = _mm_set1_epi16(7);
|
| const __m128i k937 = _mm_set1_epi32(937);
|
| @@ -444,14 +445,14 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
|
| // -> f1 = f1 + 1 - (a3 == 0)
|
| const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
|
|
|
| - _mm_storel_epi64((__m128i*)&out[ 0], d0);
|
| - _mm_storel_epi64((__m128i*)&out[ 4], g1);
|
| - _mm_storel_epi64((__m128i*)&out[ 8], d2);
|
| - _mm_storel_epi64((__m128i*)&out[12], f3);
|
| + const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
|
| + const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
|
| + _mm_storeu_si128((__m128i*)&out[0], d0_g1);
|
| + _mm_storeu_si128((__m128i*)&out[8], d2_f3);
|
| }
|
| }
|
|
|
| -static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
|
| +static void FTransformWHT(const int16_t* in, int16_t* out) {
|
| int32_t tmp[16];
|
| int i;
|
| for (i = 0; i < 4; ++i, in += 64) {
|
| @@ -487,8 +488,8 @@ static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
|
| //------------------------------------------------------------------------------
|
| // Metric
|
|
|
| -static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
|
| - int num_quads, int do_16) {
|
| +static int SSE_Nx4(const uint8_t* a, const uint8_t* b,
|
| + int num_quads, int do_16) {
|
| const __m128i zero = _mm_setzero_si128();
|
| __m128i sum1 = zero;
|
| __m128i sum2 = zero;
|
| @@ -565,19 +566,19 @@ static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
|
| }
|
| }
|
|
|
| -static int SSE16x16SSE2(const uint8_t* a, const uint8_t* b) {
|
| - return SSE_Nx4SSE2(a, b, 4, 1);
|
| +static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
| + return SSE_Nx4(a, b, 4, 1);
|
| }
|
|
|
| -static int SSE16x8SSE2(const uint8_t* a, const uint8_t* b) {
|
| - return SSE_Nx4SSE2(a, b, 2, 1);
|
| +static int SSE16x8(const uint8_t* a, const uint8_t* b) {
|
| + return SSE_Nx4(a, b, 2, 1);
|
| }
|
|
|
| -static int SSE8x8SSE2(const uint8_t* a, const uint8_t* b) {
|
| - return SSE_Nx4SSE2(a, b, 2, 0);
|
| +static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
| + return SSE_Nx4(a, b, 2, 0);
|
| }
|
|
|
| -static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
|
| +static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
| const __m128i zero = _mm_setzero_si128();
|
|
|
| // Load values. Note that we read 8 pixels instead of 4,
|
| @@ -634,8 +635,8 @@ static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
|
| // Hadamard transform
|
| // Returns the difference between the weighted sum of the absolute value of
|
| // transformed coefficients.
|
| -static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
|
| - const uint16_t* const w) {
|
| +static int TTransform(const uint8_t* inA, const uint8_t* inB,
|
| + const uint16_t* const w) {
|
| int32_t sum[4];
|
| __m128i tmp_0, tmp_1, tmp_2, tmp_3;
|
| const __m128i zero = _mm_setzero_si128();
|
| @@ -782,19 +783,19 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
|
| return sum[0] + sum[1] + sum[2] + sum[3];
|
| }
|
|
|
| -static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
|
| - const uint16_t* const w) {
|
| - const int diff_sum = TTransformSSE2(a, b, w);
|
| +static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
| + const uint16_t* const w) {
|
| + const int diff_sum = TTransform(a, b, w);
|
| return abs(diff_sum) >> 5;
|
| }
|
|
|
| -static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
|
| - const uint16_t* const w) {
|
| +static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
| + const uint16_t* const w) {
|
| int D = 0;
|
| int x, y;
|
| for (y = 0; y < 16 * BPS; y += 4 * BPS) {
|
| for (x = 0; x < 16; x += 4) {
|
| - D += Disto4x4SSE2(a + x + y, b + x + y, w);
|
| + D += Disto4x4(a + x + y, b + x + y, w);
|
| }
|
| }
|
| return D;
|
| @@ -804,9 +805,9 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
|
| // Quantization
|
| //
|
|
|
| -// Simple quantization
|
| -static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
| - int n, const VP8Matrix* const mtx) {
|
| +static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
|
| + const uint16_t* const sharpen,
|
| + const VP8Matrix* const mtx) {
|
| const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
|
| const __m128i zero = _mm_setzero_si128();
|
| __m128i coeff0, coeff8;
|
| @@ -818,18 +819,14 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
| // we can use _mm_load_si128 instead of _mm_loadu_si128.
|
| __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
|
| __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
|
| - const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
|
| - const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
|
| const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
|
| const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
|
| - const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
|
| - const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
|
| const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
|
| const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
|
|
|
| - // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative)
|
| - const __m128i sign0 = _mm_srai_epi16(in0, 15);
|
| - const __m128i sign8 = _mm_srai_epi16(in8, 15);
|
| + // extract sign(in) (0x0000 if positive, 0xffff if negative)
|
| + const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
|
| + const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
|
|
|
| // coeff = abs(in) = (in ^ sign) - sign
|
| coeff0 = _mm_xor_si128(in0, sign0);
|
| @@ -838,32 +835,35 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
| coeff8 = _mm_sub_epi16(coeff8, sign8);
|
|
|
| // coeff = abs(in) + sharpen
|
| - coeff0 = _mm_add_epi16(coeff0, sharpen0);
|
| - coeff8 = _mm_add_epi16(coeff8, sharpen8);
|
| + if (sharpen != NULL) {
|
| + const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&sharpen[0]);
|
| + const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&sharpen[8]);
|
| + coeff0 = _mm_add_epi16(coeff0, sharpen0);
|
| + coeff8 = _mm_add_epi16(coeff8, sharpen8);
|
| + }
|
|
|
| - // out = (coeff * iQ + B) >> QFIX;
|
| + // out = (coeff * iQ + B) >> QFIX
|
| {
|
| // doing calculations with 32b precision (QFIX=17)
|
| // out = (coeff * iQ)
|
| - __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
|
| - __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
|
| - __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
|
| - __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
|
| + const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
|
| + const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
|
| + const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
|
| + const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
|
| __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
|
| __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
|
| __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
|
| __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
|
| - // expand bias from 16b to 32b
|
| - __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
|
| - __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
|
| - __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
|
| - __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
|
| // out = (coeff * iQ + B)
|
| + const __m128i bias_00 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
|
| + const __m128i bias_04 = _mm_loadu_si128((__m128i*)&mtx->bias_[4]);
|
| + const __m128i bias_08 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
|
| + const __m128i bias_12 = _mm_loadu_si128((__m128i*)&mtx->bias_[12]);
|
| out_00 = _mm_add_epi32(out_00, bias_00);
|
| out_04 = _mm_add_epi32(out_04, bias_04);
|
| out_08 = _mm_add_epi32(out_08, bias_08);
|
| out_12 = _mm_add_epi32(out_12, bias_12);
|
| - // out = (coeff * iQ + B) >> QFIX;
|
| + // out = QUANTDIV(coeff, iQ, B, QFIX)
|
| out_00 = _mm_srai_epi32(out_00, QFIX);
|
| out_04 = _mm_srai_epi32(out_04, QFIX);
|
| out_08 = _mm_srai_epi32(out_08, QFIX);
|
| @@ -916,19 +916,44 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
| }
|
|
|
| // detect if all 'out' values are zeroes or not
|
| - {
|
| - int32_t tmp[4];
|
| - _mm_storeu_si128((__m128i*)tmp, packed_out);
|
| - if (n) {
|
| - tmp[0] &= ~0xff;
|
| - }
|
| - return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
|
| - }
|
| + return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
|
| }
|
|
|
| -static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16],
|
| - const VP8Matrix* const mtx) {
|
| - return QuantizeBlockSSE2(in, out, 0, mtx);
|
| +static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
| + const VP8Matrix* const mtx) {
|
| + return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
|
| +}
|
| +
|
| +static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
|
| + const VP8Matrix* const mtx) {
|
| + return DoQuantizeBlock(in, out, NULL, mtx);
|
| +}
|
| +
|
| +// Forward declaration.
|
| +void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
|
| + VP8Residual* const res);
|
| +
|
| +void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
|
| + VP8Residual* const res) {
|
| + const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
|
| + const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
|
| + // Use SSE to compare 8 values with a single instruction.
|
| + const __m128i zero = _mm_setzero_si128();
|
| + const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
|
| + const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
|
| + // Get the comparison results as a bitmask, consisting of two times 16 bits:
|
| + // two identical bits for each result. Concatenate both bitmasks to get a
|
| + // single 32 bit value. Negate the mask to get the position of entries that
|
| + // are not equal to zero. We don't need to mask out least significant bits
|
| + // according to res->first, since coeffs[0] is 0 if res->first > 0
|
| + const uint32_t mask =
|
| + ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0));
|
| + // The position of the most significant non-zero bit indicates the position of
|
| + // the last non-zero value. Divide the result by two because __movemask_epi8
|
| + // operates on 8 bit values instead of 16 bit values.
|
| + assert(res->first == 0 || coeffs[0] == 0);
|
| + res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
|
| + res->coeffs = coeffs;
|
| }
|
|
|
| #endif // WEBP_USE_SSE2
|
| @@ -940,18 +965,18 @@ extern void VP8EncDspInitSSE2(void);
|
|
|
| void VP8EncDspInitSSE2(void) {
|
| #if defined(WEBP_USE_SSE2)
|
| - VP8CollectHistogram = CollectHistogramSSE2;
|
| - VP8EncQuantizeBlock = QuantizeBlockSSE2;
|
| - VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2;
|
| - VP8ITransform = ITransformSSE2;
|
| - VP8FTransform = FTransformSSE2;
|
| - VP8FTransformWHT = FTransformWHTSSE2;
|
| - VP8SSE16x16 = SSE16x16SSE2;
|
| - VP8SSE16x8 = SSE16x8SSE2;
|
| - VP8SSE8x8 = SSE8x8SSE2;
|
| - VP8SSE4x4 = SSE4x4SSE2;
|
| - VP8TDisto4x4 = Disto4x4SSE2;
|
| - VP8TDisto16x16 = Disto16x16SSE2;
|
| + VP8CollectHistogram = CollectHistogram;
|
| + VP8EncQuantizeBlock = QuantizeBlock;
|
| + VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
|
| + VP8ITransform = ITransform;
|
| + VP8FTransform = FTransform;
|
| + VP8FTransformWHT = FTransformWHT;
|
| + VP8SSE16x16 = SSE16x16;
|
| + VP8SSE16x8 = SSE16x8;
|
| + VP8SSE8x8 = SSE8x8;
|
| + VP8SSE4x4 = SSE4x4;
|
| + VP8TDisto4x4 = Disto4x4;
|
| + VP8TDisto16x16 = Disto16x16;
|
| #endif // WEBP_USE_SSE2
|
| }
|
|
|
|
|