| Index: source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
|
| diff --git a/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
|
| index 679c66e30b86c843d08d0eb248659c5ee12a304a..00abd3c49622b83e73de14618220ae15ba5f935f 100644
|
| --- a/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
|
| +++ b/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
|
| @@ -230,6 +230,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
|
| const int16_t* scan_ptr,
|
| const int16_t* iscan_ptr) {
|
| __m128i zero;
|
| + __m128i thr;
|
| + int16_t nzflag;
|
| (void)scan_ptr;
|
| (void)zbin_ptr;
|
| (void)quant_shift_ptr;
|
| @@ -316,6 +318,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
|
| n_coeffs += 8 * 2;
|
| }
|
|
|
| + thr = _mm_srai_epi16(dequant, 1);
|
| +
|
| // AC only loop
|
| while (n_coeffs < 0) {
|
| __m128i coeff0, coeff1;
|
| @@ -335,28 +339,39 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
|
| qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
|
| qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
|
|
|
| - qcoeff0 = _mm_adds_epi16(qcoeff0, round);
|
| - qcoeff1 = _mm_adds_epi16(qcoeff1, round);
|
| - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
|
| - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
|
| + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
|
| + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
|
|
|
| - // Reinsert signs
|
| - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
|
| - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
|
| - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
|
| - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
|
| + if (nzflag) {
|
| + qcoeff0 = _mm_adds_epi16(qcoeff0, round);
|
| + qcoeff1 = _mm_adds_epi16(qcoeff1, round);
|
| + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
|
| + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
|
|
|
| - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
|
| - _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
|
| + // Reinsert signs
|
| + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
|
| + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
|
| + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
|
| + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
|
|
|
| - coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
|
| - coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
|
| + _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
|
| + _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
|
|
|
| - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
|
| - _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
|
| + coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
|
| + coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
|
| +
|
| + _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
|
| + _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
|
| + } else {
|
| + _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
|
| + _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
|
| +
|
| + _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
|
| + _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
|
| + }
|
| }
|
|
|
| - {
|
| + if (nzflag) {
|
| // Scan for eob
|
| __m128i zero_coeff0, zero_coeff1;
|
| __m128i nzero_coeff0, nzero_coeff1;
|
|
|