Index: source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
diff --git a/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
index 679c66e30b86c843d08d0eb248659c5ee12a304a..00abd3c49622b83e73de14618220ae15ba5f935f 100644
--- a/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -230,6 +230,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
                           const int16_t* scan_ptr,
                           const int16_t* iscan_ptr) {
   __m128i zero;
+  __m128i thr;
+  int16_t nzflag;
   (void)scan_ptr;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
@@ -316,6 +318,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
       n_coeffs += 8 * 2;
     }
 
+    thr = _mm_srai_epi16(dequant, 1);
+
     // AC only loop
     while (n_coeffs < 0) {
       __m128i coeff0, coeff1;
@@ -335,28 +339,39 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
 
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
 
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+        if (nzflag) {
+          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
 
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+          // Reinsert signs
+          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
 
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
 
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        } else {
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+        }
       }
 
-      {
+      if (nzflag) {
         // Scan for eob
         __m128i zero_coeff0, zero_coeff1;
         __m128i nzero_coeff0, nzero_coeff1;
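For reference, a minimal scalar sketch of the early-skip idea this hunk introduces: a group of 16 AC coefficients is only quantized (rounded, multiplied, stored) when at least one absolute coefficient exceeds thr = dequant >> 1; otherwise the whole group is written as zero. The threshold, the 16-bit mulhi-style multiply, and the zero-fill else path mirror the patch; the function name quantize_fp_group_sketch and the plain scalar arithmetic are illustrative assumptions, not libvpx code.

/* Hypothetical scalar model of the SSE2 change above (not part of libvpx). */
#include <stdint.h>
#include <stdlib.h>

void quantize_fp_group_sketch(const int16_t *coeff, int16_t *qcoeff,
                              int16_t *dqcoeff, int16_t round,
                              int16_t quant, int16_t dequant) {
  const int16_t thr = dequant >> 1;  /* same threshold as thr in the patch */
  int nzflag = 0;
  int i;

  /* Cheap pre-check: does any coefficient in the group exceed the threshold? */
  for (i = 0; i < 16; ++i) {
    if (abs(coeff[i]) > thr) nzflag = 1;
  }

  if (nzflag) {
    for (i = 0; i < 16; ++i) {
      const int sign = (coeff[i] < 0) ? -1 : 1;
      /* Scalar analogue of _mm_adds_epi16 + _mm_mulhi_epi16. */
      const int abs_q = ((abs(coeff[i]) + round) * quant) >> 16;
      qcoeff[i] = (int16_t)(sign * abs_q);
      dqcoeff[i] = (int16_t)(qcoeff[i] * dequant);
    }
  } else {
    /* All coefficients quantize to zero; skip the arithmetic entirely. */
    for (i = 0; i < 16; ++i) qcoeff[i] = dqcoeff[i] = 0;
  }
}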