Index: source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c |
diff --git a/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c |
index bdc75e993717d418b4c265edee2532f4acb2a3dd..1c1005aeeda8b50e3ff05b2bb7ebde15271c92eb 100644 |
--- a/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c |
+++ b/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c |
@@ -293,7 +293,8 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, |
if (!skip_block) { |
__m128i eob; |
- __m128i round, quant, dequant; |
+ __m128i round, quant, dequant, thr; |
+ int16_t nzflag; |
{ |
__m128i coeff0, coeff1; |
@@ -368,6 +369,7 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, |
// AC only loop |
index = 2; |
+ thr = _mm_srai_epi16(dequant, 1); |
while (n_coeffs < 0) { |
__m128i coeff0, coeff1; |
{ |
@@ -387,28 +389,39 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, |
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
- qcoeff0 = _mm_adds_epi16(qcoeff0, round); |
- qcoeff1 = _mm_adds_epi16(qcoeff1, round); |
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); |
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); |
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); |
- // Reinsert signs |
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); |
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); |
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
+ if (nzflag) { |
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round); |
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round); |
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); |
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); |
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); |
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); |
+ // Reinsert signs |
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); |
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); |
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant); |
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant); |
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); |
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); |
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); |
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); |
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant); |
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant); |
+ |
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); |
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); |
+ } else { |
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
+ |
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
+ } |
} |
- { |
+ if (nzflag) { |
// Scan for eob |
__m128i zero_coeff0, zero_coeff1; |
__m128i nzero_coeff0, nzero_coeff1; |