Index: source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c |
=================================================================== |
--- source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c (revision 0) |
+++ source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c (working copy) |
@@ -0,0 +1,182 @@ |
+/* |
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
+ * |
+ * Use of this source code is governed by a BSD-style license |
+ * that can be found in the LICENSE file in the root of the source |
+ * tree. An additional intellectual property rights grant can be found |
+ * in the file PATENTS. All contributing project authors may |
+ * be found in the AUTHORS file in the root of the source tree. |
+ */ |
+ |
+#include <emmintrin.h> |
+ |
+#include "vp9/common/vp9_common.h" |
+ |
+#if CONFIG_VP9_HIGHBITDEPTH |
+// from vp9_idct.h: typedef int32_t tran_low_t; |
+/* Quantizes `count` high-bitdepth coefficients: an SSE2 zero-bin pre-scan, |
+ * then scalar quantization of the groups of four that may be non-zero. |
+ * Index 0 of each 2-entry table applies to coefficient 0, index 1 to all |
+ * others. qcoeff_ptr/dqcoeff_ptr are zeroed first and stay zero when |
+ * skip_block is set. *eob_ptr receives 1 + the largest iscan[] position |
+ * holding a non-zero quantized value (0 if none). `scan` is unused. |
+ * coeff_ptr is read with aligned loads, so it must be 16-byte aligned. */ |
+void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, |
+                                int skip_block, const int16_t *zbin_ptr, |
+                                const int16_t *round_ptr, |
+                                const int16_t *quant_ptr, |
+                                const int16_t *quant_shift_ptr, |
+                                tran_low_t *qcoeff_ptr, |
+                                tran_low_t *dqcoeff_ptr, |
+                                const int16_t *dequant_ptr, int zbin_oq_value, |
+                                uint16_t *eob_ptr, const int16_t *scan, |
+                                const int16_t *iscan) { |
+  const int n_regs = (int)count / 4; |
+  int i, j, non_zero_regs = n_regs, eob_i = -1; |
+  const int zbin0 = zbin_ptr[0] + zbin_oq_value; |
+  const int zbin1 = zbin_ptr[1] + zbin_oq_value; |
+  __m128i zbins[2], nzbins[2]; |
+ |
+  // Lane 0 of zbins[0] is the threshold for coefficient 0. |
+  zbins[0] = _mm_set_epi32(zbin1, zbin1, zbin1, zbin0); |
+  zbins[1] = _mm_set1_epi32(zbin1); |
+  nzbins[0] = _mm_sub_epi32(_mm_setzero_si128(), zbins[0]); |
+  nzbins[1] = _mm_sub_epi32(_mm_setzero_si128(), zbins[1]); |
+ |
+  (void)scan; |
+ |
+  vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); |
+  vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); |
+ |
+  if (!skip_block) { |
+    // Pre-scan pass: drop trailing groups of four whose coefficients all lie |
+    // strictly inside (-zbin, zbin). |
+    for (i = n_regs - 1; i >= 0; i--) { |
+      __m128i coeffs, cmp1, cmp2; |
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); |
+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); |
+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); |
+      if (_mm_movemask_epi8(_mm_and_si128(cmp1, cmp2)) != 0xffff) |
+        break; |
+      non_zero_regs--; |
+    } |
+ |
+    // Quantization pass: |
+    for (i = 0; i < non_zero_regs; i++) { |
+      __m128i coeffs, coeffs_sign, tmp1, tmp2; |
+      int test; |
+      int abs_coeff[4]; |
+      int coeff_sign[4]; |
+ |
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); |
+      coeffs_sign = _mm_srai_epi32(coeffs, 31); |
+      // |x| = (x ^ sign) - sign, with sign being 0 or -1 per lane. |
+      coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); |
+      // abs >= zbin, expressed as (abs > zbin) | (abs == zbin). |
+      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); |
+      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); |
+      test = _mm_movemask_epi8(_mm_or_si128(tmp1, tmp2)); |
+      _mm_storeu_si128((__m128i *)abs_coeff, coeffs); |
+      _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); |
+ |
+      for (j = 0; j < 4; j++) { |
+        // Each 32-bit lane owns four movemask bits; test the lowest one. |
+        if (test & (1 << (4 * j))) { |
+          const int k = 4 * i + j; |
+          // Add in 64 bits: an int sum could overflow before being clamped. |
+          int64_t tmp = (int64_t)abs_coeff[j] + round_ptr[k != 0]; |
+          if (tmp > INT32_MAX) tmp = INT32_MAX; |
+          if (tmp < INT32_MIN) tmp = INT32_MIN; |
+          tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) * |
+                 quant_shift_ptr[k != 0]) >> 16;  // quantization |
+          qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j]; |
+          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; |
+          if (tmp) |
+            eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; |
+        } |
+      } |
+    } |
+  } |
+  *eob_ptr = eob_i + 1; |
+} |
+ |
+ |
+/* 32x32 variant of the high-bitdepth quantizer. An SSE2 pre-scan lists the |
+ * indices of coefficients outside (-zbin, zbin); only those are quantized, |
+ * in scalar code, using ROUND_POWER_OF_TWO(round, 1), a final shift of 15 |
+ * and a dequantized value divided by 2. Index 0 of each 2-entry table |
+ * applies to coefficient 0, index 1 to all others. Outputs are zeroed first |
+ * and stay zero when skip_block is set; *eob_ptr receives 1 + the largest |
+ * iscan[] position with a non-zero quantized value. `scan` is unused. |
+ * coeff_ptr needs 16-byte alignment; at most 1024 coefficients fit. */ |
+void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, |
+                                      intptr_t n_coeffs, int skip_block, |
+                                      const int16_t *zbin_ptr, |
+                                      const int16_t *round_ptr, |
+                                      const int16_t *quant_ptr, |
+                                      const int16_t *quant_shift_ptr, |
+                                      tran_low_t *qcoeff_ptr, |
+                                      tran_low_t *dqcoeff_ptr, |
+                                      const int16_t *dequant_ptr, |
+                                      int zbin_oq_value, uint16_t *eob_ptr, |
+                                      const int16_t *scan, |
+                                      const int16_t *iscan) { |
+  __m128i zbins[2], nzbins[2]; |
+  int idx_arr[1024]; |
+  int i, idx = 0, eob = -1; |
+  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1); |
+  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1); |
+  // NOTE(review): zbin_oq_value is added inside ROUND_POWER_OF_TWO and again |
+  // here; confirm this matches the reference quantizer. |
+  const int zbin0 = zbin0_tmp + zbin_oq_value; |
+  const int zbin1 = zbin1_tmp + zbin_oq_value; |
+  (void)scan; |
+  zbins[0] = _mm_set_epi32(zbin1, zbin1, zbin1, zbin0); |
+  zbins[1] = _mm_set1_epi32(zbin1); |
+  nzbins[0] = _mm_sub_epi32(_mm_setzero_si128(), zbins[0]); |
+  nzbins[1] = _mm_sub_epi32(_mm_setzero_si128(), zbins[1]); |
+ |
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); |
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); |
+ |
+  if (!skip_block) { |
+    // Pre-scan pass |
+    for (i = 0; i < n_coeffs / 4; i++) { |
+      __m128i coeffs, cmp1, cmp2; |
+      int test; |
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); |
+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); |
+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); |
+      test = _mm_movemask_epi8(_mm_and_si128(cmp1, cmp2)); |
+      // A clear nibble marks a lane at or beyond the zero-bin threshold. |
+      if (!(test & 0xf)) idx_arr[idx++] = i * 4; |
+      if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; |
+      if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; |
+      if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; |
+    } |
+ |
+    // Quantization pass: only process the coefficients selected in |
+    // pre-scan pass. Note: idx can be zero. |
+    for (i = 0; i < idx; i++) { |
+      const int rc = idx_arr[i]; |
+      const int coeff = coeff_ptr[rc]; |
+      const int coeff_sign = (coeff >> 31); |
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; |
+      // Add in 64 bits: an int sum could overflow before being clamped. |
+      int64_t tmp = |
+          (int64_t)abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); |
+      if (tmp > INT32_MAX) tmp = INT32_MAX; |
+      if (tmp < INT32_MIN) tmp = INT32_MIN; |
+      tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * |
+             quant_shift_ptr[rc != 0]) >> 15; |
+ |
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; |
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; |
+ |
+      if (tmp) |
+        eob = iscan[rc] > eob ? iscan[rc] : eob; |
+    } |
+  } |
+  *eob_ptr = eob + 1; |
+} |
+#endif |