source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c - Issue 756673003: libvpx: Pull from upstream

Unified Diff: source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c

Issue 756673003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c ('k') | source/libvpx/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c

===================================================================

--- source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c (revision 0)

+++ source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c (working copy)

@@ -0,0 +1,182 @@

+/*

+ *

+ * Use of this source code is governed by a BSD-style license

+ * that can be found in the LICENSE file in the root of the source

+ * tree. An additional intellectual property rights grant can be found

+ * in the file PATENTS. All contributing project authors may

+ * be found in the AUTHORS file in the root of the source tree.

+ */

+#include <emmintrin.h>

+#include "vp9/common/vp9_common.h"

+#if CONFIG_VP9_HIGHBITDEPTH

+// from vp9_idct.h: typedef int32_t tran_low_t;

+void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,  // Quantizes `count` coefficients into qcoeff_ptr/dqcoeff_ptr and sets *eob_ptr. coeff_ptr: input, read with _mm_load_si128 (needs 16-byte alignment).

+ intptr_t count,  // number of coefficients; the SIMD loops cover count / 4 groups of four

+ int skip_block,  // non-zero: both passes are skipped and the outputs stay zeroed

+ const int16_t *zbin_ptr,  // thresholds: [0] for coefficient 0, [1] for every other coefficient

+ const int16_t *round_ptr,  // rounding offsets, indexed [k != 0]

+ const int16_t *quant_ptr,  // quantizer multipliers, indexed [k != 0]

+ const int16_t *quant_shift_ptr,  // second-stage multipliers, indexed [k != 0]

+ tran_low_t *qcoeff_ptr,  // out: quantized coefficients (zero-filled first)

+ tran_low_t *dqcoeff_ptr,  // out: qcoeff * dequant (zero-filled first)

+ const int16_t *dequant_ptr,  // dequantizer multipliers, indexed [k != 0]

+ int zbin_oq_value,  // offset added to both zbin thresholds

+ uint16_t *eob_ptr,  // out: receives eob_i + 1

+ const int16_t *scan,  // unused (see the (void) cast below)

+ const int16_t *iscan) {  // per-coefficient lookup used only to update eob_i

+ int i, j, non_zero_regs = (int)count / 4, eob_i = -1;  // non_zero_regs: number of 4-coefficient groups the quantization pass will visit

+ __m128i zbins[2];  // [0]: thresholds for group 0, [1]: thresholds for all later groups

+ __m128i nzbins[2];  // negated thresholds

+ zbins[0] = _mm_set_epi32((int)(zbin_ptr[1] + zbin_oq_value),  // NOTE(review): _mm_set_epi32 takes four arguments; one argument line appears to be missing from this extract

+ (int)(zbin_ptr[1] + zbin_oq_value),

+ (int)(zbin_ptr[0] + zbin_oq_value));  // last argument is the lowest lane, i.e. coefficient 0

+ zbins[1] = _mm_set1_epi32((int)(zbin_ptr[1] + zbin_oq_value));

+ nzbins[0] = _mm_setzero_si128();

+ nzbins[1] = _mm_setzero_si128();

+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);  // 0 - zbin

+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

+ (void)scan;

+ vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));  // outputs are cleared regardless of skip_block

+ vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

+ if (!skip_block) {

+ // Pre-scan pass: walk groups from last to first and drop trailing groups whose four coefficients all lie strictly inside (-zbin, zbin)

+ for (i = ((int)count / 4) - 1; i >= 0; i--) {

+ __m128i coeffs, cmp1, cmp2;

+ int test;

+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);  // coeff < zbin

+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);  // coeff > -zbin

+ cmp1 = _mm_and_si128(cmp1, cmp2);

+ test = _mm_movemask_epi8(cmp1);  // one bit per byte, so four bits per 32-bit lane

+ if (test == 0xffff)  // all four lanes are inside the threshold

+ non_zero_regs--;

+ else

+ break;  // first group (from the end) with a coefficient at or beyond the threshold

+ }

+ // Quantization pass: visits only the first non_zero_regs groups

+ for (i = 0; i < non_zero_regs; i++) {

+ __m128i coeffs, coeffs_sign, tmp1, tmp2;

+ int test;

+ int abs_coeff[4];

+ int coeff_sign[4];

+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+ coeffs_sign = _mm_srai_epi32(coeffs, 31);  // 0 for non-negative lanes, -1 for negative lanes

+ coeffs = _mm_sub_epi32(  // (x ^ sign) - sign: per-lane absolute value

+ _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);

+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);

+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);

+ tmp1 = _mm_or_si128(tmp1, tmp2);  // |coeff| >= zbin

+ test = _mm_movemask_epi8(tmp1);

+ _mm_storeu_si128((__m128i*)abs_coeff, coeffs);  // spill to scalars for the per-lane loop

+ _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);

+ for (j = 0; j < 4; j++) {

+ if (test & (1 << (4 * j))) {  // bit 4*j is the lowest mask bit of lane j

+ int k = 4 * i + j;  // coefficient index

+ int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],  // NOTE(review): the sum is formed in int before clamp is applied; clamp is defined outside this file - confirm it can have any effect here

+ INT32_MIN, INT32_MAX);

+ tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *

+ quant_shift_ptr[k != 0]) >> 16; // quantization: tmp becomes the quantized magnitude

+ qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];  // re-apply the original sign

+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];

+ if (tmp)  // only non-zero results can raise eob_i

+ eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;  // running maximum of iscan[k]

+ }  // NOTE(review): the remaining closing braces of this function are missing from this extract

+ *eob_ptr = eob_i + 1;  // eob_i is still -1 when no coefficient quantized to non-zero, which yields 0

+void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,  // 32x32 variant: quantizes n_coeffs coefficients into qcoeff_ptr/dqcoeff_ptr and sets *eob_ptr. coeff_ptr: input, read with _mm_load_si128 (needs 16-byte alignment).

+ intptr_t n_coeffs,  // number of coefficients; the pre-scan covers n_coeffs / 4 groups of four

+ int skip_block,  // non-zero: both passes are skipped and the outputs stay zeroed

+ const int16_t *zbin_ptr,  // thresholds: [0] for coefficient 0, [1] for every other coefficient

+ const int16_t *round_ptr,  // rounding offsets, indexed [rc != 0]

+ const int16_t *quant_ptr,  // quantizer multipliers, indexed [rc != 0]

+ const int16_t *quant_shift_ptr,  // second-stage multipliers, indexed [rc != 0]

+ tran_low_t *qcoeff_ptr,  // out: quantized coefficients (zero-filled first)

+ tran_low_t *dqcoeff_ptr,  // out: qcoeff * dequant / 2 (zero-filled first)

+ const int16_t *dequant_ptr,  // dequantizer multipliers, indexed [rc != 0]

+ int zbin_oq_value,  // offset applied to the zbin thresholds

+ uint16_t *eob_ptr,  // out: receives eob + 1

+ const int16_t *scan,  // unused (see the (void) cast below)

+ const int16_t *iscan) {  // per-coefficient lookup used only to update eob

+ __m128i zbins[2];  // [0]: thresholds for group 0, [1]: thresholds for all later groups

+ __m128i nzbins[2];  // negated thresholds

+ int idx = 0;  // number of entries used in idx_arr

+ int idx_arr[1024];  // indices chosen by the pre-scan; NOTE(review): up to four entries are appended per group with no bounds check, so this assumes n_coeffs <= 1024 - confirm against callers

+ int i, eob = -1;

+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);  // ROUND_POWER_OF_TWO is defined outside this file; presumably a rounded right shift by 1 - TODO confirm

+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);

+ (void)scan;

+ zbins[0] = _mm_set_epi32((zbin1_tmp + zbin_oq_value),  // NOTE(review): zbin_oq_value is added again here although zbin*_tmp already include it - confirm intended. One of the four _mm_set_epi32 argument lines also appears to be missing from this extract.

+ (zbin1_tmp + zbin_oq_value),

+ (zbin0_tmp + zbin_oq_value));  // last argument is the lowest lane, i.e. coefficient 0

+ zbins[1] = _mm_set1_epi32((zbin1_tmp + zbin_oq_value));

+ nzbins[0] = _mm_setzero_si128();

+ nzbins[1] = _mm_setzero_si128();

+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);  // 0 - zbin

+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

+ vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // outputs are cleared regardless of skip_block

+ vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

+ if (!skip_block) {

+ // Pre-scan pass: record the index of every coefficient that is not strictly inside (-zbin, zbin)

+ for (i = 0; i < n_coeffs / 4; i++) {

+ __m128i coeffs, cmp1, cmp2;

+ int test;

+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);  // coeff < zbin

+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);  // coeff > -zbin

+ cmp1 = _mm_and_si128(cmp1, cmp2);

+ test = _mm_movemask_epi8(cmp1);  // one bit per byte, so one nibble per 32-bit lane

+ if (!(test & 0xf))  // lane 0 nibble clear: coefficient is at or beyond the threshold

+ idx_arr[idx++] = i * 4;

+ if (!(test & 0xf0))  // lane 1

+ idx_arr[idx++] = i * 4 + 1;

+ if (!(test & 0xf00))  // lane 2

+ idx_arr[idx++] = i * 4 + 2;

+ if (!(test & 0xf000))  // lane 3

+ idx_arr[idx++] = i * 4 + 3;

+ }

+ // Quantization pass: only process the coefficients selected in

+ // pre-scan pass. Note: idx can be zero.

+ for (i = 0; i < idx; i++) {

+ const int rc = idx_arr[i];  // coefficient index

+ const int coeff = coeff_ptr[rc];

+ const int coeff_sign = (coeff >> 31);  // 0 for non-negative, -1 for negative

+ int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // absolute value

+ int64_t tmp = clamp(abs_coeff +  // NOTE(review): the sum is formed in int before clamp is applied; clamp is defined outside this file - confirm it can have any effect here

+ ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),

+ INT32_MIN, INT32_MAX);

+ tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *

+ quant_shift_ptr[rc != 0]) >> 15;  // final shift is 15 here, versus 16 in vp9_highbd_quantize_b_sse2

+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;  // re-apply the original sign

+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;  // multiply first, then halve

+ if (tmp)  // only non-zero results can raise eob

+ eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;  // running maximum of iscan[rc]

+ }  // NOTE(review): the remaining closing braces of this function are missing from this extract

+ *eob_ptr = eob + 1;  // eob is still -1 when no coefficient quantized to non-zero, which yields 0

+#endif