OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include <emmintrin.h> |
| 12 |
| 13 #include "vp9/common/vp9_common.h" |
| 14 |
| 15 #if CONFIG_VP9_HIGHBITDEPTH |
| 16 // from vp9_idct.h: typedef int32_t tran_low_t; |
| 17 void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, |
| 18 intptr_t count, |
| 19 int skip_block, |
| 20 const int16_t *zbin_ptr, |
| 21 const int16_t *round_ptr, |
| 22 const int16_t *quant_ptr, |
| 23 const int16_t *quant_shift_ptr, |
| 24 tran_low_t *qcoeff_ptr, |
| 25 tran_low_t *dqcoeff_ptr, |
| 26 const int16_t *dequant_ptr, |
| 27 int zbin_oq_value, |
| 28 uint16_t *eob_ptr, |
| 29 const int16_t *scan, |
| 30 const int16_t *iscan) { |
| 31 int i, j, non_zero_regs = (int)count / 4, eob_i = -1; |
| 32 __m128i zbins[2]; |
| 33 __m128i nzbins[2]; |
| 34 |
| 35 zbins[0] = _mm_set_epi32((int)(zbin_ptr[1] + zbin_oq_value), |
| 36 (int)(zbin_ptr[1] + zbin_oq_value), |
| 37 (int)(zbin_ptr[1] + zbin_oq_value), |
| 38 (int)(zbin_ptr[0] + zbin_oq_value)); |
| 39 zbins[1] = _mm_set1_epi32((int)(zbin_ptr[1] + zbin_oq_value)); |
| 40 |
| 41 nzbins[0] = _mm_setzero_si128(); |
| 42 nzbins[1] = _mm_setzero_si128(); |
| 43 nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); |
| 44 nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); |
| 45 |
| 46 (void)scan; |
| 47 |
| 48 vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); |
| 49 vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); |
| 50 |
| 51 if (!skip_block) { |
| 52 // Pre-scan pass |
| 53 for (i = ((int)count / 4) - 1; i >= 0; i--) { |
| 54 __m128i coeffs, cmp1, cmp2; |
| 55 int test; |
| 56 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); |
| 57 cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); |
| 58 cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); |
| 59 cmp1 = _mm_and_si128(cmp1, cmp2); |
| 60 test = _mm_movemask_epi8(cmp1); |
| 61 if (test == 0xffff) |
| 62 non_zero_regs--; |
| 63 else |
| 64 break; |
| 65 } |
| 66 |
| 67 // Quantization pass: |
| 68 for (i = 0; i < non_zero_regs; i++) { |
| 69 __m128i coeffs, coeffs_sign, tmp1, tmp2; |
| 70 int test; |
| 71 int abs_coeff[4]; |
| 72 int coeff_sign[4]; |
| 73 |
| 74 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); |
| 75 coeffs_sign = _mm_srai_epi32(coeffs, 31); |
| 76 coeffs = _mm_sub_epi32( |
| 77 _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); |
| 78 tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); |
| 79 tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); |
| 80 tmp1 = _mm_or_si128(tmp1, tmp2); |
| 81 test = _mm_movemask_epi8(tmp1); |
| 82 _mm_storeu_si128((__m128i*)abs_coeff, coeffs); |
| 83 _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign); |
| 84 |
| 85 for (j = 0; j < 4; j++) { |
| 86 if (test & (1 << (4 * j))) { |
| 87 int k = 4 * i + j; |
| 88 int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0], |
| 89 INT32_MIN, INT32_MAX); |
| 90 tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) * |
| 91 quant_shift_ptr[k != 0]) >> 16; // quantization |
| 92 qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j]; |
| 93 dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; |
| 94 if (tmp) |
| 95 eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; |
| 96 } |
| 97 } |
| 98 } |
| 99 } |
| 100 *eob_ptr = eob_i + 1; |
| 101 } |
| 102 |
| 103 |
| 104 void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, |
| 105 intptr_t n_coeffs, |
| 106 int skip_block, |
| 107 const int16_t *zbin_ptr, |
| 108 const int16_t *round_ptr, |
| 109 const int16_t *quant_ptr, |
| 110 const int16_t *quant_shift_ptr, |
| 111 tran_low_t *qcoeff_ptr, |
| 112 tran_low_t *dqcoeff_ptr, |
| 113 const int16_t *dequant_ptr, |
| 114 int zbin_oq_value, |
| 115 uint16_t *eob_ptr, |
| 116 const int16_t *scan, |
| 117 const int16_t *iscan) { |
| 118 __m128i zbins[2]; |
| 119 __m128i nzbins[2]; |
| 120 int idx = 0; |
| 121 int idx_arr[1024]; |
| 122 int i, eob = -1; |
| 123 const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1); |
| 124 const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1); |
| 125 (void)scan; |
| 126 zbins[0] = _mm_set_epi32((zbin1_tmp + zbin_oq_value), |
| 127 (zbin1_tmp + zbin_oq_value), |
| 128 (zbin1_tmp + zbin_oq_value), |
| 129 (zbin0_tmp + zbin_oq_value)); |
| 130 zbins[1] = _mm_set1_epi32((zbin1_tmp + zbin_oq_value)); |
| 131 |
| 132 nzbins[0] = _mm_setzero_si128(); |
| 133 nzbins[1] = _mm_setzero_si128(); |
| 134 nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); |
| 135 nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); |
| 136 |
| 137 vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); |
| 138 vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); |
| 139 |
| 140 if (!skip_block) { |
| 141 // Pre-scan pass |
| 142 for (i = 0; i < n_coeffs / 4; i++) { |
| 143 __m128i coeffs, cmp1, cmp2; |
| 144 int test; |
| 145 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); |
| 146 cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); |
| 147 cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); |
| 148 cmp1 = _mm_and_si128(cmp1, cmp2); |
| 149 test = _mm_movemask_epi8(cmp1); |
| 150 if (!(test & 0xf)) |
| 151 idx_arr[idx++] = i * 4; |
| 152 if (!(test & 0xf0)) |
| 153 idx_arr[idx++] = i * 4 + 1; |
| 154 if (!(test & 0xf00)) |
| 155 idx_arr[idx++] = i * 4 + 2; |
| 156 if (!(test & 0xf000)) |
| 157 idx_arr[idx++] = i * 4 + 3; |
| 158 } |
| 159 |
| 160 // Quantization pass: only process the coefficients selected in |
| 161 // pre-scan pass. Note: idx can be zero. |
| 162 for (i = 0; i < idx; i++) { |
| 163 const int rc = idx_arr[i]; |
| 164 const int coeff = coeff_ptr[rc]; |
| 165 const int coeff_sign = (coeff >> 31); |
| 166 int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; |
| 167 int64_t tmp = clamp(abs_coeff + |
| 168 ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), |
| 169 INT32_MIN, INT32_MAX); |
| 170 tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * |
| 171 quant_shift_ptr[rc != 0]) >> 15; |
| 172 |
| 173 qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; |
| 174 dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; |
| 175 |
| 176 if (tmp) |
| 177 eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; |
| 178 } |
| 179 } |
| 180 *eob_ptr = eob + 1; |
| 181 } |
| 182 #endif |
OLD | NEW |