source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c - Issue 668403002: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c

Issue 668403002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11 #include <emmintrin.h>

	12 #include <xmmintrin.h>

	13

	14 #include "vpx/vpx_integer.h"

	15

	// Quantizes the eight 16-bit lanes of *coeff.
	//
	// A lane is forced to zero unless its magnitude is strictly greater than the
	// matching lane of *zbin. For the remaining lanes, with
	// t = |c| + round (saturating signed add), the result is
	//   sign(c) * ((((t * quant) >> 16) + t) * shift >> 16)
	// evaluated with signed 16-bit high-half multiplies and a wrapping add.
	//
	// Arguments are passed by pointer rather than by value because some 32-bit
	// compilers reject more than three by-value __m128i parameters.
	static __m128i quantize_lanes_sse2(const __m128i* coeff, const __m128i* zbin,
	                                   const __m128i* round, const __m128i* quant,
	                                   const __m128i* shift) {
	  // Poor man's sign extract: all ones for negative lanes, zero otherwise.
	  const __m128i sign = _mm_srai_epi16(*coeff, 15);
	  const __m128i magnitude = _mm_sub_epi16(_mm_xor_si128(*coeff, sign), sign);
	  const __m128i above_zbin = _mm_cmpgt_epi16(magnitude, *zbin);
	  const __m128i rounded = _mm_adds_epi16(magnitude, *round);
	  __m128i q;

	  q = _mm_add_epi16(_mm_mulhi_epi16(rounded, *quant), rounded);
	  q = _mm_mulhi_epi16(q, *shift);

	  // Reinsert signs, then mask out lanes that did not clear the zbin threshold.
	  q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
	  return _mm_and_si128(q, above_zbin);
	}

	// Returns, per lane, iscan[i] + 1 where the quantized coefficient is non-zero
	// and 0 elsewhere. iscan must be 16-byte aligned.
	static __m128i eob_lanes_sse2(const __m128i* qcoeff, const int16_t* iscan,
	                              const __m128i* zero) {
	  const __m128i is_zero = _mm_cmpeq_epi16(*qcoeff, *zero);
	  const __m128i is_nonzero = _mm_cmpeq_epi16(is_zero, *zero);
	  __m128i count = _mm_load_si128((const __m128i*)iscan);

	  // Subtracting the all-ones mask adds one: converts indices to counts.
	  count = _mm_sub_epi16(count, is_nonzero);
	  return _mm_and_si128(count, is_nonzero);
	}

	// SSE2 quantizer for a block of n_coeffs 16-bit coefficients.
	//
	// When skip_block is zero, writes the quantized values to qcoeff_ptr, the
	// dequantized values (qcoeff * dequant, low 16 bits) to dqcoeff_ptr, and the
	// end-of-block count to *eob_ptr: the largest iscan_ptr[i] + 1 over all i with
	// a non-zero quantized coefficient, or 0 if there is none. When skip_block is
	// non-zero, both output arrays are zero-filled and *eob_ptr is set to 0.
	//
	// Requirements visible from the code:
	//  - n_coeffs is a positive multiple of 16 (16 values are handled per step and
	//    the first step is unconditional).
	//  - coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan_ptr and the five parameter
	//    tables are 16-byte aligned (aligned loads/stores are used), and each
	//    parameter table holds at least eight entries.
	//
	// The first eight coefficients use the parameter tables as loaded; every later
	// group uses the upper four entries of each table duplicated. Presumably
	// entry 0 is the DC value and the remaining entries are the AC value.
	//
	// zbin_oq_value is added to every zbin entry. scan_ptr is unused.
	void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
	                         int skip_block, const int16_t* zbin_ptr,
	                         const int16_t* round_ptr, const int16_t* quant_ptr,
	                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
	                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
	                         int zbin_oq_value, uint16_t* eob_ptr,
	                         const int16_t* scan_ptr,
	                         const int16_t* iscan_ptr) {
	  const __m128i zero = _mm_setzero_si128();
	  __m128i zbin, round, quant, shift, dequant;
	  __m128i coeff0, coeff1;
	  __m128i qcoeff0, qcoeff1;
	  __m128i dqcoeff0, dqcoeff1;
	  __m128i eob, eob0, eob1, eob_shuffled;
	  (void)scan_ptr;

	  // Point every array at its end and count a negative index up towards zero.
	  coeff_ptr += n_coeffs;
	  iscan_ptr += n_coeffs;
	  qcoeff_ptr += n_coeffs;
	  dqcoeff_ptr += n_coeffs;
	  n_coeffs = -n_coeffs;

	  if (skip_block) {
	    do {
	      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
	      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
	      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
	      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
	      n_coeffs += 8 * 2;
	    } while (n_coeffs < 0);
	    *eob_ptr = 0;
	    return;
	  }

	  // Setup global values. The threshold is stored as zbin + zbin_oq - 1 so that
	  // the strict greater-than compare implements |coeff| >= zbin + zbin_oq.
	  zbin = _mm_load_si128((const __m128i*)zbin_ptr);
	  zbin = _mm_add_epi16(zbin, _mm_set1_epi16((int16_t)zbin_oq_value));
	  zbin = _mm_sub_epi16(zbin, _mm_set1_epi16(1));
	  round = _mm_load_si128((const __m128i*)round_ptr);
	  quant = _mm_load_si128((const __m128i*)quant_ptr);
	  shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
	  dequant = _mm_load_si128((const __m128i*)dequant_ptr);

	  // Do DC and first 15 AC.
	  coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
	  coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);

	  qcoeff0 = quantize_lanes_sse2(&coeff0, &zbin, &round, &quant, &shift);
	  dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);

	  // Switch DC to AC: duplicate the upper half of every table into both halves.
	  zbin = _mm_unpackhi_epi64(zbin, zbin);
	  round = _mm_unpackhi_epi64(round, round);
	  quant = _mm_unpackhi_epi64(quant, quant);
	  shift = _mm_unpackhi_epi64(shift, shift);
	  dequant = _mm_unpackhi_epi64(dequant, dequant);

	  qcoeff1 = quantize_lanes_sse2(&coeff1, &zbin, &round, &quant, &shift);
	  dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);

	  _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
	  _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
	  _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), dqcoeff0);
	  _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, dqcoeff1);

	  // Scan for eob on the quantized values, not on the dequantized products:
	  // _mm_mullo_epi16 keeps only the low 16 bits, so a non-zero coefficient
	  // whose product wraps to zero would otherwise be missed.
	  eob0 = eob_lanes_sse2(&qcoeff0, iscan_ptr + n_coeffs, &zero);
	  eob1 = eob_lanes_sse2(&qcoeff1, iscan_ptr + n_coeffs + 8, &zero);
	  eob = _mm_max_epi16(eob0, eob1);
	  n_coeffs += 8 * 2;

	  // AC only loop.
	  while (n_coeffs < 0) {
	    coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
	    coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);

	    qcoeff0 = quantize_lanes_sse2(&coeff0, &zbin, &round, &quant, &shift);
	    qcoeff1 = quantize_lanes_sse2(&coeff1, &zbin, &round, &quant, &shift);
	    dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
	    dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);

	    _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
	    _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
	    _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), dqcoeff0);
	    _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, dqcoeff1);

	    eob0 = eob_lanes_sse2(&qcoeff0, iscan_ptr + n_coeffs, &zero);
	    eob1 = eob_lanes_sse2(&qcoeff1, iscan_ptr + n_coeffs + 8, &zero);
	    eob = _mm_max_epi16(eob, _mm_max_epi16(eob0, eob1));
	    n_coeffs += 8 * 2;
	  }

	  // Accumulate EOB: horizontal maximum over the eight lanes. Each step folds
	  // the vector in half; after the last one lanes 0 and 1 both hold the result.
	  eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
	  eob = _mm_max_epi16(eob, eob_shuffled);
	  eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
	  eob = _mm_max_epi16(eob, eob_shuffled);
	  eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
	  eob = _mm_max_epi16(eob, eob_shuffled);
	  *eob_ptr = (uint16_t)_mm_extract_epi16(eob, 1);
	}

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c ('k') | source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c » ('j') | no next file with comments »