source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c - Issue 756673003: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c

Issue 756673003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 205 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
216 do {	216 do {

217 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);	217 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);

218 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);	218 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);

219 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);	219 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);

220 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);	220 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);

221 n_coeffs += 8 * 2;	221 n_coeffs += 8 * 2;

222 } while (n_coeffs < 0);	222 } while (n_coeffs < 0);

223 *eob_ptr = 0;	223 *eob_ptr = 0;

224 }	224 }

225 }	225 }

	226

	// Quantizes and dequantizes a block of 16-bit coefficients with SSE2,
	// sixteen coefficients per step.
	//
	// For every coefficient c the function computes
	//   q  = sign(c) * (((|c| + round) * quant) >> 16)
	//   dq = low 16 bits of (q * dequant)
	// where the addition saturates at 16 bits and the multiply keeps the signed
	// high half of the product. q is written to qcoeff_ptr, dq to dqcoeff_ptr.
	// *eob_ptr receives the largest (iscan_ptr[i] + 1) over all positions whose
	// q is nonzero, or 0 when every q is zero.
	//
	// round_ptr, quant_ptr and dequant_ptr are each read as one vector of eight
	// int16_t. The eight lanes are applied as loaded to coefficients 0..7; for
	// every later coefficient the upper four lanes are duplicated into both
	// halves of the vector and used instead.
	//
	// When skip_block is nonzero, nothing is quantized: both output arrays are
	// filled with zeros and *eob_ptr is set to 0.
	//
	// Requirements visible from the code:
	//  - Every array that is loaded or stored (coeff, iscan, qcoeff, dqcoeff,
	//    round, quant, dequant) is accessed with aligned 128-bit intrinsics and
	//    must therefore be 16-byte aligned.
	//  - Sixteen coefficients are always processed before n_coeffs is tested,
	//    so n_coeffs is expected to be a positive multiple of 16.
	//
	// zbin_ptr, quant_shift_ptr, zbin_oq_value and scan_ptr are not used.
	void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
	                          int skip_block, const int16_t* zbin_ptr,
	                          const int16_t* round_ptr, const int16_t* quant_ptr,
	                          const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
	                          int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
	                          int zbin_oq_value, uint16_t* eob_ptr,
	                          const int16_t* scan_ptr,
	                          const int16_t* iscan_ptr) {
	  __m128i zero;
	  (void)scan_ptr;
	  (void)zbin_ptr;
	  (void)quant_shift_ptr;
	  (void)zbin_oq_value;

	  // Point every array at its end and let n_coeffs run from -count up to 0,
	  // so a single negative offset indexes all four arrays.
	  coeff_ptr += n_coeffs;
	  iscan_ptr += n_coeffs;
	  qcoeff_ptr += n_coeffs;
	  dqcoeff_ptr += n_coeffs;
	  n_coeffs = -n_coeffs;
	  zero = _mm_setzero_si128();

	  if (!skip_block) {
	    __m128i eob;
	    __m128i round, quant, dequant;
	    {
	      // Quantized values of the first sixteen coefficients; kept in scope
	      // for the end-of-block scan below.
	      __m128i qcoeff0, qcoeff1;

	      // Setup global values
	      {
	        round = _mm_load_si128((const __m128i*)round_ptr);
	        quant = _mm_load_si128((const __m128i*)quant_ptr);
	        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
	      }

	      {
	        __m128i coeff0, coeff1;
	        __m128i coeff0_sign, coeff1_sign;
	        __m128i qtmp0, qtmp1;
	        // Do DC and first 15 AC
	        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
	        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);

	        // Poor man's sign extract: all-ones in negative lanes, zero elsewhere
	        coeff0_sign = _mm_srai_epi16(coeff0, 15);
	        coeff1_sign = _mm_srai_epi16(coeff1, 15);
	        // (x ^ sign) - sign yields the absolute value
	        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
	        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
	        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
	        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

	        // The vectors as loaded serve coefficients 0..7 only. After their
	        // first use the upper four lanes are copied into both halves, and
	        // that form is used for every remaining coefficient.
	        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
	        round = _mm_unpackhi_epi64(round, round);
	        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
	        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
	        quant = _mm_unpackhi_epi64(quant, quant);
	        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

	        // Reinsert signs
	        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
	        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
	        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
	        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

	        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
	        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

	        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
	        dequant = _mm_unpackhi_epi64(dequant, dequant);
	        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

	        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
	        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
	      }

	      {
	        // Scan for eob. The test is made on the quantized values, not on
	        // the dequantized products: those keep only the low 16 bits and can
	        // be zero although the quantized coefficient is not.
	        __m128i zero_coeff0, zero_coeff1;
	        __m128i nzero_coeff0, nzero_coeff1;
	        __m128i iscan0, iscan1;
	        __m128i eob1;
	        zero_coeff0 = _mm_cmpeq_epi16(qcoeff0, zero);
	        zero_coeff1 = _mm_cmpeq_epi16(qcoeff1, zero);
	        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
	        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
	        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
	        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
	        // Add one to convert from indices to counts (the mask is -1 in
	        // nonzero lanes, so subtracting it adds one there)
	        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
	        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
	        eob = _mm_and_si128(iscan0, nzero_coeff0);
	        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
	        eob = _mm_max_epi16(eob, eob1);
	      }
	      n_coeffs += 8 * 2;
	    }

	    // AC only loop
	    while (n_coeffs < 0) {
	      __m128i qcoeff0, qcoeff1;
	      {
	        __m128i coeff0, coeff1;
	        __m128i coeff0_sign, coeff1_sign;
	        __m128i qtmp0, qtmp1;

	        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
	        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);

	        // Poor man's sign extract
	        coeff0_sign = _mm_srai_epi16(coeff0, 15);
	        coeff1_sign = _mm_srai_epi16(coeff1, 15);
	        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
	        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
	        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
	        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

	        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
	        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
	        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
	        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

	        // Reinsert signs
	        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
	        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
	        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
	        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

	        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
	        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

	        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
	        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

	        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
	        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
	      }

	      {
	        // Scan for eob (on the quantized values, as above)
	        __m128i zero_coeff0, zero_coeff1;
	        __m128i nzero_coeff0, nzero_coeff1;
	        __m128i iscan0, iscan1;
	        __m128i eob0, eob1;
	        zero_coeff0 = _mm_cmpeq_epi16(qcoeff0, zero);
	        zero_coeff1 = _mm_cmpeq_epi16(qcoeff1, zero);
	        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
	        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
	        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
	        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
	        // Add one to convert from indices to counts
	        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
	        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
	        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
	        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
	        eob0 = _mm_max_epi16(eob0, eob1);
	        eob = _mm_max_epi16(eob, eob0);
	      }
	      n_coeffs += 8 * 2;
	    }

	    // Accumulate EOB: fold the eight lanes down to one maximum, which ends
	    // up in lanes 0 and 1.
	    {
	      __m128i eob_shuffled;
	      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
	      eob = _mm_max_epi16(eob, eob_shuffled);
	      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
	      eob = _mm_max_epi16(eob, eob_shuffled);
	      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
	      eob = _mm_max_epi16(eob, eob_shuffled);
	      *eob_ptr = _mm_extract_epi16(eob, 1);
	    }
	  } else {
	    // Skipped block: clear both outputs, sixteen coefficients per pass.
	    do {
	      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
	      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
	      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
	      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
	      n_coeffs += 8 * 2;
	    } while (n_coeffs < 0);
	    *eob_ptr = 0;
	  }
	}

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c ('k') | source/libvpx/vp9/vp9_cx_iface.c » ('j') | no next file with comments »