Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(119)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 212 matching lines...) Expand 10 before | Expand all | Expand 10 after
223 223
224 void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, 224 void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
225 int skip_block, const int16_t* zbin_ptr, 225 int skip_block, const int16_t* zbin_ptr,
226 const int16_t* round_ptr, const int16_t* quant_ptr, 226 const int16_t* round_ptr, const int16_t* quant_ptr,
227 const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, 227 const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
228 int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, 228 int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
229 uint16_t* eob_ptr, 229 uint16_t* eob_ptr,
230 const int16_t* scan_ptr, 230 const int16_t* scan_ptr,
231 const int16_t* iscan_ptr) { 231 const int16_t* iscan_ptr) {
232 __m128i zero; 232 __m128i zero;
233 __m128i thr;
234 int16_t nzflag;
233 (void)scan_ptr; 235 (void)scan_ptr;
234 (void)zbin_ptr; 236 (void)zbin_ptr;
235 (void)quant_shift_ptr; 237 (void)quant_shift_ptr;
236 238
237 coeff_ptr += n_coeffs; 239 coeff_ptr += n_coeffs;
238 iscan_ptr += n_coeffs; 240 iscan_ptr += n_coeffs;
239 qcoeff_ptr += n_coeffs; 241 qcoeff_ptr += n_coeffs;
240 dqcoeff_ptr += n_coeffs; 242 dqcoeff_ptr += n_coeffs;
241 n_coeffs = -n_coeffs; 243 n_coeffs = -n_coeffs;
242 zero = _mm_setzero_si128(); 244 zero = _mm_setzero_si128();
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
309 // Add one to convert from indices to counts 311 // Add one to convert from indices to counts
310 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 312 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
311 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 313 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
312 eob = _mm_and_si128(iscan0, nzero_coeff0); 314 eob = _mm_and_si128(iscan0, nzero_coeff0);
313 eob1 = _mm_and_si128(iscan1, nzero_coeff1); 315 eob1 = _mm_and_si128(iscan1, nzero_coeff1);
314 eob = _mm_max_epi16(eob, eob1); 316 eob = _mm_max_epi16(eob, eob1);
315 } 317 }
316 n_coeffs += 8 * 2; 318 n_coeffs += 8 * 2;
317 } 319 }
318 320
321 thr = _mm_srai_epi16(dequant, 1);
322
319 // AC only loop 323 // AC only loop
320 while (n_coeffs < 0) { 324 while (n_coeffs < 0) {
321 __m128i coeff0, coeff1; 325 __m128i coeff0, coeff1;
322 { 326 {
323 __m128i coeff0_sign, coeff1_sign; 327 __m128i coeff0_sign, coeff1_sign;
324 __m128i qcoeff0, qcoeff1; 328 __m128i qcoeff0, qcoeff1;
325 __m128i qtmp0, qtmp1; 329 __m128i qtmp0, qtmp1;
326 330
327 coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); 331 coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
328 coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); 332 coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
329 333
330 // Poor man's sign extract 334 // Poor man's sign extract
331 coeff0_sign = _mm_srai_epi16(coeff0, 15); 335 coeff0_sign = _mm_srai_epi16(coeff0, 15);
332 coeff1_sign = _mm_srai_epi16(coeff1, 15); 336 coeff1_sign = _mm_srai_epi16(coeff1, 15);
333 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 337 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
334 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 338 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
335 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 339 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
336 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 340 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
337 341
338 qcoeff0 = _mm_adds_epi16(qcoeff0, round); 342 nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
339 qcoeff1 = _mm_adds_epi16(qcoeff1, round); 343 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
340 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
341 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
342 344
343 // Reinsert signs 345 if (nzflag) {
344 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 346 qcoeff0 = _mm_adds_epi16(qcoeff0, round);
345 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 347 qcoeff1 = _mm_adds_epi16(qcoeff1, round);
346 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 348 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
347 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 349 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
348 350
349 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); 351 // Reinsert signs
350 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); 352 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
353 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
354 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
355 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
351 356
352 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 357 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
353 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 358 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
354 359
355 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); 360 coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
356 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); 361 coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
362
363 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
364 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
365 } else {
366 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
367 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
368
369 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
370 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
371 }
357 } 372 }
358 373
359 { 374 if (nzflag) {
360 // Scan for eob 375 // Scan for eob
361 __m128i zero_coeff0, zero_coeff1; 376 __m128i zero_coeff0, zero_coeff1;
362 __m128i nzero_coeff0, nzero_coeff1; 377 __m128i nzero_coeff0, nzero_coeff1;
363 __m128i iscan0, iscan1; 378 __m128i iscan0, iscan1;
364 __m128i eob0, eob1; 379 __m128i eob0, eob1;
365 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 380 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
366 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 381 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
367 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 382 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
368 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 383 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
369 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); 384 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
(...skipping 24 matching lines...) Expand all
394 do { 409 do {
395 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); 410 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
396 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); 411 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
397 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); 412 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
398 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); 413 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
399 n_coeffs += 8 * 2; 414 n_coeffs += 8 * 2;
400 } while (n_coeffs < 0); 415 } while (n_coeffs < 0);
401 *eob_ptr = 0; 416 *eob_ptr = 0;
402 } 417 }
403 } 418 }
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_highbd_sad_sse2.asm ('k') | source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698