OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 275 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
286 } | 286 } |
287 | 287 |
288 iscan_ptr += n_coeffs; | 288 iscan_ptr += n_coeffs; |
289 qcoeff_ptr += n_coeffs; | 289 qcoeff_ptr += n_coeffs; |
290 dqcoeff_ptr += n_coeffs; | 290 dqcoeff_ptr += n_coeffs; |
291 n_coeffs = -n_coeffs; | 291 n_coeffs = -n_coeffs; |
292 zero = _mm_setzero_si128(); | 292 zero = _mm_setzero_si128(); |
293 | 293 |
294 if (!skip_block) { | 294 if (!skip_block) { |
295 __m128i eob; | 295 __m128i eob; |
296 __m128i round, quant, dequant; | 296 __m128i round, quant, dequant, thr; |
| 297 int16_t nzflag; |
297 { | 298 { |
298 __m128i coeff0, coeff1; | 299 __m128i coeff0, coeff1; |
299 | 300 |
300 // Setup global values | 301 // Setup global values |
301 { | 302 { |
302 round = _mm_load_si128((const __m128i*)round_ptr); | 303 round = _mm_load_si128((const __m128i*)round_ptr); |
303 quant = _mm_load_si128((const __m128i*)quant_ptr); | 304 quant = _mm_load_si128((const __m128i*)quant_ptr); |
304 dequant = _mm_load_si128((const __m128i*)dequant_ptr); | 305 dequant = _mm_load_si128((const __m128i*)dequant_ptr); |
305 } | 306 } |
306 | 307 |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
361 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); | 362 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); |
362 eob = _mm_and_si128(iscan0, nzero_coeff0); | 363 eob = _mm_and_si128(iscan0, nzero_coeff0); |
363 eob1 = _mm_and_si128(iscan1, nzero_coeff1); | 364 eob1 = _mm_and_si128(iscan1, nzero_coeff1); |
364 eob = _mm_max_epi16(eob, eob1); | 365 eob = _mm_max_epi16(eob, eob1); |
365 } | 366 } |
366 n_coeffs += 8 * 2; | 367 n_coeffs += 8 * 2; |
367 } | 368 } |
368 | 369 |
369 // AC only loop | 370 // AC only loop |
370 index = 2; | 371 index = 2; |
| 372 thr = _mm_srai_epi16(dequant, 1); |
371 while (n_coeffs < 0) { | 373 while (n_coeffs < 0) { |
372 __m128i coeff0, coeff1; | 374 __m128i coeff0, coeff1; |
373 { | 375 { |
374 __m128i coeff0_sign, coeff1_sign; | 376 __m128i coeff0_sign, coeff1_sign; |
375 __m128i qcoeff0, qcoeff1; | 377 __m128i qcoeff0, qcoeff1; |
376 __m128i qtmp0, qtmp1; | 378 __m128i qtmp0, qtmp1; |
377 | 379 |
378 assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); | 380 assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); |
379 coeff0 = *in[index]; | 381 coeff0 = *in[index]; |
380 coeff1 = *in[index + 1]; | 382 coeff1 = *in[index + 1]; |
381 | 383 |
382 // Poor man's sign extract | 384 // Poor man's sign extract |
383 coeff0_sign = _mm_srai_epi16(coeff0, 15); | 385 coeff0_sign = _mm_srai_epi16(coeff0, 15); |
384 coeff1_sign = _mm_srai_epi16(coeff1, 15); | 386 coeff1_sign = _mm_srai_epi16(coeff1, 15); |
385 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); | 387 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); |
386 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); | 388 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); |
387 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); | 389 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
388 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); | 390 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
389 | 391 |
390 qcoeff0 = _mm_adds_epi16(qcoeff0, round); | 392 nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | |
391 qcoeff1 = _mm_adds_epi16(qcoeff1, round); | 393 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); |
392 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); | |
393 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); | |
394 | 394 |
395 // Reinsert signs | 395 if (nzflag) { |
396 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); | 396 qcoeff0 = _mm_adds_epi16(qcoeff0, round); |
397 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); | 397 qcoeff1 = _mm_adds_epi16(qcoeff1, round); |
398 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); | 398 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); |
399 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); | 399 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); |
400 | 400 |
401 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); | 401 // Reinsert signs |
402 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); | 402 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); |
| 403 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); |
| 404 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
| 405 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
403 | 406 |
404 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); | 407 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); |
405 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); | 408 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); |
406 | 409 |
407 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); | 410 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); |
408 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); | 411 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); |
| 412 |
| 413 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); |
| 414 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); |
| 415 } else { |
| 416 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
| 417 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
| 418 |
| 419 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
| 420 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
| 421 } |
409 } | 422 } |
410 | 423 |
411 { | 424 if (nzflag) { |
412 // Scan for eob | 425 // Scan for eob |
413 __m128i zero_coeff0, zero_coeff1; | 426 __m128i zero_coeff0, zero_coeff1; |
414 __m128i nzero_coeff0, nzero_coeff1; | 427 __m128i nzero_coeff0, nzero_coeff1; |
415 __m128i iscan0, iscan1; | 428 __m128i iscan0, iscan1; |
416 __m128i eob0, eob1; | 429 __m128i eob0, eob1; |
417 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); | 430 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); |
418 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); | 431 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); |
419 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); | 432 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); |
420 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); | 433 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); |
421 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); | 434 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); |
(...skipping 25 matching lines...) Expand all Loading... |
447 do { | 460 do { |
448 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); | 461 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
449 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); | 462 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
450 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); | 463 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
451 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); | 464 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
452 n_coeffs += 8 * 2; | 465 n_coeffs += 8 * 2; |
453 } while (n_coeffs < 0); | 466 } while (n_coeffs < 0); |
454 *eob_ptr = 0; | 467 *eob_ptr = 0; |
455 } | 468 } |
456 } | 469 } |
OLD | NEW |