| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 275 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 286 } | 286 } |
| 287 | 287 |
| 288 iscan_ptr += n_coeffs; | 288 iscan_ptr += n_coeffs; |
| 289 qcoeff_ptr += n_coeffs; | 289 qcoeff_ptr += n_coeffs; |
| 290 dqcoeff_ptr += n_coeffs; | 290 dqcoeff_ptr += n_coeffs; |
| 291 n_coeffs = -n_coeffs; | 291 n_coeffs = -n_coeffs; |
| 292 zero = _mm_setzero_si128(); | 292 zero = _mm_setzero_si128(); |
| 293 | 293 |
| 294 if (!skip_block) { | 294 if (!skip_block) { |
| 295 __m128i eob; | 295 __m128i eob; |
| 296 __m128i round, quant, dequant; | 296 __m128i round, quant, dequant, thr; |
| 297 int16_t nzflag; |
| 297 { | 298 { |
| 298 __m128i coeff0, coeff1; | 299 __m128i coeff0, coeff1; |
| 299 | 300 |
| 300 // Setup global values | 301 // Setup global values |
| 301 { | 302 { |
| 302 round = _mm_load_si128((const __m128i*)round_ptr); | 303 round = _mm_load_si128((const __m128i*)round_ptr); |
| 303 quant = _mm_load_si128((const __m128i*)quant_ptr); | 304 quant = _mm_load_si128((const __m128i*)quant_ptr); |
| 304 dequant = _mm_load_si128((const __m128i*)dequant_ptr); | 305 dequant = _mm_load_si128((const __m128i*)dequant_ptr); |
| 305 } | 306 } |
| 306 | 307 |
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 361 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); | 362 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); |
| 362 eob = _mm_and_si128(iscan0, nzero_coeff0); | 363 eob = _mm_and_si128(iscan0, nzero_coeff0); |
| 363 eob1 = _mm_and_si128(iscan1, nzero_coeff1); | 364 eob1 = _mm_and_si128(iscan1, nzero_coeff1); |
| 364 eob = _mm_max_epi16(eob, eob1); | 365 eob = _mm_max_epi16(eob, eob1); |
| 365 } | 366 } |
| 366 n_coeffs += 8 * 2; | 367 n_coeffs += 8 * 2; |
| 367 } | 368 } |
| 368 | 369 |
| 369 // AC only loop | 370 // AC only loop |
| 370 index = 2; | 371 index = 2; |
| 372 thr = _mm_srai_epi16(dequant, 1); |
| 371 while (n_coeffs < 0) { | 373 while (n_coeffs < 0) { |
| 372 __m128i coeff0, coeff1; | 374 __m128i coeff0, coeff1; |
| 373 { | 375 { |
| 374 __m128i coeff0_sign, coeff1_sign; | 376 __m128i coeff0_sign, coeff1_sign; |
| 375 __m128i qcoeff0, qcoeff1; | 377 __m128i qcoeff0, qcoeff1; |
| 376 __m128i qtmp0, qtmp1; | 378 __m128i qtmp0, qtmp1; |
| 377 | 379 |
| 378 assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); | 380 assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); |
| 379 coeff0 = *in[index]; | 381 coeff0 = *in[index]; |
| 380 coeff1 = *in[index + 1]; | 382 coeff1 = *in[index + 1]; |
| 381 | 383 |
| 382 // Poor man's sign extract | 384 // Poor man's sign extract |
| 383 coeff0_sign = _mm_srai_epi16(coeff0, 15); | 385 coeff0_sign = _mm_srai_epi16(coeff0, 15); |
| 384 coeff1_sign = _mm_srai_epi16(coeff1, 15); | 386 coeff1_sign = _mm_srai_epi16(coeff1, 15); |
| 385 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); | 387 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); |
| 386 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); | 388 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); |
| 387 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); | 389 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
| 388 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); | 390 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
| 389 | 391 |
| 390 qcoeff0 = _mm_adds_epi16(qcoeff0, round); | 392 nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | |
| 391 qcoeff1 = _mm_adds_epi16(qcoeff1, round); | 393 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); |
| 392 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); | |
| 393 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); | |
| 394 | 394 |
| 395 // Reinsert signs | 395 if (nzflag) { |
| 396 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); | 396 qcoeff0 = _mm_adds_epi16(qcoeff0, round); |
| 397 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); | 397 qcoeff1 = _mm_adds_epi16(qcoeff1, round); |
| 398 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); | 398 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); |
| 399 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); | 399 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); |
| 400 | 400 |
| 401 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); | 401 // Reinsert signs |
| 402 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); | 402 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); |
| 403 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); |
| 404 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
| 405 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
| 403 | 406 |
| 404 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); | 407 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); |
| 405 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); | 408 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); |
| 406 | 409 |
| 407 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); | 410 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); |
| 408 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); | 411 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); |
| 412 |
| 413 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); |
| 414 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); |
| 415 } else { |
| 416 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
| 417 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
| 418 |
| 419 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
| 420 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
| 421 } |
| 409 } | 422 } |
| 410 | 423 |
| 411 { | 424 if (nzflag) { |
| 412 // Scan for eob | 425 // Scan for eob |
| 413 __m128i zero_coeff0, zero_coeff1; | 426 __m128i zero_coeff0, zero_coeff1; |
| 414 __m128i nzero_coeff0, nzero_coeff1; | 427 __m128i nzero_coeff0, nzero_coeff1; |
| 415 __m128i iscan0, iscan1; | 428 __m128i iscan0, iscan1; |
| 416 __m128i eob0, eob1; | 429 __m128i eob0, eob1; |
| 417 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); | 430 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); |
| 418 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); | 431 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); |
| 419 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); | 432 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); |
| 420 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); | 433 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); |
| 421 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); | 434 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); |
| (...skipping 25 matching lines...) Expand all Loading... |
| 447 do { | 460 do { |
| 448 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); | 461 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
| 449 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); | 462 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
| 450 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); | 463 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
| 451 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); | 464 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
| 452 n_coeffs += 8 * 2; | 465 n_coeffs += 8 * 2; |
| 453 } while (n_coeffs < 0); | 466 } while (n_coeffs < 0); |
| 454 *eob_ptr = 0; | 467 *eob_ptr = 0; |
| 455 } | 468 } |
| 456 } | 469 } |
| OLD | NEW |