OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 212 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
223 | 223 |
224 void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, | 224 void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, |
225 int skip_block, const int16_t* zbin_ptr, | 225 int skip_block, const int16_t* zbin_ptr, |
226 const int16_t* round_ptr, const int16_t* quant_ptr, | 226 const int16_t* round_ptr, const int16_t* quant_ptr, |
227 const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, | 227 const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, |
228 int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, | 228 int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, |
229 uint16_t* eob_ptr, | 229 uint16_t* eob_ptr, |
230 const int16_t* scan_ptr, | 230 const int16_t* scan_ptr, |
231 const int16_t* iscan_ptr) { | 231 const int16_t* iscan_ptr) { |
232 __m128i zero; | 232 __m128i zero; |
| 233 __m128i thr; |
| 234 int16_t nzflag; |
233 (void)scan_ptr; | 235 (void)scan_ptr; |
234 (void)zbin_ptr; | 236 (void)zbin_ptr; |
235 (void)quant_shift_ptr; | 237 (void)quant_shift_ptr; |
236 | 238 |
237 coeff_ptr += n_coeffs; | 239 coeff_ptr += n_coeffs; |
238 iscan_ptr += n_coeffs; | 240 iscan_ptr += n_coeffs; |
239 qcoeff_ptr += n_coeffs; | 241 qcoeff_ptr += n_coeffs; |
240 dqcoeff_ptr += n_coeffs; | 242 dqcoeff_ptr += n_coeffs; |
241 n_coeffs = -n_coeffs; | 243 n_coeffs = -n_coeffs; |
242 zero = _mm_setzero_si128(); | 244 zero = _mm_setzero_si128(); |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
309 // Add one to convert from indices to counts | 311 // Add one to convert from indices to counts |
310 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); | 312 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); |
311 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); | 313 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); |
312 eob = _mm_and_si128(iscan0, nzero_coeff0); | 314 eob = _mm_and_si128(iscan0, nzero_coeff0); |
313 eob1 = _mm_and_si128(iscan1, nzero_coeff1); | 315 eob1 = _mm_and_si128(iscan1, nzero_coeff1); |
314 eob = _mm_max_epi16(eob, eob1); | 316 eob = _mm_max_epi16(eob, eob1); |
315 } | 317 } |
316 n_coeffs += 8 * 2; | 318 n_coeffs += 8 * 2; |
317 } | 319 } |
318 | 320 |
| 321 thr = _mm_srai_epi16(dequant, 1); |
| 322 |
319 // AC only loop | 323 // AC only loop |
320 while (n_coeffs < 0) { | 324 while (n_coeffs < 0) { |
321 __m128i coeff0, coeff1; | 325 __m128i coeff0, coeff1; |
322 { | 326 { |
323 __m128i coeff0_sign, coeff1_sign; | 327 __m128i coeff0_sign, coeff1_sign; |
324 __m128i qcoeff0, qcoeff1; | 328 __m128i qcoeff0, qcoeff1; |
325 __m128i qtmp0, qtmp1; | 329 __m128i qtmp0, qtmp1; |
326 | 330 |
327 coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); | 331 coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); |
328 coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); | 332 coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); |
329 | 333 |
330 // Poor man's sign extract | 334 // Poor man's sign extract |
331 coeff0_sign = _mm_srai_epi16(coeff0, 15); | 335 coeff0_sign = _mm_srai_epi16(coeff0, 15); |
332 coeff1_sign = _mm_srai_epi16(coeff1, 15); | 336 coeff1_sign = _mm_srai_epi16(coeff1, 15); |
333 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); | 337 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); |
334 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); | 338 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); |
335 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); | 339 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
336 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); | 340 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
337 | 341 |
338 qcoeff0 = _mm_adds_epi16(qcoeff0, round); | 342 nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | |
339 qcoeff1 = _mm_adds_epi16(qcoeff1, round); | 343 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); |
340 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); | |
341 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); | |
342 | 344 |
343 // Reinsert signs | 345 if (nzflag) { |
344 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); | 346 qcoeff0 = _mm_adds_epi16(qcoeff0, round); |
345 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); | 347 qcoeff1 = _mm_adds_epi16(qcoeff1, round); |
346 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); | 348 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); |
347 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); | 349 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); |
348 | 350 |
349 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); | 351 // Reinsert signs |
350 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); | 352 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); |
| 353 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); |
| 354 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
| 355 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
351 | 356 |
352 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); | 357 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); |
353 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); | 358 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); |
354 | 359 |
355 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); | 360 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); |
356 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); | 361 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); |
| 362 |
| 363 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); |
| 364 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); |
| 365 } else { |
| 366 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
| 367 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
| 368 |
| 369 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
| 370 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
| 371 } |
357 } | 372 } |
358 | 373 |
359 { | 374 if (nzflag) { |
360 // Scan for eob | 375 // Scan for eob |
361 __m128i zero_coeff0, zero_coeff1; | 376 __m128i zero_coeff0, zero_coeff1; |
362 __m128i nzero_coeff0, nzero_coeff1; | 377 __m128i nzero_coeff0, nzero_coeff1; |
363 __m128i iscan0, iscan1; | 378 __m128i iscan0, iscan1; |
364 __m128i eob0, eob1; | 379 __m128i eob0, eob1; |
365 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); | 380 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); |
366 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); | 381 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); |
367 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); | 382 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); |
368 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); | 383 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); |
369 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); | 384 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); |
(...skipping 24 matching lines...) Expand all Loading... |
394 do { | 409 do { |
395 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); | 410 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
396 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); | 411 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
397 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); | 412 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
398 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); | 413 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
399 n_coeffs += 8 * 2; | 414 n_coeffs += 8 * 2; |
400 } while (n_coeffs < 0); | 415 } while (n_coeffs < 0); |
401 *eob_ptr = 0; | 416 *eob_ptr = 0; |
402 } | 417 } |
403 } | 418 } |
OLD | NEW |