OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 205 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
216 do { | 216 do { |
217 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); | 217 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
218 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); | 218 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
219 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); | 219 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
220 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); | 220 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
221 n_coeffs += 8 * 2; | 221 n_coeffs += 8 * 2; |
222 } while (n_coeffs < 0); | 222 } while (n_coeffs < 0); |
223 *eob_ptr = 0; | 223 *eob_ptr = 0; |
224 } | 224 } |
225 } | 225 } |
| 226 |
// Quantizes and dequantizes 16 consecutive int16 coefficients.
//
// For each lane: q = sign(c) * (((|c| + round) * quant) >> 16), with the
// addition saturating; dq = low 16 bits of (q * dequant).
// The "_lo" vectors apply to coefficients 0..7, the "_hi" vectors to
// coefficients 8..15. The quantized values are stored to |qcoeff|, the
// dequantized values to |dqcoeff|, and the dequantized vectors are also
// handed back through |dq_lo| / |dq_hi| for the eob scan.
// All three buffers are accessed with aligned 128-bit loads/stores.
// Vector arguments are passed by pointer rather than by value.
static void fp_quantize_16(const int16_t* coeff, int16_t* qcoeff,
                           int16_t* dqcoeff, const __m128i* round_lo,
                           const __m128i* round_hi, const __m128i* quant_lo,
                           const __m128i* quant_hi, const __m128i* dequant_lo,
                           const __m128i* dequant_hi, __m128i* dq_lo,
                           __m128i* dq_hi) {
  const __m128i c_lo = _mm_load_si128((const __m128i*)coeff);
  const __m128i c_hi = _mm_load_si128((const __m128i*)coeff + 1);
  // Arithmetic shift by 15: all-ones for negative lanes, zero otherwise.
  const __m128i sign_lo = _mm_srai_epi16(c_lo, 15);
  const __m128i sign_hi = _mm_srai_epi16(c_hi, 15);
  __m128i q_lo, q_hi;

  // Absolute value: (x ^ sign) - sign.
  q_lo = _mm_sub_epi16(_mm_xor_si128(c_lo, sign_lo), sign_lo);
  q_hi = _mm_sub_epi16(_mm_xor_si128(c_hi, sign_hi), sign_hi);

  // Saturating add of the rounding term, then keep the high 16 bits of the
  // 16x16 product with the quantizer.
  q_lo = _mm_mulhi_epi16(_mm_adds_epi16(q_lo, *round_lo), *quant_lo);
  q_hi = _mm_mulhi_epi16(_mm_adds_epi16(q_hi, *round_hi), *quant_hi);

  // Reinsert signs with the same xor/sub trick.
  q_lo = _mm_sub_epi16(_mm_xor_si128(q_lo, sign_lo), sign_lo);
  q_hi = _mm_sub_epi16(_mm_xor_si128(q_hi, sign_hi), sign_hi);

  _mm_store_si128((__m128i*)qcoeff, q_lo);
  _mm_store_si128((__m128i*)qcoeff + 1, q_hi);

  *dq_lo = _mm_mullo_epi16(q_lo, *dequant_lo);
  *dq_hi = _mm_mullo_epi16(q_hi, *dequant_hi);

  _mm_store_si128((__m128i*)dqcoeff, *dq_lo);
  _mm_store_si128((__m128i*)dqcoeff + 1, *dq_hi);
}

// Returns, per lane, max over the two 8-lane halves of (iscan + 1) for lanes
// whose dequantized value is non-zero, and 0 for lanes whose value is zero.
// |iscan| is read with aligned 128-bit loads.
static __m128i fp_scan_eob_16(const int16_t* iscan, const __m128i* dq_lo,
                              const __m128i* dq_hi) {
  const __m128i zero = _mm_setzero_si128();
  // Double compare-with-zero yields all-ones (-1) exactly in non-zero lanes.
  const __m128i nz_lo = _mm_cmpeq_epi16(_mm_cmpeq_epi16(*dq_lo, zero), zero);
  const __m128i nz_hi = _mm_cmpeq_epi16(_mm_cmpeq_epi16(*dq_hi, zero), zero);
  __m128i pos_lo = _mm_load_si128((const __m128i*)iscan);
  __m128i pos_hi = _mm_load_si128((const __m128i*)iscan + 1);

  // Subtracting the -1 mask adds one: converts indices to counts.
  pos_lo = _mm_sub_epi16(pos_lo, nz_lo);
  pos_hi = _mm_sub_epi16(pos_hi, nz_hi);
  // Discard lanes whose coefficient is zero.
  pos_lo = _mm_and_si128(pos_lo, nz_lo);
  pos_hi = _mm_and_si128(pos_hi, nz_hi);
  return _mm_max_epi16(pos_lo, pos_hi);
}

// SSE2 quantizer that uses only the round / quant / dequant tables.
//
//   coeff_ptr    input coefficients (read only).
//   n_coeffs     number of coefficients; consumed 16 at a time, so it is
//                expected to be a multiple of 16. A value <= 0 touches no
//                coefficient memory and reports an eob of 0.
//   skip_block   non-zero: zero-fill qcoeff/dqcoeff and set *eob_ptr to 0.
//   round_ptr, quant_ptr, dequant_ptr
//                8-entry tables; entry 0 is applied to the first coefficient
//                only, entries 4..7 are applied to every other coefficient.
//   qcoeff_ptr   receives the quantized coefficients.
//   dqcoeff_ptr  receives the dequantized coefficients.
//   eob_ptr      receives the largest (iscan[i] + 1) over all i whose
//                dequantized value is non-zero, or 0 if there is none.
//   iscan_ptr    per-coefficient scan positions used for the eob.
//   zbin_ptr, quant_shift_ptr, zbin_oq_value, scan_ptr
//                unused by this implementation.
//
// Every table and coefficient buffer is accessed with aligned 128-bit
// loads/stores and therefore must be 16-byte aligned.
void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t* zbin_ptr,
                          const int16_t* round_ptr, const int16_t* quant_ptr,
                          const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
                          int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
                          int zbin_oq_value, uint16_t* eob_ptr,
                          const int16_t* scan_ptr,
                          const int16_t* iscan_ptr) {
  const __m128i zero = _mm_setzero_si128();
  (void)scan_ptr;
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)zbin_oq_value;

  // Point every buffer at its end and walk a negative index up towards zero.
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;

  if (!skip_block) {
    __m128i eob = zero;

    // Guarding the first group keeps an empty block from reading or writing
    // past the caller's buffers.
    if (n_coeffs < 0) {
      const __m128i round_dc = _mm_load_si128((const __m128i*)round_ptr);
      const __m128i quant_dc = _mm_load_si128((const __m128i*)quant_ptr);
      const __m128i dequant_dc = _mm_load_si128((const __m128i*)dequant_ptr);
      // Broadcast the upper four table entries over all eight lanes; these
      // are used for everything except the very first 8 coefficients.
      const __m128i round_ac = _mm_unpackhi_epi64(round_dc, round_dc);
      const __m128i quant_ac = _mm_unpackhi_epi64(quant_dc, quant_dc);
      const __m128i dequant_ac = _mm_unpackhi_epi64(dequant_dc, dequant_dc);
      __m128i dq_lo, dq_hi;

      // DC and first 15 AC: the low half still carries the DC entry in lane 0.
      fp_quantize_16(coeff_ptr + n_coeffs, qcoeff_ptr + n_coeffs,
                     dqcoeff_ptr + n_coeffs, &round_dc, &round_ac, &quant_dc,
                     &quant_ac, &dequant_dc, &dequant_ac, &dq_lo, &dq_hi);
      eob = fp_scan_eob_16(iscan_ptr + n_coeffs, &dq_lo, &dq_hi);
      n_coeffs += 8 * 2;

      // AC only loop.
      while (n_coeffs < 0) {
        __m128i group_eob;
        fp_quantize_16(coeff_ptr + n_coeffs, qcoeff_ptr + n_coeffs,
                       dqcoeff_ptr + n_coeffs, &round_ac, &round_ac, &quant_ac,
                       &quant_ac, &dequant_ac, &dequant_ac, &dq_lo, &dq_hi);
        group_eob = fp_scan_eob_16(iscan_ptr + n_coeffs, &dq_lo, &dq_hi);
        eob = _mm_max_epi16(eob, group_eob);
        n_coeffs += 8 * 2;
      }
    }

    // Horizontal max of the eight lanes; the result ends up in lane 1.
    eob = _mm_max_epi16(eob, _mm_shuffle_epi32(eob, 0xe));
    eob = _mm_max_epi16(eob, _mm_shufflelo_epi16(eob, 0xe));
    eob = _mm_max_epi16(eob, _mm_shufflelo_epi16(eob, 0x1));
    *eob_ptr = _mm_extract_epi16(eob, 1);
  } else {
    // Skipped block: all outputs are zero. A pre-tested loop leaves the
    // buffers alone when there are no coefficients.
    while (n_coeffs < 0) {
      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
      n_coeffs += 8 * 2;
    }
    *eob_ptr = 0;
  }
}
OLD | NEW |