OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 49 matching lines...)
60 exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ | 60 exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ |
61 exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); | 61 exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); |
62 | 62 |
63 #define MERGE_WITH_SRC(src_reg, reg) \ | 63 #define MERGE_WITH_SRC(src_reg, reg) \ |
64 exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ | 64 exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ |
65 exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); | 65 exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); |
66 | 66 |
67 #define LOAD_SRC_DST \ | 67 #define LOAD_SRC_DST \ |
68 /* load source and destination */ \ | 68 /* load source and destination */ \ |
69 src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ | 69 src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ |
70 dst_reg = _mm256_load_si256((__m256i const *) (dst)); | 70 dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); |
71 | 71 |
72 #define AVG_NEXT_SRC(src_reg, size_stride) \ | 72 #define AVG_NEXT_SRC(src_reg, size_stride) \ |
73 src_next_reg = _mm256_loadu_si256((__m256i const *) \ | 73 src_next_reg = _mm256_loadu_si256((__m256i const *) \ |
74 (src + size_stride)); \ | 74 (src + size_stride)); \ |
75 /* average between current and next stride source */ \ | 75 /* average between current and next stride source */ \ |
76 src_reg = _mm256_avg_epu8(src_reg, src_next_reg); | 76 src_reg = _mm256_avg_epu8(src_reg, src_next_reg); |
77 | 77 |
78 #define MERGE_NEXT_SRC(src_reg, size_stride) \ | 78 #define MERGE_NEXT_SRC(src_reg, size_stride) \ |
79 src_next_reg = _mm256_loadu_si256((__m256i const *) \ | 79 src_next_reg = _mm256_loadu_si256((__m256i const *) \ |
80 (src + size_stride)); \ | 80 (src + size_stride)); \ |
(...skipping 245 matching lines...)
326 int i, sum; | 326 int i, sum; |
327 sum_reg = _mm256_set1_epi16(0); | 327 sum_reg = _mm256_set1_epi16(0); |
328 sse_reg = _mm256_set1_epi16(0); | 328 sse_reg = _mm256_set1_epi16(0); |
329 zero_reg = _mm256_set1_epi16(0); | 329 zero_reg = _mm256_set1_epi16(0); |
330 | 330 |
331 // x_offset = 0 and y_offset = 0 | 331 // x_offset = 0 and y_offset = 0 |
332 if (x_offset == 0) { | 332 if (x_offset == 0) { |
333 if (y_offset == 0) { | 333 if (y_offset == 0) { |
334 for (i = 0; i < height ; i++) { | 334 for (i = 0; i < height ; i++) { |
335 LOAD_SRC_DST | 335 LOAD_SRC_DST |
336 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 336 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
337 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 337 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
338 sec+= sec_stride; | 338 sec+= sec_stride; |
339 // expand each byte to 2 bytes | 339 // expand each byte to 2 bytes |
340 MERGE_WITH_SRC(src_reg, zero_reg) | 340 MERGE_WITH_SRC(src_reg, zero_reg) |
341 CALC_SUM_SSE_INSIDE_LOOP | 341 CALC_SUM_SSE_INSIDE_LOOP |
342 src+= src_stride; | 342 src+= src_stride; |
343 dst+= dst_stride; | 343 dst+= dst_stride; |
344 } | 344 } |
345 } else if (y_offset == 8) { | 345 } else if (y_offset == 8) { |
346 __m256i src_next_reg; | 346 __m256i src_next_reg; |
347 for (i = 0; i < height ; i++) { | 347 for (i = 0; i < height ; i++) { |
348 LOAD_SRC_DST | 348 LOAD_SRC_DST |
349 AVG_NEXT_SRC(src_reg, src_stride) | 349 AVG_NEXT_SRC(src_reg, src_stride) |
350 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 350 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
351 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 351 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
352 sec+= sec_stride; | 352 sec+= sec_stride; |
353 // expand each byte to 2 bytes | 353 // expand each byte to 2 bytes |
354 MERGE_WITH_SRC(src_reg, zero_reg) | 354 MERGE_WITH_SRC(src_reg, zero_reg) |
355 CALC_SUM_SSE_INSIDE_LOOP | 355 CALC_SUM_SSE_INSIDE_LOOP |
356 src+= src_stride; | 356 src+= src_stride; |
357 dst+= dst_stride; | 357 dst+= dst_stride; |
358 } | 358 } |
359 // x_offset = 0 and y_offset = bilin interpolation | 359 // x_offset = 0 and y_offset = bilin interpolation |
360 } else { | 360 } else { |
361 __m256i filter, pw8, src_next_reg; | 361 __m256i filter, pw8, src_next_reg; |
362 | 362 |
363 y_offset <<= 5; | 363 y_offset <<= 5; |
364 filter = _mm256_load_si256((__m256i const *) | 364 filter = _mm256_load_si256((__m256i const *) |
365 (bilinear_filters_avx2 + y_offset)); | 365 (bilinear_filters_avx2 + y_offset)); |
366 pw8 = _mm256_set1_epi16(8); | 366 pw8 = _mm256_set1_epi16(8); |
367 for (i = 0; i < height ; i++) { | 367 for (i = 0; i < height ; i++) { |
368 LOAD_SRC_DST | 368 LOAD_SRC_DST |
369 MERGE_NEXT_SRC(src_reg, src_stride) | 369 MERGE_NEXT_SRC(src_reg, src_stride) |
370 FILTER_SRC(filter) | 370 FILTER_SRC(filter) |
371 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 371 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
372 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 372 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
373 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 373 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
374 sec+= sec_stride; | 374 sec+= sec_stride; |
375 MERGE_WITH_SRC(src_reg, zero_reg) | 375 MERGE_WITH_SRC(src_reg, zero_reg) |
376 CALC_SUM_SSE_INSIDE_LOOP | 376 CALC_SUM_SSE_INSIDE_LOOP |
377 src+= src_stride; | 377 src+= src_stride; |
378 dst+= dst_stride; | 378 dst+= dst_stride; |
379 } | 379 } |
380 } | 380 } |
381 // x_offset = 8 and y_offset = 0 | 381 // x_offset = 8 and y_offset = 0 |
382 } else if (x_offset == 8) { | 382 } else if (x_offset == 8) { |
383 if (y_offset == 0) { | 383 if (y_offset == 0) { |
384 __m256i src_next_reg; | 384 __m256i src_next_reg; |
385 for (i = 0; i < height ; i++) { | 385 for (i = 0; i < height ; i++) { |
386 LOAD_SRC_DST | 386 LOAD_SRC_DST |
387 AVG_NEXT_SRC(src_reg, 1) | 387 AVG_NEXT_SRC(src_reg, 1) |
388 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 388 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
389 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 389 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
390 sec+= sec_stride; | 390 sec+= sec_stride; |
391 // expand each byte to 2 bytes | 391 // expand each byte to 2 bytes |
392 MERGE_WITH_SRC(src_reg, zero_reg) | 392 MERGE_WITH_SRC(src_reg, zero_reg) |
393 CALC_SUM_SSE_INSIDE_LOOP | 393 CALC_SUM_SSE_INSIDE_LOOP |
394 src+= src_stride; | 394 src+= src_stride; |
395 dst+= dst_stride; | 395 dst+= dst_stride; |
396 } | 396 } |
397 // x_offset = 8 and y_offset = 8 | 397 // x_offset = 8 and y_offset = 8 |
398 } else if (y_offset == 8) { | 398 } else if (y_offset == 8) { |
399 __m256i src_next_reg, src_avg; | 399 __m256i src_next_reg, src_avg; |
400 // load source and a second source starting at the | 400 // load source and a second source starting at the |
401 // next byte | 401 // next byte |
402 src_reg = _mm256_loadu_si256((__m256i const *) (src)); | 402 src_reg = _mm256_loadu_si256((__m256i const *) (src)); |
403 AVG_NEXT_SRC(src_reg, 1) | 403 AVG_NEXT_SRC(src_reg, 1) |
404 for (i = 0; i < height ; i++) { | 404 for (i = 0; i < height ; i++) { |
405 // save current source average | 405 // save current source average |
406 src_avg = src_reg; | 406 src_avg = src_reg; |
407 src+= src_stride; | 407 src+= src_stride; |
408 LOAD_SRC_DST | 408 LOAD_SRC_DST |
409 AVG_NEXT_SRC(src_reg, 1) | 409 AVG_NEXT_SRC(src_reg, 1) |
410 // average the previous and current averages | 410 // average the previous and current averages |
411 src_avg = _mm256_avg_epu8(src_avg, src_reg); | 411 src_avg = _mm256_avg_epu8(src_avg, src_reg); |
412 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 412 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
413 src_avg = _mm256_avg_epu8(src_avg, sec_reg); | 413 src_avg = _mm256_avg_epu8(src_avg, sec_reg); |
414 sec+= sec_stride; | 414 sec+= sec_stride; |
415 // expand each byte to 2 bytes | 415 // expand each byte to 2 bytes |
416 MERGE_WITH_SRC(src_avg, zero_reg) | 416 MERGE_WITH_SRC(src_avg, zero_reg) |
417 CALC_SUM_SSE_INSIDE_LOOP | 417 CALC_SUM_SSE_INSIDE_LOOP |
418 dst+= dst_stride; | 418 dst+= dst_stride; |
419 } | 419 } |
420 // x_offset = 8 and y_offset = bilin interpolation | 420 // x_offset = 8 and y_offset = bilin interpolation |
421 } else { | 421 } else { |
422 __m256i filter, pw8, src_next_reg, src_avg; | 422 __m256i filter, pw8, src_next_reg, src_avg; |
423 y_offset <<= 5; | 423 y_offset <<= 5; |
424 filter = _mm256_load_si256((__m256i const *) | 424 filter = _mm256_load_si256((__m256i const *) |
425 (bilinear_filters_avx2 + y_offset)); | 425 (bilinear_filters_avx2 + y_offset)); |
426 pw8 = _mm256_set1_epi16(8); | 426 pw8 = _mm256_set1_epi16(8); |
427 // load source and a second source starting at the | 427 // load source and a second source starting at the |
428 // next byte | 428 // next byte |
429 src_reg = _mm256_loadu_si256((__m256i const *) (src)); | 429 src_reg = _mm256_loadu_si256((__m256i const *) (src)); |
430 AVG_NEXT_SRC(src_reg, 1) | 430 AVG_NEXT_SRC(src_reg, 1) |
431 for (i = 0; i < height ; i++) { | 431 for (i = 0; i < height ; i++) { |
432 // save current source average | 432 // save current source average |
433 src_avg = src_reg; | 433 src_avg = src_reg; |
434 src+= src_stride; | 434 src+= src_stride; |
435 LOAD_SRC_DST | 435 LOAD_SRC_DST |
436 AVG_NEXT_SRC(src_reg, 1) | 436 AVG_NEXT_SRC(src_reg, 1) |
437 MERGE_WITH_SRC(src_avg, src_reg) | 437 MERGE_WITH_SRC(src_avg, src_reg) |
438 FILTER_SRC(filter) | 438 FILTER_SRC(filter) |
439 src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 439 src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
440 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 440 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
441 src_avg = _mm256_avg_epu8(src_avg, sec_reg); | 441 src_avg = _mm256_avg_epu8(src_avg, sec_reg); |
442 // expand each byte to 2 bytes | 442 // expand each byte to 2 bytes |
443 MERGE_WITH_SRC(src_avg, zero_reg) | 443 MERGE_WITH_SRC(src_avg, zero_reg) |
444 sec+= sec_stride; | 444 sec+= sec_stride; |
445 CALC_SUM_SSE_INSIDE_LOOP | 445 CALC_SUM_SSE_INSIDE_LOOP |
446 dst+= dst_stride; | 446 dst+= dst_stride; |
447 } | 447 } |
448 } | 448 } |
449 // x_offset = bilin interpolation and y_offset = 0 | 449 // x_offset = bilin interpolation and y_offset = 0 |
450 } else { | 450 } else { |
451 if (y_offset == 0) { | 451 if (y_offset == 0) { |
452 __m256i filter, pw8, src_next_reg; | 452 __m256i filter, pw8, src_next_reg; |
453 x_offset <<= 5; | 453 x_offset <<= 5; |
454 filter = _mm256_load_si256((__m256i const *) | 454 filter = _mm256_load_si256((__m256i const *) |
455 (bilinear_filters_avx2 + x_offset)); | 455 (bilinear_filters_avx2 + x_offset)); |
456 pw8 = _mm256_set1_epi16(8); | 456 pw8 = _mm256_set1_epi16(8); |
457 for (i = 0; i < height ; i++) { | 457 for (i = 0; i < height ; i++) { |
458 LOAD_SRC_DST | 458 LOAD_SRC_DST |
459 MERGE_NEXT_SRC(src_reg, 1) | 459 MERGE_NEXT_SRC(src_reg, 1) |
460 FILTER_SRC(filter) | 460 FILTER_SRC(filter) |
461 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 461 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
462 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 462 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
463 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 463 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
464 MERGE_WITH_SRC(src_reg, zero_reg) | 464 MERGE_WITH_SRC(src_reg, zero_reg) |
465 sec+= sec_stride; | 465 sec+= sec_stride; |
466 CALC_SUM_SSE_INSIDE_LOOP | 466 CALC_SUM_SSE_INSIDE_LOOP |
467 src+= src_stride; | 467 src+= src_stride; |
468 dst+= dst_stride; | 468 dst+= dst_stride; |
469 } | 469 } |
470 // x_offset = bilin interpolation and y_offset = 8 | 470 // x_offset = bilin interpolation and y_offset = 8 |
471 } else if (y_offset == 8) { | 471 } else if (y_offset == 8) { |
472 __m256i filter, pw8, src_next_reg, src_pack; | 472 __m256i filter, pw8, src_next_reg, src_pack; |
473 x_offset <<= 5; | 473 x_offset <<= 5; |
474 filter = _mm256_load_si256((__m256i const *) | 474 filter = _mm256_load_si256((__m256i const *) |
475 (bilinear_filters_avx2 + x_offset)); | 475 (bilinear_filters_avx2 + x_offset)); |
476 pw8 = _mm256_set1_epi16(8); | 476 pw8 = _mm256_set1_epi16(8); |
477 src_reg = _mm256_loadu_si256((__m256i const *) (src)); | 477 src_reg = _mm256_loadu_si256((__m256i const *) (src)); |
478 MERGE_NEXT_SRC(src_reg, 1) | 478 MERGE_NEXT_SRC(src_reg, 1) |
479 FILTER_SRC(filter) | 479 FILTER_SRC(filter) |
480 // pack 16-bit values back to 8 bits in the low and high lanes | 480 // pack 16-bit values back to 8 bits in the low and high lanes |
481 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 481 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
482 for (i = 0; i < height ; i++) { | 482 for (i = 0; i < height ; i++) { |
483 src+= src_stride; | 483 src+= src_stride; |
484 LOAD_SRC_DST | 484 LOAD_SRC_DST |
485 MERGE_NEXT_SRC(src_reg, 1) | 485 MERGE_NEXT_SRC(src_reg, 1) |
486 FILTER_SRC(filter) | 486 FILTER_SRC(filter) |
487 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 487 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
488 // average the previous pack with the current | 488 // average the previous pack with the current |
489 src_pack = _mm256_avg_epu8(src_pack, src_reg); | 489 src_pack = _mm256_avg_epu8(src_pack, src_reg); |
490 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 490 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
491 src_pack = _mm256_avg_epu8(src_pack, sec_reg); | 491 src_pack = _mm256_avg_epu8(src_pack, sec_reg); |
492 sec+= sec_stride; | 492 sec+= sec_stride; |
493 MERGE_WITH_SRC(src_pack, zero_reg) | 493 MERGE_WITH_SRC(src_pack, zero_reg) |
494 src_pack = src_reg; | 494 src_pack = src_reg; |
495 CALC_SUM_SSE_INSIDE_LOOP | 495 CALC_SUM_SSE_INSIDE_LOOP |
496 dst+= dst_stride; | 496 dst+= dst_stride; |
497 } | 497 } |
498 // x_offset = bilin interpolation and y_offset = bilin interpolation | 498 // x_offset = bilin interpolation and y_offset = bilin interpolation |
499 } else { | 499 } else { |
500 __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; | 500 __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; |
(...skipping 16 matching lines...)
517 src+= src_stride; | 517 src+= src_stride; |
518 LOAD_SRC_DST | 518 LOAD_SRC_DST |
519 MERGE_NEXT_SRC(src_reg, 1) | 519 MERGE_NEXT_SRC(src_reg, 1) |
520 FILTER_SRC(xfilter) | 520 FILTER_SRC(xfilter) |
521 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 521 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
522 // merge previous pack to current pack source | 522 // merge previous pack to current pack source |
523 MERGE_WITH_SRC(src_pack, src_reg) | 523 MERGE_WITH_SRC(src_pack, src_reg) |
524 // filter the source | 524 // filter the source |
525 FILTER_SRC(yfilter) | 525 FILTER_SRC(yfilter) |
526 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 526 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
527 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 527 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
528 src_pack = _mm256_avg_epu8(src_pack, sec_reg); | 528 src_pack = _mm256_avg_epu8(src_pack, sec_reg); |
529 MERGE_WITH_SRC(src_pack, zero_reg) | 529 MERGE_WITH_SRC(src_pack, zero_reg) |
530 src_pack = src_reg; | 530 src_pack = src_reg; |
531 sec+= sec_stride; | 531 sec+= sec_stride; |
532 CALC_SUM_SSE_INSIDE_LOOP | 532 CALC_SUM_SSE_INSIDE_LOOP |
533 dst+= dst_stride; | 533 dst+= dst_stride; |
534 } | 534 } |
535 } | 535 } |
536 } | 536 } |
537 CALC_SUM_AND_SSE | 537 CALC_SUM_AND_SSE |
538 return sum; | 538 return sum; |
539 } | 539 } |
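Note on the change itself: every `_mm256_load_si256` on the `dst` and `sec` pointers becomes `_mm256_loadu_si256`. The aligned form requires a 32-byte-aligned address and can fault otherwise; nothing guarantees that alignment for `dst` and `sec` (or their strides), while the `src` loads were already unaligned. The `filter` load can stay aligned, presumably because `bilinear_filters_avx2` is declared with 32-byte alignment. A minimal standalone sketch of the distinction (the buffer name and offset are illustrative, not from libvpx; compile with -mavx2):

#include <immintrin.h>
#include <stdint.h>
#include <stdlib.h>

int main(void) {
  /* 64 bytes, 32-byte aligned (C11 aligned_alloc). */
  uint8_t *buf = aligned_alloc(32, 64);
  if (!buf) return 1;
  for (int i = 0; i < 64; ++i) buf[i] = (uint8_t)i;

  /* Fine: buf itself is 32-byte aligned. */
  __m256i a = _mm256_load_si256((__m256i const *)buf);

  /* buf + 1 is not 32-byte aligned; _mm256_load_si256 here can fault,
   * so the unaligned form must be used -- the same reason this diff
   * switches the dst/sec loads to _mm256_loadu_si256. */
  __m256i b = _mm256_loadu_si256((__m256i const *)(buf + 1));

  (void)a;
  (void)b;
  free(buf);
  return 0;
}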
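For reference, the arithmetic behind the filtering paths: with pw8 = 8 and the shift-right-by-4 in FILTER_SRC (lines 60-61), each output pixel is the rounded bilinear blend (a*(16-w) + b*w + 8) >> 4 for a weight w in 0..16, and `offset <<= 5` indexes a filter table whose entries are 32 bytes apart. The offset == 8 fast paths use _mm256_avg_epu8 instead, which computes (a + b + 1) >> 1 per byte and is exactly the w == 8 case. A scalar sketch of this model (inferred from the visible macros, not code from libvpx):

#include <stdint.h>

/* Rounded bilinear blend matching FILTER_SRC's add-8, shift-4 step:
 * the weights w and 16 - w sum to 16, so >> 4 renormalizes. */
static uint8_t bilinear_blend(uint8_t a, uint8_t b, int w /* 0..16 */) {
  return (uint8_t)((a * (16 - w) + b * w + 8) >> 4);
}

/* What _mm256_avg_epu8 does per byte: (a + b + 1) >> 1. With w == 8,
 * (8*a + 8*b + 8) >> 4 == (a + b + 1) >> 1, which is why the half-pel
 * paths can use the cheaper average instead of the full filter. */
static uint8_t half_pel_avg(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}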