| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 49 matching lines...) |
| 60 exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ | 60 exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ |
| 61 exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); | 61 exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); |
| 62 | 62 |
| 63 #define MERGE_WITH_SRC(src_reg, reg) \ | 63 #define MERGE_WITH_SRC(src_reg, reg) \ |
| 64 exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ | 64 exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ |
| 65 exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); | 65 exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); |
| 66 | 66 |
| 67 #define LOAD_SRC_DST \ | 67 #define LOAD_SRC_DST \ |
| 68 /* load source and destination */ \ | 68 /* load source and destination */ \ |
| 69 src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ | 69 src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ |
| 70 dst_reg = _mm256_load_si256((__m256i const *) (dst)); | 70 dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); |
| 71 | 71 |
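Every change shown in this diff replaces _mm256_load_si256 with _mm256_loadu_si256 for the dst and sec loads (in LOAD_SRC_DST above and in the sec_reg loads further down). The aligned form requires a 32-byte-aligned address and may fault otherwise, while the unaligned form accepts any address; presumably dst and sec are caller-supplied buffers whose alignment cannot be assumed, which is also why the filter-table loads from bilinear_filters_avx2 (likely declared with 32-byte alignment) keep the aligned form. A minimal sketch of the distinction, with buf standing in for a caller-provided pointer (an illustrative name, not from the patch):

    #include <immintrin.h>
    #include <stdint.h>

    /* Illustrative only: buf stands in for a caller-provided pointer such as
     * dst or sec, whose 32-byte alignment is not guaranteed. */
    static __m256i load_row(const uint8_t *buf) {
      /* _mm256_load_si256 requires a 32-byte-aligned address and may fault on
       * a misaligned one; _mm256_loadu_si256 tolerates any alignment. */
      return _mm256_loadu_si256((const __m256i *)buf);
    }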
| 72 #define AVG_NEXT_SRC(src_reg, size_stride) \ | 72 #define AVG_NEXT_SRC(src_reg, size_stride) \ |
| 73 src_next_reg = _mm256_loadu_si256((__m256i const *) \ | 73 src_next_reg = _mm256_loadu_si256((__m256i const *) \ |
| 74 (src + size_stride)); \ | 74 (src + size_stride)); \ |
| 75 /* average the current source with the source size_stride bytes ahead */ \ | 75 /* average the current source with the source size_stride bytes ahead */ \ |
| 76 src_reg = _mm256_avg_epu8(src_reg, src_next_reg); | 76 src_reg = _mm256_avg_epu8(src_reg, src_next_reg); |
| 77 | 77 |
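AVG_NEXT_SRC handles the half-pel offsets: _mm256_avg_epu8 averages the current source with the one size_stride bytes ahead, computing (a + b + 1) >> 1 per byte, so those positions need no full bilinear filter. A scalar model of the intrinsic's per-byte behaviour (a sketch for reference, not project code):

    #include <stdint.h>

    /* Per-byte behaviour of _mm256_avg_epu8: unsigned average rounded up,
     * i.e. (a + b + 1) >> 1. */
    static uint8_t avg_epu8_scalar(uint8_t a, uint8_t b) {
      return (uint8_t)(((unsigned)a + (unsigned)b + 1) >> 1);
    }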
| 78 #define MERGE_NEXT_SRC(src_reg, size_stride) \ | 78 #define MERGE_NEXT_SRC(src_reg, size_stride) \ |
| 79 src_next_reg = _mm256_loadu_si256((__m256i const *) \ | 79 src_next_reg = _mm256_loadu_si256((__m256i const *) \ |
| 80 (src + size_stride)); \ | 80 (src + size_stride)); \ |
| (...skipping 245 matching lines...) |
| 326 int i, sum; | 326 int i, sum; |
| 327 sum_reg = _mm256_set1_epi16(0); | 327 sum_reg = _mm256_set1_epi16(0); |
| 328 sse_reg = _mm256_set1_epi16(0); | 328 sse_reg = _mm256_set1_epi16(0); |
| 329 zero_reg = _mm256_set1_epi16(0); | 329 zero_reg = _mm256_set1_epi16(0); |
| 330 | 330 |
| 331 // x_offset = 0 and y_offset = 0 | 331 // x_offset = 0 and y_offset = 0 |
| 332 if (x_offset == 0) { | 332 if (x_offset == 0) { |
| 333 if (y_offset == 0) { | 333 if (y_offset == 0) { |
| 334 for (i = 0; i < height ; i++) { | 334 for (i = 0; i < height ; i++) { |
| 335 LOAD_SRC_DST | 335 LOAD_SRC_DST |
| 336 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 336 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
| 337 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 337 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
| 338 sec+= sec_stride; | 338 sec+= sec_stride; |
| 339 // expand each byte to 2 bytes | 339 // expand each byte to 2 bytes |
| 340 MERGE_WITH_SRC(src_reg, zero_reg) | 340 MERGE_WITH_SRC(src_reg, zero_reg) |
| 341 CALC_SUM_SSE_INSIDE_LOOP | 341 CALC_SUM_SSE_INSIDE_LOOP |
| 342 src+= src_stride; | 342 src+= src_stride; |
| 343 dst+= dst_stride; | 343 dst+= dst_stride; |
| 344 } | 344 } |
| 345 } else if (y_offset == 8) { | 345 } else if (y_offset == 8) { |
| 346 __m256i src_next_reg; | 346 __m256i src_next_reg; |
| 347 for (i = 0; i < height ; i++) { | 347 for (i = 0; i < height ; i++) { |
| 348 LOAD_SRC_DST | 348 LOAD_SRC_DST |
| 349 AVG_NEXT_SRC(src_reg, src_stride) | 349 AVG_NEXT_SRC(src_reg, src_stride) |
| 350 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 350 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
| 351 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 351 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
| 352 sec+= sec_stride; | 352 sec+= sec_stride; |
| 353 // expand each byte to 2 bytes | 353 // expand each byte to 2 bytes |
| 354 MERGE_WITH_SRC(src_reg, zero_reg) | 354 MERGE_WITH_SRC(src_reg, zero_reg) |
| 355 CALC_SUM_SSE_INSIDE_LOOP | 355 CALC_SUM_SSE_INSIDE_LOOP |
| 356 src+= src_stride; | 356 src+= src_stride; |
| 357 dst+= dst_stride; | 357 dst+= dst_stride; |
| 358 } | 358 } |
| 359 // x_offset = 0 and y_offset = bilin interpolation | 359 // x_offset = 0 and y_offset = bilin interpolation |
| 360 } else { | 360 } else { |
| 361 __m256i filter, pw8, src_next_reg; | 361 __m256i filter, pw8, src_next_reg; |
| 362 | 362 |
| 363 y_offset <<= 5; | 363 y_offset <<= 5; |
| 364 filter = _mm256_load_si256((__m256i const *) | 364 filter = _mm256_load_si256((__m256i const *) |
| 365 (bilinear_filters_avx2 + y_offset)); | 365 (bilinear_filters_avx2 + y_offset)); |
| 366 pw8 = _mm256_set1_epi16(8); | 366 pw8 = _mm256_set1_epi16(8); |
| 367 for (i = 0; i < height ; i++) { | 367 for (i = 0; i < height ; i++) { |
| 368 LOAD_SRC_DST | 368 LOAD_SRC_DST |
| 369 MERGE_NEXT_SRC(src_reg, src_stride) | 369 MERGE_NEXT_SRC(src_reg, src_stride) |
| 370 FILTER_SRC(filter) | 370 FILTER_SRC(filter) |
| 371 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 371 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
| 372 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 372 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
| 373 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 373 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
| 374 sec+= sec_stride; | 374 sec+= sec_stride; |
| 375 MERGE_WITH_SRC(src_reg, zero_reg) | 375 MERGE_WITH_SRC(src_reg, zero_reg) |
| 376 CALC_SUM_SSE_INSIDE_LOOP | 376 CALC_SUM_SSE_INSIDE_LOOP |
| 377 src+= src_stride; | 377 src+= src_stride; |
| 378 dst+= dst_stride; | 378 dst+= dst_stride; |
| 379 } | 379 } |
| 380 } | 380 } |
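In the bilinear branches the offset is shifted left by 5 because each bilinear_filters_avx2 entry occupies 32 bytes, presumably sixteen repeated (a, b) tap pairs laid out for _mm256_maddubs_epi16 with a + b == 16; that layout is consistent with FILTER_SRC adding pw8 (= 8) and shifting right by 4. Under that assumption the per-pixel arithmetic reduces to the sketch below (the helper name is illustrative):

    #include <stdint.h>

    /* Scalar model of FILTER_SRC for one pair of adjacent pixels, assuming
     * the filter entry holds taps (a, b) with a + b == 16, e.g.
     * a = 16 - offset and b = offset. The +8 and >> 4 mirror the rounding
     * visible in the macro. */
    static uint8_t bilinear_2tap(uint8_t p0, uint8_t p1, int a, int b) {
      return (uint8_t)((p0 * a + p1 * b + 8) >> 4);
    }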
| 381 // x_offset = 8 and y_offset = 0 | 381 // x_offset = 8 and y_offset = 0 |
| 382 } else if (x_offset == 8) { | 382 } else if (x_offset == 8) { |
| 383 if (y_offset == 0) { | 383 if (y_offset == 0) { |
| 384 __m256i src_next_reg; | 384 __m256i src_next_reg; |
| 385 for (i = 0; i < height ; i++) { | 385 for (i = 0; i < height ; i++) { |
| 386 LOAD_SRC_DST | 386 LOAD_SRC_DST |
| 387 AVG_NEXT_SRC(src_reg, 1) | 387 AVG_NEXT_SRC(src_reg, 1) |
| 388 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 388 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
| 389 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 389 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
| 390 sec+= sec_stride; | 390 sec+= sec_stride; |
| 391 // expand each byte to 2 bytes | 391 // expand each byte to 2 bytes |
| 392 MERGE_WITH_SRC(src_reg, zero_reg) | 392 MERGE_WITH_SRC(src_reg, zero_reg) |
| 393 CALC_SUM_SSE_INSIDE_LOOP | 393 CALC_SUM_SSE_INSIDE_LOOP |
| 394 src+= src_stride; | 394 src+= src_stride; |
| 395 dst+= dst_stride; | 395 dst+= dst_stride; |
| 396 } | 396 } |
| 397 // x_offset = 8 and y_offset = 8 | 397 // x_offset = 8 and y_offset = 8 |
| 398 } else if (y_offset == 8) { | 398 } else if (y_offset == 8) { |
| 399 __m256i src_next_reg, src_avg; | 399 __m256i src_next_reg, src_avg; |
| 400 // load source and another source starting from the | 400 // load source and another source starting from the |
| 401 // next byte | 401 // next byte |
| 402 src_reg = _mm256_loadu_si256((__m256i const *) (src)); | 402 src_reg = _mm256_loadu_si256((__m256i const *) (src)); |
| 403 AVG_NEXT_SRC(src_reg, 1) | 403 AVG_NEXT_SRC(src_reg, 1) |
| 404 for (i = 0; i < height ; i++) { | 404 for (i = 0; i < height ; i++) { |
| 405 // save current source average | 405 // save current source average |
| 406 src_avg = src_reg; | 406 src_avg = src_reg; |
| 407 src+= src_stride; | 407 src+= src_stride; |
| 408 LOAD_SRC_DST | 408 LOAD_SRC_DST |
| 409 AVG_NEXT_SRC(src_reg, 1) | 409 AVG_NEXT_SRC(src_reg, 1) |
| 410 // average the previous and current averages | 410 // average the previous and current averages |
| 411 src_avg = _mm256_avg_epu8(src_avg, src_reg); | 411 src_avg = _mm256_avg_epu8(src_avg, src_reg); |
| 412 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 412 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
| 413 src_avg = _mm256_avg_epu8(src_avg, sec_reg); | 413 src_avg = _mm256_avg_epu8(src_avg, sec_reg); |
| 414 sec+= sec_stride; | 414 sec+= sec_stride; |
| 415 // expand each byte to 2 bytes | 415 // expand each byte to 2 bytes |
| 416 MERGE_WITH_SRC(src_avg, zero_reg) | 416 MERGE_WITH_SRC(src_avg, zero_reg) |
| 417 CALC_SUM_SSE_INSIDE_LOOP | 417 CALC_SUM_SSE_INSIDE_LOOP |
| 418 dst+= dst_stride; | 418 dst+= dst_stride; |
| 419 } | 419 } |
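This branch, like the packed bilinear branches below, is software-pipelined: the horizontally processed previous row is carried across iterations in src_avg (or src_pack), so each source row is loaded and processed once even though every output row combines two input rows. A self-contained scalar skeleton of the same structure, reduced to the half-pel vertical average for brevity (names and the width bound are illustrative):

    #include <stdint.h>
    #include <string.h>

    static void vertical_halfpel_rows(const uint8_t *src, int src_stride,
                                      uint8_t *out, int out_stride,
                                      int width, int height) {
      uint8_t prev[32];                  /* previous row; assumes width <= 32 */
      memcpy(prev, src, (size_t)width);  /* stands in for the horizontal step */
      for (int i = 0; i < height; ++i) {
        src += src_stride;
        for (int j = 0; j < width; ++j) {
          const uint8_t cur = src[j];
          out[j] = (uint8_t)((prev[j] + cur + 1) >> 1); /* rounded average */
          prev[j] = cur;                 /* carry this row to the next pass */
        }
        out += out_stride;
      }
    }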
| 420 // x_offset = 8 and y_offset = bilin interpolation | 420 // x_offset = 8 and y_offset = bilin interpolation |
| 421 } else { | 421 } else { |
| 422 __m256i filter, pw8, src_next_reg, src_avg; | 422 __m256i filter, pw8, src_next_reg, src_avg; |
| 423 y_offset <<= 5; | 423 y_offset <<= 5; |
| 424 filter = _mm256_load_si256((__m256i const *) | 424 filter = _mm256_load_si256((__m256i const *) |
| 425 (bilinear_filters_avx2 + y_offset)); | 425 (bilinear_filters_avx2 + y_offset)); |
| 426 pw8 = _mm256_set1_epi16(8); | 426 pw8 = _mm256_set1_epi16(8); |
| 427 // load source and another source starting from the | 427 // load source and another source starting from the |
| 428 // next byte | 428 // next byte |
| 429 src_reg = _mm256_loadu_si256((__m256i const *) (src)); | 429 src_reg = _mm256_loadu_si256((__m256i const *) (src)); |
| 430 AVG_NEXT_SRC(src_reg, 1) | 430 AVG_NEXT_SRC(src_reg, 1) |
| 431 for (i = 0; i < height ; i++) { | 431 for (i = 0; i < height ; i++) { |
| 432 // save current source average | 432 // save current source average |
| 433 src_avg = src_reg; | 433 src_avg = src_reg; |
| 434 src+= src_stride; | 434 src+= src_stride; |
| 435 LOAD_SRC_DST | 435 LOAD_SRC_DST |
| 436 AVG_NEXT_SRC(src_reg, 1) | 436 AVG_NEXT_SRC(src_reg, 1) |
| 437 MERGE_WITH_SRC(src_avg, src_reg) | 437 MERGE_WITH_SRC(src_avg, src_reg) |
| 438 FILTER_SRC(filter) | 438 FILTER_SRC(filter) |
| 439 src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 439 src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
| 440 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 440 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
| 441 src_avg = _mm256_avg_epu8(src_avg, sec_reg); | 441 src_avg = _mm256_avg_epu8(src_avg, sec_reg); |
| 442 // expand each byte to 2 bytes | 442 // expand each byte to 2 bytes |
| 443 MERGE_WITH_SRC(src_avg, zero_reg) | 443 MERGE_WITH_SRC(src_avg, zero_reg) |
| 444 sec+= sec_stride; | 444 sec+= sec_stride; |
| 445 CALC_SUM_SSE_INSIDE_LOOP | 445 CALC_SUM_SSE_INSIDE_LOOP |
| 446 dst+= dst_stride; | 446 dst+= dst_stride; |
| 447 } | 447 } |
| 448 } | 448 } |
| 449 // x_offset = bilin interpolation and y_offset = 0 | 449 // x_offset = bilin interpolation and y_offset = 0 |
| 450 } else { | 450 } else { |
| 451 if (y_offset == 0) { | 451 if (y_offset == 0) { |
| 452 __m256i filter, pw8, src_next_reg; | 452 __m256i filter, pw8, src_next_reg; |
| 453 x_offset <<= 5; | 453 x_offset <<= 5; |
| 454 filter = _mm256_load_si256((__m256i const *) | 454 filter = _mm256_load_si256((__m256i const *) |
| 455 (bilinear_filters_avx2 + x_offset)); | 455 (bilinear_filters_avx2 + x_offset)); |
| 456 pw8 = _mm256_set1_epi16(8); | 456 pw8 = _mm256_set1_epi16(8); |
| 457 for (i = 0; i < height ; i++) { | 457 for (i = 0; i < height ; i++) { |
| 458 LOAD_SRC_DST | 458 LOAD_SRC_DST |
| 459 MERGE_NEXT_SRC(src_reg, 1) | 459 MERGE_NEXT_SRC(src_reg, 1) |
| 460 FILTER_SRC(filter) | 460 FILTER_SRC(filter) |
| 461 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 461 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
| 462 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 462 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
| 463 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 463 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
| 464 MERGE_WITH_SRC(src_reg, zero_reg) | 464 MERGE_WITH_SRC(src_reg, zero_reg) |
| 465 sec+= sec_stride; | 465 sec+= sec_stride; |
| 466 CALC_SUM_SSE_INSIDE_LOOP | 466 CALC_SUM_SSE_INSIDE_LOOP |
| 467 src+= src_stride; | 467 src+= src_stride; |
| 468 dst+= dst_stride; | 468 dst+= dst_stride; |
| 469 } | 469 } |
| 470 // x_offset = bilin interpolation and y_offset = 8 | 470 // x_offset = bilin interpolation and y_offset = 8 |
| 471 } else if (y_offset == 8) { | 471 } else if (y_offset == 8) { |
| 472 __m256i filter, pw8, src_next_reg, src_pack; | 472 __m256i filter, pw8, src_next_reg, src_pack; |
| 473 x_offset <<= 5; | 473 x_offset <<= 5; |
| 474 filter = _mm256_load_si256((__m256i const *) | 474 filter = _mm256_load_si256((__m256i const *) |
| 475 (bilinear_filters_avx2 + x_offset)); | 475 (bilinear_filters_avx2 + x_offset)); |
| 476 pw8 = _mm256_set1_epi16(8); | 476 pw8 = _mm256_set1_epi16(8); |
| 477 src_reg = _mm256_loadu_si256((__m256i const *) (src)); | 477 src_reg = _mm256_loadu_si256((__m256i const *) (src)); |
| 478 MERGE_NEXT_SRC(src_reg, 1) | 478 MERGE_NEXT_SRC(src_reg, 1) |
| 479 FILTER_SRC(filter) | 479 FILTER_SRC(filter) |
| 480 // pack each 16-bit value back to 8 bits in the low and high lanes | 480 // pack each 16-bit value back to 8 bits in the low and high lanes |
| 481 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 481 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
| 482 for (i = 0; i < height ; i++) { | 482 for (i = 0; i < height ; i++) { |
| 483 src+= src_stride; | 483 src+= src_stride; |
| 484 LOAD_SRC_DST | 484 LOAD_SRC_DST |
| 485 MERGE_NEXT_SRC(src_reg, 1) | 485 MERGE_NEXT_SRC(src_reg, 1) |
| 486 FILTER_SRC(filter) | 486 FILTER_SRC(filter) |
| 487 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 487 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
| 488 // average the previous and current packed rows | 488 // average the previous and current packed rows |
| 489 src_pack = _mm256_avg_epu8(src_pack, src_reg); | 489 src_pack = _mm256_avg_epu8(src_pack, src_reg); |
| 490 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 490 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
| 491 src_pack = _mm256_avg_epu8(src_pack, sec_reg); | 491 src_pack = _mm256_avg_epu8(src_pack, sec_reg); |
| 492 sec+= sec_stride; | 492 sec+= sec_stride; |
| 493 MERGE_WITH_SRC(src_pack, zero_reg) | 493 MERGE_WITH_SRC(src_pack, zero_reg) |
| 494 src_pack = src_reg; | 494 src_pack = src_reg; |
| 495 CALC_SUM_SSE_INSIDE_LOOP | 495 CALC_SUM_SSE_INSIDE_LOOP |
| 496 dst+= dst_stride; | 496 dst+= dst_stride; |
| 497 } | 497 } |
| 498 // x_offset = bilin interpolation and y_offset = bilin interpolation | 498 // x_offset = bilin interpolation and y_offset = bilin interpolation |
| 499 } else { | 499 } else { |
| 500 __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; | 500 __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; |
| (...skipping 16 matching lines...) |
| 517 src+= src_stride; | 517 src+= src_stride; |
| 518 LOAD_SRC_DST | 518 LOAD_SRC_DST |
| 519 MERGE_NEXT_SRC(src_reg, 1) | 519 MERGE_NEXT_SRC(src_reg, 1) |
| 520 FILTER_SRC(xfilter) | 520 FILTER_SRC(xfilter) |
| 521 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 521 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
| 522 // merge previous pack to current pack source | 522 // merge previous pack to current pack source |
| 523 MERGE_WITH_SRC(src_pack, src_reg) | 523 MERGE_WITH_SRC(src_pack, src_reg) |
| 524 // filter the source | 524 // filter the source |
| 525 FILTER_SRC(yfilter) | 525 FILTER_SRC(yfilter) |
| 526 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 526 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
| 527 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 527 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
| 528 src_pack = _mm256_avg_epu8(src_pack, sec_reg); | 528 src_pack = _mm256_avg_epu8(src_pack, sec_reg); |
| 529 MERGE_WITH_SRC(src_pack, zero_reg) | 529 MERGE_WITH_SRC(src_pack, zero_reg) |
| 530 src_pack = src_reg; | 530 src_pack = src_reg; |
| 531 sec+= sec_stride; | 531 sec+= sec_stride; |
| 532 CALC_SUM_SSE_INSIDE_LOOP | 532 CALC_SUM_SSE_INSIDE_LOOP |
| 533 dst+= dst_stride; | 533 dst+= dst_stride; |
| 534 } | 534 } |
| 535 } | 535 } |
| 536 } | 536 } |
| 537 CALC_SUM_AND_SSE | 537 CALC_SUM_AND_SSE |
| 538 return sum; | 538 return sum; |
| 539 } | 539 } |
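For context (unchanged by this patch): CALC_SUM_AND_SSE presumably reduces sum_reg and sse_reg to scalars, writing the sum of squared differences through the sse pointer while the raw sum of differences is returned. Variance callers in libvpx typically combine the two as sse - sum*sum/N for an N-pixel block; a sketch of that final step, with an illustrative helper name:

    #include <stdint.h>

    /* Combine the (sum, sse) pair produced by a function like the one above:
     * sse - (sum * sum) / n is the sum of squared deviations from the mean
     * difference over the n = w * h pixels of the block. */
    static uint32_t variance_from_sum_sse(uint32_t sse, int sum, int w, int h) {
      const int64_t n = (int64_t)w * h;
      return (uint32_t)(sse - (uint32_t)(((int64_t)sum * sum) / n));
    }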