Chromium Code Reviews

Unified Diff: source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c

Issue 478033002: libvpx: Pull from upstream (Closed)
Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 4 months ago
--- a/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ b/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
@@ -1,10 +1,10 @@
 /*
  * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */

(...skipping 49 matching lines...)
@@ -60,21 +60,21 @@
   exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
   exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

 #define MERGE_WITH_SRC(src_reg, reg) \
   exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
   exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

 #define LOAD_SRC_DST \
   /* load source and destination */ \
   src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
-  dst_reg = _mm256_load_si256((__m256i const *) (dst));
+  dst_reg = _mm256_loadu_si256((__m256i const *) (dst));

 #define AVG_NEXT_SRC(src_reg, size_stride) \
   src_next_reg = _mm256_loadu_si256((__m256i const *) \
                  (src + size_stride)); \
   /* average between current and next stride source */ \
   src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

 #define MERGE_NEXT_SRC(src_reg, size_stride) \
   src_next_reg = _mm256_loadu_si256((__m256i const *) \
                  (src + size_stride)); \
(...skipping 245 matching lines...)
@@ -326,175 +326,175 @@
   int i, sum;
   sum_reg = _mm256_set1_epi16(0);
   sse_reg = _mm256_set1_epi16(0);
   zero_reg = _mm256_set1_epi16(0);

   // x_offset = 0 and y_offset = 0
   if (x_offset == 0) {
     if (y_offset == 0) {
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         // expend each byte to 2 bytes
         MERGE_WITH_SRC(src_reg, zero_reg)
         CALC_SUM_SSE_INSIDE_LOOP
         src+= src_stride;
         dst+= dst_stride;
       }
     } else if (y_offset == 8) {
       __m256i src_next_reg;
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
         AVG_NEXT_SRC(src_reg, src_stride)
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         // expend each byte to 2 bytes
         MERGE_WITH_SRC(src_reg, zero_reg)
         CALC_SUM_SSE_INSIDE_LOOP
         src+= src_stride;
         dst+= dst_stride;
       }
     // x_offset = 0 and y_offset = bilin interpolation
     } else {
       __m256i filter, pw8, src_next_reg;

       y_offset <<= 5;
       filter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
       pw8 = _mm256_set1_epi16(8);
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
         MERGE_NEXT_SRC(src_reg, src_stride)
         FILTER_SRC(filter)
         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         MERGE_WITH_SRC(src_reg, zero_reg)
         CALC_SUM_SSE_INSIDE_LOOP
         src+= src_stride;
         dst+= dst_stride;
       }
     }
   // x_offset = 8 and y_offset = 0
   } else if (x_offset == 8) {
     if (y_offset == 0) {
       __m256i src_next_reg;
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
         AVG_NEXT_SRC(src_reg, 1)
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         // expand each byte to 2 bytes
         MERGE_WITH_SRC(src_reg, zero_reg)
         CALC_SUM_SSE_INSIDE_LOOP
         src+= src_stride;
         dst+= dst_stride;
       }
     // x_offset = 8 and y_offset = 8
     } else if (y_offset == 8) {
       __m256i src_next_reg, src_avg;
       // load source and another source starting from the next
       // following byte
       src_reg = _mm256_loadu_si256((__m256i const *) (src));
       AVG_NEXT_SRC(src_reg, 1)
       for (i = 0; i < height ; i++) {
         // save current source average
         src_avg = src_reg;
         src+= src_stride;
         LOAD_SRC_DST
         AVG_NEXT_SRC(src_reg, 1)
         // average between previous average to current average
         src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
         sec+= sec_stride;
         // expand each byte to 2 bytes
         MERGE_WITH_SRC(src_avg, zero_reg)
         CALC_SUM_SSE_INSIDE_LOOP
         dst+= dst_stride;
       }
     // x_offset = 8 and y_offset = bilin interpolation
     } else {
       __m256i filter, pw8, src_next_reg, src_avg;
       y_offset <<= 5;
       filter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
       pw8 = _mm256_set1_epi16(8);
       // load source and another source starting from the next
       // following byte
       src_reg = _mm256_loadu_si256((__m256i const *) (src));
       AVG_NEXT_SRC(src_reg, 1)
       for (i = 0; i < height ; i++) {
         // save current source average
         src_avg = src_reg;
         src+= src_stride;
         LOAD_SRC_DST
         AVG_NEXT_SRC(src_reg, 1)
         MERGE_WITH_SRC(src_avg, src_reg)
         FILTER_SRC(filter)
         src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
         // expand each byte to 2 bytes
         MERGE_WITH_SRC(src_avg, zero_reg)
         sec+= sec_stride;
         CALC_SUM_SSE_INSIDE_LOOP
         dst+= dst_stride;
       }
     }
   // x_offset = bilin interpolation and y_offset = 0
   } else {
     if (y_offset == 0) {
       __m256i filter, pw8, src_next_reg;
       x_offset <<= 5;
       filter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
       pw8 = _mm256_set1_epi16(8);
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
         MERGE_NEXT_SRC(src_reg, 1)
         FILTER_SRC(filter)
         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         MERGE_WITH_SRC(src_reg, zero_reg)
         sec+= sec_stride;
         CALC_SUM_SSE_INSIDE_LOOP
         src+= src_stride;
         dst+= dst_stride;
       }
     // x_offset = bilin interpolation and y_offset = 8
     } else if (y_offset == 8) {
       __m256i filter, pw8, src_next_reg, src_pack;
       x_offset <<= 5;
       filter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
       pw8 = _mm256_set1_epi16(8);
       src_reg = _mm256_loadu_si256((__m256i const *) (src));
       MERGE_NEXT_SRC(src_reg, 1)
       FILTER_SRC(filter)
       // convert each 16 bit to 8 bit to each low and high lane source
       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
       for (i = 0; i < height ; i++) {
         src+= src_stride;
         LOAD_SRC_DST
         MERGE_NEXT_SRC(src_reg, 1)
         FILTER_SRC(filter)
         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
         // average between previous pack to the current
         src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
         sec+= sec_stride;
         MERGE_WITH_SRC(src_pack, zero_reg)
         src_pack = src_reg;
         CALC_SUM_SSE_INSIDE_LOOP
         dst+= dst_stride;
       }
     // x_offset = bilin interpolation and y_offset = bilin interpolation
     } else {
       __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
(...skipping 16 matching lines...)
@@ -517,23 +517,23 @@
         src+= src_stride;
         LOAD_SRC_DST
         MERGE_NEXT_SRC(src_reg, 1)
         FILTER_SRC(xfilter)
         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
         // merge previous pack to current pack source
         MERGE_WITH_SRC(src_pack, src_reg)
         // filter the source
         FILTER_SRC(yfilter)
         src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
         MERGE_WITH_SRC(src_pack, zero_reg)
         src_pack = src_reg;
         sec+= sec_stride;
         CALC_SUM_SSE_INSIDE_LOOP
         dst+= dst_stride;
       }
     }
   }
   CALC_SUM_AND_SSE
   return sum;
 }
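
Note on the change itself: every read of the dst and sec buffers moves from _mm256_load_si256 to _mm256_loadu_si256, presumably because those buffers are not guaranteed to be 32-byte aligned; the aligned form faults on an unaligned address, while the unaligned form accepts any address. The following standalone C sketch (not part of the patch; buffer and variable names are illustrative only) demonstrates that contract. Build with an AVX2-capable compiler and CPU, e.g. gcc -std=c11 -mavx2.

/*
 * Illustrative sketch of the aligned vs. unaligned AVX2 load contract.
 * Not part of the patch; names are made up for this example.
 */
#include <immintrin.h>
#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  static alignas(32) uint8_t buf[64];
  for (int i = 0; i < 64; ++i) buf[i] = (uint8_t)i;

  /* buf is 32-byte aligned, so the aligned load is legal here. */
  __m256i a = _mm256_load_si256((const __m256i *)buf);

  /* buf + 1 is not 32-byte aligned: only the unaligned load may be used.
   * An aligned load from this address would fault at run time. */
  __m256i b = _mm256_loadu_si256((const __m256i *)(buf + 1));

  /* Combine the two vectors the same way the kernel averages in the
   * second prediction, then store and print a couple of bytes. */
  __m256i avg = _mm256_avg_epu8(a, b);
  uint8_t out[32];
  _mm256_storeu_si256((__m256i *)out, avg);
  printf("%d %d\n", out[0], out[31]);
  return 0;
}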