OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 49 matching lines...)
60 exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ | 60 exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ |
61 exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); | 61 exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); |
62 | 62 |
63 #define MERGE_WITH_SRC(src_reg, reg) \ | 63 #define MERGE_WITH_SRC(src_reg, reg) \ |
64 exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ | 64 exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ |
65 exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); | 65 exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); |
66 | 66 |
67 #define LOAD_SRC_DST \ | 67 #define LOAD_SRC_DST \ |
68 /* load source and destination */ \ | 68 /* load source and destination */ \ |
69 src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ | 69 src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ |
70 dst_reg = _mm256_load_si256((__m256i const *) (dst)); | 70 dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); |
71 | 71 |
72 #define AVG_NEXT_SRC(src_reg, size_stride) \ | 72 #define AVG_NEXT_SRC(src_reg, size_stride) \ |
73 src_next_reg = _mm256_loadu_si256((__m256i const *) \ | 73 src_next_reg = _mm256_loadu_si256((__m256i const *) \ |
74 (src + size_stride)); \ | 74 (src + size_stride)); \ |
75 /* average between current and next stride source */ \ | 75 /* average between current and next stride source */ \ |
76 src_reg = _mm256_avg_epu8(src_reg, src_next_reg); | 76 src_reg = _mm256_avg_epu8(src_reg, src_next_reg); |
77 | 77 |
78 #define MERGE_NEXT_SRC(src_reg, size_stride) \ | 78 #define MERGE_NEXT_SRC(src_reg, size_stride) \ |
79 src_next_reg = _mm256_loadu_si256((__m256i const *) \ | 79 src_next_reg = _mm256_loadu_si256((__m256i const *) \ |
80 (src + size_stride)); \ | 80 (src + size_stride)); \ |
(...skipping 245 matching lines...)
326 int i, sum; | 326 int i, sum; |
327 sum_reg = _mm256_set1_epi16(0); | 327 sum_reg = _mm256_set1_epi16(0); |
328 sse_reg = _mm256_set1_epi16(0); | 328 sse_reg = _mm256_set1_epi16(0); |
329 zero_reg = _mm256_set1_epi16(0); | 329 zero_reg = _mm256_set1_epi16(0); |
330 | 330 |
331 // x_offset = 0 and y_offset = 0 | 331 // x_offset = 0 and y_offset = 0 |
332 if (x_offset == 0) { | 332 if (x_offset == 0) { |
333 if (y_offset == 0) { | 333 if (y_offset == 0) { |
334 for (i = 0; i < height ; i++) { | 334 for (i = 0; i < height ; i++) { |
335 LOAD_SRC_DST | 335 LOAD_SRC_DST |
336 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 336 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
337 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 337 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
338 sec+= sec_stride; | 338 sec+= sec_stride; |
339 // expand each byte to 2 bytes | 339 // expand each byte to 2 bytes |
340 MERGE_WITH_SRC(src_reg, zero_reg) | 340 MERGE_WITH_SRC(src_reg, zero_reg) |
341 CALC_SUM_SSE_INSIDE_LOOP | 341 CALC_SUM_SSE_INSIDE_LOOP |
342 src+= src_stride; | 342 src+= src_stride; |
343 dst+= dst_stride; | 343 dst+= dst_stride; |
344 } | 344 } |
345 } else if (y_offset == 8) { | 345 } else if (y_offset == 8) { |
346 __m256i src_next_reg; | 346 __m256i src_next_reg; |
347 for (i = 0; i < height ; i++) { | 347 for (i = 0; i < height ; i++) { |
348 LOAD_SRC_DST | 348 LOAD_SRC_DST |
349 AVG_NEXT_SRC(src_reg, src_stride) | 349 AVG_NEXT_SRC(src_reg, src_stride) |
350 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 350 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
351 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 351 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
352 sec+= sec_stride; | 352 sec+= sec_stride; |
353 // expand each byte to 2 bytes | 353 // expand each byte to 2 bytes |
354 MERGE_WITH_SRC(src_reg, zero_reg) | 354 MERGE_WITH_SRC(src_reg, zero_reg) |
355 CALC_SUM_SSE_INSIDE_LOOP | 355 CALC_SUM_SSE_INSIDE_LOOP |
356 src+= src_stride; | 356 src+= src_stride; |
357 dst+= dst_stride; | 357 dst+= dst_stride; |
358 } | 358 } |
359 // x_offset = 0 and y_offset = bilin interpolation | 359 // x_offset = 0 and y_offset = bilin interpolation |
360 } else { | 360 } else { |
361 __m256i filter, pw8, src_next_reg; | 361 __m256i filter, pw8, src_next_reg; |
362 | 362 |
363 y_offset <<= 5; | 363 y_offset <<= 5; |
364 filter = _mm256_load_si256((__m256i const *) | 364 filter = _mm256_load_si256((__m256i const *) |
365 (bilinear_filters_avx2 + y_offset)); | 365 (bilinear_filters_avx2 + y_offset)); |
366 pw8 = _mm256_set1_epi16(8); | 366 pw8 = _mm256_set1_epi16(8); |
367 for (i = 0; i < height ; i++) { | 367 for (i = 0; i < height ; i++) { |
368 LOAD_SRC_DST | 368 LOAD_SRC_DST |
369 MERGE_NEXT_SRC(src_reg, src_stride) | 369 MERGE_NEXT_SRC(src_reg, src_stride) |
370 FILTER_SRC(filter) | 370 FILTER_SRC(filter) |
371 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 371 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
372 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 372 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
373 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 373 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
374 sec+= sec_stride; | 374 sec+= sec_stride; |
375 MERGE_WITH_SRC(src_reg, zero_reg) | 375 MERGE_WITH_SRC(src_reg, zero_reg) |
376 CALC_SUM_SSE_INSIDE_LOOP | 376 CALC_SUM_SSE_INSIDE_LOOP |
377 src+= src_stride; | 377 src+= src_stride; |
378 dst+= dst_stride; | 378 dst+= dst_stride; |
379 } | 379 } |
380 } | 380 } |
381 // x_offset = 8 and y_offset = 0 | 381 // x_offset = 8 and y_offset = 0 |
382 } else if (x_offset == 8) { | 382 } else if (x_offset == 8) { |
383 if (y_offset == 0) { | 383 if (y_offset == 0) { |
384 __m256i src_next_reg; | 384 __m256i src_next_reg; |
385 for (i = 0; i < height ; i++) { | 385 for (i = 0; i < height ; i++) { |
386 LOAD_SRC_DST | 386 LOAD_SRC_DST |
387 AVG_NEXT_SRC(src_reg, 1) | 387 AVG_NEXT_SRC(src_reg, 1) |
388 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 388 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
389 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 389 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
390 sec+= sec_stride; | 390 sec+= sec_stride; |
391 // expand each byte to 2 bytes | 391 // expand each byte to 2 bytes |
392 MERGE_WITH_SRC(src_reg, zero_reg) | 392 MERGE_WITH_SRC(src_reg, zero_reg) |
393 CALC_SUM_SSE_INSIDE_LOOP | 393 CALC_SUM_SSE_INSIDE_LOOP |
394 src+= src_stride; | 394 src+= src_stride; |
395 dst+= dst_stride; | 395 dst+= dst_stride; |
396 } | 396 } |
397 // x_offset = 8 and y_offset = 8 | 397 // x_offset = 8 and y_offset = 8 |
398 } else if (y_offset == 8) { | 398 } else if (y_offset == 8) { |
399 __m256i src_next_reg, src_avg; | 399 __m256i src_next_reg, src_avg; |
400 // load source and a second source starting at the | 400 // load source and a second source starting at the |
401 // next byte | 401 // next byte |
402 src_reg = _mm256_loadu_si256((__m256i const *) (src)); | 402 src_reg = _mm256_loadu_si256((__m256i const *) (src)); |
403 AVG_NEXT_SRC(src_reg, 1) | 403 AVG_NEXT_SRC(src_reg, 1) |
404 for (i = 0; i < height ; i++) { | 404 for (i = 0; i < height ; i++) { |
405 // save current source average | 405 // save current source average |
406 src_avg = src_reg; | 406 src_avg = src_reg; |
407 src+= src_stride; | 407 src+= src_stride; |
408 LOAD_SRC_DST | 408 LOAD_SRC_DST |
409 AVG_NEXT_SRC(src_reg, 1) | 409 AVG_NEXT_SRC(src_reg, 1) |
410 // average the previous and current averages | 410 // average the previous and current averages |
411 src_avg = _mm256_avg_epu8(src_avg, src_reg); | 411 src_avg = _mm256_avg_epu8(src_avg, src_reg); |
412 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 412 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
413 src_avg = _mm256_avg_epu8(src_avg, sec_reg); | 413 src_avg = _mm256_avg_epu8(src_avg, sec_reg); |
414 sec+= sec_stride; | 414 sec+= sec_stride; |
415 // expand each byte to 2 bytes | 415 // expand each byte to 2 bytes |
416 MERGE_WITH_SRC(src_avg, zero_reg) | 416 MERGE_WITH_SRC(src_avg, zero_reg) |
417 CALC_SUM_SSE_INSIDE_LOOP | 417 CALC_SUM_SSE_INSIDE_LOOP |
418 dst+= dst_stride; | 418 dst+= dst_stride; |
419 } | 419 } |
420 // x_offset = 8 and y_offset = bilin interpolation | 420 // x_offset = 8 and y_offset = bilin interpolation |
421 } else { | 421 } else { |
422 __m256i filter, pw8, src_next_reg, src_avg; | 422 __m256i filter, pw8, src_next_reg, src_avg; |
423 y_offset <<= 5; | 423 y_offset <<= 5; |
424 filter = _mm256_load_si256((__m256i const *) | 424 filter = _mm256_load_si256((__m256i const *) |
425 (bilinear_filters_avx2 + y_offset)); | 425 (bilinear_filters_avx2 + y_offset)); |
426 pw8 = _mm256_set1_epi16(8); | 426 pw8 = _mm256_set1_epi16(8); |
427 // load source and a second source starting at the | 427 // load source and a second source starting at the |
428 // next byte | 428 // next byte |
429 src_reg = _mm256_loadu_si256((__m256i const *) (src)); | 429 src_reg = _mm256_loadu_si256((__m256i const *) (src)); |
430 AVG_NEXT_SRC(src_reg, 1) | 430 AVG_NEXT_SRC(src_reg, 1) |
431 for (i = 0; i < height ; i++) { | 431 for (i = 0; i < height ; i++) { |
432 // save current source average | 432 // save current source average |
433 src_avg = src_reg; | 433 src_avg = src_reg; |
434 src+= src_stride; | 434 src+= src_stride; |
435 LOAD_SRC_DST | 435 LOAD_SRC_DST |
436 AVG_NEXT_SRC(src_reg, 1) | 436 AVG_NEXT_SRC(src_reg, 1) |
437 MERGE_WITH_SRC(src_avg, src_reg) | 437 MERGE_WITH_SRC(src_avg, src_reg) |
438 FILTER_SRC(filter) | 438 FILTER_SRC(filter) |
439 src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 439 src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
440 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 440 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
441 src_avg = _mm256_avg_epu8(src_avg, sec_reg); | 441 src_avg = _mm256_avg_epu8(src_avg, sec_reg); |
442 // expand each byte to 2 bytes | 442 // expand each byte to 2 bytes |
443 MERGE_WITH_SRC(src_avg, zero_reg) | 443 MERGE_WITH_SRC(src_avg, zero_reg) |
444 sec+= sec_stride; | 444 sec+= sec_stride; |
445 CALC_SUM_SSE_INSIDE_LOOP | 445 CALC_SUM_SSE_INSIDE_LOOP |
446 dst+= dst_stride; | 446 dst+= dst_stride; |
447 } | 447 } |
448 } | 448 } |
449 // x_offset = bilin interpolation and y_offset = 0 | 449 // x_offset = bilin interpolation and y_offset = 0 |
450 } else { | 450 } else { |
451 if (y_offset == 0) { | 451 if (y_offset == 0) { |
452 __m256i filter, pw8, src_next_reg; | 452 __m256i filter, pw8, src_next_reg; |
453 x_offset <<= 5; | 453 x_offset <<= 5; |
454 filter = _mm256_load_si256((__m256i const *) | 454 filter = _mm256_load_si256((__m256i const *) |
455 (bilinear_filters_avx2 + x_offset)); | 455 (bilinear_filters_avx2 + x_offset)); |
456 pw8 = _mm256_set1_epi16(8); | 456 pw8 = _mm256_set1_epi16(8); |
457 for (i = 0; i < height ; i++) { | 457 for (i = 0; i < height ; i++) { |
458 LOAD_SRC_DST | 458 LOAD_SRC_DST |
459 MERGE_NEXT_SRC(src_reg, 1) | 459 MERGE_NEXT_SRC(src_reg, 1) |
460 FILTER_SRC(filter) | 460 FILTER_SRC(filter) |
461 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 461 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
462 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 462 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
463 src_reg = _mm256_avg_epu8(src_reg, sec_reg); | 463 src_reg = _mm256_avg_epu8(src_reg, sec_reg); |
464 MERGE_WITH_SRC(src_reg, zero_reg) | 464 MERGE_WITH_SRC(src_reg, zero_reg) |
465 sec+= sec_stride; | 465 sec+= sec_stride; |
466 CALC_SUM_SSE_INSIDE_LOOP | 466 CALC_SUM_SSE_INSIDE_LOOP |
467 src+= src_stride; | 467 src+= src_stride; |
468 dst+= dst_stride; | 468 dst+= dst_stride; |
469 } | 469 } |
470 // x_offset = bilin interpolation and y_offset = 8 | 470 // x_offset = bilin interpolation and y_offset = 8 |
471 } else if (y_offset == 8) { | 471 } else if (y_offset == 8) { |
472 __m256i filter, pw8, src_next_reg, src_pack; | 472 __m256i filter, pw8, src_next_reg, src_pack; |
473 x_offset <<= 5; | 473 x_offset <<= 5; |
474 filter = _mm256_load_si256((__m256i const *) | 474 filter = _mm256_load_si256((__m256i const *) |
475 (bilinear_filters_avx2 + x_offset)); | 475 (bilinear_filters_avx2 + x_offset)); |
476 pw8 = _mm256_set1_epi16(8); | 476 pw8 = _mm256_set1_epi16(8); |
477 src_reg = _mm256_loadu_si256((__m256i const *) (src)); | 477 src_reg = _mm256_loadu_si256((__m256i const *) (src)); |
478 MERGE_NEXT_SRC(src_reg, 1) | 478 MERGE_NEXT_SRC(src_reg, 1) |
479 FILTER_SRC(filter) | 479 FILTER_SRC(filter) |
480 // pack 16-bit values back to 8 bits in the low and high lanes | 480 // pack 16-bit values back to 8 bits in the low and high lanes |
481 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 481 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
482 for (i = 0; i < height ; i++) { | 482 for (i = 0; i < height ; i++) { |
483 src+= src_stride; | 483 src+= src_stride; |
484 LOAD_SRC_DST | 484 LOAD_SRC_DST |
485 MERGE_NEXT_SRC(src_reg, 1) | 485 MERGE_NEXT_SRC(src_reg, 1) |
486 FILTER_SRC(filter) | 486 FILTER_SRC(filter) |
487 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 487 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
488 // average the previous pack with the current | 488 // average the previous pack with the current |
489 src_pack = _mm256_avg_epu8(src_pack, src_reg); | 489 src_pack = _mm256_avg_epu8(src_pack, src_reg); |
490 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 490 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
491 src_pack = _mm256_avg_epu8(src_pack, sec_reg); | 491 src_pack = _mm256_avg_epu8(src_pack, sec_reg); |
492 sec+= sec_stride; | 492 sec+= sec_stride; |
493 MERGE_WITH_SRC(src_pack, zero_reg) | 493 MERGE_WITH_SRC(src_pack, zero_reg) |
494 src_pack = src_reg; | 494 src_pack = src_reg; |
495 CALC_SUM_SSE_INSIDE_LOOP | 495 CALC_SUM_SSE_INSIDE_LOOP |
496 dst+= dst_stride; | 496 dst+= dst_stride; |
497 } | 497 } |
498 // x_offset = bilin interpolation and y_offset = bilin interpolation | 498 // x_offset = bilin interpolation and y_offset = bilin interpolation |
499 } else { | 499 } else { |
500 __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; | 500 __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; |
(...skipping 16 matching lines...)
517 src+= src_stride; | 517 src+= src_stride; |
518 LOAD_SRC_DST | 518 LOAD_SRC_DST |
519 MERGE_NEXT_SRC(src_reg, 1) | 519 MERGE_NEXT_SRC(src_reg, 1) |
520 FILTER_SRC(xfilter) | 520 FILTER_SRC(xfilter) |
521 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 521 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
522 // merge previous pack to current pack source | 522 // merge previous pack to current pack source |
523 MERGE_WITH_SRC(src_pack, src_reg) | 523 MERGE_WITH_SRC(src_pack, src_reg) |
524 // filter the source | 524 // filter the source |
525 FILTER_SRC(yfilter) | 525 FILTER_SRC(yfilter) |
526 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); | 526 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
527 sec_reg = _mm256_load_si256((__m256i const *) (sec)); | 527 sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); |
528 src_pack = _mm256_avg_epu8(src_pack, sec_reg); | 528 src_pack = _mm256_avg_epu8(src_pack, sec_reg); |
529 MERGE_WITH_SRC(src_pack, zero_reg) | 529 MERGE_WITH_SRC(src_pack, zero_reg) |
530 src_pack = src_reg; | 530 src_pack = src_reg; |
531 sec+= sec_stride; | 531 sec+= sec_stride; |
532 CALC_SUM_SSE_INSIDE_LOOP | 532 CALC_SUM_SSE_INSIDE_LOOP |
533 dst+= dst_stride; | 533 dst+= dst_stride; |
534 } | 534 } |
535 } | 535 } |
536 } | 536 } |
537 CALC_SUM_AND_SSE | 537 CALC_SUM_AND_SSE |
538 return sum; | 538 return sum; |
539 } | 539 } |
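Note on the change itself: every `_mm256_load_si256` on the `dst` and `sec` pointers becomes `_mm256_loadu_si256`. The aligned form requires a 32-byte-aligned address and can fault otherwise; nothing guarantees that alignment for `dst` and `sec` (or their strides), while the `src` loads were already unaligned. The `filter` load can stay aligned, presumably because `bilinear_filters_avx2` is declared with 32-byte alignment. A minimal standalone sketch of the distinction (the buffer name and offset are illustrative, not from libvpx; compile with -mavx2):

#include <immintrin.h>
#include <stdint.h>
#include <stdlib.h>

int main(void) {
  /* 64 bytes, 32-byte aligned (C11 aligned_alloc). */
  uint8_t *buf = aligned_alloc(32, 64);
  if (!buf) return 1;
  for (int i = 0; i < 64; ++i) buf[i] = (uint8_t)i;

  /* Fine: buf itself is 32-byte aligned. */
  __m256i a = _mm256_load_si256((__m256i const *)buf);

  /* buf + 1 is not 32-byte aligned; _mm256_load_si256 here can fault,
   * so the unaligned form must be used -- the same reason this diff
   * switches the dst/sec loads to _mm256_loadu_si256. */
  __m256i b = _mm256_loadu_si256((__m256i const *)(buf + 1));

  (void)a;
  (void)b;
  free(buf);
  return 0;
}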
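For reference, the arithmetic behind the filtering paths: with pw8 = 8 and the shift-right-by-4 in FILTER_SRC (lines 60-61), each output pixel is the rounded bilinear blend (a*(16-w) + b*w + 8) >> 4 for a weight w in 0..16, and `offset <<= 5` indexes a filter table whose entries are 32 bytes apart. The offset == 8 fast paths use _mm256_avg_epu8 instead, which computes (a + b + 1) >> 1 per byte and is exactly the w == 8 case. A scalar sketch of this model (inferred from the visible macros, not code from libvpx):

#include <stdint.h>

/* Rounded bilinear blend matching FILTER_SRC's add-8, shift-4 step:
 * the weights w and 16 - w sum to 16, so >> 4 renormalizes. */
static uint8_t bilinear_blend(uint8_t a, uint8_t b, int w /* 0..16 */) {
  return (uint8_t)((a * (16 - w) + b * w + 8) >> 4);
}

/* What _mm256_avg_epu8 does per byte: (a + b + 1) >> 1. With w == 8,
 * (8*a + 8*b + 8) >> 4 == (a + b + 1) >> 1, which is why the half-pel
 * paths can use the cheaper average instead of the full filter. */
static uint8_t half_pel_avg(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}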