source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c - Issue 484923003: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c

Issue 484923003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 289 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
300 void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,	300 void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,

301 unsigned int src_pitch,	301 unsigned int src_pitch,

302 unsigned char *output_ptr,	302 unsigned char *output_ptr,

303 unsigned int out_pitch,	303 unsigned int out_pitch,

304 unsigned int output_height,	304 unsigned int output_height,

305 int16_t *filter) {	305 int16_t *filter) {

306 __m128i filtersReg;	306 __m128i filtersReg;

307 __m256i addFilterReg64;	307 __m256i addFilterReg64;

308 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;	308 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;

309 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;	309 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;

310 __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;	310 __m256i srcReg32b11, srcReg32b12, filtersReg32;

311 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;	311 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;

312 unsigned int i;	312 unsigned int i;

313 unsigned int src_stride, dst_stride;	313 unsigned int src_stride, dst_stride;

314	314

315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64	315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64

316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);	316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);

317 filtersReg = _mm_loadu_si128((__m128i *)filter);	317 filtersReg = _mm_loadu_si128((__m128i *)filter);

318 // converting the 16 bit (short) to 8 bit (byte) and have the	318 // converting the 16 bit (short) to 8 bit (byte) and have the

319 // same data in both lanes of 128 bit register.	319 // same data in both lanes of 128 bit register.

320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);	320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
402 _mm256_castsi256_si128(srcReg32b9), 1);	402 _mm256_castsi256_si128(srcReg32b9), 1);

403	403

404 // merge every two consecutive registers	404 // merge every two consecutive registers

405 // save	405 // save

406 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);	406 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);

407 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);	407 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

408	408

409 // multiply 2 adjacent elements with the filter and add the result	409 // multiply 2 adjacent elements with the filter and add the result

410 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);	410 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);

411 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);	411 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

412 srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);

413 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

414	412

415 // add and saturate the results together	413 // add and saturate the results together

416 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);	414 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

417 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);

418

419	415

420 // multiply 2 adjacent elements with the filter and add the result	416 // multiply 2 adjacent elements with the filter and add the result

421 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);	417 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);

422 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);

423

424 // multiply 2 adjacent elements with the filter and add the result

425 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);	418 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

426 srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

427

428	419

429 // add and saturate the results together	420 // add and saturate the results together

430 srcReg32b10 = _mm256_adds_epi16(srcReg32b10,	421 srcReg32b10 = _mm256_adds_epi16(srcReg32b10,

431 _mm256_min_epi16(srcReg32b8, srcReg32b12));	422 _mm256_min_epi16(srcReg32b8, srcReg32b12));

432 srcReg32b1 = _mm256_adds_epi16(srcReg32b1,	423 srcReg32b10 = _mm256_adds_epi16(srcReg32b10,

433 _mm256_min_epi16(srcReg32b6, srcReg32b13));	424 _mm256_max_epi16(srcReg32b8, srcReg32b12));

	425

	426 // multiply 2 adjacent elements with the filter and add the result

	427 srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);

	428 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

	429

	430 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

	431

	432 // multiply 2 adjacent elements with the filter and add the result

	433 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);

	434 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

434	435

435 // add and saturate the results together	436 // add and saturate the results together

436 srcReg32b10 = _mm256_adds_epi16(srcReg32b10,

437 _mm256_max_epi16(srcReg32b8, srcReg32b12));

438 srcReg32b1 = _mm256_adds_epi16(srcReg32b1,	437 srcReg32b1 = _mm256_adds_epi16(srcReg32b1,

439 _mm256_max_epi16(srcReg32b6, srcReg32b13));	438 _mm256_min_epi16(srcReg32b8, srcReg32b12));

440	439 srcReg32b1 = _mm256_adds_epi16(srcReg32b1,

	440 _mm256_max_epi16(srcReg32b8, srcReg32b12));

441	441

442 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);	442 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);

443 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);	443 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

444	444

445 // shift by 7 bit each 16 bit	445 // shift by 7 bit each 16 bit

446 srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);	446 srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);

447 srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);	447 srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

448	448

449 // shrink to 8 bit each 16 bits, the first lane contain the first	449 // shrink to 8 bit each 16 bits, the first lane contain the first

450 // convolve result and the second lane contain the second convolve	450 // convolve result and the second lane contain the second convolve

(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
535	535

536 // shrink to 8 bit each 16 bits, the first lane contain the first	536 // shrink to 8 bit each 16 bits, the first lane contain the first

537 // convolve result and the second lane contain the second convolve	537 // convolve result and the second lane contain the second convolve

538 // result	538 // result

539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);	539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

540	540

541 // save 16 bytes	541 // save 16 bytes

542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);	542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);

543 }	543 }

544 }	544 }

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/common/vp9_rtcd_defs.pl ('k') | source/libvpx/vp9/decoder/vp9_decoder.h » ('j') | no next file with comments »