Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(283)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c

Issue 484923003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/common/vp9_rtcd_defs.pl ('k') | source/libvpx/vp9/decoder/vp9_decoder.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 289 matching lines...) Expand 10 before | Expand all | Expand 10 after
300 void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, 300 void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
301 unsigned int src_pitch, 301 unsigned int src_pitch,
302 unsigned char *output_ptr, 302 unsigned char *output_ptr,
303 unsigned int out_pitch, 303 unsigned int out_pitch,
304 unsigned int output_height, 304 unsigned int output_height,
305 int16_t *filter) { 305 int16_t *filter) {
306 __m128i filtersReg; 306 __m128i filtersReg;
307 __m256i addFilterReg64; 307 __m256i addFilterReg64;
308 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; 308 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
309 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; 309 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
310 __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32; 310 __m256i srcReg32b11, srcReg32b12, filtersReg32;
311 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; 311 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
312 unsigned int i; 312 unsigned int i;
313 unsigned int src_stride, dst_stride; 313 unsigned int src_stride, dst_stride;
314 314
315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); 316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
317 filtersReg = _mm_loadu_si128((__m128i *)filter); 317 filtersReg = _mm_loadu_si128((__m128i *)filter);
318 // convert the 16-bit (short) values to 8-bit (byte) values and keep the 318 // convert the 16-bit (short) values to 8-bit (byte) values and keep the
319 // same data in both lanes of the 128-bit register. 319 // same data in both lanes of the 128-bit register.
320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
402 _mm256_castsi256_si128(srcReg32b9), 1); 402 _mm256_castsi256_si128(srcReg32b9), 1);
403 403
404 // merge every two consecutive registers 404 // merge every two consecutive registers
405 // save 405 // save
406 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); 406 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
407 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); 407 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
408 408
409 // multiply 2 adjacent elements with the filter and add the result 409 // multiply 2 adjacent elements with the filter and add the result
410 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); 410 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
411 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); 411 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
412 srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
413 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
414 412
415 // add and saturate the results together 413 // add and saturate the results together
416 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); 414 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
417 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);
418
419 415
420 // multiply 2 adjacent elements with the filter and add the result 416 // multiply 2 adjacent elements with the filter and add the result
421 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); 417 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
422 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
423
424 // multiply 2 adjacent elements with the filter and add the result
425 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); 418 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
426 srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
427
428 419
429 // add and saturate the results together 420 // add and saturate the results together
430 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, 421 srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
431 _mm256_min_epi16(srcReg32b8, srcReg32b12)); 422 _mm256_min_epi16(srcReg32b8, srcReg32b12));
432 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, 423 srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
433 _mm256_min_epi16(srcReg32b6, srcReg32b13)); 424 _mm256_max_epi16(srcReg32b8, srcReg32b12));
425
426 // multiply 2 adjacent elements with the filter and add the result
427 srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
428 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
429
430 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
431
432 // multiply 2 adjacent elements with the filter and add the result
433 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
434 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
434 435
435 // add and saturate the results together 436 // add and saturate the results together
436 srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
437 _mm256_max_epi16(srcReg32b8, srcReg32b12));
438 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, 437 srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
439 _mm256_max_epi16(srcReg32b6, srcReg32b13)); 438 _mm256_min_epi16(srcReg32b8, srcReg32b12));
440 439 srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
440 _mm256_max_epi16(srcReg32b8, srcReg32b12));
441 441
442 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); 442 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
443 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); 443 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
444 444
445 // shift each 16-bit value right by 7 bits 445 // shift each 16-bit value right by 7 bits
446 srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); 446 srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
447 srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); 447 srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
448 448
449 // shrink each 16-bit value to 8 bits; the first lane contains the first 449 // shrink each 16-bit value to 8 bits; the first lane contains the first
450 // convolve result and the second lane contains the second convolve 450 // convolve result and the second lane contains the second convolve
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
535 535
536 // shrink each 16-bit value to 8 bits; the first lane contains the first 536 // shrink each 16-bit value to 8 bits; the first lane contains the first
537 // convolve result and the second lane contains the second convolve 537 // convolve result and the second lane contains the second convolve
538 // result 538 // result
539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); 539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
540 540
541 // save 16 bytes 541 // save 16 bytes
542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); 542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
543 } 543 }
544 } 544 }
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/vp9_rtcd_defs.pl ('k') | source/libvpx/vp9/decoder/vp9_decoder.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698