OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 289 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
300 void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, | 300 void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, |
301 unsigned int src_pitch, | 301 unsigned int src_pitch, |
302 unsigned char *output_ptr, | 302 unsigned char *output_ptr, |
303 unsigned int out_pitch, | 303 unsigned int out_pitch, |
304 unsigned int output_height, | 304 unsigned int output_height, |
305 int16_t *filter) { | 305 int16_t *filter) { |
306 __m128i filtersReg; | 306 __m128i filtersReg; |
307 __m256i addFilterReg64; | 307 __m256i addFilterReg64; |
308 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; | 308 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; |
309 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; | 309 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; |
310 __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32; | 310 __m256i srcReg32b11, srcReg32b12, filtersReg32; |
311 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; | 311 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; |
312 unsigned int i; | 312 unsigned int i; |
313 unsigned int src_stride, dst_stride; | 313 unsigned int src_stride, dst_stride; |
314 | 314 |
315 // create a register holding 64 in every 16-bit lane (the rounding term added before the 7-bit shift) | 315 // create a register holding 64 in every 16-bit lane (the rounding term added before the 7-bit shift) |
316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); | 316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); |
317 filtersReg = _mm_loadu_si128((__m128i *)filter); | 317 filtersReg = _mm_loadu_si128((__m128i *)filter); |
318 // convert the 16-bit (short) filter values to 8-bit (byte) values and keep | 318 // convert the 16-bit (short) filter values to 8-bit (byte) values and keep |
319 // the same data in both 64-bit halves of the 128-bit register. | 319 // the same data in both 64-bit halves of the 128-bit register. |
320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
402 _mm256_castsi256_si128(srcReg32b9), 1); | 402 _mm256_castsi256_si128(srcReg32b9), 1); |
403 | 403 |
404 // merge every two consecutive registers | 404 // merge every two consecutive registers |
405 // save | 405 // save |
406 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); | 406 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); |
407 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); | 407 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); |
408 | 408 |
409 // multiply 2 adjacent elements with the filter and add the result | 409 // multiply 2 adjacent elements with the filter and add the result |
410 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); | 410 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); |
411 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); | 411 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); |
412 srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); | |
413 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); | |
414 | 412 |
415 // add and saturate the results together | 413 // add and saturate the results together |
416 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); | 414 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); |
417 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8); | |
418 | |
419 | 415 |
420 // multiply 2 adjacent elements with the filter and add the result | 416 // multiply 2 adjacent elements with the filter and add the result |
421 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); | 417 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); |
422 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); | |
423 | |
424 // multiply 2 adjacent elements with the filter and add the result | |
425 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); | 418 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); |
426 srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); | |
427 | |
428 | 419 |
429 // add and saturate the results together | 420 // add and saturate the results together |
430 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, | 421 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, |
431 _mm256_min_epi16(srcReg32b8, srcReg32b12)); | 422 _mm256_min_epi16(srcReg32b8, srcReg32b12)); |
432 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, | 423 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, |
433 _mm256_min_epi16(srcReg32b6, srcReg32b13)); | 424 _mm256_max_epi16(srcReg32b8, srcReg32b12)); |
| 425 |
| 426 // multiply 2 adjacent elements with the filter and add the result |
| 427 srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); |
| 428 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); |
| 429 |
| 430 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); |
| 431 |
| 432 // multiply 2 adjacent elements with the filter and add the result |
| 433 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); |
| 434 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); |
434 | 435 |
435 // add and saturate the results together | 436 // add and saturate the results together |
436 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, | |
437 _mm256_max_epi16(srcReg32b8, srcReg32b12)); | |
438 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, | 437 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, |
439 _mm256_max_epi16(srcReg32b6, srcReg32b13)); | 438 _mm256_min_epi16(srcReg32b8, srcReg32b12)); |
440 | 439 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, |
| 440 _mm256_max_epi16(srcReg32b8, srcReg32b12)); |
441 | 441 |
442 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); | 442 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); |
443 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); | 443 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); |
444 | 444 |
445 // arithmetic right shift of each 16-bit value by 7 bits | 445 // arithmetic right shift of each 16-bit value by 7 bits |
446 srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); | 446 srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); |
447 srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); | 447 srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); |
448 | 448 |
449 // shrink each 16-bit value to 8 bits; the first lane contains the first | 449 // shrink each 16-bit value to 8 bits; the first lane contains the first |
450 // convolve result and the second lane contains the second convolve | 450 // convolve result and the second lane contains the second convolve |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
535 | 535 |
536 // shrink each 16-bit value to 8 bits; the first lane contains the first | 536 // shrink each 16-bit value to 8 bits; the first lane contains the first |
537 // convolve result and the second lane contains the second convolve | 537 // convolve result and the second lane contains the second convolve |
538 // result | 538 // result |
539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); | 539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); |
540 | 540 |
541 // save 16 bytes | 541 // save 16 bytes |
542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
543 } | 543 } |
544 } | 544 } |
OLD | NEW |