| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 14 matching lines...) Expand all Loading... |
| 25 DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { /* 32-byte-aligned table; each 16-byte half holds the overlapping index pairs (4,5),(5,6),...,(11,12). Presumably a byte-shuffle mask -- consumer is outside this view, confirm. */ | 25 DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { /* 32-byte-aligned table; each 16-byte half holds the overlapping index pairs (4,5),(5,6),...,(11,12). Presumably a byte-shuffle mask -- consumer is outside this view, confirm. */ |
| 26 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, | 26 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, |
| 27 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | 27 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
| 28 }; | 28 }; |
| 29 | 29 |
| 30 DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { /* 32-byte-aligned table; each 16-byte half holds the overlapping index pairs (6,7),(7,8),...,(13,14). Presumably a byte-shuffle mask -- consumer is outside this view, confirm. */ | 30 DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { /* 32-byte-aligned table; each 16-byte half holds the overlapping index pairs (6,7),(7,8),...,(13,14). Presumably a byte-shuffle mask -- consumer is outside this view, confirm. */ |
| 31 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, | 31 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, |
| 32 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | 32 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
| 33 }; | 33 }; |
| 34 | 34 |
| 35 #if defined(__clang__) /* MM256_BROADCASTSI128_SI256(x): picks the spelling of the 128-bit -> 256-bit broadcast intrinsic by compiler and version. clang is tested before __GNUC__ -- presumably because clang also defines __GNUC__; confirm. */ |
| 36 # if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ |
| 37 (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0) |
| 38 # define MM256_BROADCASTSI128_SI256(x) \ |
| 39 _mm_broadcastsi128_si256((__m128i const *)&(x)) |
| 40 # else // clang > 3.3, and not 5.0 on macosx: use the _mm256_ name with a by-value argument. |
| 41 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) |
| 42 # endif // clang <= 3.3, or Apple clang 5.0 |
| 43 #elif defined(__GNUC__) |
| 44 # if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) /* gcc <= 4.6: _mm_ name, argument passed by pointer, so x must be an lvalue */ |
| 45 # define MM256_BROADCASTSI128_SI256(x) \ |
| 46 _mm_broadcastsi128_si256((__m128i const *)&(x)) |
| 47 # elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 /* gcc 4.7: still the _mm_ name, but argument passed by value */ |
| 48 # define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) |
| 49 # else // gcc > 4.7: the _mm256_ name with a by-value argument. |
| 50 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) |
| 51 # endif // gcc version selection |
| 52 #else // !(gcc || clang): assume the _mm256_ name is available. |
| 53 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) |
| 54 #endif // __clang__ / __GNUC__ |
| 55 |
| 35 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, | 56 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, |
| 36 unsigned int src_pixels_per_line, | 57 unsigned int src_pixels_per_line, |
| 37 unsigned char *output_ptr, | 58 unsigned char *output_ptr, |
| 38 unsigned int output_pitch, | 59 unsigned int output_pitch, |
| 39 unsigned int output_height, | 60 unsigned int output_height, |
| 40 int16_t *filter) { | 61 int16_t *filter) { |
| 41 __m128i filtersReg; | 62 __m128i filtersReg; |
| 42 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 63 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
| 43 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; | 64 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; |
| 44 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; | 65 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; |
| 45 __m256i srcReg32b1, srcReg32b2, filtersReg32; | 66 __m256i srcReg32b1, srcReg32b2, filtersReg32; |
| 46 unsigned int i; | 67 unsigned int i; |
| 47 unsigned int src_stride, dst_stride; | 68 unsigned int src_stride, dst_stride; |
| 48 | 69 |
| 49 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 70 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 50 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); | 71 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); |
| 51 filtersReg = _mm_loadu_si128((__m128i *)filter); | 72 filtersReg = _mm_loadu_si128((__m128i *)filter); |
| 52 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 73 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
| 53 // in both lanes of 128 bit register. | 74 // in both lanes of 128 bit register. |
| 54 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 75 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 55 // have the same data in both lanes of a 256 bit register | 76 // have the same data in both lanes of a 256 bit register |
| 56 #if defined (__GNUC__) | 77 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); |
| 57 #if ( __GNUC__ < 4 || (__GNUC__ == 4 && \ | |
| 58 (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0)))) | |
| 59 filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg); | |
| 60 #elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0)) | |
| 61 filtersReg32 = _mm_broadcastsi128_si256(filtersReg); | |
| 62 #else | |
| 63 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); | |
| 64 #endif | |
| 65 #else | |
| 66 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); | |
| 67 #endif | |
| 68 | 78 |
| 69 // duplicate only the first 16 bits (first and second byte) | 79 // duplicate only the first 16 bits (first and second byte) |
| 70 // across 256 bit register | 80 // across 256 bit register |
| 71 firstFilters = _mm256_shuffle_epi8(filtersReg32, | 81 firstFilters = _mm256_shuffle_epi8(filtersReg32, |
| 72 _mm256_set1_epi16(0x100u)); | 82 _mm256_set1_epi16(0x100u)); |
| 73 // duplicate only the second 16 bits (third and fourth byte) | 83 // duplicate only the second 16 bits (third and fourth byte) |
| 74 // across 256 bit register | 84 // across 256 bit register |
| 75 secondFilters = _mm256_shuffle_epi8(filtersReg32, | 85 secondFilters = _mm256_shuffle_epi8(filtersReg32, |
| 76 _mm256_set1_epi16(0x302u)); | 86 _mm256_set1_epi16(0x302u)); |
| 77 // duplicate only the third 16 bits (fifth and sixth byte) | 87 // duplicate only the third 16 bits (fifth and sixth byte) |
| (...skipping 224 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 302 unsigned int i; | 312 unsigned int i; |
| 303 unsigned int src_stride, dst_stride; | 313 unsigned int src_stride, dst_stride; |
| 304 | 314 |
| 305 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 306 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); | 316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); |
| 307 filtersReg = _mm_loadu_si128((__m128i *)filter); | 317 filtersReg = _mm_loadu_si128((__m128i *)filter); |
| 308 // converting the 16 bit (short) to 8 bit (byte) and have the | 318 // converting the 16 bit (short) to 8 bit (byte) and have the |
| 309 // same data in both lanes of 128 bit register. | 319 // same data in both lanes of 128 bit register. |
| 310 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 311 // have the same data in both lanes of a 256 bit register | 321 // have the same data in both lanes of a 256 bit register |
| 312 #if defined (__GNUC__) | 322 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); |
| 313 #if ( __GNUC__ < 4 || (__GNUC__ == 4 && \ | |
| 314 (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0)))) | |
| 315 filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg); | |
| 316 #elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0)) | |
| 317 filtersReg32 = _mm_broadcastsi128_si256(filtersReg); | |
| 318 #else | |
| 319 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); | |
| 320 #endif | |
| 321 #else | |
| 322 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); | |
| 323 #endif | |
| 324 | 323 |
| 325 // duplicate only the first 16 bits (first and second byte) | 324 // duplicate only the first 16 bits (first and second byte) |
| 326 // across 256 bit register | 325 // across 256 bit register |
| 327 firstFilters = _mm256_shuffle_epi8(filtersReg32, | 326 firstFilters = _mm256_shuffle_epi8(filtersReg32, |
| 328 _mm256_set1_epi16(0x100u)); | 327 _mm256_set1_epi16(0x100u)); |
| 329 // duplicate only the second 16 bits (third and fourth byte) | 328 // duplicate only the second 16 bits (third and fourth byte) |
| 330 // across 256 bit register | 329 // across 256 bit register |
| 331 secondFilters = _mm256_shuffle_epi8(filtersReg32, | 330 secondFilters = _mm256_shuffle_epi8(filtersReg32, |
| 332 _mm256_set1_epi16(0x302u)); | 331 _mm256_set1_epi16(0x302u)); |
| 333 // duplicate only the third 16 bits (fifth and sixth byte) | 332 // duplicate only the third 16 bits (fifth and sixth byte) |
| (...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 536 | 535 |
| 537 // shrink each 16 bits to 8 bits; the first lane contains the first | 536 // shrink each 16 bits to 8 bits; the first lane contains the first |
| 538 // convolve result and the second lane contains the second convolve | 537 // convolve result and the second lane contains the second convolve |
| 539 // result | 538 // result |
| 540 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); | 539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); |
| 541 | 540 |
| 542 // save 16 bytes | 541 // save 16 bytes |
| 543 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
| 544 } | 543 } |
| 545 } | 544 } |
| OLD | NEW |