OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 14 matching lines...) Expand all Loading... |
25 DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { /* byte-index pairs (4,5)..(11,12), same 16 bytes in each half; presumably a byte-shuffle mask -- TODO confirm at use site */ | 25 DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { /* byte-index pairs (4,5)..(11,12), same 16 bytes in each half; presumably a byte-shuffle mask -- TODO confirm at use site */ |
26 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, | 26 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, |
27 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | 27 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
28 }; | 28 }; |
29 | 29 |
30 DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { /* byte-index pairs (6,7)..(13,14), same 16 bytes in each half; presumably a byte-shuffle mask -- TODO confirm at use site */ | 30 DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { /* byte-index pairs (6,7)..(13,14), same 16 bytes in each half; presumably a byte-shuffle mask -- TODO confirm at use site */ |
31 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, | 31 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, |
32 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | 32 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
33 }; | 33 }; |
34 | 34 |
| 35 #if defined(__clang__) /* MM256_BROADCASTSI128_SI256(x): one name for the 128->256 bit broadcast intrinsic, whose spelling and argument form are chosen per compiler version below. */ |
| 36 # if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ |
| 37 (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0) |
| 38 # define MM256_BROADCASTSI128_SI256(x) \ |
| 39 _mm_broadcastsi128_si256((__m128i const *)&(x)) |
| 40 # else // clang > 3.3 and not Apple clang 5.0: by-value _mm256_ spelling. |
| 41 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) |
| 42 # endif // clang <= 3.3 || Apple clang 5.0 |
| 43 #elif defined(__GNUC__) |
| 44 # if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) /* gcc <= 4.6: _mm_ spelling taking a pointer to the 128-bit value */ |
| 45 # define MM256_BROADCASTSI128_SI256(x) \ |
| 46 _mm_broadcastsi128_si256((__m128i const *)&(x)) |
| 47 # elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 /* gcc 4.7: _mm_ spelling taking the value directly */ |
| 48 # define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) |
| 49 # else // gcc > 4.7: by-value _mm256_ spelling. |
| 50 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) |
| 51 # endif // gcc <= 4.6 / gcc 4.7 / gcc > 4.7 |
| 52 #else // !(gcc || clang): use the by-value _mm256_ spelling. |
| 53 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) |
| 54 #endif // __clang__ / __GNUC__ / other |
| 55 |
35 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, | 56 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, |
36 unsigned int src_pixels_per_line, | 57 unsigned int src_pixels_per_line, |
37 unsigned char *output_ptr, | 58 unsigned char *output_ptr, |
38 unsigned int output_pitch, | 59 unsigned int output_pitch, |
39 unsigned int output_height, | 60 unsigned int output_height, |
40 int16_t *filter) { | 61 int16_t *filter) { |
41 __m128i filtersReg; | 62 __m128i filtersReg; |
42 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 63 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
43 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; | 64 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; |
44 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; | 65 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; |
45 __m256i srcReg32b1, srcReg32b2, filtersReg32; | 66 __m256i srcReg32b1, srcReg32b2, filtersReg32; |
46 unsigned int i; | 67 unsigned int i; |
47 unsigned int src_stride, dst_stride; | 68 unsigned int src_stride, dst_stride; |
48 | 69 |
49 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 70 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
50 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); | 71 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); |
51 filtersReg = _mm_loadu_si128((__m128i *)filter); | 72 filtersReg = _mm_loadu_si128((__m128i *)filter); |
52 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 73 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
53 // in both lanes of 128 bit register. | 74 // in both lanes of 128 bit register. |
54 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 75 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
55 // have the same data in both lanes of a 256 bit register | 76 // have the same data in both lanes of a 256 bit register |
56 #if defined (__GNUC__) | 77 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); |
57 #if ( __GNUC__ < 4 || (__GNUC__ == 4 && \ | |
58 (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0)))) | |
59 filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg); | |
60 #elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0)) | |
61 filtersReg32 = _mm_broadcastsi128_si256(filtersReg); | |
62 #else | |
63 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); | |
64 #endif | |
65 #else | |
66 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); | |
67 #endif | |
68 | 78 |
69 // duplicate only the first 16 bits (first and second byte) | 79 // duplicate only the first 16 bits (first and second byte) |
70 // across 256 bit register | 80 // across 256 bit register |
71 firstFilters = _mm256_shuffle_epi8(filtersReg32, | 81 firstFilters = _mm256_shuffle_epi8(filtersReg32, |
72 _mm256_set1_epi16(0x100u)); | 82 _mm256_set1_epi16(0x100u)); |
73 // duplicate only the second 16 bits (third and fourth byte) | 83 // duplicate only the second 16 bits (third and fourth byte) |
74 // across 256 bit register | 84 // across 256 bit register |
75 secondFilters = _mm256_shuffle_epi8(filtersReg32, | 85 secondFilters = _mm256_shuffle_epi8(filtersReg32, |
76 _mm256_set1_epi16(0x302u)); | 86 _mm256_set1_epi16(0x302u)); |
77 // duplicate only the third 16 bits (fifth and sixth byte) | 87 // duplicate only the third 16 bits (fifth and sixth byte) |
(...skipping 224 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
302 unsigned int i; | 312 unsigned int i; |
303 unsigned int src_stride, dst_stride; | 313 unsigned int src_stride, dst_stride; |
304 | 314 |
305 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
306 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); | 316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); |
307 filtersReg = _mm_loadu_si128((__m128i *)filter); | 317 filtersReg = _mm_loadu_si128((__m128i *)filter); |
308 // converting the 16 bit (short) to 8 bit (byte) and have the | 318 // converting the 16 bit (short) to 8 bit (byte) and have the |
309 // same data in both lanes of 128 bit register. | 319 // same data in both lanes of 128 bit register. |
310 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
311 // have the same data in both lanes of a 256 bit register | 321 // have the same data in both lanes of a 256 bit register |
312 #if defined (__GNUC__) | 322 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); |
313 #if ( __GNUC__ < 4 || (__GNUC__ == 4 && \ | |
314 (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0)))) | |
315 filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg); | |
316 #elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0)) | |
317 filtersReg32 = _mm_broadcastsi128_si256(filtersReg); | |
318 #else | |
319 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); | |
320 #endif | |
321 #else | |
322 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); | |
323 #endif | |
324 | 323 |
325 // duplicate only the first 16 bits (first and second byte) | 324 // duplicate only the first 16 bits (first and second byte) |
326 // across 256 bit register | 325 // across 256 bit register |
327 firstFilters = _mm256_shuffle_epi8(filtersReg32, | 326 firstFilters = _mm256_shuffle_epi8(filtersReg32, |
328 _mm256_set1_epi16(0x100u)); | 327 _mm256_set1_epi16(0x100u)); |
329 // duplicate only the second 16 bits (third and fourth byte) | 328 // duplicate only the second 16 bits (third and fourth byte) |
330 // across 256 bit register | 329 // across 256 bit register |
331 secondFilters = _mm256_shuffle_epi8(filtersReg32, | 330 secondFilters = _mm256_shuffle_epi8(filtersReg32, |
332 _mm256_set1_epi16(0x302u)); | 331 _mm256_set1_epi16(0x302u)); |
333 // duplicate only the third 16 bits (fifth and sixth byte) | 332 // duplicate only the third 16 bits (fifth and sixth byte) |
(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
536 | 535 |
537 // shrink each 16-bit value to 8 bits; the first lane contains the first | 536 // shrink each 16-bit value to 8 bits; the first lane contains the first |
538 // convolve result and the second lane contains the second convolve | 537 // convolve result and the second lane contains the second convolve |
539 // result | 538 // result |
540 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); | 539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); |
541 | 540 |
542 // save 16 bytes | 541 // save 16 bytes |
543 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
544 } | 543 } |
545 } | 544 } |
OLD | NEW |