OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
| 11 // Due to a header conflict between math.h and the intrinsics includes |
| 12 // over ceil() in certain configurations under vs9, this include needs |
| 13 // to precede immintrin.h. |
| 14 #include "./vp9_rtcd.h" |
| 15 |
11 #include <immintrin.h> | 16 #include <immintrin.h> |
| 17 |
| 18 #include "vp9/common/x86/convolve.h" |
12 #include "vpx_ports/mem.h" | 19 #include "vpx_ports/mem.h" |
13 | 20 |
14 // filters for 16_h8 and 16_v8 | 21 // filters for 16_h8 and 16_v8 |
15 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { | 22 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { |
16 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, | 23 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, |
17 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | 24 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
18 }; | 25 }; |
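These byte-shuffle masks (filt1 above; filt2 through filt4 follow) pair
horizontally adjacent source bytes so that _mm256_maddubs_epi16 can multiply
each pair by two adjacent filter taps and sum them in one instruction (it
treats the source bytes as unsigned and the taps as signed). As a reference,
a scalar model of the per-pixel result the unscaled 8-tap horizontal pass
computes; the function name and the explicit clamp are illustrative
assumptions, not part of this patch:

    #include <stdint.h>

    // Scalar model of one output pixel of the 8-tap horizontal pass.
    // src points at the output pixel; taps holds the 8 packed filter bytes.
    static uint8_t filter8_h_scalar(const uint8_t *src, const int8_t *taps) {
      int sum = 64;  // rounding constant, mirrors addFilterReg64 below
      int k;
      for (k = 0; k < 8; ++k)
        sum += src[k - 3] * taps[k];  // taps pair up as (0,1),(2,3),(4,5),(6,7)
      sum >>= 7;  // FILTER_BITS, mirrors the vector srai by 7
      return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }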
19 | 26 |
20 DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { | 27 DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { |
21 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, | 28 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, |
(...skipping 24 matching lines...)
46 _mm_broadcastsi128_si256((__m128i const *)&(x)) | 53 _mm_broadcastsi128_si256((__m128i const *)&(x)) |
47 # elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 | 54 # elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 |
48 # define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) | 55 # define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) |
49 # else // gcc > 4.7 | 56 # else // gcc > 4.7 |
50 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) | 57 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) |
51 # endif // gcc <= 4.6 | 58 # endif // gcc <= 4.6 |
52 #else // !(gcc || clang) | 59 #else // !(gcc || clang) |
53 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) | 60 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) |
54 #endif // __clang__ | 61 #endif // __clang__ |
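On toolchains that lack the broadcast intrinsic under any spelling, an
equivalent can be composed from AVX1 operations; a sketch (an assumption,
not something this patch provides):

    // Place the 128-bit value in the low lane, then insert a copy into the
    // high lane; same result as _mm256_broadcastsi128_si256.
    #define MM256_BROADCASTSI128_SI256_FALLBACK(x) \
      _mm256_insertf128_si256(_mm256_castsi128_si256(x), (x), 1)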
55 | 62 |
56 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, | 63 static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr, |
57 unsigned int src_pixels_per_line, | 64 ptrdiff_t src_pixels_per_line, |
58 unsigned char *output_ptr, | 65 uint8_t *output_ptr, |
59 unsigned int output_pitch, | 66 ptrdiff_t output_pitch, |
60 unsigned int output_height, | 67 uint32_t output_height, |
61 int16_t *filter) { | 68 const int16_t *filter) { |
62 __m128i filtersReg; | 69 __m128i filtersReg; |
63 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 70 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
64 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; | 71 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; |
65 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; | 72 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; |
66 __m256i srcReg32b1, srcReg32b2, filtersReg32; | 73 __m256i srcReg32b1, srcReg32b2, filtersReg32; |
67 unsigned int i; | 74 unsigned int i; |
68 unsigned int src_stride, dst_stride; | 75 ptrdiff_t src_stride, dst_stride; |
69 | 76 |
70 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 77 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
71 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); | 78 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); |
72 filtersReg = _mm_loadu_si128((__m128i *)filter); | 79 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
73 // convert the 16-bit (short) taps to 8-bit (byte) and replicate the data | 80 // convert the 16-bit (short) taps to 8-bit (byte) and replicate the data |
74 // in both lanes of the 128-bit register. | 81 // in both lanes of the 128-bit register. |
75 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); | 82 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); |
76 // have the same data in both lanes of a 256 bit register | 83 // have the same data in both lanes of a 256 bit register |
77 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); | 84 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); |
78 | 85 |
79 // duplicate only the first 16 bits (first and second byte) | 86 // duplicate only the first 16 bits (first and second byte) |
80 // across 256 bit register | 87 // across 256 bit register |
81 firstFilters = _mm256_shuffle_epi8(filtersReg32, | 88 firstFilters = _mm256_shuffle_epi8(filtersReg32, |
82 _mm256_set1_epi16(0x100u)); | 89 _mm256_set1_epi16(0x100u)); |
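    // _mm256_set1_epi16(0x100u) puts the byte indices {0x00, 0x01} in every
    // 16-bit lane, so the shuffle replicates filter bytes 0 and 1 (taps 0
    // and 1) across the whole register; 0x302, 0x504 and 0x706 do the same
    // for the remaining tap pairs.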
(...skipping 14 matching lines...)
97 filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); | 104 filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); |
98 filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); | 105 filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); |
99 filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); | 106 filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); |
100 | 107 |
101 // multiply the size of the source and destination stride by two | 108 // multiply the size of the source and destination stride by two |
102 src_stride = src_pixels_per_line << 1; | 109 src_stride = src_pixels_per_line << 1; |
103 dst_stride = output_pitch << 1; | 110 dst_stride = output_pitch << 1; |
104 for (i = output_height; i > 1; i-=2) { | 111 for (i = output_height; i > 1; i-=2) { |
105 // load the 2 strides of source | 112 // load the 2 strides of source |
106 srcReg32b1 = _mm256_castsi128_si256( | 113 srcReg32b1 = _mm256_castsi128_si256( |
107 _mm_loadu_si128((__m128i *)(src_ptr-3))); | 114 _mm_loadu_si128((const __m128i *)(src_ptr - 3))); |
108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, | 115 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, |
109 _mm_loadu_si128((__m128i *) | 116 _mm_loadu_si128((const __m128i *) |
110 (src_ptr+src_pixels_per_line-3)), 1); | 117 (src_ptr+src_pixels_per_line-3)), 1); |
111 | 118 |
112 // filter the source buffer | 119 // filter the source buffer |
113 srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); | 120 srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); |
114 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); | 121 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); |
115 | 122 |
116 // multiply 2 adjacent elements with the filter and add the result | 123 // multiply 2 adjacent elements with the filter and add the result |
117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); | 124 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); |
118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); | 125 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); |
119 | 126 |
120 // add and saturate the results together | 127 // add and saturate the results together |
121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); | 128 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); |
122 | 129 |
123 // filter the source buffer | 130 // filter the source buffer |
124 srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); | 131 srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); |
125 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); | 132 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); |
126 | 133 |
127 // multiply 2 adjacent elements with the filter and add the result | 134 // multiply 2 adjacent elements with the filter and add the result |
128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); | 135 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); |
129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); | 136 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); |
130 | 137 |
131 // add and saturate the results together | 138 // add and saturate the results together |
132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, | 139 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, |
133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 140 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
134 | 141 |
135 // read the next 16 bytes of each of the 2 rows | 142 // read the next 16 bytes of each of the 2 rows |
136 // (part of which was already covered by the earlier loads) | 143 // (part of which was already covered by the earlier loads) |
137 srcReg32b2 = _mm256_castsi128_si256( | 144 srcReg32b2 = _mm256_castsi128_si256( |
138 _mm_loadu_si128((__m128i *)(src_ptr+5))); | 145 _mm_loadu_si128((const __m128i *)(src_ptr + 5))); |
139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, | 146 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, |
140 _mm_loadu_si128((__m128i *) | 147 _mm_loadu_si128((const __m128i *) |
141 (src_ptr+src_pixels_per_line+5)), 1); | 148 (src_ptr+src_pixels_per_line+5)), 1); |
142 | 149 |
143 // add and saturate the results together | 150 // add and saturate the results together |
144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, | 151 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, |
145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 152 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
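    // The two middle products are folded in as min() first and max() second
    // (with the unrelated load scheduled in between): for the tap magnitudes
    // vp9 uses, this ordering keeps the intermediate signed-saturating adds
    // from clipping the 16-bit sums.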
146 | 153 |
147 // filter the source buffer | 154 // filter the source buffer |
148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); | 155 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); |
149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); | 156 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); |
150 | 157 |
(...skipping 44 matching lines...)
195 _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); | 202 _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); |
196 output_ptr+=dst_stride; | 203 output_ptr+=dst_stride; |
197 } | 204 } |
198 | 205 |
199 // if the number of rows is odd, | 206 // if the number of rows is odd, |
200 // process the remaining single row of 16 bytes | 207 // process the remaining single row of 16 bytes |
201 if (i > 0) { | 208 if (i > 0) { |
202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; | 209 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; |
203 __m128i srcRegFilt2, srcRegFilt3; | 210 __m128i srcRegFilt2, srcRegFilt3; |
204 | 211 |
205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 212 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
206 | 213 |
207 // filter the source buffer | 214 // filter the source buffer |
208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, | 215 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, |
209 _mm256_castsi256_si128(filt1Reg)); | 216 _mm256_castsi256_si128(filt1Reg)); |
210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, | 217 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, |
211 _mm256_castsi256_si128(filt4Reg)); | 218 _mm256_castsi256_si128(filt4Reg)); |
212 | 219 |
213 // multiply 2 adjacent elements with the filter and add the result | 220 // multiply 2 adjacent elements with the filter and add the result |
214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, | 221 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, |
215 _mm256_castsi256_si128(firstFilters)); | 222 _mm256_castsi256_si128(firstFilters)); |
(...skipping 14 matching lines...)
230 _mm256_castsi256_si128(secondFilters)); | 237 _mm256_castsi256_si128(secondFilters)); |
231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, | 238 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
232 _mm256_castsi256_si128(thirdFilters)); | 239 _mm256_castsi256_si128(thirdFilters)); |
233 | 240 |
234 // add and saturate the results together | 241 // add and saturate the results together |
235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 242 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
236 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 243 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
237 | 244 |
238 // read the next 16 bytes | 245 // read the next 16 bytes |
239 // (part of which was already covered by the earlier load) | 246 // (part of which was already covered by the earlier load) |
240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); | 247 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); |
241 | 248 |
242 // add and saturate the results together | 249 // add and saturate the results together |
243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 250 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
244 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 251 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
245 | 252 |
246 // filter the source buffer | 253 // filter the source buffer |
247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, | 254 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, |
248 _mm256_castsi256_si128(filt1Reg)); | 255 _mm256_castsi256_si128(filt1Reg)); |
249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, | 256 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, |
250 _mm256_castsi256_si128(filt4Reg)); | 257 _mm256_castsi256_si128(filt4Reg)); |
(...skipping 39 matching lines...)
290 // pack each 16-bit value down to 8 bits; the first lane contains the | 297 // pack each 16-bit value down to 8 bits; the first lane contains the |
291 // first convolve result and the second lane contains the second | 298 // first convolve result and the second lane contains the second |
292 // convolve result | 299 // convolve result |
293 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); | 300 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); |
294 | 301 |
295 // save 16 bytes | 302 // save 16 bytes |
296 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); | 303 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); |
297 } | 304 } |
298 } | 305 } |
299 | 306 |
300 void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, | 307 static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr, |
301 unsigned int src_pitch, | 308 ptrdiff_t src_pitch, |
302 unsigned char *output_ptr, | 309 uint8_t *output_ptr, |
303 unsigned int out_pitch, | 310 ptrdiff_t out_pitch, |
304 unsigned int output_height, | 311 uint32_t output_height, |
305 int16_t *filter) { | 312 const int16_t *filter) { |
306 __m128i filtersReg; | 313 __m128i filtersReg; |
307 __m256i addFilterReg64; | 314 __m256i addFilterReg64; |
308 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; | 315 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; |
309 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; | 316 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; |
310 __m256i srcReg32b11, srcReg32b12, filtersReg32; | 317 __m256i srcReg32b11, srcReg32b12, filtersReg32; |
311 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; | 318 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; |
312 unsigned int i; | 319 unsigned int i; |
313 unsigned int src_stride, dst_stride; | 320 ptrdiff_t src_stride, dst_stride; |
314 | 321 |
315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 322 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); | 323 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); |
317 filtersReg = _mm_loadu_si128((__m128i *)filter); | 324 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
318 // convert the 16-bit (short) taps to 8-bit (byte) and replicate the | 325 // convert the 16-bit (short) taps to 8-bit (byte) and replicate the |
319 // data in both lanes of the 128-bit register. | 326 // data in both lanes of the 128-bit register. |
320 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); | 327 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); |
321 // have the same data in both lanes of a 256 bit register | 328 // have the same data in both lanes of a 256 bit register |
322 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); | 329 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); |
323 | 330 |
324 // duplicate only the first 16 bits (first and second byte) | 331 // duplicate only the first 16 bits (first and second byte) |
325 // across 256 bit register | 332 // across 256 bit register |
326 firstFilters = _mm256_shuffle_epi8(filtersReg32, | 333 firstFilters = _mm256_shuffle_epi8(filtersReg32, |
327 _mm256_set1_epi16(0x100u)); | 334 _mm256_set1_epi16(0x100u)); |
328 // duplicate only the second 16 bits (third and fourth byte) | 335 // duplicate only the second 16 bits (third and fourth byte) |
329 // across 256 bit register | 336 // across 256 bit register |
330 secondFilters = _mm256_shuffle_epi8(filtersReg32, | 337 secondFilters = _mm256_shuffle_epi8(filtersReg32, |
331 _mm256_set1_epi16(0x302u)); | 338 _mm256_set1_epi16(0x302u)); |
332 // duplicate only the third 16 bits (fifth and sixth byte) | 339 // duplicate only the third 16 bits (fifth and sixth byte) |
333 // across 256 bit register | 340 // across 256 bit register |
334 thirdFilters = _mm256_shuffle_epi8(filtersReg32, | 341 thirdFilters = _mm256_shuffle_epi8(filtersReg32, |
335 _mm256_set1_epi16(0x504u)); | 342 _mm256_set1_epi16(0x504u)); |
336 // duplicate only the fourth 16 bits (seventh and eighth byte) | 343 // duplicate only the fourth 16 bits (seventh and eighth byte) |
337 // across 256 bit register | 344 // across 256 bit register |
338 forthFilters = _mm256_shuffle_epi8(filtersReg32, | 345 forthFilters = _mm256_shuffle_epi8(filtersReg32, |
339 _mm256_set1_epi16(0x706u)); | 346 _mm256_set1_epi16(0x706u)); |
340 | 347 |
341 // multiply the size of the source and destination stride by two | 348 // multiply the size of the source and destination stride by two |
342 src_stride = src_pitch << 1; | 349 src_stride = src_pitch << 1; |
343 dst_stride = out_pitch << 1; | 350 dst_stride = out_pitch << 1; |
344 | 351 |
345 // load 16 bytes 7 times, advancing by src_pitch each time | 352 // load 16 bytes 7 times, advancing by src_pitch each time |
346 srcReg32b1 = _mm256_castsi128_si256( | 353 srcReg32b1 = _mm256_castsi128_si256( |
347 _mm_loadu_si128((__m128i *)(src_ptr))); | 354 _mm_loadu_si128((const __m128i *)(src_ptr))); |
348 srcReg32b2 = _mm256_castsi128_si256( | 355 srcReg32b2 = _mm256_castsi128_si256( |
349 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); | 356 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); |
350 srcReg32b3 = _mm256_castsi128_si256( | 357 srcReg32b3 = _mm256_castsi128_si256( |
351 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); | 358 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); |
352 srcReg32b4 = _mm256_castsi128_si256( | 359 srcReg32b4 = _mm256_castsi128_si256( |
353 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); | 360 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); |
354 srcReg32b5 = _mm256_castsi128_si256( | 361 srcReg32b5 = _mm256_castsi128_si256( |
355 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); | 362 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); |
356 srcReg32b6 = _mm256_castsi128_si256( | 363 srcReg32b6 = _mm256_castsi128_si256( |
357 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); | 364 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); |
358 srcReg32b7 = _mm256_castsi128_si256( | 365 srcReg32b7 = _mm256_castsi128_si256( |
359 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); | 366 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); |
360 | 367 |
361 // place each pair of consecutive loads in the same 256-bit register | 368 // place each pair of consecutive loads in the same 256-bit register |
362 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, | 369 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, |
363 _mm256_castsi256_si128(srcReg32b2), 1); | 370 _mm256_castsi256_si128(srcReg32b2), 1); |
364 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, | 371 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, |
365 _mm256_castsi256_si128(srcReg32b3), 1); | 372 _mm256_castsi256_si128(srcReg32b3), 1); |
366 srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, | 373 srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, |
367 _mm256_castsi256_si128(srcReg32b4), 1); | 374 _mm256_castsi256_si128(srcReg32b4), 1); |
368 srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, | 375 srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, |
369 _mm256_castsi256_si128(srcReg32b5), 1); | 376 _mm256_castsi256_si128(srcReg32b5), 1); |
(...skipping 16 matching lines...)
386 srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); | 393 srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); |
387 | 394 |
388 // save | 395 // save |
389 srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); | 396 srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); |
390 | 397 |
391 | 398 |
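  // After the unpacklo/unpackhi merges above, each 256-bit register holds
  // the bytes of two vertically adjacent rows interleaved (A0,B0,A1,B1,...),
  // so one _mm256_maddubs_epi16 in the loop below multiplies each column's
  // two neighboring pixels by a pair of taps and sums them.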
392 for (i = output_height; i > 1; i-=2) { | 399 for (i = output_height; i > 1; i-=2) { |
393 // load the next 2 rows of 16 bytes and keep every two | 400 // load the next 2 rows of 16 bytes and keep every two |
394 // consecutive loads in the same 256-bit register | 401 // consecutive loads in the same 256-bit register |
395 srcReg32b8 = _mm256_castsi128_si256( | 402 srcReg32b8 = _mm256_castsi128_si256( |
396 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); | 403 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); |
397 srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, | 404 srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, |
398 _mm256_castsi256_si128(srcReg32b8), 1); | 405 _mm256_castsi256_si128(srcReg32b8), 1); |
399 srcReg32b9 = _mm256_castsi128_si256( | 406 srcReg32b9 = _mm256_castsi128_si256( |
400 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); | 407 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); |
401 srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, | 408 srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, |
402 _mm256_castsi256_si128(srcReg32b9), 1); | 409 _mm256_castsi256_si128(srcReg32b9), 1); |
403 | 410 |
404 // merge every two consecutive registers | 411 // merge every two consecutive registers |
405 // save | 412 // save |
406 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); | 413 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); |
407 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); | 414 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); |
408 | 415 |
409 // multiply 2 adjacent elements with the filter and add the result | 416 // multiply 2 adjacent elements with the filter and add the result |
410 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); | 417 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); |
(...skipping 58 matching lines...)
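    // slide the 8-row window down by two rows: the merged row pairs from
    // this iteration are reused as the upper taps of the next one, so only
    // two new rows have to be loaded per pass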
469 srcReg32b11 = srcReg32b2; | 476 srcReg32b11 = srcReg32b2; |
470 srcReg32b3 = srcReg32b5; | 477 srcReg32b3 = srcReg32b5; |
471 srcReg32b2 = srcReg32b4; | 478 srcReg32b2 = srcReg32b4; |
472 srcReg32b5 = srcReg32b7; | 479 srcReg32b5 = srcReg32b7; |
473 srcReg32b7 = srcReg32b9; | 480 srcReg32b7 = srcReg32b9; |
474 } | 481 } |
475 if (i > 0) { | 482 if (i > 0) { |
476 __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; | 483 __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; |
477 __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; | 484 __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; |
478 // load the last 16 bytes | 485 // load the last 16 bytes |
479 srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); | 486 srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); |
480 | 487 |
481 // merge the last 2 results together | 488 // merge the last 2 results together |
482 srcRegFilt4 = _mm_unpacklo_epi8( | 489 srcRegFilt4 = _mm_unpacklo_epi8( |
483 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); | 490 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); |
484 srcRegFilt7 = _mm_unpackhi_epi8( | 491 srcRegFilt7 = _mm_unpackhi_epi8( |
485 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); | 492 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); |
486 | 493 |
487 // multiply 2 adjacent elements with the filter and add the result | 494 // multiply 2 adjacent elements with the filter and add the result |
488 srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), | 495 srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), |
489 _mm256_castsi256_si128(firstFilters)); | 496 _mm256_castsi256_si128(firstFilters)); |
(...skipping 45 matching lines...)
535 | 542 |
536 // pack each 16-bit value down to 8 bits; the first lane contains the | 543 // pack each 16-bit value down to 8 bits; the first lane contains the |
537 // first convolve result and the second lane contains the second | 544 // first convolve result and the second lane contains the second |
538 // convolve result | 545 // convolve result |
539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); | 546 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); |
540 | 547 |
541 // save 16 bytes | 548 // save 16 bytes |
542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 549 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
543 } | 550 } |
544 } | 551 } |
| 552 |
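Only the 16-wide 8-tap kernels above get true AVX2 implementations; the
4- and 8-pixel-wide and the bilinear (2-tap) variants are remapped below
onto the existing SSSE3 code, where 256-bit registers offer little benefit
for such narrow blocks.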
| 553 #if HAVE_AVX2 && HAVE_SSSE3 |
| 554 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
| 555 #if ARCH_X86_64 |
| 556 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; |
| 557 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; |
| 558 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; |
| 559 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 |
| 560 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 |
| 561 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 |
| 562 #else // ARCH_X86 |
| 563 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; |
| 564 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; |
| 565 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; |
| 566 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 |
| 567 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 |
| 568 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 |
| 569 #endif // ARCH_X86_64 |
| 570 filter8_1dfunction vp9_filter_block1d16_v2_ssse3; |
| 571 filter8_1dfunction vp9_filter_block1d16_h2_ssse3; |
| 572 filter8_1dfunction vp9_filter_block1d8_v2_ssse3; |
| 573 filter8_1dfunction vp9_filter_block1d8_h2_ssse3; |
| 574 filter8_1dfunction vp9_filter_block1d4_v2_ssse3; |
| 575 filter8_1dfunction vp9_filter_block1d4_h2_ssse3; |
| 576 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 |
| 577 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 |
| 578 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 |
| 579 #define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 |
| 580 #define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 |
| 581 #define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 |
| 582 #define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 |
| 583 // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, |
| 584 // uint8_t *dst, ptrdiff_t dst_stride, |
| 585 // const int16_t *filter_x, int x_step_q4, |
| 586 // const int16_t *filter_y, int y_step_q4, |
| 587 // int w, int h); |
| 588 // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, |
| 589 // uint8_t *dst, ptrdiff_t dst_stride, |
| 590 // const int16_t *filter_x, int x_step_q4, |
| 591 // const int16_t *filter_y, int y_step_q4, |
| 592 // int w, int h); |
| 593 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); |
| 594 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); |
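A minimal usage sketch of the generated horizontal entry point (the wrapper
name and block size are illustrative assumptions): x_step_q4 == 16 selects
the unscaled path, and the y-filter arguments are unused by the horizontal
pass but must still be supplied:

    #include <stddef.h>
    #include <stdint.h>

    // Convolve one 64x64 block horizontally with a single 8-tap filter.
    static void convolve_block_h(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t filter_x[8]) {
      vp9_convolve8_horiz_avx2(src, src_stride, dst, dst_stride,
                               filter_x, 16, filter_x, 16, 64, 64);
    }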
| 595 |
| 596 // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, |
| 597 // uint8_t *dst, ptrdiff_t dst_stride, |
| 598 // const int16_t *filter_x, int x_step_q4, |
| 599 // const int16_t *filter_y, int y_step_q4, |
| 600 // int w, int h); |
| 601 FUN_CONV_2D(, avx2); |
| 602 #endif // HAVE_AVX2 && HAVE_SSSE3 |