| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 // Due to a header conflict between math.h and intrinsics includes with ceil() |
| 12 // in certain configurations under vs9 this include needs to precede |
| 13 // immintrin.h. |
| 14 #include "./vp9_rtcd.h" |
| 15 |
| 11 #include <immintrin.h> | 16 #include <immintrin.h> |
| 17 |
| 18 #include "vp9/common/x86/convolve.h" |
| 12 #include "vpx_ports/mem.h" | 19 #include "vpx_ports/mem.h" |
| 13 | 20 |
| 14 // filters for 16_h8 and 16_v8 | 21 // filters for 16_h8 and 16_v8 |
| 15 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { | 22 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { |
| 16 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, | 23 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, |
| 17 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | 24 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
| 18 }; | 25 }; |
| 19 | 26 |
| 20 DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { | 27 DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { |
| 21 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, | 28 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, |
| (...skipping 24 matching lines...) Expand all Loading... |
| 46 _mm_broadcastsi128_si256((__m128i const *)&(x)) | 53 _mm_broadcastsi128_si256((__m128i const *)&(x)) |
| 47 # elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 | 54 # elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 |
| 48 # define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) | 55 # define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) |
| 49 # else // gcc > 4.7 | 56 # else // gcc > 4.7 |
| 50 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) | 57 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) |
| 51 # endif // gcc <= 4.6 | 58 # endif // gcc <= 4.6 |
| 52 #else // !(gcc || clang) | 59 #else // !(gcc || clang) |
| 53 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) | 60 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) |
| 54 #endif // __clang__ | 61 #endif // __clang__ |
| 55 | 62 |
| 56 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, | 63 static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr, |
| 57 unsigned int src_pixels_per_line, | 64 ptrdiff_t src_pixels_per_line, |
| 58 unsigned char *output_ptr, | 65 uint8_t *output_ptr, |
| 59 unsigned int output_pitch, | 66 ptrdiff_t output_pitch, |
| 60 unsigned int output_height, | 67 uint32_t output_height, |
| 61 int16_t *filter) { | 68 const int16_t *filter) { |
| 62 __m128i filtersReg; | 69 __m128i filtersReg; |
| 63 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 70 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
| 64 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; | 71 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; |
| 65 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; | 72 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; |
| 66 __m256i srcReg32b1, srcReg32b2, filtersReg32; | 73 __m256i srcReg32b1, srcReg32b2, filtersReg32; |
| 67 unsigned int i; | 74 unsigned int i; |
| 68 unsigned int src_stride, dst_stride; | 75 ptrdiff_t src_stride, dst_stride; |
| 69 | 76 |
| 70 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 77 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 71 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); | 78 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); |
| 72 filtersReg = _mm_loadu_si128((__m128i *)filter); | 79 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
| 73 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 80 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
| 74 // in both lanes of 128 bit register. | 81 // in both lanes of 128 bit register. |
| 75 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 82 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 76 // have the same data in both lanes of a 256 bit register | 83 // have the same data in both lanes of a 256 bit register |
| 77 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); | 84 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); |
| 78 | 85 |
| 79 // duplicate only the first 16 bits (first and second byte) | 86 // duplicate only the first 16 bits (first and second byte) |
| 80 // across 256 bit register | 87 // across 256 bit register |
| 81 firstFilters = _mm256_shuffle_epi8(filtersReg32, | 88 firstFilters = _mm256_shuffle_epi8(filtersReg32, |
| 82 _mm256_set1_epi16(0x100u)); | 89 _mm256_set1_epi16(0x100u)); |
| (...skipping 14 matching lines...) Expand all Loading... |
| 97 filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); | 104 filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); |
| 98 filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); | 105 filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); |
| 99 filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); | 106 filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); |
| 100 | 107 |
| 101 // multiple the size of the source and destination stride by two | 108 // multiple the size of the source and destination stride by two |
| 102 src_stride = src_pixels_per_line << 1; | 109 src_stride = src_pixels_per_line << 1; |
| 103 dst_stride = output_pitch << 1; | 110 dst_stride = output_pitch << 1; |
| 104 for (i = output_height; i > 1; i-=2) { | 111 for (i = output_height; i > 1; i-=2) { |
| 105 // load the 2 strides of source | 112 // load the 2 strides of source |
| 106 srcReg32b1 = _mm256_castsi128_si256( | 113 srcReg32b1 = _mm256_castsi128_si256( |
| 107 _mm_loadu_si128((__m128i *)(src_ptr-3))); | 114 _mm_loadu_si128((const __m128i *)(src_ptr - 3))); |
| 108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, | 115 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, |
| 109 _mm_loadu_si128((__m128i *) | 116 _mm_loadu_si128((const __m128i *) |
| 110 (src_ptr+src_pixels_per_line-3)), 1); | 117 (src_ptr+src_pixels_per_line-3)), 1); |
| 111 | 118 |
| 112 // filter the source buffer | 119 // filter the source buffer |
| 113 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); | 120 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); |
| 114 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); | 121 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); |
| 115 | 122 |
| 116 // multiply 2 adjacent elements with the filter and add the result | 123 // multiply 2 adjacent elements with the filter and add the result |
| 117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); | 124 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); |
| 118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); | 125 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); |
| 119 | 126 |
| 120 // add and saturate the results together | 127 // add and saturate the results together |
| 121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); | 128 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); |
| 122 | 129 |
| 123 // filter the source buffer | 130 // filter the source buffer |
| 124 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); | 131 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); |
| 125 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); | 132 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); |
| 126 | 133 |
| 127 // multiply 2 adjacent elements with the filter and add the result | 134 // multiply 2 adjacent elements with the filter and add the result |
| 128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); | 135 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); |
| 129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); | 136 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); |
| 130 | 137 |
| 131 // add and saturate the results together | 138 // add and saturate the results together |
| 132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, | 139 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, |
| 133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 140 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
| 134 | 141 |
| 135 // reading 2 strides of the next 16 bytes | 142 // reading 2 strides of the next 16 bytes |
| 136 // (part of it was being read by earlier read) | 143 // (part of it was being read by earlier read) |
| 137 srcReg32b2 = _mm256_castsi128_si256( | 144 srcReg32b2 = _mm256_castsi128_si256( |
| 138 _mm_loadu_si128((__m128i *)(src_ptr+5))); | 145 _mm_loadu_si128((const __m128i *)(src_ptr + 5))); |
| 139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, | 146 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, |
| 140 _mm_loadu_si128((__m128i *) | 147 _mm_loadu_si128((const __m128i *) |
| 141 (src_ptr+src_pixels_per_line+5)), 1); | 148 (src_ptr+src_pixels_per_line+5)), 1); |
| 142 | 149 |
| 143 // add and saturate the results together | 150 // add and saturate the results together |
| 144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, | 151 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, |
| 145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 152 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
| 146 | 153 |
| 147 // filter the source buffer | 154 // filter the source buffer |
| 148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); | 155 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); |
| 149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); | 156 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); |
| 150 | 157 |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 195 _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); | 202 _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); |
| 196 output_ptr+=dst_stride; | 203 output_ptr+=dst_stride; |
| 197 } | 204 } |
| 198 | 205 |
| 199 // if the number of strides is odd. | 206 // if the number of strides is odd. |
| 200 // process only 16 bytes | 207 // process only 16 bytes |
| 201 if (i > 0) { | 208 if (i > 0) { |
| 202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; | 209 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; |
| 203 __m128i srcRegFilt2, srcRegFilt3; | 210 __m128i srcRegFilt2, srcRegFilt3; |
| 204 | 211 |
| 205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 212 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
| 206 | 213 |
| 207 // filter the source buffer | 214 // filter the source buffer |
| 208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, | 215 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, |
| 209 _mm256_castsi256_si128(filt1Reg)); | 216 _mm256_castsi256_si128(filt1Reg)); |
| 210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, | 217 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, |
| 211 _mm256_castsi256_si128(filt4Reg)); | 218 _mm256_castsi256_si128(filt4Reg)); |
| 212 | 219 |
| 213 // multiply 2 adjacent elements with the filter and add the result | 220 // multiply 2 adjacent elements with the filter and add the result |
| 214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, | 221 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, |
| 215 _mm256_castsi256_si128(firstFilters)); | 222 _mm256_castsi256_si128(firstFilters)); |
| (...skipping 14 matching lines...) Expand all Loading... |
| 230 _mm256_castsi256_si128(secondFilters)); | 237 _mm256_castsi256_si128(secondFilters)); |
| 231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, | 238 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
| 232 _mm256_castsi256_si128(thirdFilters)); | 239 _mm256_castsi256_si128(thirdFilters)); |
| 233 | 240 |
| 234 // add and saturate the results together | 241 // add and saturate the results together |
| 235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 242 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
| 236 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 243 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
| 237 | 244 |
| 238 // reading the next 16 bytes | 245 // reading the next 16 bytes |
| 239 // (part of it was being read by earlier read) | 246 // (part of it was being read by earlier read) |
| 240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); | 247 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); |
| 241 | 248 |
| 242 // add and saturate the results together | 249 // add and saturate the results together |
| 243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 250 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
| 244 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 251 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
| 245 | 252 |
| 246 // filter the source buffer | 253 // filter the source buffer |
| 247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, | 254 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, |
| 248 _mm256_castsi256_si128(filt1Reg)); | 255 _mm256_castsi256_si128(filt1Reg)); |
| 249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, | 256 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, |
| 250 _mm256_castsi256_si128(filt4Reg)); | 257 _mm256_castsi256_si128(filt4Reg)); |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 290 // shrink to 8 bit each 16 bits, the first lane contain the first | 297 // shrink to 8 bit each 16 bits, the first lane contain the first |
| 291 // convolve result and the second lane contain the second convolve | 298 // convolve result and the second lane contain the second convolve |
| 292 // result | 299 // result |
| 293 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); | 300 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); |
| 294 | 301 |
| 295 // save 16 bytes | 302 // save 16 bytes |
| 296 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); | 303 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); |
| 297 } | 304 } |
| 298 } | 305 } |
| 299 | 306 |
| 300 void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, | 307 static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr, |
| 301 unsigned int src_pitch, | 308 ptrdiff_t src_pitch, |
| 302 unsigned char *output_ptr, | 309 uint8_t *output_ptr, |
| 303 unsigned int out_pitch, | 310 ptrdiff_t out_pitch, |
| 304 unsigned int output_height, | 311 uint32_t output_height, |
| 305 int16_t *filter) { | 312 const int16_t *filter) { |
| 306 __m128i filtersReg; | 313 __m128i filtersReg; |
| 307 __m256i addFilterReg64; | 314 __m256i addFilterReg64; |
| 308 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; | 315 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; |
| 309 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; | 316 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; |
| 310 __m256i srcReg32b11, srcReg32b12, filtersReg32; | 317 __m256i srcReg32b11, srcReg32b12, filtersReg32; |
| 311 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; | 318 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; |
| 312 unsigned int i; | 319 unsigned int i; |
| 313 unsigned int src_stride, dst_stride; | 320 ptrdiff_t src_stride, dst_stride; |
| 314 | 321 |
| 315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 322 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); | 323 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); |
| 317 filtersReg = _mm_loadu_si128((__m128i *)filter); | 324 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
| 318 // converting the 16 bit (short) to 8 bit (byte) and have the | 325 // converting the 16 bit (short) to 8 bit (byte) and have the |
| 319 // same data in both lanes of 128 bit register. | 326 // same data in both lanes of 128 bit register. |
| 320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 327 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 321 // have the same data in both lanes of a 256 bit register | 328 // have the same data in both lanes of a 256 bit register |
| 322 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); | 329 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); |
| 323 | 330 |
| 324 // duplicate only the first 16 bits (first and second byte) | 331 // duplicate only the first 16 bits (first and second byte) |
| 325 // across 256 bit register | 332 // across 256 bit register |
| 326 firstFilters = _mm256_shuffle_epi8(filtersReg32, | 333 firstFilters = _mm256_shuffle_epi8(filtersReg32, |
| 327 _mm256_set1_epi16(0x100u)); | 334 _mm256_set1_epi16(0x100u)); |
| 328 // duplicate only the second 16 bits (third and forth byte) | 335 // duplicate only the second 16 bits (third and forth byte) |
| 329 // across 256 bit register | 336 // across 256 bit register |
| 330 secondFilters = _mm256_shuffle_epi8(filtersReg32, | 337 secondFilters = _mm256_shuffle_epi8(filtersReg32, |
| 331 _mm256_set1_epi16(0x302u)); | 338 _mm256_set1_epi16(0x302u)); |
| 332 // duplicate only the third 16 bits (fifth and sixth byte) | 339 // duplicate only the third 16 bits (fifth and sixth byte) |
| 333 // across 256 bit register | 340 // across 256 bit register |
| 334 thirdFilters = _mm256_shuffle_epi8(filtersReg32, | 341 thirdFilters = _mm256_shuffle_epi8(filtersReg32, |
| 335 _mm256_set1_epi16(0x504u)); | 342 _mm256_set1_epi16(0x504u)); |
| 336 // duplicate only the forth 16 bits (seventh and eighth byte) | 343 // duplicate only the forth 16 bits (seventh and eighth byte) |
| 337 // across 256 bit register | 344 // across 256 bit register |
| 338 forthFilters = _mm256_shuffle_epi8(filtersReg32, | 345 forthFilters = _mm256_shuffle_epi8(filtersReg32, |
| 339 _mm256_set1_epi16(0x706u)); | 346 _mm256_set1_epi16(0x706u)); |
| 340 | 347 |
| 341 // multiple the size of the source and destination stride by two | 348 // multiple the size of the source and destination stride by two |
| 342 src_stride = src_pitch << 1; | 349 src_stride = src_pitch << 1; |
| 343 dst_stride = out_pitch << 1; | 350 dst_stride = out_pitch << 1; |
| 344 | 351 |
| 345 // load 16 bytes 7 times in stride of src_pitch | 352 // load 16 bytes 7 times in stride of src_pitch |
| 346 srcReg32b1 = _mm256_castsi128_si256( | 353 srcReg32b1 = _mm256_castsi128_si256( |
| 347 _mm_loadu_si128((__m128i *)(src_ptr))); | 354 _mm_loadu_si128((const __m128i *)(src_ptr))); |
| 348 srcReg32b2 = _mm256_castsi128_si256( | 355 srcReg32b2 = _mm256_castsi128_si256( |
| 349 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); | 356 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); |
| 350 srcReg32b3 = _mm256_castsi128_si256( | 357 srcReg32b3 = _mm256_castsi128_si256( |
| 351 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); | 358 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); |
| 352 srcReg32b4 = _mm256_castsi128_si256( | 359 srcReg32b4 = _mm256_castsi128_si256( |
| 353 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); | 360 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); |
| 354 srcReg32b5 = _mm256_castsi128_si256( | 361 srcReg32b5 = _mm256_castsi128_si256( |
| 355 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); | 362 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); |
| 356 srcReg32b6 = _mm256_castsi128_si256( | 363 srcReg32b6 = _mm256_castsi128_si256( |
| 357 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); | 364 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); |
| 358 srcReg32b7 = _mm256_castsi128_si256( | 365 srcReg32b7 = _mm256_castsi128_si256( |
| 359 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); | 366 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); |
| 360 | 367 |
| 361 // have each consecutive loads on the same 256 register | 368 // have each consecutive loads on the same 256 register |
| 362 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, | 369 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, |
| 363 _mm256_castsi256_si128(srcReg32b2), 1); | 370 _mm256_castsi256_si128(srcReg32b2), 1); |
| 364 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, | 371 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, |
| 365 _mm256_castsi256_si128(srcReg32b3), 1); | 372 _mm256_castsi256_si128(srcReg32b3), 1); |
| 366 srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, | 373 srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, |
| 367 _mm256_castsi256_si128(srcReg32b4), 1); | 374 _mm256_castsi256_si128(srcReg32b4), 1); |
| 368 srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, | 375 srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, |
| 369 _mm256_castsi256_si128(srcReg32b5), 1); | 376 _mm256_castsi256_si128(srcReg32b5), 1); |
| (...skipping 16 matching lines...) Expand all Loading... |
| 386 srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); | 393 srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); |
| 387 | 394 |
| 388 // save | 395 // save |
| 389 srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); | 396 srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); |
| 390 | 397 |
| 391 | 398 |
| 392 for (i = output_height; i > 1; i-=2) { | 399 for (i = output_height; i > 1; i-=2) { |
| 393 // load the last 2 loads of 16 bytes and have every two | 400 // load the last 2 loads of 16 bytes and have every two |
| 394 // consecutive loads in the same 256 bit register | 401 // consecutive loads in the same 256 bit register |
| 395 srcReg32b8 = _mm256_castsi128_si256( | 402 srcReg32b8 = _mm256_castsi128_si256( |
| 396 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); | 403 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); |
| 397 srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, | 404 srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, |
| 398 _mm256_castsi256_si128(srcReg32b8), 1); | 405 _mm256_castsi256_si128(srcReg32b8), 1); |
| 399 srcReg32b9 = _mm256_castsi128_si256( | 406 srcReg32b9 = _mm256_castsi128_si256( |
| 400 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); | 407 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); |
| 401 srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, | 408 srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, |
| 402 _mm256_castsi256_si128(srcReg32b9), 1); | 409 _mm256_castsi256_si128(srcReg32b9), 1); |
| 403 | 410 |
| 404 // merge every two consecutive registers | 411 // merge every two consecutive registers |
| 405 // save | 412 // save |
| 406 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); | 413 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); |
| 407 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); | 414 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); |
| 408 | 415 |
| 409 // multiply 2 adjacent elements with the filter and add the result | 416 // multiply 2 adjacent elements with the filter and add the result |
| 410 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); | 417 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); |
| (...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 469 srcReg32b11 = srcReg32b2; | 476 srcReg32b11 = srcReg32b2; |
| 470 srcReg32b3 = srcReg32b5; | 477 srcReg32b3 = srcReg32b5; |
| 471 srcReg32b2 = srcReg32b4; | 478 srcReg32b2 = srcReg32b4; |
| 472 srcReg32b5 = srcReg32b7; | 479 srcReg32b5 = srcReg32b7; |
| 473 srcReg32b7 = srcReg32b9; | 480 srcReg32b7 = srcReg32b9; |
| 474 } | 481 } |
| 475 if (i > 0) { | 482 if (i > 0) { |
| 476 __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; | 483 __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; |
| 477 __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; | 484 __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; |
| 478 // load the last 16 bytes | 485 // load the last 16 bytes |
| 479 srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); | 486 srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); |
| 480 | 487 |
| 481 // merge the last 2 results together | 488 // merge the last 2 results together |
| 482 srcRegFilt4 = _mm_unpacklo_epi8( | 489 srcRegFilt4 = _mm_unpacklo_epi8( |
| 483 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); | 490 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); |
| 484 srcRegFilt7 = _mm_unpackhi_epi8( | 491 srcRegFilt7 = _mm_unpackhi_epi8( |
| 485 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); | 492 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); |
| 486 | 493 |
| 487 // multiply 2 adjacent elements with the filter and add the result | 494 // multiply 2 adjacent elements with the filter and add the result |
| 488 srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), | 495 srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), |
| 489 _mm256_castsi256_si128(firstFilters)); | 496 _mm256_castsi256_si128(firstFilters)); |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 535 | 542 |
| 536 // shrink to 8 bit each 16 bits, the first lane contain the first | 543 // shrink to 8 bit each 16 bits, the first lane contain the first |
| 537 // convolve result and the second lane contain the second convolve | 544 // convolve result and the second lane contain the second convolve |
| 538 // result | 545 // result |
| 539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); | 546 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); |
| 540 | 547 |
| 541 // save 16 bytes | 548 // save 16 bytes |
| 542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 549 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
| 543 } | 550 } |
| 544 } | 551 } |
| 552 |
| 553 #if HAVE_AVX2 && HAVE_SSSE3 |
| 554 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
| 555 #if ARCH_X86_64 |
| 556 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; |
| 557 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; |
| 558 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; |
| 559 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 |
| 560 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 |
| 561 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 |
| 562 #else // ARCH_X86 |
| 563 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; |
| 564 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; |
| 565 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; |
| 566 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 |
| 567 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 |
| 568 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 |
| 569 #endif // ARCH_X86_64 |
| 570 filter8_1dfunction vp9_filter_block1d16_v2_ssse3; |
| 571 filter8_1dfunction vp9_filter_block1d16_h2_ssse3; |
| 572 filter8_1dfunction vp9_filter_block1d8_v2_ssse3; |
| 573 filter8_1dfunction vp9_filter_block1d8_h2_ssse3; |
| 574 filter8_1dfunction vp9_filter_block1d4_v2_ssse3; |
| 575 filter8_1dfunction vp9_filter_block1d4_h2_ssse3; |
| 576 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 |
| 577 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 |
| 578 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 |
| 579 #define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 |
| 580 #define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 |
| 581 #define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 |
| 582 #define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 |
| 583 // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, |
| 584 // uint8_t *dst, ptrdiff_t dst_stride, |
| 585 // const int16_t *filter_x, int x_step_q4, |
| 586 // const int16_t *filter_y, int y_step_q4, |
| 587 // int w, int h); |
| 588 // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, |
| 589 // uint8_t *dst, ptrdiff_t dst_stride, |
| 590 // const int16_t *filter_x, int x_step_q4, |
| 591 // const int16_t *filter_y, int y_step_q4, |
| 592 // int w, int h); |
| 593 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); |
| 594 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); |
| 595 |
| 596 // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, |
| 597 // uint8_t *dst, ptrdiff_t dst_stride, |
| 598 // const int16_t *filter_x, int x_step_q4, |
| 599 // const int16_t *filter_y, int y_step_q4, |
| 600 // int w, int h); |
| 601 FUN_CONV_2D(, avx2); |
| 602 #endif // HAVE_AX2 && HAVE_SSSE3 |
| OLD | NEW |