/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "vpx/vpx_integer.h"  // int16_t
#include "vpx_ports/mem.h"    // DECLARE_ALIGNED

// shuffle masks used by the 16_h8 horizontal filter below: each gathers a
// sliding window of adjacent source-byte pairs, duplicated in both lanes
DECLARE_ALIGNED(32, const unsigned char, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};

DECLARE_ALIGNED(32, const unsigned char, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};

DECLARE_ALIGNED(32, const unsigned char, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};

DECLARE_ALIGNED(32, const unsigned char, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
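
/* Per output pixel x, the horizontal filter below computes (a scalar sketch;
 * clip_to_0_255() is illustrative shorthand, not a helper defined here):
 *
 *   sum = 0;
 *   for (k = 0; k < 8; ++k)
 *     sum += src_ptr[x - 3 + k] * filter[k];
 *   output_ptr[x] = clip_to_0_255((sum + 64) >> 7);
 *
 * The VP9 8-tap filters are Q7 fixed point (the taps sum to 128), so adding
 * 64 before the shift rounds to nearest. The masks above gather the byte
 * pairs (x-3, x-2), (x-1, x), (x+1, x+2) and (x+3, x+4) for 8 consecutive
 * pixels at a time so that maddubs can multiply and pair-add in one step.
 * The vertical filter applies the same formula down a column of rows.
 */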

void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int output_pitch,
                                  unsigned int output_height,
                                  int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  unsigned int src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16-bit (short) filter taps to 8 bit (byte) and have the
  // same data in both lanes of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256-bit register
#if defined(__GNUC__)
#if (__GNUC__ < 4 || (__GNUC__ == 4 && \
     (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
  filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
#elif (__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
  filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
#else
  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
#endif
#else
  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
#endif
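  // (the #if chain above accommodates GCC releases that expose the
  // 128-to-256-bit broadcast under the older _mm_broadcastsi128_si256
  // spelling, in one case taking a pointer argument)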

  // duplicate only the first 16 bits (first and second byte)
  // across the 256-bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256-bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));
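
  // each 16-bit lane of firstFilters now holds the signed byte pair (f0, f1);
  // combined with the (s[x], s[x+1]) byte pairs gathered by filt1Reg below,
  // one _mm256_maddubs_epi16 produces f0*s[x] + f1*s[x+1] for each pixel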

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
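
  // each iteration of the loop below filters two rows at once, one row per
  // 128-bit lane of the 256-bit registers, and stores two 16-byte output rows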
  for (i = output_height; i > 1; i -= 2) {
    // load 16 bytes from each of 2 rows of the source
    srcReg32b1 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                 _mm_loadu_si128((__m128i *)
                 (src_ptr + src_pixels_per_line - 3)), 1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // load the next 16 bytes of each of the 2 rows
    srcReg32b2 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                 _mm_loadu_si128((__m128i *)
                 (src_ptr + src_pixels_per_line + 5)), 1);
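
    // the +5 offset: output pixels 8..15 read source bytes x-3..x+4, i.e.
    // bytes 5..19 of the row, so this 16-byte load overlaps the first one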

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
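
    // note: splitting the 3rd/4th tap sums with min/max and adding the
    // smaller one first keeps the running 16-bit total lower, so the
    // saturating adds are less likely to clip an intermediate value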

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

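    // add the Q7 rounding constant: the 8 taps sum to 128, so
    // (sum + 64) >> 7 below rounds the filtered value to nearest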
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink each 16-bit value to 8 bits with unsigned saturation; the first
    // lane contains the first row's result and the second lane the second
    // row's result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
                                           srcRegFilt32b2_1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }

  // if output_height is odd, process the last row by itself
  // using 128-bit registers
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt2Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(secondFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt4Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // load the next 16 bytes (they overlap the earlier load)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt2Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(secondFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt4Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together, min first then max as above
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // add the rounding constant (64) and shift right by 7 bits
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bits with unsigned saturation: the low half holds
    // pixels 0..7 and the high half pixels 8..15 of the row
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  unsigned int src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16-bit (short) filter taps to 8 bit (byte) and have the
  // same data in both lanes of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256-bit register
  // (same compiler-specific broadcast workaround as in the h8 filter above)
#if defined(__GNUC__)
#if (__GNUC__ < 4 || (__GNUC__ == 4 && \
     (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
  filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
#elif (__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
  filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
#else
  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
#endif
#else
  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
#endif

  // duplicate only the first 16 bits (first and second byte)
  // across the 256-bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256-bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes from each of the first 7 rows, one src_pitch apart
  srcReg32b1 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)));

  // place each pair of consecutive rows in the same 256-bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
               _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
               _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
               _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
               _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
               _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
               _mm256_castsi256_si128(srcReg32b7), 1);

  // interleave every two consecutive registers except the last one;
  // the interleaved registers are saved and reused across loop iterations
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
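
  // after the unpacks, each 16-bit lane holds one pixel column's vertical
  // byte pair (row k, row k+1), so maddubs with a duplicated tap pair
  // yields f[k]*p[k] + f[k+1]*p[k+1] for 8 columns per lane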

  for (i = output_height; i > 1; i -= 2) {
    // load 2 new rows and pair each with the row above it,
    // two consecutive rows per 256-bit register
    srcReg32b8 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                 _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                 _mm256_castsi256_si128(srcReg32b9), 1);

    // interleave the two new row pairs (saved for reuse below)
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
    srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together, min first then max,
    // to limit intermediate saturation
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                 _mm256_min_epi16(srcReg32b6, srcReg32b13));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                  _mm256_max_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                 _mm256_max_epi16(srcReg32b6, srcReg32b13));

    // add the Q7 rounding constant (64) and shift right by 7 bits
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // shrink each 16-bit value to 8 bits with unsigned saturation; the first
    // lane contains the first row's result and the second lane the second
    // row's result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr += dst_stride;

    // slide the window down two rows: reuse the interleaved registers
    // in the next iteration
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
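
  // if output_height is odd, filter the one remaining row using 128-bit
  // registers, as in the tail of the horizontal filter above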
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load 16 bytes of the last row
    srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));

    // interleave the last 2 rows together
    srcRegFilt4 = _mm_unpacklo_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                  _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together, min first then max,
    // to limit intermediate saturation
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    // add the Q7 rounding constant (64) and shift right by 7 bits
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink to 8 bits with unsigned saturation: the low half holds
    // pixels 0..7 and the high half pixels 8..15 of the row
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}