| OLD | NEW |
| (Empty) | |
| 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
#include <string.h>
#include <tmmintrin.h>
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
| 14 |
| 15 // filters only for the 4_h8 convolution |
| 16 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { |
| 17 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 |
| 18 }; |
| 19 |
| 20 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { |
| 21 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 |
| 22 }; |
| 23 |
| 24 // filters for 8_h8 and 16_h8 |
| 25 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { |
| 26 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
| 27 }; |
| 28 |
| 29 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { |
| 30 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
| 31 }; |
| 32 |
| 33 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { |
| 34 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
| 35 }; |
| 36 |
| 37 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { |
| 38 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
| 39 }; |
| 40 |
| 41 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, |
| 42 unsigned int src_pixels_per_line, |
| 43 unsigned char *output_ptr, |
| 44 unsigned int output_pitch, |
| 45 unsigned int output_height, |
| 46 int16_t *filter) { |
| 47 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
| 48 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
| 49 __m128i addFilterReg64, filtersReg, srcReg, minReg; |
| 50 unsigned int i; |
| 51 |
| 52 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 53 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); |
| 54 filtersReg = _mm_loadu_si128((__m128i *)filter); |
| 55 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
| 56 // in both lanes of 128 bit register. |
| 57 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 58 |
| 59 // duplicate only the first 16 bits in the filter into the first lane |
| 60 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); |
| 61 // duplicate only the third 16 bit in the filter into the first lane |
| 62 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); |
| 63 // duplicate only the seconds 16 bits in the filter into the second lane |
| 64 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); |
| 65 // duplicate only the forth 16 bits in the filter into the second lane |
| 66 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); |
| 67 |
| 68 // loading the local filters |
| 69 thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); |
| 70 forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); |
| 71 |
| 72 for (i = 0; i < output_height; i++) { |
| 73 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); |
| 74 |
| 75 // filter the source buffer |
| 76 srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); |
| 77 srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); |
| 78 |
| 79 // multiply 2 adjacent elements with the filter and add the result |
| 80 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); |
| 81 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
| 82 |
| 83 // extract the higher half of the lane |
| 84 srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); |
| 85 srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); |
| 86 |
| 87 minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); |
| 88 |
| 89 // add and saturate all the results together |
| 90 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); |
| 91 srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); |
| 92 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); |
| 93 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); |
| 94 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); |
| 95 |
| 96 // shift by 7 bit each 16 bits |
| 97 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); |
| 98 |
| 99 // shrink to 8 bit each 16 bits |
| 100 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); |
| 101 src_ptr+=src_pixels_per_line; |
| 102 |
| 103 // save only 4 bytes |
| 104 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); |
| 105 |
| 106 output_ptr+=output_pitch; |
| 107 } |
| 108 } |
| 109 |
| 110 void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, |
| 111 unsigned int src_pixels_per_line, |
| 112 unsigned char *output_ptr, |
| 113 unsigned int output_pitch, |
| 114 unsigned int output_height, |
| 115 int16_t *filter) { |
| 116 __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; |
| 117 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
| 118 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
| 119 __m128i addFilterReg64, filtersReg, minReg; |
| 120 unsigned int i; |
| 121 |
| 122 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 123 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
| 124 filtersReg = _mm_loadu_si128((__m128i *)filter); |
| 125 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
| 126 // in both lanes of 128 bit register. |
| 127 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 128 |
| 129 // duplicate only the first 16 bits (first and second byte) |
| 130 // across 128 bit register |
| 131 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
| 132 // duplicate only the second 16 bits (third and forth byte) |
| 133 // across 128 bit register |
| 134 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
| 135 // duplicate only the third 16 bits (fifth and sixth byte) |
| 136 // across 128 bit register |
| 137 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
| 138 // duplicate only the forth 16 bits (seventh and eighth byte) |
| 139 // across 128 bit register |
| 140 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
| 141 |
| 142 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
| 143 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
| 144 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
| 145 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
| 146 |
| 147 for (i = 0; i < output_height; i++) { |
| 148 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); |
| 149 |
| 150 // filter the source buffer |
| 151 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); |
| 152 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); |
| 153 |
| 154 // multiply 2 adjacent elements with the filter and add the result |
| 155 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); |
| 156 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
| 157 |
| 158 // filter the source buffer |
| 159 srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); |
| 160 srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); |
| 161 |
| 162 // multiply 2 adjacent elements with the filter and add the result |
| 163 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); |
| 164 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); |
| 165 |
| 166 // add and saturate all the results together |
| 167 minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); |
| 168 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); |
| 169 |
| 170 srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); |
| 171 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); |
| 172 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); |
| 173 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); |
| 174 |
| 175 // shift by 7 bit each 16 bits |
| 176 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); |
| 177 |
| 178 // shrink to 8 bit each 16 bits |
| 179 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); |
| 180 |
| 181 src_ptr+=src_pixels_per_line; |
| 182 |
| 183 // save only 8 bytes |
| 184 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
| 185 |
| 186 output_ptr+=output_pitch; |
| 187 } |
| 188 } |
| 189 |
| 190 void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, |
| 191 unsigned int src_pixels_per_line, |
| 192 unsigned char *output_ptr, |
| 193 unsigned int output_pitch, |
| 194 unsigned int output_height, |
| 195 int16_t *filter) { |
| 196 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; |
| 197 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
| 198 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
| 199 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; |
| 200 unsigned int i; |
| 201 |
| 202 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 203 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
| 204 filtersReg = _mm_loadu_si128((__m128i *)filter); |
| 205 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
| 206 // in both lanes of 128 bit register. |
| 207 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 208 |
| 209 // duplicate only the first 16 bits (first and second byte) |
| 210 // across 128 bit register |
| 211 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
| 212 // duplicate only the second 16 bits (third and forth byte) |
| 213 // across 128 bit register |
| 214 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
| 215 // duplicate only the third 16 bits (fifth and sixth byte) |
| 216 // across 128 bit register |
| 217 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
| 218 // duplicate only the forth 16 bits (seventh and eighth byte) |
| 219 // across 128 bit register |
| 220 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
| 221 |
| 222 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
| 223 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
| 224 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
| 225 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
| 226 |
| 227 for (i = 0; i < output_height; i++) { |
| 228 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); |
| 229 |
| 230 // filter the source buffer |
| 231 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); |
| 232 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); |
| 233 |
| 234 // multiply 2 adjacent elements with the filter and add the result |
| 235 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); |
| 236 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
| 237 |
| 238 // add and saturate the results together |
| 239 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); |
| 240 |
| 241 // filter the source buffer |
| 242 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); |
| 243 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); |
| 244 |
| 245 // multiply 2 adjacent elements with the filter and add the result |
| 246 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); |
| 247 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
| 248 |
| 249 // add and saturate the results together |
| 250 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
| 251 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
| 252 |
| 253 // reading the next 16 bytes. |
| 254 // (part of it was being read by earlier read) |
| 255 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); |
| 256 |
| 257 // add and saturate the results together |
| 258 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
| 259 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
| 260 |
| 261 // filter the source buffer |
| 262 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); |
| 263 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); |
| 264 |
| 265 // multiply 2 adjacent elements with the filter and add the result |
| 266 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); |
| 267 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
| 268 |
| 269 // add and saturate the results together |
| 270 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); |
| 271 |
| 272 // filter the source buffer |
| 273 srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); |
| 274 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); |
| 275 |
| 276 // multiply 2 adjacent elements with the filter and add the result |
| 277 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); |
| 278 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
| 279 |
| 280 // add and saturate the results together |
| 281 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
| 282 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
| 283 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
| 284 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
| 285 |
| 286 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); |
| 287 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); |
| 288 |
| 289 // shift by 7 bit each 16 bit |
| 290 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); |
| 291 srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); |
| 292 |
| 293 // shrink to 8 bit each 16 bits, the first lane contain the first |
| 294 // convolve result and the second lane contain the second convolve |
| 295 // result |
| 296 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); |
| 297 |
| 298 src_ptr+=src_pixels_per_line; |
| 299 |
| 300 // save 16 bytes |
| 301 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); |
| 302 |
| 303 output_ptr+=output_pitch; |
| 304 } |
| 305 } |
| 306 |
// Vertical 8-tap convolution, 8 output pixels per row (SSSE3).
//
// Output row r is computed from the eight source rows r .. r + 7 (8 bytes
// are read from each), so the caller supplies src_ptr already positioned on
// the first row of the filter window. Sums are rounded with (sum + 64) >> 7,
// clamped to 0..255 and 8 bytes are written per output row.
//
//   src_ptr        first row of the filter window for the first output row
//   src_pitch      source stride in bytes
//   output_ptr     destination; no alignment is required
//   out_pitch      destination stride in bytes
//   output_height  number of rows to produce
//   filter         8 int16_t taps; they are narrowed to int8 with signed
//                  saturation, so a tap of 128 is applied as 127
void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         int16_t *filter) {
  // 64 in every 16-bit lane: rounding term for the final >> 7
  const __m128i rounding = _mm_set1_epi16(64);
  __m128i taps, taps01, taps23, taps45, taps67;
  unsigned int row;

  // narrow the taps to 8 bit; both 64-bit halves then hold all 8 taps
  taps = _mm_loadu_si128((__m128i *)filter);
  taps = _mm_packs_epi16(taps, taps);

  // broadcast each pair of 8-bit taps across all eight 16-bit lanes
  taps01 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0100));
  taps23 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0302));
  taps45 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0504));
  taps67 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0706));

  for (row = 0; row < output_height; ++row) {
    // interleave vertically adjacent rows so each 16-bit lane holds the two
    // bytes one tap pair needs for one output pixel
    const __m128i rows01 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((__m128i *)(src_ptr)),
        _mm_loadl_epi64((__m128i *)(src_ptr + src_pitch)));
    const __m128i rows23 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((__m128i *)(src_ptr + src_pitch * 2)),
        _mm_loadl_epi64((__m128i *)(src_ptr + src_pitch * 3)));
    const __m128i rows45 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((__m128i *)(src_ptr + src_pitch * 4)),
        _mm_loadl_epi64((__m128i *)(src_ptr + src_pitch * 5)));
    const __m128i rows67 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((__m128i *)(src_ptr + src_pitch * 6)),
        _mm_loadl_epi64((__m128i *)(src_ptr + src_pitch * 7)));

    // per-pixel partial sums, one register per tap pair
    const __m128i sum01 = _mm_maddubs_epi16(rows01, taps01);
    const __m128i sum23 = _mm_maddubs_epi16(rows23, taps23);
    const __m128i sum45 = _mm_maddubs_epi16(rows45, taps45);
    const __m128i sum67 = _mm_maddubs_epi16(rows67, taps67);
    __m128i acc;

    // Saturating accumulation: outer tap pairs first, then the smaller and
    // finally the larger centre pair, so an intermediate sum does not clip
    // before the remaining terms are added.
    acc = _mm_adds_epi16(sum01, sum67);
    acc = _mm_adds_epi16(acc, _mm_min_epi16(sum45, sum23));
    acc = _mm_adds_epi16(acc, _mm_max_epi16(sum45, sum23));
    acc = _mm_adds_epi16(acc, rounding);

    // scale back and clamp to unsigned 8 bit
    acc = _mm_srai_epi16(acc, 7);
    acc = _mm_packus_epi16(acc, acc);

    // store only the 8 result bytes
    _mm_storel_epi64((__m128i *)output_ptr, acc);

    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}
| 384 |
// Vertical 8-tap convolution, 16 output pixels per row (SSSE3).
//
// Output row r is computed from the eight source rows r .. r + 7 (16 bytes
// are read from each), so the caller supplies src_ptr already positioned on
// the first row of the filter window. Sums are rounded with (sum + 64) >> 7,
// clamped to 0..255 and the 16 results are written with one aligned store.
//
//   src_ptr        first row of the filter window for the first output row
//   src_pitch      source stride in bytes
//   output_ptr     destination; written with _mm_store_si128, so every row
//                  address must be 16-byte aligned
//   out_pitch      destination stride in bytes
//   output_height  number of rows to produce
//   filter         8 int16_t taps; they are narrowed to int8 with signed
//                  saturation, so a tap of 128 is applied as 127
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  // 64 in every 16-bit lane: rounding term for the final >> 7
  const __m128i rounding = _mm_set1_epi16(64);
  __m128i taps, taps01, taps23, taps45, taps67;
  unsigned int row;

  // narrow the taps to 8 bit; both 64-bit halves then hold all 8 taps
  taps = _mm_loadu_si128((__m128i *)filter);
  taps = _mm_packs_epi16(taps, taps);

  // broadcast each pair of 8-bit taps across all eight 16-bit lanes
  taps01 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0100));
  taps23 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0302));
  taps45 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0504));
  taps67 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0706));

  for (row = 0; row < output_height; ++row) {
    __m128i upper, lower, acc_lo, acc_hi;
    __m128i sum23_lo, sum23_hi, sum45_lo, sum45_hi;

    // outer tap pairs: rows 0/1 with taps 0/1 ...
    upper = _mm_loadu_si128((__m128i *)(src_ptr));
    lower = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
    acc_lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(upper, lower), taps01);
    acc_hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(upper, lower), taps01);

    // ... plus rows 6/7 with taps 6/7
    upper = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
    lower = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
    acc_lo = _mm_adds_epi16(
        acc_lo, _mm_maddubs_epi16(_mm_unpacklo_epi8(upper, lower), taps67));
    acc_hi = _mm_adds_epi16(
        acc_hi, _mm_maddubs_epi16(_mm_unpackhi_epi8(upper, lower), taps67));

    // centre tap pairs: rows 2/3 with taps 2/3
    upper = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
    lower = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
    sum23_lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(upper, lower), taps23);
    sum23_hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(upper, lower), taps23);

    // centre tap pairs: rows 4/5 with taps 4/5
    upper = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
    lower = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
    sum45_lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(upper, lower), taps45);
    sum45_hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(upper, lower), taps45);

    // Saturating accumulation: the smaller centre pair is added before the
    // larger one, so an intermediate sum does not clip before the remaining
    // terms are added.
    acc_lo = _mm_adds_epi16(acc_lo, _mm_min_epi16(sum23_lo, sum45_lo));
    acc_hi = _mm_adds_epi16(acc_hi, _mm_min_epi16(sum23_hi, sum45_hi));
    acc_lo = _mm_adds_epi16(acc_lo, _mm_max_epi16(sum23_lo, sum45_lo));
    acc_hi = _mm_adds_epi16(acc_hi, _mm_max_epi16(sum23_hi, sum45_hi));

    // round and scale back
    acc_lo = _mm_srai_epi16(_mm_adds_epi16(acc_lo, rounding), 7);
    acc_hi = _mm_srai_epi16(_mm_adds_epi16(acc_hi, rounding), 7);

    // clamp to unsigned 8 bit: low 8 bytes = pixels 0..7, high 8 = 8..15
    _mm_store_si128((__m128i *)output_ptr, _mm_packus_epi16(acc_lo, acc_hi));

    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}
| OLD | NEW |