| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 185 matching lines...) | |
| 196 | 196 |
| 197 src_ptr+=src_pixels_per_line; | 197 src_ptr+=src_pixels_per_line; |
| 198 | 198 |
| 199 // save only 8 bytes | 199 // save only 8 bytes |
| 200 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 200 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
| 201 | 201 |
| 202 output_ptr+=output_pitch; | 202 output_ptr+=output_pitch; |
| 203 } | 203 } |
| 204 } | 204 } |
| 205 | 205 |
| 206 #if ARCH_X86_64 | |
| 207 static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, | |
| 208 ptrdiff_t src_pixels_per_line, | |
| 209 uint8_t *output_ptr, | |
| 210 ptrdiff_t output_pitch, | |
| 211 uint32_t output_height, | |
| 212 const int16_t *filter) { | |
| 213 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; | |
| 214 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | |
| 215 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | |
| 216 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; | |
| 217 unsigned int i; | |
| 218 | |
| 219 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | |
| 220 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | |
| 221 filtersReg = _mm_loadu_si128((const __m128i *)filter); | |
| 222 // convert the 16-bit (short) filter taps to 8-bit (byte) values, keeping the | |
| 223 // same data in both lanes of the 128-bit register. | |
| 224 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | |
| 225 | |
| 226 // duplicate only the first 16 bits (first and second byte) | |
| 227 // across 128 bit register | |
| 228 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | |
| 229 // duplicate only the second 16 bits (third and fourth byte) | |
| 230 // across 128 bit register | |
| 231 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | |
| 232 // duplicate only the third 16 bits (fifth and sixth byte) | |
| 233 // across 128 bit register | |
| 234 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | |
| 235 // duplicate only the fourth 16 bits (seventh and eighth byte) | |
| 236 // across 128 bit register | |
| 237 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | |
| 238 | |
| 239 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | |
| 240 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | |
| 241 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | |
| 242 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | |
| 243 | |
| 244 for (i = 0; i < output_height; i++) { | |
| 245 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); | |
| 246 | |
| 247 // filter the source buffer | |
| 248 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); | |
| 249 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); | |
| 250 | |
| 251 // multiply 2 adjacent elements with the filter and add the result | |
| 252 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); | |
| 253 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); | |
| 254 | |
| 255 // add and saturate the results together | |
| 256 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | |
| 257 | |
| 258 // filter the source buffer | |
| 259 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); | |
| 260 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); | |
| 261 | |
| 262 // multiply 2 adjacent elements with the filter and add the result | |
| 263 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); | |
| 264 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | |
| 265 | |
| 266 // add and saturate the results together | |
| 267 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | |
| 268 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | |
| 269 | |
| 270 // read the next 16 bytes | |
| 271 // (part of this range was already covered by the earlier read) | |
| 272 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); | |
| 273 | |
| 274 // add and saturate the results together | |
| 275 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | |
| 276 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | |
| 277 | |
| 278 // filter the source buffer | |
| 279 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); | |
| 280 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); | |
| 281 | |
| 282 // multiply 2 adjacent elements with the filter and add the result | |
| 283 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); | |
| 284 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); | |
| 285 | |
| 286 // add and saturate the results together | |
| 287 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); | |
| 288 | |
| 289 // filter the source buffer | |
| 290 srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); | |
| 291 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); | |
| 292 | |
| 293 // multiply 2 adjacent elements with the filter and add the result | |
| 294 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); | |
| 295 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | |
| 296 | |
| 297 // add and saturate the results together | |
| 298 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, | |
| 299 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | |
| 300 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, | |
| 301 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | |
| 302 | |
| 303 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); | |
| 304 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); | |
| 305 | |
| 306 // shift each 16-bit value right by 7 bits | |
| 307 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); | |
| 308 srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); | |
| 309 | |
| 310 // shrink each 16-bit value to 8 bits; the first lane contains the first | |
| 311 // convolve result and the second lane contains the second convolve | |
| 312 // result | |
| 313 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); | |
| 314 | |
| 315 src_ptr+=src_pixels_per_line; | |
| 316 | |
| 317 // save 16 bytes | |
| 318 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); | |
| 319 | |
| 320 output_ptr+=output_pitch; | |
| 321 } | |
| 322 } | |
| 323 #endif // ARCH_X86_64 | |
| 324 | |
| 325 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, | 206 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, |
| 326 ptrdiff_t src_pitch, | 207 ptrdiff_t src_pitch, |
| 327 uint8_t *output_ptr, | 208 uint8_t *output_ptr, |
| 328 ptrdiff_t out_pitch, | 209 ptrdiff_t out_pitch, |
| 329 uint32_t output_height, | 210 uint32_t output_height, |
| 330 const int16_t *filter) { | 211 const int16_t *filter) { |
| 331 __m128i addFilterReg64, filtersReg, minReg; | 212 __m128i addFilterReg64, filtersReg, minReg; |
| 332 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 213 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
| 333 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; | 214 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; |
| 334 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; | 215 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; |
| (...skipping 185 matching lines...) | |
| 520 srcReg7 = srcReg8; | 401 srcReg7 = srcReg8; |
| 521 | 402 |
| 522 // save 16 bytes convolve result | 403 // save 16 bytes convolve result |
| 523 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 404 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
| 524 | 405 |
| 525 output_ptr+=out_pitch; | 406 output_ptr+=out_pitch; |
| 526 } | 407 } |
| 527 } | 408 } |
| 528 #endif // ARCH_X86_64 | 409 #endif // ARCH_X86_64 |
| 529 | 410 |
| 530 #if ARCH_X86_64 | |
| 531 filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3; | |
| 532 filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3; | |
| 533 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; | |
| 534 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; | |
| 535 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; | |
| 536 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; | |
| 537 #define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3 | |
| 538 #define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3 | |
| 539 #define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3 | |
| 540 #define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3 | |
| 541 #define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3 | |
| 542 #else // ARCH_X86 | |
| 543 filter8_1dfunction vpx_filter_block1d16_v8_ssse3; | 411 filter8_1dfunction vpx_filter_block1d16_v8_ssse3; |
| 544 filter8_1dfunction vpx_filter_block1d16_h8_ssse3; | 412 filter8_1dfunction vpx_filter_block1d16_h8_ssse3; |
| 545 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; | 413 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; |
| 546 filter8_1dfunction vpx_filter_block1d8_h8_ssse3; | 414 filter8_1dfunction vpx_filter_block1d8_h8_ssse3; |
| 547 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; | 415 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; |
| 548 filter8_1dfunction vpx_filter_block1d4_h8_ssse3; | 416 filter8_1dfunction vpx_filter_block1d4_h8_ssse3; |
| 549 #endif // ARCH_X86_64 | |
| 550 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; | 417 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; |
| 551 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; | 418 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; |
| 552 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; | 419 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; |
| 553 filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; | 420 filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; |
| 554 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; | 421 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; |
| 555 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; | 422 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; |
| 556 | 423 |
| 557 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; | 424 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; |
| 558 filter8_1dfunction vpx_filter_block1d16_h2_ssse3; | 425 filter8_1dfunction vpx_filter_block1d16_h2_ssse3; |
| 559 filter8_1dfunction vpx_filter_block1d8_v2_ssse3; | 426 filter8_1dfunction vpx_filter_block1d8_v2_ssse3; |
| (...skipping 596 matching lines...) | |
| 1156 // const int16_t *filter_x, int x_step_q4, | 1023 // const int16_t *filter_x, int x_step_q4, |
| 1157 // const int16_t *filter_y, int y_step_q4, | 1024 // const int16_t *filter_y, int y_step_q4, |
| 1158 // int w, int h); | 1025 // int w, int h); |
| 1159 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 1026 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 1160 // uint8_t *dst, ptrdiff_t dst_stride, | 1027 // uint8_t *dst, ptrdiff_t dst_stride, |
| 1161 // const int16_t *filter_x, int x_step_q4, | 1028 // const int16_t *filter_x, int x_step_q4, |
| 1162 // const int16_t *filter_y, int y_step_q4, | 1029 // const int16_t *filter_y, int y_step_q4, |
| 1163 // int w, int h); | 1030 // int w, int h); |
| 1164 FUN_CONV_2D(, ssse3); | 1031 FUN_CONV_2D(, ssse3); |
| 1165 FUN_CONV_2D(avg_ , ssse3); | 1032 FUN_CONV_2D(avg_ , ssse3); |
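
For context on what these SSSE3 paths compute: each output pixel of the 8-tap `h8`/`v8` filters is the convolution of eight source pixels with the 16-bit filter taps, rounded by adding 64 and arithmetic-shifting right by 7 (the `addFilterReg64` and `_mm_srai_epi16(..., 7)` steps above), then saturated to 8 bits (`_mm_packus_epi16`). Below is a minimal scalar sketch of that per-pixel arithmetic; the helper name is hypothetical and only illustrates the math, it is not part of libvpx.

```c
#include <stdint.h>

/* Hypothetical scalar reference for one output pixel of the 8-tap filter.
 * The taps are applied at offsets -3..+4 around src, matching the
 * (src_ptr - 3) load in the horizontal intrinsics above. */
static uint8_t filter8_pixel_sketch(const uint8_t *src, const int16_t *filter) {
  int k, sum = 64;                    /* 64 = rounding term, like addFilterReg64 */
  for (k = 0; k < 8; ++k)
    sum += src[k - 3] * filter[k];    /* 8-tap convolution */
  sum >>= 7;                          /* like _mm_srai_epi16(..., 7) */
  if (sum < 0) sum = 0;               /* clamp to 8 bits, */
  if (sum > 255) sum = 255;           /* like _mm_packus_epi16 */
  return (uint8_t)sum;
}
```

The intrinsic versions reach the same result differently: the 16-bit taps are packed to bytes, pairs of byte products are summed with `_mm_maddubs_epi16`, and the partial sums are combined with saturating 16-bit adds, taking the element-wise min of the two middle terms first and the max last, presumably so that intermediate saturation does not change the final packed value.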