OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 294 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
305 output_ptr+=output_pitch; | 305 output_ptr+=output_pitch; |
306 } | 306 } |
307 } | 307 } |
308 | 308 |
/*
 * Vertical 8-tap convolution of an 8-pixel-wide column (SSSE3 intrinsics).
 *
 * Each output row i (0 <= i < output_height) is built from the eight source
 * rows i .. i+7, which are src_pitch bytes apart:
 *
 *   out[i][x] = clamp_u8((sum_k filter[k] * src[i + k][x] + 64) >> 7)
 *
 * src_ptr       first of the (output_height + 7) source rows that are read;
 *               8 bytes are loaded from every row (unaligned loads).
 * src_pitch     distance in bytes between consecutive source rows.
 * output_ptr    destination; 8 bytes are stored per row (no alignment needed).
 * out_pitch     distance in bytes between consecutive destination rows.
 * output_height number of rows to produce; 0 means nothing is read or
 *               written apart from the filter taps.
 * filter        eight 16-bit taps.  They are narrowed to signed 8 bit with
 *               saturation (_mm_packs_epi16), so every tap has to lie in
 *               [-128, 127] to be applied unchanged.
 *
 * All intermediate sums use saturating signed 16-bit arithmetic.
 */
void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         int16_t *filter) {
  const unsigned char *row_ptr;
  __m128i rounding, taps, taps01, taps23, taps45, taps67;
  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
  __m128i pair01, pair23, pair45, pair67, sum;
  unsigned int i;

  // Nothing to produce: do not touch the source rows at all.  Without this
  // guard the seven-row preload below would read rows that are never used.
  if (output_height == 0) return;

  // 64 in every 16-bit lane: the rounding term for the final >> 7
  rounding = _mm_set1_epi16(64);

  // narrow the 16-bit taps to 8 bit; both 64-bit halves hold the same 8 taps
  taps = _mm_loadu_si128((const __m128i *)filter);
  taps = _mm_packs_epi16(taps, taps);

  // broadcast each pair of neighbouring taps to all eight 16-bit lanes
  taps01 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0100));
  taps23 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0302));
  taps45 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0504));
  taps67 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0706));

  // Preload the first seven rows.  row_ptr is walked one row at a time so
  // that no "src_pitch * k" product has to be formed in 32-bit arithmetic;
  // it always points at the most recently loaded row.
  row_ptr = src_ptr;
  row0 = _mm_loadl_epi64((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row1 = _mm_loadl_epi64((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row2 = _mm_loadl_epi64((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row3 = _mm_loadl_epi64((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row4 = _mm_loadl_epi64((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row5 = _mm_loadl_epi64((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row6 = _mm_loadl_epi64((const __m128i *)row_ptr);

  for (i = 0; i < output_height; i++) {
    // only the eighth row of the window is new in each iteration
    row_ptr += src_pitch;
    row7 = _mm_loadl_epi64((const __m128i *)row_ptr);

    // interleave two rows byte-wise, then multiply by the matching tap pair
    // and add the two products: lane x = rowA[x] * tapA + rowB[x] * tapB
    pair01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(row0, row1), taps01);
    pair23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(row2, row3), taps23);
    pair45 = _mm_maddubs_epi16(_mm_unpacklo_epi8(row4, row5), taps45);
    pair67 = _mm_maddubs_epi16(_mm_unpacklo_epi8(row6, row7), taps67);

    // Saturating accumulation.  The two middle terms are added smaller
    // first, larger second; presumably this keeps the saturating adds from
    // clipping an intermediate value when the final sum is still in range
    // (NOTE(review): rationale inferred, confirm).
    sum = _mm_adds_epi16(pair01, pair67);
    sum = _mm_adds_epi16(sum, _mm_min_epi16(pair23, pair45));
    sum = _mm_adds_epi16(sum, _mm_max_epi16(pair23, pair45));
    sum = _mm_adds_epi16(sum, rounding);

    // scale back by 7 bits and clamp every lane to 0..255
    sum = _mm_srai_epi16(sum, 7);
    sum = _mm_packus_epi16(sum, sum);

    // store the 8 result bytes
    _mm_storel_epi64((__m128i *)output_ptr, sum);
    output_ptr += out_pitch;

    // slide the window down by one row
    row0 = row1;
    row1 = row2;
    row2 = row3;
    row3 = row4;
    row4 = row5;
    row5 = row6;
    row6 = row7;
  }
}
386 | 396 |
/*
 * Vertical 8-tap convolution of a 16-pixel-wide column (SSSE3 intrinsics).
 *
 * Each output row i (0 <= i < output_height) is built from the eight source
 * rows i .. i+7, which are src_pitch bytes apart:
 *
 *   out[i][x] = clamp_u8((sum_k filter[k] * src[i + k][x] + 64) >> 7)
 *
 * src_ptr       first of the (output_height + 7) source rows that are read;
 *               16 bytes are loaded from every row (unaligned loads).
 * src_pitch     distance in bytes between consecutive source rows.
 * output_ptr    destination; 16 bytes are stored per row with the aligned
 *               _mm_store_si128, so the address of every destination row
 *               must be 16-byte aligned.
 * out_pitch     distance in bytes between consecutive destination rows.
 * output_height number of rows to produce; 0 means nothing is read or
 *               written apart from the filter taps.
 * filter        eight 16-bit taps.  They are narrowed to signed 8 bit with
 *               saturation (_mm_packs_epi16), so every tap has to lie in
 *               [-128, 127] to be applied unchanged.
 *
 * All intermediate sums use saturating signed 16-bit arithmetic.
 */
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  const unsigned char *row_ptr;
  __m128i rounding, taps, taps01, taps23, taps45, taps67;
  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
  __m128i lo23, lo45, hi23, hi45, sumLo, sumHi;
  unsigned int i;

  // Nothing to produce: do not touch the source rows at all.  Without this
  // guard the seven-row preload below would read rows that are never used.
  if (output_height == 0) return;

  // 64 in every 16-bit lane: the rounding term for the final >> 7
  rounding = _mm_set1_epi16(64);

  // narrow the 16-bit taps to 8 bit; both 64-bit halves hold the same 8 taps
  taps = _mm_loadu_si128((const __m128i *)filter);
  taps = _mm_packs_epi16(taps, taps);

  // broadcast each pair of neighbouring taps to all eight 16-bit lanes
  taps01 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0100));
  taps23 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0302));
  taps45 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0504));
  taps67 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0706));

  // Preload the first seven rows.  row_ptr is walked one row at a time so
  // that no "src_pitch * k" product has to be formed in 32-bit arithmetic;
  // it always points at the most recently loaded row.
  row_ptr = src_ptr;
  row0 = _mm_loadu_si128((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row1 = _mm_loadu_si128((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row2 = _mm_loadu_si128((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row3 = _mm_loadu_si128((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row4 = _mm_loadu_si128((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row5 = _mm_loadu_si128((const __m128i *)row_ptr);
  row_ptr += src_pitch;
  row6 = _mm_loadu_si128((const __m128i *)row_ptr);

  for (i = 0; i < output_height; i++) {
    // only the eighth row of the window is new in each iteration
    row_ptr += src_pitch;
    row7 = _mm_loadu_si128((const __m128i *)row_ptr);

    // Outer taps (0,1) and (6,7).  Two rows are interleaved byte-wise and
    // multiplied by the matching tap pair; "Lo" covers pixels 0..7 and "Hi"
    // pixels 8..15.
    sumLo = _mm_adds_epi16(
        _mm_maddubs_epi16(_mm_unpacklo_epi8(row0, row1), taps01),
        _mm_maddubs_epi16(_mm_unpacklo_epi8(row6, row7), taps67));
    sumHi = _mm_adds_epi16(
        _mm_maddubs_epi16(_mm_unpackhi_epi8(row0, row1), taps01),
        _mm_maddubs_epi16(_mm_unpackhi_epi8(row6, row7), taps67));

    // middle taps (2,3) and (4,5)
    lo23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(row2, row3), taps23);
    hi23 = _mm_maddubs_epi16(_mm_unpackhi_epi8(row2, row3), taps23);
    lo45 = _mm_maddubs_epi16(_mm_unpacklo_epi8(row4, row5), taps45);
    hi45 = _mm_maddubs_epi16(_mm_unpackhi_epi8(row4, row5), taps45);

    // Saturating accumulation.  The two middle terms are added smaller
    // first, larger second; presumably this keeps the saturating adds from
    // clipping an intermediate value when the final sum is still in range
    // (NOTE(review): rationale inferred, confirm).
    sumLo = _mm_adds_epi16(sumLo, _mm_min_epi16(lo23, lo45));
    sumHi = _mm_adds_epi16(sumHi, _mm_min_epi16(hi23, hi45));
    sumLo = _mm_adds_epi16(sumLo, _mm_max_epi16(lo23, lo45));
    sumHi = _mm_adds_epi16(sumHi, _mm_max_epi16(hi23, hi45));
    sumLo = _mm_adds_epi16(sumLo, rounding);
    sumHi = _mm_adds_epi16(sumHi, rounding);

    // scale back by 7 bits
    sumLo = _mm_srai_epi16(sumLo, 7);
    sumHi = _mm_srai_epi16(sumHi, 7);

    // clamp every lane to 0..255 and join both halves into 16 result bytes,
    // then store them (aligned store, see the header comment)
    _mm_store_si128((__m128i *)output_ptr, _mm_packus_epi16(sumLo, sumHi));
    output_ptr += out_pitch;

    // slide the window down by one row
    row0 = row1;
    row1 = row2;
    row2 = row3;
    row3 = row4;
    row4 = row5;
    row5 = row6;
    row6 = row7;
  }
}
OLD | NEW |