OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 // Due to a header conflict between math.h and intrinsics includes with ceil() | 11 // Due to a header conflict between math.h and intrinsics includes with ceil() |
12 // in certain configurations under vs9, this include needs to precede | 12 // in certain configurations under vs9, this include needs to precede |
13 // tmmintrin.h. | 13 // tmmintrin.h. |
14 | 14 |
15 #include <tmmintrin.h> | 15 #include <tmmintrin.h> |
16 | 16 |
17 #include "./vpx_dsp_rtcd.h" | 17 #include "./vpx_dsp_rtcd.h" |
| 18 #include "vpx_dsp/vpx_filter.h" |
18 #include "vpx_dsp/x86/convolve.h" | 19 #include "vpx_dsp/x86/convolve.h" |
| 20 #include "vpx_mem/vpx_mem.h" |
19 #include "vpx_ports/mem.h" | 21 #include "vpx_ports/mem.h" |
20 #include "vpx_ports/emmintrin_compat.h" | 22 #include "vpx_ports/emmintrin_compat.h" |
21 | 23 |
22 // filters only for the 4_h8 convolution | 24 // filters only for the 4_h8 convolution |
23 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { | 25 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { |
24 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 | 26 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 |
25 }; | 27 }; |
26 | 28 |
27 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { | 29 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { |
28 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 | 30 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 |
(...skipping 165 matching lines...)
194 | 196 |
195 src_ptr+=src_pixels_per_line; | 197 src_ptr+=src_pixels_per_line; |
196 | 198 |
197 // save only 8 bytes | 199 // save only 8 bytes |
198 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 200 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
199 | 201 |
200 output_ptr+=output_pitch; | 202 output_ptr+=output_pitch; |
201 } | 203 } |
202 } | 204 } |
203 | 205 |
| 206 #if ARCH_X86_64 |
204 static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, | 207 static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, |
205 ptrdiff_t src_pixels_per_line, | 208 ptrdiff_t src_pixels_per_line, |
206 uint8_t *output_ptr, | 209 uint8_t *output_ptr, |
207 ptrdiff_t output_pitch, | 210 ptrdiff_t output_pitch, |
208 uint32_t output_height, | 211 uint32_t output_height, |
209 const int16_t *filter) { | 212 const int16_t *filter) { |
210 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; | 213 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; |
211 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 214 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
212 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 215 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
213 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; | 216 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; |
(...skipping 96 matching lines...)
310 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); | 313 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); |
311 | 314 |
312 src_ptr+=src_pixels_per_line; | 315 src_ptr+=src_pixels_per_line; |
313 | 316 |
314 // save 16 bytes | 317 // save 16 bytes |
315 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); | 318 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); |
316 | 319 |
317 output_ptr+=output_pitch; | 320 output_ptr+=output_pitch; |
318 } | 321 } |
319 } | 322 } |
| 323 #endif // ARCH_X86_64 |
320 | 324 |
321 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, | 325 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, |
322 ptrdiff_t src_pitch, | 326 ptrdiff_t src_pitch, |
323 uint8_t *output_ptr, | 327 uint8_t *output_ptr, |
324 ptrdiff_t out_pitch, | 328 ptrdiff_t out_pitch, |
325 uint32_t output_height, | 329 uint32_t output_height, |
326 const int16_t *filter) { | 330 const int16_t *filter) { |
327 __m128i addFilterReg64, filtersReg, minReg; | 331 __m128i addFilterReg64, filtersReg, minReg; |
328 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 332 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
329 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; | 333 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; |
(...skipping 69 matching lines...)
399 srcReg6 = srcReg7; | 403 srcReg6 = srcReg7; |
400 srcReg7 = srcReg8; | 404 srcReg7 = srcReg8; |
401 | 405 |
402 // save only 8 bytes convolve result | 406 // save only 8 bytes convolve result |
403 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 407 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
404 | 408 |
405 output_ptr+=out_pitch; | 409 output_ptr+=out_pitch; |
406 } | 410 } |
407 } | 411 } |
408 | 412 |
| 413 #if ARCH_X86_64 |
409 static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, | 414 static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, |
410 ptrdiff_t src_pitch, | 415 ptrdiff_t src_pitch, |
411 uint8_t *output_ptr, | 416 uint8_t *output_ptr, |
412 ptrdiff_t out_pitch, | 417 ptrdiff_t out_pitch, |
413 uint32_t output_height, | 418 uint32_t output_height, |
414 const int16_t *filter) { | 419 const int16_t *filter) { |
415 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; | 420 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; |
416 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 421 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
417 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; | 422 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; |
418 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; | 423 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; |
(...skipping 94 matching lines...)
513 srcReg5 = srcReg6; | 518 srcReg5 = srcReg6; |
514 srcReg6 = srcReg7; | 519 srcReg6 = srcReg7; |
515 srcReg7 = srcReg8; | 520 srcReg7 = srcReg8; |
516 | 521 |
517 // save 16 bytes convolve result | 522 // save 16 bytes convolve result |
518 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 523 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
519 | 524 |
520 output_ptr+=out_pitch; | 525 output_ptr+=out_pitch; |
521 } | 526 } |
522 } | 527 } |
| 528 #endif // ARCH_X86_64 |
523 | 529 |
524 #if ARCH_X86_64 | 530 #if ARCH_X86_64 |
525 filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3; | 531 filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3; |
526 filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3; | 532 filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3; |
527 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; | 533 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; |
528 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; | 534 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; |
529 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; | 535 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; |
530 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; | 536 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; |
531 #define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3 | 537 #define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3 |
532 #define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3 | 538 #define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3 |
(...skipping 47 matching lines...)
580 // uint8_t *dst, ptrdiff_t dst_stride, | 586 // uint8_t *dst, ptrdiff_t dst_stride, |
581 // const int16_t *filter_x, int x_step_q4, | 587 // const int16_t *filter_x, int x_step_q4, |
582 // const int16_t *filter_y, int y_step_q4, | 588 // const int16_t *filter_y, int y_step_q4, |
583 // int w, int h); | 589 // int w, int h); |
584 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); | 590 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); |
585 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); | 591 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); |
586 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); | 592 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); |
587 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, | 593 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, |
588 ssse3); | 594 ssse3); |
589 | 595 |
590 // void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 596 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 597 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 598 const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ |
| 599 const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ |
| 600 const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ |
| 601 const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ |
| 602 \ |
| 603 const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ |
| 604 const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ |
| 605 const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ |
| 606 const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ |
| 607 \ |
| 608 const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ |
| 609 const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ |
| 610 const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ |
| 611 const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ |
| 612 \ |
| 613 out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ |
| 614 out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ |
| 615 out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ |
| 616 out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ |
| 617 out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ |
| 618 out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ |
| 619 out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ |
| 620 out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ |
| 621 } |
| 622 |
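Note on the macro above: it transposes the low 8x8 bytes of its inputs with three rounds of unpacks (8-bit, then 16-bit, then 32-bit) followed by 64-bit lane selects; each out register ends up with its transposed row duplicated in both halves, of which callers store only the low 8 bytes. A scalar reference of the same result, as a sketch (transpose8x8_c is a hypothetical helper, not part of this change):

/* Scalar reference for TRANSPOSE_8X8: output row r is input column r. */
static void transpose8x8_c(const uint8_t in[8][8], uint8_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      out[c][r] = in[r][c];
}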
| 623 static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, |
| 624 uint8_t *dst, const int16_t *x_filter) { |
| 625 const __m128i k_256 = _mm_set1_epi16(1 << 8); |
| 626 const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); |
| 627 // pack and duplicate the filter values |
| 628 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); |
| 629 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); |
| 630 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); |
| 631 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); |
| 632 const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); |
| 633 const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); |
| 634 const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); |
| 635 const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); |
| 636 const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); |
| 637 const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); |
| 638 const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); |
| 639 const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); |
| 640 // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 |
| 641 const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); |
| 642 // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 |
| 643 const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); |
| 644 // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 |
| 645 const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); |
| 646 // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 |
| 647 const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); |
| 648 // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 |
| 649 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 650 // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 |
| 651 const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 652 // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 |
| 653 const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); |
| 654 // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 |
| 655 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); |
| 656 // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 |
| 657 const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); |
| 658 const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); |
| 659 const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); |
| 660 const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); |
| 661 // multiply 2 adjacent elements with the filter and add the result |
| 662 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); |
| 663 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); |
| 664 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); |
| 665 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); |
| 666 // add and saturate the results together |
| 667 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); |
| 668 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); |
| 669 __m128i temp = _mm_adds_epi16(x0, x3); |
| 670 temp = _mm_adds_epi16(temp, min_x2x1); |
| 671 temp = _mm_adds_epi16(temp, max_x2x1); |
| 672 // round and shift each 16-bit value right by 7 bits |
| 673 temp = _mm_mulhrs_epi16(temp, k_256); |
| 674 // pack each 16-bit value to 8 bits with unsigned saturation |
| 675 temp = _mm_packus_epi16(temp, temp); |
| 676 // save only 8 bytes convolve result |
| 677 _mm_storel_epi64((__m128i*)dst, temp); |
| 678 } |
| 679 |
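Worth spelling out the arithmetic in the kernel above: the filter words are packed to signed bytes and broadcast in pairs (f1f0 carries taps 1 and 0, f3f2 taps 3 and 2, and so on), _mm_maddubs_epi16 multiplies unsigned source bytes by those signed filter bytes and adds adjacent products, and the four partial sums are combined with saturating adds, folding in min(x1, x2) before max(x1, x2) so intermediate 16-bit overflow saturates consistently with the final clamp. The rounding step _mm_mulhrs_epi16(temp, 256) computes (temp * 256 * 2 + (1 << 15)) >> 16, which equals (temp + 64) >> 7. A scalar sketch of one output pixel (convolve8_pixel_c is hypothetical; it assumes the taps fit in int8_t, which holds for the vpx sub-pixel filter banks):

/* One output pixel: 8-tap sum, round by 64, shift right by 7, clamp to
   [0, 255]. Mirrors the SIMD arithmetic in the function above. */
static uint8_t convolve8_pixel_c(const uint8_t *s, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += s[k] * filter[k];
  sum = (sum + 64) >> 7;  /* matches _mm_mulhrs_epi16(temp, 256) */
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}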
| 680 static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, |
| 681 uint8_t *dst, ptrdiff_t dst_stride) { |
| 682 __m128i A, B, C, D, E, F, G, H; |
| 683 |
| 684 A = _mm_loadl_epi64((const __m128i *)src); |
| 685 B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); |
| 686 C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); |
| 687 D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); |
| 688 E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); |
| 689 F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); |
| 690 G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); |
| 691 H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); |
| 692 |
| 693 TRANSPOSE_8X8(A, B, C, D, E, F, G, H, |
| 694 A, B, C, D, E, F, G, H); |
| 695 |
| 696 _mm_storel_epi64((__m128i*)dst, A); |
| 697 _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B); |
| 698 _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C); |
| 699 _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D); |
| 700 _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E); |
| 701 _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F); |
| 702 _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G); |
| 703 _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H); |
| 704 } |
| 705 |
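A double transpose with matching strides should reproduce the original tile, which gives a quick sanity check for the helper above (a hypothetical test sketch, not part of this change):

/* Round-trip check: transpose an 8x8 tile twice and compare. Returns 1 on
   success. Relies only on transpose8x8_to_dst defined above. */
static int transpose8x8_roundtrip(void) {
  uint8_t a[64], b[64], c[64];
  int i;
  for (i = 0; i < 64; ++i) a[i] = (uint8_t)i;
  transpose8x8_to_dst(a, 8, b, 8);
  transpose8x8_to_dst(b, 8, c, 8);
  for (i = 0; i < 64; ++i)
    if (c[i] != a[i]) return 0;
  return 1;
}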
| 706 static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, |
| 707 uint8_t *dst, ptrdiff_t dst_stride, |
| 708 const InterpKernel *x_filters, |
| 709 int x0_q4, int x_step_q4, int w, int h) { |
| 710 DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); |
| 711 int x, y, z; |
| 712 src -= SUBPEL_TAPS / 2 - 1; |
| 713 |
| 714 // This function processes 8x8 areas. The intermediate height is not always |
| 715 // a multiple of 8, so pad it to the next multiple of 8 (adds 1 to 8 rows). |
| 716 y = h + (8 - (h & 0x7)); |
| 717 |
| 718 do { |
| 719 int x_q4 = x0_q4; |
| 720 for (x = 0; x < w; x += 8) { |
| 721 // process 8 src_x steps |
| 722 for (z = 0; z < 8; ++z) { |
| 723 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; |
| 724 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
| 725 if (x_q4 & SUBPEL_MASK) { |
| 726 filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); |
| 727 } else { |
| 728 int i; |
| 729 for (i = 0; i < 8; ++i) { |
| 730 temp[z * 8 + i] = src_x[i * src_stride + 3]; |
| 731 } |
| 732 } |
| 733 x_q4 += x_step_q4; |
| 734 } |
| 735 |
| 736 // transpose the 8x8 filter values back to dst |
| 737 transpose8x8_to_dst(temp, 8, dst + x, dst_stride); |
| 738 } |
| 739 |
| 740 src += src_stride * 8; |
| 741 dst += dst_stride * 8; |
| 742 } while (y -= 8); |
| 743 } |
| 744 |
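For orientation, the loop above walks the source in 1/16th-pel fixed point: x_q4 >> SUBPEL_BITS picks the source column, x_q4 & SUBPEL_MASK picks one of 16 kernels, and phase 0 needs no filtering, so the copy branch reads the center tap at offset 3 (SUBPEL_TAPS / 2 - 1, matching the earlier src shift). A scalar sketch of the whole pass, modeled on the generic C convolve path (scaled_horiz_c is hypothetical; it assumes clip_pixel() and ROUND_POWER_OF_TWO() from vpx_dsp/vpx_dsp_common.h, which this file does not include):

static void scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *x_filters,
                           int x0_q4, int x_step_q4, int w, int h) {
  int x, y, k;
  src -= SUBPEL_TAPS / 2 - 1;  /* point at the first tap */
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}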
| 745 static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, |
| 746 uint8_t *dst, const int16_t *filter) { |
| 747 const __m128i k_256 = _mm_set1_epi16(1 << 8); |
| 748 const __m128i f_values = _mm_load_si128((const __m128i *)filter); |
| 749 // pack and duplicate the filter values |
| 750 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); |
| 751 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); |
| 752 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); |
| 753 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); |
| 754 const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); |
| 755 const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); |
| 756 const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); |
| 757 const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); |
| 758 // TRANSPOSE... |
| 759 // 00 01 02 03 04 05 06 07 |
| 760 // 10 11 12 13 14 15 16 17 |
| 761 // 20 21 22 23 24 25 26 27 |
| 762 // 30 31 32 33 34 35 36 37 |
| 763 // |
| 764 // TO |
| 765 // |
| 766 // 00 10 20 30 |
| 767 // 01 11 21 31 |
| 768 // 02 12 22 32 |
| 769 // 03 13 23 33 |
| 770 // 04 14 24 34 |
| 771 // 05 15 25 35 |
| 772 // 06 16 26 36 |
| 773 // 07 17 27 37 |
| 774 // |
| 775 // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 |
| 776 const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); |
| 777 // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 |
| 778 const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); |
| 779 // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 |
| 780 const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 781 // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 |
| 782 const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 783 // 02 03 12 13 22 23 32 33 |
| 784 const __m128i s3s2 = _mm_srli_si128(s1s0, 8); |
| 785 // 06 07 16 17 26 27 36 37 |
| 786 const __m128i s7s6 = _mm_srli_si128(s5s4, 8); |
| 787 // multiply 2 adjacent elements with the filter and add the result |
| 788 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); |
| 789 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); |
| 790 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); |
| 791 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); |
| 792 // add and saturate the results together |
| 793 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); |
| 794 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); |
| 795 __m128i temp = _mm_adds_epi16(x0, x3); |
| 796 temp = _mm_adds_epi16(temp, min_x2x1); |
| 797 temp = _mm_adds_epi16(temp, max_x2x1); |
| 798 // round and shift each 16-bit value right by 7 bits |
| 799 temp = _mm_mulhrs_epi16(temp, k_256); |
| 800 // pack each 16-bit value to 8 bits with unsigned saturation |
| 801 temp = _mm_packus_epi16(temp, temp); |
| 802 // save only 4 bytes |
| 803 *(int *)dst = _mm_cvtsi128_si32(temp); |
| 804 } |
| 805 |
| 806 static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, |
| 807 uint8_t *dst, ptrdiff_t dst_stride) { |
| 808 __m128i A = _mm_cvtsi32_si128(*(const int *)src); |
| 809 __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); |
| 810 __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); |
| 811 __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); |
| 812 // 00 10 01 11 02 12 03 13 |
| 813 const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); |
| 814 // 20 30 21 31 22 32 23 33 |
| 815 const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); |
| 816 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
| 817 A = _mm_unpacklo_epi16(tr0_0, tr0_1); |
| 818 B = _mm_srli_si128(A, 4); |
| 819 C = _mm_srli_si128(A, 8); |
| 820 D = _mm_srli_si128(A, 12); |
| 821 |
| 822 *(int *)(dst) = _mm_cvtsi128_si32(A); |
| 823 *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); |
| 824 *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); |
| 825 *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); |
| 826 } |
| 827 |
| 828 static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, |
| 829 uint8_t *dst, ptrdiff_t dst_stride, |
| 830 const InterpKernel *x_filters, |
| 831 int x0_q4, int x_step_q4, int w, int h) { |
| 832 DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); |
| 833 int x, y, z; |
| 834 src -= SUBPEL_TAPS / 2 - 1; |
| 835 |
| 836 for (y = 0; y < h; y += 4) { |
| 837 int x_q4 = x0_q4; |
| 838 for (x = 0; x < w; x += 4) { |
| 839 // process 4 src_x steps |
| 840 for (z = 0; z < 4; ++z) { |
| 841 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; |
| 842 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
| 843 if (x_q4 & SUBPEL_MASK) { |
| 844 filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); |
| 845 } else { |
| 846 int i; |
| 847 for (i = 0; i < 4; ++i) { |
| 848 temp[z * 4 + i] = src_x[i * src_stride + 3]; |
| 849 } |
| 850 } |
| 851 x_q4 += x_step_q4; |
| 852 } |
| 853 |
| 854 // transpose the 4x4 filter values back to dst |
| 855 transpose4x4_to_dst(temp, 4, dst + x, dst_stride); |
| 856 } |
| 857 |
| 858 src += src_stride * 4; |
| 859 dst += dst_stride * 4; |
| 860 } |
| 861 } |
| 862 |
| 863 static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, |
| 864 uint8_t *dst, const int16_t *filter) { |
| 865 const __m128i k_256 = _mm_set1_epi16(1 << 8); |
| 866 const __m128i f_values = _mm_load_si128((const __m128i *)filter); |
| 867 // pack and duplicate the filter values |
| 868 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); |
| 869 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); |
| 870 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); |
| 871 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); |
| 872 const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); |
| 873 const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); |
| 874 const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); |
| 875 const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); |
| 876 const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); |
| 877 const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); |
| 878 const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); |
| 879 const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); |
| 880 const __m128i s1s0 = _mm_unpacklo_epi8(A, B); |
| 881 const __m128i s3s2 = _mm_unpacklo_epi8(C, D); |
| 882 const __m128i s5s4 = _mm_unpacklo_epi8(E, F); |
| 883 const __m128i s7s6 = _mm_unpacklo_epi8(G, H); |
| 884 // multiply 2 adjacent elements with the filter and add the result |
| 885 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); |
| 886 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); |
| 887 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); |
| 888 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); |
| 889 // add and saturate the results together |
| 890 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); |
| 891 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); |
| 892 __m128i temp = _mm_adds_epi16(x0, x3); |
| 893 temp = _mm_adds_epi16(temp, min_x2x1); |
| 894 temp = _mm_adds_epi16(temp, max_x2x1); |
| 895 // round and shift each 16-bit value right by 7 bits |
| 896 temp = _mm_mulhrs_epi16(temp, k_256); |
| 897 // pack each 16-bit value to 8 bits with unsigned saturation |
| 898 temp = _mm_packus_epi16(temp, temp); |
| 899 // save only 4 bytes |
| 900 *(int *)dst = _mm_cvtsi128_si32(temp); |
| 901 } |
| 902 |
| 903 static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, |
| 904 uint8_t *dst, ptrdiff_t dst_stride, |
| 905 const InterpKernel *y_filters, |
| 906 int y0_q4, int y_step_q4, int w, int h) { |
| 907 int y; |
| 908 int y_q4 = y0_q4; |
| 909 |
| 910 src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
| 911 for (y = 0; y < h; ++y) { |
| 912 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
| 913 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
| 914 |
| 915 if (y_q4 & SUBPEL_MASK) { |
| 916 filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); |
| 917 } else { |
| 918 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); |
| 919 } |
| 920 |
| 921 y_q4 += y_step_q4; |
| 922 } |
| 923 } |
| 924 |
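The memcpy branch above relies on phase-0 kernels being a unit impulse: in the vpx filter banks, kernel 0 is {0, 0, 0, 128, 0, 0, 0, 0} (128 == 1 << FILTER_BITS), so filtering degenerates to copying the row at offset 3, the same SUBPEL_TAPS / 2 - 1 shift applied to src. A sketch of that assumption as a check (is_identity_kernel is hypothetical, not part of this change):

/* Phase-0 kernels are assumed to be a unit tap at index 3; a filter bank
   violating this would make the memcpy fast path above incorrect. */
static int is_identity_kernel(const int16_t *k) {
  return k[0] == 0 && k[1] == 0 && k[2] == 0 && k[3] == (1 << FILTER_BITS) &&
         k[4] == 0 && k[5] == 0 && k[6] == 0 && k[7] == 0;
}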
| 925 static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, |
| 926 uint8_t *dst, const int16_t *filter) { |
| 927 const __m128i k_256 = _mm_set1_epi16(1 << 8); |
| 928 const __m128i f_values = _mm_load_si128((const __m128i *)filter); |
| 929 // pack and duplicate the filter values |
| 930 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); |
| 931 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); |
| 932 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); |
| 933 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); |
| 934 const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); |
| 935 const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); |
| 936 const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); |
| 937 const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); |
| 938 const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); |
| 939 const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); |
| 940 const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); |
| 941 const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); |
| 942 const __m128i s1s0 = _mm_unpacklo_epi8(A, B); |
| 943 const __m128i s3s2 = _mm_unpacklo_epi8(C, D); |
| 944 const __m128i s5s4 = _mm_unpacklo_epi8(E, F); |
| 945 const __m128i s7s6 = _mm_unpacklo_epi8(G, H); |
| 946 // multiply 2 adjacent elements with the filter and add the result |
| 947 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); |
| 948 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); |
| 949 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); |
| 950 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); |
| 951 // add and saturate the results together |
| 952 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); |
| 953 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); |
| 954 __m128i temp = _mm_adds_epi16(x0, x3); |
| 955 temp = _mm_adds_epi16(temp, min_x2x1); |
| 956 temp = _mm_adds_epi16(temp, max_x2x1); |
| 957 // round and shift each 16-bit value right by 7 bits |
| 958 temp = _mm_mulhrs_epi16(temp, k_256); |
| 959 // pack each 16-bit value to 8 bits with unsigned saturation |
| 960 temp = _mm_packus_epi16(temp, temp); |
| 961 // save only 8 bytes convolve result |
| 962 _mm_storel_epi64((__m128i*)dst, temp); |
| 963 } |
| 964 |
| 965 static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, |
| 966 uint8_t *dst, ptrdiff_t dst_stride, |
| 967 const InterpKernel *y_filters, |
| 968 int y0_q4, int y_step_q4, int w, int h) { |
| 969 int y; |
| 970 int y_q4 = y0_q4; |
| 971 |
| 972 src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
| 973 for (y = 0; y < h; ++y) { |
| 974 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
| 975 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
| 976 if (y_q4 & SUBPEL_MASK) { |
| 977 filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); |
| 978 } else { |
| 979 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); |
| 980 } |
| 981 y_q4 += y_step_q4; |
| 982 } |
| 983 } |
| 984 |
| 985 static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, |
| 986 uint8_t *dst, const int16_t *filter, int w) { |
| 987 const __m128i k_256 = _mm_set1_epi16(1 << 8); |
| 988 const __m128i f_values = _mm_load_si128((const __m128i *)filter); |
| 989 // pack and duplicate the filter values |
| 990 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); |
| 991 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); |
| 992 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); |
| 993 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); |
| 994 int i; |
| 995 |
| 996 for (i = 0; i < w; i += 16) { |
| 997 const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); |
| 998 const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); |
| 999 const __m128i C = |
| 1000 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); |
| 1001 const __m128i D = |
| 1002 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); |
| 1003 const __m128i E = |
| 1004 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); |
| 1005 const __m128i F = |
| 1006 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); |
| 1007 const __m128i G = |
| 1008 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); |
| 1009 const __m128i H = |
| 1010 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); |
| 1011 // merge the results together |
| 1012 const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); |
| 1013 const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); |
| 1014 const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); |
| 1015 const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); |
| 1016 // multiply 2 adjacent elements with the filter and add the result |
| 1017 const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); |
| 1018 const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); |
| 1019 const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); |
| 1020 const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); |
| 1021 // add and saturate the results together |
| 1022 const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); |
| 1023 const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); |
| 1024 // merge the results together |
| 1025 const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); |
| 1026 const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); |
| 1027 // multiply 2 adjacent elements with the filter and add the result |
| 1028 const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); |
| 1029 const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); |
| 1030 // merge the results together |
| 1031 const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); |
| 1032 const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); |
| 1033 // multiply 2 adjacent elements with the filter and add the result |
| 1034 const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); |
| 1035 const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); |
| 1036 // add and saturate the results together |
| 1037 __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); |
| 1038 __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); |
| 1039 |
| 1040 // add and saturate the results together |
| 1041 temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); |
| 1042 temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); |
| 1043 // round and shift each 16-bit value right by 7 bits |
| 1044 temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); |
| 1045 temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); |
| 1046 // pack each 16-bit value to 8 bits with unsigned saturation; the low |
| 1047 // half holds the first convolve result and the high half holds the |
| 1048 // second |
| 1049 temp_hi = _mm_packus_epi16(temp_lo, temp_hi); |
| 1050 src_ptr += 16; |
| 1051 // save 16 bytes convolve result |
| 1052 _mm_store_si128((__m128i*)&dst[i], temp_hi); |
| 1053 } |
| 1054 } |
| 1055 |
| 1056 static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, |
| 1057 uint8_t *dst, ptrdiff_t dst_stride, |
| 1058 const InterpKernel *y_filters, |
| 1059 int y0_q4, int y_step_q4, int w, int h) { |
| 1060 int y; |
| 1061 int y_q4 = y0_q4; |
| 1062 |
| 1063 src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
| 1064 for (y = 0; y < h; ++y) { |
| 1065 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
| 1066 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
| 1067 if (y_q4 & SUBPEL_MASK) { |
| 1068 filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, |
| 1069 w); |
| 1070 } else { |
| 1071 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); |
| 1072 } |
| 1073 y_q4 += y_step_q4; |
| 1074 } |
| 1075 } |
| 1076 |
| 1077 static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, |
| 1078 uint8_t *dst, ptrdiff_t dst_stride, |
| 1079 const InterpKernel *const x_filters, |
| 1080 int x0_q4, int x_step_q4, |
| 1081 const InterpKernel *const y_filters, |
| 1082 int y0_q4, int y_step_q4, |
| 1083 int w, int h) { |
| 1084 // Note: Fixed size intermediate buffer, temp, places limits on parameters. |
| 1085 // 2d filtering proceeds in 2 steps: |
| 1086 // (1) Interpolate horizontally into an intermediate buffer, temp. |
| 1087 // (2) Interpolate temp vertically to derive the sub-pixel result. |
| 1088 // Deriving the maximum number of rows in the temp buffer (135): |
| 1089 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). |
| 1090 // --Largest block size is 64x64 pixels. |
| 1091 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the |
| 1092 // original frame (in 1/16th pixel units). |
| 1093 // --Must round-up because block may be located at sub-pixel position. |
| 1094 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. |
| 1095 // --((64 - 1) * 32 + 15) >> 4 + 8 = 134, rounded up to 135. |
| 1096 // --Require an additional 8 rows for the horiz_w8 transpose tail. |
| 1097 DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); |
| 1098 const int intermediate_height = |
| 1099 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; |
| 1100 |
| 1101 assert(w <= 64); |
| 1102 assert(h <= 64); |
| 1103 assert(y_step_q4 <= 32); |
| 1104 assert(x_step_q4 <= 32); |
| 1105 |
| 1106 if (w >= 8) { |
| 1107 scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), |
| 1108 src_stride, temp, 64, x_filters, x0_q4, x_step_q4, |
| 1109 w, intermediate_height); |
| 1110 } else { |
| 1111 scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), |
| 1112 src_stride, temp, 64, x_filters, x0_q4, x_step_q4, |
| 1113 w, intermediate_height); |
| 1114 } |
| 1115 |
| 1116 if (w >= 16) { |
| 1117 scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, |
| 1118 dst_stride, y_filters, y0_q4, y_step_q4, w, h); |
| 1119 } else if (w == 8) { |
| 1120 scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, |
| 1121 dst_stride, y_filters, y0_q4, y_step_q4, w, h); |
| 1122 } else { |
| 1123 scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, |
| 1124 dst_stride, y_filters, y0_q4, y_step_q4, w, h); |
| 1125 } |
| 1126 } |
| 1127 |
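As a worked check of the bound derived in the comment above, the worst case (h = 64, y_step_q4 = 32, y0_q4 = 15) evaluates to 134 rows, inside the 135 + 8 rows reserved for temp. A standalone sketch of that arithmetic (check_temp_bound is hypothetical):

#include <assert.h>

/* Worst-case intermediate height: ((64 - 1) * 32 + 15) >> 4 is 126, plus
   SUBPEL_TAPS (8) gives 134, which fits the (135 + 8)-row temp buffer. */
static void check_temp_bound(void) {
  const int h = 64, y_step_q4 = 32, y0_q4 = 15;
  const int rows = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  assert(rows <= 135 + 8);
}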
| 1128 static const InterpKernel *get_filter_base(const int16_t *filter) { |
| 1129 // NOTE: This assumes that the filter table is 256-byte aligned. |
| 1130 // TODO(agrange) Modify to make independent of table alignment. |
| 1131 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); |
| 1132 } |
| 1133 |
| 1134 static int get_filter_offset(const int16_t *f, const InterpKernel *base) { |
| 1135 return (int)((const InterpKernel *)(intptr_t)f - base); |
| 1136 } |
| 1137 |
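get_filter_base() masks off the low 8 address bits, so it only works if each bank of 16 kernels (16 * 8 * sizeof(int16_t) == 256 bytes) is 256-byte aligned, as the NOTE above says. A sketch of the recovery under that assumption (example_bank and filter_base_demo are hypothetical):

/* A 256-byte-aligned bank of 16 kernels, zero-filled for brevity. */
DECLARE_ALIGNED(256, static const InterpKernel, example_bank[16]) = { { 0 } };

static void filter_base_demo(void) {
  const int16_t *f = example_bank[5];             /* kernel for phase 5 */
  const InterpKernel *base = get_filter_base(f);  /* recovers example_bank */
  const int phase = get_filter_offset(f, base);   /* recovers index 5 */
  (void)phase;
}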
| 1138 void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 1139 uint8_t *dst, ptrdiff_t dst_stride, |
| 1140 const int16_t *filter_x, int x_step_q4, |
| 1141 const int16_t *filter_y, int y_step_q4, |
| 1142 int w, int h) { |
| 1143 const InterpKernel *const filters_x = get_filter_base(filter_x); |
| 1144 const int x0_q4 = get_filter_offset(filter_x, filters_x); |
| 1145 |
| 1146 const InterpKernel *const filters_y = get_filter_base(filter_y); |
| 1147 const int y0_q4 = get_filter_offset(filter_y, filters_y); |
| 1148 |
| 1149 scaledconvolve2d(src, src_stride, dst, dst_stride, |
| 1150 filters_x, x0_q4, x_step_q4, |
| 1151 filters_y, y0_q4, y_step_q4, w, h); |
| 1152 } |
| 1153 |
| 1154 // void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
591 // uint8_t *dst, ptrdiff_t dst_stride, | 1155 // uint8_t *dst, ptrdiff_t dst_stride, |
592 // const int16_t *filter_x, int x_step_q4, | 1156 // const int16_t *filter_x, int x_step_q4, |
593 // const int16_t *filter_y, int y_step_q4, | 1157 // const int16_t *filter_y, int y_step_q4, |
594 // int w, int h); | 1158 // int w, int h); |
595 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 1159 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
596 // uint8_t *dst, ptrdiff_t dst_stride, | 1160 // uint8_t *dst, ptrdiff_t dst_stride, |
597 // const int16_t *filter_x, int x_step_q4, | 1161 // const int16_t *filter_x, int x_step_q4, |
598 // const int16_t *filter_y, int y_step_q4, | 1162 // const int16_t *filter_y, int y_step_q4, |
599 // int w, int h); | 1163 // int w, int h); |
600 FUN_CONV_2D(, ssse3); | 1164 FUN_CONV_2D(, ssse3); |
601 FUN_CONV_2D(avg_ , ssse3); | 1165 FUN_CONV_2D(avg_ , ssse3); |