| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| 11 %include "vpx_ports/x86_abi_support.asm" | 11 %include "vpx_ports/x86_abi_support.asm" |
| 12 | 12 |
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Prologue for the five-argument routines in this file. It binds the
; symbolic operand names (src_ptr, src_stride, ref_ptr, ref_stride,
; end_ptr, ret_var, result_ptr, max_err, height) to the registers or
; stack slots of the ABI being assembled for. Must be paired with
; STACK_FRAME_DESTROY_X3.
;
; result_ptr, max_err and height all alias the fifth argument; each
; routine uses whichever name matches its own prototype.
;
; 32-bit      : builds an rbp frame, saves rsi/rdi/rbx, loads the two
;               pointers and sign-extends both strides.
; x64 output  : saves xmm registers via SAVE_XMM; the fifth argument is
;               addressed on the stack above the saved xmm area.
; other 64-bit: arguments are used directly in their incoming registers.
;
; NOTE(review): on both 64-bit paths the stride registers are used as
; passed, without the movsxd the 32-bit path performs -- confirm that
; callers always supply sign-extended strides.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_err       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %ifidn __OUTPUT_FORMAT__,x64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    ; Fifth argument lives on the stack; xmm_stack_space skips the area
    ; reserved by SAVE_XMM (presumably 8 = return address and 4*8 =
    ; register home space -- confirm against the Win64 convention).
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_err     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_err     r8
    ; height is a C int: use the 32-bit view of r8 so arithmetic and
    ; compares ignore the unspecified upper half of the register, and
    ; match the dword-sized height of the other two ABI paths.
    %define     height      r8d
  %endif
%endif

%endmacro
| 57 | 61 |
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Epilogue matching STACK_FRAME_CREATE_X3: undoes the register saves
; made for the active ABI, blanks every symbolic operand name so it
; cannot leak into later code, and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0

  ; Restore whatever the matching prologue saved.
%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %ifidn __OUTPUT_FORMAT__,x64
    RESTORE_XMM
  %endif
%endif

  ; Redefine the operand aliases as empty.
  %define     height
  %define     max_err
  %define     result_ptr
  %define     ret_var
  %define     end_ptr
  %define     ref_stride
  %define     ref_ptr
  %define     src_stride
  %define     src_ptr

    ret
%endmacro
| 79 | 85 |
| 80 %macro STACK_FRAME_CREATE_X4 0 | 86 %macro STACK_FRAME_CREATE_X4 0 |
| 81 %if ABI_IS_32BIT | 87 %if ABI_IS_32BIT |
| 82 %define src_ptr rsi | 88 %define src_ptr rsi |
| 83 %define src_stride rax | 89 %define src_stride rax |
| 84 %define r0_ptr rcx | 90 %define r0_ptr rcx |
| (...skipping 14 matching lines...) Expand all Loading... |
| 99 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi | 105 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi |
| 100 | 106 |
| 101 mov rsi, arg(0) ; src_ptr | 107 mov rsi, arg(0) ; src_ptr |
| 102 | 108 |
| 103 movsxd rbx, dword ptr arg(1) ; src_stride | 109 movsxd rbx, dword ptr arg(1) ; src_stride |
| 104 movsxd rbp, dword ptr arg(3) ; ref_stride | 110 movsxd rbp, dword ptr arg(3) ; ref_stride |
| 105 | 111 |
| 106 xchg rbx, rax | 112 xchg rbx, rax |
| 107 %else | 113 %else |
| 108 %ifidn __OUTPUT_FORMAT__,x64 | 114 %ifidn __OUTPUT_FORMAT__,x64 |
| 115 SAVE_XMM 7, u |
| 109 %define src_ptr rcx | 116 %define src_ptr rcx |
| 110 %define src_stride rdx | 117 %define src_stride rdx |
| 111 %define r0_ptr rsi | 118 %define r0_ptr rsi |
| 112 %define r1_ptr r10 | 119 %define r1_ptr r10 |
| 113 %define r2_ptr r11 | 120 %define r2_ptr r11 |
| 114 %define r3_ptr r8 | 121 %define r3_ptr r8 |
| 115 %define ref_stride r9 | 122 %define ref_stride r9 |
| 116 %define result_ptr [rsp+16+4*8] | 123 %define result_ptr [rsp+xmm_stack_space+16+4*8] |
| 117 push rsi | 124 push rsi |
| 118 | 125 |
| 119 LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr | 126 LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr |
| 120 %else | 127 %else |
| 121 %define src_ptr rdi | 128 %define src_ptr rdi |
| 122 %define src_stride rsi | 129 %define src_stride rsi |
| 123 %define r0_ptr r9 | 130 %define r0_ptr r9 |
| 124 %define r1_ptr r10 | 131 %define r1_ptr r10 |
| 125 %define r2_ptr r11 | 132 %define r2_ptr r11 |
| 126 %define r3_ptr rdx | 133 %define r3_ptr rdx |
| (...skipping 17 matching lines...) Expand all Loading... |
| 144 %define result_ptr | 151 %define result_ptr |
| 145 | 152 |
| 146 %if ABI_IS_32BIT | 153 %if ABI_IS_32BIT |
| 147 pop rbx | 154 pop rbx |
| 148 pop rdi | 155 pop rdi |
| 149 pop rsi | 156 pop rsi |
| 150 pop rbp | 157 pop rbp |
| 151 %else | 158 %else |
| 152 %ifidn __OUTPUT_FORMAT__,x64 | 159 %ifidn __OUTPUT_FORMAT__,x64 |
| 153 pop rsi | 160 pop rsi |
| 161 RESTORE_XMM |
| 154 %endif | 162 %endif |
| 155 %endif | 163 %endif |
| 156 ret | 164 ret |
| 157 %endmacro | 165 %endmacro |
| 158 | 166 |
| 159 %macro PROCESS_16X2X3 5 | 167 %macro PROCESS_16X2X3 5 |
| 160 %if %1==0 | 168 %if %1==0 |
| 161 movdqa xmm0, XMMWORD PTR [%2] | 169 movdqa xmm0, XMMWORD PTR [%2] |
| 162 lddqu xmm5, XMMWORD PTR [%3] | 170 lddqu xmm5, XMMWORD PTR [%3] |
| 163 lddqu xmm6, XMMWORD PTR [%3+1] | 171 lddqu xmm6, XMMWORD PTR [%3+1] |
| (...skipping 457 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 621 sub end_ptr, 1 | 629 sub end_ptr, 1 |
| 622 jne .vp8_sad16x16_sse3_loop | 630 jne .vp8_sad16x16_sse3_loop |
| 623 | 631 |
| 624 movq xmm0, xmm7 | 632 movq xmm0, xmm7 |
| 625 psrldq xmm7, 8 | 633 psrldq xmm7, 8 |
| 626 paddw xmm0, xmm7 | 634 paddw xmm0, xmm7 |
| 627 movq rax, xmm0 | 635 movq rax, xmm0 |
| 628 | 636 |
| 629 STACK_FRAME_DESTROY_X3 | 637 STACK_FRAME_DESTROY_X3 |
| 630 | 638 |
;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr.
; Rows are copied four at a time while at least four remain, then one
; at a time. A height of zero (or less) copies nothing.
;
; Operand names come from STACK_FRAME_CREATE_X3: ref_ptr / ref_stride
; carry dst_ptr / dst_stride (arguments 2 and 3), height is argument 4,
; and end_ptr is used as a scratch row pointer.
;
; Loads use movdqu, so the source may be unaligned. Stores use movdqa,
; so every destination row must be 16-byte aligned.
; Uses xmm0-xmm7.
global sym(vp8_copy32xn_sse3)
sym(vp8_copy32xn_sse3):

    STACK_FRAME_CREATE_X3

    ; Enter the unrolled loop only when a full group of four rows
    ; exists; otherwise it would copy rows the caller did not ask for.
    cmp         height,         4
    jl          .copy32xn_tail

.copy32xn_loopx4:
    lea         end_ptr,        [src_ptr+src_stride*2]      ; -> source row 2

    movdqu      xmm0,           XMMWORD PTR [src_ptr]
    movdqu      xmm1,           XMMWORD PTR [src_ptr + 16]
    movdqu      xmm2,           XMMWORD PTR [src_ptr + src_stride]
    movdqu      xmm3,           XMMWORD PTR [src_ptr + src_stride + 16]
    movdqu      xmm4,           XMMWORD PTR [end_ptr]
    movdqu      xmm5,           XMMWORD PTR [end_ptr + 16]
    movdqu      xmm6,           XMMWORD PTR [end_ptr + src_stride]
    movdqu      xmm7,           XMMWORD PTR [end_ptr + src_stride + 16]

    lea         src_ptr,        [src_ptr+src_stride*4]      ; advance 4 rows

    lea         end_ptr,        [ref_ptr+ref_stride*2]      ; -> dest row 2

    movdqa      XMMWORD PTR [ref_ptr], xmm0
    movdqa      XMMWORD PTR [ref_ptr + 16], xmm1
    movdqa      XMMWORD PTR [ref_ptr + ref_stride], xmm2
    movdqa      XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
    movdqa      XMMWORD PTR [end_ptr], xmm4
    movdqa      XMMWORD PTR [end_ptr + 16], xmm5
    movdqa      XMMWORD PTR [end_ptr + ref_stride], xmm6
    movdqa      XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

    lea         ref_ptr,        [ref_ptr+ref_stride*4]      ; advance 4 rows

    sub         height,         4
    cmp         height,         4
    jge         .copy32xn_loopx4

.copy32xn_tail:
    ; Check whether any rows remain to be copied.
    cmp         height,         0
    jle         .copy32xn_done

.copy32xn_loop:
    movdqu      xmm0,           XMMWORD PTR [src_ptr]
    movdqu      xmm1,           XMMWORD PTR [src_ptr + 16]
    lea         src_ptr,        [src_ptr+src_stride]

    movdqa      XMMWORD PTR [ref_ptr], xmm0
    movdqa      XMMWORD PTR [ref_ptr + 16], xmm1
    lea         ref_ptr,        [ref_ptr+ref_stride]

    sub         height,         1
    jne         .copy32xn_loop

.copy32xn_done:
    STACK_FRAME_DESTROY_X3
| 699 |
| 631 ;void vp8_sad16x16x4d_sse3( | 700 ;void vp8_sad16x16x4d_sse3( |
| 632 ; unsigned char *src_ptr, | 701 ; unsigned char *src_ptr, |
| 633 ; int src_stride, | 702 ; int src_stride, |
| 634 ; unsigned char *ref_ptr_base, | 703 ; unsigned char *ref_ptr_base, |
| 635 ; int ref_stride, | 704 ; int ref_stride, |
| 636 ; int *results) | 705 ; int *results) |
| 637 global sym(vp8_sad16x16x4d_sse3) | 706 global sym(vp8_sad16x16x4d_sse3) |
| 638 sym(vp8_sad16x16x4d_sse3): | 707 sym(vp8_sad16x16x4d_sse3): |
| 639 | 708 |
| 640 STACK_FRAME_CREATE_X4 | 709 STACK_FRAME_CREATE_X4 |
| (...skipping 240 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 881 punpcklbw mm2, mm1 | 950 punpcklbw mm2, mm1 |
| 882 | 951 |
| 883 movd [rsi+8], mm7 | 952 movd [rsi+8], mm7 |
| 884 psadbw mm2, mm0 | 953 psadbw mm2, mm0 |
| 885 | 954 |
| 886 paddw mm2, mm6 | 955 paddw mm2, mm6 |
| 887 movd [rsi+12], mm2 | 956 movd [rsi+12], mm2 |
| 888 | 957 |
| 889 | 958 |
| 890 STACK_FRAME_DESTROY_X4 | 959 STACK_FRAME_DESTROY_X4 |
| 960 |
| OLD | NEW |