| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| 11 | 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" | 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 | 13 |
| 14 ; void vp9_temporal_filter_apply_sse2 | arg | 14 ; void vp9_temporal_filter_apply_sse2 | arg |
| 15 ; (unsigned char *frame1, | 0 | 15 ; (unsigned char *frame1, | 0 |
| 16 ; unsigned int stride, | 1 | 16 ; unsigned int stride, | 1 |
| 17 ; unsigned char *frame2, | 2 | 17 ; unsigned char *frame2, | 2 |
| 18 ; unsigned int block_size, | 3 | 18 ; unsigned int block_width, | 3 |
| 19 ; int strength, | 4 | 19 ; unsigned int block_height, | 4 |
| 20 ; int filter_weight, | 5 | 20 ; int strength, | 5 |
| 21 ; unsigned int *accumulator, | 6 | 21 ; int filter_weight, | 6 |
| 22 ; unsigned short *count) | 7 | 22 ; unsigned int *accumulator, | 7 |
| 23 ; unsigned short *count) | 8 |
| 23 global sym(vp9_temporal_filter_apply_sse2) PRIVATE | 24 global sym(vp9_temporal_filter_apply_sse2) PRIVATE |
| 24 sym(vp9_temporal_filter_apply_sse2): | 25 sym(vp9_temporal_filter_apply_sse2): |
| 25 | 26 |
| 26 push rbp | 27 push rbp |
| 27 mov rbp, rsp | 28 mov rbp, rsp |
| 28 SHADOW_ARGS_TO_STACK 8 | 29 SHADOW_ARGS_TO_STACK 9 |
| 29 SAVE_XMM 7 | 30 SAVE_XMM 7 |
| 30 GET_GOT rbx | 31 GET_GOT rbx |
| 31 push rsi | 32 push rsi |
| 32 push rdi | 33 push rdi |
| 33 ALIGN_STACK 16, rax | 34 ALIGN_STACK 16, rax |
| 34 %define block_size 0 | 35 %define block_width 0 |
| 35 %define strength 16 | 36 %define block_height 16 |
| 36 %define filter_weight 32 | 37 %define strength 32 |
| 37 %define rounding_bit 48 | 38 %define filter_weight 48 |
| 38 %define rbp_backup 64 | 39 %define rounding_bit 64 |
| 39 %define stack_size 80 | 40 %define rbp_backup 80 |
| 41 %define stack_size 96 |
| 40 sub rsp, stack_size | 42 sub rsp, stack_size |
| 41 mov [rsp + rbp_backup], rbp | 43 mov [rsp + rbp_backup], rbp |
| 42 ; end prolog | 44 ; end prolog |
| 43 | 45 |
| 44 mov rdx, arg(3) | 46 mov edx, arg(3) |
| 45 mov [rsp + block_size], rdx | 47 mov [rsp + block_width], rdx |
| 46 movd xmm6, arg(4) | 48 mov edx, arg(4) |
| 49 mov [rsp + block_height], rdx |
| 50 movd xmm6, arg(5) |
| 47 movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read | 51 movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read |
| 48 | 52 |
| 49 ; calculate the rounding bit outside the loop | 53 ; calculate the rounding bit outside the loop |
| 50 ; 0x8000 >> (16 - strength) | 54 ; 0x8000 >> (16 - strength) |
| 51 mov rdx, 16 | 55 mov rdx, 16 |
| 52 sub rdx, arg(4) ; 16 - strength | 56 sub rdx, arg(5) ; 16 - strength |
| 53 movq xmm4, rdx ; can't use rdx w/ shift | 57 movq xmm4, rdx ; can't use rdx w/ shift |
| 54 movdqa xmm5, [GLOBAL(_const_top_bit)] | 58 movdqa xmm5, [GLOBAL(_const_top_bit)] |
| 55 psrlw xmm5, xmm4 | 59 psrlw xmm5, xmm4 |
| 56 movdqa [rsp + rounding_bit], xmm5 | 60 movdqa [rsp + rounding_bit], xmm5 |
| 57 | 61 |
| 58 mov rsi, arg(0) ; src/frame1 | 62 mov rsi, arg(0) ; src/frame1 |
| 59 mov rdx, arg(2) ; predictor frame | 63 mov rdx, arg(2) ; predictor frame |
| 60 mov rdi, arg(6) ; accumulator | 64 mov rdi, arg(7) ; accumulator |
| 61 mov rax, arg(7) ; count | 65 mov rax, arg(8) ; count |
| 62 | 66 |
| 63 ; dup the filter weight and store for later | 67 ; dup the filter weight and store for later |
| 64 movd xmm0, arg(5) ; filter_weight | 68 movd xmm0, arg(6) ; filter_weight |
| 65 pshuflw xmm0, xmm0, 0 | 69 pshuflw xmm0, xmm0, 0 |
| 66 punpcklwd xmm0, xmm0 | 70 punpcklwd xmm0, xmm0 |
| 67 movdqa [rsp + filter_weight], xmm0 | 71 movdqa [rsp + filter_weight], xmm0 |
| 68 | 72 |
| 69 mov rbp, arg(1) ; stride | 73 mov rbp, arg(1) ; stride |
| 70 pxor xmm7, xmm7 ; zero for extraction | 74 pxor xmm7, xmm7 ; zero for extraction |
| 71 | 75 |
| 72 lea rcx, [rdx + 16*16*1] | 76 mov rcx, [rsp + block_width] |
| 73 cmp dword ptr [rsp + block_size], 8 | 77 imul rcx, [rsp + block_height] |
| 78 add rcx, rdx |
| 79 cmp dword ptr [rsp + block_width], 8 |
| 74 jne .temporal_filter_apply_load_16 | 80 jne .temporal_filter_apply_load_16 |
| 75 lea rcx, [rdx + 8*8*1] | |
| 76 | 81 |
| 77 .temporal_filter_apply_load_8: | 82 .temporal_filter_apply_load_8: |
| 78 movq xmm0, [rsi] ; first row | 83 movq xmm0, [rsi] ; first row |
| 79 lea rsi, [rsi + rbp] ; += stride | 84 lea rsi, [rsi + rbp] ; += stride |
| 80 punpcklbw xmm0, xmm7 ; src[ 0- 7] | 85 punpcklbw xmm0, xmm7 ; src[ 0- 7] |
| 81 movq xmm1, [rsi] ; second row | 86 movq xmm1, [rsi] ; second row |
| 82 lea rsi, [rsi + rbp] ; += stride | 87 lea rsi, [rsi + rbp] ; += stride |
| 83 punpcklbw xmm1, xmm7 ; src[ 8-15] | 88 punpcklbw xmm1, xmm7 ; src[ 8-15] |
| 84 jmp .temporal_filter_apply_load_finished | 89 jmp .temporal_filter_apply_load_finished |
| 85 | 90 |
| (...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 171 ; write back | 176 ; write back |
| 172 movdqa [rdi], xmm4 | 177 movdqa [rdi], xmm4 |
| 173 movdqa [rdi+16], xmm5 | 178 movdqa [rdi+16], xmm5 |
| 174 movdqa [rdi+32], xmm6 | 179 movdqa [rdi+32], xmm6 |
| 175 movdqa [rdi+48], xmm7 | 180 movdqa [rdi+48], xmm7 |
| 176 lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) | 181 lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) |
| 177 | 182 |
| 178 cmp rdx, rcx | 183 cmp rdx, rcx |
| 179 je .temporal_filter_apply_epilog | 184 je .temporal_filter_apply_epilog |
| 180 pxor xmm7, xmm7 ; zero for extraction | 185 pxor xmm7, xmm7 ; zero for extraction |
| 181 cmp dword ptr [rsp + block_size], 16 | 186 cmp dword ptr [rsp + block_width], 16 |
| 182 je .temporal_filter_apply_load_16 | 187 je .temporal_filter_apply_load_16 |
| 183 jmp .temporal_filter_apply_load_8 | 188 jmp .temporal_filter_apply_load_8 |
| 184 | 189 |
| 185 .temporal_filter_apply_epilog: | 190 .temporal_filter_apply_epilog: |
| 186 ; begin epilog | 191 ; begin epilog |
| 187 mov rbp, [rsp + rbp_backup] | 192 mov rbp, [rsp + rbp_backup] |
| 188 add rsp, stack_size | 193 add rsp, stack_size |
| 189 pop rsp | 194 pop rsp |
| 190 pop rdi | 195 pop rdi |
| 191 pop rsi | 196 pop rsi |
| 192 RESTORE_GOT | 197 RESTORE_GOT |
| 193 RESTORE_XMM | 198 RESTORE_XMM |
| 194 UNSHADOW_ARGS | 199 UNSHADOW_ARGS |
| 195 pop rbp | 200 pop rbp |
| 196 ret | 201 ret |
| 197 | 202 |
| 198 SECTION_RODATA | 203 SECTION_RODATA |
| 199 align 16 | 204 align 16 |
| 200 _const_3w: | 205 _const_3w: |
| 201 times 8 dw 3 | 206 times 8 dw 3 |
| 202 align 16 | 207 align 16 |
| 203 _const_top_bit: | 208 _const_top_bit: |
| 204 times 8 dw 1<<15 | 209 times 8 dw 1<<15 |
| 205 align 16 | 210 align 16 |
| 206 _const_16w | 211 _const_16w |
| 207 times 8 dw 16 | 212 times 8 dw 16 |
| OLD | NEW |