OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 | 11 |
12 %include "vpx_ports/x86_abi_support.asm" | 12 %include "vpx_ports/x86_abi_support.asm" |
13 | 13 |
; Review view: each row shows the OLD revision on the left and the NEW
; revision on the right.
;
; vp9_temporal_filter_apply_sse2: C-style prototype with arg() indices.
; NEW replaces the single block_size argument with block_width and
; block_height, so strength, filter_weight, accumulator and count each
; move up by one arg() index.
14 ; void vp9_temporal_filter_apply_sse2 | arg | 14 ; void vp9_temporal_filter_apply_sse2 | arg |
15 ; (unsigned char *frame1, | 0 | 15 ; (unsigned char *frame1, | 0 |
16 ; unsigned int stride, | 1 | 16 ; unsigned int stride, | 1 |
17 ; unsigned char *frame2, | 2 | 17 ; unsigned char *frame2, | 2 |
18 ; unsigned int block_size, | 3 | 18 ; unsigned int block_width, | 3 |
19 ; int strength, | 4 | 19 ; unsigned int block_height, | 4 |
20 ; int filter_weight, | 5 | 20 ; int strength, | 5 |
21 ; unsigned int *accumulator, | 6 | 21 ; int filter_weight, | 6 |
22 ; unsigned short *count) | 7 | 22 ; unsigned int *accumulator, | 7 |
| 23 ; unsigned short *count) | 8 |
23 global sym(vp9_temporal_filter_apply_sse2) PRIVATE | 24 global sym(vp9_temporal_filter_apply_sse2) PRIVATE |
24 sym(vp9_temporal_filter_apply_sse2): | 25 sym(vp9_temporal_filter_apply_sse2): |
25 | 26 |
; Prolog. NEW shadows 9 arguments instead of 8 to cover the added
; block_height parameter.
26 push rbp | 27 push rbp |
27 mov rbp, rsp | 28 mov rbp, rsp |
28 SHADOW_ARGS_TO_STACK 8 | 29 SHADOW_ARGS_TO_STACK 9 |
29 SAVE_XMM 7 | 30 SAVE_XMM 7 |
30 GET_GOT rbx | 31 GET_GOT rbx |
31 push rsi | 32 push rsi |
32 push rdi | 33 push rdi |
33 ALIGN_STACK 16, rax | 34 ALIGN_STACK 16, rax |
; Offsets of the scratch slots from rsp. Slots sit 16 bytes apart; NEW
; inserts block_height after block_width, which moves every later slot
; and stack_size up by 16.
34 %define block_size 0 | 35 %define block_width 0 |
35 %define strength 16 | 36 %define block_height 16 |
36 %define filter_weight 32 | 37 %define strength 32 |
37 %define rounding_bit 48 | 38 %define filter_weight 48 |
38 %define rbp_backup 64 | 39 %define rounding_bit 64 |
39 %define stack_size 80 | 40 %define rbp_backup 80 |
| 41 %define stack_size 96 |
40 sub rsp, stack_size | 42 sub rsp, stack_size |
; rbp is stashed in the scratch area because it is overwritten with the
; stride further down; the epilog reloads it from this slot.
41 mov [rsp + rbp_backup], rbp | 43 mov [rsp + rbp_backup], rbp |
42 ; end prolog | 44 ; end prolog |
43 | 45 |
; OLD copied arg(3) through the full rdx. NEW loads block_width and
; block_height through edx; a 32-bit register write zeroes the upper half
; of rdx, so the qword stores hold zero-extended values for the 64-bit
; mov/imul that compute rcx below.
; movd clears the remaining bits of xmm6, so the 16-byte strength slot
; holds the strength value followed by zeros.
44 mov rdx, arg(3) | 46 mov edx, arg(3) |
45 mov [rsp + block_size], rdx | 47 mov [rsp + block_width], rdx |
46 movd xmm6, arg(4) | 48 mov edx, arg(4) |
| 49 mov [rsp + block_height], rdx |
| 50 movd xmm6, arg(5) |
47 movdqa [rsp + strength], xmm6 ; where strength is used, all 16 byte
s are read | 51 movdqa [rsp + strength], xmm6 ; where strength is used, all 16 byte
s are read |
48 | 52 |
; psrlw shifts each of the eight 0x8000 words of _const_top_bit right by
; the count held in xmm4, and the result is parked in the rounding_bit slot.
; NOTE(review): strength is declared int, yet it is subtracted here as a
; 64-bit operand and the whole of rdx becomes the shift count. Whether the
; upper 32 bits are defined depends on arg()/SHADOW_ARGS_TO_STACK and the
; calling convention -- confirm.
49 ; calculate the rounding bit outside the loop | 53 ; calculate the rounding bit outside the loop |
50 ; 0x8000 >> (16 - strength) | 54 ; 0x8000 >> (16 - strength) |
51 mov rdx, 16 | 55 mov rdx, 16 |
52 sub rdx, arg(4) ; 16 - strength | 56 sub rdx, arg(5) ; 16 - strength |
53 movq xmm4, rdx ; can't use rdx w/ shift | 57 movq xmm4, rdx ; can't use rdx w/ shift |
54 movdqa xmm5, [GLOBAL(_const_top_bit)] | 58 movdqa xmm5, [GLOBAL(_const_top_bit)] |
55 psrlw xmm5, xmm4 | 59 psrlw xmm5, xmm4 |
56 movdqa [rsp + rounding_bit], xmm5 | 60 movdqa [rsp + rounding_bit], xmm5 |
57 | 61 |
; Register roles from here on: rsi = frame1, rdx = frame2 (predictor),
; rdi = accumulator, rax = count.
58 mov rsi, arg(0) ; src/frame1 | 62 mov rsi, arg(0) ; src/frame1 |
59 mov rdx, arg(2) ; predictor frame | 63 mov rdx, arg(2) ; predictor frame |
60 mov rdi, arg(6) ; accumulator | 64 mov rdi, arg(7) ; accumulator |
61 mov rax, arg(7) ; count | 65 mov rax, arg(8) ; count |
62 | 66 |
; Broadcast the low 16 bits of filter_weight to all eight word lanes:
; pshuflw fills the low four words, punpcklwd then doubles them to eight.
63 ; dup the filter weight and store for later | 67 ; dup the filter weight and store for later |
64 movd xmm0, arg(5) ; filter_weight | 68 movd xmm0, arg(6) ; filter_weight |
65 pshuflw xmm0, xmm0, 0 | 69 pshuflw xmm0, xmm0, 0 |
66 punpcklwd xmm0, xmm0 | 70 punpcklwd xmm0, xmm0 |
67 movdqa [rsp + filter_weight], xmm0 | 71 movdqa [rsp + filter_weight], xmm0 |
68 | 72 |
; From this point rbp holds the stride rather than the frame pointer.
69 mov rbp, arg(1) ; stride | 73 mov rbp, arg(1) ; stride |
70 pxor xmm7, xmm7 ; zero for extraction | 74 pxor xmm7, xmm7 ; zero for extraction |
71 | 75 |
; rcx = address that rdx is compared against at the bottom of the loop.
;   OLD: frame2 + 16*16, replaced by frame2 + 8*8 when block_size is 8.
;   NEW: frame2 + block_width * block_height.
; Entry dispatch: a width of exactly 8 falls through to the 8-wide loader;
; every other value jumps to the 16-wide loader.
; NOTE(review): the loop-bottom dispatch tests for exactly 16 instead, so a
; width that is neither 8 nor 16 would take the 16-wide loader first and the
; 8-wide loader afterwards -- confirm callers only pass 8 or 16.
72 lea rcx, [rdx + 16*16*1] | 76 mov rcx, [rsp + block_width] |
73 cmp dword ptr [rsp + block_size], 8 | 77 imul rcx, [rsp + block_height] |
| 78 add rcx, rdx |
| 79 cmp dword ptr [rsp + block_width], 8 |
74 jne .temporal_filter_apply_load_16 | 80 jne .temporal_filter_apply_load_16 |
75 lea rcx, [rdx + 8*8*1] | |
76 | 81 |
; 8-wide loader: fetch two 8-byte rows of frame1, stepping rsi by the stride
; after each, and widen every byte to a word by interleaving with the zeroed
; xmm7.
77 .temporal_filter_apply_load_8: | 82 .temporal_filter_apply_load_8: |
78 movq xmm0, [rsi] ; first row | 83 movq xmm0, [rsi] ; first row |
79 lea rsi, [rsi + rbp] ; += stride | 84 lea rsi, [rsi + rbp] ; += stride |
80 punpcklbw xmm0, xmm7 ; src[ 0- 7] | 85 punpcklbw xmm0, xmm7 ; src[ 0- 7] |
81 movq xmm1, [rsi] ; second row | 86 movq xmm1, [rsi] ; second row |
82 lea rsi, [rsi + rbp] ; += stride | 87 lea rsi, [rsi + rbp] ; += stride |
83 punpcklbw xmm1, xmm7 ; src[ 8-15] | 88 punpcklbw xmm1, xmm7 ; src[ 8-15] |
84 jmp .temporal_filter_apply_load_finished | 89 jmp .temporal_filter_apply_load_finished |
85 | 90 |
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
; Loop tail: store xmm4..xmm7 (64 bytes) to the accumulator pointed to by
; rdi. movdqa faults on an address that is not 16-byte aligned, so the
; accumulator is expected to be 16-byte aligned.
171 ; write back | 176 ; write back |
172 movdqa [rdi], xmm4 | 177 movdqa [rdi], xmm4 |
173 movdqa [rdi+16], xmm5 | 178 movdqa [rdi+16], xmm5 |
174 movdqa [rdi+32], xmm6 | 179 movdqa [rdi+32], xmm6 |
175 movdqa [rdi+48], xmm7 | 180 movdqa [rdi+48], xmm7 |
176 lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int
)) | 181 lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int
)) |
177 | 182 |
; The loop stops only when rdx equals rcx exactly.
; NOTE(review): rdx is advanced in rows not shown in this view; presumably
; it always lands on rcx -- confirm, since an overshoot would never exit.
; xmm7 was just stored as data above, so it is cleared again before the
; loaders reuse it as the zero operand.
; Dispatch here tests for a width of exactly 16; every other value goes to
; the 8-wide loader. The entry dispatch before the loop tests for exactly 8
; instead, so the two agree only for widths 8 and 16.
178 cmp rdx, rcx | 183 cmp rdx, rcx |
179 je .temporal_filter_apply_epilog | 184 je .temporal_filter_apply_epilog |
180 pxor xmm7, xmm7 ; zero for extraction | 185 pxor xmm7, xmm7 ; zero for extraction |
181 cmp dword ptr [rsp + block_size], 16 | 186 cmp dword ptr [rsp + block_width], 16 |
182 je .temporal_filter_apply_load_16 | 187 je .temporal_filter_apply_load_16 |
183 jmp .temporal_filter_apply_load_8 | 188 jmp .temporal_filter_apply_load_8 |
184 | 189 |
; Epilog: undo the prolog in reverse order. rbp is reloaded from its scratch
; slot (it held the stride during the loop), the scratch area is released,
; then rdi/rsi are popped in the opposite order to their pushes and the
; GOT/XMM/shadow-argument macros are unwound.
; NOTE(review): "pop rsp" presumably pairs with a push of the original rsp
; made inside ALIGN_STACK -- confirm against x86_abi_support.asm.
185 .temporal_filter_apply_epilog: | 190 .temporal_filter_apply_epilog: |
186 ; begin epilog | 191 ; begin epilog |
187 mov rbp, [rsp + rbp_backup] | 192 mov rbp, [rsp + rbp_backup] |
188 add rsp, stack_size | 193 add rsp, stack_size |
189 pop rsp | 194 pop rsp |
190 pop rdi | 195 pop rdi |
191 pop rsi | 196 pop rsi |
192 RESTORE_GOT | 197 RESTORE_GOT |
193 RESTORE_XMM | 198 RESTORE_XMM |
194 UNSHADOW_ARGS | 199 UNSHADOW_ARGS |
195 pop rbp | 200 pop rbp |
196 ret | 201 ret |
197 | 202 |
197 | 202 |
; Read-only constants. Each "times 8 dw X" emits eight 16-bit words
; (16 bytes), and each table is preceded by "align 16".
;   _const_3w      : eight words of 3
;   _const_top_bit : eight words of 0x8000 (1<<15); loaded with movdqa for the
;                    rounding-bit computation
;   _const_16w     : eight words of 16
; _const_3w and _const_16w are not referenced in the rows visible in this
; view; presumably they are used in the elided part of the loop.
; NOTE(review): _const_16w is written without a trailing colon, unlike the
; two labels above it. Assemblers in the NASM family accept this but may
; warn about an orphan label -- consider adding the colon for consistency.
198 SECTION_RODATA | 203 SECTION_RODATA |
199 align 16 | 204 align 16 |
200 _const_3w: | 205 _const_3w: |
201 times 8 dw 3 | 206 times 8 dw 3 |
202 align 16 | 207 align 16 |
203 _const_top_bit: | 208 _const_top_bit: |
204 times 8 dw 1<<15 | 209 times 8 dw 1<<15 |
205 align 16 | 210 align 16 |
206 _const_16w | 211 _const_16w |
207 times 8 dw 16 | 212 times 8 dw 16 |
OLD | NEW |