Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(361)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
11 11
12 %include "vpx_ports/x86_abi_support.asm" 12 %include "vpx_ports/x86_abi_support.asm"
13 13
14 ; void vp9_temporal_filter_apply_sse2 | arg 14 ; void vp9_temporal_filter_apply_sse2 | arg
15 ; (unsigned char *frame1, | 0 15 ; (unsigned char *frame1, | 0
16 ; unsigned int stride, | 1 16 ; unsigned int stride, | 1
17 ; unsigned char *frame2, | 2 17 ; unsigned char *frame2, | 2
18 ; unsigned int block_size, | 3 18 ; unsigned int block_width, | 3
19 ; int strength, | 4 19 ; unsigned int block_height, | 4
20 ; int filter_weight, | 5 20 ; int strength, | 5
21 ; unsigned int *accumulator, | 6 21 ; int filter_weight, | 6
22 ; unsigned short *count) | 7 22 ; unsigned int *accumulator, | 7
23 ; unsigned short *count) | 8
23 global sym(vp9_temporal_filter_apply_sse2) PRIVATE 24 global sym(vp9_temporal_filter_apply_sse2) PRIVATE
24 sym(vp9_temporal_filter_apply_sse2): 25 sym(vp9_temporal_filter_apply_sse2):
25 26
26 push rbp 27 push rbp
27 mov rbp, rsp 28 mov rbp, rsp
28 SHADOW_ARGS_TO_STACK 8 29 SHADOW_ARGS_TO_STACK 9
29 SAVE_XMM 7 30 SAVE_XMM 7
30 GET_GOT rbx 31 GET_GOT rbx
31 push rsi 32 push rsi
32 push rdi 33 push rdi
33 ALIGN_STACK 16, rax 34 ALIGN_STACK 16, rax
34 %define block_size 0 35 %define block_width 0
35 %define strength 16 36 %define block_height 16
36 %define filter_weight 32 37 %define strength 32
37 %define rounding_bit 48 38 %define filter_weight 48
38 %define rbp_backup 64 39 %define rounding_bit 64
39 %define stack_size 80 40 %define rbp_backup 80
41 %define stack_size 96
40 sub rsp, stack_size 42 sub rsp, stack_size
41 mov [rsp + rbp_backup], rbp 43 mov [rsp + rbp_backup], rbp
42 ; end prolog 44 ; end prolog
43 45
44 mov rdx, arg(3) 46 mov edx, arg(3)
45 mov [rsp + block_size], rdx 47 mov [rsp + block_width], rdx
46 movd xmm6, arg(4) 48 mov edx, arg(4)
49 mov [rsp + block_height], rdx
50 movd xmm6, arg(5)
47 movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read 51 movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
48 52
49 ; calculate the rounding bit outside the loop 53 ; calculate the rounding bit outside the loop
50 ; 0x8000 >> (16 - strength) 54 ; 0x8000 >> (16 - strength)
51 mov rdx, 16 55 mov rdx, 16
52 sub rdx, arg(4) ; 16 - strength 56 sub rdx, arg(5) ; 16 - strength
53 movq xmm4, rdx ; can't use rdx w/ shift 57 movq xmm4, rdx ; can't use rdx w/ shift
54 movdqa xmm5, [GLOBAL(_const_top_bit)] 58 movdqa xmm5, [GLOBAL(_const_top_bit)]
55 psrlw xmm5, xmm4 59 psrlw xmm5, xmm4
56 movdqa [rsp + rounding_bit], xmm5 60 movdqa [rsp + rounding_bit], xmm5
57 61
58 mov rsi, arg(0) ; src/frame1 62 mov rsi, arg(0) ; src/frame1
59 mov rdx, arg(2) ; predictor frame 63 mov rdx, arg(2) ; predictor frame
60 mov rdi, arg(6) ; accumulator 64 mov rdi, arg(7) ; accumulator
61 mov rax, arg(7) ; count 65 mov rax, arg(8) ; count
62 66
63 ; dup the filter weight and store for later 67 ; dup the filter weight and store for later
64 movd xmm0, arg(5) ; filter_weight 68 movd xmm0, arg(6) ; filter_weight
65 pshuflw xmm0, xmm0, 0 69 pshuflw xmm0, xmm0, 0
66 punpcklwd xmm0, xmm0 70 punpcklwd xmm0, xmm0
67 movdqa [rsp + filter_weight], xmm0 71 movdqa [rsp + filter_weight], xmm0
68 72
69 mov rbp, arg(1) ; stride 73 mov rbp, arg(1) ; stride
70 pxor xmm7, xmm7 ; zero for extraction 74 pxor xmm7, xmm7 ; zero for extraction
71 75
72 lea rcx, [rdx + 16*16*1] 76 mov rcx, [rsp + block_width]
73 cmp dword ptr [rsp + block_size], 8 77 imul rcx, [rsp + block_height]
78 add rcx, rdx
79 cmp dword ptr [rsp + block_width], 8
74 jne .temporal_filter_apply_load_16 80 jne .temporal_filter_apply_load_16
75 lea rcx, [rdx + 8*8*1]
76 81
77 .temporal_filter_apply_load_8: 82 .temporal_filter_apply_load_8:
78 movq xmm0, [rsi] ; first row 83 movq xmm0, [rsi] ; first row
79 lea rsi, [rsi + rbp] ; += stride 84 lea rsi, [rsi + rbp] ; += stride
80 punpcklbw xmm0, xmm7 ; src[ 0- 7] 85 punpcklbw xmm0, xmm7 ; src[ 0- 7]
81 movq xmm1, [rsi] ; second row 86 movq xmm1, [rsi] ; second row
82 lea rsi, [rsi + rbp] ; += stride 87 lea rsi, [rsi + rbp] ; += stride
83 punpcklbw xmm1, xmm7 ; src[ 8-15] 88 punpcklbw xmm1, xmm7 ; src[ 8-15]
84 jmp .temporal_filter_apply_load_finished 89 jmp .temporal_filter_apply_load_finished
85 90
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after
171 ; write back 176 ; write back
172 movdqa [rdi], xmm4 177 movdqa [rdi], xmm4
173 movdqa [rdi+16], xmm5 178 movdqa [rdi+16], xmm5
174 movdqa [rdi+32], xmm6 179 movdqa [rdi+32], xmm6
175 movdqa [rdi+48], xmm7 180 movdqa [rdi+48], xmm7
176 lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) 181 lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
177 182
178 cmp rdx, rcx 183 cmp rdx, rcx
179 je .temporal_filter_apply_epilog 184 je .temporal_filter_apply_epilog
180 pxor xmm7, xmm7 ; zero for extraction 185 pxor xmm7, xmm7 ; zero for extraction
181 cmp dword ptr [rsp + block_size], 16 186 cmp dword ptr [rsp + block_width], 16
182 je .temporal_filter_apply_load_16 187 je .temporal_filter_apply_load_16
183 jmp .temporal_filter_apply_load_8 188 jmp .temporal_filter_apply_load_8
184 189
185 .temporal_filter_apply_epilog: 190 .temporal_filter_apply_epilog:
186 ; begin epilog 191 ; begin epilog
187 mov rbp, [rsp + rbp_backup] 192 mov rbp, [rsp + rbp_backup]
188 add rsp, stack_size 193 add rsp, stack_size
189 pop rsp 194 pop rsp
190 pop rdi 195 pop rdi
191 pop rsi 196 pop rsi
192 RESTORE_GOT 197 RESTORE_GOT
193 RESTORE_XMM 198 RESTORE_XMM
194 UNSHADOW_ARGS 199 UNSHADOW_ARGS
195 pop rbp 200 pop rbp
196 ret 201 ret
197 202
198 SECTION_RODATA 203 SECTION_RODATA
199 align 16 204 align 16
200 _const_3w: 205 _const_3w:
201 times 8 dw 3 206 times 8 dw 3
202 align 16 207 align 16
203 _const_top_bit: 208 _const_top_bit:
204 times 8 dw 1<<15 209 times 8 dw 1<<15
205 align 16 210 align 16
206 _const_16w 211 _const_16w
207 times 8 dw 16 212 times 8 dw 16
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm ('k') | source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698