OLD | NEW |
| (Empty) |
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
10 | |
11 | |
12 %include "vpx_ports/x86_abi_support.asm" | |
13 | |
;-----------------------------------------------------------------------
;unsigned int vp9_highbd_calc16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 16 rows of 16 samples in src and ref, accumulating the sum of
; the differences (src - ref) and the sum of the squared differences.
; The two 32-bit totals are stored to *Sum and *SSE respectively.
;
; Samples are handled as 16-bit words: both strides are doubled to get
; byte offsets, and differences are formed with psubw on the raw loads.
; NOTE(review): the prototype above is carried over unchanged and says
; "unsigned char *"; given the word-sized accesses the pointers
; presumably address 16-bit samples - confirm against the C declaration.
; NOTE(review): no return value is set up; rax is last written with the
; Sum pointer before the epilog - confirm callers ignore the result.
; NOTE(review): xmm5 sums four differences per 16-bit lane before being
; widened, so this assumes four differences fit in a signed word (true
; for sample depths up to 12 bits) - confirm the supported bit depths.
;
; Register roles:
;   rsi / rdi   current src / ref row pointer
;   rax / rdx   src / ref stride in bytes
;   rbx         scratch address used for prefetching
;   rcx         rows remaining (two rows consumed per iteration)
;   xmm0        all zeros
;   xmm5        per-iteration 16-bit difference sums
;   xmm6        32-bit sum-of-squares accumulator
;   xmm7        32-bit sum-of-differences accumulator
;-----------------------------------------------------------------------
global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
sym(vp9_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; ref_ptr

    movsxd      rax, DWORD PTR arg(1)           ; source_stride
    movsxd      rdx, DWORD PTR arg(3)           ; recon_stride
    add         rax, rax                        ; source stride in bytes
    add         rdx, rdx                        ; recon stride in bytes

    ; Prefetch rows 0-3 of src (two 16-byte halves per row).
    prefetcht0  [rsi]
    prefetcht0  [rsi+16]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax+16]
    lea         rbx, [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax+16]

    ; Prefetch rows 0-3 of ref.
    prefetcht0  [rdi]
    prefetcht0  [rdi+16]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx+16]
    lea         rbx, [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx+16]

    pxor        xmm0, xmm0                      ; constant zero
    pxor        xmm7, xmm7                      ; sum of differences
    pxor        xmm6, xmm6                      ; sum of squared differences
    mov         rcx, 16                         ; rows to process

.var16loop:
    ; Prefetch the two rows following the pair handled below.  On the
    ; final iteration these addresses lie past the 16 rows this routine
    ; reads; prefetch is only a hint.
    lea         rbx, [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax+16]
    lea         rbx, [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx+16]

    pxor        xmm5, xmm5                      ; word sums for this row pair

    ; row 0, words 0-7
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]
    psubw       xmm1, xmm2                      ; diff = src - ref
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1                      ; pairwise diff*diff, as dwords
    paddd       xmm6, xmm1

    ; row 0, words 8-15
    movdqu      xmm1, XMMWORD PTR [rsi+16]
    movdqu      xmm2, XMMWORD PTR [rdi+16]
    psubw       xmm1, xmm2
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm6, xmm1

    ; row 1, words 0-7
    movdqu      xmm1, XMMWORD PTR [rsi+rax]
    movdqu      xmm2, XMMWORD PTR [rdi+rdx]
    psubw       xmm1, xmm2
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm6, xmm1

    ; row 1, words 8-15
    movdqu      xmm1, XMMWORD PTR [rsi+rax+16]
    movdqu      xmm2, XMMWORD PTR [rdi+rdx+16]
    psubw       xmm1, xmm2
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm6, xmm1

    ; Sign-extend the eight word sums to dwords and fold them into xmm7.
    pxor        xmm1, xmm1
    pcmpgtw     xmm1, xmm5                      ; 0xFFFF where the word sum < 0
    movdqa      xmm2, xmm5
    punpcklwd   xmm5, xmm1                      ; low four sums  -> dwords
    punpckhwd   xmm2, xmm1                      ; high four sums -> dwords
    paddd       xmm7, xmm5
    paddd       xmm7, xmm2

    lea         rsi, [rsi + 2*rax]              ; advance two rows
    lea         rdi, [rdi + 2*rdx]
    sub         rcx, 2
    jnz         .var16loop

    ; Horizontal add of the four dwords of xmm6; total lands in dword 0.
    movdqa      xmm4, xmm6
    punpckldq   xmm6, xmm0                      ; [s0, 0, s1, 0]
    punpckhdq   xmm4, xmm0                      ; [s2, 0, s3, 0]
    paddd       xmm6, xmm4
    movdqa      xmm4, xmm6
    psrldq      xmm4, 8
    paddd       xmm6, xmm4

    ; Same reduction for the difference sums in xmm7.
    movdqa      xmm5, xmm7
    punpckldq   xmm7, xmm0
    punpckhdq   xmm5, xmm0
    paddd       xmm7, xmm5
    movdqa      xmm5, xmm7
    psrldq      xmm5, 8
    paddd       xmm7, xmm5

    mov         rdi, arg(4)                     ; SSE
    mov         rax, arg(5)                     ; Sum

    movd        DWORD PTR [rdi], xmm6
    movd        DWORD PTR [rax], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
165 | |
166 | |
;-----------------------------------------------------------------------
;unsigned int vp9_highbd_calc8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 8 rows of 8 samples in src and ref, accumulating the sum of the
; differences (src - ref) and the sum of the squared differences.  The
; two 32-bit totals are stored to *Sum and *SSE respectively.
;
; Samples are handled as 16-bit words: both strides are doubled to get
; byte offsets, and differences are formed with psubw on the raw loads.
; NOTE(review): the prototype above is carried over unchanged and says
; "unsigned char *"; given the word-sized accesses the pointers
; presumably address 16-bit samples - confirm against the C declaration.
; NOTE(review): no return value is set up; rax is last written with the
; Sum pointer before the epilog - confirm callers ignore the result.
; NOTE(review): xmm5 sums four differences per 16-bit lane before being
; widened, so this assumes four differences fit in a signed word (true
; for sample depths up to 12 bits) - confirm the supported bit depths.
;
; Register roles:
;   rsi / rdi   current src / ref row pointer
;   rax / rdx   src / ref stride in bytes
;   rbx         scratch address used for prefetching
;   rcx         rows remaining (four rows consumed per iteration)
;   xmm0        all zeros
;   xmm5        per-iteration 16-bit difference sums
;   xmm6        32-bit sum-of-squares accumulator
;   xmm7        32-bit sum-of-differences accumulator
;-----------------------------------------------------------------------
global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
sym(vp9_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; ref_ptr

    movsxd      rax, DWORD PTR arg(1)           ; source_stride
    movsxd      rdx, DWORD PTR arg(3)           ; recon_stride
    add         rax, rax                        ; source stride in bytes
    add         rdx, rdx                        ; recon stride in bytes

    ; Prefetch rows 0-3 of src.
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    lea         rbx, [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]

    ; Prefetch rows 0-3 of ref.
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    lea         rbx, [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]

    pxor        xmm0, xmm0                      ; constant zero
    pxor        xmm7, xmm7                      ; sum of differences
    pxor        xmm6, xmm6                      ; sum of squared differences
    mov         rcx, 8                          ; rows to process

.var8loop:
    ; Prefetch the four rows following the group handled below.  On the
    ; final iteration these addresses lie past the 8 rows this routine
    ; reads; prefetch is only a hint.
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    lea         rbx, [rbx+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    lea         rbx, [rbx+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]

    pxor        xmm5, xmm5                      ; word sums for this row group

    ; row 0
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]
    psubw       xmm1, xmm2                      ; diff = src - ref
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1                      ; pairwise diff*diff, as dwords
    paddd       xmm6, xmm1

    ; row 1
    movdqu      xmm1, XMMWORD PTR [rsi+rax]
    movdqu      xmm2, XMMWORD PTR [rdi+rdx]
    psubw       xmm1, xmm2
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm6, xmm1

    lea         rsi, [rsi + 2*rax]              ; step to rows 2 and 3
    lea         rdi, [rdi + 2*rdx]

    ; row 2
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]
    psubw       xmm1, xmm2
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm6, xmm1

    ; row 3
    movdqu      xmm1, XMMWORD PTR [rsi+rax]
    movdqu      xmm2, XMMWORD PTR [rdi+rdx]
    psubw       xmm1, xmm2
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm6, xmm1

    ; Sign-extend the eight word sums to dwords and fold them into xmm7.
    pxor        xmm1, xmm1
    pcmpgtw     xmm1, xmm5                      ; 0xFFFF where the word sum < 0
    movdqa      xmm2, xmm5
    punpcklwd   xmm5, xmm1                      ; low four sums  -> dwords
    punpckhwd   xmm2, xmm1                      ; high four sums -> dwords
    paddd       xmm7, xmm5
    paddd       xmm7, xmm2

    lea         rsi, [rsi + 2*rax]              ; step past rows 2 and 3
    lea         rdi, [rdi + 2*rdx]
    sub         rcx, 4
    jnz         .var8loop

    ; Horizontal add of the four dwords of xmm6; total lands in dword 0.
    movdqa      xmm4, xmm6
    punpckldq   xmm6, xmm0                      ; [s0, 0, s1, 0]
    punpckhdq   xmm4, xmm0                      ; [s2, 0, s3, 0]
    paddd       xmm6, xmm4
    movdqa      xmm4, xmm6
    psrldq      xmm4, 8
    paddd       xmm6, xmm4

    ; Same reduction for the difference sums in xmm7.
    movdqa      xmm5, xmm7
    punpckldq   xmm7, xmm0
    punpckhdq   xmm5, xmm0
    paddd       xmm7, xmm5
    movdqa      xmm5, xmm7
    psrldq      xmm5, 8
    paddd       xmm7, xmm5

    mov         rdi, arg(4)                     ; SSE
    mov         rax, arg(5)                     ; Sum

    movd        DWORD PTR [rdi], xmm6
    movd        DWORD PTR [rax], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
OLD | NEW |