OLD | NEW |
| (Empty) |
1 ; | |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
3 ; | |
4 ; Use of this source code is governed by a BSD-style license | |
5 ; that can be found in the LICENSE file in the root of the source | |
6 ; tree. An additional intellectual property rights grant can be found | |
7 ; in the file PATENTS. All contributing project authors may | |
8 ; be found in the AUTHORS file in the root of the source tree. | |
9 ; | |
10 | |
11 | |
12 %include "vpx_ports/x86_abi_support.asm" | |
13 | |
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of the squares of 256 consecutive signed 16-bit values
; starting at src_ptr (8 loop iterations, 64 bytes per iteration).
; The data is read with movdqa, so src_ptr must be 16-byte aligned.
; The 32-bit result is the low dword of rax.
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; iterations left (32 words each)
    pxor        xmm4, xmm4              ; xmm4 = four 32-bit partial sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]

    ; pmaddwd x, x squares each word and adds adjacent pairs -> 4 dwords
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    dec         rcx
    ; dec does not modify CF, so the loop must branch on ZF alone (jnz)
    ; instead of a condition that also reads the carry of the add above.
    jnz         .NEXTROW

    ; Horizontal add of the four dword lanes; the total lands in lane 0.
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4               ; low dword of rax = result


    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
70 | |
71 | |
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr (rows are
; source_stride / recon_stride bytes apart) and stores
;   *Sum = sum of (src - ref) over the block
;   *SSE = sum of (src - ref)^2 over the block
; Rows are loaded with movdqu, so no alignment is required.
; Results are delivered through SSE and Sum; rax is not loaded with a
; separate return value (it holds the Sum pointer on exit).
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rsi/rax = source pointer/stride, rdi/rdx = reference pointer/stride
    mov         rsi, arg(0)                 ;[src_ptr]
    movsxd      rax, DWORD PTR arg(1)       ;[source_stride]
    mov         rdi, arg(2)                 ;[ref_ptr]
    movsxd      rdx, DWORD PTR arg(3)       ;[recon_stride]

    ; Prefetch the first eight rows of the source block ...
    lea         rcx, [rax+rax*2]            ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; ... and the first eight rows of the reference block.
    lea         rcx, [rdx+rdx*2]            ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0                  ; zero, used to widen bytes
    pxor        xmm6, xmm6                  ; 4 x 32-bit squared-diff sums
    pxor        xmm7, xmm7                  ; 8 x 16-bit diff sums
    mov         rcx, 16                     ; rows remaining

.next_row:
    ; Hint the rows eight strides ahead of the current ones.
    prefetcht0  [rsi+rax*8]
    prefetcht0  [rdi+rdx*8]

    movdqu      xmm1, XMMWORD PTR [rsi]     ; 16 source pixels
    movdqa      xmm3, xmm1
    movdqu      xmm2, XMMWORD PTR [rdi]     ; 16 reference pixels
    movdqa      xmm4, xmm2

    ; low eight pixels: widen to words and subtract
    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0
    psubw       xmm1, xmm2

    ; high eight pixels: widen to words and subtract
    punpckhbw   xmm3, xmm0
    punpckhbw   xmm4, xmm0
    psubw       xmm3, xmm4

    ; add the differences to the sum before they are squared in place
    paddw       xmm7, xmm1
    paddw       xmm7, xmm3

    pmaddwd     xmm1, xmm1
    pmaddwd     xmm3, xmm3
    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    dec         rcx
    jnz         .next_row

    ; ---- reduce the squared-difference lanes (xmm1/xmm2) ----
    movdqa      xmm1, xmm6
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0
    punpckhdq   xmm2, xmm0
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2                  ; low dword = SSE

    ; ---- reduce the difference lanes (xmm5/xmm6/xmm7) ----
    ; Unpacking onto zeroed registers puts each word in the upper half
    ; of a dword; the arithmetic shift by 16 then sign-extends it.
    pxor        xmm6, xmm6
    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7
    punpckhwd   xmm5, xmm7
    psrad       xmm6, 16
    psrad       xmm5, 16
    paddd       xmm6, xmm5

    movdqa      xmm7, xmm6
    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7
    movdqa      xmm7, xmm6
    psrldq      xmm6, 8
    paddd       xmm7, xmm6                  ; low dword = Sum

    mov         rax, arg(5)                 ;[Sum]
    mov         rdi, arg(4)                 ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
212 | |
213 | |
214 | |
215 | |
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Processes 8 rows of 8 bytes from src_ptr and ref_ptr (rows are
; source_stride / recon_stride bytes apart) and stores
;   *Sum = sum of (src - ref) over the block
;   *SSE = sum of (src - ref)^2 over the block
; Results are delivered through SSE and Sum; rax is not loaded with a
; separate return value (it holds the Sum pointer on exit).

; VAR8_ACCUM_ROW src_addr, ref_addr
;   Loads 8 pixels from each address, widens them to words, and
;   accumulates the differences into xmm7 (8 x 16-bit) and the squared
;   differences into xmm1 (4 x 32-bit).
;   Expects xmm0 == 0. Clobbers xmm2 and xmm3.
%macro VAR8_ACCUM_ROW 2
    movq        xmm2, QWORD PTR [%1]
    movq        xmm3, QWORD PTR [%2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2
%endmacro

global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    ; rsi/rax = source pointer/stride, rdi/rdx = reference pointer/stride
    mov         rsi, arg(0)                 ;[src_ptr]
    movsxd      rax, DWORD PTR arg(1)       ;[source_stride]
    mov         rdi, arg(2)                 ;[ref_ptr]
    movsxd      rdx, DWORD PTR arg(3)       ;[recon_stride]

    pxor        xmm0, xmm0                  ; zero, used to widen bytes
    pxor        xmm7, xmm7                  ; 8 x 16-bit diff sums

    ; Row 0 initialises the squared-difference accumulator xmm1 directly.
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1

    ; Rows 1 and 2
    VAR8_ACCUM_ROW rsi + rax, rdi + rdx
    VAR8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2

    ; Rows 3 and 4 (both pointers advanced by two rows)
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    VAR8_ACCUM_ROW rsi + rax, rdi + rdx
    VAR8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2

    ; Rows 5 and 6
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    VAR8_ACCUM_ROW rsi + rax, rdi + rdx
    VAR8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2

    ; Row 7
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    VAR8_ACCUM_ROW rsi + rax, rdi + rdx

    ; ---- reduce the difference lanes (xmm6/xmm7) ----
    ; The unpacks interleave with zero and every add is a 16-bit paddw,
    ; so the total is a 16-bit value in the low word of xmm7; it is
    ; sign-extended by the movsx below.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0
    punpckhwd   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm7, xmm6
    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm7, xmm6
    psrldq      xmm6, 8
    paddw       xmm7, xmm6                  ; low word = Sum

    ; ---- reduce the squared-difference lanes (xmm1/xmm2) ----
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0
    punpckhdq   xmm2, xmm0
    paddd       xmm1, xmm2

    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2                  ; low dword = SSE

    mov         rax, arg(5)                 ;[Sum]
    mov         rdi, arg(4)                 ;[SSE]

    movq        rdx, xmm7
    movsx       rcx, dx                     ; sign-extend the 16-bit sum

    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
400 | |
401 | |
OLD | NEW |