OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 | 11 |
12 %include "vpx_ports/x86_abi_support.asm" | 12 %include "vpx_ports/x86_abi_support.asm" |
13 | 13 |
14 %define xmm_filter_shift 7 | 14 %define xmm_filter_shift 7 |
15 | 15 |
; vp8_get_mb_ss_sse2: sum of squares of the 256 signed 16-bit values at
; src_ptr (8 loop passes x 64 bytes per pass).
;   - src_ptr is read with movdqa, so it must be 16-byte aligned.
;   - The total is accumulated with paddd (32-bit lanes, wraps modulo 2^32)
;     and is returned in the low dword of rax.
; NOTE(review): nothing in the visible body uses the GOT register (rbx),
; rsi, rdi or the 16 bytes reserved on the stack; the prolog/epilog look
; like boilerplate - confirm before trimming.
16 ;unsigned int vp8_get_mb_ss_sse2 | |
17 ;( | |
18 ; short *src_ptr | |
19 ;) | |
20 global sym(vp8_get_mb_ss_sse2) PRIVATE | |
21 sym(vp8_get_mb_ss_sse2): | |
22 push rbp | |
23 mov rbp, rsp | |
24 SHADOW_ARGS_TO_STACK 1 | |
25 GET_GOT rbx | |
26 push rsi | |
27 push rdi | |
28 sub rsp, 16 | |
29 ; end prolog | |
30 | |
31 | |
; rax = read pointer, rcx = passes remaining, xmm4 = four dword partial sums
32 mov rax, arg(0) ;[src_ptr] | |
33 mov rcx, 8 | |
34 pxor xmm4, xmm4 | |
35 | |
36 .NEXTROW: | |
; load 32 words (64 bytes)
37 movdqa xmm0, [rax] | |
38 movdqa xmm1, [rax+16] | |
39 movdqa xmm2, [rax+32] | |
40 movdqa xmm3, [rax+48] | |
; pmaddwd x, x: square each word and add adjacent pairs into dwords
41 pmaddwd xmm0, xmm0 | |
42 pmaddwd xmm1, xmm1 | |
43 pmaddwd xmm2, xmm2 | |
44 pmaddwd xmm3, xmm3 | |
45 | |
46 paddd xmm0, xmm1 | |
47 paddd xmm2, xmm3 | |
48 paddd xmm4, xmm0 | |
49 paddd xmm4, xmm2 | |
50 | |
51 add rax, 0x40 | |
52 dec rcx | |
; jnz, not ja: dec does not write CF, so ja (CF=0 and ZF=0) would depend on
; the carry left behind by the pointer add above instead of on the counter.
53 jnz .NEXTROW | |
54 | |
; horizontal reduction: fold the upper 64 bits onto the lower 64 bits, then
; dword lane 1 onto lane 0; lane 0 of xmm4 then holds the full total
55 movdqa xmm3,xmm4 | |
56 psrldq xmm4,8 | |
57 paddd xmm4,xmm3 | |
58 movdqa xmm3,xmm4 | |
59 psrldq xmm4,4 | |
60 paddd xmm4,xmm3 | |
; only the low dword of the value moved into rax is the result
61 movq rax,xmm4 | |
62 | |
63 | |
64 ; begin epilog | |
65 add rsp, 16 | |
66 pop rdi | |
67 pop rsi | |
68 RESTORE_GOT | |
69 UNSHADOW_ARGS | |
70 pop rbp | |
71 ret | |
72 | |
73 | |
; vp8_get16x16var_sse2: walks 16 rows of 16 bytes from src_ptr and ref_ptr
; (advancing by source_stride / recon_stride per row) and accumulates the
; per-pixel difference d = src - ref:
;   *Sum = sum of d       (stored as a 32-bit value)
;   *SSE = sum of d * d   (stored as a 32-bit value)
; Rows are loaded with movdqu, so neither pointer needs to be aligned.
; NOTE(review): the prototype below says "unsigned int", but the last write
; to rax in this body is the Sum pointer (arg(5)); unless the epilog macros
; change rax, no meaningful value is returned - confirm callers ignore it.
74 ;unsigned int vp8_get16x16var_sse2 | |
75 ;( | |
76 ; unsigned char * src_ptr, | |
77 ; int source_stride, | |
78 ; unsigned char * ref_ptr, | |
79 ; int recon_stride, | |
80 ; unsigned int * SSE, | |
81 ; int * Sum | |
82 ;) | |
83 global sym(vp8_get16x16var_sse2) PRIVATE | |
84 sym(vp8_get16x16var_sse2): | |
85 push rbp | |
86 mov rbp, rsp | |
87 SHADOW_ARGS_TO_STACK 6 | |
88 SAVE_XMM 7 | |
; rbx is saved because it is used below as a scratch address register
89 push rbx | |
90 push rsi | |
91 push rdi | |
92 ; end prolog | |
93 | |
94 mov rsi, arg(0) ;[src_ptr] | |
95 mov rdi, arg(2) ;[ref_ptr] | |
96 | |
; strides are ints: sign-extend to 64 bits for address arithmetic
97 movsxd rax, DWORD PTR arg(1) ;[source_stride] | |
98 movsxd rdx, DWORD PTR arg(3) ;[recon_stride] | |
99 | |
100 ; Prefetch data | |
; rows 0..7 of the source block (rcx = 3 * source_stride)
101 lea rcx, [rax+rax*2] | |
102 prefetcht0 [rsi] | |
103 prefetcht0 [rsi+rax] | |
104 prefetcht0 [rsi+rax*2] | |
105 prefetcht0 [rsi+rcx] | |
106 lea rbx, [rsi+rax*4] | |
107 prefetcht0 [rbx] | |
108 prefetcht0 [rbx+rax] | |
109 prefetcht0 [rbx+rax*2] | |
110 prefetcht0 [rbx+rcx] | |
111 | |
; rows 0..7 of the reference block (rcx = 3 * recon_stride)
112 lea rcx, [rdx+rdx*2] | |
113 prefetcht0 [rdi] | |
114 prefetcht0 [rdi+rdx] | |
115 prefetcht0 [rdi+rdx*2] | |
116 prefetcht0 [rdi+rcx] | |
117 lea rbx, [rdi+rdx*4] | |
118 prefetcht0 [rbx] | |
119 prefetcht0 [rbx+rdx] | |
120 prefetcht0 [rbx+rdx*2] | |
121 prefetcht0 [rbx+rcx] | |
122 | |
; xmm0 = 0 (unpack helper), xmm7 = eight word lanes of summed differences,
; xmm6 = four dword lanes of summed squares, rcx = rows remaining
123 pxor xmm0, xmm0 ; clear xmm0 for
unpack | |
124 pxor xmm7, xmm7 ; clear xmm7 for
accumulating diffs | |
125 | |
126 pxor xmm6, xmm6 ; clear xmm6 for
accumulating sse | |
127 mov rcx, 16 | |
128 | |
129 .var16loop: | |
130 movdqu xmm1, XMMWORD PTR [rsi] | |
131 movdqu xmm2, XMMWORD PTR [rdi] | |
132 | |
; prefetch the rows 8 strides ahead of the current one
133 prefetcht0 [rsi+rax*8] | |
134 prefetcht0 [rdi+rdx*8] | |
135 | |
136 movdqa xmm3, xmm1 | |
137 movdqa xmm4, xmm2 | |
138 | |
139 | |
; zero-extend bytes to words: xmm1/xmm2 = pixels 0..7, xmm3/xmm4 = pixels 8..15
140 punpcklbw xmm1, xmm0 | |
141 punpckhbw xmm3, xmm0 | |
142 | |
143 punpcklbw xmm2, xmm0 | |
144 punpckhbw xmm4, xmm0 | |
145 | |
146 | |
; word differences src - ref, each in [-255, 255]
147 psubw xmm1, xmm2 | |
148 psubw xmm3, xmm4 | |
149 | |
; each word lane of xmm7 receives 2 differences per row, 32 in total, so its
; magnitude stays <= 32 * 255 = 8160 and cannot overflow 16 bits
150 paddw xmm7, xmm1 | |
151 pmaddwd xmm1, xmm1 | |
152 | |
153 paddw xmm7, xmm3 | |
154 pmaddwd xmm3, xmm3 | |
155 | |
156 paddd xmm6, xmm1 | |
157 paddd xmm6, xmm3 | |
158 | |
159 add rsi, rax | |
160 add rdi, rdx | |
161 | |
162 sub rcx, 1 | |
163 jnz .var16loop | |
164 | |
165 | |
; xmm1 takes over the squared-difference sums; xmm6 is reused for the sum
166 movdqa xmm1, xmm6 | |
167 pxor xmm6, xmm6 | |
168 | |
; sign-extend the eight word sums to dwords: interleaving with a zeroed
; destination puts each word in the upper half of a dword, and the
; arithmetic shift right by 16 brings it back down with its sign
169 pxor xmm5, xmm5 | |
170 punpcklwd xmm6, xmm7 | |
171 | |
172 punpckhwd xmm5, xmm7 | |
173 psrad xmm5, 16 | |
174 | |
175 psrad xmm6, 16 | |
176 paddd xmm6, xmm5 | |
177 | |
; horizontal reduction of both accumulators: spread the four dwords across
; two registers (interleaved with zero), add, then fold the upper 64 bits
; onto the lower 64 bits; the low dword of each result holds the total
178 movdqa xmm2, xmm1 | |
179 punpckldq xmm1, xmm0 | |
180 | |
181 punpckhdq xmm2, xmm0 | |
182 movdqa xmm7, xmm6 | |
183 | |
184 paddd xmm1, xmm2 | |
185 punpckldq xmm6, xmm0 | |
186 | |
187 punpckhdq xmm7, xmm0 | |
188 paddd xmm6, xmm7 | |
189 | |
190 movdqa xmm2, xmm1 | |
191 movdqa xmm7, xmm6 | |
192 | |
193 psrldq xmm1, 8 | |
194 psrldq xmm6, 8 | |
195 | |
196 paddd xmm7, xmm6 | |
197 paddd xmm1, xmm2 | |
198 | |
199 mov rax, arg(5) ;[Sum] | |
200 mov rdi, arg(4) ;[SSE] | |
201 | |
; xmm7 low dword = sum of differences, xmm1 low dword = sum of squares
202 movd DWORD PTR [rax], xmm7 | |
203 movd DWORD PTR [rdi], xmm1 | |
204 | |
205 | |
206 ; begin epilog | |
207 pop rdi | |
208 pop rsi | |
209 pop rbx | |
210 RESTORE_XMM | |
211 UNSHADOW_ARGS | |
212 pop rbp | |
213 ret | |
214 | |
215 | |
216 | |
217 | |
; vp8_get8x8var_sse2: fully unrolled 8-row x 8-byte version of the
; difference accumulation. With d = src - ref per pixel:
;   *Sum = sum of d       (16-bit total, sign-extended and stored as 32 bits)
;   *SSE = sum of d * d   (stored as a 32-bit value)
; Rows are loaded with movq (8 bytes), so no alignment is required.
; NOTE(review): the prototype below says "unsigned int", but the last write
; to rax in this body is the Sum pointer (arg(5)); unless the epilog macros
; change rax, no meaningful value is returned - confirm callers ignore it.
; NOTE(review): nothing in the visible body uses the GOT register (rbx) or
; the 16 bytes reserved on the stack - confirm before trimming.
218 ;unsigned int vp8_get8x8var_sse2 | |
219 ;( | |
220 ; unsigned char * src_ptr, | |
221 ; int source_stride, | |
222 ; unsigned char * ref_ptr, | |
223 ; int recon_stride, | |
224 ; unsigned int * SSE, | |
225 ; int * Sum | |
226 ;) | |
227 global sym(vp8_get8x8var_sse2) PRIVATE | |
228 sym(vp8_get8x8var_sse2): | |
229 push rbp | |
230 mov rbp, rsp | |
231 SHADOW_ARGS_TO_STACK 6 | |
232 SAVE_XMM 7 | |
233 GET_GOT rbx | |
234 push rsi | |
235 push rdi | |
236 sub rsp, 16 | |
237 ; end prolog | |
238 | |
239 mov rsi, arg(0) ;[src_ptr] | |
240 mov rdi, arg(2) ;[ref_ptr] | |
241 | |
; strides are ints: sign-extend to 64 bits for address arithmetic
242 movsxd rax, DWORD PTR arg(1) ;[source_stride] | |
243 movsxd rdx, DWORD PTR arg(3) ;[recon_stride] | |
244 | |
; xmm0 = 0 (unpack helper), xmm7 = eight word lanes of summed differences,
; xmm1 = four dword lanes of summed squares (seeded by row 0 below)
245 pxor xmm0, xmm0 ; clear xmm0 for
unpack | |
246 pxor xmm7, xmm7 ; clear xmm7 for
accumulating diffs | |
247 | |
; ---- row 0 ----
248 movq xmm1, QWORD PTR [rsi] | |
249 movq xmm2, QWORD PTR [rdi] | |
250 | |
; zero-extend the 8 bytes of each row to words
251 punpcklbw xmm1, xmm0 | |
252 punpcklbw xmm2, xmm0 | |
253 | |
; saturating subtract; the operands are in 0..255, so the difference is in
; [-255, 255] and never actually saturates
254 psubsw xmm1, xmm2 | |
255 paddw xmm7, xmm1 | |
256 | |
; square each difference and add adjacent pairs into dwords
257 pmaddwd xmm1, xmm1 | |
258 | |
; ---- row 1 ----
259 movq xmm2, QWORD PTR[rsi + rax] | |
260 movq xmm3, QWORD PTR[rdi + rdx] | |
261 | |
262 punpcklbw xmm2, xmm0 | |
263 punpcklbw xmm3, xmm0 | |
264 | |
265 psubsw xmm2, xmm3 | |
266 paddw xmm7, xmm2 | |
267 | |
268 pmaddwd xmm2, xmm2 | |
269 paddd xmm1, xmm2 | |
270 | |
271 | |
; ---- row 2 ----
272 movq xmm2, QWORD PTR[rsi + rax * 2] | |
273 movq xmm3, QWORD PTR[rdi + rdx * 2] | |
274 | |
275 punpcklbw xmm2, xmm0 | |
276 punpcklbw xmm3, xmm0 | |
277 | |
278 psubsw xmm2, xmm3 | |
279 paddw xmm7, xmm2 | |
280 | |
281 pmaddwd xmm2, xmm2 | |
282 paddd xmm1, xmm2 | |
283 | |
284 | |
; advance both pointers by two rows; [ptr + stride] is now row 3
285 lea rsi, [rsi + rax * 2] | |
286 lea rdi, [rdi + rdx * 2] | |
; ---- row 3 ----
287 movq xmm2, QWORD PTR[rsi + rax] | |
288 movq xmm3, QWORD PTR[rdi + rdx] | |
289 | |
290 punpcklbw xmm2, xmm0 | |
291 punpcklbw xmm3, xmm0 | |
292 | |
293 psubsw xmm2, xmm3 | |
294 paddw xmm7, xmm2 | |
295 | |
296 pmaddwd xmm2, xmm2 | |
297 paddd xmm1, xmm2 | |
298 | |
; ---- row 4 ----
299 movq xmm2, QWORD PTR[rsi + rax *2] | |
300 movq xmm3, QWORD PTR[rdi + rdx *2] | |
301 | |
302 punpcklbw xmm2, xmm0 | |
303 punpcklbw xmm3, xmm0 | |
304 | |
305 psubsw xmm2, xmm3 | |
306 paddw xmm7, xmm2 | |
307 | |
308 pmaddwd xmm2, xmm2 | |
309 paddd xmm1, xmm2 | |
310 | |
311 | |
; advance by two more rows; [ptr + stride] is now row 5
312 lea rsi, [rsi + rax * 2] | |
313 lea rdi, [rdi + rdx * 2] | |
314 | |
315 | |
; ---- row 5 ----
316 movq xmm2, QWORD PTR[rsi + rax] | |
317 movq xmm3, QWORD PTR[rdi + rdx] | |
318 | |
319 punpcklbw xmm2, xmm0 | |
320 punpcklbw xmm3, xmm0 | |
321 | |
322 psubsw xmm2, xmm3 | |
323 paddw xmm7, xmm2 | |
324 | |
325 pmaddwd xmm2, xmm2 | |
326 paddd xmm1, xmm2 | |
327 | |
; ---- row 6 ----
328 movq xmm2, QWORD PTR[rsi + rax *2] | |
329 movq xmm3, QWORD PTR[rdi + rdx *2] | |
330 | |
331 punpcklbw xmm2, xmm0 | |
332 punpcklbw xmm3, xmm0 | |
333 | |
334 psubsw xmm2, xmm3 | |
335 paddw xmm7, xmm2 | |
336 | |
337 pmaddwd xmm2, xmm2 | |
338 paddd xmm1, xmm2 | |
339 | |
340 | |
; advance by two more rows; [ptr + stride] is now row 7 (the last one)
341 lea rsi, [rsi + rax * 2] | |
342 lea rdi, [rdi + rdx * 2] | |
343 | |
; ---- row 7 ----
344 movq xmm2, QWORD PTR[rsi + rax] | |
345 movq xmm3, QWORD PTR[rdi + rdx] | |
346 | |
347 punpcklbw xmm2, xmm0 | |
348 punpcklbw xmm3, xmm0 | |
349 | |
350 psubsw xmm2, xmm3 | |
351 paddw xmm7, xmm2 | |
352 | |
353 pmaddwd xmm2, xmm2 | |
354 paddd xmm1, xmm2 | |
355 | |
356 | |
; Horizontal reduction. The word sums in xmm7 are widened by interleaving
; with zero (not sign-extended) and are combined with paddw, so only the low
; 16 bits of each lane are meaningful; the final low word is the 16-bit
; total of all eight lanes and is sign-extended with movsx further down.
; The total fits in 16 bits: 64 differences * 255 = 16320 at most.
; xmm1 (squared sums) is reduced alongside with dword adds.
357 movdqa xmm6, xmm7 | |
358 punpcklwd xmm6, xmm0 | |
359 | |
360 punpckhwd xmm7, xmm0 | |
361 movdqa xmm2, xmm1 | |
362 | |
363 paddw xmm6, xmm7 | |
364 punpckldq xmm1, xmm0 | |
365 | |
366 punpckhdq xmm2, xmm0 | |
367 movdqa xmm7, xmm6 | |
368 | |
369 paddd xmm1, xmm2 | |
370 punpckldq xmm6, xmm0 | |
371 | |
372 punpckhdq xmm7, xmm0 | |
373 paddw xmm6, xmm7 | |
374 | |
375 movdqa xmm2, xmm1 | |
376 movdqa xmm7, xmm6 | |
377 | |
378 psrldq xmm1, 8 | |
379 psrldq xmm6, 8 | |
380 | |
381 paddw xmm7, xmm6 | |
382 paddd xmm1, xmm2 | |
383 | |
384 mov rax, arg(5) ;[Sum] | |
385 mov rdi, arg(4) ;[SSE] | |
386 | |
; take the low 16 bits of the reduced sum and sign-extend them
387 movq rdx, xmm7 | |
388 movsx rcx, dx | |
389 | |
390 mov dword ptr [rax], ecx | |
391 movd DWORD PTR [rdi], xmm1 | |
392 | |
393 ; begin epilog | |
394 add rsp, 16 | |
395 pop rdi | |
396 pop rsi | |
397 RESTORE_GOT | |
398 RESTORE_XMM | |
399 UNSHADOW_ARGS | |
400 pop rbp | |
401 ret | |
402 | |
403 ;void vp8_filter_block2d_bil_var_sse2 | 16 ;void vp8_filter_block2d_bil_var_sse2 |
404 ;( | 17 ;( |
405 ; unsigned char *ref_ptr, | 18 ; unsigned char *ref_ptr, |
406 ; int ref_pixels_per_line, | 19 ; int ref_pixels_per_line, |
407 ; unsigned char *src_ptr, | 20 ; unsigned char *src_ptr, |
408 ; int src_pixels_per_line, | 21 ; int src_pixels_per_line, |
409 ; unsigned int Height, | 22 ; unsigned int Height, |
410 ; int xoffset, | 23 ; int xoffset, |
411 ; int yoffset, | 24 ; int yoffset, |
412 ; int *sum, | 25 ; int *sum, |
(...skipping 937 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
; vp8_bilinear_filters_sse2: eight rows of 16 words, 16-byte aligned.
; Each row is eight copies of one weight followed by eight copies of a
; second weight. The two weights of every row add up to 128, i.e.
; 1 << xmm_filter_shift (defined as 7 near the top of this file). From row to
; row the first weight falls by 16 (128 down to 16) and the second rises
; by 16 (0 up to 112).
; NOTE(review): presumably a row is selected by the xoffset / yoffset
; arguments of the bilinear routines - the indexing code is not shown
; here; confirm.
1350 align 16 | 963 align 16 |
1351 vp8_bilinear_filters_sse2: | 964 vp8_bilinear_filters_sse2: |
1352 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 | 965 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 |
1353 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 | 966 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 |
1354 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 | 967 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 |
1355 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 | 968 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 |
1356 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 | 969 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 |
1357 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 | 970 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 |
1358 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 | 971 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 |
1359 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 | 972 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 |
OLD | NEW |