OLD | NEW |
| (Empty) |
1 ; | |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
3 ; | |
4 ; Use of this source code is governed by a BSD-style license | |
5 ; that can be found in the LICENSE file in the root of the source | |
6 ; tree. An additional intellectual property rights grant can be found | |
7 ; in the file PATENTS. All contributing project authors may | |
8 ; be found in the AUTHORS file in the root of the source tree. | |
9 ; | |
10 | |
11 | |
12 %include "vpx_ports/x86_abi_support.asm" | |
13 | |
14 %define xmm_filter_shift 7 | |
15 | |
;void vp9_filter_block2d_bil_var_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-pixel-wide block of Height rows, accumulates
;     diff = bilinear_filter(ref) - src
; and stores the total of diff in *sum and the total of diff*diff in
; *sumsquared.
;
; xoffset / yoffset select a 32-byte entry of bilinear_filters_sse2 (eight
; words of the first tap followed by eight words of the second tap).  An
; offset of 0 skips the corresponding filter pass entirely.
;
; Memory read from ref_ptr:
;   xoffset != 0 : 9 bytes per row (pixels x and x+1)
;   yoffset != 0 : Height + 1 rows (rows y and y+1)
;
; Register roles inside the loops:
;   rsi  = ref row pointer          rdi  = src row pointer
;   rcx  = rows remaining           rax  = HFilter (or ref stride)
;   rdx  = VFilter (or ref stride)  rbx/r9 = strides
;   xmm0 = zero                     xmm4 = rounding constant (xmm_bi_rd)
;   xmm5 = previous filtered row    xmm6 = 8 x int16 running sum
;   xmm7 = 4 x int32 running sum of squares
;
; The final horizontal reduction is done entirely in XMM registers, so the
; function never touches the MMX/x87 state and no emms is required.  The
; 16-bit lane sums are sign-extended to 32 bits before being added together.
global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
sym(vp9_filter_block2d_bil_var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        xmm6, xmm6                      ; running sum of differences
    pxor        xmm7, xmm7                      ; running sum of squared differences

    ; Both GLOBAL() references must happen before rbx is reused as a stride
    ; register (GET_GOT was given rbx above).
    lea         rsi, [GLOBAL(xmm_bi_rd)]        ; rounding
    movdqa      xmm4, XMMWORD PTR [rsi]

    lea         rcx, [GLOBAL(bilinear_filters_sse2)]
    movsxd      rax, dword ptr arg(5)           ; xoffset

    test        rax, rax                        ; skip first pass if xoffset == 0
    jz          .vert_only

    shl         rax, 5                          ; 32 bytes per filter entry
    lea         rax, [rax + rcx]                ; HFilter

    movsxd      rdx, dword ptr arg(6)           ; yoffset

    test        rdx, rdx                        ; skip second pass if yoffset == 0
    jz          .horiz_only

    shl         rdx, 5
    lea         rdx, [rdx + rcx]                ; VFilter

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height

    ; Horizontally filter row 0 so the loop always has a "previous" row.
    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift
    movdqa      xmm5, xmm1

    movsxd      rbx, dword ptr arg(1)           ; ref_pixels_per_line
    lea         rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)            ; src_pixels_per_line
%endif

.two_pass_loop:
    ; first pass: horizontal filter of the current ref row
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movdqa      xmm3, xmm5                      ; previous filtered row
    movdqa      xmm5, xmm1                      ; keep current row for next iteration

    ; second pass: vertical filter between previous and current row
    pmullw      xmm3, [rdx]
    pmullw      xmm1, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3                      ; diff = filtered - src
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rbx]                ; ref_pixels_per_line
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)           ; src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         .two_pass_loop

    jmp         .reduce

.vert_only:
    ; xoffset == 0; rcx still holds the filter table base here.
    movsxd      rdx, dword ptr arg(6)           ; yoffset

    test        rdx, rdx                        ; no filtering at all if yoffset == 0 too
    jz          .full_pixel

    shl         rdx, 5
    lea         rdx, [rdx + rcx]                ; VFilter

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line

    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0                      ; row 0 widened to words

    movsxd      rbx, dword ptr arg(3)           ; src_pixels_per_line
    lea         rsi, [rsi + rax]

.vert_only_loop:
    movq        xmm3, QWORD PTR [rsi]
    punpcklbw   xmm3, xmm0
    movdqa      xmm5, xmm3                      ; keep current row for next iteration

    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    movdqa      xmm1, xmm5
    lea         rsi, [rsi + rax]                ; ref_pixels_per_line
    lea         rdi, [rdi + rbx]                ; src_pixels_per_line

    sub         rcx, 1
    jnz         .vert_only_loop

    jmp         .reduce

.full_pixel:
    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line
    movsxd      rbx, dword ptr arg(3)           ; src_pixels_per_line
    pxor        xmm0, xmm0

.full_pixel_loop:
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0

    movq        xmm2, QWORD PTR [rdi]
    punpcklbw   xmm2, xmm0

    psubw       xmm1, xmm2
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]                ; ref_pixels_per_line
    lea         rdi, [rdi + rbx]                ; src_pixels_per_line

    sub         rcx, 1
    jnz         .full_pixel_loop

    jmp         .reduce

.horiz_only:
    ; yoffset == 0; rax already points at HFilter.
    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height
    movsxd      rdx, dword ptr arg(1)           ; ref_pixels_per_line

    pxor        xmm0, xmm0
    movsxd      rbx, dword ptr arg(3)           ; src_pixels_per_line

.horiz_only_loop:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rdx]                ; ref_pixels_per_line
    lea         rdi, [rdi + rbx]                ; src_pixels_per_line

    sub         rcx, 1
    jnz         .horiz_only_loop

.reduce:
    ; Sign-extend the eight 16-bit lane sums to 32 bits by placing each word
    ; in the upper half of a dword and shifting it back down arithmetically.
    pxor        xmm0, xmm0
    pxor        xmm1, xmm1
    punpcklwd   xmm0, xmm6                      ; lanes 0..3 << 16
    punpckhwd   xmm1, xmm6                      ; lanes 4..7 << 16
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1

    pshufd      xmm1, xmm0, 0x4e                ; swap 64-bit halves
    paddd       xmm0, xmm1
    pshufd      xmm1, xmm0, 0xb1                ; swap adjacent dwords
    paddd       xmm0, xmm1                      ; low dword = total sum

    pshufd      xmm1, xmm7, 0x4e
    paddd       xmm7, xmm1
    pshufd      xmm1, xmm7, 0xb1
    paddd       xmm7, xmm1                      ; low dword = total sum of squares

    mov         rsi, arg(7)                     ; sum
    mov         rdi, arg(8)                     ; sumsquared

    movd        [rsi], xmm0                     ; xsum
    movd        [rdi], xmm7                     ; xxsum

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
294 | |
295 | |
296 | |
;void vp9_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide block, Height rows.  Each predicted pixel is the pavgb
; (rounded byte average) of two horizontally averaged ref rows:
;     h(y)  = pavgb(ref[y][x], ref[y][x+1])
;     pred  = pavgb(h(y), h(y+1))
; The differences pred - src are totalled into *sum and their squares into
; *sumsquared.  Reads 17 bytes per ref row and Height + 1 ref rows.
;
;   xmm0 = zero             xmm5 = h() of the previous row
;   xmm6 = 8 x int16 sum    xmm7 = 4 x int32 sum of squares
global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6                      ; difference accumulator
    pxor        xmm7, xmm7                      ; squared-difference accumulator
    pxor        xmm0, xmm0

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; Height

    ; Prime xmm5 with the horizontal average of row 0.
    movdqu      xmm5, XMMWORD PTR [rsi]
    movdqu      xmm3, XMMWORD PTR [rsi+1]
    pavgb       xmm5, xmm3
    add         rsi, rax

.next_row:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    pavgb       xmm1, xmm2                      ; xmm1 = h(current row)
    pavgb       xmm5, xmm1                      ; xmm5 = pavgb(h(prev), h(current))

    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0                      ; predicted pixels 0..7 as words
    punpckhbw   xmm4, xmm0                      ; predicted pixels 8..15 as words

    movdqu      xmm2, XMMWORD PTR [rdi]         ; src pixels 0..15
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm0                      ; src 0..7 as words
    punpckhbw   xmm3, xmm0                      ; src 8..15 as words

    psubw       xmm5, xmm2
    psubw       xmm4, xmm3

    paddw       xmm6, xmm5
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm1                      ; current h() becomes "previous"

    add         rsi, rax
    add         rdi, rdx

    dec         rcx
    jnz         .next_row

    ; Widen the eight word sums to signed dwords (xmm0 is still zero), then
    ; fold both accumulators down to their low dword.
    pxor        xmm1, xmm1
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1

    pshufd      xmm1, xmm0, 0x4e                ; swap 64-bit halves
    paddd       xmm0, xmm1
    pshufd      xmm1, xmm0, 0xb1                ; swap adjacent dwords
    paddd       xmm0, xmm1

    pshufd      xmm1, xmm7, 0x4e
    paddd       xmm7, xmm1
    pshufd      xmm1, xmm7, 0xb1
    paddd       xmm7, xmm1

    mov         rsi, arg(5)                     ; [Sum]
    mov         rdi, arg(6)                     ; [SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
411 | |
;void vp9_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide block, Height rows.  Each predicted pixel is
;     pred = pavgb(ref[y][x], ref[y+1][x])
; (rounded byte average of vertically adjacent ref pixels).  The differences
; pred - src are totalled into *sum and their squares into *sumsquared.
; Reads Height + 1 ref rows.
;
;   xmm0 = zero             xmm5 = previous ref row
;   xmm6 = 8 x int16 sum    xmm7 = 4 x int32 sum of squares
global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm0, xmm0
    pxor        xmm6, xmm6                      ; difference accumulator
    pxor        xmm7, xmm7                      ; squared-difference accumulator

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; Height

    movdqu      xmm5, XMMWORD PTR [rsi]         ; row 0
    add         rsi, rax

.next_row:
    movdqu      xmm3, XMMWORD PTR [rsi]         ; current ref row

    pavgb       xmm5, xmm3                      ; xmm5 = pavgb(prev row, current row)
    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0                      ; predicted pixels 0..7 as words
    punpckhbw   xmm4, xmm0                      ; predicted pixels 8..15 as words

    movdqu      xmm1, XMMWORD PTR [rdi]         ; src pixels 0..15
    movdqa      xmm2, xmm1
    punpcklbw   xmm1, xmm0                      ; src 0..7 as words
    punpckhbw   xmm2, xmm0                      ; src 8..15 as words

    psubw       xmm5, xmm1
    psubw       xmm4, xmm2

    paddw       xmm6, xmm5
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm3                      ; current row becomes "previous"

    add         rsi, rax
    add         rdi, rdx

    dec         rcx
    jnz         .next_row

    ; Widen the eight word sums to signed dwords (xmm0 is still zero), then
    ; fold both accumulators down to their low dword.
    pxor        xmm1, xmm1
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1

    pshufd      xmm1, xmm0, 0x4e                ; swap 64-bit halves
    paddd       xmm0, xmm1
    pshufd      xmm1, xmm0, 0xb1                ; swap adjacent dwords
    paddd       xmm0, xmm1

    pshufd      xmm1, xmm7, 0x4e
    paddd       xmm7, xmm1
    pshufd      xmm1, xmm7, 0xb1
    paddd       xmm7, xmm1

    mov         rsi, arg(5)                     ; [Sum]
    mov         rdi, arg(6)                     ; [SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
518 | |
;void vp9_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide block, Height rows.  Each predicted pixel is
;     pred = pavgb(ref[y][x], ref[y][x+1])
; (rounded byte average of horizontally adjacent ref pixels).  The
; differences pred - src are totalled into *sum and their squares into
; *sumsquared.  Reads 17 bytes per ref row.
;
;   xmm0 = zero
;   xmm6 = 8 x int16 sum    xmm7 = 4 x int32 sum of squares
global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm0, xmm0
    pxor        xmm6, xmm6                      ; difference accumulator
    pxor        xmm7, xmm7                      ; squared-difference accumulator

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; Height

.next_row:
    movdqu      xmm5, XMMWORD PTR [rsi]         ; s0..s15
    movdqu      xmm3, XMMWORD PTR [rsi+1]       ; s1..s16

    pavgb       xmm5, xmm3                      ; xmm5 = pavgb(s[x], s[x+1])
    movdqa      xmm1, xmm5
    punpcklbw   xmm5, xmm0                      ; predicted pixels 0..7 as words
    punpckhbw   xmm1, xmm0                      ; predicted pixels 8..15 as words

    movdqu      xmm3, XMMWORD PTR [rdi]         ; d0..d15
    movdqa      xmm2, xmm3
    punpcklbw   xmm3, xmm0                      ; d0..d7 as words
    punpckhbw   xmm2, xmm0                      ; d8..d15 as words

    psubw       xmm5, xmm3
    psubw       xmm1, xmm2

    paddw       xmm6, xmm5
    paddw       xmm6, xmm1
    pmaddwd     xmm5, xmm5
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm5
    paddd       xmm7, xmm1

    add         rsi, rax
    add         rdi, rdx

    dec         rcx
    jnz         .next_row

    ; Widen the eight word sums to signed dwords (xmm0 is still zero), then
    ; fold both accumulators down to their low dword.
    pxor        xmm1, xmm1
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1

    pshufd      xmm1, xmm0, 0x4e                ; swap 64-bit halves
    paddd       xmm0, xmm1
    pshufd      xmm1, xmm0, 0xb1                ; swap adjacent dwords
    paddd       xmm0, xmm1

    pshufd      xmm1, xmm7, 0x4e
    paddd       xmm7, xmm1
    pshufd      xmm1, xmm7, 0xb1
    paddd       xmm7, xmm1

    mov         rsi, arg(5)                     ; [Sum]
    mov         rdi, arg(6)                     ; [SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
622 | |
SECTION_RODATA
; short xmm_bi_rd[8]: the rounding term (64) that is added to every lane
; before the arithmetic shift right by xmm_filter_shift.
align 16
xmm_bi_rd:
    dw 64, 64, 64, 64, 64, 64, 64, 64

; 16 filter entries of 32 bytes each.  Entry i holds eight copies of the
; first tap (128 - 8*i) followed by eight copies of the second tap (8*i);
; the code indexes it as base + (offset << 5).
align 16
bilinear_filters_sse2:
%assign second_tap 0
%rep 16
    times 8 dw (128 - second_tap)
    times 8 dw second_tap
%assign second_tap second_tap + 8
%endrep
OLD | NEW |