OLD | NEW |
| (Empty) |
1 ; | |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
3 ; | |
4 ; Use of this source code is governed by a BSD-style license | |
5 ; that can be found in the LICENSE file in the root of the source | |
6 ; tree. An additional intellectual property rights grant can be found | |
7 ; in the file PATENTS. All contributing project authors may | |
8 ; be found in the AUTHORS file in the root of the source tree. | |
9 ; | |
10 | |
11 %include "vpx_ports/x86_abi_support.asm" | |
12 | |
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;   Prologue for the five-argument routines in this file that work on a
;   single reference pointer:
;       (src_ptr, src_stride, ref_ptr, ref_stride, <fifth argument>)
;   It binds symbolic names to registers / stack slots for the active
;   ABI. The fifth argument is exposed under three aliases (result_ptr,
;   max_sad, height); each routine uses the one matching its prototype.
;
;   32-bit       : saves rbp/rsi/rdi/rbx, then loads the pointers and the
;                  sign-extended strides from the stack via arg(n).
;   Win64        : invokes SAVE_XMM 7, u; the fifth argument is read from
;                  the caller's stack, offset by xmm_stack_space.
;   other 64-bit : all five arguments arrive in registers.
;
;   On both 64-bit paths the two `int` strides are sign-extended to 64
;   bits, because they are used in 64-bit address arithmetic and the
;   calling conventions do not define the upper half of a register that
;   carries a 32-bit argument. For the same reason `height` is a 32-bit
;   operand on every path.
;
;   Must be paired with STACK_FRAME_DESTROY_X3.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
        push        rbp
        mov         rbp,        rsp
        push        rsi
        push        rdi
        push        rbx

        mov         rsi,        arg(0)              ; src_ptr
        mov         rdi,        arg(2)              ; ref_ptr

        movsxd      rax,        dword ptr arg(1)    ; src_stride
        movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
        SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

        movsxd      rdx,        edx                 ; src_stride: int -> 64-bit
        movsxd      r9,         r9d                 ; ref_stride: int -> 64-bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_sad     r8
    %define     height      r8d                     ; int: only the low 32 bits are defined

        movsxd      rsi,        esi                 ; src_stride: int -> 64-bit
        movsxd      rcx,        ecx                 ; ref_stride: int -> 64-bit
  %endif
%endif

%endmacro
61 | |
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;   Epilogue paired with STACK_FRAME_CREATE_X3. Redefines every symbolic
;   name as empty, undoes what the prologue did for the active ABI
;   (register pops on 32-bit, RESTORE_XMM on Win64, nothing on other
;   64-bit targets) and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  ; The aliases are blanked, not undefined.
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_sad
  %define     height

%if ABI_IS_32BIT
        ; Reverse order of the pushes in STACK_FRAME_CREATE_X3.
        pop         rbx
        pop         rdi
        pop         rsi
        pop         rbp
%elif LIBVPX_YASM_WIN64
        RESTORE_XMM
%endif
        ret
%endmacro
85 | |
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X4
;   Prologue for the "x4d" routines:
;       (src_ptr, src_stride, ref_ptr_base, ref_stride, results)
;   The third argument points at an array of four reference pointers,
;   which LOAD_X4_ADDRESSES fetches into r0_ptr..r3_ptr.
;
;   32-bit       : rbp is reused for ref_stride after the frame is set up.
;                  A second copy of the frame pointer is pushed first;
;                  the routine must `pop rbp` itself before it reads
;                  result_ptr (arg(4)) and before STACK_FRAME_DESTROY_X4.
;                  rdi serves as the base for the loads and receives the
;                  fourth pointer last; the third pointer is parked in
;                  rax and swapped into rbx once src_stride is loaded.
;   Win64        : invokes SAVE_XMM 7, u and saves rsi (used as r0_ptr);
;                  result_ptr is read from the caller's stack.
;   other 64-bit : everything lives in registers.
;
;   On both 64-bit paths the two `int` strides are sign-extended to 64
;   bits, because they are used in 64-bit address arithmetic and the
;   calling conventions do not define the upper half of a register that
;   carries a 32-bit argument.
;
;   Must be paired with STACK_FRAME_DESTROY_X4.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     r0_ptr        rcx
  %define     r1_ptr        rdx
  %define     r2_ptr        rbx
  %define     r3_ptr        rdi
  %define     ref_stride    rbp
  %define     result_ptr    arg(4)
        push        rbp
        mov         rbp,        rsp
        push        rsi
        push        rdi
        push        rbx

        push        rbp                             ; extra copy, popped by the routine
        mov         rdi,        arg(2)              ; ref_ptr_base

        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov         rsi,        arg(0)              ; src_ptr

        movsxd      rbx,        dword ptr arg(1)    ; src_stride
        movsxd      rbp,        dword ptr arg(3)    ; ref_stride

        xchg        rbx,        rax                 ; rax = src_stride, rbx = r2_ptr
%else
  %if LIBVPX_YASM_WIN64
        SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     r0_ptr      rsi
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      r8
    %define     ref_stride  r9
    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
        push        rsi

        LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr

        movsxd      rdx,        edx                 ; src_stride: int -> 64-bit
        movsxd      r9,         r9d                 ; ref_stride: int -> 64-bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     r0_ptr      r9
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      rdx
    %define     ref_stride  rcx
    %define     result_ptr  r8

        LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr

        movsxd      rsi,        esi                 ; src_stride: int -> 64-bit
        movsxd      rcx,        ecx                 ; ref_stride: int -> 64-bit
  %endif
%endif
%endmacro
142 | |
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X4
;   Epilogue paired with STACK_FRAME_CREATE_X4. Redefines every symbolic
;   name as empty, restores the registers saved by the prologue and
;   returns.
;   On 32-bit targets only four pops are issued here; the extra rbp that
;   the prologue pushed is not popped by this macro, so the routine must
;   already have popped it.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X4 0
  %define     src_ptr
  %define     src_stride
  %define     r0_ptr
  %define     r1_ptr
  %define     r2_ptr
  %define     r3_ptr
  %define     ref_stride
  %define     result_ptr

%if ABI_IS_32BIT
        pop         rbx
        pop         rdi
        pop         rsi
        pop         rbp
%elif LIBVPX_YASM_WIN64
        pop         rsi                             ; pushed after SAVE_XMM, so popped first
        RESTORE_XMM
%endif
        ret
%endmacro
166 | |
;-----------------------------------------------------------------------
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
;   Handles two 16-byte rows. Each source row is compared (psadbw)
;   against the reference row at byte offsets +0, +1 and +2, and the
;   results are accumulated in xmm5, xmm6 and xmm7 respectively.
;
;   %1  stage: 0      -> first row initialises xmm5..xmm7
;              other  -> first row is added to xmm5..xmm7
;              0 or 1 -> %2 and %3 are advanced by two rows
;              (any other value, e.g. 2, leaves the pointers alone)
;   %2  source pointer register (read with movdqa, so 16-byte aligned)
;   %3  reference pointer register (read with lddqu, any alignment)
;   %4  source stride register
;   %5  reference stride register
;
;   Scratch: xmm0..xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
        ; ---- first row ----
        movdqa      xmm0,       XMMWORD PTR [%2]
%if %1==0
        lddqu       xmm5,       XMMWORD PTR [%3]
        lddqu       xmm6,       XMMWORD PTR [%3+1]
        lddqu       xmm7,       XMMWORD PTR [%3+2]

        psadbw      xmm5,       xmm0
        psadbw      xmm6,       xmm0
        psadbw      xmm7,       xmm0
%else
        lddqu       xmm1,       XMMWORD PTR [%3]
        lddqu       xmm2,       XMMWORD PTR [%3+1]
        lddqu       xmm3,       XMMWORD PTR [%3+2]

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm5,       xmm1
        paddw       xmm6,       xmm2
        paddw       xmm7,       xmm3
%endif
        ; ---- second row ----
        movdqa      xmm0,       XMMWORD PTR [%2+%4]
        lddqu       xmm1,       XMMWORD PTR [%3+%5]
        lddqu       xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu       xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; Step both pointers past the two rows just loaded.
        lea         %2,         [%2+%4*2]
        lea         %3,         [%3+%5*2]
%endif

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm5,       xmm1
        paddw       xmm6,       xmm2
        paddw       xmm7,       xmm3
%endmacro
209 | |
;-----------------------------------------------------------------------
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
;   8-byte-wide MMX counterpart of PROCESS_16X2X3. Handles two rows;
;   each source row is compared (psadbw) against the reference row at
;   byte offsets +0, +1 and +2, accumulating into mm5, mm6 and mm7.
;
;   %1  stage: 0      -> first row initialises mm5..mm7
;              other  -> first row is added to mm5..mm7
;              0 or 1 -> %2 and %3 are advanced by two rows
;   %2  source pointer register
;   %3  reference pointer register
;   %4  source stride register
;   %5  reference stride register
;
;   Scratch: mm0..mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
        ; ---- first row ----
        movq        mm0,        QWORD PTR [%2]
%if %1==0
        movq        mm5,        QWORD PTR [%3]
        movq        mm6,        QWORD PTR [%3+1]
        movq        mm7,        QWORD PTR [%3+2]

        psadbw      mm5,        mm0
        psadbw      mm6,        mm0
        psadbw      mm7,        mm0
%else
        movq        mm1,        QWORD PTR [%3]
        movq        mm2,        QWORD PTR [%3+1]
        movq        mm3,        QWORD PTR [%3+2]

        psadbw      mm1,        mm0
        psadbw      mm2,        mm0
        psadbw      mm3,        mm0

        paddw       mm5,        mm1
        paddw       mm6,        mm2
        paddw       mm7,        mm3
%endif
        ; ---- second row ----
        movq        mm0,        QWORD PTR [%2+%4]
        movq        mm1,        QWORD PTR [%3+%5]
        movq        mm2,        QWORD PTR [%3+%5+1]
        movq        mm3,        QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; Step both pointers past the two rows just loaded.
        lea         %2,         [%2+%4*2]
        lea         %3,         [%3+%5*2]
%endif

        psadbw      mm1,        mm0
        psadbw      mm2,        mm0
        psadbw      mm3,        mm0

        paddw       mm5,        mm1
        paddw       mm6,        mm2
        paddw       mm7,        mm3
%endmacro
252 | |
;-----------------------------------------------------------------------
; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
;   Loads four consecutive register-sized words from the array at [%1]
;   into %2, %3, %4 and %5, in that order.
;   %1 may also be named as a destination, provided every load that
;   still needs it as the base comes first; the prologues in this file
;   pass it as %5, the last register written.
;-----------------------------------------------------------------------
%macro LOAD_X4_ADDRESSES 5
        mov         %2,         [%1+REG_SZ_BYTES*0]
        mov         %3,         [%1+REG_SZ_BYTES*1]

        mov         %4,         [%1+REG_SZ_BYTES*2]
        mov         %5,         [%1+REG_SZ_BYTES*3]
%endmacro
260 | |
;-----------------------------------------------------------------------
; PROCESS_16X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;   Handles two 16-byte rows. Each source row is compared (psadbw)
;   against the matching row of four independent reference blocks; the
;   results are accumulated in xmm4 (ref0), xmm5 (ref1), xmm6 (ref2)
;   and xmm7 (ref3).
;
;   %1      stage: 0      -> first row initialises xmm4..xmm7
;                  other  -> first row is added to xmm4..xmm7
;                  0 or 1 -> all five pointers advance by two rows
;   %2      source pointer register (movdqa: must be 16-byte aligned)
;   %3..%6  reference pointer registers (lddqu: any alignment)
;   %7      source stride register
;   %8      reference stride register (shared by all four references)
;
;   Scratch: xmm0..xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X4 8
        ; ---- first row ----
        movdqa      xmm0,       XMMWORD PTR [%2]
%if %1==0
        lddqu       xmm4,       XMMWORD PTR [%3]
        lddqu       xmm5,       XMMWORD PTR [%4]
        lddqu       xmm6,       XMMWORD PTR [%5]
        lddqu       xmm7,       XMMWORD PTR [%6]

        psadbw      xmm4,       xmm0
        psadbw      xmm5,       xmm0
        psadbw      xmm6,       xmm0
        psadbw      xmm7,       xmm0
%else
        lddqu       xmm1,       XMMWORD PTR [%3]
        lddqu       xmm2,       XMMWORD PTR [%4]
        lddqu       xmm3,       XMMWORD PTR [%5]

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm4,       xmm1
        lddqu       xmm1,       XMMWORD PTR [%6]    ; xmm1 is free again: reuse it for ref3
        paddw       xmm5,       xmm2
        paddw       xmm6,       xmm3

        psadbw      xmm1,       xmm0
        paddw       xmm7,       xmm1
%endif
        ; ---- second row ----
        movdqa      xmm0,       XMMWORD PTR [%2+%7]
        lddqu       xmm1,       XMMWORD PTR [%3+%8]
        lddqu       xmm2,       XMMWORD PTR [%4+%8]
        lddqu       xmm3,       XMMWORD PTR [%5+%8]

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm4,       xmm1
        lddqu       xmm1,       XMMWORD PTR [%6+%8] ; ref3 row, loaded before the pointers move
        paddw       xmm5,       xmm2
        paddw       xmm6,       xmm3

%if %1==0 || %1==1
        lea         %2,         [%2+%7*2]
        lea         %3,         [%3+%8*2]

        lea         %4,         [%4+%8*2]
        lea         %5,         [%5+%8*2]

        lea         %6,         [%6+%8*2]
%endif
        psadbw      xmm1,       xmm0
        paddw       xmm7,       xmm1

%endmacro
318 | |
;-----------------------------------------------------------------------
; PROCESS_8X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;   8-byte-wide MMX counterpart of PROCESS_16X2X4. Handles two rows;
;   each source row is compared (psadbw) against the matching row of
;   four reference blocks, accumulating into mm4 (ref0), mm5 (ref1),
;   mm6 (ref2) and mm7 (ref3).
;
;   %1      stage: 0      -> first row initialises mm4..mm7
;                  other  -> first row is added to mm4..mm7
;                  0 or 1 -> all five pointers advance by two rows
;   %2      source pointer register
;   %3..%6  reference pointer registers
;   %7      source stride register
;   %8      reference stride register (shared by all four references)
;
;   Scratch: mm0..mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X4 8
        ; ---- first row ----
        movq        mm0,        QWORD PTR [%2]
%if %1==0
        movq        mm4,        QWORD PTR [%3]
        movq        mm5,        QWORD PTR [%4]
        movq        mm6,        QWORD PTR [%5]
        movq        mm7,        QWORD PTR [%6]

        psadbw      mm4,        mm0
        psadbw      mm5,        mm0
        psadbw      mm6,        mm0
        psadbw      mm7,        mm0
%else
        movq        mm1,        QWORD PTR [%3]
        movq        mm2,        QWORD PTR [%4]
        movq        mm3,        QWORD PTR [%5]

        psadbw      mm1,        mm0
        psadbw      mm2,        mm0
        psadbw      mm3,        mm0

        paddw       mm4,        mm1
        movq        mm1,        QWORD PTR [%6]      ; mm1 is free again: reuse it for ref3
        paddw       mm5,        mm2
        paddw       mm6,        mm3

        psadbw      mm1,        mm0
        paddw       mm7,        mm1
%endif
        ; ---- second row ----
        movq        mm0,        QWORD PTR [%2+%7]
        movq        mm1,        QWORD PTR [%3+%8]
        movq        mm2,        QWORD PTR [%4+%8]
        movq        mm3,        QWORD PTR [%5+%8]

        psadbw      mm1,        mm0
        psadbw      mm2,        mm0
        psadbw      mm3,        mm0

        paddw       mm4,        mm1
        movq        mm1,        QWORD PTR [%6+%8]   ; ref3 row, loaded before the pointers move
        paddw       mm5,        mm2
        paddw       mm6,        mm3

%if %1==0 || %1==1
        lea         %2,         [%2+%7*2]
        lea         %3,         [%3+%8*2]

        lea         %4,         [%4+%8*2]
        lea         %5,         [%5+%8*2]

        lea         %6,         [%6+%8*2]
%endif
        psadbw      mm1,        mm0
        paddw       mm7,        mm1

%endmacro
376 | |
;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 16x16 source block against the
; reference block at byte offsets +0, +1 and +2. The three sums are
; written to results[0], results[1] and results[2].
; src rows are read with movdqa and must therefore be 16-byte aligned.
global sym(vp8_sad16x16x3_sse3) PRIVATE
sym(vp8_sad16x16x3_sse3):

        STACK_FRAME_CREATE_X3

        ; 8 stages x 2 rows = 16 rows; the last stage leaves the pointers alone.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx,        result_ptr

        ; Each accumulator holds one partial sum per 64-bit half:
        ; add the high half onto the low half and store 32 bits.

        ; results[0] <- offset +0
        movq        xmm0,       xmm5
        psrldq      xmm5,       8

        paddw       xmm0,       xmm5
        movd        [rcx],      xmm0

        ; results[1] <- offset +1
        movq        xmm0,       xmm6
        psrldq      xmm6,       8

        paddw       xmm0,       xmm6
        movd        [rcx+4],    xmm0

        ; results[2] <- offset +2
        movq        xmm0,       xmm7
        psrldq      xmm7,       8

        paddw       xmm0,       xmm7
        movd        [rcx+8],    xmm0

        STACK_FRAME_DESTROY_X3
418 | |
;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 16x8 source block against the
; reference block at byte offsets +0, +1 and +2. The three sums are
; written to results[0], results[1] and results[2].
; src rows are read with movdqa and must therefore be 16-byte aligned.
global sym(vp8_sad16x8x3_sse3) PRIVATE
sym(vp8_sad16x8x3_sse3):

        STACK_FRAME_CREATE_X3

        ; 4 stages x 2 rows = 8 rows; the last stage leaves the pointers alone.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx,        result_ptr

        ; Fold the high 64-bit half of each accumulator onto the low
        ; half and store the low 32 bits.

        ; results[0] <- offset +0
        movq        xmm0,       xmm5
        psrldq      xmm5,       8

        paddw       xmm0,       xmm5
        movd        [rcx],      xmm0

        ; results[1] <- offset +1
        movq        xmm0,       xmm6
        psrldq      xmm6,       8

        paddw       xmm0,       xmm6
        movd        [rcx+4],    xmm0

        ; results[2] <- offset +2
        movq        xmm0,       xmm7
        psrldq      xmm7,       8

        paddw       xmm0,       xmm7
        movd        [rcx+8],    xmm0

        STACK_FRAME_DESTROY_X3
456 | |
;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of an 8x16 source block against the
; reference block at byte offsets +0, +1 and +2, written to
; results[0..2]. Uses MMX registers.
; NOTE(review): no emms is issued here; presumably the caller is
; responsible for clearing MMX state.
global sym(vp8_sad8x16x3_sse3) PRIVATE
sym(vp8_sad8x16x3_sse3):

        STACK_FRAME_CREATE_X3

        ; 8 stages x 2 rows = 16 rows; the last stage leaves the pointers alone.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx,        result_ptr

        ; Pack the +0 and +1 sums into one qword: mm5 = { mm5.lo, mm6.lo }.
        punpckldq   mm5,        mm6

        movq        [rcx],      mm5                 ; results[0], results[1]
        movd        [rcx+8],    mm7                 ; results[2]

        STACK_FRAME_DESTROY_X3
485 | |
;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of an 8x8 source block against the
; reference block at byte offsets +0, +1 and +2, written to
; results[0..2]. Uses MMX registers.
; NOTE(review): no emms is issued here; presumably the caller is
; responsible for clearing MMX state.
global sym(vp8_sad8x8x3_sse3) PRIVATE
sym(vp8_sad8x8x3_sse3):

        STACK_FRAME_CREATE_X3

        ; 4 stages x 2 rows = 8 rows; the last stage leaves the pointers alone.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx,        result_ptr

        ; Pack the +0 and +1 sums into one qword: mm5 = { mm5.lo, mm6.lo }.
        punpckldq   mm5,        mm6

        movq        [rcx],      mm5                 ; results[0], results[1]
        movd        [rcx+8],    mm7                 ; results[2]

        STACK_FRAME_DESTROY_X3
510 | |
;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 4x4 source block against the
; reference block at byte offsets +0, +1 and +2, written to
; results[0..2].
; Two 4-byte rows are interleaved (punpcklbw) into one 8-byte MMX
; register so that a single psadbw covers both rows; source and
; reference are interleaved the same way, so bytes still pair up.
; NOTE(review): no emms is issued here; presumably the caller is
; responsible for clearing MMX state.
global sym(vp8_sad4x4x3_sse3) PRIVATE
sym(vp8_sad4x4x3_sse3):

        STACK_FRAME_CREATE_X3

        ; ---- rows 0 and 1 ----
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm1,        DWORD PTR [ref_ptr]

        movd        mm2,        DWORD PTR [src_ptr+src_stride]
        movd        mm3,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw   mm0,        mm2                 ; mm0 = src rows 0/1
        punpcklbw   mm1,        mm3                 ; mm1 = ref rows 0/1, offset +0

        movd        mm4,        DWORD PTR [ref_ptr+1]
        movd        mm5,        DWORD PTR [ref_ptr+2]

        movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd        mm3,        DWORD PTR [ref_ptr+ref_stride+2]

        psadbw      mm1,        mm0                 ; mm1 = SAD(+0), rows 0/1

        punpcklbw   mm4,        mm2                 ; ref rows 0/1, offset +1
        punpcklbw   mm5,        mm3                 ; ref rows 0/1, offset +2

        psadbw      mm4,        mm0                 ; mm4 = SAD(+1), rows 0/1
        psadbw      mm5,        mm0                 ; mm5 = SAD(+2), rows 0/1

        lea         src_ptr,    [src_ptr+src_stride*2]
        lea         ref_ptr,    [ref_ptr+ref_stride*2]

        ; ---- rows 2 and 3 ----
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm2,        DWORD PTR [ref_ptr]

        movd        mm3,        DWORD PTR [src_ptr+src_stride]
        movd        mm6,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw   mm0,        mm3                 ; mm0 = src rows 2/3
        punpcklbw   mm2,        mm6                 ; mm2 = ref rows 2/3, offset +0

        movd        mm3,        DWORD PTR [ref_ptr+1]
        movd        mm7,        DWORD PTR [ref_ptr+2]

        psadbw      mm2,        mm0

        paddw       mm1,        mm2                 ; mm1 = total SAD(+0)

        movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd        mm6,        DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw   mm3,        mm2                 ; ref rows 2/3, offset +1
        punpcklbw   mm7,        mm6                 ; ref rows 2/3, offset +2

        psadbw      mm3,        mm0
        psadbw      mm7,        mm0

        paddw       mm3,        mm4                 ; mm3 = total SAD(+1)
        paddw       mm7,        mm5                 ; mm7 = total SAD(+2)

        mov         rcx,        result_ptr

        punpckldq   mm1,        mm3                 ; mm1 = { SAD(+0), SAD(+1) }

        movq        [rcx],      mm1                 ; results[0], results[1]
        movd        [rcx+8],    mm7                 ; results[2]

        STACK_FRAME_DESTROY_X3
584 | |
;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Returns the sum of absolute differences between a 16x16 source block
; and a 16x16 reference block. max_sad is not read by this code: all
; 16 rows are always processed.
; src rows are read with movdqa (16-byte aligned); ref rows with movdqu.
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3) PRIVATE
sym(vp8_sad16x16_sse3):

        STACK_FRAME_CREATE_X3

        mov         end_ptr,    4                   ; loop counter: 4 passes x 4 rows
        pxor        xmm7,       xmm7                ; xmm7 = running SAD accumulator

.vp8_sad16x16_sse3_loop:
        ; rows n and n+1
        movdqa      xmm0,       XMMWORD PTR [src_ptr]
        movdqu      xmm1,       XMMWORD PTR [ref_ptr]
        movdqa      xmm2,       XMMWORD PTR [src_ptr+src_stride]
        movdqu      xmm3,       XMMWORD PTR [ref_ptr+ref_stride]

        lea         src_ptr,    [src_ptr+src_stride*2]
        lea         ref_ptr,    [ref_ptr+ref_stride*2]

        ; rows n+2 and n+3
        movdqa      xmm4,       XMMWORD PTR [src_ptr]
        movdqu      xmm5,       XMMWORD PTR [ref_ptr]
        movdqa      xmm6,       XMMWORD PTR [src_ptr+src_stride]

        psadbw      xmm0,       xmm1

        movdqu      xmm1,       XMMWORD PTR [ref_ptr+ref_stride] ; xmm1 is free after the psadbw above

        psadbw      xmm2,       xmm3
        psadbw      xmm4,       xmm5
        psadbw      xmm6,       xmm1

        lea         src_ptr,    [src_ptr+src_stride*2]
        lea         ref_ptr,    [ref_ptr+ref_stride*2]

        paddw       xmm7,       xmm0
        paddw       xmm7,       xmm2
        paddw       xmm7,       xmm4
        paddw       xmm7,       xmm6

        sub         end_ptr,    1
        jnz         .vp8_sad16x16_sse3_loop

        ; Fold the high 64-bit half onto the low half and return it in rax.
        movq        xmm0,       xmm7
        psrldq      xmm7,       8
        paddw       xmm0,       xmm7
        movq        rax,        xmm0

        STACK_FRAME_DESTROY_X3
638 | |
;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies `height` rows of 32 bytes from src_ptr to dst_ptr.
; The ref_ptr / ref_stride aliases set up by STACK_FRAME_CREATE_X3 hold
; dst_ptr / dst_stride here. Source rows are read with movdqu (any
; alignment); destination rows are written with movdqa, so every
; destination row must be 16-byte aligned.
;
; Rows are copied four at a time while at least four remain, then one
; at a time. A height below 4 goes straight to the single-row loop and
; a height <= 0 copies nothing. (Without that guard the unrolled loop
; would run once unconditionally and leave a negative row count for the
; tail loop.)
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

        STACK_FRAME_CREATE_X3

        ; Enter the unrolled loop only with at least four rows to copy.
        cmp         height,     4
        jl          .block_copy_sse3_remainder

.block_copy_sse3_loopx4:
        lea         end_ptr,    [src_ptr+src_stride*2]   ; end_ptr = source row 2

        movdqu      xmm0,       XMMWORD PTR [src_ptr]
        movdqu      xmm1,       XMMWORD PTR [src_ptr + 16]
        movdqu      xmm2,       XMMWORD PTR [src_ptr + src_stride]
        movdqu      xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
        movdqu      xmm4,       XMMWORD PTR [end_ptr]
        movdqu      xmm5,       XMMWORD PTR [end_ptr + 16]
        movdqu      xmm6,       XMMWORD PTR [end_ptr + src_stride]
        movdqu      xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

        lea         src_ptr,    [src_ptr+src_stride*4]

        lea         end_ptr,    [ref_ptr+ref_stride*2]   ; end_ptr = destination row 2

        movdqa      XMMWORD PTR [ref_ptr], xmm0
        movdqa      XMMWORD PTR [ref_ptr + 16], xmm1
        movdqa      XMMWORD PTR [ref_ptr + ref_stride], xmm2
        movdqa      XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
        movdqa      XMMWORD PTR [end_ptr], xmm4
        movdqa      XMMWORD PTR [end_ptr + 16], xmm5
        movdqa      XMMWORD PTR [end_ptr + ref_stride], xmm6
        movdqa      XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

        lea         ref_ptr,    [ref_ptr+ref_stride*4]

        sub         height,     4
        cmp         height,     4
        jge         .block_copy_sse3_loopx4

.block_copy_sse3_remainder:
        ; Check whether any rows are left to copy.
        cmp         height,     0
        jle         .copy_is_done

.block_copy_sse3_loop:
        movdqu      xmm0,       XMMWORD PTR [src_ptr]
        movdqu      xmm1,       XMMWORD PTR [src_ptr + 16]
        lea         src_ptr,    [src_ptr+src_stride]

        movdqa      XMMWORD PTR [ref_ptr], xmm0
        movdqa      XMMWORD PTR [ref_ptr + 16], xmm1
        lea         ref_ptr,    [ref_ptr+ref_stride]

        sub         height,     1
        jne         .block_copy_sse3_loop

.copy_is_done:
        STACK_FRAME_DESTROY_X3
699 | |
;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of one 16x16 source block against four
; reference blocks. ref_ptr_base points at an array of four reference
; pointers; the four sums are written to results[0..3] in array order.
; src rows are read with movdqa and must therefore be 16-byte aligned.
global sym(vp8_sad16x16x4d_sse3) PRIVATE
sym(vp8_sad16x16x4d_sse3):

        STACK_FRAME_CREATE_X4

        ; 8 stages x 2 rows = 16 rows; the last stage leaves the pointers alone.
        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp held ref_stride; pop the copy pushed by the prologue so
        ; that result_ptr (arg(4)) can be read.
        pop         rbp
%endif
        mov         rcx,        result_ptr

        ; Fold the high 64-bit half of each accumulator onto the low
        ; half and store the low 32 bits.

        ; results[0] <- reference 0
        movq        xmm0,       xmm4
        psrldq      xmm4,       8

        paddw       xmm0,       xmm4
        movd        [rcx],      xmm0

        ; results[1] <- reference 1
        movq        xmm0,       xmm5
        psrldq      xmm5,       8

        paddw       xmm0,       xmm5
        movd        [rcx+4],    xmm0

        ; results[2] <- reference 2
        movq        xmm0,       xmm6
        psrldq      xmm6,       8

        paddw       xmm0,       xmm6
        movd        [rcx+8],    xmm0

        ; results[3] <- reference 3
        movq        xmm0,       xmm7
        psrldq      xmm7,       8

        paddw       xmm0,       xmm7
        movd        [rcx+12],   xmm0

        STACK_FRAME_DESTROY_X4
750 | |
;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of one 16x8 source block against four
; reference blocks. ref_ptr_base points at an array of four reference
; pointers; the four sums are written to results[0..3] in array order.
; src rows are read with movdqa and must therefore be 16-byte aligned.
global sym(vp8_sad16x8x4d_sse3) PRIVATE
sym(vp8_sad16x8x4d_sse3):

        STACK_FRAME_CREATE_X4

        ; 4 stages x 2 rows = 8 rows; the last stage leaves the pointers alone.
        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp held ref_stride; pop the copy pushed by the prologue so
        ; that result_ptr (arg(4)) can be read.
        pop         rbp
%endif
        mov         rcx,        result_ptr

        ; Fold the high 64-bit half of each accumulator onto the low
        ; half and store the low 32 bits.

        ; results[0] <- reference 0
        movq        xmm0,       xmm4
        psrldq      xmm4,       8

        paddw       xmm0,       xmm4
        movd        [rcx],      xmm0

        ; results[1] <- reference 1
        movq        xmm0,       xmm5
        psrldq      xmm5,       8

        paddw       xmm0,       xmm5
        movd        [rcx+4],    xmm0

        ; results[2] <- reference 2
        movq        xmm0,       xmm6
        psrldq      xmm6,       8

        paddw       xmm0,       xmm6
        movd        [rcx+8],    xmm0

        ; results[3] <- reference 3
        movq        xmm0,       xmm7
        psrldq      xmm7,       8

        paddw       xmm0,       xmm7
        movd        [rcx+12],   xmm0

        STACK_FRAME_DESTROY_X4
797 | |
;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of one 8x16 source block against four
; reference blocks. The third argument is passed through
; STACK_FRAME_CREATE_X4, which reads it as an array of four reference
; pointers. The four sums are written to results[0..3].
; Uses MMX registers.
; NOTE(review): no emms is issued here; presumably the caller is
; responsible for clearing MMX state.
global sym(vp8_sad8x16x4d_sse3) PRIVATE
sym(vp8_sad8x16x4d_sse3):

        STACK_FRAME_CREATE_X4

        ; 8 stages x 2 rows = 16 rows; the last stage leaves the pointers alone.
        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp held ref_stride; pop the copy pushed by the prologue so
        ; that result_ptr (arg(4)) can be read.
        pop         rbp
%endif
        mov         rcx,        result_ptr

        ; Pack the four 32-bit sums into two qwords.
        punpckldq   mm4,        mm5                 ; mm4 = { ref0 sum, ref1 sum }
        punpckldq   mm6,        mm7                 ; mm6 = { ref2 sum, ref3 sum }

        movq        [rcx],      mm4                 ; results[0], results[1]
        movq        [rcx+8],    mm6                 ; results[2], results[3]

        STACK_FRAME_DESTROY_X4
830 | |
;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of one 8x8 source block against four
; reference blocks. The third argument is passed through
; STACK_FRAME_CREATE_X4, which reads it as an array of four reference
; pointers. The four sums are written to results[0..3].
; Uses MMX registers.
; NOTE(review): no emms is issued here; presumably the caller is
; responsible for clearing MMX state.
global sym(vp8_sad8x8x4d_sse3) PRIVATE
sym(vp8_sad8x8x4d_sse3):

        STACK_FRAME_CREATE_X4

        ; 4 stages x 2 rows = 8 rows; the last stage leaves the pointers alone.
        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp held ref_stride; pop the copy pushed by the prologue so
        ; that result_ptr (arg(4)) can be read.
        pop         rbp
%endif
        mov         rcx,        result_ptr

        ; Pack the four 32-bit sums into two qwords.
        punpckldq   mm4,        mm5                 ; mm4 = { ref0 sum, ref1 sum }
        punpckldq   mm6,        mm7                 ; mm6 = { ref2 sum, ref3 sum }

        movq        [rcx],      mm4                 ; results[0], results[1]
        movq        [rcx+8],    mm6                 ; results[2], results[3]

        STACK_FRAME_DESTROY_X4
859 | |
;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of one 4x4 source block against four
; reference blocks. The third argument is passed through
; STACK_FRAME_CREATE_X4, which reads it as an array of four reference
; pointers. The four sums are written to results[0..3].
; Two 4-byte rows are interleaved (punpcklbw) into one 8-byte MMX
; register so that a single psadbw covers both rows.
; NOTE(review): no emms is issued here; presumably the caller is
; responsible for clearing MMX state.
global sym(vp8_sad4x4x4d_sse3) PRIVATE
sym(vp8_sad4x4x4d_sse3):

        STACK_FRAME_CREATE_X4

        ; ---- rows 0 and 1 ----
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm1,        DWORD PTR [r0_ptr]

        movd        mm2,        DWORD PTR [src_ptr+src_stride]
        movd        mm3,        DWORD PTR [r0_ptr+ref_stride]

        punpcklbw   mm0,        mm2                 ; mm0 = src rows 0/1
        punpcklbw   mm1,        mm3                 ; mm1 = ref0 rows 0/1

        movd        mm4,        DWORD PTR [r1_ptr]
        movd        mm5,        DWORD PTR [r2_ptr]

        movd        mm6,        DWORD PTR [r3_ptr]
        movd        mm2,        DWORD PTR [r1_ptr+ref_stride]

        movd        mm3,        DWORD PTR [r2_ptr+ref_stride]
        movd        mm7,        DWORD PTR [r3_ptr+ref_stride]

        psadbw      mm1,        mm0                 ; mm1 = ref0 SAD, rows 0/1

        punpcklbw   mm4,        mm2                 ; ref1 rows 0/1
        punpcklbw   mm5,        mm3                 ; ref2 rows 0/1

        punpcklbw   mm6,        mm7                 ; ref3 rows 0/1
        psadbw      mm4,        mm0                 ; mm4 = ref1 SAD, rows 0/1

        psadbw      mm5,        mm0                 ; mm5 = ref2 SAD, rows 0/1
        psadbw      mm6,        mm0                 ; mm6 = ref3 SAD, rows 0/1



        lea         src_ptr,    [src_ptr+src_stride*2]
        lea         r0_ptr,     [r0_ptr+ref_stride*2]

        lea         r1_ptr,     [r1_ptr+ref_stride*2]
        lea         r2_ptr,     [r2_ptr+ref_stride*2]

        lea         r3_ptr,     [r3_ptr+ref_stride*2]

        ; ---- rows 2 and 3 ----
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm2,        DWORD PTR [r0_ptr]

        movd        mm3,        DWORD PTR [src_ptr+src_stride]
        movd        mm7,        DWORD PTR [r0_ptr+ref_stride]

        punpcklbw   mm0,        mm3                 ; mm0 = src rows 2/3
        punpcklbw   mm2,        mm7                 ; mm2 = ref0 rows 2/3

        movd        mm3,        DWORD PTR [r1_ptr]
        movd        mm7,        DWORD PTR [r2_ptr]

        psadbw      mm2,        mm0
%if ABI_IS_32BIT
        ; src_stride (rax) is no longer needed: move ref_stride out of
        ; rbp into rax, then pop the copy of rbp pushed by the prologue
        ; so that result_ptr (arg(4)) can be read.
        mov         rax,        rbp

        pop         rbp
  %define     ref_stride    rax
%endif
        ; rsi is overwritten here; whichever alias it carried (src_ptr,
        ; src_stride or r0_ptr, depending on the ABI) is not used again.
        mov         rsi,        result_ptr

        paddw       mm1,        mm2                 ; mm1 = total ref0 SAD
        movd        [rsi],      mm1                 ; results[0]

        movd        mm2,        DWORD PTR [r1_ptr+ref_stride]
        movd        mm1,        DWORD PTR [r2_ptr+ref_stride]

        punpcklbw   mm3,        mm2                 ; ref1 rows 2/3
        punpcklbw   mm7,        mm1                 ; ref2 rows 2/3

        psadbw      mm3,        mm0
        psadbw      mm7,        mm0

        movd        mm2,        DWORD PTR [r3_ptr]
        movd        mm1,        DWORD PTR [r3_ptr+ref_stride]

        paddw       mm3,        mm4                 ; mm3 = total ref1 SAD
        paddw       mm7,        mm5                 ; mm7 = total ref2 SAD

        movd        [rsi+4],    mm3                 ; results[1]
        punpcklbw   mm2,        mm1                 ; ref3 rows 2/3

        movd        [rsi+8],    mm7                 ; results[2]
        psadbw      mm2,        mm0

        paddw       mm2,        mm6                 ; mm2 = total ref3 SAD
        movd        [rsi+12],   mm2                 ; results[3]


        STACK_FRAME_DESTROY_X4
960 | |
OLD | NEW |