Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(328)

Side by Side Diff: source/libvpx/vp8/common/x86/sad_sse3.asm

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/libvpx/vp8/common/x86/sad_sse2.asm ('k') | source/libvpx/vp8/common/x86/sad_sse4.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 %include "vpx_ports/x86_abi_support.asm"
12
; STACK_FRAME_CREATE_X3
; Prologue shared by the five-argument entry points that take a single
; reference pointer. It binds the symbolic names src_ptr, src_stride,
; ref_ptr, ref_stride, end_ptr and ret_var to registers, and binds
; result_ptr / max_sad / height (three aliases for the same fifth-argument
; location) for whichever of the three ABI cases below is selected.
; ret_var is defined here but is not referenced by the code in this listing.
; NOTE(review): only the 32-bit branch sign-extends the two int strides
; (movsxd); the 64-bit branches use the incoming registers unchanged.
; Confirm that callers always supply strides with valid upper halves.
13 %macro STACK_FRAME_CREATE_X3 0
14 %if ABI_IS_32BIT
; 32-bit case: build an rbp frame, save rsi/rdi/rbx, then load the
; arguments through arg(n) (provided by the included ABI support file).
15 %define src_ptr rsi
16 %define src_stride rax
17 %define ref_ptr rdi
18 %define ref_stride rdx
19 %define end_ptr rcx
20 %define ret_var rbx
21 %define result_ptr arg(4)
22 %define max_sad arg(4)
23 %define height dword ptr arg(4)
24 push rbp
25 mov rbp, rsp
26 push rsi
27 push rdi
28 push rbx
29
30 mov rsi, arg(0) ; src_ptr
31 mov rdi, arg(2) ; ref_ptr
32
33 movsxd rax, dword ptr arg(1) ; src_stride
34 movsxd rdx, dword ptr arg(3) ; ref_stride
35 %else
36 %if LIBVPX_YASM_WIN64
; Win64 case: the first four arguments are used in rcx, rdx, r8, r9; the
; fifth is read from the stack at an offset that includes xmm_stack_space.
; SAVE_XMM comes from the included ABI support file (presumably it spills
; xmm6-xmm7 -- confirm there).
37 SAVE_XMM 7, u
38 %define src_ptr rcx
39 %define src_stride rdx
40 %define ref_ptr r8
41 %define ref_stride r9
42 %define end_ptr r10
43 %define ret_var r11
44 %define result_ptr [rsp+xmm_stack_space+8+4*8]
45 %define max_sad [rsp+xmm_stack_space+8+4*8]
46 %define height dword ptr [rsp+xmm_stack_space+8+4*8]
47 %else
; Remaining 64-bit case: all five arguments are used directly in
; rdi, rsi, rdx, rcx, r8; no instructions are emitted.
48 %define src_ptr rdi
49 %define src_stride rsi
50 %define ref_ptr rdx
51 %define ref_stride rcx
52 %define end_ptr r9
53 %define ret_var r10
54 %define result_ptr r8
55 %define max_sad r8
56 %define height r8
57 %endif
58 %endif
59
60 %endmacro
61
; STACK_FRAME_DESTROY_X3
; Epilogue matching STACK_FRAME_CREATE_X3: redefines every symbolic name
; as empty, undoes the ABI-specific prologue, and returns.
62 %macro STACK_FRAME_DESTROY_X3 0
; Blank out the names so they cannot leak into later code.
63 %define src_ptr
64 %define src_stride
65 %define ref_ptr
66 %define ref_stride
67 %define end_ptr
68 %define ret_var
69 %define result_ptr
70 %define max_sad
71 %define height
72
73 %if ABI_IS_32BIT
; Pops mirror the pushes in STACK_FRAME_CREATE_X3, in reverse order.
74 pop rbx
75 pop rdi
76 pop rsi
77 pop rbp
78 %else
79 %if LIBVPX_YASM_WIN64
; Counterpart of the SAVE_XMM issued by the Win64 prologue.
80 RESTORE_XMM
81 %endif
82 %endif
83 ret
84 %endmacro
85
; STACK_FRAME_CREATE_X4
; Prologue for the "x4d" entry points. The third argument is treated as the
; address of a table of four pointers; LOAD_X4_ADDRESSES reads them into
; r0_ptr..r3_ptr. Also binds src_ptr, src_stride, ref_stride and result_ptr
; for the selected ABI.
86 %macro STACK_FRAME_CREATE_X4 0
87 %if ABI_IS_32BIT
; 32-bit case: rbp itself is bound to ref_stride, so the frame pointer is
; pushed a second time below; the function bodies pop it back before they
; read result_ptr (arg(4)).
88 %define src_ptr rsi
89 %define src_stride rax
90 %define r0_ptr rcx
91 %define r1_ptr rdx
92 %define r2_ptr rbx
93 %define r3_ptr rdi
94 %define ref_stride rbp
95 %define result_ptr arg(4)
96 push rbp
97 mov rbp, rsp
98 push rsi
99 push rdi
100 push rbx
101
; Second copy of the frame pointer, restored by the caller of this macro.
102 push rbp
103 mov rdi, arg(2) ; ref_ptr_base
104
; Loads rcx, rdx, rax, rdi from the table; the third address lands in rax
; temporarily and rdi (the table base) is overwritten last.
105 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
106
107 mov rsi, arg(0) ; src_ptr
108
109 movsxd rbx, dword ptr arg(1) ; src_stride
110 movsxd rbp, dword ptr arg(3) ; ref_stride
111
; Swap so that rax = src_stride and rbx = third reference address,
; matching the %defines above.
112 xchg rbx, rax
113 %else
114 %if LIBVPX_YASM_WIN64
; Win64 case: rsi is used for r0_ptr and is pushed here; the epilogue pops
; it. The result_ptr offset is 8 bytes larger than in the X3 prologue,
; which lines up with that extra push.
115 SAVE_XMM 7, u
116 %define src_ptr rcx
117 %define src_stride rdx
118 %define r0_ptr rsi
119 %define r1_ptr r10
120 %define r2_ptr r11
121 %define r3_ptr r8
122 %define ref_stride r9
123 %define result_ptr [rsp+xmm_stack_space+16+4*8]
124 push rsi
125
; r8 is both the table base and r3_ptr; it is the last register written.
126 LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
127 %else
; Remaining 64-bit case: rdx is both the table base and r3_ptr; it is the
; last register written by LOAD_X4_ADDRESSES.
128 %define src_ptr rdi
129 %define src_stride rsi
130 %define r0_ptr r9
131 %define r1_ptr r10
132 %define r2_ptr r11
133 %define r3_ptr rdx
134 %define ref_stride rcx
135 %define result_ptr r8
136
137 LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
138
139 %endif
140 %endif
141 %endmacro
142
; STACK_FRAME_DESTROY_X4
; Epilogue matching STACK_FRAME_CREATE_X4: redefines every symbolic name
; as empty, undoes the ABI-specific prologue, and returns.
; In the 32-bit case the extra rbp pushed by the prologue is NOT popped
; here; each function body pops it before invoking this macro.
143 %macro STACK_FRAME_DESTROY_X4 0
144 %define src_ptr
145 %define src_stride
146 %define r0_ptr
147 %define r1_ptr
148 %define r2_ptr
149 %define r3_ptr
150 %define ref_stride
151 %define result_ptr
152
153 %if ABI_IS_32BIT
154 pop rbx
155 pop rdi
156 pop rsi
157 pop rbp
158 %else
159 %if LIBVPX_YASM_WIN64
; Restores the rsi pushed by the Win64 prologue.
160 pop rsi
161 RESTORE_XMM
162 %endif
163 %endif
164 ret
165 %endmacro
166
; PROCESS_16X2X3 mode, src, ref, src_stride, ref_stride
; Handles two 16-byte rows. For each row the source row is compared
; (psadbw) against the reference row at byte offsets +0, +1 and +2, and the
; results are accumulated in xmm5, xmm6 and xmm7 respectively.
;   %1  0: first call, xmm5-xmm7 are initialised rather than added to
;       1: accumulate and advance both pointers by two rows
;       other (2 is used by the callers): accumulate, pointers unchanged
;   %2  source pointer, read with movdqa (16-byte alignment required)
;   %3  reference pointer, read with lddqu (unaligned loads)
;   %4  source stride    %5  reference stride
; Uses xmm0-xmm3 as temporaries.
167 %macro PROCESS_16X2X3 5
168 %if %1==0
; First row, first call: load straight into the accumulators.
169 movdqa xmm0, XMMWORD PTR [%2]
170 lddqu xmm5, XMMWORD PTR [%3]
171 lddqu xmm6, XMMWORD PTR [%3+1]
172 lddqu xmm7, XMMWORD PTR [%3+2]
173
174 psadbw xmm5, xmm0
175 psadbw xmm6, xmm0
176 psadbw xmm7, xmm0
177 %else
; First row, later calls: compute into temporaries, then accumulate.
178 movdqa xmm0, XMMWORD PTR [%2]
179 lddqu xmm1, XMMWORD PTR [%3]
180 lddqu xmm2, XMMWORD PTR [%3+1]
181 lddqu xmm3, XMMWORD PTR [%3+2]
182
183 psadbw xmm1, xmm0
184 psadbw xmm2, xmm0
185 psadbw xmm3, xmm0
186
187 paddw xmm5, xmm1
188 paddw xmm6, xmm2
189 paddw xmm7, xmm3
190 %endif
; Second row: one stride further on.
191 movdqa xmm0, XMMWORD PTR [%2+%4]
192 lddqu xmm1, XMMWORD PTR [%3+%5]
193 lddqu xmm2, XMMWORD PTR [%3+%5+1]
194 lddqu xmm3, XMMWORD PTR [%3+%5+2]
195
; The pointer advance is issued between the loads and the arithmetic.
196 %if %1==0 || %1==1
197 lea %2, [%2+%4*2]
198 lea %3, [%3+%5*2]
199 %endif
200
201 psadbw xmm1, xmm0
202 psadbw xmm2, xmm0
203 psadbw xmm3, xmm0
204
205 paddw xmm5, xmm1
206 paddw xmm6, xmm2
207 paddw xmm7, xmm3
208 %endmacro
209
; PROCESS_8X2X3 mode, src, ref, src_stride, ref_stride
; 8-byte-wide counterpart of PROCESS_16X2X3 using MMX registers. Handles
; two 8-byte rows; the source row is compared (psadbw) against the
; reference row at byte offsets +0, +1 and +2, accumulating in mm5, mm6
; and mm7 respectively.
;   %1  0: first call, mm5-mm7 are initialised rather than added to
;       1: accumulate and advance both pointers by two rows
;       other (2 is used by the callers): accumulate, pointers unchanged
;   %2  source pointer   %3  reference pointer
;   %4  source stride    %5  reference stride
; Uses mm0-mm3 as temporaries.
210 %macro PROCESS_8X2X3 5
211 %if %1==0
; First row, first call: load straight into the accumulators.
212 movq mm0, QWORD PTR [%2]
213 movq mm5, QWORD PTR [%3]
214 movq mm6, QWORD PTR [%3+1]
215 movq mm7, QWORD PTR [%3+2]
216
217 psadbw mm5, mm0
218 psadbw mm6, mm0
219 psadbw mm7, mm0
220 %else
; First row, later calls: compute into temporaries, then accumulate.
221 movq mm0, QWORD PTR [%2]
222 movq mm1, QWORD PTR [%3]
223 movq mm2, QWORD PTR [%3+1]
224 movq mm3, QWORD PTR [%3+2]
225
226 psadbw mm1, mm0
227 psadbw mm2, mm0
228 psadbw mm3, mm0
229
230 paddw mm5, mm1
231 paddw mm6, mm2
232 paddw mm7, mm3
233 %endif
; Second row: one stride further on.
234 movq mm0, QWORD PTR [%2+%4]
235 movq mm1, QWORD PTR [%3+%5]
236 movq mm2, QWORD PTR [%3+%5+1]
237 movq mm3, QWORD PTR [%3+%5+2]
238
; The pointer advance is issued between the loads and the arithmetic.
239 %if %1==0 || %1==1
240 lea %2, [%2+%4*2]
241 lea %3, [%3+%5*2]
242 %endif
243
244 psadbw mm1, mm0
245 psadbw mm2, mm0
246 psadbw mm3, mm0
247
248 paddw mm5, mm1
249 paddw mm6, mm2
250 paddw mm7, mm3
251 %endmacro
252
; LOAD_X4_ADDRESSES base, out0, out1, out2, out3
; Loads four consecutive REG_SZ_BYTES-sized values starting at [%1] into
; %2, %3, %4 and %5, in that order. Because %5 is written last, it may be
; the same register as %1 (every call site in this file relies on that).
253 %macro LOAD_X4_ADDRESSES 5
254 mov %2, [%1+REG_SZ_BYTES*0]
255 mov %3, [%1+REG_SZ_BYTES*1]
256
257 mov %4, [%1+REG_SZ_BYTES*2]
258 mov %5, [%1+REG_SZ_BYTES*3]
259 %endmacro
260
; PROCESS_16X2X4 mode, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
; Handles two 16-byte rows. For each row the source row is compared
; (psadbw) against the same row of four independent reference pointers,
; accumulating in xmm4 (ref0), xmm5 (ref1), xmm6 (ref2) and xmm7 (ref3).
;   %1  0: first call, xmm4-xmm7 are initialised rather than added to
;       1: accumulate and advance all five pointers by two rows
;       other (2 is used by the callers): accumulate, pointers unchanged
;   %2  source pointer, read with movdqa (16-byte alignment required)
;   %3-%6  reference pointers, read with lddqu (unaligned loads)
;   %7  source stride    %8  reference stride (shared by all four refs)
; Uses xmm0-xmm3 as temporaries; the fourth reference reuses xmm1 once the
; first reference's result has been accumulated.
261 %macro PROCESS_16X2X4 8
262 %if %1==0
; First row, first call: load straight into the accumulators.
263 movdqa xmm0, XMMWORD PTR [%2]
264 lddqu xmm4, XMMWORD PTR [%3]
265 lddqu xmm5, XMMWORD PTR [%4]
266 lddqu xmm6, XMMWORD PTR [%5]
267 lddqu xmm7, XMMWORD PTR [%6]
268
269 psadbw xmm4, xmm0
270 psadbw xmm5, xmm0
271 psadbw xmm6, xmm0
272 psadbw xmm7, xmm0
273 %else
; First row, later calls: compute into temporaries, then accumulate.
274 movdqa xmm0, XMMWORD PTR [%2]
275 lddqu xmm1, XMMWORD PTR [%3]
276 lddqu xmm2, XMMWORD PTR [%4]
277 lddqu xmm3, XMMWORD PTR [%5]
278
279 psadbw xmm1, xmm0
280 psadbw xmm2, xmm0
281 psadbw xmm3, xmm0
282
283 paddw xmm4, xmm1
; xmm1 is free again: reuse it for the fourth reference.
284 lddqu xmm1, XMMWORD PTR [%6]
285 paddw xmm5, xmm2
286 paddw xmm6, xmm3
287
288 psadbw xmm1, xmm0
289 paddw xmm7, xmm1
290 %endif
; Second row: one stride further on.
291 movdqa xmm0, XMMWORD PTR [%2+%7]
292 lddqu xmm1, XMMWORD PTR [%3+%8]
293 lddqu xmm2, XMMWORD PTR [%4+%8]
294 lddqu xmm3, XMMWORD PTR [%5+%8]
295
296 psadbw xmm1, xmm0
297 psadbw xmm2, xmm0
298 psadbw xmm3, xmm0
299
300 paddw xmm4, xmm1
301 lddqu xmm1, XMMWORD PTR [%6+%8]
302 paddw xmm5, xmm2
303 paddw xmm6, xmm3
304
; All second-row loads are done, so the pointers can be advanced here.
305 %if %1==0 || %1==1
306 lea %2, [%2+%7*2]
307 lea %3, [%3+%8*2]
308
309 lea %4, [%4+%8*2]
310 lea %5, [%5+%8*2]
311
312 lea %6, [%6+%8*2]
313 %endif
314 psadbw xmm1, xmm0
315 paddw xmm7, xmm1
316
317 %endmacro
318
; PROCESS_8X2X4 mode, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
; 8-byte-wide counterpart of PROCESS_16X2X4 using MMX registers. Handles
; two 8-byte rows; the source row is compared (psadbw) against the same row
; of four independent reference pointers, accumulating in mm4 (ref0),
; mm5 (ref1), mm6 (ref2) and mm7 (ref3).
;   %1  0: first call, mm4-mm7 are initialised rather than added to
;       1: accumulate and advance all five pointers by two rows
;       other (2 is used by the callers): accumulate, pointers unchanged
;   %2  source pointer   %3-%6  reference pointers
;   %7  source stride    %8  reference stride (shared by all four refs)
; Uses mm0-mm3 as temporaries; the fourth reference reuses mm1.
319 %macro PROCESS_8X2X4 8
320 %if %1==0
; First row, first call: load straight into the accumulators.
321 movq mm0, QWORD PTR [%2]
322 movq mm4, QWORD PTR [%3]
323 movq mm5, QWORD PTR [%4]
324 movq mm6, QWORD PTR [%5]
325 movq mm7, QWORD PTR [%6]
326
327 psadbw mm4, mm0
328 psadbw mm5, mm0
329 psadbw mm6, mm0
330 psadbw mm7, mm0
331 %else
; First row, later calls: compute into temporaries, then accumulate.
332 movq mm0, QWORD PTR [%2]
333 movq mm1, QWORD PTR [%3]
334 movq mm2, QWORD PTR [%4]
335 movq mm3, QWORD PTR [%5]
336
337 psadbw mm1, mm0
338 psadbw mm2, mm0
339 psadbw mm3, mm0
340
341 paddw mm4, mm1
; mm1 is free again: reuse it for the fourth reference.
342 movq mm1, QWORD PTR [%6]
343 paddw mm5, mm2
344 paddw mm6, mm3
345
346 psadbw mm1, mm0
347 paddw mm7, mm1
348 %endif
; Second row: one stride further on.
349 movq mm0, QWORD PTR [%2+%7]
350 movq mm1, QWORD PTR [%3+%8]
351 movq mm2, QWORD PTR [%4+%8]
352 movq mm3, QWORD PTR [%5+%8]
353
354 psadbw mm1, mm0
355 psadbw mm2, mm0
356 psadbw mm3, mm0
357
358 paddw mm4, mm1
359 movq mm1, QWORD PTR [%6+%8]
360 paddw mm5, mm2
361 paddw mm6, mm3
362
; All second-row loads are done, so the pointers can be advanced here.
363 %if %1==0 || %1==1
364 lea %2, [%2+%7*2]
365 lea %3, [%3+%8*2]
366
367 lea %4, [%4+%8*2]
368 lea %5, [%5+%8*2]
369
370 lea %6, [%6+%8*2]
371 %endif
372 psadbw mm1, mm0
373 paddw mm7, mm1
374
375 %endmacro
376
377 ;void vp8_sad16x16x3_sse3(
378 ; unsigned char *src_ptr,
379 ; int src_stride,
380 ; unsigned char *ref_ptr,
381 ; int ref_stride,
382 ; int *results)
; Computes three sums of absolute differences between a 16-wide, 16-row
; source block and the reference block at byte offsets +0, +1 and +2, and
; stores them as 32-bit values at results[0], results[1] and results[2].
; src_ptr rows are read with movdqa, so they must be 16-byte aligned.
383 global sym(vp8_sad16x16x3_sse3) PRIVATE
384 sym(vp8_sad16x16x3_sse3):
385
386 STACK_FRAME_CREATE_X3
387
; Eight invocations, two rows each: 16 rows in total.
388 PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
389 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
390 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
391 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
392 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
393 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
394 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
395 PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
396
397 mov rcx, result_ptr
398
; Each accumulator holds one partial sum per 64-bit half; add the high
; half to the low half and store the low 32 bits.
399 movq xmm0, xmm5
400 psrldq xmm5, 8
401
402 paddw xmm0, xmm5
403 movd [rcx], xmm0
404 ;-
405 movq xmm0, xmm6
406 psrldq xmm6, 8
407
408 paddw xmm0, xmm6
409 movd [rcx+4], xmm0
410 ;-
411 movq xmm0, xmm7
412 psrldq xmm7, 8
413
414 paddw xmm0, xmm7
415 movd [rcx+8], xmm0
416
417 STACK_FRAME_DESTROY_X3
418
419 ;void vp8_sad16x8x3_sse3(
420 ; unsigned char *src_ptr,
421 ; int src_stride,
422 ; unsigned char *ref_ptr,
423 ; int ref_stride,
424 ; int *results)
; Computes three sums of absolute differences between a 16-wide, 8-row
; source block and the reference block at byte offsets +0, +1 and +2, and
; stores them as 32-bit values at results[0], results[1] and results[2].
; src_ptr rows are read with movdqa, so they must be 16-byte aligned.
425 global sym(vp8_sad16x8x3_sse3) PRIVATE
426 sym(vp8_sad16x8x3_sse3):
427
428 STACK_FRAME_CREATE_X3
429
; Four invocations, two rows each: 8 rows in total.
430 PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
431 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
432 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
433 PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
434
435 mov rcx, result_ptr
436
; Each accumulator holds one partial sum per 64-bit half; add the high
; half to the low half and store the low 32 bits.
437 movq xmm0, xmm5
438 psrldq xmm5, 8
439
440 paddw xmm0, xmm5
441 movd [rcx], xmm0
442 ;-
443 movq xmm0, xmm6
444 psrldq xmm6, 8
445
446 paddw xmm0, xmm6
447 movd [rcx+4], xmm0
448 ;-
449 movq xmm0, xmm7
450 psrldq xmm7, 8
451
452 paddw xmm0, xmm7
453 movd [rcx+8], xmm0
454
455 STACK_FRAME_DESTROY_X3
456
457 ;void vp8_sad8x16x3_sse3(
458 ; unsigned char *src_ptr,
459 ; int src_stride,
460 ; unsigned char *ref_ptr,
461 ; int ref_stride,
462 ; int *results)
; Computes three sums of absolute differences between an 8-wide, 16-row
; source block and the reference block at byte offsets +0, +1 and +2, and
; stores them as 32-bit values at results[0], results[1] and results[2].
; Uses MMX registers mm0-mm7; no emms is issued in this routine.
463 global sym(vp8_sad8x16x3_sse3) PRIVATE
464 sym(vp8_sad8x16x3_sse3):
465
466 STACK_FRAME_CREATE_X3
467
; Eight invocations, two rows each: 16 rows in total.
468 PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
469 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
470 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
471 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
472 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
473 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
474 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
475 PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
476
477 mov rcx, result_ptr
478
; Pack the low dwords of mm5 and mm6 so results[0..1] go out in one store.
479 punpckldq mm5, mm6
480
481 movq [rcx], mm5
482 movd [rcx+8], mm7
483
484 STACK_FRAME_DESTROY_X3
485
486 ;void vp8_sad8x8x3_sse3(
487 ; unsigned char *src_ptr,
488 ; int src_stride,
489 ; unsigned char *ref_ptr,
490 ; int ref_stride,
491 ; int *results)
; Computes three sums of absolute differences between an 8-wide, 8-row
; source block and the reference block at byte offsets +0, +1 and +2, and
; stores them as 32-bit values at results[0], results[1] and results[2].
; Uses MMX registers mm0-mm7; no emms is issued in this routine.
492 global sym(vp8_sad8x8x3_sse3) PRIVATE
493 sym(vp8_sad8x8x3_sse3):
494
495 STACK_FRAME_CREATE_X3
496
; Four invocations, two rows each: 8 rows in total.
497 PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
498 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
499 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
500 PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
501
502 mov rcx, result_ptr
503
; Pack the low dwords of mm5 and mm6 so results[0..1] go out in one store.
504 punpckldq mm5, mm6
505
506 movq [rcx], mm5
507 movd [rcx+8], mm7
508
509 STACK_FRAME_DESTROY_X3
510
511 ;void vp8_sad4x4x3_sse3(
512 ; unsigned char *src_ptr,
513 ; int src_stride,
514 ; unsigned char *ref_ptr,
515 ; int ref_stride,
516 ; int *results)
; Computes three sums of absolute differences between a 4-wide, 4-row
; source block and the reference block at byte offsets +0, +1 and +2, and
; stores them as 32-bit values at results[0], results[1] and results[2].
; Rows are handled in pairs: two 4-byte rows are interleaved (punpcklbw)
; into one 8-byte MMX register so a single psadbw covers both rows.
; Uses MMX registers mm0-mm7; no emms is issued in this routine.
517 global sym(vp8_sad4x4x3_sse3) PRIVATE
518 sym(vp8_sad4x4x3_sse3):
519
520 STACK_FRAME_CREATE_X3
521
; Rows 0 and 1: mm0 = interleaved source, mm1 = interleaved ref at +0.
522 movd mm0, DWORD PTR [src_ptr]
523 movd mm1, DWORD PTR [ref_ptr]
524
525 movd mm2, DWORD PTR [src_ptr+src_stride]
526 movd mm3, DWORD PTR [ref_ptr+ref_stride]
527
528 punpcklbw mm0, mm2
529 punpcklbw mm1, mm3
530
; Same two rows at reference offsets +1 (mm4) and +2 (mm5).
531 movd mm4, DWORD PTR [ref_ptr+1]
532 movd mm5, DWORD PTR [ref_ptr+2]
533
534 movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
535 movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
536
537 psadbw mm1, mm0
538
539 punpcklbw mm4, mm2
540 punpcklbw mm5, mm3
541
542 psadbw mm4, mm0
543 psadbw mm5, mm0
544
; Step both pointers down to rows 2 and 3.
545 lea src_ptr, [src_ptr+src_stride*2]
546 lea ref_ptr, [ref_ptr+ref_stride*2]
547
548 movd mm0, DWORD PTR [src_ptr]
549 movd mm2, DWORD PTR [ref_ptr]
550
551 movd mm3, DWORD PTR [src_ptr+src_stride]
552 movd mm6, DWORD PTR [ref_ptr+ref_stride]
553
554 punpcklbw mm0, mm3
555 punpcklbw mm2, mm6
556
557 movd mm3, DWORD PTR [ref_ptr+1]
558 movd mm7, DWORD PTR [ref_ptr+2]
559
560 psadbw mm2, mm0
561
; mm1 now holds the complete offset +0 sum.
562 paddw mm1, mm2
563
564 movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
565 movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
566
567 punpcklbw mm3, mm2
568 punpcklbw mm7, mm6
569
570 psadbw mm3, mm0
571 psadbw mm7, mm0
572
; Fold in the row 0/1 partial sums: mm3 = offset +1, mm7 = offset +2.
573 paddw mm3, mm4
574 paddw mm7, mm5
575
576 mov rcx, result_ptr
577
; Pack the low dwords of mm1 and mm3 so results[0..1] go out in one store.
578 punpckldq mm1, mm3
579
580 movq [rcx], mm1
581 movd [rcx+8], mm7
582
583 STACK_FRAME_DESTROY_X3
584
585 ;unsigned int vp8_sad16x16_sse3(
586 ; unsigned char *src_ptr,
587 ; int src_stride,
588 ; unsigned char *ref_ptr,
589 ; int ref_stride,
590 ; int max_sad)
; Returns, in rax, the sum of absolute differences between a 16-wide,
; 16-row source block and the reference block.
; src_ptr rows are read with movdqa (16-byte alignment required);
; ref_ptr rows are read with movdqu (unaligned loads).
; max_sad is not read anywhere in this routine.
591 ;%define lddqu movdqu
592 global sym(vp8_sad16x16_sse3) PRIVATE
593 sym(vp8_sad16x16_sse3):
594
595 STACK_FRAME_CREATE_X3
596
; end_ptr serves as the loop counter here: 4 iterations of 4 rows each.
597 mov end_ptr, 4
598 pxor xmm7, xmm7
599
600 .vp8_sad16x16_sse3_loop:
601 movdqa xmm0, XMMWORD PTR [src_ptr]
602 movdqu xmm1, XMMWORD PTR [ref_ptr]
603 movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
604 movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
605
606 lea src_ptr, [src_ptr+src_stride*2]
607 lea ref_ptr, [ref_ptr+ref_stride*2]
608
609 movdqa xmm4, XMMWORD PTR [src_ptr]
610 movdqu xmm5, XMMWORD PTR [ref_ptr]
611 movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
612
613 psadbw xmm0, xmm1
614
; xmm1 has been consumed by the psadbw above, so it is reused for the
; fourth reference row.
615 movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
616
617 psadbw xmm2, xmm3
618 psadbw xmm4, xmm5
619 psadbw xmm6, xmm1
620
621 lea src_ptr, [src_ptr+src_stride*2]
622 lea ref_ptr, [ref_ptr+ref_stride*2]
623
; Accumulate the four row results into xmm7.
624 paddw xmm7, xmm0
625 paddw xmm7, xmm2
626 paddw xmm7, xmm4
627 paddw xmm7, xmm6
628
629 sub end_ptr, 1
630 jne .vp8_sad16x16_sse3_loop
631
; xmm7 holds one partial sum per 64-bit half; add them and return in rax.
632 movq xmm0, xmm7
633 psrldq xmm7, 8
634 paddw xmm0, xmm7
635 movq rax, xmm0
636
637 STACK_FRAME_DESTROY_X3
638
639 ;void vp8_copy32xn_sse3(
640 ; unsigned char *src_ptr,
641 ; int src_stride,
642 ; unsigned char *dst_ptr,
643 ; int dst_stride,
644 ; int height);
; Copies `height` rows of 32 bytes from src_ptr to dst_ptr.
; The prologue's ref_ptr / ref_stride names hold dst_ptr / dst_stride here.
; Source rows are read with movdqu (unaligned loads); destination rows are
; written with movdqa, so every destination row must be 16-byte aligned.
; Rows are copied four at a time while at least four remain, then one at a
; time. A height of zero or less copies nothing.
; Uses xmm0-xmm7 and end_ptr as scratch.
645 global sym(vp8_copy32xn_sse3) PRIVATE
646 sym(vp8_copy32xn_sse3):
647
648 STACK_FRAME_CREATE_X3
649
; Guard: the four-row loop tests height only at its bottom, so without this
; check a height below 4 would still copy four rows and then enter the
; single-row loop with a negative count.
cmp height, 4
jl .block_copy_sse3_tail

650 .block_copy_sse3_loopx4:
; end_ptr = address of source row 2 (scratch pointer, not a loop bound).
651 lea end_ptr, [src_ptr+src_stride*2]
652
653 movdqu xmm0, XMMWORD PTR [src_ptr]
654 movdqu xmm1, XMMWORD PTR [src_ptr + 16]
655 movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
656 movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
657 movdqu xmm4, XMMWORD PTR [end_ptr]
658 movdqu xmm5, XMMWORD PTR [end_ptr + 16]
659 movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
660 movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
661
662 lea src_ptr, [src_ptr+src_stride*4]
663
; end_ptr = address of destination row 2.
664 lea end_ptr, [ref_ptr+ref_stride*2]
665
666 movdqa XMMWORD PTR [ref_ptr], xmm0
667 movdqa XMMWORD PTR [ref_ptr + 16], xmm1
668 movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
669 movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
670 movdqa XMMWORD PTR [end_ptr], xmm4
671 movdqa XMMWORD PTR [end_ptr + 16], xmm5
672 movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
673 movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
674
675 lea ref_ptr, [ref_ptr+ref_stride*4]
676
677 sub height, 4
678 cmp height, 4
679 jge .block_copy_sse3_loopx4
680
.block_copy_sse3_tail:
681 ;Check whether any rows remain to be copied.
682 cmp height, 0
683 jle .copy_is_done
684
; Tail: 1 to 3 remaining rows, copied one row per iteration.
685 .block_copy_sse3_loop:
686 movdqu xmm0, XMMWORD PTR [src_ptr]
687 movdqu xmm1, XMMWORD PTR [src_ptr + 16]
688 lea src_ptr, [src_ptr+src_stride]
689
690 movdqa XMMWORD PTR [ref_ptr], xmm0
691 movdqa XMMWORD PTR [ref_ptr + 16], xmm1
692 lea ref_ptr, [ref_ptr+ref_stride]
693
694 sub height, 1
695 jne .block_copy_sse3_loop
696
697 .copy_is_done:
698 STACK_FRAME_DESTROY_X3
699
700 ;void vp8_sad16x16x4d_sse3(
701 ; unsigned char *src_ptr,
702 ; int src_stride,
703 ; unsigned char *ref_ptr_base,
704 ; int ref_stride,
705 ; int *results)
; Computes four sums of absolute differences between one 16-wide, 16-row
; source block and four reference blocks. ref_ptr_base is read as a table
; of four pointers by the prologue; all four references share ref_stride.
; The sums are stored as 32-bit values at results[0..3].
; src_ptr rows are read with movdqa, so they must be 16-byte aligned.
706 global sym(vp8_sad16x16x4d_sse3) PRIVATE
707 sym(vp8_sad16x16x4d_sse3):
708
709 STACK_FRAME_CREATE_X4
710
; Eight invocations, two rows each: 16 rows in total.
711 PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
712 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
713 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
714 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
715 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
716 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
717 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
718 PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
719
; 32-bit only: rbp held ref_stride; restore the frame pointer pushed by the
; prologue before result_ptr (arg(4)) is read.
720 %if ABI_IS_32BIT
721 pop rbp
722 %endif
723 mov rcx, result_ptr
724
; Each accumulator holds one partial sum per 64-bit half; add the high
; half to the low half and store the low 32 bits.
725 movq xmm0, xmm4
726 psrldq xmm4, 8
727
728 paddw xmm0, xmm4
729 movd [rcx], xmm0
730 ;-
731 movq xmm0, xmm5
732 psrldq xmm5, 8
733
734 paddw xmm0, xmm5
735 movd [rcx+4], xmm0
736 ;-
737 movq xmm0, xmm6
738 psrldq xmm6, 8
739
740 paddw xmm0, xmm6
741 movd [rcx+8], xmm0
742 ;-
743 movq xmm0, xmm7
744 psrldq xmm7, 8
745
746 paddw xmm0, xmm7
747 movd [rcx+12], xmm0
748
749 STACK_FRAME_DESTROY_X4
750
751 ;void vp8_sad16x8x4d_sse3(
752 ; unsigned char *src_ptr,
753 ; int src_stride,
754 ; unsigned char *ref_ptr_base,
755 ; int ref_stride,
756 ; int *results)
; Computes four sums of absolute differences between one 16-wide, 8-row
; source block and four reference blocks. ref_ptr_base is read as a table
; of four pointers by the prologue; all four references share ref_stride.
; The sums are stored as 32-bit values at results[0..3].
; src_ptr rows are read with movdqa, so they must be 16-byte aligned.
757 global sym(vp8_sad16x8x4d_sse3) PRIVATE
758 sym(vp8_sad16x8x4d_sse3):
759
760 STACK_FRAME_CREATE_X4
761
; Four invocations, two rows each: 8 rows in total.
762 PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
763 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
764 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
765 PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
766
; 32-bit only: rbp held ref_stride; restore the frame pointer pushed by the
; prologue before result_ptr (arg(4)) is read.
767 %if ABI_IS_32BIT
768 pop rbp
769 %endif
770 mov rcx, result_ptr
771
; Each accumulator holds one partial sum per 64-bit half; add the high
; half to the low half and store the low 32 bits.
772 movq xmm0, xmm4
773 psrldq xmm4, 8
774
775 paddw xmm0, xmm4
776 movd [rcx], xmm0
777 ;-
778 movq xmm0, xmm5
779 psrldq xmm5, 8
780
781 paddw xmm0, xmm5
782 movd [rcx+4], xmm0
783 ;-
784 movq xmm0, xmm6
785 psrldq xmm6, 8
786
787 paddw xmm0, xmm6
788 movd [rcx+8], xmm0
789 ;-
790 movq xmm0, xmm7
791 psrldq xmm7, 8
792
793 paddw xmm0, xmm7
794 movd [rcx+12], xmm0
795
796 STACK_FRAME_DESTROY_X4
797
798 ;void vp8_sad8x16x4d_sse3(
799 ; unsigned char *src_ptr,
800 ; int src_stride,
801 ; unsigned char *ref_ptr,
802 ; int ref_stride,
803 ; int *results)
; Computes four sums of absolute differences between one 8-wide, 16-row
; source block and four reference blocks. The third argument is read as a
; table of four pointers by the prologue (the sibling 16-wide routines name
; it ref_ptr_base); all four references share ref_stride.
; The sums are stored as 32-bit values at results[0..3].
; Uses MMX registers mm0-mm7; no emms is issued in this routine.
804 global sym(vp8_sad8x16x4d_sse3) PRIVATE
805 sym(vp8_sad8x16x4d_sse3):
806
807 STACK_FRAME_CREATE_X4
808
; Eight invocations, two rows each: 16 rows in total.
809 PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
810 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
811 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
812 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
813 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
814 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
815 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
816 PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
817
; 32-bit only: rbp held ref_stride; restore the frame pointer pushed by the
; prologue before result_ptr (arg(4)) is read.
818 %if ABI_IS_32BIT
819 pop rbp
820 %endif
821 mov rcx, result_ptr
822
; Pack pairs of low dwords so the four results go out in two 64-bit stores.
823 punpckldq mm4, mm5
824 punpckldq mm6, mm7
825
826 movq [rcx], mm4
827 movq [rcx+8], mm6
828
829 STACK_FRAME_DESTROY_X4
830
831 ;void vp8_sad8x8x4d_sse3(
832 ; unsigned char *src_ptr,
833 ; int src_stride,
834 ; unsigned char *ref_ptr,
835 ; int ref_stride,
836 ; int *results)
; Computes four sums of absolute differences between one 8-wide, 8-row
; source block and four reference blocks. The third argument is read as a
; table of four pointers by the prologue (the sibling 16-wide routines name
; it ref_ptr_base); all four references share ref_stride.
; The sums are stored as 32-bit values at results[0..3].
; Uses MMX registers mm0-mm7; no emms is issued in this routine.
837 global sym(vp8_sad8x8x4d_sse3) PRIVATE
838 sym(vp8_sad8x8x4d_sse3):
839
840 STACK_FRAME_CREATE_X4
841
; Four invocations, two rows each: 8 rows in total.
842 PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
843 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
844 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
845 PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
846
; 32-bit only: rbp held ref_stride; restore the frame pointer pushed by the
; prologue before result_ptr (arg(4)) is read.
847 %if ABI_IS_32BIT
848 pop rbp
849 %endif
850 mov rcx, result_ptr
851
; Pack pairs of low dwords so the four results go out in two 64-bit stores.
852 punpckldq mm4, mm5
853 punpckldq mm6, mm7
854
855 movq [rcx], mm4
856 movq [rcx+8], mm6
857
858 STACK_FRAME_DESTROY_X4
859
860 ;void vp8_sad4x4x4d_sse3(
861 ; unsigned char *src_ptr,
862 ; int src_stride,
863 ; unsigned char *ref_ptr,
864 ; int ref_stride,
865 ; int *results)
; Computes four sums of absolute differences between one 4-wide, 4-row
; source block and four reference blocks. The third argument is read as a
; table of four pointers by the prologue; all four references share
; ref_stride. The sums are stored as 32-bit values at results[0..3].
; Rows are handled in pairs: two 4-byte rows are interleaved (punpcklbw)
; into one 8-byte MMX register so a single psadbw covers both rows.
; Uses MMX registers mm0-mm7; no emms is issued in this routine.
866 global sym(vp8_sad4x4x4d_sse3) PRIVATE
867 sym(vp8_sad4x4x4d_sse3):
868
869 STACK_FRAME_CREATE_X4
870
; Rows 0 and 1: mm0 = interleaved source, mm1 = interleaved ref0.
871 movd mm0, DWORD PTR [src_ptr]
872 movd mm1, DWORD PTR [r0_ptr]
873
874 movd mm2, DWORD PTR [src_ptr+src_stride]
875 movd mm3, DWORD PTR [r0_ptr+ref_stride]
876
877 punpcklbw mm0, mm2
878 punpcklbw mm1, mm3
879
; Rows 0 and 1 of ref1 (mm4), ref2 (mm5) and ref3 (mm6).
880 movd mm4, DWORD PTR [r1_ptr]
881 movd mm5, DWORD PTR [r2_ptr]
882
883 movd mm6, DWORD PTR [r3_ptr]
884 movd mm2, DWORD PTR [r1_ptr+ref_stride]
885
886 movd mm3, DWORD PTR [r2_ptr+ref_stride]
887 movd mm7, DWORD PTR [r3_ptr+ref_stride]
888
889 psadbw mm1, mm0
890
891 punpcklbw mm4, mm2
892 punpcklbw mm5, mm3
893
894 punpcklbw mm6, mm7
895 psadbw mm4, mm0
896
897 psadbw mm5, mm0
898 psadbw mm6, mm0
899
900
901
; Step all five pointers down to rows 2 and 3.
902 lea src_ptr, [src_ptr+src_stride*2]
903 lea r0_ptr, [r0_ptr+ref_stride*2]
904
905 lea r1_ptr, [r1_ptr+ref_stride*2]
906 lea r2_ptr, [r2_ptr+ref_stride*2]
907
908 lea r3_ptr, [r3_ptr+ref_stride*2]
909
910 movd mm0, DWORD PTR [src_ptr]
911 movd mm2, DWORD PTR [r0_ptr]
912
913 movd mm3, DWORD PTR [src_ptr+src_stride]
914 movd mm7, DWORD PTR [r0_ptr+ref_stride]
915
916 punpcklbw mm0, mm3
917 punpcklbw mm2, mm7
918
919 movd mm3, DWORD PTR [r1_ptr]
920 movd mm7, DWORD PTR [r2_ptr]
921
922 psadbw mm2, mm0
; 32-bit only: src_stride (rax) is not read again, so ref_stride is moved
; from rbp into rax, the frame pointer pushed by the prologue is restored
; so result_ptr (arg(4)) can be read, and ref_stride is rebound to rax for
; the remaining loads.
923 %if ABI_IS_32BIT
924 mov rax, rbp
925
926 pop rbp
927 %define ref_stride rax
928 %endif
; rsi becomes the results pointer. Whichever name rsi carried under the
; active ABI (src_ptr, src_stride or r0_ptr) is not read after this point.
929 mov rsi, result_ptr
930
; results[0]: ref0 sum over all four rows.
931 paddw mm1, mm2
932 movd [rsi], mm1
933
934 movd mm2, DWORD PTR [r1_ptr+ref_stride]
935 movd mm1, DWORD PTR [r2_ptr+ref_stride]
936
937 punpcklbw mm3, mm2
938 punpcklbw mm7, mm1
939
940 psadbw mm3, mm0
941 psadbw mm7, mm0
942
943 movd mm2, DWORD PTR [r3_ptr]
944 movd mm1, DWORD PTR [r3_ptr+ref_stride]
945
; Fold in the row 0/1 partial sums: mm3 = ref1 total, mm7 = ref2 total.
946 paddw mm3, mm4
947 paddw mm7, mm5
948
949 movd [rsi+4], mm3
950 punpcklbw mm2, mm1
951
952 movd [rsi+8], mm7
953 psadbw mm2, mm0
954
; results[3]: ref3 sum over all four rows.
955 paddw mm2, mm6
956 movd [rsi+12], mm2
957
958
959 STACK_FRAME_DESTROY_X4
960
OLDNEW
« no previous file with comments | « source/libvpx/vp8/common/x86/sad_sse2.asm ('k') | source/libvpx/vp8/common/x86/sad_sse4.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698