Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(116)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm

Issue 592203002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 ;Note: tap3 and tap4 have to be applied and added after other taps to avoid
15 ;overflow.
16
; HIGH_GET_FILTERS_4
; Prepares the constants used by HIGH_APPLY_FILTER_4 and stores them in the
; stack slots that the including function must %define beforehand:
;   k0k6, k2k5, k3k4, k1k7 - filter taps interleaved as 16-bit pairs
;   krd                    - rounding constant 0x40 replicated in every dword
;   max                    - (1 << bps) - 1 replicated in every word
;   min                    - all zero
; Reads arg(5) (filter pointer, loaded with movdqa so it must be 16-byte
; aligned) and arg(6) (bps, read as a dword).
; Overwrites rdx, rcx and xmm0-xmm7.
17 %macro HIGH_GET_FILTERS_4 0
18 mov rdx, arg(5) ;filter ptr
19 mov rcx, 0x00000040 ;rounding term: 64, half of the 1 << 7 divisor used after filtering
20
21 movdqa xmm7, [rdx] ;load filters
; each pshuflw replicates one tap across the four low words of its register
22 pshuflw xmm0, xmm7, 0b ;k0
23 pshuflw xmm1, xmm7, 01010101b ;k1
24 pshuflw xmm2, xmm7, 10101010b ;k2
25 pshuflw xmm3, xmm7, 11111111b ;k3
26 psrldq xmm7, 8 ;move taps 4..7 down into the low qword
27 pshuflw xmm4, xmm7, 0b ;k4
28 pshuflw xmm5, xmm7, 01010101b ;k5
29 pshuflw xmm6, xmm7, 10101010b ;k6
30 pshuflw xmm7, xmm7, 11111111b ;k7
31
; interleave tap pairs so one pmaddwd applies two taps at once
32 punpcklwd xmm0, xmm6 ;k0,k6 repeated
33 punpcklwd xmm2, xmm5 ;k2,k5 repeated
34 punpcklwd xmm3, xmm4 ;k3,k4 repeated
35 punpcklwd xmm1, xmm7 ;k1,k7 repeated
36
37 movdqa k0k6, xmm0
38 movdqa k2k5, xmm2
39 movdqa k3k4, xmm3
40 movdqa k1k7, xmm1
41
42 movq xmm6, rcx
43 pshufd xmm6, xmm6, 0 ;replicate the rounding dword into all four lanes
44 movdqa krd, xmm6
45
46 ;Compute max and min values of a pixel
47 mov rdx, 0x00010001 ;two words, each holding 1
48 movsxd rcx, DWORD PTR arg(6) ;bps
49 movq xmm0, rdx
50 movq xmm1, rcx ;shift count for psllw
51 pshufd xmm0, xmm0, 0b ;every word = 1
52 movdqa xmm2, xmm0
53 psllw xmm0, xmm1 ;every word = 1 << bps
54 psubw xmm0, xmm2 ;every word = (1 << bps) - 1
55 pxor xmm1, xmm1
56 movdqa max, xmm0 ;max value (for clamping)
57 movdqa min, xmm1 ;min value (for clamping)
58
59 %endm
60
; HIGH_APPLY_FILTER_4 avg
; Filters four 16-bit samples and writes four 16-bit results to [rdi].
; In:  xmm0..xmm7 - low qword of xmmN holds the four input words for tap N
;      k0k6, k1k7, k2k5, k3k4, krd, max, min - stack slots filled by
;      HIGH_GET_FILTERS_4
;      rdi - destination address
;      %1  - when non-zero, the result is averaged (pavgw) with the four
;            words already stored at [rdi] before being written back
; Overwrites xmm0-xmm3.
61 %macro HIGH_APPLY_FILTER_4 1
62 punpcklwd xmm0, xmm6 ;two row in one register
63 punpcklwd xmm1, xmm7
64 punpcklwd xmm2, xmm5
65 punpcklwd xmm3, xmm4
66
67 pmaddwd xmm0, k0k6 ;multiply the filter factors
68 pmaddwd xmm1, k1k7
69 pmaddwd xmm2, k2k5
70 pmaddwd xmm3, k3k4
71
72 paddd xmm0, xmm1 ;sum
73 paddd xmm0, xmm2
74 paddd xmm0, xmm3
75
76 paddd xmm0, krd ;rounding
77 psrad xmm0, 7 ;shift
78 packssdw xmm0, xmm0 ;pack to word
79
80 ;clamp the values
81 pminsw xmm0, max
82 pmaxsw xmm0, min
83
84 %if %1
85 movq xmm1, [rdi] ;current destination words
86 pavgw xmm0, xmm1 ;rounded average of filtered and existing values
87 %endif
88 movq [rdi], xmm0 ;store 4 words (8 bytes)
89 %endm
90
; HIGH_GET_FILTERS
; Prepares pointers and constants for the 8- and 16-wide filters.
; Loads rsi = arg(0) (source) and rdi = arg(2) (destination), then fills the
; stack slots that the including function must %define beforehand:
;   k0k1, k6k7, k2k5, k3k4 - filter taps interleaved as 16-bit pairs
;   krd                    - rounding constant 0x40 replicated in every dword
;   max                    - (1 << bps) - 1 replicated in every word
;   min                    - all zero
; Reads arg(5) (filter pointer, loaded with movdqa so it must be 16-byte
; aligned) and arg(6) (bps, read as a dword).
; Overwrites rdx, rcx, rsi, rdi and xmm0-xmm7.
91 %macro HIGH_GET_FILTERS 0
92 mov rdx, arg(5) ;filter ptr
93 mov rsi, arg(0) ;src_ptr
94 mov rdi, arg(2) ;output_ptr
95 mov rcx, 0x00000040 ;rounding term: 64, half of the 1 << 7 divisor used after filtering
96
97 movdqa xmm7, [rdx] ;load filters
; pshuflw replicates a tap across the low four words; pshufhw does the same
; across the high four words
98 pshuflw xmm0, xmm7, 0b ;k0
99 pshuflw xmm1, xmm7, 01010101b ;k1
100 pshuflw xmm2, xmm7, 10101010b ;k2
101 pshuflw xmm3, xmm7, 11111111b ;k3
102 pshufhw xmm4, xmm7, 0b ;k4
103 pshufhw xmm5, xmm7, 01010101b ;k5
104 pshufhw xmm6, xmm7, 10101010b ;k6
105 pshufhw xmm7, xmm7, 11111111b ;k7
; k2 and k3 live in the low qword but are paired with high-qword taps below,
; so copy them into the high qword first
106 punpcklqdq xmm2, xmm2
107 punpcklqdq xmm3, xmm3
108 punpcklwd xmm0, xmm1 ;k0,k1 repeated
109 punpckhwd xmm6, xmm7 ;k6,k7 repeated
110 punpckhwd xmm2, xmm5 ;k2,k5 repeated
111 punpckhwd xmm3, xmm4 ;k3,k4 repeated
112
113 movdqa k0k1, xmm0 ;store filter factors on stack
114 movdqa k6k7, xmm6
115 movdqa k2k5, xmm2
116 movdqa k3k4, xmm3
117
118 movq xmm6, rcx
119 pshufd xmm6, xmm6, 0 ;replicate the rounding dword into all four lanes
120 movdqa krd, xmm6 ;rounding
121
122 ;Compute max and min values of a pixel
123 mov rdx, 0x00010001 ;two words, each holding 1
124 movsxd rcx, DWORD PTR arg(6) ;bps
125 movq xmm0, rdx
126 movq xmm1, rcx ;shift count for psllw
127 pshufd xmm0, xmm0, 0b ;every word = 1
128 movdqa xmm2, xmm0
129 psllw xmm0, xmm1 ;every word = 1 << bps
130 psubw xmm0, xmm2 ;every word = (1 << bps) - 1
131 pxor xmm1, xmm1
132 movdqa max, xmm0 ;max value (for clamping)
133 movdqa min, xmm1 ;min value (for clamping)
134 %endm
135
; LOAD_VERT_8 offset
; Loads eight consecutive source rows (8 words each, unaligned) into
; xmm0..xmm7, row N going to xmmN.
; In:  rsi - address of row 0
;      rax - distance between rows in bytes
;      rdx - the callers in this file set this to 3 * rax
;      %1  - byte offset added to every row address
; Side effect: rsi is advanced by rax (one row) part-way through, which is
; why rows 7 and 2..5 use addresses relative to the new rsi.
136 %macro LOAD_VERT_8 1
137 movdqu xmm0, [rsi + %1] ;0
138 movdqu xmm1, [rsi + rax + %1] ;1
139 movdqu xmm6, [rsi + rdx * 2 + %1] ;6
140 lea rsi, [rsi + rax] ;rsi now points at row 1
141 movdqu xmm7, [rsi + rdx * 2 + %1] ;7
142 movdqu xmm2, [rsi + rax + %1] ;2
143 movdqu xmm3, [rsi + rax * 2 + %1] ;3
144 movdqu xmm4, [rsi + rdx + %1] ;4
145 movdqu xmm5, [rsi + rax * 4 + %1] ;5
146 %endm
147
; HIGH_APPLY_FILTER_8 avg, offset
; Filters eight 16-bit samples and writes eight 16-bit results to
; [rdi + offset].
; In:  xmm0..xmm7 - xmmN holds the eight input words for tap N
;      k0k1, k6k7, k2k5, k3k4, krd, max, min - stack slots filled by
;      HIGH_GET_FILTERS
;      temp - 16-byte scratch stack slot
;      rdi  - destination base address
;      %1   - when non-zero, the result is averaged (pavgw) with the eight
;             words already stored at [rdi + %2] before being written back
;      %2   - byte offset added to rdi
; Overwrites xmm0-xmm7 and the temp slot.
; All eight xmm registers are live on entry, so temp is used to park one
; register while the inputs are interleaved into low/high halves.
148 %macro HIGH_APPLY_FILTER_8 2
149 movdqu temp, xmm4 ;park tap-4 input to free xmm4
150 movdqa xmm4, xmm0
151 punpcklwd xmm0, xmm1 ;xmm0 = taps 0,1 interleaved, low four samples
152 punpckhwd xmm4, xmm1 ;xmm4 = taps 0,1 interleaved, high four samples
153 movdqa xmm1, xmm6
154 punpcklwd xmm6, xmm7 ;xmm6 = taps 6,7 low
155 punpckhwd xmm1, xmm7 ;xmm1 = taps 6,7 high
156 movdqa xmm7, xmm2
157 punpcklwd xmm2, xmm5 ;xmm2 = taps 2,5 low
158 punpckhwd xmm7, xmm5 ;xmm7 = taps 2,5 high
159
160 movdqu xmm5, temp ;bring tap-4 input back
161 movdqu temp, xmm4 ;park taps 0,1 high
162 movdqa xmm4, xmm3
163 punpcklwd xmm3, xmm5 ;xmm3 = taps 3,4 low
164 punpckhwd xmm4, xmm5 ;xmm4 = taps 3,4 high
165 movdqu xmm5, temp ;xmm5 = taps 0,1 high
166
167 pmaddwd xmm0, k0k1
168 pmaddwd xmm5, k0k1
169 pmaddwd xmm6, k6k7
170 pmaddwd xmm1, k6k7
171 pmaddwd xmm2, k2k5
172 pmaddwd xmm7, k2k5
173 pmaddwd xmm3, k3k4
174 pmaddwd xmm4, k3k4
175
; xmm0 accumulates the low four results, xmm5 the high four
176 paddd xmm0, xmm6
177 paddd xmm0, xmm2
178 paddd xmm0, xmm3
179 paddd xmm5, xmm1
180 paddd xmm5, xmm7
181 paddd xmm5, xmm4
182
183 paddd xmm0, krd ;rounding
184 paddd xmm5, krd
185 psrad xmm0, 7 ;shift
186 psrad xmm5, 7
187 packssdw xmm0, xmm5 ;pack back to word
188
189 ;clamp the values
190 pminsw xmm0, max
191 pmaxsw xmm0, min
192
193 %if %1
194 movdqu xmm1, [rdi + %2] ;current destination words
195 pavgw xmm0, xmm1 ;rounded average of filtered and existing values
196 %endif
197 movdqu [rdi + %2], xmm0 ;store 8 words (16 bytes)
198 %endm
199
200 ;void vp9_high_filter_block1d4_v8_sse2
201 ;(
202 ; unsigned short *src_ptr,
203 ; unsigned int src_pitch,
204 ; unsigned short *output_ptr,
205 ; unsigned int out_pitch,
206 ; unsigned int output_height,
207 ; short *filter,
; int bps
208 ;)
; NOTE(review): the parameter types above are inferred from how the code uses
; the arguments (16-bit samples, dword pitches/height/bps); confirm against
; the C prototype.
;
; Vertical 8-tap filter, 4 samples wide. For each of output_height rows it
; combines 4 words from each of 8 consecutive source rows starting at the
; current source row and stores 4 clamped words to the destination row.
; Pitches are given in samples and doubled here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row, rax = source stride (bytes),
;   rbx = destination stride (bytes), rdx = 3 * rax, rcx = rows remaining
209 global sym(vp9_high_filter_block1d4_v8_sse2) PRIVATE
210 sym(vp9_high_filter_block1d4_v8_sse2):
211 push rbp
212 mov rbp, rsp
213 SHADOW_ARGS_TO_STACK 7
214 SAVE_XMM 7
215 push rsi
216 push rdi
217 push rbx
218 ; end prolog
219
220 ALIGN_STACK 16, rax
221 sub rsp, 16 * 7 ;seven 16-byte slots for the constants below
222 %define k0k6 [rsp + 16 * 0]
223 %define k2k5 [rsp + 16 * 1]
224 %define k3k4 [rsp + 16 * 2]
225 %define k1k7 [rsp + 16 * 3]
226 %define krd [rsp + 16 * 4]
227 %define max [rsp + 16 * 5]
228 %define min [rsp + 16 * 6]
229
230 HIGH_GET_FILTERS_4
231
232 mov rsi, arg(0) ;src_ptr
233 mov rdi, arg(2) ;output_ptr
234
235 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
236 movsxd rbx, DWORD PTR arg(3) ;out_pitch
237 lea rax, [rax + rax] ;bytes per line
238 lea rbx, [rbx + rbx] ;destination bytes per line
239 lea rdx, [rax + rax * 2] ;3 * source stride
240 movsxd rcx, DWORD PTR arg(4) ;output_height
241
242 .loop:
243 movq xmm0, [rsi] ;load src: row 0
244 movq xmm1, [rsi + rax] ;1
245 movq xmm6, [rsi + rdx * 2] ;6
246 lea rsi, [rsi + rax] ;advance to the next source row; loads below are relative to it
247 movq xmm7, [rsi + rdx * 2] ;7
248 movq xmm2, [rsi + rax] ;2
249 movq xmm3, [rsi + rax * 2] ;3
250 movq xmm4, [rsi + rdx] ;4
251 movq xmm5, [rsi + rax * 4] ;5
252
253 HIGH_APPLY_FILTER_4 0
254
255 lea rdi, [rdi + rbx] ;next destination row
256 dec rcx
257 jnz .loop
258
259 add rsp, 16 * 7 ;release the constant slots
260 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
261 pop rbx
262 ; begin epilog
263 pop rdi
264 pop rsi
265 RESTORE_XMM
266 UNSHADOW_ARGS
267 pop rbp
268 ret
269
270 ;void vp9_high_filter_block1d8_v8_sse2
271 ;(
272 ; unsigned short *src_ptr,
273 ; unsigned int src_pitch,
274 ; unsigned short *output_ptr,
275 ; unsigned int out_pitch,
276 ; unsigned int output_height,
277 ; short *filter,
; int bps
278 ;)
; NOTE(review): the parameter types above are inferred from how the code uses
; the arguments (16-bit samples, dword pitches/height/bps); confirm against
; the C prototype.
;
; Vertical 8-tap filter, 8 samples wide. For each of output_height rows it
; combines 8 words from each of 8 consecutive source rows starting at the
; current source row and stores 8 clamped words to the destination row.
; Pitches are given in samples and doubled here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row (both loaded by HIGH_GET_FILTERS),
;   rax = source stride (bytes), rbx = destination stride (bytes),
;   rdx = 3 * rax, rcx = rows remaining
279 global sym(vp9_high_filter_block1d8_v8_sse2) PRIVATE
280 sym(vp9_high_filter_block1d8_v8_sse2):
281 push rbp
282 mov rbp, rsp
283 SHADOW_ARGS_TO_STACK 7
284 SAVE_XMM 7
285 push rsi
286 push rdi
287 push rbx
288 ; end prolog
289
290 ALIGN_STACK 16, rax
291 sub rsp, 16 * 8 ;eight 16-byte slots for the constants and scratch below
292 %define k0k1 [rsp + 16 * 0]
293 %define k6k7 [rsp + 16 * 1]
294 %define k2k5 [rsp + 16 * 2]
295 %define k3k4 [rsp + 16 * 3]
296 %define krd [rsp + 16 * 4]
297 %define temp [rsp + 16 * 5]
298 %define max [rsp + 16 * 6]
299 %define min [rsp + 16 * 7]
300
301 HIGH_GET_FILTERS
302
303 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
304 movsxd rbx, DWORD PTR arg(3) ;out_pitch
305 lea rax, [rax + rax] ;bytes per line
306 lea rbx, [rbx + rbx] ;destination bytes per line
307 lea rdx, [rax + rax * 2] ;3 * source stride
308 movsxd rcx, DWORD PTR arg(4) ;output_height
309
310 .loop:
311 LOAD_VERT_8 0 ;loads 8 rows and advances rsi by one row
312 HIGH_APPLY_FILTER_8 0, 0
313
314 lea rdi, [rdi + rbx] ;next destination row
315 dec rcx
316 jnz .loop
317
318 add rsp, 16 * 8 ;release the stack slots
319 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
320 pop rbx
321 ; begin epilog
322 pop rdi
323 pop rsi
324 RESTORE_XMM
325 UNSHADOW_ARGS
326 pop rbp
327 ret
328
329 ;void vp9_high_filter_block1d16_v8_sse2
330 ;(
331 ; unsigned short *src_ptr,
332 ; unsigned int src_pitch,
333 ; unsigned short *output_ptr,
334 ; unsigned int out_pitch,
335 ; unsigned int output_height,
336 ; short *filter,
; int bps
337 ;)
; NOTE(review): the parameter types above are inferred from how the code uses
; the arguments (16-bit samples, dword pitches/height/bps); confirm against
; the C prototype.
;
; Vertical 8-tap filter, 16 samples wide, processed as two 8-sample halves
; (byte offsets 0 and 16) per row. Pitches are given in samples and doubled
; here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row (both loaded by HIGH_GET_FILTERS),
;   rax = source stride (bytes), rbx = destination stride (bytes),
;   rdx = 3 * rax, rcx = rows remaining
338 global sym(vp9_high_filter_block1d16_v8_sse2) PRIVATE
339 sym(vp9_high_filter_block1d16_v8_sse2):
340 push rbp
341 mov rbp, rsp
342 SHADOW_ARGS_TO_STACK 7
343 SAVE_XMM 7
344 push rsi
345 push rdi
346 push rbx
347 ; end prolog
348
349 ALIGN_STACK 16, rax
350 sub rsp, 16 * 8 ;eight 16-byte slots for the constants and scratch below
351 %define k0k1 [rsp + 16 * 0]
352 %define k6k7 [rsp + 16 * 1]
353 %define k2k5 [rsp + 16 * 2]
354 %define k3k4 [rsp + 16 * 3]
355 %define krd [rsp + 16 * 4]
356 %define temp [rsp + 16 * 5]
357 %define max [rsp + 16 * 6]
358 %define min [rsp + 16 * 7]
359
360 HIGH_GET_FILTERS
361
362 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
363 movsxd rbx, DWORD PTR arg(3) ;out_pitch
364 lea rax, [rax + rax] ;bytes per line
365 lea rbx, [rbx + rbx] ;destination bytes per line
366 lea rdx, [rax + rax * 2] ;3 * source stride
367 movsxd rcx, DWORD PTR arg(4) ;output_height
368
369 .loop:
370 LOAD_VERT_8 0 ;left half; also advances rsi by one row
371 HIGH_APPLY_FILTER_8 0, 0
372 sub rsi, rax ;undo that advance so the right half reads the same rows
373
374 LOAD_VERT_8 16 ;right half; rsi ends up one row further on
375 HIGH_APPLY_FILTER_8 0, 16
376 add rdi, rbx ;next destination row
377
378 dec rcx
379 jnz .loop
380
381 add rsp, 16 * 8 ;release the stack slots
382 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
383 pop rbx
384 ; begin epilog
385 pop rdi
386 pop rsi
387 RESTORE_XMM
388 UNSHADOW_ARGS
389 pop rbp
390 ret
391
; vp9_high_filter_block1d4_v8_avg_sse2
; Vertical 8-tap filter, 4 samples wide, averaging variant: each filtered and
; clamped result is averaged (pavgw) with the word already present in the
; destination before being stored.
; Arguments as read by the code: arg(0) source pointer, arg(1) source pitch,
; arg(2) destination pointer, arg(3) destination pitch, arg(4) output height,
; arg(5) filter pointer, arg(6) bps. Pitches are given in samples and doubled
; here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row, rax = source stride (bytes),
;   rbx = destination stride (bytes), rdx = 3 * rax, rcx = rows remaining
392 global sym(vp9_high_filter_block1d4_v8_avg_sse2) PRIVATE
393 sym(vp9_high_filter_block1d4_v8_avg_sse2):
394 push rbp
395 mov rbp, rsp
396 SHADOW_ARGS_TO_STACK 7
397 SAVE_XMM 7
398 push rsi
399 push rdi
400 push rbx
401 ; end prolog
402
403 ALIGN_STACK 16, rax
404 sub rsp, 16 * 7 ;seven 16-byte slots for the constants below
405 %define k0k6 [rsp + 16 * 0]
406 %define k2k5 [rsp + 16 * 1]
407 %define k3k4 [rsp + 16 * 2]
408 %define k1k7 [rsp + 16 * 3]
409 %define krd [rsp + 16 * 4]
410 %define max [rsp + 16 * 5]
411 %define min [rsp + 16 * 6]
412
413 HIGH_GET_FILTERS_4
414
415 mov rsi, arg(0) ;src_ptr
416 mov rdi, arg(2) ;output_ptr
417
418 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
419 movsxd rbx, DWORD PTR arg(3) ;out_pitch
420 lea rax, [rax + rax] ;bytes per line
421 lea rbx, [rbx + rbx] ;destination bytes per line
422 lea rdx, [rax + rax * 2] ;3 * source stride
423 movsxd rcx, DWORD PTR arg(4) ;output_height
424
425 .loop:
426 movq xmm0, [rsi] ;load src: row 0
427 movq xmm1, [rsi + rax] ;1
428 movq xmm6, [rsi + rdx * 2] ;6
429 lea rsi, [rsi + rax] ;advance to the next source row; loads below are relative to it
430 movq xmm7, [rsi + rdx * 2] ;7
431 movq xmm2, [rsi + rax] ;2
432 movq xmm3, [rsi + rax * 2] ;3
433 movq xmm4, [rsi + rdx] ;4
434 movq xmm5, [rsi + rax * 4] ;5
435
436 HIGH_APPLY_FILTER_4 1
437
438 lea rdi, [rdi + rbx] ;next destination row
439 dec rcx
440 jnz .loop
441
442 add rsp, 16 * 7 ;release the constant slots
443 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
444 pop rbx
445 ; begin epilog
446 pop rdi
447 pop rsi
448 RESTORE_XMM
449 UNSHADOW_ARGS
450 pop rbp
451 ret
452
; vp9_high_filter_block1d8_v8_avg_sse2
; Vertical 8-tap filter, 8 samples wide, averaging variant: each filtered and
; clamped result is averaged (pavgw) with the word already present in the
; destination before being stored.
; Arguments as read by the code: arg(0) source pointer, arg(1) source pitch,
; arg(2) destination pointer, arg(3) destination pitch, arg(4) output height,
; arg(5) filter pointer, arg(6) bps. Pitches are given in samples and doubled
; here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row (both loaded by HIGH_GET_FILTERS),
;   rax = source stride (bytes), rbx = destination stride (bytes),
;   rdx = 3 * rax, rcx = rows remaining
453 global sym(vp9_high_filter_block1d8_v8_avg_sse2) PRIVATE
454 sym(vp9_high_filter_block1d8_v8_avg_sse2):
455 push rbp
456 mov rbp, rsp
457 SHADOW_ARGS_TO_STACK 7
458 SAVE_XMM 7
459 push rsi
460 push rdi
461 push rbx
462 ; end prolog
463
464 ALIGN_STACK 16, rax
465 sub rsp, 16 * 8 ;eight 16-byte slots for the constants and scratch below
466 %define k0k1 [rsp + 16 * 0]
467 %define k6k7 [rsp + 16 * 1]
468 %define k2k5 [rsp + 16 * 2]
469 %define k3k4 [rsp + 16 * 3]
470 %define krd [rsp + 16 * 4]
471 %define temp [rsp + 16 * 5]
472 %define max [rsp + 16 * 6]
473 %define min [rsp + 16 * 7]
474
475 HIGH_GET_FILTERS
476
477 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
478 movsxd rbx, DWORD PTR arg(3) ;out_pitch
479 lea rax, [rax + rax] ;bytes per line
480 lea rbx, [rbx + rbx] ;destination bytes per line
481 lea rdx, [rax + rax * 2] ;3 * source stride
482 movsxd rcx, DWORD PTR arg(4) ;output_height
483 .loop:
484 LOAD_VERT_8 0 ;loads 8 rows and advances rsi by one row
485 HIGH_APPLY_FILTER_8 1, 0
486
487 lea rdi, [rdi + rbx] ;next destination row
488 dec rcx
489 jnz .loop
490
491 add rsp, 16 * 8 ;release the stack slots
492 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
493 pop rbx
494 ; begin epilog
495 pop rdi
496 pop rsi
497 RESTORE_XMM
498 UNSHADOW_ARGS
499 pop rbp
500 ret
501
; vp9_high_filter_block1d16_v8_avg_sse2
; Vertical 8-tap filter, 16 samples wide, averaging variant, processed as two
; 8-sample halves (byte offsets 0 and 16) per row. Each filtered and clamped
; result is averaged (pavgw) with the word already present in the destination
; before being stored.
; Arguments as read by the code: arg(0) source pointer, arg(1) source pitch,
; arg(2) destination pointer, arg(3) destination pitch, arg(4) output height,
; arg(5) filter pointer, arg(6) bps. Pitches are given in samples and doubled
; here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row (both loaded by HIGH_GET_FILTERS),
;   rax = source stride (bytes), rbx = destination stride (bytes),
;   rdx = 3 * rax, rcx = rows remaining
502 global sym(vp9_high_filter_block1d16_v8_avg_sse2) PRIVATE
503 sym(vp9_high_filter_block1d16_v8_avg_sse2):
504 push rbp
505 mov rbp, rsp
506 SHADOW_ARGS_TO_STACK 7
507 SAVE_XMM 7
508 push rsi
509 push rdi
510 push rbx
511 ; end prolog
512
513 ALIGN_STACK 16, rax
514 sub rsp, 16 * 8 ;eight 16-byte slots for the constants and scratch below
515 %define k0k1 [rsp + 16 * 0]
516 %define k6k7 [rsp + 16 * 1]
517 %define k2k5 [rsp + 16 * 2]
518 %define k3k4 [rsp + 16 * 3]
519 %define krd [rsp + 16 * 4]
520 %define temp [rsp + 16 * 5]
521 %define max [rsp + 16 * 6]
522 %define min [rsp + 16 * 7]
523
524 HIGH_GET_FILTERS
525
526 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
527 movsxd rbx, DWORD PTR arg(3) ;out_pitch
528 lea rax, [rax + rax] ;bytes per line
529 lea rbx, [rbx + rbx] ;destination bytes per line
530 lea rdx, [rax + rax * 2] ;3 * source stride
531 movsxd rcx, DWORD PTR arg(4) ;output_height
532 .loop:
533 LOAD_VERT_8 0 ;left half; also advances rsi by one row
534 HIGH_APPLY_FILTER_8 1, 0
535 sub rsi, rax ;undo that advance so the right half reads the same rows
536
537 LOAD_VERT_8 16 ;right half; rsi ends up one row further on
538 HIGH_APPLY_FILTER_8 1, 16
539 add rdi, rbx ;next destination row
540
541 dec rcx
542 jnz .loop
543
544 add rsp, 16 * 8 ;release the stack slots
545 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
546 pop rbx
547 ; begin epilog
548 pop rdi
549 pop rsi
550 RESTORE_XMM
551 UNSHADOW_ARGS
552 pop rbp
553 ret
554
555 ;void vp9_high_filter_block1d4_h8_sse2
556 ;(
557 ; unsigned short *src_ptr,
558 ; unsigned int src_pixels_per_line,
559 ; unsigned short *output_ptr,
560 ; unsigned int output_pitch,
561 ; unsigned int output_height,
562 ; short *filter,
; int bps
563 ;)
; NOTE(review): the parameter types above are inferred from how the code uses
; the arguments (16-bit samples, dword pitches/height/bps); confirm against
; the C prototype.
;
; Horizontal 8-tap filter, 4 samples wide. For each of output_height rows it
; reads source words starting 6 bytes (3 samples) before rsi, builds the
; eight shifted tap inputs in xmm0..xmm7 and stores 4 clamped words to the
; destination row. Pitches are given in samples and doubled here to get byte
; strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row, rax = source stride (bytes),
;   rdx = destination stride (bytes), rcx = rows remaining
564 global sym(vp9_high_filter_block1d4_h8_sse2) PRIVATE
565 sym(vp9_high_filter_block1d4_h8_sse2):
566 push rbp
567 mov rbp, rsp
568 SHADOW_ARGS_TO_STACK 7
569 SAVE_XMM 7
570 push rsi
571 push rdi
572 ; end prolog
573
574 ALIGN_STACK 16, rax
575 sub rsp, 16 * 7 ;seven 16-byte slots for the constants below
576 %define k0k6 [rsp + 16 * 0]
577 %define k2k5 [rsp + 16 * 1]
578 %define k3k4 [rsp + 16 * 2]
579 %define k1k7 [rsp + 16 * 3]
580 %define krd [rsp + 16 * 4]
581 %define max [rsp + 16 * 5]
582 %define min [rsp + 16 * 6]
583
584 HIGH_GET_FILTERS_4
585
586 mov rsi, arg(0) ;src_ptr
587 mov rdi, arg(2) ;output_ptr
588
589 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
590 movsxd rdx, DWORD PTR arg(3) ;out_pitch
591 lea rax, [rax + rax] ;bytes per line
592 lea rdx, [rdx + rdx] ;destination bytes per line
593 movsxd rcx, DWORD PTR arg(4) ;output_height
594
595 .loop:
596 movdqu xmm0, [rsi - 6] ;load src
597 movdqu xmm4, [rsi + 2] ;same row, 4 samples further right
; copy the two loads so each tap gets its own register
598 movdqa xmm1, xmm0
599 movdqa xmm6, xmm4
600 movdqa xmm7, xmm4
601 movdqa xmm2, xmm0
602 movdqa xmm3, xmm0
603 movdqa xmm5, xmm4
604
; byte shifts of 2/4/6 move each copy 1/2/3 samples along the row, so xmmN
; starts at the sample for tap N
605 psrldq xmm1, 2
606 psrldq xmm6, 4
607 psrldq xmm7, 6
608 psrldq xmm2, 4
609 psrldq xmm3, 6
610 psrldq xmm5, 2
611
612 HIGH_APPLY_FILTER_4 0
613
614 lea rsi, [rsi + rax] ;next source row
615 lea rdi, [rdi + rdx] ;next destination row
616 dec rcx
617 jnz .loop
618
619 add rsp, 16 * 7 ;release the constant slots
620 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
621
622 ; begin epilog
623 pop rdi
624 pop rsi
625 RESTORE_XMM
626 UNSHADOW_ARGS
627 pop rbp
628 ret
629
630 ;void vp9_high_filter_block1d8_h8_sse2
631 ;(
632 ; unsigned short *src_ptr,
633 ; unsigned int src_pixels_per_line,
634 ; unsigned short *output_ptr,
635 ; unsigned int output_pitch,
636 ; unsigned int output_height,
637 ; short *filter,
; int bps
638 ;)
; NOTE(review): the parameter types above are inferred from how the code uses
; the arguments (16-bit samples, dword pitches/height/bps); confirm against
; the C prototype.
;
; Horizontal 8-tap filter, 8 samples wide. For each of output_height rows it
; loads eight overlapping 8-word windows, one per tap, starting 6 bytes
; (3 samples) before rsi and stepping 2 bytes (1 sample) each, then stores
; 8 clamped words to the destination row. Pitches are given in samples and
; doubled here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row (both loaded by HIGH_GET_FILTERS),
;   rax = source stride (bytes), rdx = destination stride (bytes),
;   rcx = rows remaining
639 global sym(vp9_high_filter_block1d8_h8_sse2) PRIVATE
640 sym(vp9_high_filter_block1d8_h8_sse2):
641 push rbp
642 mov rbp, rsp
643 SHADOW_ARGS_TO_STACK 7
644 SAVE_XMM 7
645 push rsi
646 push rdi
647 ; end prolog
648
649 ALIGN_STACK 16, rax
650 sub rsp, 16 * 8 ;eight 16-byte slots for the constants and scratch below
651 %define k0k1 [rsp + 16 * 0]
652 %define k6k7 [rsp + 16 * 1]
653 %define k2k5 [rsp + 16 * 2]
654 %define k3k4 [rsp + 16 * 3]
655 %define krd [rsp + 16 * 4]
656 %define temp [rsp + 16 * 5]
657 %define max [rsp + 16 * 6]
658 %define min [rsp + 16 * 7]
659
660 HIGH_GET_FILTERS
661
662 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
663 movsxd rdx, DWORD PTR arg(3) ;out_pitch
664 lea rax, [rax + rax] ;bytes per line
665 lea rdx, [rdx + rdx] ;destination bytes per line
666 movsxd rcx, DWORD PTR arg(4) ;output_height
667
668 .loop:
669 movdqu xmm0, [rsi - 6] ;load src
670 movdqu xmm1, [rsi - 4]
671 movdqu xmm2, [rsi - 2]
672 movdqu xmm3, [rsi]
673 movdqu xmm4, [rsi + 2]
674 movdqu xmm5, [rsi + 4]
675 movdqu xmm6, [rsi + 6]
676 movdqu xmm7, [rsi + 8]
677
678 HIGH_APPLY_FILTER_8 0, 0
679
680 lea rsi, [rsi + rax] ;next source row
681 lea rdi, [rdi + rdx] ;next destination row
682 dec rcx
683 jnz .loop
684
685 add rsp, 16 * 8 ;release the stack slots
686 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
687
688 ; begin epilog
689 pop rdi
690 pop rsi
691 RESTORE_XMM
692 UNSHADOW_ARGS
693 pop rbp
694 ret
695
696 ;void vp9_high_filter_block1d16_h8_sse2
697 ;(
698 ; unsigned short *src_ptr,
699 ; unsigned int src_pixels_per_line,
700 ; unsigned short *output_ptr,
701 ; unsigned int output_pitch,
702 ; unsigned int output_height,
703 ; short *filter,
; int bps
704 ;)
; NOTE(review): the parameter types above are inferred from how the code uses
; the arguments (16-bit samples, dword pitches/height/bps); confirm against
; the C prototype.
;
; Horizontal 8-tap filter, 16 samples wide, processed as two 8-sample halves
; per row. Each half loads eight overlapping 8-word windows, one per tap,
; stepping 2 bytes (1 sample) each; the second half uses the same offsets
; shifted by 16 bytes and writes at destination byte offset 16. Pitches are
; given in samples and doubled here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row (both loaded by HIGH_GET_FILTERS),
;   rax = source stride (bytes), rdx = destination stride (bytes),
;   rcx = rows remaining
705 global sym(vp9_high_filter_block1d16_h8_sse2) PRIVATE
706 sym(vp9_high_filter_block1d16_h8_sse2):
707 push rbp
708 mov rbp, rsp
709 SHADOW_ARGS_TO_STACK 7
710 SAVE_XMM 7
711 push rsi
712 push rdi
713 ; end prolog
714
715 ALIGN_STACK 16, rax
716 sub rsp, 16 * 8 ;eight 16-byte slots for the constants and scratch below
717 %define k0k1 [rsp + 16 * 0]
718 %define k6k7 [rsp + 16 * 1]
719 %define k2k5 [rsp + 16 * 2]
720 %define k3k4 [rsp + 16 * 3]
721 %define krd [rsp + 16 * 4]
722 %define temp [rsp + 16 * 5]
723 %define max [rsp + 16 * 6]
724 %define min [rsp + 16 * 7]
725
726 HIGH_GET_FILTERS
727
728 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
729 movsxd rdx, DWORD PTR arg(3) ;out_pitch
730 lea rax, [rax + rax] ;bytes per line
731 lea rdx, [rdx + rdx] ;destination bytes per line
732 movsxd rcx, DWORD PTR arg(4) ;output_height
733
734 .loop:
; left half: output samples 0..7
735 movdqu xmm0, [rsi - 6] ;load src
736 movdqu xmm1, [rsi - 4]
737 movdqu xmm2, [rsi - 2]
738 movdqu xmm3, [rsi]
739 movdqu xmm4, [rsi + 2]
740 movdqu xmm5, [rsi + 4]
741 movdqu xmm6, [rsi + 6]
742 movdqu xmm7, [rsi + 8]
743
744 HIGH_APPLY_FILTER_8 0, 0
745
; right half: same windows, 16 bytes (8 samples) further along the row
746 movdqu xmm0, [rsi + 10] ;load src
747 movdqu xmm1, [rsi + 12]
748 movdqu xmm2, [rsi + 14]
749 movdqu xmm3, [rsi + 16]
750 movdqu xmm4, [rsi + 18]
751 movdqu xmm5, [rsi + 20]
752 movdqu xmm6, [rsi + 22]
753 movdqu xmm7, [rsi + 24]
754
755 HIGH_APPLY_FILTER_8 0, 16
756
757 lea rsi, [rsi + rax] ;next source row
758 lea rdi, [rdi + rdx] ;next destination row
759 dec rcx
760 jnz .loop
761
762 add rsp, 16 * 8 ;release the stack slots
763 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
764
765 ; begin epilog
766 pop rdi
767 pop rsi
768 RESTORE_XMM
769 UNSHADOW_ARGS
770 pop rbp
771 ret
772
; vp9_high_filter_block1d4_h8_avg_sse2
; Horizontal 8-tap filter, 4 samples wide, averaging variant: each filtered
; and clamped result is averaged (pavgw) with the word already present in the
; destination before being stored.
; Arguments as read by the code: arg(0) source pointer, arg(1) source pitch,
; arg(2) destination pointer, arg(3) destination pitch, arg(4) output height,
; arg(5) filter pointer, arg(6) bps. Pitches are given in samples and doubled
; here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row, rax = source stride (bytes),
;   rdx = destination stride (bytes), rcx = rows remaining
773 global sym(vp9_high_filter_block1d4_h8_avg_sse2) PRIVATE
774 sym(vp9_high_filter_block1d4_h8_avg_sse2):
775 push rbp
776 mov rbp, rsp
777 SHADOW_ARGS_TO_STACK 7
778 SAVE_XMM 7
779 push rsi
780 push rdi
781 ; end prolog
782
783 ALIGN_STACK 16, rax
784 sub rsp, 16 * 7 ;seven 16-byte slots for the constants below
785 %define k0k6 [rsp + 16 * 0]
786 %define k2k5 [rsp + 16 * 1]
787 %define k3k4 [rsp + 16 * 2]
788 %define k1k7 [rsp + 16 * 3]
789 %define krd [rsp + 16 * 4]
790 %define max [rsp + 16 * 5]
791 %define min [rsp + 16 * 6]
792
793 HIGH_GET_FILTERS_4
794
795 mov rsi, arg(0) ;src_ptr
796 mov rdi, arg(2) ;output_ptr
797
798 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
799 movsxd rdx, DWORD PTR arg(3) ;out_pitch
800 lea rax, [rax + rax] ;bytes per line
801 lea rdx, [rdx + rdx] ;destination bytes per line
802 movsxd rcx, DWORD PTR arg(4) ;output_height
803
804 .loop:
805 movdqu xmm0, [rsi - 6] ;load src
806 movdqu xmm4, [rsi + 2] ;same row, 4 samples further right
; copy the two loads so each tap gets its own register
807 movdqa xmm1, xmm0
808 movdqa xmm6, xmm4
809 movdqa xmm7, xmm4
810 movdqa xmm2, xmm0
811 movdqa xmm3, xmm0
812 movdqa xmm5, xmm4
813
; byte shifts of 2/4/6 move each copy 1/2/3 samples along the row, so xmmN
; starts at the sample for tap N
814 psrldq xmm1, 2
815 psrldq xmm6, 4
816 psrldq xmm7, 6
817 psrldq xmm2, 4
818 psrldq xmm3, 6
819 psrldq xmm5, 2
820
821 HIGH_APPLY_FILTER_4 1
822
823 lea rsi, [rsi + rax] ;next source row
824 lea rdi, [rdi + rdx] ;next destination row
825 dec rcx
826 jnz .loop
827
828 add rsp, 16 * 7 ;release the constant slots
829 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
830
831 ; begin epilog
832 pop rdi
833 pop rsi
834 RESTORE_XMM
835 UNSHADOW_ARGS
836 pop rbp
837 ret
838
; vp9_high_filter_block1d8_h8_avg_sse2
; Horizontal 8-tap filter, 8 samples wide, averaging variant: each filtered
; and clamped result is averaged (pavgw) with the word already present in the
; destination before being stored.
; Arguments as read by the code: arg(0) source pointer, arg(1) source pitch,
; arg(2) destination pointer, arg(3) destination pitch, arg(4) output height,
; arg(5) filter pointer, arg(6) bps. Pitches are given in samples and doubled
; here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row (both loaded by HIGH_GET_FILTERS),
;   rax = source stride (bytes), rdx = destination stride (bytes),
;   rcx = rows remaining
839 global sym(vp9_high_filter_block1d8_h8_avg_sse2) PRIVATE
840 sym(vp9_high_filter_block1d8_h8_avg_sse2):
841 push rbp
842 mov rbp, rsp
843 SHADOW_ARGS_TO_STACK 7
844 SAVE_XMM 7
845 push rsi
846 push rdi
847 ; end prolog
848
849 ALIGN_STACK 16, rax
850 sub rsp, 16 * 8 ;eight 16-byte slots for the constants and scratch below
851 %define k0k1 [rsp + 16 * 0]
852 %define k6k7 [rsp + 16 * 1]
853 %define k2k5 [rsp + 16 * 2]
854 %define k3k4 [rsp + 16 * 3]
855 %define krd [rsp + 16 * 4]
856 %define temp [rsp + 16 * 5]
857 %define max [rsp + 16 * 6]
858 %define min [rsp + 16 * 7]
859
860 HIGH_GET_FILTERS
861
862 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
863 movsxd rdx, DWORD PTR arg(3) ;out_pitch
864 lea rax, [rax + rax] ;bytes per line
865 lea rdx, [rdx + rdx] ;destination bytes per line
866 movsxd rcx, DWORD PTR arg(4) ;output_height
867
868 .loop:
; eight overlapping 8-word windows, one per tap, each 1 sample further right
869 movdqu xmm0, [rsi - 6] ;load src
870 movdqu xmm1, [rsi - 4]
871 movdqu xmm2, [rsi - 2]
872 movdqu xmm3, [rsi]
873 movdqu xmm4, [rsi + 2]
874 movdqu xmm5, [rsi + 4]
875 movdqu xmm6, [rsi + 6]
876 movdqu xmm7, [rsi + 8]
877
878 HIGH_APPLY_FILTER_8 1, 0
879
880 lea rsi, [rsi + rax] ;next source row
881 lea rdi, [rdi + rdx] ;next destination row
882 dec rcx
883 jnz .loop
884
885 add rsp, 16 * 8 ;release the stack slots
886 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
887
888 ; begin epilog
889 pop rdi
890 pop rsi
891 RESTORE_XMM
892 UNSHADOW_ARGS
893 pop rbp
894 ret
895
; vp9_high_filter_block1d16_h8_avg_sse2
; Horizontal 8-tap filter, 16 samples wide, averaging variant, processed as
; two 8-sample halves per row. Each filtered and clamped result is averaged
; (pavgw) with the word already present in the destination before being
; stored.
; Arguments as read by the code: arg(0) source pointer, arg(1) source pitch,
; arg(2) destination pointer, arg(3) destination pitch, arg(4) output height,
; arg(5) filter pointer, arg(6) bps. Pitches are given in samples and doubled
; here to get byte strides.
; The loop tests its counter after the body, so output_height must be
; non-zero.
; Register use inside the loop:
;   rsi = source row, rdi = destination row (both loaded by HIGH_GET_FILTERS),
;   rax = source stride (bytes), rdx = destination stride (bytes),
;   rcx = rows remaining
896 global sym(vp9_high_filter_block1d16_h8_avg_sse2) PRIVATE
897 sym(vp9_high_filter_block1d16_h8_avg_sse2):
898 push rbp
899 mov rbp, rsp
900 SHADOW_ARGS_TO_STACK 7
901 SAVE_XMM 7
902 push rsi
903 push rdi
904 ; end prolog
905
906 ALIGN_STACK 16, rax
907 sub rsp, 16 * 8 ;eight 16-byte slots for the constants and scratch below
908 %define k0k1 [rsp + 16 * 0]
909 %define k6k7 [rsp + 16 * 1]
910 %define k2k5 [rsp + 16 * 2]
911 %define k3k4 [rsp + 16 * 3]
912 %define krd [rsp + 16 * 4]
913 %define temp [rsp + 16 * 5]
914 %define max [rsp + 16 * 6]
915 %define min [rsp + 16 * 7]
916
917 HIGH_GET_FILTERS
918
919 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
920 movsxd rdx, DWORD PTR arg(3) ;out_pitch
921 lea rax, [rax + rax] ;bytes per line
922 lea rdx, [rdx + rdx] ;destination bytes per line
923 movsxd rcx, DWORD PTR arg(4) ;output_height
924
925 .loop:
; left half: output samples 0..7
926 movdqu xmm0, [rsi - 6] ;load src
927 movdqu xmm1, [rsi - 4]
928 movdqu xmm2, [rsi - 2]
929 movdqu xmm3, [rsi]
930 movdqu xmm4, [rsi + 2]
931 movdqu xmm5, [rsi + 4]
932 movdqu xmm6, [rsi + 6]
933 movdqu xmm7, [rsi + 8]
934
935 HIGH_APPLY_FILTER_8 1, 0
936
; right half: same windows, 16 bytes (8 samples) further along the row
937 movdqu xmm0, [rsi + 10] ;load src
938 movdqu xmm1, [rsi + 12]
939 movdqu xmm2, [rsi + 14]
940 movdqu xmm3, [rsi + 16]
941 movdqu xmm4, [rsi + 18]
942 movdqu xmm5, [rsi + 20]
943 movdqu xmm6, [rsi + 22]
944 movdqu xmm7, [rsi + 24]
945
946 HIGH_APPLY_FILTER_8 1, 16
947
948 lea rsi, [rsi + rax] ;next source row
949 lea rdi, [rdi + rdx] ;next destination row
950 dec rcx
951 jnz .loop
952
953 add rsp, 16 * 8 ;release the stack slots
954 pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
955
956 ; begin epilog
957 pop rdi
958 pop rsi
959 RESTORE_XMM
960 UNSHADOW_ARGS
961 pop rbp
962 ret
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698