Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(592)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_ssse3.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 %define BLOCK_HEIGHT_WIDTH 4
15 %define VP9_FILTER_WEIGHT 128
16 %define VP9_FILTER_SHIFT 7
17
18
19 ;/************************************************************************************
20 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
21 ; input pixel array has output_height rows. This routine assumes that output_height is an
22 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
23 ; row each iteration to take advantage of the 128 bits operations.
24 ;
25 ; This is an implementation of some of the SSE optimizations first seen in ffvp8
26 ;
27 ;*************************************************************************************/
;-----------------------------------------------------------------------------
; void vp9_filter_block1d8_h6_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     unsigned int   output_pitch,
;     unsigned int   output_height,
;     unsigned int   vp9_filter_index
; )
;
; Horizontal filter: writes 8 bytes per row, one row per loop iteration,
; for output_height rows.  vp9_filter_index selects one 16-byte entry in each
; of the k0_k5 / k1_k3 / k2_k4 tables.  If the first dword of the selected
; k0_k5 entry is zero, the 4-tap variant (vp9_filter_block1d8_h4_ssse3,
; below) runs instead and skips the k0_k5 product.
;
; Per pixel: (sum of tap products + rd) >> VP9_FILTER_SHIFT, packed with
; unsigned saturation.
;
; Loop registers:
;   rsi = source row      rax = src_pixels_per_line
;   rdi = output row      rdx = output_pitch
;   rcx = rows remaining  xmm7 = rounding constant (rd)
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d8_h6_ssse3)
sym(vp9_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp9_filter_index
    xor         rsi, rsi                        ; esi = 0 for the compare below
    shl         rdx, 4                          ; index * 16 = byte offset of the entry

    movdqa      xmm7, [GLOBAL(rd)]

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 entry
    mov         rdi, arg(2)                     ; output_ptr

    cmp         esi, DWORD PTR [rax]            ; outer taps zero?
    je          vp9_filter_block1d8_h4_ssse3    ; yes: 4-tap path

    movdqa      xmm4, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; output_height

    movsxd      rdx, dword ptr arg(3)           ; output_pitch

    sub         rdi, rdx                        ; loop advances rdi before it stores
    ; xmm3 is unused on this path
.h6_next_row:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5

    movq        xmm2, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm2                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                      ; k0/k5 products

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
    pmaddubsw   xmm1, xmm5                      ; k2/k4 products

    lea         rdi, [rdi + rdx]                ; step to this row's output
    pmaddubsw   xmm2, xmm6                      ; k1/k3 products

    lea         rsi, [rsi + rax]                ; next source row
    dec         rcx                             ; ZF is consumed by the jnz below

    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm7                      ; + rounding

    paddsw      xmm0, xmm2

    psraw       xmm0, VP9_FILTER_SHIFT

    packuswb    xmm0, xmm0

    movq        MMWORD Ptr [rdi], xmm0          ; store 8 pixels
    jnz         .h6_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; 4-tap tail of vp9_filter_block1d8_h6_ssse3.  Entered by jump with:
;   rax  -> selected k0_k5 entry
;   rdi  =  output_ptr
;   xmm7 =  rounding constant (rd)
; Shares the caller's stack frame and performs the full epilog itself.
;-----------------------------------------------------------------------------
vp9_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)                     ; src_ptr

    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; output_height

    movsxd      rdx, dword ptr arg(3)           ; output_pitch

    sub         rdi, rdx                        ; loop advances rdi before it stores

.h4_next_row:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5

    movq        xmm1, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm1                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

    movdqa      xmm2, xmm0
    pshufb      xmm0, xmm3                      ; shuf2bfrom1

    pshufb      xmm2, xmm4                      ; shuf3bfrom1
    pmaddubsw   xmm0, xmm5                      ; k2/k4 products

    lea         rdi, [rdi + rdx]                ; step to this row's output
    pmaddubsw   xmm2, xmm6                      ; k1/k3 products

    lea         rsi, [rsi + rax]                ; next source row
    dec         rcx                             ; ZF is consumed by the jnz below

    paddsw      xmm0, xmm7                      ; + rounding

    paddsw      xmm0, xmm2

    psraw       xmm0, VP9_FILTER_SHIFT

    packuswb    xmm0, xmm0

    movq        MMWORD Ptr [rdi], xmm0          ; store 8 pixels

    jnz         .h4_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; void vp9_filter_block1d16_h6_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     unsigned int   output_pitch,
;     unsigned int   output_height,
;     unsigned int   vp9_filter_index
; )
;
; Horizontal 6-tap filter: 16 output bytes per row, computed as two 8-pixel
; halves and stored with a single movdqa.  Unlike the 8- and 4-wide
; variants, there is no separate 4-tap path here.
;
; NOTE(review): the movdqa store requires rdi to stay 16-byte aligned, so
; output_ptr and output_pitch are presumably multiples of 16 - confirm
; against callers.
;
; Loop registers:
;   rsi = source row      rax = src_pixels_per_line
;   rdi = output row      rdx = output_pitch
;   rcx = rows remaining
;   xmm4 = k0_k5, xmm5 = k2_k4, xmm6 = k1_k3
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d16_h6_ssse3)
sym(vp9_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp9_filter_index
    xor         rsi, rsi                        ; (rsi is reloaded below before any use)
    shl         rdx, 4                          ; index * 16 = byte offset of the entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 entry

    mov         rdi, arg(2)                     ; output_ptr

    mov         rsi, arg(0)                     ; src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3

    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; output_height
    movsxd      rdx, dword ptr arg(3)           ; output_pitch

.next_row:
    ; ---- left 8 pixels (result in xmm0), interleaved with the loads and
    ;      shuffles for the right 8 pixels (result in xmm3) ----
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5

    movq        xmm3, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm3                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                      ; k0/k5 products

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
    movq        xmm3, MMWORD PTR [rsi + 6]      ; right half: pixels 6..13

    pmaddubsw   xmm1, xmm5                      ; k2/k4 products
    movq        xmm7, MMWORD PTR [rsi + 11]     ; right half: pixels 11..18

    pmaddubsw   xmm2, xmm6                      ; k1/k3 products
    punpcklbw   xmm3, xmm7

    paddsw      xmm0, xmm1
    movdqa      xmm1, xmm3

    pmaddubsw   xmm3, xmm4                      ; right half k0/k5 products
    paddsw      xmm0, xmm2

    movdqa      xmm2, xmm1
    paddsw      xmm0, [GLOBAL(rd)]              ; + rounding

    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

    psraw       xmm0, VP9_FILTER_SHIFT
    pmaddubsw   xmm1, xmm5                      ; right half k2/k4 products

    pmaddubsw   xmm2, xmm6                      ; right half k1/k3 products
    packuswb    xmm0, xmm0

    lea         rsi, [rsi + rax]                ; next source row
    paddsw      xmm3, xmm1

    paddsw      xmm3, xmm2

    paddsw      xmm3, [GLOBAL(rd)]              ; + rounding

    psraw       xmm3, VP9_FILTER_SHIFT

    packuswb    xmm3, xmm3

    punpcklqdq  xmm0, xmm3                      ; low qword = left 8, high qword = right 8

    movdqa      XMMWORD Ptr [rdi], xmm0         ; store 16 pixels (aligned)

    lea         rdi, [rdi + rdx]                ; next output row
    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
277
;-----------------------------------------------------------------------------
; void vp9_filter_block1d4_h6_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     unsigned int   output_pitch,
;     unsigned int   output_height,
;     unsigned int   vp9_filter_index
; )
;
; Horizontal filter: writes 4 bytes per row, one row per loop iteration,
; for output_height rows.  vp9_filter_index selects one 16-byte entry in each
; of the k0_k5 / k1_k3 / k2_k4 tables.  If the first dword of the selected
; k0_k5 entry is zero, the 4-tap path (.h4_path) runs and skips the k0_k5
; product.
;
; Each row is loaded with one unaligned 16-byte read at [rsi - 2]; the tap
; pairs are then gathered with pshufb (shuf1b / shuf2b / shuf3b).
;
; Both exit paths must run RESTORE_XMM to balance the SAVE_XMM 7 in the
; prolog.  The 6-tap exit previously omitted it, unlike the 4-tap exit and
; every other SAVE_XMM user in this file.
;
; Loop registers:
;   rsi = source row      rax = src_pixels_per_line
;   rdi = output row      rdx = output_pitch
;   rcx = rows remaining  xmm7 = rounding constant (rd)
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d4_h6_ssse3)
sym(vp9_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp9_filter_index
    xor         rsi, rsi                        ; esi = 0 for the compare below
    shl         rdx, 4                          ; index * 16 = byte offset of the entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 entry
    movdqa      xmm7, [GLOBAL(rd)]

    cmp         esi, DWORD PTR [rax]            ; outer taps zero?
    je          .h4_path                        ; yes: 4-tap path

    movdqa      xmm4, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; output_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; output_height

    movsxd      rdx, dword ptr arg(3)           ; output_pitch

    ; xmm3 is unused on this path
.h6_next_row:
    movdqu      xmm0, XMMWORD PTR [rsi - 2]     ; pixels -2 .. 13

    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf1b)]          ; pairs for k0/k5

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]          ; pairs for k2/k4
    pmaddubsw   xmm0, xmm4
    pshufb      xmm2, [GLOBAL(shuf3b)]          ; pairs for k1/k3
    pmaddubsw   xmm1, xmm5

    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]                ; next source row

    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7                      ; + rounding
    ; (a dead "pxor xmm1, xmm1" stood here; xmm1 is overwritten at the top
    ;  of the next iteration and never read after the loop)
    paddsw      xmm0, xmm2
    psraw       xmm0, VP9_FILTER_SHIFT
    packuswb    xmm0, xmm0

    movd        DWORD PTR [rdi], xmm0           ; store 4 pixels

    add         rdi, rdx                        ; next output row
    dec         rcx
    jnz         .h6_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM                                 ; pairs with SAVE_XMM 7 in the prolog
    UNSHADOW_ARGS
    pop         rbp
    ret

    ; ---- 4-tap path: rax -> selected k0_k5 entry, xmm7 = rd ----
.h4_path:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; output_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; output_height

    movsxd      rdx, dword ptr arg(3)           ; output_pitch

.h4_next_row:
    movdqu      xmm1, XMMWORD PTR [rsi - 2]     ; pixels -2 .. 13

    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0                      ; shuf2b: pairs for k2/k4
    pshufb      xmm2, xmm3                      ; shuf3b: pairs for k1/k3
    pmaddubsw   xmm1, xmm5

    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]                ; next source row

    paddsw      xmm1, xmm7                      ; + rounding
    paddsw      xmm1, xmm2
    psraw       xmm1, VP9_FILTER_SHIFT
    packuswb    xmm1, xmm1

    movd        DWORD PTR [rdi], xmm1           ; store 4 pixels

    add         rdi, rdx                        ; next output row
    dec         rcx
    jnz         .h4_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
404
405
406
;-----------------------------------------------------------------------------
; void vp9_filter_block1d16_v6_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     unsigned int   vp9_filter_index
; )
;
; Vertical filter over a 16-pixel-wide column, one output row per iteration.
; Each output row reads six consecutive source rows A..F, where A = [rsi]
; and F = [rsi + 5 * src_pitch].  If the first dword of the selected k0_k5
; entry is zero, the 4-tap path (.v4_path) runs and uses rows B..E only.
;
; The 6-tap path stores each row as two movq halves.  The 4-tap path stores
; with one movdqa, so on that path rdi must stay 16-byte aligned.
; NOTE(review): confirm callers guarantee that alignment.
;
; Loop registers:
;   rsi = row A           rax = rsi + src_pitch (row B)
;   rdx = src_pitch       rdi = output row
;   rcx = rows remaining  r8  = out_pitch (64-bit ABIs only)
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d16_v6_ssse3)
sym(vp9_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp9_filter_index
    xor         rsi, rsi                        ; esi = 0 for the compare below
    shl         rdx, 4                          ; index * 16 = byte offset of the entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 entry

    cmp         esi, DWORD PTR [rax]            ; outer taps zero?
    je          .v4_path                        ; yes: 4-tap path

    movdqa      xmm5, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    mov         rdi, arg(2)                     ; output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
    add         rax, rdx                        ; rax = row B


.v6_next_row:
    ; ---- columns 0..7 ----
    movq        xmm1, MMWORD PTR [rsi]                  ; A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                      ; B D
    punpcklbw   xmm3, xmm0                      ; C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ; F

    pmaddubsw   xmm3, xmm6                      ; k2/k4
    punpcklbw   xmm1, xmm0                      ; A F
    pmaddubsw   xmm2, xmm7                      ; k1/k3
    pmaddubsw   xmm1, xmm5                      ; k0/k5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]              ; + rounding
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2          ; store columns 0..7

    ; ---- columns 8..15 ----
    movq        xmm1, MMWORD PTR [rsi + 8]              ; A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ; E

    punpcklbw   xmm2, xmm4                      ; B D
    punpcklbw   xmm3, xmm0                      ; C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ; F
    pmaddubsw   xmm3, xmm6                      ; k2/k4
    punpcklbw   xmm1, xmm0                      ; A F
    pmaddubsw   xmm2, xmm7                      ; k1/k3
    pmaddubsw   xmm1, xmm5                      ; k0/k5

    add         rsi, rdx                        ; slide the six-row window down
    add         rax, rdx

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]              ; + rounding
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2        ; store columns 8..15

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v6_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

    ; ---- 4-tap path: rax -> selected k0_k5 entry ----
.v4_path:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    mov         rdi, arg(2)                     ; output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
    add         rax, rdx                        ; rax = row B

.v4_next_row:
    ; ---- columns 0..7 ----
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                      ; B D
    punpcklbw   xmm3, xmm0                      ; C E

    pmaddubsw   xmm3, xmm6                      ; k2/k4
    pmaddubsw   xmm2, xmm7                      ; k1/k3

    ; ---- columns 8..15 (loads issued early) ----
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ; B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ; E

    paddsw      xmm2, [GLOBAL(rd)]              ; + rounding
    paddsw      xmm2, xmm3
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm2

    punpcklbw   xmm5, xmm4                      ; B D
    punpcklbw   xmm1, xmm0                      ; C E

    pmaddubsw   xmm1, xmm6                      ; k2/k4
    pmaddubsw   xmm5, xmm7                      ; k1/k3

    movdqa      xmm4, [GLOBAL(rd)]
    add         rsi, rdx                        ; slide the row window down
    add         rax, rdx

    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4                      ; + rounding
    psraw       xmm5, VP9_FILTER_SHIFT
    packuswb    xmm5, xmm5

    punpcklqdq  xmm2, xmm5                      ; low qword = cols 0..7, high = cols 8..15

    movdqa      XMMWORD PTR [rdi], xmm2         ; store 16 pixels (aligned)

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v4_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
594
;-----------------------------------------------------------------------------
; void vp9_filter_block1d8_v6_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     unsigned int   vp9_filter_index
; )
;
; Vertical filter over an 8-pixel-wide column, one output row (8 bytes) per
; iteration.  Each output row reads six consecutive source rows A..F, where
; A = [rsi] and F = [rsi + 5 * src_pitch].  If the first dword of the
; selected k0_k5 entry is zero, the 4-tap path (.v4_path) runs and uses
; rows B..E only.
;
; rdx, rdi, r8 and rcx are set up before the branch and shared by both paths.
;
; Loop registers:
;   rsi = row A           rax = rsi + src_pitch (row B)
;   rdx = src_pitch       rdi = output row
;   rcx = rows remaining  r8  = out_pitch (64-bit ABIs only)
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d8_v6_ssse3)
sym(vp9_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp9_filter_index
    xor         rsi, rsi                        ; esi = 0 for the compare below
    shl         rdx, 4                          ; index * 16 = byte offset of the entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 entry

    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    mov         rdi, arg(2)                     ; output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ; output_height

    cmp         esi, DWORD PTR [rax]            ; outer taps zero?
    je          .v4_path                        ; yes: 4-tap path

    movdqa      xmm5, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr

    mov         rax, rsi
    add         rax, rdx                        ; rax = row B

.v6_next_row:
    movq        xmm1, MMWORD PTR [rsi]                  ; A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                      ; B D
    punpcklbw   xmm3, xmm0                      ; C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ; F
    movdqa      xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm3, xmm6                      ; k2/k4
    punpcklbw   xmm1, xmm0                      ; A F
    pmaddubsw   xmm2, xmm7                      ; k1/k3
    pmaddubsw   xmm1, xmm5                      ; k0/k5
    add         rsi, rdx                        ; slide the six-row window down
    add         rax, rdx

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4                      ; + rounding
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2          ; store 8 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v6_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

    ; ---- 4-tap path: rax -> selected k0_k5 entry; rdx/rdi/r8/rcx already set ----
.v4_path:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm5, [GLOBAL(rd)]

    mov         rsi, arg(0)                     ; src_ptr

    mov         rax, rsi
    add         rax, rdx                        ; rax = row B

.v4_next_row:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                      ; B D
    punpcklbw   xmm3, xmm0                      ; C E

    pmaddubsw   xmm3, xmm6                      ; k2/k4
    pmaddubsw   xmm2, xmm7                      ; k1/k3
    add         rsi, rdx                        ; slide the row window down
    add         rax, rdx

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5                      ; + rounding
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2          ; store 8 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v4_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; void vp9_filter_block1d4_v6_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     unsigned int   vp9_filter_index
; )
;
; Vertical filter over a 4-pixel-wide column, one output row (4 bytes) per
; iteration.  Same row layout as the 8- and 16-wide vertical filters
; (A = [rsi] .. F = [rsi + 5 * src_pitch]), but computed in the 64-bit MMX
; registers mm0..mm7.  No XMM register is touched, so the prolog and epilog
; carry no SAVE_XMM / RESTORE_XMM.
;
; If the first dword of the selected k0_k5 entry is zero, the 4-tap path
; (.v4_path) runs and uses rows B..E only.
;
; NOTE(review): this routine never executes emms after using MMX state;
; presumably callers clear it before any x87 use - confirm.
;
; Loop registers:
;   rsi = row A           rax = rsi + src_pitch (row B)
;   rdx = src_pitch       rdi = output row
;   rcx = rows remaining  r8  = out_pitch (64-bit ABIs only)
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d4_v6_ssse3)
sym(vp9_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp9_filter_index
    xor         rsi, rsi                        ; esi = 0 for the compare below
    shl         rdx, 4                          ; index * 16 = byte offset of the entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 entry

    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    mov         rdi, arg(2)                     ; output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ; output_height

    cmp         esi, DWORD PTR [rax]            ; outer taps zero?
    je          .v4_path                        ; yes: 4-tap path

    movq        mm5, MMWORD PTR [rax]           ; k0_k5 (low 8 bytes of the entry)
    movq        mm6, MMWORD PTR [rax+256]       ; k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr

    mov         rax, rsi
    add         rax, rdx                        ; rax = row B

.v6_next_row:
    movd        mm1, DWORD PTR [rsi]                    ; A
    movd        mm2, DWORD PTR [rsi + rdx]              ; B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ; C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ; D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ; E

    punpcklbw   mm2, mm4                        ; B D
    punpcklbw   mm3, mm0                        ; C E

    movd        mm0, DWORD PTR [rax + rdx * 4]          ; F

    movq        mm4, [GLOBAL(rd)]

    pmaddubsw   mm3, mm6                        ; k2/k4
    punpcklbw   mm1, mm0                        ; A F
    pmaddubsw   mm2, mm7                        ; k1/k3
    pmaddubsw   mm1, mm5                        ; k0/k5
    add         rsi, rdx                        ; slide the six-row window down
    add         rax, rdx

    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4                        ; + rounding
    psraw       mm2, VP9_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2            ; store 4 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v6_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

    ; ---- 4-tap path: rax -> selected k0_k5 entry; rdx/rdi/r8/rcx already set ----
.v4_path:
    movq        mm6, MMWORD PTR [rax+256]       ; k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ; k1_k3
    movq        mm5, MMWORD PTR [GLOBAL(rd)]

    mov         rsi, arg(0)                     ; src_ptr

    mov         rax, rsi
    add         rax, rdx                        ; rax = row B

.v4_next_row:
    movd        mm2, DWORD PTR [rsi + rdx]              ; B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ; C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ; D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ; E

    punpcklbw   mm2, mm4                        ; B D
    punpcklbw   mm3, mm0                        ; C E

    pmaddubsw   mm3, mm6                        ; k2/k4
    pmaddubsw   mm2, mm7                        ; k1/k3
    add         rsi, rdx                        ; slide the row window down
    add         rax, rdx

    paddsw      mm2, mm3
    paddsw      mm2, mm5                        ; + rounding
    psraw       mm2, VP9_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2            ; store 4 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v4_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
873
;-----------------------------------------------------------------------------
; void vp9_bilinear_predict16x16_ssse3
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; 16x16 bilinear prediction.  xoffset / yoffset each select a 16-byte entry
; of bilinear_filters_ssse3.  Three code paths:
;   both offsets non-zero : horizontal pass then vertical pass, row by row
;   xoffset == 0          : vertical pass only   (.sp_only)
;   yoffset == 0          : horizontal pass only (.fp_only)
; Every pass computes (a*f0 + b*f1 + rd) >> VP9_FILTER_SHIFT and packs with
; unsigned saturation.
;
; All paths store with movdqa, and each loop ends when rdi reaches
; dst_ptr + 16 * dst_pitch (held in rcx).
; NOTE(review): the movdqa stores need dst rows 16-byte aligned - confirm
; against callers.
;-----------------------------------------------------------------------------
global sym(vp9_bilinear_predict16x16_ssse3)
sym(vp9_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx, [GLOBAL(bilinear_filters_ssse3)]
    movsxd      rax, dword ptr arg(2)           ; xoffset

    test        rax, rax                        ; xoffset == 0: no horizontal pass
    jz          .sp_only

    shl         rax, 4
    lea         rax, [rax + rcx]                ; HFilter

    mov         rdi, arg(4)                     ; dst_ptr
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, dword ptr arg(5)           ; dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = HFilter

    movsxd      rax, dword ptr arg(3)           ; yoffset

    test        rax, rax                        ; yoffset == 0: no vertical pass
    jz          .fp_only

    shl         rax, 4
    lea         rax, [rax + rcx]                ; VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end)
    movsxd      rdx, dword ptr arg(1)           ; src_pixels_per_line

    movdqa      xmm2, [rax]                     ; xmm2 = VFilter

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)            ; dst_pitch
%endif
    ; Horizontal pass on source row 0; result kept in xmm7.
    movq        xmm3, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]                   ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm3, xmm5                      ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    movq        xmm4, [rsi+8]                   ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]                   ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rdx]                ; next line

    pmaddubsw   xmm3, xmm1                      ; columns 0..7

    punpcklbw   xmm4, xmm5                      ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    pmaddubsw   xmm4, xmm1                      ; columns 8..15

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP9_FILTER_SHIFT          ; xmm3 /= 128

    paddw       xmm4, [GLOBAL(rd)]              ; xmm4 += round value
    psraw       xmm4, VP9_FILTER_SHIFT          ; xmm4 /= 128

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4                      ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

.both_next_row:
    ; Horizontal pass on the next source row -> xmm6.
    movq        xmm6, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]                   ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm6, xmm5
    movq        xmm4, [rsi+8]                   ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]                   ; 09 10 11 12 13 14 15 16
    lea         rsi, [rsi + rdx]                ; next line

    pmaddubsw   xmm6, xmm1

    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, xmm1

    paddw       xmm6, [GLOBAL(rd)]              ; xmm6 += round value
    psraw       xmm6, VP9_FILTER_SHIFT          ; xmm6 /= 128

    paddw       xmm4, [GLOBAL(rd)]              ; xmm4 += round value
    psraw       xmm4, VP9_FILTER_SHIFT          ; xmm4 /= 128

    packuswb    xmm6, xmm4
    movdqa      xmm5, xmm7

    ; Vertical pass between the previous row (xmm7) and this row (xmm6).
    punpcklbw   xmm5, xmm6
    pmaddubsw   xmm5, xmm2

    punpckhbw   xmm7, xmm6
    pmaddubsw   xmm7, xmm2

    paddw       xmm5, [GLOBAL(rd)]              ; xmm5 += round value
    psraw       xmm5, VP9_FILTER_SHIFT          ; xmm5 /= 128

    paddw       xmm7, [GLOBAL(rd)]              ; xmm7 += round value
    psraw       xmm7, VP9_FILTER_SHIFT          ; xmm7 /= 128

    packuswb    xmm5, xmm7
    movdqa      xmm7, xmm6                      ; this row is the next iteration's previous row

    movdqa      [rdi], xmm5                     ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)           ; dst_pitch
%else
    add         rdi, r8
%endif

    cmp         rdi, rcx
    jne         .both_next_row

    jmp         .epilog

    ; ---- vertical pass only (xoffset == 0); rcx still -> filter table ----
.sp_only:
    movsxd      rax, dword ptr arg(3)           ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]                ; VFilter

    mov         rdi, arg(4)                     ; dst_ptr
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, dword ptr arg(5)           ; dst_pitch

    movdqa      xmm1, [rax]                     ; VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end)
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line

    ; Preload row 0: columns 0..7 in xmm4, columns 8..15 in xmm2.
    movq        xmm4, [rsi]                     ; load row 0
    movq        xmm2, [rsi + 8]                 ; load row 0

    lea         rsi, [rsi + rax]                ; next line
.sp_next_row:
    ; Two output rows per iteration.
    movq        xmm3, [rsi]                     ; load row + 1
    movq        xmm5, [rsi + 8]                 ; load row + 1

    punpcklbw   xmm4, xmm3
    punpcklbw   xmm2, xmm5

    pmaddubsw   xmm4, xmm1
    movq        xmm7, [rsi + rax]               ; load row + 2

    pmaddubsw   xmm2, xmm1
    movq        xmm6, [rsi + rax + 8]           ; load row + 2

    punpcklbw   xmm3, xmm7
    punpcklbw   xmm5, xmm6

    pmaddubsw   xmm3, xmm1
    paddw       xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm5, xmm1
    paddw       xmm2, [GLOBAL(rd)]

    psraw       xmm4, VP9_FILTER_SHIFT
    psraw       xmm2, VP9_FILTER_SHIFT

    packuswb    xmm4, xmm2
    paddw       xmm3, [GLOBAL(rd)]

    movdqa      [rdi], xmm4                     ; store row 0
    paddw       xmm5, [GLOBAL(rd)]

    psraw       xmm3, VP9_FILTER_SHIFT
    psraw       xmm5, VP9_FILTER_SHIFT

    packuswb    xmm3, xmm5
    movdqa      xmm4, xmm7                      ; row + 2 becomes next iteration's row 0

    movdqa      [rdi + rdx],xmm3                ; store row 1
    lea         rsi, [rsi + 2*rax]

    movdqa      xmm2, xmm6
    lea         rdi, [rdi + 2*rdx]

    cmp         rdi, rcx
    jne         .sp_next_row

    jmp         .epilog

    ; ---- horizontal pass only (yoffset == 0); rsi/rdi/rdx/xmm1 already set ----
.fp_only:
    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end)
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line

.fp_next_row:
    ; Two output rows per iteration.
    movq        xmm2, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm4, [rsi+1]                   ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm2, xmm4
    movq        xmm3, [rsi+8]                   ; 08 09 10 11 12 13 14 15

    pmaddubsw   xmm2, xmm1
    movq        xmm4, [rsi+9]                   ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rax]                ; next line
    punpcklbw   xmm3, xmm4

    pmaddubsw   xmm3, xmm1
    movq        xmm5, [rsi]

    paddw       xmm2, [GLOBAL(rd)]
    movq        xmm7, [rsi+1]

    movq        xmm6, [rsi+8]
    psraw       xmm2, VP9_FILTER_SHIFT

    punpcklbw   xmm5, xmm7
    movq        xmm7, [rsi+9]

    paddw       xmm3, [GLOBAL(rd)]
    pmaddubsw   xmm5, xmm1

    psraw       xmm3, VP9_FILTER_SHIFT
    punpcklbw   xmm6, xmm7

    packuswb    xmm2, xmm3
    pmaddubsw   xmm6, xmm1

    movdqa      [rdi], xmm2                     ; store the results in the destination
    paddw       xmm5, [GLOBAL(rd)]

    lea         rdi, [rdi + rdx]                ; dst_pitch
    psraw       xmm5, VP9_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm6, VP9_FILTER_SHIFT

    packuswb    xmm5, xmm6
    lea         rsi, [rsi + rax]                ; next line

    movdqa      [rdi], xmm5                     ; store the results in the destination
    lea         rdi, [rdi + rdx]                ; dst_pitch

    cmp         rdi, rcx

    jne         .fp_next_row

.epilog:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1136
;-----------------------------------------------------------------------------
; void vp9_bilinear_predict8x8_ssse3
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; 8x8 bilinear prediction.  Nine source rows are first copied (16 bytes
; each, unaligned loads) into a 144-byte aligned scratch area on the stack;
; all filtering then reads from that copy.  Three code paths:
;   both offsets non-zero : horizontal pass then vertical pass, row by row
;   xoffset == 0          : vertical pass only   (.sp_only)
;   yoffset == 0          : horizontal pass only (.fp_only)
;
; rsp itself is used as the read cursor through the scratch area.  Every
; path advances rsp by exactly 144 bytes in total before reaching .epilog,
; where "pop rsp" reloads the stack pointer.
; NOTE(review): this relies on ALIGN_STACK (defined in x86_abi_support.asm,
; not visible here) having pushed the pre-alignment rsp - confirm.
;-----------------------------------------------------------------------------
global sym(vp9_bilinear_predict8x8_ssse3)
sym(vp9_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                        ; reserve 144 bytes (9 rows * 16)

    lea         rcx, [GLOBAL(bilinear_filters_ssse3)]

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, dword ptr arg(1)           ; src_pixels_per_line

    ; Read 9 lines of unaligned data and put them on the stack.  This gives
    ; a big performance boost.
    movdqu      xmm0, [rsi]                     ; row 0
    lea         rax, [rdx + rdx*2]              ; rax = 3 * stride
    movdqu      xmm1, [rsi+rdx]                 ; row 1
    movdqu      xmm2, [rsi+rdx*2]               ; row 2
    add         rsi, rax
    movdqu      xmm3, [rsi]                     ; row 3
    movdqu      xmm4, [rsi+rdx]                 ; row 4
    movdqu      xmm5, [rsi+rdx*2]               ; row 5
    add         rsi, rax
    movdqu      xmm6, [rsi]                     ; row 6
    movdqu      xmm7, [rsi+rdx]                 ; row 7

    movdqa      XMMWORD PTR [rsp], xmm0

    movdqu      xmm0, [rsi+rdx*2]               ; row 8 (xmm0 reused)

    movdqa      XMMWORD PTR [rsp+16], xmm1
    movdqa      XMMWORD PTR [rsp+32], xmm2
    movdqa      XMMWORD PTR [rsp+48], xmm3
    movdqa      XMMWORD PTR [rsp+64], xmm4
    movdqa      XMMWORD PTR [rsp+80], xmm5
    movdqa      XMMWORD PTR [rsp+96], xmm6
    movdqa      XMMWORD PTR [rsp+112], xmm7
    movdqa      XMMWORD PTR [rsp+128], xmm0

    movsxd      rax, dword ptr arg(2)           ; xoffset
    test        rax, rax                        ; xoffset == 0: no horizontal pass
    jz          .sp_only

    shl         rax, 4
    add         rax, rcx                        ; HFilter

    mov         rdi, arg(4)                     ; dst_ptr
    movsxd      rdx, dword ptr arg(5)           ; dst_pitch

    movdqa      xmm0, [rax]                     ; xmm0 = HFilter

    movsxd      rax, dword ptr arg(3)           ; yoffset
    test        rax, rax                        ; yoffset == 0: no vertical pass
    jz          .fp_only

    shl         rax, 4
    lea         rax, [rax + rcx]                ; VFilter

    lea         rcx, [rdi+rdx*8]                ; rcx = dst_ptr + 8 * dst_pitch (end)

    movdqa      xmm1, [rax]                     ; xmm1 = VFilter

    ; Horizontal pass on row 0; result kept in xmm7.
    movdqa      xmm3, [rsp]                     ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm5, xmm3                      ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx

    psrldq      xmm5, 1
    lea         rsp, [rsp + 16]                 ; next line

    punpcklbw   xmm3, xmm5                      ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    pmaddubsw   xmm3, xmm0                      ; 00 02 04 06 08 10 12 14

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP9_FILTER_SHIFT          ; xmm3 /= 128

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm7                      ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

.both_next_row:
    ; Horizontal pass on the next stacked row -> xmm6.
    movdqa      xmm6, [rsp]                     ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    lea         rsp, [rsp + 16]                 ; next line

    movdqa      xmm5, xmm6

    psrldq      xmm5, 1

    punpcklbw   xmm6, xmm5
    pmaddubsw   xmm6, xmm0

    paddw       xmm6, [GLOBAL(rd)]              ; xmm6 += round value
    psraw       xmm6, VP9_FILTER_SHIFT          ; xmm6 /= 128

    packuswb    xmm6, xmm6

    ; Vertical pass between the previous row (xmm7) and this row (xmm6).
    punpcklbw   xmm7, xmm6
    pmaddubsw   xmm7, xmm1

    paddw       xmm7, [GLOBAL(rd)]              ; xmm7 += round value
    psraw       xmm7, VP9_FILTER_SHIFT          ; xmm7 /= 128

    packuswb    xmm7, xmm7

    movq        [rdi], xmm7                     ; store the results in the destination
    lea         rdi, [rdi + rdx]

    movdqa      xmm7, xmm6                      ; this row is the next iteration's previous row

    cmp         rdi, rcx
    jne         .both_next_row

    jmp         .epilog

    ; ---- vertical pass only (xoffset == 0); rcx still -> filter table ----
.sp_only:
    movsxd      rax, dword ptr arg(3)           ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]                ; VFilter

    mov         rdi, arg(4)                     ; dst_ptr
    movsxd      rdx, dword ptr arg(5)           ; dst_pitch

    movdqa      xmm0, [rax]                     ; VFilter

    ; Fully unrolled: interleave each stacked row with the one below it.
    movq        xmm1, XMMWORD PTR [rsp]         ; row 0
    movq        xmm2, XMMWORD PTR [rsp+16]      ; row 1

    movq        xmm3, XMMWORD PTR [rsp+32]      ; row 2
    punpcklbw   xmm1, xmm2

    movq        xmm4, XMMWORD PTR [rsp+48]      ; row 3
    punpcklbw   xmm2, xmm3

    movq        xmm5, XMMWORD PTR [rsp+64]      ; row 4
    punpcklbw   xmm3, xmm4

    movq        xmm6, XMMWORD PTR [rsp+80]      ; row 5
    punpcklbw   xmm4, xmm5

    movq        xmm7, XMMWORD PTR [rsp+96]      ; row 6
    punpcklbw   xmm5, xmm6

    pmaddubsw   xmm1, xmm0
    pmaddubsw   xmm2, xmm0

    pmaddubsw   xmm3, xmm0
    pmaddubsw   xmm4, xmm0

    pmaddubsw   xmm5, xmm0
    punpcklbw   xmm6, xmm7

    pmaddubsw   xmm6, xmm0
    paddw       xmm1, [GLOBAL(rd)]

    paddw       xmm2, [GLOBAL(rd)]
    psraw       xmm1, VP9_FILTER_SHIFT

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm2, VP9_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm3, VP9_FILTER_SHIFT

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm4, VP9_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm5, VP9_FILTER_SHIFT

    psraw       xmm6, VP9_FILTER_SHIFT
    packuswb    xmm1, xmm1

    packuswb    xmm2, xmm2
    movq        [rdi], xmm1                     ; output row 0

    packuswb    xmm3, xmm3
    movq        [rdi+rdx], xmm2                 ; output row 1

    packuswb    xmm4, xmm4
    movq        xmm1, XMMWORD PTR [rsp+112]     ; row 7

    lea         rdi, [rdi + 2*rdx]
    movq        xmm2, XMMWORD PTR [rsp+128]     ; row 8

    packuswb    xmm5, xmm5
    movq        [rdi], xmm3                     ; output row 2

    packuswb    xmm6, xmm6
    movq        [rdi+rdx], xmm4                 ; output row 3

    lea         rdi, [rdi + 2*rdx]
    punpcklbw   xmm7, xmm1

    movq        [rdi], xmm5                     ; output row 4
    pmaddubsw   xmm7, xmm0

    movq        [rdi+rdx], xmm6                 ; output row 5
    punpcklbw   xmm1, xmm2

    pmaddubsw   xmm1, xmm0
    paddw       xmm7, [GLOBAL(rd)]

    psraw       xmm7, VP9_FILTER_SHIFT
    paddw       xmm1, [GLOBAL(rd)]

    psraw       xmm1, VP9_FILTER_SHIFT
    packuswb    xmm7, xmm7

    packuswb    xmm1, xmm1
    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm7                     ; output row 6

    movq        [rdi+rdx], xmm1                 ; output row 7
    lea         rsp, [rsp + 144]                ; release the scratch area

    jmp         .epilog

    ; ---- horizontal pass only (yoffset == 0); rdi/rdx/xmm0 already set ----
.fp_only:
    lea         rcx, [rdi+rdx*8]                ; rcx = dst_ptr + 8 * dst_pitch (end)

.fp_next_rows:
    ; Four output rows per iteration.
    movdqa      xmm1, XMMWORD PTR [rsp]
    movdqa      xmm3, XMMWORD PTR [rsp+16]

    movdqa      xmm2, xmm1
    movdqa      xmm5, XMMWORD PTR [rsp+32]

    psrldq      xmm2, 1
    movdqa      xmm7, XMMWORD PTR [rsp+48]

    movdqa      xmm4, xmm3
    psrldq      xmm4, 1

    movdqa      xmm6, xmm5
    psrldq      xmm6, 1

    punpcklbw   xmm1, xmm2
    pmaddubsw   xmm1, xmm0

    punpcklbw   xmm3, xmm4
    pmaddubsw   xmm3, xmm0

    punpcklbw   xmm5, xmm6
    pmaddubsw   xmm5, xmm0

    movdqa      xmm2, xmm7
    psrldq      xmm2, 1

    punpcklbw   xmm7, xmm2
    pmaddubsw   xmm7, xmm0

    paddw       xmm1, [GLOBAL(rd)]
    psraw       xmm1, VP9_FILTER_SHIFT

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm3, VP9_FILTER_SHIFT

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm5, VP9_FILTER_SHIFT

    paddw       xmm7, [GLOBAL(rd)]
    psraw       xmm7, VP9_FILTER_SHIFT

    packuswb    xmm1, xmm1
    packuswb    xmm3, xmm3

    packuswb    xmm5, xmm5
    movq        [rdi], xmm1

    packuswb    xmm7, xmm7
    movq        [rdi+rdx], xmm3

    lea         rdi, [rdi + 2*rdx]
    movq        [rdi], xmm5

    lea         rsp, [rsp + 4*16]               ; consumed four stacked rows
    movq        [rdi+rdx], xmm7

    lea         rdi, [rdi + 2*rdx]
    cmp         rdi, rcx

    jne         .fp_next_rows

    lea         rsp, [rsp + 16]                 ; skip the unused ninth row

.epilog:
    ;add rsp, 144
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1448
SECTION_RODATA

;-----------------------------------------------------------------------------
; pshufb masks used by vp9_filter_block1d4_h6_ssse3.  They are applied to
; the 16 raw bytes loaded from [src - 2], so mask index i selects source
; pixel (i - 2).
;-----------------------------------------------------------------------------
align 16
shuf1b:                                         ; pixel pairs (x-2, x+3) for k0/k5
    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
shuf2b:                                         ; pixel pairs (x,   x+2) for k2/k4
    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
shuf3b:                                         ; pixel pairs (x-1, x+1) for k1/k3
    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

;-----------------------------------------------------------------------------
; pshufb masks used by the 8- and 16-wide horizontal filters.  They are
; applied to the register that already holds the punpcklbw interleave
; "-2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10".
;-----------------------------------------------------------------------------
align 16
shuf2bfrom1:                                    ; regroups into (x,   x+2) pairs for k2/k4
    db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
align 16
shuf3bfrom1:                                    ; regroups into (x-1, x+1) pairs for k1/k3
    db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11

;-----------------------------------------------------------------------------
; Rounding constant: 0x40 = 64 in each of 8 words, added before the
; arithmetic right shift by VP9_FILTER_SHIFT (7).
;-----------------------------------------------------------------------------
align 16
rd:
    times 8 dw 0x40

;-----------------------------------------------------------------------------
; Six-tap coefficient tables, stored as byte pairs repeated 8 times so that
; one 16-byte entry feeds pmaddubsw directly.  Entry n of each table belongs
; to filter index n.  The tables must stay contiguous, 8 entries (128 bytes)
; each: the code addresses k1_k3 as [k0_k5 entry + 128] and k2_k4 as
; [k0_k5 entry + 256].
;
; For indices 1..7 the six taps sum to 128.  Entries whose k0_k5 pair is
; 0, 0 send the filter routines down their 4-tap paths.
;
; NOTE(review): pmaddubsw reads these bytes as signed, so the "128" in the
; index-0 placeholder of k2_k4 (and in entry 0 of bilinear_filters_ssse3)
; acts as -128.  Presumably index/offset 0 never reaches a multiply with
; those entries - confirm against callers.
;-----------------------------------------------------------------------------
align 16
k0_k5:
    times 8 db   0,   0                         ; 0: placeholder
    times 8 db   0,   0                         ; 1
    times 8 db   2,   1                         ; 2
    times 8 db   0,   0                         ; 3
    times 8 db   3,   3                         ; 4
    times 8 db   0,   0                         ; 5
    times 8 db   1,   2                         ; 6
    times 8 db   0,   0                         ; 7
k1_k3:
    times 8 db   0,   0                         ; 0: placeholder
    times 8 db  -6,  12                         ; 1
    times 8 db -11,  36                         ; 2
    times 8 db  -9,  50                         ; 3
    times 8 db -16,  77                         ; 4
    times 8 db  -6,  93                         ; 5
    times 8 db  -8, 108                         ; 6
    times 8 db  -1, 123                         ; 7
k2_k4:
    times 8 db 128,   0                         ; 0: placeholder
    times 8 db 123,  -1                         ; 1
    times 8 db 108,  -8                         ; 2
    times 8 db  93,  -6                         ; 3
    times 8 db  77, -16                         ; 4
    times 8 db  50,  -9                         ; 5
    times 8 db  36, -11                         ; 6
    times 8 db  12,  -6                         ; 7

;-----------------------------------------------------------------------------
; Bilinear weights: 16 entries of (128 - 8*n, 8*n) for n = 0..15, each pair
; repeated 8 times.  Indexed by xoffset / yoffset * 16 bytes.
;-----------------------------------------------------------------------------
align 16
bilinear_filters_ssse3:
    times 8 db 128,   0
    times 8 db 120,   8
    times 8 db 112,  16
    times 8 db 104,  24
    times 8 db  96,  32
    times 8 db  88,  40
    times 8 db  80,  48
    times 8 db  72,  56
    times 8 db  64,  64
    times 8 db  56,  72
    times 8 db  48,  80
    times 8 db  40,  88
    times 8 db  32,  96
    times 8 db  24, 104
    times 8 db  16, 112
    times 8 db   8, 120
1515
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698