Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(118)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_sse2.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 %define BLOCK_HEIGHT_WIDTH 4 ; unused in the code shown in this file
15 %define VP9_FILTER_WEIGHT 128 ; 128 == 1 << VP9_FILTER_SHIFT; unused in the code shown in this file
16 %define VP9_FILTER_SHIFT 7 ; arithmetic right shift applied after rounding (>> 7 == / 128)
17
18
19 ;/************************************************************************************
20 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
21 ; input pixel array has output_height rows. This routine assumes that output_height is an
22 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
23 ; row each iteration to take advantage of the 128 bits operations.
24 ;*************************************************************************************/
25 ;void vp9_filter_block1d8_h6_sse2
26 ;(
27 ; unsigned char *src_ptr,
28 ; unsigned short *output_ptr,
29 ; unsigned int src_pixels_per_line,
30 ; unsigned int pixel_step,
31 ; unsigned int output_height,
32 ; unsigned int output_width,
33 ; short *vp9_filter
34 ;)
35 global sym(vp9_filter_block1d8_h6_sse2)
36 sym(vp9_filter_block1d8_h6_sse2):
37 push rbp
38 mov rbp, rsp
39 SHADOW_ARGS_TO_STACK 7
40 SAVE_XMM 7
41 GET_GOT rbx
42 push rsi
43 push rdi
44 ; end prolog
45 ; register roles: rdx = filter taps (six 16-byte vectors), rsi = source row, rdi = output row
46 mov rdx, arg(6) ;vp9_filter
47 mov rsi, arg(0) ;src_ptr
48
49 mov rdi, arg(1) ;output_ptr
50
51 movsxd rcx, dword ptr arg(4) ;output_height (row counter; the loop is do-while, so it must be non-zero)
52 movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
53 %if ABI_IS_32BIT=0
54 movsxd r8, dword ptr arg(5) ;output_width (added to rdi as a byte step after each row)
55 %endif
56 pxor xmm0, xmm0 ; clear xmm0 for unpack
57
58 .filter_block1d8_h6_rowloop:
59 movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2..5
60 movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6..13
61
62 prefetcht2 [rsi+rax-2] ; prefetch the next source row
63
64 pslldq xmm1, 8
65 por xmm1, xmm3 ; xmm1 = 16 contiguous source bytes -2..13
66
67 movdqa xmm4, xmm1
68 movdqa xmm5, xmm1
69
70 movdqa xmm6, xmm1
71 movdqa xmm7, xmm1
72
73 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
74 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
75
76 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
77 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
78
79 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
80 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
81
82
83 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
84 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
85
86 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
87
88 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
89 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
90
91 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
92
93 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
94 psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
95
96
97 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
98
99 punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
100 pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
101
102 ; accumulate the six tap products with signed saturation
103 paddsw xmm4, xmm7
104 paddsw xmm4, xmm5
105
106 paddsw xmm4, xmm3
107 paddsw xmm4, xmm6
108
109 paddsw xmm4, xmm1
110 paddsw xmm4, [GLOBAL(rd)] ; add the rounding constant (0x40 per word)
111
112 psraw xmm4, 7 ; divide by 128
113
114 packuswb xmm4, xmm0 ; clamp each result to 0..255
115 punpcklbw xmm4, xmm0 ; widen the clamped bytes back to 16-bit words
116
117 movdqa XMMWORD Ptr [rdi], xmm4 ; aligned store of 8 words (movdqa requires rdi to be 16-byte aligned)
118 lea rsi, [rsi + rax] ; advance to the next source row
119
120 %if ABI_IS_32BIT
121 add rdi, DWORD Ptr arg(5) ;[output_width]
122 %else
123 add rdi, r8
124 %endif
125 dec rcx
126
127 jnz .filter_block1d8_h6_rowloop ; next row
128
129 ; begin epilog
130 pop rdi
131 pop rsi
132 RESTORE_GOT
133 RESTORE_XMM
134 UNSHADOW_ARGS
135 pop rbp
136 ret
137
138
139 ;void vp9_filter_block1d16_h6_sse2
140 ;(
141 ; unsigned char *src_ptr,
142 ; unsigned short *output_ptr,
143 ; unsigned int src_pixels_per_line,
144 ; unsigned int pixel_step,
145 ; unsigned int output_height,
146 ; unsigned int output_width,
147 ; short *vp9_filter
148 ;)
149 ;/************************************************************************************
150 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
151 ; input pixel array has output_height rows. This routine assumes that output_height is an
152 ; even number. This function handles 16 pixels in horizontal direction (two groups of 8),
153 ; calculating ONE row each iteration to take advantage of the 128 bits operations.
154 ;*************************************************************************************/
155 global sym(vp9_filter_block1d16_h6_sse2)
156 sym(vp9_filter_block1d16_h6_sse2):
157 push rbp
158 mov rbp, rsp
159 SHADOW_ARGS_TO_STACK 7
160 SAVE_XMM 7
161 GET_GOT rbx
162 push rsi
163 push rdi
164 ; end prolog
165 ; register roles: rdx = filter taps (six 16-byte vectors), rsi = source row, rdi = output row
166 mov rdx, arg(6) ;vp9_filter
167 mov rsi, arg(0) ;src_ptr
168
169 mov rdi, arg(1) ;output_ptr
170
171 movsxd rcx, dword ptr arg(4) ;output_height (row counter; the loop is do-while, so it must be non-zero)
172 movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
173 %if ABI_IS_32BIT=0
174 movsxd r8, dword ptr arg(5) ;output_width (added to rdi as a byte step after each row)
175 %endif
176
177 pxor xmm0, xmm0 ; clear xmm0 for unpack
178
179 .filter_block1d16_h6_sse2_rowloop:
180 movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2..5
181 movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6..13
182
183 movq xmm2, MMWORD PTR [rsi +14] ; source bytes 14..21
184 pslldq xmm2, 8
185
186 por xmm2, xmm1 ; xmm2 = source bytes 6..21 (input window for pixels 8..15)
187 prefetcht2 [rsi+rax-2] ; prefetch the next source row
188
189 pslldq xmm1, 8
190 por xmm1, xmm3 ; xmm1 = source bytes -2..13 (input window for pixels 0..7)
191
192 movdqa xmm4, xmm1
193 movdqa xmm5, xmm1
194
195 movdqa xmm6, xmm1
196 movdqa xmm7, xmm1
197
198 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
199 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
200
201 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
202 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
203
204 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
205 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
206
207
208 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
209 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
210
211 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
212
213 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
214 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
215
216 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
217
218 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
219 psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
220
221
222 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
223
224 punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
225 pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
226 ; accumulate the six tap products with signed saturation
227 paddsw xmm4, xmm7
228 paddsw xmm4, xmm5
229
230 paddsw xmm4, xmm3
231 paddsw xmm4, xmm6
232
233 paddsw xmm4, xmm1
234 paddsw xmm4, [GLOBAL(rd)] ; add the rounding constant (0x40 per word)
235
236 psraw xmm4, 7 ; divide by 128
237
238 packuswb xmm4, xmm0 ; clamp each result to 0..255
239 punpcklbw xmm4, xmm0 ; widen the clamped bytes back to 16-bit words
240
241 movdqa XMMWORD Ptr [rdi], xmm4 ; aligned store of output words 0..7
242 ; second half: same sequence on xmm2 (source bytes 6..21); lane labels below are relative to pixel 8
243 movdqa xmm3, xmm2
244 movdqa xmm4, xmm2
245
246 movdqa xmm5, xmm2
247 movdqa xmm6, xmm2
248
249 movdqa xmm7, xmm2
250
251 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
252 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
253
254 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
255 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
256
257 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
258 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
259
260
261 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
262 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
263
264 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
265
266 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
267 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
268
269 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
270
271 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
272 psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
273
274 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
275
276 punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
277 pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
278
279
280 paddsw xmm4, xmm7
281 paddsw xmm4, xmm5
282
283 paddsw xmm4, xmm3
284 paddsw xmm4, xmm6
285
286 paddsw xmm4, xmm2
287 paddsw xmm4, [GLOBAL(rd)] ; add the rounding constant (0x40 per word)
288
289 psraw xmm4, 7 ; divide by 128
290
291 packuswb xmm4, xmm0 ; clamp each result to 0..255
292 punpcklbw xmm4, xmm0 ; widen the clamped bytes back to 16-bit words
293
294 movdqa XMMWORD Ptr [rdi+16], xmm4 ; aligned store of output words 8..15
295
296 lea rsi, [rsi + rax] ; advance to the next source row
297 %if ABI_IS_32BIT
298 add rdi, DWORD Ptr arg(5) ;[output_width]
299 %else
300 add rdi, r8
301 %endif
302
303 dec rcx
304 jnz .filter_block1d16_h6_sse2_rowloop ; next row
305
306 ; begin epilog
307 pop rdi
308 pop rsi
309 RESTORE_GOT
310 RESTORE_XMM
311 UNSHADOW_ARGS
312 pop rbp
313 ret
314
315
316 ;void vp9_filter_block1d8_v6_sse2
317 ;(
318 ; short *src_ptr,
319 ; unsigned char *output_ptr,
320 ; int dst_pitch,
321 ; unsigned int pixels_per_line,
322 ; unsigned int pixel_step,
323 ; unsigned int output_height,
324 ; unsigned int output_width,
325 ; short * vp9_filter
326 ;)
327 ;/************************************************************************************
328 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
329 ; input pixel array has output_height rows.
330 ;*************************************************************************************/
331 global sym(vp9_filter_block1d8_v6_sse2)
332 sym(vp9_filter_block1d8_v6_sse2):
333 push rbp
334 mov rbp, rsp
335 SHADOW_ARGS_TO_STACK 8
336 SAVE_XMM 7
337 GET_GOT rbx
338 push rsi
339 push rdi
340 ; end prolog
341 ; register roles: rax = filter taps (six 16-byte vectors), rdx = source row step in bytes
342 mov rax, arg(7) ;vp9_filter
343 movsxd rdx, dword ptr arg(3) ;pixels_per_line
344
345 mov rdi, arg(1) ;output_ptr
346 mov rsi, arg(0) ;src_ptr
347 ; back up two rows so the tap window starts at src_ptr - 2 * pixels_per_line (byte offset)
348 sub rsi, rdx
349 sub rsi, rdx
350
351 movsxd rcx, DWORD PTR arg(5) ;[output_height] (row counter; the loop is do-while, so it must be non-zero)
352 pxor xmm0, xmm0 ; clear xmm0
353
354 movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; xmm7 = rounding constant, kept for the whole loop
355 %if ABI_IS_32BIT=0
356 movsxd r8, dword ptr arg(2) ; dst_pitch
357 %endif
358
359 .vp9_filter_block1d8_v6_sse2_loop:
360 movdqa xmm1, XMMWORD PTR [rsi] ; window row 1 (8 words; movdqa requires 16-byte alignment)
361 pmullw xmm1, [rax] ; * tap 1
362
363 movdqa xmm2, XMMWORD PTR [rsi + rdx] ; window row 2
364 pmullw xmm2, [rax + 16] ; * tap 2
365
366 movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; window row 3
367 pmullw xmm3, [rax + 32] ; * tap 3
368
369 movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; window row 5
370 pmullw xmm5, [rax + 64] ; * tap 5
371
372 add rsi, rdx ; step one row: reaches rows 4 and 6 below and doubles as the per-iteration advance
373 movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] ; window row 4
374
375 pmullw xmm4, [rax + 48] ; * tap 4
376 movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] ; window row 6
377
378 pmullw xmm6, [rax + 80] ; * tap 6
379 ; accumulate the six tap products with signed saturation
380 paddsw xmm2, xmm5
381 paddsw xmm2, xmm3
382
383 paddsw xmm2, xmm1
384 paddsw xmm2, xmm4
385
386 paddsw xmm2, xmm6
387 paddsw xmm2, xmm7 ; add the rounding constant
388
389 psraw xmm2, 7 ; divide by 128
390 packuswb xmm2, xmm0 ; pack and saturate
391
392 movq QWORD PTR [rdi], xmm2 ; store the results in the destination
393 %if ABI_IS_32BIT
394 add rdi, DWORD PTR arg(2) ;[dst_pitch]
395 %else
396 add rdi, r8
397 %endif
398 dec rcx ; decrement count
399 jnz .vp9_filter_block1d8_v6_sse2_loop ; next row
400
401 ; begin epilog
402 pop rdi
403 pop rsi
404 RESTORE_GOT
405 RESTORE_XMM
406 UNSHADOW_ARGS
407 pop rbp
408 ret
409
410
411 ;void vp9_filter_block1d16_v6_sse2
412 ;(
413 ; unsigned short *src_ptr,
414 ; unsigned char *output_ptr,
415 ; int dst_pitch,
416 ; unsigned int pixels_per_line,
417 ; unsigned int pixel_step,
418 ; unsigned int output_height,
419 ; unsigned int output_width,
420 ; const short *vp9_filter
421 ;)
422 ;/************************************************************************************
423 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
424 ; input pixel array has output_height rows.
425 ;*************************************************************************************/
426 global sym(vp9_filter_block1d16_v6_sse2)
427 sym(vp9_filter_block1d16_v6_sse2):
428 push rbp
429 mov rbp, rsp
430 SHADOW_ARGS_TO_STACK 8
431 SAVE_XMM 7
432 GET_GOT rbx
433 push rsi
434 push rdi
435 ; end prolog
436 ; register roles: rax = filter taps (six 16-byte vectors), rdx = source row step in bytes
437 mov rax, arg(7) ;vp9_filter
438 movsxd rdx, dword ptr arg(3) ;pixels_per_line
439
440 mov rdi, arg(1) ;output_ptr
441 mov rsi, arg(0) ;src_ptr
442 ; back up two rows so the tap window starts at src_ptr - 2 * pixels_per_line (byte offset)
443 sub rsi, rdx
444 sub rsi, rdx
445
446 movsxd rcx, DWORD PTR arg(5) ;[output_height] (row counter; the loop is do-while, so it must be non-zero)
447 %if ABI_IS_32BIT=0
448 movsxd r8, dword ptr arg(2) ; dst_pitch
449 %endif
450
451 .vp9_filter_block1d16_v6_sse2_loop:
452 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
453 movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
454 movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
455 pmullw xmm1, [rax + 16]
456 pmullw xmm2, [rax + 16]
457
458 movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
459 movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
460 pmullw xmm3, [rax + 64]
461 pmullw xmm4, [rax + 64]
462
463 movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
464 movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
465 pmullw xmm5, [rax + 32]
466 pmullw xmm6, [rax + 32]
467
468 movdqa xmm7, XMMWORD PTR [rsi] ; line 1
469 movdqa xmm0, XMMWORD PTR [rsi + 16]
470 pmullw xmm7, [rax]
471 pmullw xmm0, [rax]
472 ; xmm1 accumulates words 0..7, xmm2 accumulates words 8..15
473 paddsw xmm1, xmm3
474 paddsw xmm2, xmm4
475 paddsw xmm1, xmm5
476 paddsw xmm2, xmm6
477 paddsw xmm1, xmm7
478 paddsw xmm2, xmm0
479
480 add rsi, rdx ; step one row: reaches lines 4 and 6 below and doubles as the per-iteration advance
481
482 movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
483 movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
484 pmullw xmm3, [rax + 48]
485 pmullw xmm4, [rax + 48]
486
487 movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
488 movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
489 pmullw xmm5, [rax + 80]
490 pmullw xmm6, [rax + 80]
491
492 movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; reload the rounding constant (xmm7 was used as data above)
493 pxor xmm0, xmm0 ; clear xmm0 (NOTE(review): xmm0 is not read again before the next iteration reloads it)
494
495 paddsw xmm1, xmm3
496 paddsw xmm2, xmm4
497 paddsw xmm1, xmm5
498 paddsw xmm2, xmm6
499
500 paddsw xmm1, xmm7
501 paddsw xmm2, xmm7
502
503 psraw xmm1, 7 ; divide by 128
504 psraw xmm2, 7
505
506 packuswb xmm1, xmm2 ; pack and saturate
507 movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
508 %if ABI_IS_32BIT
509 add rdi, DWORD PTR arg(2) ;[dst_pitch]
510 %else
511 add rdi, r8
512 %endif
513 dec rcx ; decrement count
514 jnz .vp9_filter_block1d16_v6_sse2_loop ; next row
515
516 ; begin epilog
517 pop rdi
518 pop rsi
519 RESTORE_GOT
520 RESTORE_XMM
521 UNSHADOW_ARGS
522 pop rbp
523 ret
524
525
526 ;void vp9_filter_block1d8_h6_only_sse2
527 ;(
528 ; unsigned char *src_ptr,
529 ; unsigned int src_pixels_per_line,
530 ; unsigned char *output_ptr,
531 ; int dst_pitch,
532 ; unsigned int output_height,
533 ; const short *vp9_filter
534 ;)
535 ; First-pass filter only when yoffset==0
536 global sym(vp9_filter_block1d8_h6_only_sse2)
537 sym(vp9_filter_block1d8_h6_only_sse2):
538 push rbp
539 mov rbp, rsp
540 SHADOW_ARGS_TO_STACK 6
541 SAVE_XMM 7
542 GET_GOT rbx
543 push rsi
544 push rdi
545 ; end prolog
546 ; register roles: rdx = filter taps (six 16-byte vectors), rsi = source row, rdi = destination row
547 mov rdx, arg(5) ;vp9_filter
548 mov rsi, arg(0) ;src_ptr
549
550 mov rdi, arg(2) ;output_ptr
551
552 movsxd rcx, dword ptr arg(4) ;output_height (row counter; the loop is do-while, so it must be non-zero)
553 movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
554 %if ABI_IS_32BIT=0
555 movsxd r8, dword ptr arg(3) ;dst_pitch
556 %endif
557 pxor xmm0, xmm0 ; clear xmm0 for unpack
558
559 .filter_block1d8_h6_only_rowloop:
560 movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2..5
561 movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6..13
562
563 prefetcht2 [rsi+rax-2] ; prefetch the next source row
564
565 pslldq xmm1, 8
566 por xmm1, xmm3 ; xmm1 = 16 contiguous source bytes -2..13
567
568 movdqa xmm4, xmm1
569 movdqa xmm5, xmm1
570
571 movdqa xmm6, xmm1
572 movdqa xmm7, xmm1
573
574 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
575 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
576
577 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
578 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
579
580 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
581 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
582
583
584 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
585 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
586
587 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
588
589 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
590 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
591
592 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
593
594 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
595 psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
596
597
598 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
599
600 punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
601 pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
602
603 ; accumulate the six tap products with signed saturation
604 paddsw xmm4, xmm7
605 paddsw xmm4, xmm5
606
607 paddsw xmm4, xmm3
608 paddsw xmm4, xmm6
609
610 paddsw xmm4, xmm1
611 paddsw xmm4, [GLOBAL(rd)] ; add the rounding constant (0x40 per word)
612
613 psraw xmm4, 7 ; divide by 128
614
615 packuswb xmm4, xmm0 ; clamp to 0..255 and pack to bytes
616
617 movq QWORD PTR [rdi], xmm4 ; store the results in the destination
618 lea rsi, [rsi + rax] ; advance to the next source row
619
620 %if ABI_IS_32BIT
621 add rdi, DWORD Ptr arg(3) ;dst_pitch
622 %else
623 add rdi, r8
624 %endif
625 dec rcx
626
627 jnz .filter_block1d8_h6_only_rowloop ; next row
628
629 ; begin epilog
630 pop rdi
631 pop rsi
632 RESTORE_GOT
633 RESTORE_XMM
634 UNSHADOW_ARGS
635 pop rbp
636 ret
637
638
639 ;void vp9_filter_block1d16_h6_only_sse2
640 ;(
641 ; unsigned char *src_ptr,
642 ; unsigned int src_pixels_per_line,
643 ; unsigned char *output_ptr,
644 ; int dst_pitch,
645 ; unsigned int output_height,
646 ; const short *vp9_filter
647 ;)
648 ; First-pass filter only when yoffset==0
649 global sym(vp9_filter_block1d16_h6_only_sse2)
650 sym(vp9_filter_block1d16_h6_only_sse2):
651 push rbp
652 mov rbp, rsp
653 SHADOW_ARGS_TO_STACK 6
654 SAVE_XMM 7
655 GET_GOT rbx
656 push rsi
657 push rdi
658 ; end prolog
659 ; register roles: rdx = filter taps (six 16-byte vectors), rsi = source row, rdi = destination row
660 mov rdx, arg(5) ;vp9_filter
661 mov rsi, arg(0) ;src_ptr
662
663 mov rdi, arg(2) ;output_ptr
664
665 movsxd rcx, dword ptr arg(4) ;output_height (row counter; the loop is do-while, so it must be non-zero)
666 movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
667 %if ABI_IS_32BIT=0
668 movsxd r8, dword ptr arg(3) ;dst_pitch
669 %endif
670
671 pxor xmm0, xmm0 ; clear xmm0 for unpack
672
673 .filter_block1d16_h6_only_sse2_rowloop:
674 movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2..5
675 movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6..13
676
677 movq xmm2, MMWORD PTR [rsi +14] ; source bytes 14..21
678 pslldq xmm2, 8
679
680 por xmm2, xmm1 ; xmm2 = source bytes 6..21 (input window for pixels 8..15)
681 prefetcht2 [rsi+rax-2] ; prefetch the next source row
682
683 pslldq xmm1, 8
684 por xmm1, xmm3 ; xmm1 = source bytes -2..13 (input window for pixels 0..7)
685
686 movdqa xmm4, xmm1
687 movdqa xmm5, xmm1
688
689 movdqa xmm6, xmm1
690 movdqa xmm7, xmm1
691
692 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
693 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
694
695 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
696 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
697
698 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
699 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
700
701 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
702 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
703
704 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
705
706 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
707 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
708
709 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
710
711 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
712 psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
713
714 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
715
716 punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
717 pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
718 ; accumulate the six tap products with signed saturation
719 paddsw xmm4, xmm7
720 paddsw xmm4, xmm5
721
722 paddsw xmm4, xmm3
723 paddsw xmm4, xmm6
724
725 paddsw xmm4, xmm1
726 paddsw xmm4, [GLOBAL(rd)] ; add the rounding constant (0x40 per word)
727
728 psraw xmm4, 7 ; divide by 128
729
730 packuswb xmm4, xmm0 ; lower 8 bytes
731
732 movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
733 ; second half: same sequence on xmm2 (source bytes 6..21); lane labels below are relative to pixel 8
734 movdqa xmm3, xmm2
735 movdqa xmm4, xmm2
736
737 movdqa xmm5, xmm2
738 movdqa xmm6, xmm2
739
740 movdqa xmm7, xmm2
741
742 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
743 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
744
745 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
746 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
747
748 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
749 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
750
751 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
752 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
753
754 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
755
756 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
757 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
758
759 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
760
761 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
762 psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
763
764 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
765
766 punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
767 pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
768
769 paddsw xmm4, xmm7
770 paddsw xmm4, xmm5
771
772 paddsw xmm4, xmm3
773 paddsw xmm4, xmm6
774
775 paddsw xmm4, xmm2
776 paddsw xmm4, [GLOBAL(rd)] ; add the rounding constant (0x40 per word)
777
778 psraw xmm4, 7 ; divide by 128
779
780 packuswb xmm4, xmm0 ; higher 8 bytes
781
782 movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
783
784 lea rsi, [rsi + rax] ; advance to the next source row
785 %if ABI_IS_32BIT
786 add rdi, DWORD Ptr arg(3) ;dst_pitch
787 %else
788 add rdi, r8
789 %endif
790
791 dec rcx
792 jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
793
794 ; begin epilog
795 pop rdi
796 pop rsi
797 RESTORE_GOT
798 RESTORE_XMM
799 UNSHADOW_ARGS
800 pop rbp
801 ret
802
803
804 ;void vp9_filter_block1d8_v6_only_sse2
805 ;(
806 ; unsigned char *src_ptr,
807 ; unsigned int src_pixels_per_line,
808 ; unsigned char *output_ptr,
809 ; int dst_pitch,
810 ; unsigned int output_height,
811 ; const short *vp9_filter
812 ;)
813 ; Second-pass filter only when xoffset==0
814 global sym(vp9_filter_block1d8_v6_only_sse2)
815 sym(vp9_filter_block1d8_v6_only_sse2):
816 push rbp
817 mov rbp, rsp
818 SHADOW_ARGS_TO_STACK 6
819 SAVE_XMM 7
820 GET_GOT rbx
821 push rsi
822 push rdi
823 ; end prolog
824 ; NOTE(review): the tap window covers rows 0..5 from src_ptr as passed; presumably the caller pre-offsets src_ptr -- TODO confirm
825 mov rsi, arg(0) ;src_ptr
826 mov rdi, arg(2) ;output_ptr
827
828 movsxd rcx, dword ptr arg(4) ;output_height (row counter; the loop is do-while, so it must be non-zero)
829 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
830
831 mov rax, arg(5) ;vp9_filter
832
833 pxor xmm0, xmm0 ; clear xmm0
834
835 movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; xmm7 = rounding constant, kept for the whole loop
836 %if ABI_IS_32BIT=0
837 movsxd r8, dword ptr arg(3) ; dst_pitch
838 %endif
839
840 .vp9_filter_block1d8_v6_only_sse2_loop:
841 movq xmm1, MMWORD PTR [rsi] ; window row 1 (8 bytes)
842 movq xmm2, MMWORD PTR [rsi + rdx] ; window row 2
843 movq xmm3, MMWORD PTR [rsi + rdx * 2] ; window row 3
844 movq xmm5, MMWORD PTR [rsi + rdx * 4] ; window row 5
845 add rsi, rdx ; step one row: reaches rows 4 and 6 below and doubles as the per-iteration advance
846 movq xmm4, MMWORD PTR [rsi + rdx * 2] ; window row 4
847 movq xmm6, MMWORD PTR [rsi + rdx * 4] ; window row 6
848 ; widen each row to 16-bit words and multiply by its tap
849 punpcklbw xmm1, xmm0
850 pmullw xmm1, [rax]
851
852 punpcklbw xmm2, xmm0
853 pmullw xmm2, [rax + 16]
854
855 punpcklbw xmm3, xmm0
856 pmullw xmm3, [rax + 32]
857
858 punpcklbw xmm5, xmm0
859 pmullw xmm5, [rax + 64]
860
861 punpcklbw xmm4, xmm0
862 pmullw xmm4, [rax + 48]
863
864 punpcklbw xmm6, xmm0
865 pmullw xmm6, [rax + 80]
866 ; accumulate the six tap products with signed saturation
867 paddsw xmm2, xmm5
868 paddsw xmm2, xmm3
869
870 paddsw xmm2, xmm1
871 paddsw xmm2, xmm4
872
873 paddsw xmm2, xmm6
874 paddsw xmm2, xmm7 ; add the rounding constant
875
876 psraw xmm2, 7 ; divide by 128
877 packuswb xmm2, xmm0 ; pack and saturate
878
879 movq QWORD PTR [rdi], xmm2 ; store the results in the destination
880 %if ABI_IS_32BIT
881 add rdi, DWORD PTR arg(3) ;[dst_pitch]
882 %else
883 add rdi, r8
884 %endif
885 dec rcx ; decrement count
886 jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row
887
888 ; begin epilog
889 pop rdi
890 pop rsi
891 RESTORE_GOT
892 RESTORE_XMM
893 UNSHADOW_ARGS
894 pop rbp
895 ret
896
897
898 ;void vp9_unpack_block1d16_h6_sse2
899 ;(
900 ; unsigned char *src_ptr,
901 ; unsigned short *output_ptr,
902 ; unsigned int src_pixels_per_line,
903 ; unsigned int output_height,
904 ; unsigned int output_width
905 ;)
906 global sym(vp9_unpack_block1d16_h6_sse2)
907 sym(vp9_unpack_block1d16_h6_sse2):
908 push rbp
909 mov rbp, rsp
910 SHADOW_ARGS_TO_STACK 5
911 GET_GOT rbx
912 push rsi
913 push rdi
914 ; end prolog
915 ; widens 16 source bytes per row to 16-bit words; no filtering is applied
916 mov rsi, arg(0) ;src_ptr
917 mov rdi, arg(1) ;output_ptr
918
919 movsxd rcx, dword ptr arg(3) ;output_height (row counter; the loop is do-while, so it must be non-zero)
920 movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
921
922 pxor xmm0, xmm0 ; clear xmm0 for unpack
923 %if ABI_IS_32BIT=0
924 movsxd r8, dword ptr arg(4) ;output_width (added to rdi as a byte step after each row)
925 %endif
926
927 .unpack_block1d16_h6_sse2_rowloop:
928 movq xmm1, MMWORD PTR [rsi] ; source bytes 0..7
929 movq xmm3, MMWORD PTR [rsi+8] ; source bytes 8..15
930
931 punpcklbw xmm3, xmm0 ; bytes 8..15 widened to 16-bit words
932 punpcklbw xmm1, xmm0 ; bytes 0..7 widened to 16-bit words
933
934 movdqa XMMWORD Ptr [rdi], xmm1 ; aligned stores (movdqa requires rdi to be 16-byte aligned)
935 movdqa XMMWORD Ptr [rdi + 16], xmm3
936
937 lea rsi, [rsi + rax] ; advance to the next source row
938 %if ABI_IS_32BIT
939 add rdi, DWORD Ptr arg(4) ;[output_width]
940 %else
941 add rdi, r8
942 %endif
943 dec rcx
944 jnz .unpack_block1d16_h6_sse2_rowloop ; next row
945
946 ; begin epilog
947 pop rdi
948 pop rsi
949 RESTORE_GOT
950 UNSHADOW_ARGS
951 pop rbp
952 ret
953
954
955 ;void vp9_bilinear_predict16x16_sse2
956 ;(
957 ; unsigned char *src_ptr,
958 ; int src_pixels_per_line,
959 ; int xoffset,
960 ; int yoffset,
961 ; unsigned char *dst_ptr,
962 ; int dst_pitch
963 ;)
964 extern sym(vp9_bilinear_filters_mmx)
965 global sym(vp9_bilinear_predict16x16_sse2)
966 sym(vp9_bilinear_predict16x16_sse2):
967 push rbp
968 mov rbp, rsp
969 SHADOW_ARGS_TO_STACK 6
970 SAVE_XMM 7
971 GET_GOT rbx
972 push rsi
973 push rdi
974 ; end prolog
975 ; three code paths: both passes (.next_row), vertical only (.b16x16_sp_only), horizontal only (.b16x16_fp_only)
976 ;const short *HFilter = bilinear_filters_mmx[xoffset]
977 ;const short *VFilter = bilinear_filters_mmx[yoffset]
978
979 lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]
980 movsxd rax, dword ptr arg(2) ;xoffset
981
982 cmp rax, 0 ;skip first_pass filter if xoffset=0
983 je .b16x16_sp_only
984
985 shl rax, 5 ; each filter table entry is 32 bytes (two 16-byte tap vectors)
986 add rax, rcx ;HFilter
987
988 mov rdi, arg(4) ;dst_ptr
989 mov rsi, arg(0) ;src_ptr
990 movsxd rdx, dword ptr arg(5) ;dst_pitch
991
992 movdqa xmm1, [rax] ; xmm1 = horizontal tap 0
993 movdqa xmm2, [rax+16] ; xmm2 = horizontal tap 1
994
995 movsxd rax, dword ptr arg(3) ;yoffset
996
997 cmp rax, 0 ;skip second_pass filter if yoffset=0
998 je .b16x16_fp_only
999
1000 shl rax, 5
1001 add rax, rcx ;VFilter
1002
1003 lea rcx, [rdi+rdx*8]
1004 lea rcx, [rcx+rdx*8] ; rcx = dst_ptr + 16 * dst_pitch (loop end marker)
1005 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
1006
1007 pxor xmm0, xmm0
1008
1009 %if ABI_IS_32BIT=0
1010 movsxd r8, dword ptr arg(5) ;dst_pitch
1011 %endif
1012 ; get the first horizontal line done
1013 movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
1014 movdqa xmm4, xmm3 ; make a copy of current line
1015
1016 punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
1017 punpckhbw xmm4, xmm0
1018
1019 pmullw xmm3, xmm1
1020 pmullw xmm4, xmm1
1021
1022 movdqu xmm5, [rsi+1] ; same row shifted one pixel to the right
1023 movdqa xmm6, xmm5
1024
1025 punpcklbw xmm5, xmm0
1026 punpckhbw xmm6, xmm0
1027
1028 pmullw xmm5, xmm2
1029 pmullw xmm6, xmm2
1030
1031 paddw xmm3, xmm5
1032 paddw xmm4, xmm6
1033
1034 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
1035 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
1036
1037 paddw xmm4, [GLOBAL(rd)]
1038 psraw xmm4, VP9_FILTER_SHIFT
1039
1040 movdqa xmm7, xmm3
1041 packuswb xmm7, xmm4 ; xmm7 = first-pass result of the previous row, packed to bytes
1042
1043 add rsi, rdx ; next line
1044 .next_row:
1045 movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
1046 movdqa xmm4, xmm3 ; make a copy of current line
1047
1048 punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
1049 punpckhbw xmm4, xmm0
1050
1051 pmullw xmm3, xmm1
1052 pmullw xmm4, xmm1
1053
1054 movdqu xmm5, [rsi+1]
1055 movdqa xmm6, xmm5
1056
1057 punpcklbw xmm5, xmm0
1058 punpckhbw xmm6, xmm0
1059
1060 pmullw xmm5, xmm2
1061 pmullw xmm6, xmm2
1062
1063 paddw xmm3, xmm5
1064 paddw xmm4, xmm6
1065
1066 movdqa xmm5, xmm7 ; unpack the previous row's first-pass result
1067 movdqa xmm6, xmm7
1068
1069 punpcklbw xmm5, xmm0
1070 punpckhbw xmm6, xmm0
1071
1072 pmullw xmm5, [rax] ; previous row * vertical tap 0
1073 pmullw xmm6, [rax]
1074
1075 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
1076 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
1077
1078 paddw xmm4, [GLOBAL(rd)]
1079 psraw xmm4, VP9_FILTER_SHIFT
1080
1081 movdqa xmm7, xmm3
1082 packuswb xmm7, xmm4 ; keep the current row's first-pass result for the next iteration
1083
1084 pmullw xmm3, [rax+16] ; current row * vertical tap 1
1085 pmullw xmm4, [rax+16]
1086
1087 paddw xmm3, xmm5
1088 paddw xmm4, xmm6
1089
1090 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
1091 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
1092
1093 paddw xmm4, [GLOBAL(rd)]
1094 psraw xmm4, VP9_FILTER_SHIFT
1095
1096 packuswb xmm3, xmm4
1097 movdqa [rdi], xmm3 ; store the results in the destination
1098
1099 add rsi, rdx ; next line
1100 %if ABI_IS_32BIT
1101 add rdi, DWORD PTR arg(5) ;dst_pitch
1102 %else
1103 add rdi, r8
1104 %endif
1105
1106 cmp rdi, rcx
1107 jne .next_row
1108
1109 jmp .done
1110
1111 .b16x16_sp_only:
1112 movsxd rax, dword ptr arg(3) ;yoffset
1113 shl rax, 5
1114 add rax, rcx ;VFilter
1115
1116 mov rdi, arg(4) ;dst_ptr
1117 mov rsi, arg(0) ;src_ptr
1118 movsxd rdx, dword ptr arg(5) ;dst_pitch
1119
1120 movdqa xmm1, [rax] ; xmm1 = vertical tap 0
1121 movdqa xmm2, [rax+16] ; xmm2 = vertical tap 1
1122
1123 lea rcx, [rdi+rdx*8]
1124 lea rcx, [rcx+rdx*8] ; rcx = dst_ptr + 16 * dst_pitch (loop end marker)
1125 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
1126
1127 pxor xmm0, xmm0
1128
1129 ; get the first horizontal line done
1130 movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
1131
1132 add rsi, rax ; next line
1133 .next_row_spo:
1134 movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
1135
1136 movdqa xmm5, xmm7 ; xmm7 holds the previous source row
1137 movdqa xmm6, xmm7
1138
1139 movdqa xmm4, xmm3 ; make a copy of current line
1140 movdqa xmm7, xmm3 ; the current row becomes the previous row for the next iteration
1141
1142 punpcklbw xmm5, xmm0
1143 punpckhbw xmm6, xmm0
1144 punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
1145 punpckhbw xmm4, xmm0
1146
1147 pmullw xmm5, xmm1
1148 pmullw xmm6, xmm1
1149 pmullw xmm3, xmm2
1150 pmullw xmm4, xmm2
1151
1152 paddw xmm3, xmm5
1153 paddw xmm4, xmm6
1154
1155 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
1156 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
1157
1158 paddw xmm4, [GLOBAL(rd)]
1159 psraw xmm4, VP9_FILTER_SHIFT
1160
1161 packuswb xmm3, xmm4
1162 movdqa [rdi], xmm3 ; store the results in the destination
1163
1164 add rsi, rax ; next line
1165 add rdi, rdx ;dst_pitch
1166 cmp rdi, rcx
1167 jne .next_row_spo
1168
1169 jmp .done
1170
1171 .b16x16_fp_only:
1172 lea rcx, [rdi+rdx*8]
1173 lea rcx, [rcx+rdx*8] ; rcx = dst_ptr + 16 * dst_pitch (loop end marker)
1174 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
1175 pxor xmm0, xmm0
1176
1177 .next_row_fpo:
1178 movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
1179 movdqa xmm4, xmm3 ; make a copy of current line
1180
1181 punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
1182 punpckhbw xmm4, xmm0
1183
1184 pmullw xmm3, xmm1
1185 pmullw xmm4, xmm1
1186
1187 movdqu xmm5, [rsi+1]
1188 movdqa xmm6, xmm5
1189
1190 punpcklbw xmm5, xmm0
1191 punpckhbw xmm6, xmm0
1192
1193 pmullw xmm5, xmm2
1194 pmullw xmm6, xmm2
1195
1196 paddw xmm3, xmm5
1197 paddw xmm4, xmm6
1198
1199 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
1200 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
1201
1202 paddw xmm4, [GLOBAL(rd)]
1203 psraw xmm4, VP9_FILTER_SHIFT
1204
1205 packuswb xmm3, xmm4
1206 movdqa [rdi], xmm3 ; store the results in the destination
1207
1208 add rsi, rax ; next line
1209 add rdi, rdx ; dst_pitch
1210 cmp rdi, rcx
1211 jne .next_row_fpo
1212
1213 .done:
1214 ; begin epilog
1215 pop rdi
1216 pop rsi
1217 RESTORE_GOT
1218 RESTORE_XMM
1219 UNSHADOW_ARGS
1220 pop rbp
1221 ret
1222
1223
1224 ;void vp9_bilinear_predict8x8_sse2
1225 ;(
1226 ; unsigned char *src_ptr,
1227 ; int src_pixels_per_line,
1228 ; int xoffset,
1229 ; int yoffset,
1230 ; unsigned char *dst_ptr,
1231 ; int dst_pitch
1232 ;)
1233 extern sym(vp9_bilinear_filters_mmx)
1234 global sym(vp9_bilinear_predict8x8_sse2)
1235 sym(vp9_bilinear_predict8x8_sse2):
1236 push rbp
1237 mov rbp, rsp
1238 SHADOW_ARGS_TO_STACK 6
1239 SAVE_XMM 7
1240 GET_GOT rbx
1241 push rsi
1242 push rdi
1243 ; end prolog
1244
1245 ALIGN_STACK 16, rax
1246 sub rsp, 144 ; reserve 144 bytes
1247 ; 144 bytes = 9 source rows * 16 bytes; rsp is later used as the row cursor
1248 ;const short *HFilter = bilinear_filters_mmx[xoffset]
1249 ;const short *VFilter = bilinear_filters_mmx[yoffset]
1250 lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]
1251
1252 mov rsi, arg(0) ;src_ptr
1253 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
1254
1255 ;Read 9-line unaligned data in and put them on stack. This gives a big
1256 ;performance boost.
1257 movdqu xmm0, [rsi]
1258 lea rax, [rdx + rdx*2] ; rax = 3 * src_pixels_per_line
1259 movdqu xmm1, [rsi+rdx]
1260 movdqu xmm2, [rsi+rdx*2]
1261 add rsi, rax
1262 movdqu xmm3, [rsi]
1263 movdqu xmm4, [rsi+rdx]
1264 movdqu xmm5, [rsi+rdx*2]
1265 add rsi, rax
1266 movdqu xmm6, [rsi]
1267 movdqu xmm7, [rsi+rdx]
1268
1269 movdqa XMMWORD PTR [rsp], xmm0
1270
1271 movdqu xmm0, [rsi+rdx*2] ; ninth row, loaded after xmm0 (row 0) has been spilled
1272
1273 movdqa XMMWORD PTR [rsp+16], xmm1
1274 movdqa XMMWORD PTR [rsp+32], xmm2
1275 movdqa XMMWORD PTR [rsp+48], xmm3
1276 movdqa XMMWORD PTR [rsp+64], xmm4
1277 movdqa XMMWORD PTR [rsp+80], xmm5
1278 movdqa XMMWORD PTR [rsp+96], xmm6
1279 movdqa XMMWORD PTR [rsp+112], xmm7
1280 movdqa XMMWORD PTR [rsp+128], xmm0
1281
1282 movsxd rax, dword ptr arg(2) ;xoffset
1283 shl rax, 5 ; each filter table entry is 32 bytes (two 16-byte tap vectors)
1284 add rax, rcx ;HFilter
1285
1286 mov rdi, arg(4) ;dst_ptr
1287 movsxd rdx, dword ptr arg(5) ;dst_pitch
1288
1289 movdqa xmm1, [rax] ; xmm1 = horizontal tap 0
1290 movdqa xmm2, [rax+16] ; xmm2 = horizontal tap 1
1291
1292 movsxd rax, dword ptr arg(3) ;yoffset
1293 shl rax, 5
1294 add rax, rcx ;VFilter
1295
1296 lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8 * dst_pitch (loop end marker)
1297
1298 movdqa xmm5, [rax] ; xmm5 = vertical tap 0
1299 movdqa xmm6, [rax+16] ; xmm6 = vertical tap 1
1300
1301 pxor xmm0, xmm0
1302
1303 ; get the first horizontal line done
1304 movdqa xmm3, XMMWORD PTR [rsp]
1305 movdqa xmm4, xmm3 ; make a copy of current line
1306 psrldq xmm4, 1
1307
1308 punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
1309 punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
1310
1311 pmullw xmm3, xmm1
1312 pmullw xmm4, xmm2
1313
1314 paddw xmm3, xmm4
1315
1316 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
1317 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
1318
1319 movdqa xmm7, xmm3 ; xmm7 = first-pass result of the previous row (16-bit words)
1320 add rsp, 16 ; next line
1321 .next_row8x8:
1322 movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1323 movdqa xmm4, xmm3 ; make a copy of current line
1324 psrldq xmm4, 1
1325
1326 punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
1327 punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
1328
1329 pmullw xmm3, xmm1
1330 pmullw xmm4, xmm2
1331
1332 paddw xmm3, xmm4
1333 pmullw xmm7, xmm5 ; previous row * vertical tap 0
1334
1335 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
1336 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
1337
1338 movdqa xmm4, xmm3 ; save the current row's first-pass result
1339
1340 pmullw xmm3, xmm6 ; current row * vertical tap 1
1341 paddw xmm3, xmm7
1342
1343 movdqa xmm7, xmm4 ; the current row becomes the previous row for the next iteration
1344
1345 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
1346 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
1347
1348 packuswb xmm3, xmm0
1349 movq [rdi], xmm3 ; store the results in the destination
1350
1351 add rsp, 16 ; next line
1352 add rdi, rdx
1353
1354 cmp rdi, rcx
1355 jne .next_row8x8
1356 ; the nine 16-byte steps above (1 before the loop + 8 in it) have already undone the 144-byte reservation
1357 ;add rsp, 144
1358 pop rsp ; presumably restores the rsp saved by ALIGN_STACK (macro defined outside this file) -- TODO confirm
1359 ; begin epilog
1360 pop rdi
1361 pop rsi
1362 RESTORE_GOT
1363 RESTORE_XMM
1364 UNSHADOW_ARGS
1365 pop rbp
1366 ret
1367
1368
1369 SECTION_RODATA
1370 align 16
1371 rd: ; rounding constant added before every >> 7 in this file
1372 times 8 dw 0x40 ; eight 16-bit words of 0x40 (64 == half of 128)
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698