Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(627)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 ;/************************************************************************************
15 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
16 ; input pixel array has output_height rows. This routine assumes that output_height is an
17 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
18 ; row each iteration to take advantage of the 128-bit operations.
19 ; (The paragraph above appears to be inherited from 6 tap code: the routines in this file use 8 taps, k0..k7.)
20 ; This is an implementation of some of the SSE optimizations first seen in ffvp8
21 ;
22 ;************************************************************************************/
23
24 ;void vp9_filter_block1d8_v8_ssse3 -- vertical 8-tap filter over an 8-pixel-wide column, one output row per loop pass
25 ;(
26 ; unsigned char *src_ptr, first of the 8 source rows (A..H) that feed the first output row
27 ; unsigned int src_pitch, byte step between source rows (read with movsxd below)
28 ; unsigned char *output_ptr, 8 bytes are stored here per row
29 ; unsigned int out_pitch, byte step between output rows
30 ; unsigned int output_height, row count; the loop tests after the first pass, so 0 is not handled
31 ; short *filter 8 word-sized taps, read with movdqa (needs 16-byte alignment)
32 ;)
33 global sym(vp9_filter_block1d8_v8_ssse3)
34 sym(vp9_filter_block1d8_v8_ssse3):
35 push rbp
36 mov rbp, rsp
37 SHADOW_ARGS_TO_STACK 6 ; macro from x86_abi_support.asm (not shown here); presumably makes the 6 C arguments reachable via arg(n)
38 SAVE_XMM 7 ; macro from x86_abi_support.asm; presumably preserves xmm6-xmm7 where the ABI requires it -- TODO confirm
39 push rsi
40 push rdi
41 push rbx
42 ; end prolog
43 ; ---- scratch area: four broadcast tap pairs plus the rounding constant ----
44 ALIGN_STACK 16, rax ; the movdqa accesses to the k* slots need a 16-byte aligned rsp; undone by "pop rsp" in the epilogue
45 sub rsp, 16*5 ; five 16-byte slots
46 %define k0k1 [rsp + 16*0]
47 %define k2k3 [rsp + 16*1]
48 %define k4k5 [rsp + 16*2]
49 %define k6k7 [rsp + 16*3]
50 %define krd [rsp + 16*4]
51
52 mov rdx, arg(5) ;filter ptr
53 mov rsi, arg(0) ;src_ptr
54 mov rdi, arg(2) ;output_ptr
55 mov rcx, 0x0400040 ; i.e. 0x00400040: two 16-bit words of 64, the rounding term for the >>7 below
56
57 movdqa xmm4, [rdx] ;load filters (8 words, aligned load)
58 movd xmm5, rcx
59 packsswb xmm4, xmm4 ; saturate the word taps to signed bytes: k0..k7 in the low 8 bytes
60 pshuflw xmm0, xmm4, 0b ;k0_k1 (byte pair replicated into the low 4 words)
61 pshuflw xmm1, xmm4, 01010101b ;k2_k3
62 pshuflw xmm2, xmm4, 10101010b ;k4_k5
63 pshuflw xmm3, xmm4, 11111111b ;k6_k7
64
65 punpcklqdq xmm0, xmm0 ; duplicate the low qword: the pair now fills all 8 words
66 punpcklqdq xmm1, xmm1
67 punpcklqdq xmm2, xmm2
68 punpcklqdq xmm3, xmm3
69
70 movdqa k0k1, xmm0
71 movdqa k2k3, xmm1
72 pshufd xmm5, xmm5, 0 ; broadcast dword 0: 64 in every word
73 movdqa k4k5, xmm2
74 movdqa k6k7, xmm3
75 movdqa krd, xmm5 ; spilled because the loop reuses xmm5 for row F
76
77 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line (src_pitch)
78
79 %if ABI_IS_32BIT=0
80 movsxd r8, DWORD PTR arg(3) ;out_pitch (kept in r8 only on non-32-bit builds)
81 %endif
82 mov rax, rsi
83 movsxd rcx, DWORD PTR arg(4) ;output_height = loop counter
84 add rax, rdx ; rax = rsi + pitch, used to reach the odd rows
85
86 lea rbx, [rdx + rdx*4]
87 add rbx, rdx ;pitch * 6
88
89 .vp9_filter_block1d8_v8_ssse3_loop:
90 movq xmm0, [rsi] ;A = row 0
91 movq xmm1, [rsi + rdx] ;B = row 1
92 movq xmm2, [rsi + rdx * 2] ;C = row 2
93 movq xmm3, [rax + rdx * 2] ;D = row 3 (rax is one pitch past rsi)
94 movq xmm4, [rsi + rdx * 4] ;E = row 4
95 movq xmm5, [rax + rdx * 4] ;F = row 5
96
97 punpcklbw xmm0, xmm1 ;A B interleaved byte by byte
98 punpcklbw xmm2, xmm3 ;C D
99 punpcklbw xmm4, xmm5 ;E F
100
101 movq xmm6, [rsi + rbx] ;G = row 6
102 movq xmm7, [rax + rbx] ;H = row 7
103
104 pmaddubsw xmm0, k0k1 ; word i = A[i]*k0 + B[i]*k1 (unsigned pixel times signed tap)
105 pmaddubsw xmm2, k2k3
106 punpcklbw xmm6, xmm7 ;G H
107 pmaddubsw xmm4, k4k5
108 pmaddubsw xmm6, k6k7
109
110 paddsw xmm0, xmm2 ; partial sums are combined with signed saturation
111 paddsw xmm0, krd ; + 64 for rounding
112 paddsw xmm4, xmm6
113 paddsw xmm0, xmm4
114
115 psraw xmm0, 7 ; arithmetic shift right by 7
116 packuswb xmm0, xmm0 ; saturate the words to unsigned bytes
117
118 add rsi, rdx ; advance the 8-row window by one row
119 add rax, rdx
120
121 movq [rdi], xmm0 ; store 8 output pixels
122
123 %if ABI_IS_32BIT
124 add rdi, DWORD PTR arg(3) ;out_pitch
125 %else
126 add rdi, r8
127 %endif
128 dec rcx
129 jnz .vp9_filter_block1d8_v8_ssse3_loop
130
131 add rsp, 16*5 ; release the five scratch slots
132 pop rsp ; undo ALIGN_STACK (presumably it left the pre-alignment rsp on the stack -- see x86_abi_support.asm)
133 pop rbx
134 ; begin epilog
135 pop rdi
136 pop rsi
137 RESTORE_XMM
138 UNSHADOW_ARGS
139 pop rbp
140 ret
141
142 ;void vp9_filter_block1d16_v8_ssse3 -- vertical 8-tap filter, 16 pixels wide, done as two 8-column halves per output row
143 ;(
144 ; unsigned char *src_ptr, first of the 8 source rows (A..H) that feed the first output row
145 ; unsigned int src_pitch, byte step between source rows (read with movsxd below)
146 ; unsigned char *output_ptr, 16 bytes are stored here per row, as two 8-byte movq stores
147 ; unsigned int out_pitch, byte step between output rows
148 ; unsigned int output_height, row count; the loop tests after the first pass, so 0 is not handled
149 ; short *filter 8 word-sized taps, read with movdqa (needs 16-byte alignment)
150 ;)
151 global sym(vp9_filter_block1d16_v8_ssse3)
152 sym(vp9_filter_block1d16_v8_ssse3):
153 push rbp
154 mov rbp, rsp
155 SHADOW_ARGS_TO_STACK 6 ; macro from x86_abi_support.asm (not shown here); presumably makes the 6 C arguments reachable via arg(n)
156 SAVE_XMM 7 ; macro from x86_abi_support.asm; presumably preserves xmm6-xmm7 where the ABI requires it -- TODO confirm
157 push rsi
158 push rdi
159 push rbx
160 ; end prolog
161 ; ---- scratch area: four broadcast tap pairs plus the rounding constant ----
162 ALIGN_STACK 16, rax ; the movdqa accesses to the k* slots need a 16-byte aligned rsp; undone by "pop rsp" in the epilogue
163 sub rsp, 16*5 ; five 16-byte slots
164 %define k0k1 [rsp + 16*0]
165 %define k2k3 [rsp + 16*1]
166 %define k4k5 [rsp + 16*2]
167 %define k6k7 [rsp + 16*3]
168 %define krd [rsp + 16*4]
169
170 mov rdx, arg(5) ;filter ptr
171 mov rsi, arg(0) ;src_ptr
172 mov rdi, arg(2) ;output_ptr
173 mov rcx, 0x0400040 ; i.e. 0x00400040: two 16-bit words of 64, the rounding term for the >>7 below
174
175 movdqa xmm4, [rdx] ;load filters (8 words, aligned load)
176 movd xmm5, rcx
177 packsswb xmm4, xmm4 ; saturate the word taps to signed bytes: k0..k7 in the low 8 bytes
178 pshuflw xmm0, xmm4, 0b ;k0_k1 (byte pair replicated into the low 4 words)
179 pshuflw xmm1, xmm4, 01010101b ;k2_k3
180 pshuflw xmm2, xmm4, 10101010b ;k4_k5
181 pshuflw xmm3, xmm4, 11111111b ;k6_k7
182
183 punpcklqdq xmm0, xmm0 ; duplicate the low qword: the pair now fills all 8 words
184 punpcklqdq xmm1, xmm1
185 punpcklqdq xmm2, xmm2
186 punpcklqdq xmm3, xmm3
187
188 movdqa k0k1, xmm0
189 movdqa k2k3, xmm1
190 pshufd xmm5, xmm5, 0 ; broadcast dword 0: 64 in every word
191 movdqa k4k5, xmm2
192 movdqa k6k7, xmm3
193 movdqa krd, xmm5 ; spilled because the loop reuses xmm5 for row F
194
195 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line (src_pitch)
196
197 %if ABI_IS_32BIT=0
198 movsxd r8, DWORD PTR arg(3) ;out_pitch (kept in r8 only on non-32-bit builds)
199 %endif
200 mov rax, rsi
201 movsxd rcx, DWORD PTR arg(4) ;output_height = loop counter
202 add rax, rdx ; rax = rsi + pitch, used to reach the odd rows
203
204 lea rbx, [rdx + rdx*4]
205 add rbx, rdx ;pitch * 6
206
207 .vp9_filter_block1d16_v8_ssse3_loop:
208 movq xmm0, [rsi] ;A = row 0, columns 0..7
209 movq xmm1, [rsi + rdx] ;B = row 1
210 movq xmm2, [rsi + rdx * 2] ;C = row 2
211 movq xmm3, [rax + rdx * 2] ;D = row 3 (rax is one pitch past rsi)
212 movq xmm4, [rsi + rdx * 4] ;E = row 4
213 movq xmm5, [rax + rdx * 4] ;F = row 5
214
215 punpcklbw xmm0, xmm1 ;A B interleaved byte by byte
216 punpcklbw xmm2, xmm3 ;C D
217 punpcklbw xmm4, xmm5 ;E F
218
219 movq xmm6, [rsi + rbx] ;G = row 6
220 movq xmm7, [rax + rbx] ;H = row 7
221
222 pmaddubsw xmm0, k0k1 ; word i = A[i]*k0 + B[i]*k1 (unsigned pixel times signed tap)
223 pmaddubsw xmm2, k2k3
224 punpcklbw xmm6, xmm7 ;G H
225 pmaddubsw xmm4, k4k5
226 pmaddubsw xmm6, k6k7
227
228 paddsw xmm0, xmm2 ; partial sums are combined with signed saturation
229 paddsw xmm0, krd ; + 64 for rounding
230 paddsw xmm4, xmm6
231 paddsw xmm0, xmm4
232
233 psraw xmm0, 7 ; arithmetic shift right by 7
234 packuswb xmm0, xmm0 ; saturate the words to unsigned bytes
235
236 movq [rdi], xmm0 ; store output columns 0..7
237 ; ---- second half: the same computation on columns 8..15 ----
238 movq xmm0, [rsi + 8] ;A
239 movq xmm1, [rsi + rdx + 8] ;B
240 movq xmm2, [rsi + rdx * 2 + 8] ;C
241 movq xmm3, [rax + rdx * 2 + 8] ;D
242 movq xmm4, [rsi + rdx * 4 + 8] ;E
243 movq xmm5, [rax + rdx * 4 + 8] ;F
244
245 punpcklbw xmm0, xmm1 ;A B
246 punpcklbw xmm2, xmm3 ;C D
247 punpcklbw xmm4, xmm5 ;E F
248
249
250 movq xmm6, [rsi + rbx + 8] ;G
251 movq xmm7, [rax + rbx + 8] ;H
252 punpcklbw xmm6, xmm7 ;G H
253
254
255 pmaddubsw xmm0, k0k1
256 pmaddubsw xmm2, k2k3
257 pmaddubsw xmm4, k4k5
258 pmaddubsw xmm6, k6k7
259
260 paddsw xmm0, xmm2 ; same four additions on the same operands as the first half; only independent steps are reordered
261 paddsw xmm4, xmm6
262 paddsw xmm0, krd
263 paddsw xmm0, xmm4
264
265 psraw xmm0, 7
266 packuswb xmm0, xmm0
267
268 add rsi, rdx ; advance the 8-row window by one row
269 add rax, rdx
270
271 movq [rdi+8], xmm0 ; store output columns 8..15
272
273 %if ABI_IS_32BIT
274 add rdi, DWORD PTR arg(3) ;out_pitch
275 %else
276 add rdi, r8
277 %endif
278 dec rcx
279 jnz .vp9_filter_block1d16_v8_ssse3_loop
280
281 add rsp, 16*5 ; release the five scratch slots
282 pop rsp ; undo ALIGN_STACK (presumably it left the pre-alignment rsp on the stack -- see x86_abi_support.asm)
283 pop rbx
284 ; begin epilog
285 pop rdi
286 pop rsi
287 RESTORE_XMM
288 UNSHADOW_ARGS
289 pop rbp
290 ret
291
292 ;void vp9_filter_block1d8_h8_ssse3 -- horizontal 8-tap filter, 8 output pixels per row, one row per loop pass
293 ;(
294 ; unsigned char *src_ptr, pixel 0 of the first row; each row is read from src_ptr - 3 through src_ptr + 12
295 ; unsigned int src_pixels_per_line, byte step between source rows (read with movsxd below)
296 ; unsigned char *output_ptr, 8 bytes are stored here per row
297 ; unsigned int output_pitch, byte step between output rows
298 ; unsigned int output_height, row count; the loop tests after the first pass, so 0 is not handled
299 ; short *filter 8 word-sized taps, read with movdqa (needs 16-byte alignment)
300 ;)
301 global sym(vp9_filter_block1d8_h8_ssse3)
302 sym(vp9_filter_block1d8_h8_ssse3):
303 push rbp
304 mov rbp, rsp
305 SHADOW_ARGS_TO_STACK 6 ; macro from x86_abi_support.asm (not shown here); presumably makes the 6 C arguments reachable via arg(n)
306 SAVE_XMM 7 ; macro from x86_abi_support.asm; presumably preserves xmm6-xmm7 where the ABI requires it -- TODO confirm
307 GET_GOT rbx ; macro from x86_abi_support.asm; presumably sets up rbx for the GLOBAL() table references in PIC builds -- TODO confirm
308 push rsi
309 push rdi
310 ; end prolog
311 ; ---- scratch area: four broadcast tap pairs (the krd slot is reserved but not written here) ----
312 ALIGN_STACK 16, rax ; the movdqa accesses to the k* slots need a 16-byte aligned rsp; undone by "pop rsp" in the epilogue
313 sub rsp, 16*5 ; five 16-byte slots
314 %define k0k1 [rsp + 16*0]
315 %define k2k3 [rsp + 16*1]
316 %define k4k5 [rsp + 16*2]
317 %define k6k7 [rsp + 16*3]
318 %define krd [rsp + 16*4]
319
320 mov rdx, arg(5) ;filter ptr
321 mov rsi, arg(0) ;src_ptr
322 mov rdi, arg(2) ;output_ptr
323 mov rcx, 0x0400040 ; i.e. 0x00400040: two 16-bit words of 64, the rounding term for the >>7 below
324
325 movdqa xmm4, [rdx] ;load filters (8 words, aligned load)
326 movd xmm5, rcx
327 packsswb xmm4, xmm4 ; saturate the word taps to signed bytes: k0..k7 in the low 8 bytes
328 pshuflw xmm0, xmm4, 0b ;k0_k1 (byte pair replicated into the low 4 words)
329 pshuflw xmm1, xmm4, 01010101b ;k2_k3
330 pshuflw xmm2, xmm4, 10101010b ;k4_k5
331 pshuflw xmm3, xmm4, 11111111b ;k6_k7
332
333 punpcklqdq xmm0, xmm0 ; duplicate the low qword: the pair now fills all 8 words
334 punpcklqdq xmm1, xmm1
335 punpcklqdq xmm2, xmm2
336 punpcklqdq xmm3, xmm3
337
338 movdqa k0k1, xmm0
339 movdqa k2k3, xmm1
340 pshufd xmm5, xmm5, 0 ; broadcast dword 0: 64 in every word
341 movdqa k4k5, xmm2
342 movdqa k6k7, xmm3
343 ; movdqa krd, xmm5 -- store disabled: the loop below never writes xmm5, so the rounding constant is used straight from the register
344
345 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
346 movsxd rdx, dword ptr arg(3) ;output_pitch
347 movsxd rcx, dword ptr arg(4) ;output_height = loop counter
348
349 .filter_block1d8_h8_rowloop_ssse3:
350 movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
351
352 ; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11
353 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
354 ;note: if we create a k0_k7 filter, we can save a pshufb
355 ; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
356 punpcklqdq xmm0, xmm3 ; xmm0 = pixels -3..12 in byte order
357
358 movdqa xmm1, xmm0
359 pshufb xmm0, [GLOBAL(shuf_t0t1)] ; byte pairs (p[i-3], p[i-2]) for outputs i = 0..7
360 pmaddubsw xmm0, k0k1 ; word i = p[i-3]*k0 + p[i-2]*k1
361
362 movdqa xmm2, xmm1
363 pshufb xmm1, [GLOBAL(shuf_t2t3)] ; pairs (p[i-1], p[i])
364 pmaddubsw xmm1, k2k3
365
366 movdqa xmm4, xmm2
367 pshufb xmm2, [GLOBAL(shuf_t4t5)] ; pairs (p[i+1], p[i+2])
368 pmaddubsw xmm2, k4k5
369
370 pshufb xmm4, [GLOBAL(shuf_t6t7)] ; pairs (p[i+3], p[i+4])
371 pmaddubsw xmm4, k6k7
372
373 paddsw xmm0, xmm1 ; partial sums are combined with signed saturation
374 paddsw xmm0, xmm2
375 paddsw xmm0, xmm5 ; + 64 for rounding (register copy of the constant)
376 paddsw xmm0, xmm4
377 psraw xmm0, 7 ; arithmetic shift right by 7
378 packuswb xmm0, xmm0 ; saturate the words to unsigned bytes
379
380 lea rsi, [rsi + rax] ; next source row
381 movq [rdi], xmm0 ; store 8 output pixels
382
383 lea rdi, [rdi + rdx] ; next output row
384 dec rcx
385 jnz .filter_block1d8_h8_rowloop_ssse3
386
387 add rsp, 16*5 ; release the five scratch slots
388 pop rsp ; undo ALIGN_STACK (presumably it left the pre-alignment rsp on the stack -- see x86_abi_support.asm)
389
390 ; begin epilog
391 pop rdi
392 pop rsi
393 RESTORE_GOT
394 RESTORE_XMM
395 UNSHADOW_ARGS
396 pop rbp
397 ret
398
399 ;void vp9_filter_block1d16_h8_ssse3 -- horizontal 8-tap filter, 16 output pixels per row, computed as two 8-pixel halves
400 ;(
401 ; unsigned char *src_ptr, pixel 0 of the first row; each row is read from src_ptr - 3 through src_ptr + 20
402 ; unsigned int src_pixels_per_line, byte step between source rows (read with movsxd below)
403 ; unsigned char *output_ptr, 16 bytes are stored here per row; no alignment is required (unaligned store)
404 ; unsigned int output_pitch, byte step between output rows
405 ; unsigned int output_height, row count; the loop tests after the first pass, so 0 is not handled
406 ; short *filter 8 word-sized taps, read with movdqa (needs 16-byte alignment)
407 ;)
408 global sym(vp9_filter_block1d16_h8_ssse3)
409 sym(vp9_filter_block1d16_h8_ssse3):
410 push rbp
411 mov rbp, rsp
412 SHADOW_ARGS_TO_STACK 6 ; macro from x86_abi_support.asm (not shown here); presumably makes the 6 C arguments reachable via arg(n)
413 SAVE_XMM 7 ; macro from x86_abi_support.asm; presumably preserves xmm6-xmm7 where the ABI requires it -- TODO confirm
414 GET_GOT rbx ; macro from x86_abi_support.asm; presumably sets up rbx for the GLOBAL() table references in PIC builds -- TODO confirm
415 push rsi
416 push rdi
417 ; end prolog
418 ; ---- scratch area: four broadcast tap pairs plus the rounding constant ----
419 ALIGN_STACK 16, rax ; the movdqa accesses to the k* slots need a 16-byte aligned rsp; undone by "pop rsp" in the epilogue
420 sub rsp, 16*5 ; five 16-byte slots
421 %define k0k1 [rsp + 16*0]
422 %define k2k3 [rsp + 16*1]
423 %define k4k5 [rsp + 16*2]
424 %define k6k7 [rsp + 16*3]
425 %define krd [rsp + 16*4]
426
427 mov rdx, arg(5) ;filter ptr
428 mov rsi, arg(0) ;src_ptr
429 mov rdi, arg(2) ;output_ptr
430 mov rcx, 0x0400040 ; i.e. 0x00400040: two 16-bit words of 64, the rounding term for the >>7 below
431
432 movdqa xmm4, [rdx] ;load filters (8 words, aligned load)
433 movd xmm5, rcx
434 packsswb xmm4, xmm4 ; saturate the word taps to signed bytes: k0..k7 in the low 8 bytes
435 pshuflw xmm0, xmm4, 0b ;k0_k1 (byte pair replicated into the low 4 words)
436 pshuflw xmm1, xmm4, 01010101b ;k2_k3
437 pshuflw xmm2, xmm4, 10101010b ;k4_k5
438 pshuflw xmm3, xmm4, 11111111b ;k6_k7
439
440 punpcklqdq xmm0, xmm0 ; duplicate the low qword: the pair now fills all 8 words
441 punpcklqdq xmm1, xmm1
442 punpcklqdq xmm2, xmm2
443 punpcklqdq xmm3, xmm3
444
445 movdqa k0k1, xmm0
446 movdqa k2k3, xmm1
447 pshufd xmm5, xmm5, 0 ; broadcast dword 0: 64 in every word
448 movdqa k4k5, xmm2
449 movdqa k6k7, xmm3
450 movdqa krd, xmm5
451
452 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
453 movsxd rdx, dword ptr arg(3) ;output_pitch
454 movsxd rcx, dword ptr arg(4) ;output_height = loop counter
455
456 .filter_block1d16_h8_rowloop_ssse3:
457 movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
458
459 ; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11
460 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
461 ;note: if we create a k0_k7 filter, we can save a pshufb
462 ; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
463 punpcklqdq xmm0, xmm3 ; xmm0 = pixels -3..12 in byte order (left half window)
464
465 movdqa xmm1, xmm0
466 pshufb xmm0, [GLOBAL(shuf_t0t1)] ; byte pairs (p[i-3], p[i-2]) for outputs i = 0..7
467 pmaddubsw xmm0, k0k1 ; word i = p[i-3]*k0 + p[i-2]*k1
468
469 movdqa xmm2, xmm1
470 pshufb xmm1, [GLOBAL(shuf_t2t3)] ; pairs (p[i-1], p[i])
471 pmaddubsw xmm1, k2k3
472
473 movdqa xmm4, xmm2
474 pshufb xmm2, [GLOBAL(shuf_t4t5)] ; pairs (p[i+1], p[i+2])
475 pmaddubsw xmm2, k4k5
476
477 pshufb xmm4, [GLOBAL(shuf_t6t7)] ; pairs (p[i+3], p[i+4])
478 pmaddubsw xmm4, k6k7
479
480 paddsw xmm0, xmm1 ; NOTE(review): this half adds k6k7 before k4k5 and the rounding term last; the right half below
481 paddsw xmm0, xmm4 ; uses a different order. With saturating adds the two orders can differ when an
482 paddsw xmm0, xmm2 ; intermediate sum clips -- left unchanged here, confirm which order is intended
483 paddsw xmm0, krd ; + 64 for rounding
484 psraw xmm0, 7 ; arithmetic shift right by 7
485 packuswb xmm0, xmm0 ; saturate the words to unsigned bytes: outputs 0..7 in the low qword
486
487 ; ---- right half: outputs 8..15 from pixels 5..20 ----
488 movq xmm3, [rsi + 5] ; NOTE(review): xmm3 still holds these 8 bytes from line 460, so this reload looks redundant
489 ; movq xmm7, [rsi + 12]
490 movq xmm7, [rsi + 13] ; 13 14 15 16 17 18 19 20
491 ;note: same as above
492 ; punpcklbw xmm3, xmm7
493 punpcklqdq xmm3, xmm7 ; xmm3 = pixels 5..20 in byte order
494
495 movdqa xmm1, xmm3
496 pshufb xmm3, [GLOBAL(shuf_t0t1)]
497 pmaddubsw xmm3, k0k1
498
499 movdqa xmm2, xmm1
500 pshufb xmm1, [GLOBAL(shuf_t2t3)]
501 pmaddubsw xmm1, k2k3
502
503 movdqa xmm4, xmm2
504 pshufb xmm2, [GLOBAL(shuf_t4t5)]
505 pmaddubsw xmm2, k4k5
506
507 pshufb xmm4, [GLOBAL(shuf_t6t7)]
508 pmaddubsw xmm4, k6k7
509
510 paddsw xmm3, xmm1 ; partial sums are combined with signed saturation
511 paddsw xmm3, xmm2
512 paddsw xmm3, krd ; + 64 for rounding
513 paddsw xmm3, xmm4
514 psraw xmm3, 7
515 packuswb xmm3, xmm3 ; outputs 8..15 in the low qword
516 punpcklqdq xmm0, xmm3 ; xmm0 = outputs 0..15
517
518 lea rsi, [rsi + rax] ; next source row
519 movdqu [rdi], xmm0 ; unaligned store: the sibling routines in this file store with movq and need no output alignment, so neither should this one
520
521 lea rdi, [rdi + rdx] ; next output row
522 dec rcx
523 jnz .filter_block1d16_h8_rowloop_ssse3
524
525 add rsp, 16*5 ; release the five scratch slots
526 pop rsp ; undo ALIGN_STACK (presumably it left the pre-alignment rsp on the stack -- see x86_abi_support.asm)
527
528 ; begin epilog
529 pop rdi
530 pop rsi
531 RESTORE_GOT
532 RESTORE_XMM
533 UNSHADOW_ARGS
534 pop rbp
535 ret
536
537
538 SECTION_RODATA
539 align 16 ; pshufb masks for the h8 routines; each table is 16-byte aligned because it is used as a memory operand
540 shuf_t0t1: ; entry i selects source bytes (i, i+1): the two pixels multiplied by taps k0 and k1 for output i
541 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
542 align 16
543 shuf_t2t3: ; same pattern shifted by 2 bytes: source bytes (i+2, i+3) for taps k2 and k3
544 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
545 align 16
546 shuf_t4t5: ; source bytes (i+4, i+5) for taps k4 and k5
547 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
548 align 16
549 shuf_t6t7: ; source bytes (i+6, i+7) for taps k6 and k7; the highest index used is 14
550 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698