Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(224)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm

Issue 592203002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 %include "vpx_ports/x86_abi_support.asm"
12
;-----------------------------------------------------------------------
; HIGH_GET_PARAM_4
; Reads the seven stack arguments and prepares the constants consumed by
; HIGH_APPLY_FILTER_4.
;
; State on exit:
;   rsi  = src_ptr            (arg 0)
;   rax  = pixels_per_line    (arg 1, loaded with movsxd)
;   rdi  = output_ptr         (arg 2)
;   rdx  = out_pitch          (arg 3, loaded with movsxd)
;   rcx  = output_height      (arg 4, loaded with movsxd)
;   xmm4 = filter words 3 and 4 interleaved: k3,k4,k3,k4,k3,k4,k3,k4
;   xmm3 = 64 in every dword  (rounding term for the later >> 7)
;   xmm5 = (1 << bps) - 1 in every word (upper clamp, bps = arg 6)
;   xmm2 = 0                  (lower clamp)
; Scratch: xmm1.
;-----------------------------------------------------------------------
%macro HIGH_GET_PARAM_4 0
    mov         rdx, arg(5)                 ; filter table
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; output_ptr

    ; Build the k3/k4 tap pair.
    movdqa      xmm3, [rdx]                 ; eight filter words (aligned load)
    pshuflw     xmm4, xmm3, 0xff            ; low four words <- word 3 (k3)
    psrldq      xmm3, 8                     ; move words 4..7 to the low half
    pshuflw     xmm3, xmm3, 0               ; low four words <- word 4 (k4)
    punpcklwd   xmm4, xmm3                  ; k3,k4 repeated four times

    ; Rounding constant: 64 in each dword.
    mov         rcx, 64
    movq        xmm3, rcx
    pshufd      xmm3, xmm3, 0

    ; Upper clamp: (1 << bps) - 1 in each word.
    mov         rdx, 0x00010001             ; 1 in both words of the low dword
    movq        xmm5, rdx
    pshufd      xmm5, xmm5, 0               ; 1 in all eight words
    movdqa      xmm1, xmm5                  ; keep the ones for the subtract
    movsxd      rcx, DWORD PTR arg(6)       ; bps
    movq        xmm2, rcx                   ; shift count
    psllw       xmm5, xmm2                  ; 1 << bps
    psubw       xmm5, xmm1                  ; (1 << bps) - 1

    ; Lower clamp.
    pxor        xmm2, xmm2

    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endmacro
42
;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 do_avg
; Filters one row of four 16-bit samples and stores it at [rdi].
;
; Expects:
;   xmm0 / xmm1 = first / second input sample for each output (low 4 words)
;   xmm4 = k3,k4 tap pairs      xmm3 = rounding (64 per dword)
;   xmm5 = upper clamp          xmm2 = lower clamp (zero)
;   rax  = source stride, rdx = destination stride, both in 2-byte units
;   rcx  = rows remaining
; Per sample: clamp((a*k3 + b*k4 + 64) >> 7, xmm2, xmm5).
; When do_avg (%1) is non-zero the result is averaged (pavgw) with what
; is already stored at [rdi].
;
; On exit rsi/rdi point at the next row and rcx has been decremented;
; `dec rcx` is deliberately the last flag-writing instruction so the
; caller can branch on ZF. Clobbers xmm0 (and xmm1 when do_avg).
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1
    lea         rsi, [rsi + 2*rax]          ; inputs are already in registers

    punpcklwd   xmm0, xmm1                  ; pair up (a, b) per output sample
    pmaddwd     xmm0, xmm4                  ; a*k3 + b*k4 as dwords
    paddd       xmm0, xmm3                  ; + 64
    psrad       xmm0, 7                     ; >> 7
    packssdw    xmm0, xmm0                  ; back to words (signed saturation)

    pminsw      xmm0, xmm5                  ; upper clamp
    pmaxsw      xmm0, xmm2                  ; lower clamp

%if %1
    movq        xmm1, [rdi]                 ; current destination samples
    pavgw       xmm0, xmm1                  ; rounded average with them
%endif

    movq        [rdi], xmm0                 ; four samples = 8 bytes
    lea         rdi, [rdi + 2*rdx]
    dec         rcx                         ; sets ZF for the caller's jnz
%endmacro
66
67 %if ARCH_X86_64
;-----------------------------------------------------------------------
; HIGH_GET_PARAM   (x86-64 only: uses xmm8)
; Reads the seven stack arguments and prepares the constants consumed by
; HIGH_APPLY_FILTER_8 and HIGH_APPLY_FILTER_16.
;
; State on exit:
;   rsi  = src_ptr            (arg 0)
;   rax  = pixels_per_line    (arg 1, loaded with movsxd)
;   rdi  = output_ptr         (arg 2)
;   rdx  = out_pitch          (arg 3, loaded with movsxd)
;   rcx  = output_height      (arg 4, loaded with movsxd)
;   xmm7 = filter words 3 and 4 interleaved: k3,k4,k3,k4,k3,k4,k3,k4
;   xmm4 = 64 in every dword  (rounding term for the later >> 7)
;   xmm8 = (1 << bps) - 1 in every word (upper clamp, bps = arg 6)
;   xmm5 = 0                  (lower clamp)
; Scratch: xmm1, xmm6.
;-----------------------------------------------------------------------
%macro HIGH_GET_PARAM 0
    mov         rdx, arg(5)                 ; filter table
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; output_ptr

    ; Build the k3/k4 tap pair.
    movdqa      xmm6, [rdx]                 ; eight filter words (aligned load)
    pshuflw     xmm7, xmm6, 0xff            ; low four words <- word 3 (k3)
    pshufhw     xmm6, xmm6, 0               ; high four words <- word 4 (k4)
    psrldq      xmm6, 8                     ; move the k4 copies to the low half
    punpcklwd   xmm7, xmm6                  ; k3,k4 repeated four times

    ; Rounding constant: 64 in each dword.
    mov         rcx, 64
    movq        xmm4, rcx
    pshufd      xmm4, xmm4, 0

    ; Upper clamp: (1 << bps) - 1 in each word.
    mov         rdx, 0x00010001             ; 1 in both words of the low dword
    movq        xmm8, rdx
    pshufd      xmm8, xmm8, 0               ; 1 in all eight words
    movdqa      xmm1, xmm8                  ; keep the ones for the subtract
    movsxd      rcx, DWORD PTR arg(6)       ; bps
    movq        xmm5, rcx                   ; shift count
    psllw       xmm8, xmm5                  ; 1 << bps
    psubw       xmm8, xmm1                  ; (1 << bps) - 1

    ; Lower clamp.
    pxor        xmm5, xmm5

    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endmacro
98
;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 do_avg
; Filters one row of eight 16-bit samples and stores it at [rdi].
;
; Expects:
;   xmm0 / xmm1 = first / second input sample for each of the 8 outputs
;   xmm7 = k3,k4 tap pairs      xmm4 = rounding (64 per dword)
;   xmm8 = upper clamp          xmm5 = lower clamp (zero)
;   rax  = source stride, rdx = destination stride, both in 2-byte units
;   rcx  = rows remaining
; Per sample: clamp((a*k3 + b*k4 + 64) >> 7, xmm5, xmm8).
; When do_avg (%1) is non-zero the result is averaged (pavgw) with what
; is already stored at [rdi].
;
; On exit rsi/rdi point at the next row and rcx has been decremented;
; `dec rcx` is deliberately the last flag-writing instruction so the
; caller can branch on ZF. Clobbers xmm0, xmm6 (and xmm1 when do_avg).
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 1
    lea         rsi, [rsi + 2*rax]          ; inputs are already in registers

    movdqa      xmm6, xmm0                  ; copy before xmm0 is overwritten
    punpckhwd   xmm6, xmm1                  ; (a, b) pairs for samples 4..7
    punpcklwd   xmm0, xmm1                  ; (a, b) pairs for samples 0..3

    pmaddwd     xmm0, xmm7                  ; a*k3 + b*k4 as dwords
    pmaddwd     xmm6, xmm7
    paddd       xmm0, xmm4                  ; + 64
    paddd       xmm6, xmm4
    psrad       xmm0, 7                     ; >> 7
    psrad       xmm6, 7
    packssdw    xmm0, xmm6                  ; eight words (signed saturation)

    pminsw      xmm0, xmm8                  ; upper clamp
    pmaxsw      xmm0, xmm5                  ; lower clamp

%if %1
    movdqu      xmm1, [rdi]                 ; current destination samples
    pavgw       xmm0, xmm1                  ; rounded average with them
%endif

    movdqu      [rdi], xmm0                 ; eight samples = 16 bytes
    lea         rdi, [rdi + 2*rdx]
    dec         rcx                         ; sets ZF for the caller's jnz
%endmacro
126
;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_16 do_avg
; Filters one row of sixteen 16-bit samples and stores it at [rdi].
;
; Expects:
;   xmm0 / xmm1 = first / second input sample for outputs 0..7
;   xmm2 / xmm3 = first / second input sample for outputs 8..15
;   xmm7 = k3,k4 tap pairs      xmm4 = rounding (64 per dword)
;   xmm8 = upper clamp          xmm5 = lower clamp (zero)
;   rax  = source stride, rdx = destination stride, both in 2-byte units
;   rcx  = rows remaining
; Per sample: clamp((a*k3 + b*k4 + 64) >> 7, xmm5, xmm8).
; When do_avg (%1) is non-zero the result is averaged (pavgw) with what
; is already stored at [rdi] .. [rdi + 31].
;
; On exit rsi/rdi point at the next row and rcx has been decremented;
; `dec rcx` is deliberately the last flag-writing instruction so the
; caller can branch on ZF. Clobbers xmm0, xmm2, xmm6, xmm9 (and xmm1,
; xmm3 when do_avg).
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_16 1
    lea         rsi, [rsi + 2*rax]          ; inputs are already in registers

    movdqa      xmm6, xmm2                  ; copies taken before the low
    movdqa      xmm9, xmm0                  ;   unpacks overwrite xmm0/xmm2
    punpckhwd   xmm6, xmm3                  ; (a, b) pairs, samples 12..15
    punpckhwd   xmm9, xmm1                  ; (a, b) pairs, samples 4..7
    punpcklwd   xmm2, xmm3                  ; (a, b) pairs, samples 8..11
    punpcklwd   xmm0, xmm1                  ; (a, b) pairs, samples 0..3

    ; a*k3 + b*k4 as dwords
    pmaddwd     xmm0, xmm7
    pmaddwd     xmm2, xmm7
    pmaddwd     xmm9, xmm7
    pmaddwd     xmm6, xmm7

    ; + 64
    paddd       xmm0, xmm4
    paddd       xmm2, xmm4
    paddd       xmm9, xmm4
    paddd       xmm6, xmm4

    ; >> 7
    psrad       xmm0, 7
    psrad       xmm2, 7
    psrad       xmm9, 7
    psrad       xmm6, 7

    ; back to words (signed saturation)
    packssdw    xmm0, xmm9                  ; samples 0..7
    packssdw    xmm2, xmm6                  ; samples 8..15

    ; clamp both halves to [xmm5, xmm8]
    pminsw      xmm0, xmm8
    pminsw      xmm2, xmm8
    pmaxsw      xmm0, xmm5
    pmaxsw      xmm2, xmm5

%if %1
    movdqu      xmm1, [rdi]                 ; current destination, samples 0..7
    pavgw       xmm0, xmm1
    movdqu      xmm3, [rdi + 16]            ; current destination, samples 8..15
    pavgw       xmm2, xmm3
%endif

    movdqu      [rdi], xmm0
    movdqu      [rdi + 16], xmm2
    lea         rdi, [rdi + 2*rdx]
    dec         rcx                         ; sets ZF for the caller's jnz
%endmacro
172 %endif
173
;-----------------------------------------------------------------------
; vp9_high_filter_block1d4_v2_sse2
; Vertical 2-tap filter over a 4-sample-wide column of 16-bit samples.
; Arguments (read by HIGH_GET_PARAM_4): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each output row combines source row r and row r+1 and overwrites the
; destination row.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d4_v2_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v2_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi

    HIGH_GET_PARAM_4

.next_row:
    movq        xmm1, [rsi + 2*rax]         ; row below (stride in 2-byte units)
    movq        xmm0, [rsi]                 ; current row

    HIGH_APPLY_FILTER_4 0                   ; store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
197
198 %if ARCH_X86_64
;-----------------------------------------------------------------------
; vp9_high_filter_block1d8_v2_sse2   (x86-64 only)
; Vertical 2-tap filter over an 8-sample-wide column of 16-bit samples.
; Arguments (read by HIGH_GET_PARAM): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each output row combines source row r and row r+1 and overwrites the
; destination row. SAVE_XMM 8 / RESTORE_XMM bracket the use of xmm6-xmm8.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d8_v2_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v2_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ; row below (stride in 2-byte units)
    movdqu      xmm0, [rsi]                 ; current row

    HIGH_APPLY_FILTER_8 0                   ; store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
224
;-----------------------------------------------------------------------
; vp9_high_filter_block1d16_v2_sse2   (x86-64 only)
; Vertical 2-tap filter over a 16-sample-wide column of 16-bit samples.
; Arguments (read by HIGH_GET_PARAM): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each output row combines source row r and row r+1 and overwrites the
; destination row. SAVE_XMM 9 / RESTORE_XMM bracket the use of xmm6-xmm9.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d16_v2_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v2_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm0, [rsi]                 ; current row, samples 0..7
    movdqu      xmm1, [rsi + 2*rax]         ; row below,   samples 0..7
    movdqu      xmm2, [rsi + 16]            ; current row, samples 8..15
    movdqu      xmm3, [rsi + 2*rax + 16]    ; row below,   samples 8..15

    HIGH_APPLY_FILTER_16 0                  ; store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
252 %endif
253
;-----------------------------------------------------------------------
; vp9_high_filter_block1d4_v2_avg_sse2
; Vertical 2-tap filter over a 4-sample-wide column of 16-bit samples,
; averaged into the destination.
; Arguments (read by HIGH_GET_PARAM_4): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each filtered row (source row r combined with row r+1) is averaged
; with the samples already stored in the destination row.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d4_v2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v2_avg_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi

    HIGH_GET_PARAM_4

.next_row:
    movq        xmm1, [rsi + 2*rax]         ; row below (stride in 2-byte units)
    movq        xmm0, [rsi]                 ; current row

    HIGH_APPLY_FILTER_4 1                   ; average+store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
277
278 %if ARCH_X86_64
;-----------------------------------------------------------------------
; vp9_high_filter_block1d8_v2_avg_sse2   (x86-64 only)
; Vertical 2-tap filter over an 8-sample-wide column of 16-bit samples,
; averaged into the destination.
; Arguments (read by HIGH_GET_PARAM): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each filtered row (source row r combined with row r+1) is averaged
; with the samples already stored in the destination row.
; SAVE_XMM 8 / RESTORE_XMM bracket the use of xmm6-xmm8.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d8_v2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v2_avg_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ; row below (stride in 2-byte units)
    movdqu      xmm0, [rsi]                 ; current row

    HIGH_APPLY_FILTER_8 1                   ; average+store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
304
;-----------------------------------------------------------------------
; vp9_high_filter_block1d16_v2_avg_sse2   (x86-64 only)
; Vertical 2-tap filter over a 16-sample-wide column of 16-bit samples,
; averaged into the destination.
; Arguments (read by HIGH_GET_PARAM): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each filtered row (source row r combined with row r+1) is averaged
; with the samples already stored in the destination row.
; SAVE_XMM 9 / RESTORE_XMM bracket the use of xmm6-xmm9.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d16_v2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v2_avg_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm0, [rsi]                 ; current row, samples 0..7
    movdqu      xmm2, [rsi + 16]            ; current row, samples 8..15
    movdqu      xmm1, [rsi + 2*rax]         ; row below,   samples 0..7
    movdqu      xmm3, [rsi + 2*rax + 16]    ; row below,   samples 8..15

    HIGH_APPLY_FILTER_16 1                  ; average+store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
332 %endif
333
;-----------------------------------------------------------------------
; vp9_high_filter_block1d4_h2_sse2
; Horizontal 2-tap filter producing 4 16-bit samples per row.
; Arguments (read by HIGH_GET_PARAM_4): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each output sample combines source sample x and sample x+1 of the same
; row and overwrites the destination.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d4_h2_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h2_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi

    HIGH_GET_PARAM_4

.next_row:
    ; One 16-byte load supplies both operands; only samples 0..4 of the
    ; eight loaded are consumed by the 4-wide filter macro.
    movdqu      xmm0, [rsi]                 ; samples x
    movdqa      xmm1, xmm0
    psrldq      xmm1, 2                     ; samples x+1 (shift by one word)

    HIGH_APPLY_FILTER_4 0                   ; store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
358
359 %if ARCH_X86_64
;-----------------------------------------------------------------------
; vp9_high_filter_block1d8_h2_sse2   (x86-64 only)
; Horizontal 2-tap filter producing 8 16-bit samples per row.
; Arguments (read by HIGH_GET_PARAM): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each output sample combines source sample x and sample x+1 of the same
; row and overwrites the destination.
; SAVE_XMM 8 / RESTORE_XMM bracket the use of xmm6-xmm8.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d8_h2_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h2_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm1, [rsi + 2]             ; samples x+1 (one word further on)
    movdqu      xmm0, [rsi]                 ; samples x

    HIGH_APPLY_FILTER_8 0                   ; store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
385
;-----------------------------------------------------------------------
; vp9_high_filter_block1d16_h2_sse2   (x86-64 only)
; Horizontal 2-tap filter producing 16 16-bit samples per row.
; Arguments (read by HIGH_GET_PARAM): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each output sample combines source sample x and sample x+1 of the same
; row and overwrites the destination.
; SAVE_XMM 9 / RESTORE_XMM bracket the use of xmm6-xmm9.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d16_h2_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h2_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm0, [rsi]                 ; samples x,   outputs 0..7
    movdqu      xmm2, [rsi + 16]            ; samples x,   outputs 8..15
    movdqu      xmm1, [rsi + 2]             ; samples x+1, outputs 0..7
    movdqu      xmm3, [rsi + 18]            ; samples x+1, outputs 8..15

    HIGH_APPLY_FILTER_16 0                  ; store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
413 %endif
414
;-----------------------------------------------------------------------
; vp9_high_filter_block1d4_h2_avg_sse2
; Horizontal 2-tap filter producing 4 16-bit samples per row, averaged
; into the destination.
; Arguments (read by HIGH_GET_PARAM_4): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each filtered sample (source sample x combined with sample x+1) is
; averaged with the sample already stored in the destination.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d4_h2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h2_avg_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi

    HIGH_GET_PARAM_4

.next_row:
    ; One 16-byte load supplies both operands; only samples 0..4 of the
    ; eight loaded are consumed by the 4-wide filter macro.
    movdqu      xmm0, [rsi]                 ; samples x
    movdqa      xmm1, xmm0
    psrldq      xmm1, 2                     ; samples x+1 (shift by one word)

    HIGH_APPLY_FILTER_4 1                   ; average+store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
439
440 %if ARCH_X86_64
;-----------------------------------------------------------------------
; vp9_high_filter_block1d8_h2_avg_sse2   (x86-64 only)
; Horizontal 2-tap filter producing 8 16-bit samples per row, averaged
; into the destination.
; Arguments (read by HIGH_GET_PARAM): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each filtered sample (source sample x combined with sample x+1) is
; averaged with the sample already stored in the destination.
; SAVE_XMM 8 / RESTORE_XMM bracket the use of xmm6-xmm8.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d8_h2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h2_avg_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm1, [rsi + 2]             ; samples x+1 (one word further on)
    movdqu      xmm0, [rsi]                 ; samples x

    HIGH_APPLY_FILTER_8 1                   ; average+store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
466
;-----------------------------------------------------------------------
; vp9_high_filter_block1d16_h2_avg_sse2   (x86-64 only)
; Horizontal 2-tap filter producing 16 16-bit samples per row, averaged
; into the destination.
; Arguments (read by HIGH_GET_PARAM): arg0 src_ptr, arg1 pixels_per_line,
; arg2 output_ptr, arg3 out_pitch, arg4 output_height, arg5 filter ptr,
; arg6 bps.
; Each filtered sample (source sample x combined with sample x+1) is
; averaged with the sample already stored in the destination.
; SAVE_XMM 9 / RESTORE_XMM bracket the use of xmm6-xmm9.
;-----------------------------------------------------------------------
global sym(vp9_high_filter_block1d16_h2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h2_avg_sse2):
    ; ---- prolog ----
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi

    HIGH_GET_PARAM

.next_row:
    movdqu      xmm0, [rsi]                 ; samples x,   outputs 0..7
    movdqu      xmm2, [rsi + 16]            ; samples x,   outputs 8..15
    movdqu      xmm1, [rsi + 2]             ; samples x+1, outputs 0..7
    movdqu      xmm3, [rsi + 18]            ; samples x+1, outputs 8..15

    HIGH_APPLY_FILTER_16 1                  ; average+store, advance, dec rcx
    jnz         .next_row                   ; until output_height rows are done

    ; ---- epilog ----
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
494 %endif
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm ('k') | source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698