Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(79)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm

Issue 168343002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: libvpx: Pull from upstream Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 %include "vpx_ports/x86_abi_support.asm"
12
; GET_PARAM_4 -- common setup for the 4-pixel-wide routines in this file.
; State left behind by the instructions below:
;   rsi  = arg(0) src_ptr            rdi = arg(2) output_ptr
;   rax  = arg(1) pixels_per_line    rdx = arg(3) out_pitch
;   rcx  = arg(4) output_height      (rax/rdx/rcx sign-extended from dwords)
;   xmm4 = filter word 3 in its four low words, filter word 4 in its four high words
;   xmm3 = 0x0040 (64) in every word, used as the rounding term
;   xmm2 = zero
; rdx and rcx are used as scratch first (filter pointer, rounding constant)
; and only afterwards receive their final values.
; arg() is not defined in this file; presumably it comes from the included
; vpx_ports/x86_abi_support.asm -- confirm there.
13 %macro GET_PARAM_4 0
14 mov rdx, arg(5) ;filter ptr
15 mov rsi, arg(0) ;src_ptr
16 mov rdi, arg(2) ;output_ptr
17 mov rcx, 0x0400040 ;= 0x00400040: 64 in each of the two low words
18
19 movdqa xmm3, [rdx] ;load filters (movdqa: address must be 16-byte aligned)
20 pshuflw xmm4, xmm3, 11111111b ;k3: filter word 3 copied to the four low words
21 psrldq xmm3, 8 ;bring filter words 4..7 down to the low half
22 pshuflw xmm3, xmm3, 0b ;k4: filter word 4 copied to the four low words
23 punpcklqdq xmm4, xmm3 ;k3k4: low qword = k3 x4, high qword = k4 x4
24
25 movq xmm3, rcx ;rounding
26 pshufd xmm3, xmm3, 0 ;replicate dword 0, so every word holds 64
27
28 pxor xmm2, xmm2 ;zero, used later to widen bytes to words
29
30 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
31 movsxd rdx, DWORD PTR arg(3) ;out_pitch
32 movsxd rcx, DWORD PTR arg(4) ;output_height
33 %endm
34
; APPLY_FILTER_4 avg -- filter and store one 4-pixel output row.
; Parameter %1: when nonzero, the result is averaged (pavgb) with the 4 bytes
;               already stored at [rdi] before being written back.
; On entry the low dword of xmm0 and of xmm1 hold the two 4-byte inputs.
; Uses xmm2, xmm3, xmm4, rax and rdx as set up by GET_PARAM_4 in this file.
; Per output byte: (a*k3 + b*k4 + 64) >> 7, with saturating adds and an
; unsigned-saturating pack to bytes.
; On exit rsi += rax, rdi += rdx, rcx is decremented; the flags come from
; that final dec, which is what the callers' jnz tests.
35 %macro APPLY_FILTER_4 1
36
37 punpckldq xmm0, xmm1 ;two rows in one register: low qword = [xmm0 dword 0, xmm1 dword 0]
38 punpcklbw xmm0, xmm2 ;unpack to word (interleave with xmm2, expected to be zero)
39 pmullw xmm0, xmm4 ;multiply the filter factors: low 4 words by k3, high 4 words by k4
40
41 movdqa xmm1, xmm0
42 psrldq xmm1, 8 ;move the k4 products down to the low half
43 paddsw xmm0, xmm1 ;low 4 words = k3 product + k4 product (signed saturating)
44
45 paddsw xmm0, xmm3 ;rounding
46 psraw xmm0, 7 ;shift
47 packuswb xmm0, xmm0 ;pack to byte
48
49 %if %1
50 movd xmm1, [rdi] ;4 bytes currently in the output row
51 pavgb xmm0, xmm1 ;rounded byte average with the existing output
52 %endif
53
54 movd [rdi], xmm0 ;store 4 result bytes
55 lea rsi, [rsi + rax] ;advance source by pixels_per_line
56 lea rdi, [rdi + rdx] ;advance output by out_pitch
57 dec rcx ;one row done; sets ZF for the caller's jnz
58 %endm
59
; GET_PARAM -- common setup for the 8- and 16-pixel-wide routines in this file.
; State left behind by the instructions below:
;   rsi  = arg(0) src_ptr            rdi = arg(2) output_ptr
;   rax  = arg(1) pixels_per_line    rdx = arg(3) out_pitch
;   rcx  = arg(4) output_height      (rax/rdx/rcx sign-extended from dwords)
;   xmm6 = filter word 3 replicated to all eight words
;   xmm7 = filter word 4 replicated to all eight words
;   xmm4 = 0x0040 (64) in every word, used as the rounding term
;   xmm5 = zero
; rdx and rcx are used as scratch first (filter pointer, rounding constant)
; and only afterwards receive their final values.
; arg() is not defined in this file; presumably it comes from the included
; vpx_ports/x86_abi_support.asm -- confirm there.
60 %macro GET_PARAM 0
61 mov rdx, arg(5) ;filter ptr
62 mov rsi, arg(0) ;src_ptr
63 mov rdi, arg(2) ;output_ptr
64 mov rcx, 0x0400040 ;= 0x00400040: 64 in each of the two low words
65
66 movdqa xmm7, [rdx] ;load filters (movdqa: address must be 16-byte aligned)
67
68 pshuflw xmm6, xmm7, 11111111b ;k3: filter word 3 copied to the four low words
69 pshufhw xmm7, xmm7, 0b ;k4: filter word 4 copied to the four high words
70 punpcklwd xmm6, xmm6 ;widen the low half: k3 in all eight words
71 punpckhwd xmm7, xmm7 ;widen the high half: k4 in all eight words
72
73 movq xmm4, rcx ;rounding
74 pshufd xmm4, xmm4, 0 ;replicate dword 0, so every word holds 64
75
76 pxor xmm5, xmm5 ;zero, used later to widen bytes to words
77
78 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
79 movsxd rdx, DWORD PTR arg(3) ;out_pitch
80 movsxd rcx, DWORD PTR arg(4) ;output_height
81 %endm
82
; APPLY_FILTER_8 avg -- filter and store one 8-pixel output row.
; Parameter %1: when nonzero, the result is averaged (pavgb) with the 8 bytes
;               already stored at [rdi] before being written back.
; On entry the low 8 bytes of xmm0 and of xmm1 hold the two inputs.
; Uses xmm4..xmm7, rax and rdx as set up by GET_PARAM in this file.
; Per output byte: (a*k3 + b*k4 + 64) >> 7, with saturating adds and an
; unsigned-saturating pack to bytes.
; On exit rsi += rax, rdi += rdx, rcx is decremented; the flags come from
; that final dec, which is what the callers' jnz tests.
83 %macro APPLY_FILTER_8 1
84 punpcklbw xmm0, xmm5 ;widen first input to words (xmm5 expected to be zero)
85 punpcklbw xmm1, xmm5 ;widen second input to words
86
87 pmullw xmm0, xmm6 ;first input * k3
88 pmullw xmm1, xmm7 ;second input * k4
89 paddsw xmm0, xmm1 ;sum of the two products (signed saturating)
90 paddsw xmm0, xmm4 ;rounding
91 psraw xmm0, 7 ;shift
92 packuswb xmm0, xmm0 ;pack back to byte
93 %if %1
94 movq xmm1, [rdi] ;8 bytes currently in the output row
95 pavgb xmm0, xmm1 ;rounded byte average with the existing output
96 %endif
97 movq [rdi], xmm0 ;store the result
98
99 lea rsi, [rsi + rax] ;advance source by pixels_per_line
100 lea rdi, [rdi + rdx] ;advance output by out_pitch
101 dec rcx ;one row done; sets ZF for the caller's jnz
102 %endm
103
; APPLY_FILTER_16 avg -- filter and store one 16-pixel output row.
; Parameter %1: when nonzero, the result is averaged (pavgb) with the 16 bytes
;               already stored at [rdi] before being written back.
; On entry xmm0 and xmm2 are unpacked as the low and high 8 bytes of the
; first input, xmm1 and xmm3 as the low and high 8 bytes of the second input
; (the callers in this file pass copies: xmm2 = xmm0, xmm3 = xmm1).
; Uses xmm4..xmm7, rax and rdx as set up by GET_PARAM in this file.
; Per output byte: (a*k3 + b*k4 + 64) >> 7, with saturating adds and an
; unsigned-saturating pack to bytes.
; On exit rsi += rax, rdi += rdx, rcx is decremented; the flags come from
; that final dec, which is what the callers' jnz tests.
104 %macro APPLY_FILTER_16 1
105 punpcklbw xmm0, xmm5 ;first input, bytes 0..7 -> words (xmm5 expected to be zero)
106 punpcklbw xmm1, xmm5 ;second input, bytes 0..7 -> words
107 punpckhbw xmm2, xmm5 ;first input, bytes 8..15 -> words
108 punpckhbw xmm3, xmm5 ;second input, bytes 8..15 -> words
109
110 pmullw xmm0, xmm6 ;first input * k3 (low half)
111 pmullw xmm1, xmm7 ;second input * k4 (low half)
112 pmullw xmm2, xmm6 ;first input * k3 (high half)
113 pmullw xmm3, xmm7 ;second input * k4 (high half)
114
115 paddsw xmm0, xmm1 ;low-half sums (signed saturating)
116 paddsw xmm2, xmm3 ;high-half sums
117
118 paddsw xmm0, xmm4 ;rounding
119 paddsw xmm2, xmm4
120 psraw xmm0, 7 ;shift
121 psraw xmm2, 7
122 packuswb xmm0, xmm2 ;pack back to byte
123 %if %1
124 movdqu xmm1, [rdi] ;16 bytes currently in the output row (unaligned load)
125 pavgb xmm0, xmm1 ;rounded byte average with the existing output
126 %endif
127 movdqu [rdi], xmm0 ;store the result
128
129 lea rsi, [rsi + rax] ;advance source by pixels_per_line
130 lea rdi, [rdi + rdx] ;advance output by out_pitch
131 dec rcx ;one row done; sets ZF for the caller's jnz
132 %endm
133
; vp9_filter_block1d4_v2_sse2
; For each output row, loads 4 bytes at [rsi] and 4 bytes at [rsi + rax]
; (rax = pixels_per_line, so the same columns one source row further on),
; combines them with APPLY_FILTER_4 and writes 4 bytes to the output.
; Arguments, as read by GET_PARAM_4: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS are not defined in this file
; (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
134 global sym(vp9_filter_block1d4_v2_sse2) PRIVATE
135 sym(vp9_filter_block1d4_v2_sse2):
136 push rbp
137 mov rbp, rsp
138 SHADOW_ARGS_TO_STACK 6
139 push rsi
140 push rdi
141 ; end prolog
142
143 GET_PARAM_4
144 .loop:
145 movd xmm0, [rsi] ;load src: 4 bytes of the current row
146 movd xmm1, [rsi + rax] ;4 bytes one pixels_per_line stride further
147
148 APPLY_FILTER_4 0
149 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_4
150
151 ; begin epilog
152 pop rdi
153 pop rsi
154 UNSHADOW_ARGS
155 pop rbp
156 ret
157
; vp9_filter_block1d8_v2_sse2
; For each output row, loads 8 bytes at [rsi] and 8 bytes at [rsi + rax]
; (rax = pixels_per_line, so the same columns one source row further on),
; combines them with APPLY_FILTER_8 and writes 8 bytes to the output.
; Arguments, as read by GET_PARAM: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM / UNSHADOW_ARGS are not defined
; in this file (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
158 global sym(vp9_filter_block1d8_v2_sse2) PRIVATE
159 sym(vp9_filter_block1d8_v2_sse2):
160 push rbp
161 mov rbp, rsp
162 SHADOW_ARGS_TO_STACK 6
163 SAVE_XMM 7 ;GET_PARAM writes xmm6/xmm7; presumably saved here where the ABI requires it
164 push rsi
165 push rdi
166 ; end prolog
167
168 GET_PARAM
169 .loop:
170 movq xmm0, [rsi] ;0: 8 bytes of the current row
171 movq xmm1, [rsi + rax] ;1: 8 bytes one pixels_per_line stride further
172
173 APPLY_FILTER_8 0
174 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_8
175
176 ; begin epilog
177 pop rdi
178 pop rsi
179 RESTORE_XMM
180 UNSHADOW_ARGS
181 pop rbp
182 ret
183
; vp9_filter_block1d16_v2_sse2
; For each output row, loads 16 bytes at [rsi] and 16 bytes at [rsi + rax]
; (rax = pixels_per_line, so the same columns one source row further on),
; combines them with APPLY_FILTER_16 and writes 16 bytes to the output.
; Arguments, as read by GET_PARAM: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM / UNSHADOW_ARGS are not defined
; in this file (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
184 global sym(vp9_filter_block1d16_v2_sse2) PRIVATE
185 sym(vp9_filter_block1d16_v2_sse2):
186 push rbp
187 mov rbp, rsp
188 SHADOW_ARGS_TO_STACK 6
189 SAVE_XMM 7 ;GET_PARAM writes xmm6/xmm7; presumably saved here where the ABI requires it
190 push rsi
191 push rdi
192 ; end prolog
193
194 GET_PARAM
195 .loop:
196 movdqu xmm0, [rsi] ;0: 16 bytes of the current row (unaligned load)
197 movdqu xmm1, [rsi + rax] ;1: 16 bytes one pixels_per_line stride further
198 movdqa xmm2, xmm0 ;copy, APPLY_FILTER_16 unpacks its high 8 bytes
199 movdqa xmm3, xmm1 ;copy, APPLY_FILTER_16 unpacks its high 8 bytes
200
201 APPLY_FILTER_16 0
202 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_16
203
204 ; begin epilog
205 pop rdi
206 pop rsi
207 RESTORE_XMM
208 UNSHADOW_ARGS
209 pop rbp
210 ret
211
; vp9_filter_block1d4_v2_avg_sse2
; Same loads as vp9_filter_block1d4_v2_sse2 in this file (4 bytes at [rsi] and
; 4 bytes at [rsi + rax]), but APPLY_FILTER_4 is invoked with 1, so each
; filtered row is averaged (pavgb) with the 4 bytes already at the output
; before being stored.
; Arguments, as read by GET_PARAM_4: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS are not defined in this file
; (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
212 global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE
213 sym(vp9_filter_block1d4_v2_avg_sse2):
214 push rbp
215 mov rbp, rsp
216 SHADOW_ARGS_TO_STACK 6
217 push rsi
218 push rdi
219 ; end prolog
220
221 GET_PARAM_4
222 .loop:
223 movd xmm0, [rsi] ;load src: 4 bytes of the current row
224 movd xmm1, [rsi + rax] ;4 bytes one pixels_per_line stride further
225
226 APPLY_FILTER_4 1
227 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_4
228
229 ; begin epilog
230 pop rdi
231 pop rsi
232 UNSHADOW_ARGS
233 pop rbp
234 ret
235
; vp9_filter_block1d8_v2_avg_sse2
; Same loads as vp9_filter_block1d8_v2_sse2 in this file (8 bytes at [rsi] and
; 8 bytes at [rsi + rax]), but APPLY_FILTER_8 is invoked with 1, so each
; filtered row is averaged (pavgb) with the 8 bytes already at the output
; before being stored.
; Arguments, as read by GET_PARAM: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM / UNSHADOW_ARGS are not defined
; in this file (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
236 global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE
237 sym(vp9_filter_block1d8_v2_avg_sse2):
238 push rbp
239 mov rbp, rsp
240 SHADOW_ARGS_TO_STACK 6
241 SAVE_XMM 7 ;GET_PARAM writes xmm6/xmm7; presumably saved here where the ABI requires it
242 push rsi
243 push rdi
244 ; end prolog
245
246 GET_PARAM
247 .loop:
248 movq xmm0, [rsi] ;0: 8 bytes of the current row
249 movq xmm1, [rsi + rax] ;1: 8 bytes one pixels_per_line stride further
250
251 APPLY_FILTER_8 1
252 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_8
253
254 ; begin epilog
255 pop rdi
256 pop rsi
257 RESTORE_XMM
258 UNSHADOW_ARGS
259 pop rbp
260 ret
261
; vp9_filter_block1d16_v2_avg_sse2
; Same loads as vp9_filter_block1d16_v2_sse2 in this file (16 bytes at [rsi]
; and 16 bytes at [rsi + rax]), but APPLY_FILTER_16 is invoked with 1, so each
; filtered row is averaged (pavgb) with the 16 bytes already at the output
; before being stored.
; Arguments, as read by GET_PARAM: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM / UNSHADOW_ARGS are not defined
; in this file (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
262 global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE
263 sym(vp9_filter_block1d16_v2_avg_sse2):
264 push rbp
265 mov rbp, rsp
266 SHADOW_ARGS_TO_STACK 6
267 SAVE_XMM 7 ;GET_PARAM writes xmm6/xmm7; presumably saved here where the ABI requires it
268 push rsi
269 push rdi
270 ; end prolog
271
272 GET_PARAM
273 .loop:
274 movdqu xmm0, [rsi] ;0: 16 bytes of the current row (unaligned load)
275 movdqu xmm1, [rsi + rax] ;1: 16 bytes one pixels_per_line stride further
276 movdqa xmm2, xmm0 ;copy, APPLY_FILTER_16 unpacks its high 8 bytes
277 movdqa xmm3, xmm1 ;copy, APPLY_FILTER_16 unpacks its high 8 bytes
278
279 APPLY_FILTER_16 1
280 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_16
281
282 ; begin epilog
283 pop rdi
284 pop rsi
285 RESTORE_XMM
286 UNSHADOW_ARGS
287 pop rbp
288 ret
289
; vp9_filter_block1d4_h2_sse2
; For each output row, combines source bytes 0..3 at [rsi] with bytes 1..4
; (the same row shifted by one byte) using APPLY_FILTER_4, and writes 4 bytes
; to the output.
; Arguments, as read by GET_PARAM_4: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS are not defined in this file
; (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): movdqu reads 16 bytes from the source although only bytes
; 0..4 are consumed; the caller must make those extra bytes readable.
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
290 global sym(vp9_filter_block1d4_h2_sse2) PRIVATE
291 sym(vp9_filter_block1d4_h2_sse2):
292 push rbp
293 mov rbp, rsp
294 SHADOW_ARGS_TO_STACK 6
295 push rsi
296 push rdi
297 ; end prolog
298
299 GET_PARAM_4
300 .loop:
301 movdqu xmm0, [rsi] ;load src (16 bytes, unaligned)
302 movdqa xmm1, xmm0
303 psrldq xmm1, 1 ;xmm1 = same row shifted down by one byte
304
305 APPLY_FILTER_4 0
306 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_4
307
308 ; begin epilog
309 pop rdi
310 pop rsi
311 UNSHADOW_ARGS
312 pop rbp
313 ret
314
; vp9_filter_block1d8_h2_sse2
; For each output row, combines source bytes 0..7 at [rsi] with bytes 1..8
; (the same row shifted by one byte) using APPLY_FILTER_8, and writes 8 bytes
; to the output.
; Arguments, as read by GET_PARAM: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM / UNSHADOW_ARGS are not defined
; in this file (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): movdqu reads 16 bytes from the source although only bytes
; 0..8 are consumed; the caller must make those extra bytes readable.
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
315 global sym(vp9_filter_block1d8_h2_sse2) PRIVATE
316 sym(vp9_filter_block1d8_h2_sse2):
317 push rbp
318 mov rbp, rsp
319 SHADOW_ARGS_TO_STACK 6
320 SAVE_XMM 7 ;GET_PARAM writes xmm6/xmm7; presumably saved here where the ABI requires it
321 push rsi
322 push rdi
323 ; end prolog
324
325 GET_PARAM
326 .loop:
327 movdqu xmm0, [rsi] ;load src (16 bytes, unaligned)
328 movdqa xmm1, xmm0
329 psrldq xmm1, 1 ;xmm1 = same row shifted down by one byte
330
331 APPLY_FILTER_8 0
332 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_8
333
334 ; begin epilog
335 pop rdi
336 pop rsi
337 RESTORE_XMM
338 UNSHADOW_ARGS
339 pop rbp
340 ret
341
; vp9_filter_block1d16_h2_sse2
; For each output row, combines the 16 bytes at [rsi] with the 16 bytes at
; [rsi + 1] (the same row one byte further on) using APPLY_FILTER_16, and
; writes 16 bytes to the output. 17 source bytes are read per row.
; Arguments, as read by GET_PARAM: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM / UNSHADOW_ARGS are not defined
; in this file (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
342 global sym(vp9_filter_block1d16_h2_sse2) PRIVATE
343 sym(vp9_filter_block1d16_h2_sse2):
344 push rbp
345 mov rbp, rsp
346 SHADOW_ARGS_TO_STACK 6
347 SAVE_XMM 7 ;GET_PARAM writes xmm6/xmm7; presumably saved here where the ABI requires it
348 push rsi
349 push rdi
350 ; end prolog
351
352 GET_PARAM
353 .loop:
354 movdqu xmm0, [rsi] ;load src (16 bytes, unaligned)
355 movdqu xmm1, [rsi + 1] ;same row, one byte further on
356 movdqa xmm2, xmm0 ;copy, APPLY_FILTER_16 unpacks its high 8 bytes
357 movdqa xmm3, xmm1 ;copy, APPLY_FILTER_16 unpacks its high 8 bytes
358
359 APPLY_FILTER_16 0
360 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_16
361
362 ; begin epilog
363 pop rdi
364 pop rsi
365 RESTORE_XMM
366 UNSHADOW_ARGS
367 pop rbp
368 ret
369
; vp9_filter_block1d4_h2_avg_sse2
; Same loads as vp9_filter_block1d4_h2_sse2 in this file (source bytes 0..3
; and bytes 1..4 of the row at [rsi]), but APPLY_FILTER_4 is invoked with 1,
; so each filtered row is averaged (pavgb) with the 4 bytes already at the
; output before being stored.
; Arguments, as read by GET_PARAM_4: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS are not defined in this file
; (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): movdqu reads 16 bytes from the source although only bytes
; 0..4 are consumed; the caller must make those extra bytes readable.
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
370 global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE
371 sym(vp9_filter_block1d4_h2_avg_sse2):
372 push rbp
373 mov rbp, rsp
374 SHADOW_ARGS_TO_STACK 6
375 push rsi
376 push rdi
377 ; end prolog
378
379 GET_PARAM_4
380 .loop:
381 movdqu xmm0, [rsi] ;load src (16 bytes, unaligned)
382 movdqa xmm1, xmm0
383 psrldq xmm1, 1 ;xmm1 = same row shifted down by one byte
384
385 APPLY_FILTER_4 1
386 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_4
387
388 ; begin epilog
389 pop rdi
390 pop rsi
391 UNSHADOW_ARGS
392 pop rbp
393 ret
394
; vp9_filter_block1d8_h2_avg_sse2
; Same loads as vp9_filter_block1d8_h2_sse2 in this file (source bytes 0..7
; and bytes 1..8 of the row at [rsi]), but APPLY_FILTER_8 is invoked with 1,
; so each filtered row is averaged (pavgb) with the 8 bytes already at the
; output before being stored.
; Arguments, as read by GET_PARAM: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM / UNSHADOW_ARGS are not defined
; in this file (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): movdqu reads 16 bytes from the source although only bytes
; 0..8 are consumed; the caller must make those extra bytes readable.
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
395 global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE
396 sym(vp9_filter_block1d8_h2_avg_sse2):
397 push rbp
398 mov rbp, rsp
399 SHADOW_ARGS_TO_STACK 6
400 SAVE_XMM 7 ;GET_PARAM writes xmm6/xmm7; presumably saved here where the ABI requires it
401 push rsi
402 push rdi
403 ; end prolog
404
405 GET_PARAM
406 .loop:
407 movdqu xmm0, [rsi] ;load src (16 bytes, unaligned)
408 movdqa xmm1, xmm0
409 psrldq xmm1, 1 ;xmm1 = same row shifted down by one byte
410
411 APPLY_FILTER_8 1
412 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_8
413
414 ; begin epilog
415 pop rdi
416 pop rsi
417 RESTORE_XMM
418 UNSHADOW_ARGS
419 pop rbp
420 ret
421
; vp9_filter_block1d16_h2_avg_sse2
; Same loads as vp9_filter_block1d16_h2_sse2 in this file (16 bytes at [rsi]
; and 16 bytes at [rsi + 1]), but APPLY_FILTER_16 is invoked with 1, so each
; filtered row is averaged (pavgb) with the 16 bytes already at the output
; before being stored. 17 source bytes are read per row.
; Arguments, as read by GET_PARAM: arg(0) src_ptr, arg(1) pixels_per_line,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
; SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM / UNSHADOW_ARGS are not defined
; in this file (presumably in x86_abi_support.asm -- confirm there).
; NOTE(review): the row count is tested only after a row has been processed,
; so output_height == 0 is not handled (rcx would wrap on the first dec).
422 global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE
423 sym(vp9_filter_block1d16_h2_avg_sse2):
424 push rbp
425 mov rbp, rsp
426 SHADOW_ARGS_TO_STACK 6
427 SAVE_XMM 7 ;GET_PARAM writes xmm6/xmm7; presumably saved here where the ABI requires it
428 push rsi
429 push rdi
430 ; end prolog
431
432 GET_PARAM
433 .loop:
434 movdqu xmm0, [rsi] ;load src (16 bytes, unaligned)
435 movdqu xmm1, [rsi + 1] ;same row, one byte further on
436 movdqa xmm2, xmm0 ;copy, APPLY_FILTER_16 unpacks its high 8 bytes
437 movdqa xmm3, xmm1 ;copy, APPLY_FILTER_16 unpacks its high 8 bytes
438
439 APPLY_FILTER_16 1
440 jnz .loop ;flags from the dec rcx that ends APPLY_FILTER_16
441
442 ; begin epilog
443 pop rdi
444 pop rsi
445 RESTORE_XMM
446 UNSHADOW_ARGS
447 pop rbp
448 ret
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698