OLD | NEW |
| (Empty) |
1 ; | |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
3 ; | |
4 ; Use of this source code is governed by a BSD-style license | |
5 ; that can be found in the LICENSE file in the root of the source | |
6 ; tree. An additional intellectual property rights grant can be found | |
7 ; in the file PATENTS. All contributing project authors may | |
8 ; be found in the AUTHORS file in the root of the source tree. | |
9 ; | |
10 | |
11 | |
12 %include "vpx_ports/x86_abi_support.asm" | |
13 | |
14 %define VP9_FILTER_WEIGHT 128 | |
15 %define VP9_FILTER_SHIFT 7 | |
16 | |
;void vp9_post_proc_down_and_across_mmx
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; Thresholded 5-tap blur, four pixels per iteration. For every row:
;   1. "down" pass: filters src rows -2..+2 vertically and writes the result
;      to dst.
;   2. "across" pass: filters the freshly written dst row horizontally,
;      in place.
; A pixel keeps its unfiltered value when it differs from any of the four
; neighbouring taps by more than flimit; otherwise it takes the blurred
; value (sum of taps + RD) >> VP9_FILTER_SHIFT.
;
; Register roles inside the loops:
;   rsi = source pointer          rdi = destination pointer
;   rax = source pitch (negated temporarily); eax doubles as the delayed
;         store value during the across pass
;   rbx = address of the Blur tap table
;   rcx = rows remaining          rdx = column offset within the row
;   mm0 = zero                    mm2 = flimit in all four words
global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
sym(vp9_post_proc_down_and_across_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rbx is repurposed as the Blur table base below, so save it first.
    ; It must be pushed BEFORE the rd copy is placed on the stack: RD is
    ; addressed as [rsp], so rsp may not move between here and the epilog.
    push        rbx

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movq        mm0,        [GLOBAL(rd)]
    sub         rsp,        8
    movq        [rsp],      mm0
%define RD [rsp]
%else
%define RD [GLOBAL(rd)]
%endif

    lea         rbx,        [GLOBAL(Blur)]
    movd        mm2,        dword ptr arg(6)     ;flimit
    punpcklwd   mm2,        mm2
    punpckldq   mm2,        mm2                  ; mm2 = flimit x4 words

    mov         rsi,        arg(0)               ;src_ptr
    mov         rdi,        arg(1)               ;dst_ptr

    movsxd      rcx,        DWORD PTR arg(4)     ;rows
    movsxd      rax,        DWORD PTR arg(2)     ;src_pixels_per_line
    pxor        mm0,        mm0                  ; mm0 = 00000000

.nextrow:

    xor         rdx,        rdx                  ; rdx = column counter

.nextcol:

    pxor        mm7,        mm7                  ; mm7 = 00000000
    movq        mm6,        [rbx + 32]           ; mm6 = centre taps
    movq        mm3,        [rsi]                ; mm3 = r0 p0..p7
    punpcklbw   mm3,        mm0                  ; mm3 = r0 p0..p3 as words
    movq        mm1,        mm3                  ; mm1 = r0 p0..p3 (unfiltered)
    pmullw      mm3,        mm6                  ; mm3 = r0 * centre taps

    movq        mm6,        [rbx + 48]           ; mm6 = taps for row +1
    movq        mm5,        [rsi + rax]          ; mm5 = r1 p0..p7
    punpcklbw   mm5,        mm0                  ; mm5 = r1 p0..p3
    pmullw      mm6,        mm5                  ; mm6 = r1 * taps
    paddusw     mm3,        mm6                  ; mm3 += mm6

    ; thresholding
    movq        mm7,        mm1                  ; mm7 = r0 p0..p3
    psubusw     mm7,        mm5                  ; mm7 = sat(r0 - r1)
    psubusw     mm5,        mm1                  ; mm5 = sat(r1 - r0)
    paddusw     mm7,        mm5                  ; mm7 = abs(r0 - r1)
    pcmpgtw     mm7,        mm2                  ; mm7 = mask(abs > flimit)

    movq        mm6,        [rbx + 64]           ; mm6 = taps for row +2
    movq        mm5,        [rsi + 2*rax]        ; mm5 = r2 p0..p7
    punpcklbw   mm5,        mm0                  ; mm5 = r2 p0..p3
    pmullw      mm6,        mm5                  ; mm6 = r2 * taps
    paddusw     mm3,        mm6                  ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                  ; mm6 = r0 p0..p3
    psubusw     mm6,        mm5                  ; mm6 = sat(r0 - r2)
    psubusw     mm5,        mm1                  ; mm5 = sat(r2 - r0)
    paddusw     mm6,        mm5                  ; mm6 = abs(r0 - r2)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                  ; accumulate thresholds

    neg         rax                              ; rax = -pitch: rows above
    movq        mm6,        [rbx]                ; mm6 = taps for row -2
    movq        mm5,        [rsi + 2*rax]        ; mm5 = r-2 p0..p7
    punpcklbw   mm5,        mm0                  ; mm5 = r-2 p0..p3
    pmullw      mm6,        mm5                  ; mm6 = r-2 * taps
    paddusw     mm3,        mm6                  ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                  ; mm6 = r0 p0..p3
    psubusw     mm6,        mm5                  ; mm6 = sat(r0 - r-2)
    psubusw     mm5,        mm1                  ; mm5 = sat(r-2 - r0)
    paddusw     mm6,        mm5                  ; mm6 = abs(r0 - r-2)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                  ; accumulate thresholds

    movq        mm6,        [rbx + 16]           ; mm6 = taps for row -1
    movq        mm4,        [rsi + rax]          ; mm4 = r-1 p0..p7
    punpcklbw   mm4,        mm0                  ; mm4 = r-1 p0..p3
    pmullw      mm6,        mm4                  ; mm6 = r-1 * taps
    paddusw     mm3,        mm6                  ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                  ; mm6 = r0 p0..p3
    psubusw     mm6,        mm4                  ; mm6 = sat(r0 - r-1)
    psubusw     mm4,        mm1                  ; mm4 = sat(r-1 - r0)
    paddusw     mm6,        mm4                  ; mm6 = abs(r0 - r-1)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                  ; accumulate thresholds

    paddusw     mm3,        RD                   ; mm3 += round value
    psraw       mm3,        VP9_FILTER_SHIFT     ; mm3 /= 128

    pand        mm1,        mm7                  ; source where over threshold
    pandn       mm7,        mm3                  ; blurred where within threshold
    paddusw     mm1,        mm7                  ; combination

    packuswb    mm1,        mm0                  ; pack to bytes

    movd        [rdi],      mm1                  ; store four output pixels
    neg         rax                              ; pitch is positive again

    add         rsi,        4
    add         rdi,        4
    add         rdx,        4

    cmp         edx,        dword ptr arg(5)     ;cols
    jl          .nextcol

    ; done with all the cols, start the across filtering in place
    sub         rsi,        rdx                  ; rewind to start of row
    sub         rdi,        rdx

    ; Each group's store is delayed by one iteration so that the p-2/p-1
    ; taps of the next group still read unfiltered bytes. eax carries the
    ; pending four bytes; it starts as the four bytes just before the row,
    ; which are written back unchanged by the first iteration. The source
    ; pitch held in rax is reloaded from arg(2) after the loop.
    xor         rdx,        rdx
    mov         eax,        DWORD PTR [rdi-4]

.acrossnextcol:
    pxor        mm7,        mm7                  ; mm7 = 00000000
    movq        mm6,        [rbx + 32]           ; mm6 = centre taps
    movq        mm4,        [rdi + rdx]          ; mm4 = p0..p7
    movq        mm3,        mm4                  ; mm3 = p0..p7
    punpcklbw   mm3,        mm0                  ; mm3 = p0..p3
    movq        mm1,        mm3                  ; mm1 = p0..p3 (unfiltered)
    pmullw      mm3,        mm6                  ; mm3 = p0..p3 * centre taps

    movq        mm6,        [rbx + 48]           ; mm6 = taps for column +1
    psrlq       mm4,        8                    ; mm4 = p1..p7
    movq        mm5,        mm4                  ; mm5 = p1..p7
    punpcklbw   mm5,        mm0                  ; mm5 = p1..p4
    pmullw      mm6,        mm5                  ; mm6 = p1..p4 * taps
    paddusw     mm3,        mm6                  ; mm3 += mm6

    ; thresholding
    movq        mm7,        mm1                  ; mm7 = p0..p3
    psubusw     mm7,        mm5                  ; mm7 = sat(p0..p3 - p1..p4)
    psubusw     mm5,        mm1                  ; mm5 = sat(p1..p4 - p0..p3)
    paddusw     mm7,        mm5                  ; mm7 = abs(p0..p3 - p1..p4)
    pcmpgtw     mm7,        mm2                  ; mm7 = mask(abs > flimit)

    movq        mm6,        [rbx + 64]           ; mm6 = taps for column +2
    psrlq       mm4,        8                    ; mm4 = p2..p7
    movq        mm5,        mm4                  ; mm5 = p2..p7
    punpcklbw   mm5,        mm0                  ; mm5 = p2..p5
    pmullw      mm6,        mm5                  ; mm6 = p2..p5 * taps
    paddusw     mm3,        mm6                  ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                  ; mm6 = p0..p3
    psubusw     mm6,        mm5                  ; mm6 = sat(p0..p3 - p2..p5)
    psubusw     mm5,        mm1                  ; mm5 = sat(p2..p5 - p0..p3)
    paddusw     mm6,        mm5                  ; mm6 = abs(p0..p3 - p2..p5)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                  ; accumulate thresholds

    movq        mm6,        [rbx]                ; mm6 = taps for column -2
    movq        mm4,        [rdi + rdx - 2]      ; mm4 = p-2..p5
    movq        mm5,        mm4                  ; mm5 = p-2..p5
    punpcklbw   mm5,        mm0                  ; mm5 = p-2..p1
    pmullw      mm6,        mm5                  ; mm6 = p-2..p1 * taps
    paddusw     mm3,        mm6                  ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                  ; mm6 = p0..p3
    psubusw     mm6,        mm5                  ; mm6 = sat(p0..p3 - p-2..p1)
    psubusw     mm5,        mm1                  ; mm5 = sat(p-2..p1 - p0..p3)
    paddusw     mm6,        mm5                  ; mm6 = abs(p0..p3 - p-2..p1)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                  ; accumulate thresholds

    movq        mm6,        [rbx + 16]           ; mm6 = taps for column -1
    psrlq       mm4,        8                    ; mm4 = p-1..p5
    punpcklbw   mm4,        mm0                  ; mm4 = p-1..p2
    pmullw      mm6,        mm4                  ; mm6 = p-1..p2 * taps
    paddusw     mm3,        mm6                  ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                  ; mm6 = p0..p3
    psubusw     mm6,        mm4                  ; mm6 = sat(p0..p3 - p-1..p2)
    psubusw     mm4,        mm1                  ; mm4 = sat(p-1..p2 - p0..p3)
    paddusw     mm6,        mm4                  ; mm6 = abs(p0..p3 - p-1..p2)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                  ; accumulate thresholds

    paddusw     mm3,        RD                   ; mm3 += round value
    psraw       mm3,        VP9_FILTER_SHIFT     ; mm3 /= 128

    pand        mm1,        mm7                  ; source where over threshold
    pandn       mm7,        mm3                  ; blurred where within threshold
    paddusw     mm1,        mm7                  ; combination

    packuswb    mm1,        mm0                  ; pack to bytes
    mov         DWORD PTR [rdi+rdx-4],  eax      ; store previous four bytes
    movd        eax,        mm1                  ; hold this group for later

    add         rdx,        4
    cmp         edx,        dword ptr arg(5)     ;cols
    jl          .acrossnextcol

    mov         DWORD PTR [rdi+rdx-4],  eax      ; flush the final group

    ; done with this row
    movsxd      rax,        dword ptr arg(3)     ;dst_pixels_per_line
    add         rdi,        rax                  ; next destination line
    movsxd      rax,        dword ptr arg(2)     ;src_pixels_per_line
    add         rsi,        rax                  ; next source line
                                                 ; rax = src pitch for .nextrow

    dec         rcx                              ; decrement count
    jnz         .nextrow                         ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp,        8                    ; release the rd copy
%endif
    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD
263 | |
264 | |
;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Vertical smoothing of dst in place, four columns per outer iteration.
; For each column group a running sum (mm5) and running sum of squares
; (mm6 low pair, mm7 high pair) are kept over a 15-row window spanning
; rows -8..+6 around the current row. Where 15*sumsq - sum*sum < flimit
; the pixel is replaced by (pixel + sum + vp9_rv[row & 127 ..]) >> 4;
; otherwise it is left unchanged. Results are parked in a 16-entry ring
; of dwords at [rsp] and written back 8 rows late, to the row 8 above the
; current one.
;
; Register roles:
;   rsi = pointer to the row leaving the window (current row - 8)
;   rdi = pointer to the row entering the window (current row + 7)
;   rax = pitch    rdx = row counter    rcx = scratch index
;   r8  = &vp9_rv (64-bit builds only)
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_mmx) PRIVATE
sym(vp9_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; ring buffer d[] at [rsp]: this routine uses 16 dword slots
    ; ([rsp + 0] .. [rsp + 63])
    ; create flimit2 at [rsp+128]: flimit duplicated into both dwords
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp9_rv))]
%endif

    ;rows +=8;  (the write-back lags 8 rows behind, so run 8 extra rows)
    add         dword ptr arg(2), 8

    ;for(c=0; c<cols; c+=4)
.loop_col:
            mov         rsi,        arg(0)  ;s
            pxor        mm0,        mm0     ; mm0 = 0

            movsxd      rax,        dword ptr arg(1) ;pitch
            neg         rax                 ; rax = -pitch

            lea         rsi,        [rsi + rax*8]   ; rsi = s - 8*pitch
            neg         rax                 ; rax = pitch


            pxor        mm5,        mm5     ; sum = 0
            pxor        mm6,        mm6     ; sumsq (pixels 0,1) = 0

            pxor        mm7,        mm7     ; sumsq (pixels 2,3) = 0
            mov         rdi,        rsi

            mov         rcx,        15      ; prime the window with 15 rows

.loop_initvar:
            movd        mm1,        DWORD PTR [rdi]
            punpcklbw   mm1,        mm0     ; four pixels as words

            paddw       mm5,        mm1     ; sum += p
            pmullw      mm1,        mm1     ; p*p

            movq        mm2,        mm1
            punpcklwd   mm1,        mm0     ; low two squares as dwords

            punpckhwd   mm2,        mm0     ; high two squares as dwords
            paddd       mm6,        mm1

            paddd       mm7,        mm2
            lea         rdi,        [rdi+rax]

            dec         rcx
            jne         .loop_initvar
            ; rdi now points at s + 7*pitch
            xor         rdx,        rdx     ; row counter = 0
.loop_row:
            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8] leaves
            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7] enters

            punpcklbw   mm1,        mm0
            punpcklbw   mm2,        mm0

            paddw       mm5,        mm2     ; sum += entering
            psubw       mm5,        mm1     ; sum -= leaving

            pmullw      mm2,        mm2
            movq        mm4,        mm2

            punpcklwd   mm2,        mm0
            punpckhwd   mm4,        mm0

            paddd       mm6,        mm2     ; sumsq += entering^2
            paddd       mm7,        mm4

            pmullw      mm1,        mm1
            movq        mm2,        mm1

            punpcklwd   mm1,        mm0
            psubd       mm6,        mm1     ; sumsq -= leaving^2

            punpckhwd   mm2,        mm0
            psubd       mm7,        mm2


            movq        mm3,        mm6
            pslld       mm3,        4

            psubd       mm3,        mm6     ; mm3 = 15 * sumsq (low pair)
            movq        mm1,        mm5

            movq        mm4,        mm5
            pmullw      mm1,        mm1     ; low 16 bits of sum*sum

            pmulhw      mm4,        mm4     ; high 16 bits of sum*sum
            movq        mm2,        mm1

            punpcklwd   mm1,        mm4     ; sum*sum as dwords (low pair)
            punpckhwd   mm2,        mm4     ; sum*sum as dwords (high pair)

            movq        mm4,        mm7
            pslld       mm4,        4

            psubd       mm4,        mm7     ; mm4 = 15 * sumsq (high pair)

            psubd       mm3,        mm1     ; 15*sumsq - sum*sum
            psubd       mm4,        mm2

            psubd       mm3,        flimit2 ; ... - flimit
            psubd       mm4,        flimit2

            psrad       mm3,        31      ; all ones where result < 0
            psrad       mm4,        31

            packssdw    mm3,        mm4
            packsswb    mm3,        mm0     ; mm3 = per-byte select mask

            movd        mm1,        DWORD PTR [rsi+rax*8]   ; current row

            movq        mm2,        mm1     ; mm2 = unfiltered bytes
            punpcklbw   mm1,        mm0

            paddw       mm1,        mm5     ; pixel + sum
            mov         rcx,        rdx

            and         rcx,        127     ; index into vp9_rv
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp9_rv))]
            movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
%else
            movq        mm4,        [sym(vp9_rv) + rcx*2]
%endif
            paddw       mm1,        mm4     ; + vp9_rv value
            psraw       mm1,        4       ; >> 4

            packuswb    mm1,        mm0
            pand        mm1,        mm3     ; filtered where mask set

            pandn       mm3,        mm2     ; original where mask clear
            por         mm1,        mm3

            and         rcx,        15
            movd        DWORD PTR [rsp+rcx*4], mm1 ;d[row & 15] = result

            mov         rcx,        rdx
            sub         rcx,        8

            and         rcx,        15
            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[(row-8) & 15]

            movd        [rsi],      mm1     ; write back the row 8 above
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row


            ; s += 4. arg(0) holds a pointer, so update it at full register
            ; width; rax is reloaded with the pitch at the top of .loop_col.
            mov         rax,        arg(0)
            add         rax,        4
            mov         arg(0),     rax
            sub         dword arg(3), 4     ; cols -= 4
            cmp         dword arg(3), 0
            jg          .loop_col

    add         rsp, 136
    pop         rsp                         ; undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2
460 | |
461 | |
;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int width, unsigned int height, int pitch)
;
; Adds noise to an image plane in place, one row per outer iteration and
; eight bytes per inner iteration. Each row reads the noise buffer from an
; offset of (LIBVPX_RAND() & 0xff) bytes. Pixels are clamped with the three
; clamp vectors before the noise is added with a wrapping byte add.
global sym(vp9_plane_add_noise_mmx) PRIVATE
sym(vp9_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.next_row:
    call        sym(LIBVPX_RAND) WRT_PLT
    and         rax,        0xff            ; per-row offset, 0..255
    mov         rdi,        arg(1)          ;noise
    add         rdi,        rax             ; rdi = noise + offset

    ; The three clamp vectors are addressed from one base pointer, which
    ; relies on them being stored back to back as black / white / both.
    ; The pointers are fetched after the call because rand() is free to
    ; clobber these registers.
    mov         rdx,        arg(2)          ;blackclamp
    mov         rsi,        arg(0)          ;start of the current row
    movsxd      rcx,        dword arg(5)    ;width
    xor         rax,        rax             ; rax = byte offset in the row

.next_chunk:
    movq        mm1,        [rsi+rax]       ; eight source pixels

    ; squeeze the pixel range so that adding noise cannot leave it
    psubusb     mm1,        [rdx]           ;blackclamp
    paddusb     mm1,        [rdx+32]        ;bothclamp
    psubusb     mm1,        [rdx+16]        ;whiteclamp

    movq        mm2,        [rdi+rax]       ; matching eight noise bytes
    paddb       mm1,        mm2             ; add the noise
    movq        [rsi+rax],  mm1             ; write the pixels back

    add         rax,        8               ; advance to the next eight bytes

    cmp         rax,        rcx
    jl          .next_chunk

    movsxd      rax,        dword arg(7)    ;pitch
    add         arg(0),     rax             ; start += pitch
    sub         dword arg(6), 1             ; height -= 1
    jg          .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
522 | |
523 | |
SECTION_RODATA
align 16
; 5-tap blur kernel, one 16-byte row of identical word weights per tap.
; The filter routines load 8 bytes from offsets 0, 16, 32, 48 and 64.
; The five weights add up to 128 (16 + 16 + 64 + 16 + 16).
Blur:
    times 8 dw 16       ; +0   tap -2
    times 8 dw 16       ; +16  tap -1
    times 8 dw 64       ; +32  centre tap
    times 8 dw 16       ; +48  tap +1
    times 8 dw 16       ; +64  tap +2
    times 8 dw 0        ; +80  zero row, not read by the routines seen here

; Rounding term added to the tap sum before the shift by VP9_FILTER_SHIFT.
rd:
    dw 0x40, 0x40, 0x40, 0x40
OLD | NEW |