OLD | NEW |
| (Empty) |
1 ; | |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
3 ; | |
4 ; Use of this source code is governed by a BSD-style license | |
5 ; that can be found in the LICENSE file in the root of the source | |
6 ; tree. An additional intellectual property rights grant can be found | |
7 ; in the file PATENTS. All contributing project authors may | |
8 ; be found in the AUTHORS file in the root of the source tree. | |
9 ; | |
10 | |
11 | |
12 %include "vpx_ports/x86_abi_support.asm" | |
13 | |
14 | |
15 ;void vp8_loop_filter_horizontal_edge_mmx | |
16 ;( | |
17 ; unsigned char *src_ptr, - first row below the edge (q0); p0..p3 are read from the rows above it | |
18 ; int src_pixel_step, - row pitch in bytes, sign-extended to 64 bits | |
19 ; const char *blimit, - 8 bytes are loaded; bound for abs(p0-q0)*2 + abs(p1-q1)/2 | |
20 ; const char *limit, - 8 bytes are loaded; bound for the neighbouring-pixel differences | |
21 ; const char *thresh, - 8 bytes are loaded; high edge variance threshold | |
22 ; int count - number of 8-pixel groups; tested after the loop body, so it must be >= 1 | |
23 ;) Rewrites p1, p0, q0 and q1 in place for each 8-pixel group. | |
24 global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE | |
25 sym(vp8_loop_filter_horizontal_edge_mmx): | |
26 push rbp | |
27 mov rbp, rsp | |
28 SHADOW_ARGS_TO_STACK 6 | |
29 GET_GOT rbx | |
30 push rsi | |
31 push rdi | |
32 ; end prolog | |
33 | |
34 ALIGN_STACK 16, rax | |
35 sub rsp, 32 ; reserve 32 bytes | |
36 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; holds abs(q0-q1) for the hev test | |
37 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; holds abs(p1-p0) for the hev test | |
38 | |
39 mov rsi, arg(0) ;src_ptr | |
40 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitc
h? | |
41 | |
42 movsxd rcx, dword ptr arg(5) ;count | |
43 .next8_h: | |
44 mov rdx, arg(3) ;limit | |
45 movq mm7, [rdx] | |
46 mov rdi, rsi ; rdi points to row +1 for indirect ad
dressing | |
47 add rdi, rax | |
48 | |
49 ; calculate breakout conditions | |
50 movq mm2, [rdi+2*rax] ; q3 | |
51 movq mm1, [rsi+2*rax] ; q2 | |
52 movq mm6, mm1 ; q2 | |
53 psubusb mm1, mm2 ; q2-=q3 | |
54 psubusb mm2, mm6 ; q3-=q2 | |
55 por mm1, mm2 ; abs(q3-q2) | |
56 psubusb mm1, mm7 ; | |
57 | |
58 | |
59 movq mm4, [rsi+rax] ; q1 | |
60 movq mm3, mm4 ; q1 | |
61 psubusb mm4, mm6 ; q1-=q2 | |
62 psubusb mm6, mm3 ; q2-=q1 | |
63 por mm4, mm6 ; abs(q2-q1) | |
64 | |
65 psubusb mm4, mm7 | |
66 por mm1, mm4 | |
67 | |
68 movq mm4, [rsi] ; q0 | |
69 movq mm0, mm4 ; q0 | |
70 psubusb mm4, mm3 ; q0-=q1 | |
71 psubusb mm3, mm0 ; q1-=q0 | |
72 por mm4, mm3 ; abs(q0-q1) | |
73 movq t0, mm4 ; save to t0 | |
74 psubusb mm4, mm7 | |
75 por mm1, mm4 | |
76 | |
77 | |
78 neg rax ; negate pitch to deal with above bord
er | |
79 | |
80 movq mm2, [rsi+4*rax] ; p3 | |
81 movq mm4, [rdi+4*rax] ; p2 | |
82 movq mm5, mm4 ; p2 | |
83 psubusb mm4, mm2 ; p2-=p3 | |
84 psubusb mm2, mm5 ; p3-=p2 | |
85 por mm4, mm2 ; abs(p3 - p2) | |
86 psubusb mm4, mm7 | |
87 por mm1, mm4 | |
88 | |
89 | |
90 movq mm4, [rsi+2*rax] ; p1 | |
91 movq mm3, mm4 ; p1 | |
92 psubusb mm4, mm5 ; p1-=p2 | |
93 psubusb mm5, mm3 ; p2-=p1 | |
94 por mm4, mm5 ; abs(p2 - p1) | |
95 psubusb mm4, mm7 | |
96 por mm1, mm4 | |
97 | |
98 movq mm2, mm3 ; p1 | |
99 | |
100 movq mm4, [rsi+rax] ; p0 | |
101 movq mm5, mm4 ; p0 | |
102 psubusb mm4, mm3 ; p0-=p1 | |
103 psubusb mm3, mm5 ; p1-=p0 | |
104 por mm4, mm3 ; abs(p1 - p0) | |
105 movq t1, mm4 ; save to t1 | |
106 psubusb mm4, mm7 | |
107 por mm1, mm4 | |
108 | |
109 movq mm3, [rdi] ; q1 | |
110 movq mm4, mm3 ; q1 | |
111 psubusb mm3, mm2 ; q1-=p1 | |
112 psubusb mm2, mm4 ; p1-=q1 | |
113 por mm2, mm3 ; abs(p1-q1) | |
114 pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero | |
115 psrlw mm2, 1 ; abs(p1-q1)/2 | |
116 | |
117 movq mm6, mm5 ; p0 | |
118 movq mm3, [rsi] ; q0 | |
119 psubusb mm5, mm3 ; p0-=q0 | |
120 psubusb mm3, mm6 ; q0-=p0 | |
121 por mm5, mm3 ; abs(p0 - q0) | |
122 paddusb mm5, mm5 ; abs(p0-q0)*2 | |
123 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 | |
124 | |
125 mov rdx, arg(2) ;blimit ; get blimit | |
126 movq mm7, [rdx] ; blimit | |
127 | |
128 psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > b
limit | |
129 por mm1, mm5 | |
130 pxor mm5, mm5 | |
131 pcmpeqb mm1, mm5 ; mask mm1 | |
132 | |
133 ; calculate high edge variance | |
134 mov rdx, arg(4) ;thresh ; get thresh | |
135 movq mm7, [rdx] ; | |
136 movq mm4, t0 ; get abs (q1 - q0) | |
137 psubusb mm4, mm7 | |
138 movq mm3, t1 ; get abs (p1 - p0) | |
139 psubusb mm3, mm7 | |
140 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh; OR the two excesses, a byte add could wrap to 0 | |
141 | |
142 pcmpeqb mm4, mm5 | |
143 | |
144 pcmpeqb mm5, mm5 | |
145 pxor mm4, mm5 | |
146 | |
147 | |
148 ; start work on filters | |
149 movq mm2, [rsi+2*rax] ; p1 | |
150 movq mm7, [rdi] ; q1 | |
151 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed value
s | |
152 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed value
s | |
153 psubsb mm2, mm7 ; p1 - q1 | |
154 pand mm2, mm4 ; high var mask (hvm)(p1 - q1) | |
155 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values | |
156 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values | |
157 movq mm3, mm0 ; q0 | |
158 psubsb mm0, mm6 ; q0 - p0 | |
159 paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) | |
160 paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) | |
161 paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) | |
162 pand mm1, mm2 ; mask filter values we don't care
about | |
163 movq mm2, mm1 | |
164 paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 | |
165 paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 | |
166 | |
167 pxor mm0, mm0 ; | |
168 pxor mm5, mm5 | |
169 punpcklbw mm0, mm2 ; | |
170 punpckhbw mm5, mm2 ; | |
171 psraw mm0, 11 ; | |
172 psraw mm5, 11 | |
173 packsswb mm0, mm5 | |
174 movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >>
3; | |
175 | |
176 pxor mm0, mm0 ; 0 | |
177 movq mm5, mm1 ; abcdefgh | |
178 punpcklbw mm0, mm1 ; e0f0g0h0 | |
179 psraw mm0, 11 ; sign extended shift right by 3 | |
180 pxor mm1, mm1 ; 0 | |
181 punpckhbw mm1, mm5 ; a0b0c0d0 | |
182 psraw mm1, 11 ; sign extended shift right by 3 | |
183 movq mm5, mm0 ; save results | |
184 | |
185 packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>
3 | |
186 paddsw mm5, [GLOBAL(ones)] | |
187 paddsw mm1, [GLOBAL(ones)] | |
188 psraw mm5, 1 ; partial shifted one more time for 2n
d tap | |
189 psraw mm1, 1 ; partial shifted one more time for 2n
d tap | |
190 packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>
4 | |
191 pandn mm4, mm5 ; high edge variance additive | |
192 | |
193 paddsb mm6, mm2 ; p0+= p0 add | |
194 pxor mm6, [GLOBAL(t80)] ; unoffset | |
195 movq [rsi+rax], mm6 ; write back | |
196 | |
197 movq mm6, [rsi+2*rax] ; p1 | |
198 pxor mm6, [GLOBAL(t80)] ; reoffset | |
199 paddsb mm6, mm4 ; p1+= p1 add | |
200 pxor mm6, [GLOBAL(t80)] ; unoffset | |
201 movq [rsi+2*rax], mm6 ; write back | |
202 | |
203 psubsb mm3, mm0 ; q0-= q0 add | |
204 pxor mm3, [GLOBAL(t80)] ; unoffset | |
205 movq [rsi], mm3 ; write back | |
206 | |
207 psubsb mm7, mm4 ; q1-= q1 add | |
208 pxor mm7, [GLOBAL(t80)] ; unoffset | |
209 movq [rdi], mm7 ; write back | |
210 | |
211 add rsi,8 | |
212 neg rax | |
213 dec rcx | |
214 jnz .next8_h | |
215 | |
216 add rsp, 32 | |
217 pop rsp | |
218 ; begin epilog | |
219 pop rdi | |
220 pop rsi | |
221 RESTORE_GOT | |
222 UNSHADOW_ARGS | |
223 pop rbp | |
224 ret | |
225 | |
226 | |
227 ;void vp8_loop_filter_vertical_edge_mmx | |
228 ;( | |
229 ; unsigned char *src_ptr, - column just right of the edge (q0) in the first row; the code steps to row 4, 4 bytes left | |
230 ; int src_pixel_step, - row pitch in bytes, sign-extended to 64 bits | |
231 ; const char *blimit, - 8 bytes are loaded; bound for abs(p0-q0)*2 + abs(p1-q1)/2 | |
232 ; const char *limit, - 8 bytes are loaded; bound for the neighbouring-pixel differences | |
233 ; const char *thresh, - 8 bytes are loaded; high edge variance threshold | |
234 ; int count - number of 8-row groups; tested after the loop body, so it must be >= 1 | |
235 ;) Transposes an 8x8 tile, filters p1, p0, q0, q1 and writes those 4 columns back per group. | |
236 global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE | |
237 sym(vp8_loop_filter_vertical_edge_mmx): | |
238 push rbp | |
239 mov rbp, rsp | |
240 SHADOW_ARGS_TO_STACK 6 | |
241 GET_GOT rbx | |
242 push rsi | |
243 push rdi | |
244 ; end prolog | |
245 | |
246 ALIGN_STACK 16, rax | |
247 sub rsp, 64 ; reserve 64 bytes | |
248 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; holds abs(p1-p0) for the hev test | |
249 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; holds abs(q1-q0) for the hev test | |
250 %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; p1, p0, q0, q1 at +0, +8, +16, +24 | |
251 | |
252 mov rsi, arg(0) ;src_ptr | |
253 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destinati
on pitch? | |
254 | |
255 lea rsi, [rsi + rax*4 - 4] | |
256 | |
257 movsxd rcx, dword ptr arg(5) ;count | |
258 .next8_v: | |
259 mov rdi, rsi ; rdi points to row +1 for indirec
t addressing | |
260 add rdi, rax | |
261 | |
262 | |
263 ;transpose | |
264 movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62
61 60 | |
265 movq mm7, mm6 ; 77 76 75 74 73 72
71 70 | |
266 | |
267 punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65
74 64 | |
268 punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61
70 60 | |
269 | |
270 movq mm4, [rsi] ; 47 46 45 44 43 42
41 40 | |
271 movq mm5, mm4 ; 47 46 45 44 43 42
41 40 | |
272 | |
273 punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45
54 44 | |
274 punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41
50 40 | |
275 | |
276 movq mm3, mm5 ; 57 47 56 46 55 45
54 44 | |
277 punpckhwd mm5, mm7 ; 77 67 57 47 76 66
56 46 | |
278 | |
279 punpcklwd mm3, mm7 ; 75 65 55 45 74 64
54 44 | |
280 movq mm2, mm4 ; 53 43 52 42 51 41
50 40 | |
281 | |
282 punpckhwd mm4, mm6 ; 73 63 53 43 72 62
52 42 | |
283 punpcklwd mm2, mm6 ; 71 61 51 41 70 60
50 40 | |
284 | |
285 neg rax | |
286 movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22
21 20 | |
287 | |
288 movq mm1, mm6 ; 27 26 25 24 23 22
21 20 | |
289 punpckhbw mm6, [rsi+rax] ; 37 27 36 26 35 25
34 24 | |
290 | |
291 punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21
30 20 | |
292 movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02
01 00 | |
293 | |
294 punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05
14 04 | |
295 movq mm0, mm7 ; 17 07 16 06 15 05
14 04 | |
296 | |
297 punpckhwd mm7, mm6 ; 37 27 17 07 36 26
16 06 | |
298 punpcklwd mm0, mm6 ; 35 25 15 05 34 24
14 04 | |
299 | |
300 movq mm6, mm7 ; 37 27 17 07 36 26
16 06 | |
301 punpckhdq mm7, mm5 ; 77 67 57 47 37 27
17 07 = q3 | |
302 | |
303 punpckldq mm6, mm5 ; 76 66 56 46 36 26
16 06 = q2 | |
304 | |
305 movq mm5, mm6 ; 76 66 56 46 36 26
16 06 | |
306 psubusb mm5, mm7 ; q2-q3 | |
307 | |
308 psubusb mm7, mm6 ; q3-q2 | |
309 por mm7, mm5; ; mm7=abs (q3-q2) | |
310 | |
311 movq mm5, mm0 ; 35 25 15 05 34 24
14 04 | |
312 punpckhdq mm5, mm3 ; 75 65 55 45 35 25
15 05 = q1 | |
313 | |
314 punpckldq mm0, mm3 ; 74 64 54 44 34 24
15 04 = q0 | |
315 movq mm3, mm5 ; 75 65 55 45 35 25
15 05 = q1 | |
316 | |
317 psubusb mm3, mm6 ; q1-q2 | |
318 psubusb mm6, mm5 ; q2-q1 | |
319 | |
320 por mm6, mm3 ; mm6=abs(q2-q1) | |
321 lea rdx, srct | |
322 | |
323 movq [rdx+24], mm5 ; save q1 | |
324 movq [rdx+16], mm0 ; save q0 | |
325 | |
326 movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02
01 00 | |
327 punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01
10 00 | |
328 | |
329 movq mm0, mm3 ; 13 03 12 02 11 01
10 00 | |
330 punpcklwd mm0, mm1 ; 31 21 11 01 30 20
10 00 | |
331 | |
332 punpckhwd mm3, mm1 ; 33 23 13 03 32 22
12 02 | |
333 movq mm1, mm0 ; 31 21 11 01 30 20
10 00 | |
334 | |
335 punpckldq mm0, mm2 ; 70 60 50 40 30 20
10 00 =p3 | |
336 punpckhdq mm1, mm2 ; 71 61 51 41 31 21
11 01 =p2 | |
337 | |
338 movq mm2, mm1 ; 71 61 51 41 31 21
11 01 =p2 | |
339 psubusb mm2, mm0 ; p2-p3 | |
340 | |
341 psubusb mm0, mm1 ; p3-p2 | |
342 por mm0, mm2 ; mm0=abs(p3-p2) | |
343 | |
344 movq mm2, mm3 ; 33 23 13 03 32 22
12 02 | |
345 punpckldq mm2, mm4 ; 72 62 52 42 32 22
12 02 = p1 | |
346 | |
347 punpckhdq mm3, mm4 ; 73 63 53 43 33 23
13 03 = p0 | |
348 movq [rdx+8], mm3 ; save p0 | |
349 | |
350 movq [rdx], mm2 ; save p1 | |
351 movq mm5, mm2 ; mm5 = p1 | |
352 | |
353 psubusb mm2, mm1 ; p1-p2 | |
354 psubusb mm1, mm5 ; p2-p1 | |
355 | |
356 por mm1, mm2 ; mm1=abs(p2-p1) | |
357 mov rdx, arg(3) ;limit | |
358 | |
359 movq mm4, [rdx] ; mm4 = limit | |
360 psubusb mm7, mm4 | |
361 | |
362 psubusb mm0, mm4 | |
363 psubusb mm1, mm4 | |
364 | |
365 psubusb mm6, mm4 | |
366 por mm7, mm6 | |
367 | |
368 por mm0, mm1 | |
369 por mm0, mm7 ; abs(q3-q2) > lim
it || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit | |
370 | |
371 movq mm1, mm5 ; p1 | |
372 | |
373 movq mm7, mm3 ; mm3=mm7=p0 | |
374 psubusb mm7, mm5 ; p0 - p1 | |
375 | |
376 psubusb mm5, mm3 ; p1 - p0 | |
377 por mm5, mm7 ; abs(p1-p0) | |
378 | |
379 movq t0, mm5 ; save abs(p1-p0) | |
380 lea rdx, srct | |
381 | |
382 psubusb mm5, mm4 | |
383 por mm0, mm5 ; mm0=mask | |
384 | |
385 movq mm5, [rdx+16] ; mm5=q0 | |
386 movq mm7, [rdx+24] ; mm7=q1 | |
387 | |
388 movq mm6, mm5 ; mm6=q0 | |
389 movq mm2, mm7 ; q1 | |
390 psubusb mm5, mm7 ; q0-q1 | |
391 | |
392 psubusb mm7, mm6 ; q1-q0 | |
393 por mm7, mm5 ; abs(q1-q0) | |
394 | |
395 movq t1, mm7 ; save abs(q1-q0) | |
396 psubusb mm7, mm4 | |
397 | |
398 por mm0, mm7 ; mask | |
399 | |
400 movq mm5, mm2 ; q1 | |
401 psubusb mm5, mm1 ; q1-=p1 | |
402 psubusb mm1, mm2 ; p1-=q1 | |
403 por mm5, mm1 ; abs(p1-q1) | |
404 pand mm5, [GLOBAL(tfe)] ; set lsb of each by
te to zero | |
405 psrlw mm5, 1 ; abs(p1-q1)/2 | |
406 | |
407 mov rdx, arg(2) ;blimit ; | |
408 | |
409 movq mm4, [rdx] ;blimit | |
410 movq mm1, mm3 ; mm1=mm3=p0 | |
411 | |
412 movq mm7, mm6 ; mm7=mm6=q0 | |
413 psubusb mm1, mm7 ; p0-q0 | |
414 | |
415 psubusb mm7, mm3 ; q0-p0 | |
416 por mm1, mm7 ; abs(q0-p0) | |
417 paddusb mm1, mm1 ; abs(q0-p0)*2 | |
418 paddusb mm1, mm5 ; abs (p0 - q0) *2 +
abs(p1-q1)/2 | |
419 | |
420 psubusb mm1, mm4 ; abs (p0 - q0) *2 +
abs(p1-q1)/2 > blimit | |
421 por mm1, mm0; ; mask | |
422 | |
423 pxor mm0, mm0 | |
424 pcmpeqb mm1, mm0 | |
425 | |
426 ; calculate high edge variance | |
427 mov rdx, arg(4) ;thresh ; get thresh | |
428 movq mm7, [rdx] | |
429 ; | |
430 movq mm4, t0 ; get abs (p1 - p0) | |
431 psubusb mm4, mm7 | |
432 | |
433 movq mm3, t1 ; get abs (q1 - q0) | |
434 psubusb mm3, mm7 | |
435 | |
436 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p
1 - p0) > thresh | |
437 pcmpeqb mm4, mm0 | |
438 | |
439 pcmpeqb mm0, mm0 | |
440 pxor mm4, mm0 | |
441 | |
442 | |
443 | |
444 ; start work on filters | |
445 lea rdx, srct | |
446 | |
447 movq mm2, [rdx] ; p1 | |
448 movq mm7, [rdx+24] ; q1 | |
449 | |
450 movq mm6, [rdx+8] ; p0 | |
451 movq mm0, [rdx+16] ; q0 | |
452 | |
453 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed
values | |
454 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed
values | |
455 | |
456 psubsb mm2, mm7 ; p1 - q1 | |
457 pand mm2, mm4 ; high var mask (hvm)(p1 - q1) | |
458 | |
459 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed va
lues | |
460 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed va
lues | |
461 | |
462 movq mm3, mm0 ; q0 | |
463 psubsb mm0, mm6 ; q0 - p0 | |
464 | |
465 paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) | |
466 paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) | |
467 | |
468 paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) | |
469 pand mm1, mm2 ; mask filter values we don't ca
re about | |
470 | |
471 movq mm2, mm1 | |
472 paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1)
+ 4 | |
473 | |
474 paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1)
+ 3 | |
475 pxor mm0, mm0 ; | |
476 | |
477 pxor mm5, mm5 | |
478 punpcklbw mm0, mm2 ; | |
479 | |
480 punpckhbw mm5, mm2 ; | |
481 psraw mm0, 11 ; | |
482 | |
483 psraw mm5, 11 | |
484 packsswb mm0, mm5 | |
485 | |
486 movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3)
>> 3; | |
487 | |
488 pxor mm0, mm0 ; 0 | |
489 movq mm5, mm1 ; abcdefgh | |
490 | |
491 punpcklbw mm0, mm1 ; e0f0g0h0 | |
492 psraw mm0, 11 ; sign extended shift right by
3 | |
493 | |
494 pxor mm1, mm1 ; 0 | |
495 punpckhbw mm1, mm5 ; a0b0c0d0 | |
496 | |
497 psraw mm1, 11 ; sign extended shift right by
3 | |
498 movq mm5, mm0 ; save results | |
499 | |
500 packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4
) >>3 | |
501 paddsw mm5, [GLOBAL(ones)] | |
502 | |
503 paddsw mm1, [GLOBAL(ones)] | |
504 psraw mm5, 1 ; partial shifted one more tim
e for 2nd tap | |
505 | |
506 psraw mm1, 1 ; partial shifted one more tim
e for 2nd tap | |
507 packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4
) >>4 | |
508 | |
509 pandn mm4, mm5 ; high edge variance additive | |
510 | |
511 paddsb mm6, mm2 ; p0+= p0 add | |
512 pxor mm6, [GLOBAL(t80)] ; unoffset | |
513 | |
514 ; mm6=p0 ; | |
515 movq mm1, [rdx] ; p1 | |
516 pxor mm1, [GLOBAL(t80)] ; reoffset | |
517 | |
518 paddsb mm1, mm4 ; p1+= p1 add | |
519 pxor mm1, [GLOBAL(t80)] ; unoffset | |
520 ; mm6 = p0 mm1 = p1 | |
521 | |
522 psubsb mm3, mm0 ; q0-= q0 add | |
523 pxor mm3, [GLOBAL(t80)] ; unoffset | |
524 | |
525 ; mm3 = q0 | |
526 psubsb mm7, mm4 ; q1-= q1 add | |
527 pxor mm7, [GLOBAL(t80)] ; unoffset | |
528 ; mm7 = q1 | |
529 | |
530 ; transpose and write back | |
531 ; mm1 = 72 62 52 42 32 22 12 02 | |
532 ; mm6 = 73 63 53 43 33 23 13 03 | |
533 ; mm3 = 74 64 54 44 34 24 14 04 | |
534 ; mm7 = 75 65 55 45 35 25 15 05 | |
535 | |
536 movq mm2, mm1 ; 72 62 52 42 32 22 12 02 | |
537 punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 | |
538 | |
539 movq mm4, mm3 ; 74 64 54 44 34 24 14 04 | |
540 punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 | |
541 | |
542 punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 | |
543 punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 | |
544 | |
545 movq mm6, mm2 ; 33 32 23 22 13 12 03 02 | |
546 punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 | |
547 | |
548 punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 | |
549 movq mm5, mm1 ; 73 72 63 62 53 52 43 42 | |
550 | |
551 punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 | |
552 punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 | |
553 | |
554 | |
555 ; mm2 = 15 14 13 12 05 04 03 02 | |
556 ; mm6 = 35 34 33 32 25 24 23 22 | |
557 ; mm1 = 55 54 53 52 45 44 43 42 | |
558 ; mm5 = 75 74 73 72 65 64 63 62 | |
559 | |
560 | |
561 | |
562 movd [rsi+rax*4+2], mm2 | |
563 psrlq mm2, 32 | |
564 | |
565 movd [rdi+rax*4+2], mm2 | |
566 movd [rsi+rax*2+2], mm6 | |
567 | |
568 psrlq mm6, 32 | |
569 movd [rsi+rax+2],mm6 | |
570 | |
571 movd [rsi+2], mm1 | |
572 psrlq mm1, 32 | |
573 | |
574 movd [rdi+2], mm1 | |
575 neg rax | |
576 | |
577 movd [rdi+rax+2],mm5 | |
578 psrlq mm5, 32 | |
579 | |
580 movd [rdi+rax*2+2], mm5 | |
581 | |
582 lea rsi, [rsi+rax*8] | |
583 dec rcx | |
584 jnz .next8_v | |
585 | |
586 add rsp, 64 | |
587 pop rsp | |
588 ; begin epilog | |
589 pop rdi | |
590 pop rsi | |
591 RESTORE_GOT | |
592 UNSHADOW_ARGS | |
593 pop rbp | |
594 ret | |
595 | |
596 | |
597 ;void vp8_mbloop_filter_horizontal_edge_mmx | |
598 ;( | |
599 ; unsigned char *src_ptr, - first row below the edge (q0); p0..p3 are read from the rows above it | |
600 ; int src_pixel_step, - row pitch in bytes, sign-extended to 64 bits | |
601 ; const char *blimit, - 8 bytes are loaded; bound for abs(p0-q0)*2 + abs(p1-q1)/2 | |
602 ; const char *limit, - 8 bytes are loaded; bound for the neighbouring-pixel differences | |
603 ; const char *thresh, - 8 bytes are loaded; high edge variance threshold | |
604 ; int count - number of 8-pixel groups; tested after the loop body, so it must be >= 1 | |
605 ;) Rewrites p2, p1, p0, q0, q1 and q2 in place for each 8-pixel group. | |
606 global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE | |
607 sym(vp8_mbloop_filter_horizontal_edge_mmx): | |
608 push rbp | |
609 mov rbp, rsp | |
610 SHADOW_ARGS_TO_STACK 6 | |
611 GET_GOT rbx | |
612 push rsi | |
613 push rdi | |
614 ; end prolog | |
615 | |
616 ALIGN_STACK 16, rax | |
617 sub rsp, 32 ; reserve 32 bytes | |
618 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; holds abs(q0-q1) for the hev test | |
619 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; holds abs(p1-p0) for the hev test | |
620 | |
621 mov rsi, arg(0) ;src_ptr | |
622 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitc
h? | |
623 | |
624 movsxd rcx, dword ptr arg(5) ;count | |
625 .next8_mbh: | |
626 mov rdx, arg(3) ;limit | |
627 movq mm7, [rdx] | |
628 mov rdi, rsi ; rdi points to row +1 for indirect ad
dressing | |
629 add rdi, rax | |
630 | |
631 ; calculate breakout conditions | |
632 movq mm2, [rdi+2*rax] ; q3 | |
633 | |
634 movq mm1, [rsi+2*rax] ; q2 | |
635 movq mm6, mm1 ; q2 | |
636 psubusb mm1, mm2 ; q2-=q3 | |
637 psubusb mm2, mm6 ; q3-=q2 | |
638 por mm1, mm2 ; abs(q3-q2) | |
639 psubusb mm1, mm7 | |
640 | |
641 | |
642 ; mm1 = abs(q3-q2) less limit (saturated), mm6 =q2, mm7 = limit | |
643 movq mm4, [rsi+rax] ; q1 | |
644 movq mm3, mm4 ; q1 | |
645 psubusb mm4, mm6 ; q1-=q2 | |
646 psubusb mm6, mm3 ; q2-=q1 | |
647 por mm4, mm6 ; abs(q2-q1) | |
648 psubusb mm4, mm7 | |
649 por mm1, mm4 | |
650 | |
651 | |
652 ; mm1 = mask, mm3=q1, mm7 = limit | |
653 | |
654 movq mm4, [rsi] ; q0 | |
655 movq mm0, mm4 ; q0 | |
656 psubusb mm4, mm3 ; q0-=q1 | |
657 psubusb mm3, mm0 ; q1-=q0 | |
658 por mm4, mm3 ; abs(q0-q1) | |
659 movq t0, mm4 ; save to t0 | |
660 psubusb mm4, mm7 | |
661 por mm1, mm4 | |
662 | |
663 | |
664 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) | |
665 | |
666 neg rax ; negate pitch to deal with above bord
er | |
667 | |
668 movq mm2, [rsi+4*rax] ; p3 | |
669 movq mm4, [rdi+4*rax] ; p2 | |
670 movq mm5, mm4 ; p2 | |
671 psubusb mm4, mm2 ; p2-=p3 | |
672 psubusb mm2, mm5 ; p3-=p2 | |
673 por mm4, mm2 ; abs(p3 - p2) | |
674 psubusb mm4, mm7 | |
675 por mm1, mm4 | |
676 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) | |
677 | |
678 movq mm4, [rsi+2*rax] ; p1 | |
679 movq mm3, mm4 ; p1 | |
680 psubusb mm4, mm5 ; p1-=p2 | |
681 psubusb mm5, mm3 ; p2-=p1 | |
682 por mm4, mm5 ; abs(p2 - p1) | |
683 psubusb mm4, mm7 | |
684 por mm1, mm4 | |
685 | |
686 movq mm2, mm3 ; p1 | |
687 | |
688 | |
689 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) | |
690 | |
691 movq mm4, [rsi+rax] ; p0 | |
692 movq mm5, mm4 ; p0 | |
693 psubusb mm4, mm3 ; p0-=p1 | |
694 psubusb mm3, mm5 ; p1-=p0 | |
695 por mm4, mm3 ; abs(p1 - p0) | |
696 movq t1, mm4 ; save to t1 | |
697 psubusb mm4, mm7 | |
698 por mm1, mm4 | |
699 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) | |
700 ; mm5 = p0 | |
701 movq mm3, [rdi] ; q1 | |
702 movq mm4, mm3 ; q1 | |
703 psubusb mm3, mm2 ; q1-=p1 | |
704 psubusb mm2, mm4 ; p1-=q1 | |
705 por mm2, mm3 ; abs(p1-q1) | |
706 pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero | |
707 psrlw mm2, 1 ; abs(p1-q1)/2 | |
708 | |
709 movq mm6, mm5 ; p0 | |
710 movq mm3, mm0 ; q0 | |
711 psubusb mm5, mm3 ; p0-=q0 | |
712 psubusb mm3, mm6 ; q0-=p0 | |
713 por mm5, mm3 ; abs(p0 - q0) | |
714 paddusb mm5, mm5 ; abs(p0-q0)*2 | |
715 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 | |
716 | |
717 mov rdx, arg(2) ;blimit ; get blimit | |
718 movq mm7, [rdx] ; blimit | |
719 | |
720 psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > b
limit | |
721 por mm1, mm5 | |
722 pxor mm5, mm5 | |
723 pcmpeqb mm1, mm5 ; mask mm1 | |
724 | |
725 ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) | |
726 ; mm6 = p0, | |
727 | |
728 ; calculate high edge variance | |
729 mov rdx, arg(4) ;thresh ; get thresh | |
730 movq mm7, [rdx] ; | |
731 movq mm4, t0 ; get abs (q1 - q0) | |
732 psubusb mm4, mm7 | |
733 movq mm3, t1 ; get abs (p1 - p0) | |
734 psubusb mm3, mm7 | |
735 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh; OR the two excesses, a byte add could wrap to 0 | |
736 | |
737 pcmpeqb mm4, mm5 | |
738 | |
739 pcmpeqb mm5, mm5 | |
740 pxor mm4, mm5 | |
741 | |
742 | |
743 | |
744 ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) | |
745 ; mm6 = p0, mm4=hev | |
746 ; start work on filters | |
747 movq mm2, [rsi+2*rax] ; p1 | |
748 movq mm7, [rdi] ; q1 | |
749 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed value
s | |
750 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed value
s | |
751 psubsb mm2, mm7 ; p1 - q1 | |
752 | |
753 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values | |
754 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values | |
755 movq mm3, mm0 ; q0 | |
756 psubsb mm0, mm6 ; q0 - p0 | |
757 paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) | |
758 paddsb mm2, mm0 ; 2 * (q0 - p0) + (p1 - q1) | |
759 paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) | |
760 pand mm1, mm2 ; mask filter values we don't care abo
ut | |
761 | |
762 | |
763 ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 | |
764 movq mm2, mm1 ; vp8_filter | |
765 pand mm2, mm4; ; Filter2 = vp8_filter & hev | |
766 | |
767 movq mm5, mm2 ; | |
768 paddsb mm5, [GLOBAL(t3)]; | |
769 | |
770 pxor mm0, mm0 ; 0 | |
771 pxor mm7, mm7 ; 0 | |
772 | |
773 punpcklbw mm0, mm5 ; e0f0g0h0 | |
774 psraw mm0, 11 ; sign extended shift right by 3 | |
775 punpckhbw mm7, mm5 ; a0b0c0d0 | |
776 psraw mm7, 11 ; sign extended shift right by 3 | |
777 packsswb mm0, mm7 ; Filter2 >>=3; | |
778 | |
779 movq mm5, mm0 ; Filter2 | |
780 | |
781 paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) | |
782 pxor mm0, mm0 ; 0 | |
783 pxor mm7, mm7 ; 0 | |
784 | |
785 punpcklbw mm0, mm2 ; e0f0g0h0 | |
786 psraw mm0, 11 ; sign extended shift right by 3 | |
787 punpckhbw mm7, mm2 ; a0b0c0d0 | |
788 psraw mm7, 11 ; sign extended shift right by 3 | |
789 packsswb mm0, mm7 ; Filter1 >>=3; | |
790 | |
791 ; mm0= filter1 mm1 = vp8_filter, mm3 =qs0 mm5=filter2 mm4 =hev mm6=ps0 | |
792 psubsb mm3, mm0 ; qs0 =qs0 - filter1 | |
793 paddsb mm6, mm5 ; ps0 =ps0 + Filter2 | |
794 | |
795 ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 | |
796 ; vp8_filter &= ~hev; | |
797 ; Filter2 = vp8_filter; | |
798 pandn mm4, mm1 ; vp8_filter&=~hev | |
799 | |
800 | |
801 ; mm3=qs0, mm4=filter2, mm6=ps0 | |
802 | |
803 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); | |
804 ; s = vp8_signed_char_clamp(qs0 - u); | |
805 ; *oq0 = s^0x80; | |
806 ; s = vp8_signed_char_clamp(ps0 + u); | |
807 ; *op0 = s^0x80; | |
808 pxor mm0, mm0 | |
809 | |
810 pxor mm1, mm1 | |
811 pxor mm2, mm2 | |
812 punpcklbw mm1, mm4 | |
813 punpckhbw mm2, mm4 | |
814 pmulhw mm1, [GLOBAL(s27)] | |
815 pmulhw mm2, [GLOBAL(s27)] | |
816 paddw mm1, [GLOBAL(s63)] | |
817 paddw mm2, [GLOBAL(s63)] | |
818 psraw mm1, 7 | |
819 psraw mm2, 7 | |
820 packsswb mm1, mm2 | |
821 | |
822 psubsb mm3, mm1 | |
823 paddsb mm6, mm1 | |
824 | |
825 pxor mm3, [GLOBAL(t80)] | |
826 pxor mm6, [GLOBAL(t80)] | |
827 movq [rsi+rax], mm6 | |
828 movq [rsi], mm3 | |
829 | |
830 ; roughly 2/7th difference across boundary | |
831 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); | |
832 ; s = vp8_signed_char_clamp(qs1 - u); | |
833 ; *oq1 = s^0x80; | |
834 ; s = vp8_signed_char_clamp(ps1 + u); | |
835 ; *op1 = s^0x80; | |
836 pxor mm1, mm1 | |
837 pxor mm2, mm2 | |
838 punpcklbw mm1, mm4 | |
839 punpckhbw mm2, mm4 | |
840 pmulhw mm1, [GLOBAL(s18)] | |
841 pmulhw mm2, [GLOBAL(s18)] | |
842 paddw mm1, [GLOBAL(s63)] | |
843 paddw mm2, [GLOBAL(s63)] | |
844 psraw mm1, 7 | |
845 psraw mm2, 7 | |
846 packsswb mm1, mm2 | |
847 | |
848 movq mm3, [rdi] | |
849 movq mm6, [rsi+rax*2] ; p1 | |
850 | |
851 pxor mm3, [GLOBAL(t80)] | |
852 pxor mm6, [GLOBAL(t80)] | |
853 | |
854 paddsb mm6, mm1 | |
855 psubsb mm3, mm1 | |
856 | |
857 pxor mm6, [GLOBAL(t80)] | |
858 pxor mm3, [GLOBAL(t80)] | |
859 movq [rdi], mm3 | |
860 movq [rsi+rax*2], mm6 | |
861 | |
862 ; roughly 1/7th difference across boundary | |
863 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); | |
864 ; s = vp8_signed_char_clamp(qs2 - u); | |
865 ; *oq2 = s^0x80; | |
866 ; s = vp8_signed_char_clamp(ps2 + u); | |
867 ; *op2 = s^0x80; | |
868 pxor mm1, mm1 | |
869 pxor mm2, mm2 | |
870 punpcklbw mm1, mm4 | |
871 punpckhbw mm2, mm4 | |
872 pmulhw mm1, [GLOBAL(s9)] | |
873 pmulhw mm2, [GLOBAL(s9)] | |
874 paddw mm1, [GLOBAL(s63)] | |
875 paddw mm2, [GLOBAL(s63)] | |
876 psraw mm1, 7 | |
877 psraw mm2, 7 | |
878 packsswb mm1, mm2 | |
879 | |
880 | |
881 movq mm6, [rdi+rax*4] | |
882 neg rax | |
883 movq mm3, [rdi+rax ] | |
884 | |
885 pxor mm6, [GLOBAL(t80)] | |
886 pxor mm3, [GLOBAL(t80)] | |
887 | |
888 paddsb mm6, mm1 | |
889 psubsb mm3, mm1 | |
890 | |
891 pxor mm6, [GLOBAL(t80)] | |
892 pxor mm3, [GLOBAL(t80)] | |
893 movq [rdi+rax ], mm3 | |
894 neg rax | |
895 movq [rdi+rax*4], mm6 | |
896 | |
897 ;EARLY_BREAK_OUT: | |
898 neg rax | |
899 add rsi,8 | |
900 dec rcx | |
901 jnz .next8_mbh | |
902 | |
903 add rsp, 32 | |
904 pop rsp | |
905 ; begin epilog | |
906 pop rdi | |
907 pop rsi | |
908 RESTORE_GOT | |
909 UNSHADOW_ARGS | |
910 pop rbp | |
911 ret | |
912 | |
913 | |
914 ;void vp8_mbloop_filter_vertical_edge_mmx | |
915 ;( | |
916 ; unsigned char *src_ptr, | |
917 ; int src_pixel_step, | |
918 ; const char *blimit, | |
919 ; const char *limit, | |
920 ; const char *thresh, | |
921 ; int count | |
922 ;) | |
923 global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE | |
924 sym(vp8_mbloop_filter_vertical_edge_mmx): | |
925 push rbp | |
926 mov rbp, rsp | |
927 SHADOW_ARGS_TO_STACK 6 | |
928 GET_GOT rbx | |
929 push rsi | |
930 push rdi | |
931 ; end prolog | |
932 | |
933 ALIGN_STACK 16, rax | |
934 sub rsp, 96 ; reserve 96 bytes | |
935 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; | |
936 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; | |
937 %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; | |
938 | |
939 mov rsi, arg(0) ;src_ptr | |
940 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destinati
on pitch? | |
941 | |
942 lea rsi, [rsi + rax*4 - 4] | |
943 | |
944 movsxd rcx, dword ptr arg(5) ;count | |
945 .next8_mbv: | |
946 lea rdi, [rsi + rax] ; rdi points to row +1 for indirect
addressing | |
947 | |
948 ;transpose | |
949 movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72
71 70 | |
950 movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62
61 60 | |
951 | |
952 movq mm7, mm6 ; 77 76 75 74 73 72
71 70 | |
953 punpckhbw mm7, mm0 ; 77 67 76 66 75 65
74 64 | |
954 | |
955 punpcklbw mm6, mm0 ; 73 63 72 62 71 61
70 60 | |
956 movq mm0, [rsi+rax] ; 57 56 55 54 53 52
51 50 | |
957 | |
958 movq mm4, [rsi] ; 47 46 45 44 43 42
41 40 | |
959 movq mm5, mm4 ; 47 46 45 44 43 42
41 40 | |
960 | |
961 punpckhbw mm5, mm0 ; 57 47 56 46 55 45
54 44 | |
962 punpcklbw mm4, mm0 ; 53 43 52 42 51 41
50 40 | |
963 | |
964 movq mm3, mm5 ; 57 47 56 46 55 45
54 44 | |
965 punpckhwd mm5, mm7 ; 77 67 57 47 76 66
56 46 | |
966 | |
967 punpcklwd mm3, mm7 ; 75 65 55 45 74 64
54 44 | |
968 movq mm2, mm4 ; 53 43 52 42 51 41
50 40 | |
969 | |
970 punpckhwd mm4, mm6 ; 73 63 53 43 72 62
52 42 | |
971 punpcklwd mm2, mm6 ; 71 61 51 41 70 60
50 40 | |
972 | |
973 neg rax | |
974 | |
975 movq mm7, [rsi+rax] ; 37 36 35 34 33 32
31 30 | |
976 movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22
21 20 | |
977 | |
978 movq mm1, mm6 ; 27 26 25 24 23 22
21 20 | |
979 punpckhbw mm6, mm7 ; 37 27 36 36 35 25
34 24 | |
980 | |
981 punpcklbw mm1, mm7 ; 33 23 32 22 31 21
30 20 | |
982 | |
983 movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02
01 00 | |
984 punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05
14 04 | |
985 | |
986 movq mm0, mm7 ; 17 07 16 06 15 05
14 04 | |
987 punpckhwd mm7, mm6 ; 37 27 17 07 36 26
16 06 | |
988 | |
989 punpcklwd mm0, mm6 ; 35 25 15 05 34 24
14 04 | |
990 movq mm6, mm7 ; 37 27 17 07 36 26
16 06 | |
991 | |
992 punpckhdq mm7, mm5 ; 77 67 57 47 37 27
17 07 = q3 | |
993 punpckldq mm6, mm5 ; 76 66 56 46 36 26
16 06 = q2 | |
994 | |
995 lea rdx, srct | |
996 movq mm5, mm6 ; 76 66 56 46 36 26
16 06 | |
997 | |
998 movq [rdx+56], mm7 | |
999 psubusb mm5, mm7 ; q2-q3 | |
1000 | |
1001 | |
1002 movq [rdx+48], mm6 | |
1003 psubusb mm7, mm6 ; q3-q2 | |
1004 | |
1005 por mm7, mm5; ; mm7=abs (q3-q2) | |
1006 movq mm5, mm0 ; 35 25 15 05 34 24
14 04 | |
1007 | |
1008 punpckhdq mm5, mm3 ; 75 65 55 45 35 25
15 05 = q1 | |
1009 punpckldq mm0, mm3 ; 74 64 54 44 34 24
15 04 = q0 | |
1010 | |
1011 movq mm3, mm5 ; 75 65 55 45 35 25
15 05 = q1 | |
1012 psubusb mm3, mm6 ; q1-q2 | |
1013 | |
1014 psubusb mm6, mm5 ; q2-q1 | |
1015 por mm6, mm3 ; mm6=abs(q2-q1) | |
1016 | |
1017 movq [rdx+40], mm5 ; save q1 | |
1018 movq [rdx+32], mm0 ; save q0 | |
1019 | |
1020 movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02
01 00 | |
1021 punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01
10 00 | |
1022 | |
1023 movq mm0, mm3 ; 13 03 12 02 11 01
10 00 | |
1024 punpcklwd mm0, mm1 ; 31 21 11 01 30 20
10 00 | |
1025 | |
1026 punpckhwd mm3, mm1 ; 33 23 13 03 32 22
12 02 | |
1027 movq mm1, mm0 ; 31 21 11 01 30 20
10 00 | |
1028 | |
1029 punpckldq mm0, mm2 ; 70 60 50 40 30 20
10 00 =p3 | |
1030 punpckhdq mm1, mm2 ; 71 61 51 41 31 21
11 01 =p2 | |
1031 | |
1032 movq [rdx], mm0 ; save p3 | |
1033 movq [rdx+8], mm1 ; save p2 | |
1034 | |
1035 movq mm2, mm1 ; 71 61 51 41 31 21
11 01 =p2 | |
1036 psubusb mm2, mm0 ; p2-p3 | |
1037 | |
1038 psubusb mm0, mm1 ; p3-p2 | |
1039 por mm0, mm2 ; mm0=abs(p3-p2) | |
1040 | |
1041 movq mm2, mm3 ; 33 23 13 03 32 22
12 02 | |
1042 punpckldq mm2, mm4 ; 72 62 52 42 32 22
12 02 = p1 | |
1043 | |
1044 punpckhdq mm3, mm4 ; 73 63 53 43 33 23
13 03 = p0 | |
1045 movq [rdx+24], mm3 ; save p0 | |
1046 | |
1047 movq [rdx+16], mm2 ; save p1 | |
1048 movq mm5, mm2 ; mm5 = p1 | |
1049 | |
1050 psubusb mm2, mm1 ; p1-p2 | |
1051 psubusb mm1, mm5 ; p2-p1 | |
1052 | |
1053 por mm1, mm2 ; mm1=abs(p2-p1) | |
1054 mov rdx, arg(3) ;limit | |
1055 | |
1056 movq mm4, [rdx] ; mm4 = limit | |
1057 psubusb mm7, mm4 ; abs(q3-q2) > limit | |
1058 | |
1059 psubusb mm0, mm4 ; abs(p3-p2) > limit | |
1060 psubusb mm1, mm4 ; abs(p2-p1) > limit | |
1061 | |
1062 psubusb mm6, mm4 ; abs(q2-q1) > limit | |
1063 por mm7, mm6 ; or | |
1064 | |
1065 por mm0, mm1 ; | |
1066 por mm0, mm7 ; abs(q3-q2) > limit
|| abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit | |
1067 | |
1068 movq mm1, mm5 ; p1 | |
1069 | |
1070 movq mm7, mm3 ; mm3=mm7=p0 | |
1071 psubusb mm7, mm5 ; p0 - p1 | |
1072 | |
1073 psubusb mm5, mm3 ; p1 - p0 | |
1074 por mm5, mm7 ; abs(p1-p0) | |
1075 | |
1076 movq t0, mm5 ; save abs(p1-p0) | |
1077 lea rdx, srct | |
1078 | |
1079 psubusb mm5, mm4 ; mm5 = abs(p1-p0) >
limit | |
1080 por mm0, mm5 ; mm0=mask | |
1081 | |
1082 movq mm5, [rdx+32] ; mm5=q0 | |
1083 movq mm7, [rdx+40] ; mm7=q1 | |
1084 | |
1085 movq mm6, mm5 ; mm6=q0 | |
1086 movq mm2, mm7 ; q1 | |
1087 psubusb mm5, mm7 ; q0-q1 | |
1088 | |
1089 psubusb mm7, mm6 ; q1-q0 | |
1090 por mm7, mm5 ; abs(q1-q0) | |
1091 | |
1092 movq t1, mm7 ; save abs(q1-q0) | |
1093 psubusb mm7, mm4 ; mm7=abs(q1-q0)> li
mit | |
1094 | |
1095 por mm0, mm7 ; mask | |
1096 | |
1097 movq mm5, mm2 ; q1 | |
1098 psubusb mm5, mm1 ; q1-=p1 | |
1099 psubusb mm1, mm2 ; p1-=q1 | |
1100 por mm5, mm1 ; abs(p1-q1) | |
1101 pand mm5, [GLOBAL(tfe)] ; set lsb of each by
te to zero | |
1102 psrlw mm5, 1 ; abs(p1-q1)/2 | |
1103 | |
1104 mov rdx, arg(2) ;blimit ; | |
1105 | |
1106 movq mm4, [rdx] ;blimit | |
1107 movq mm1, mm3 ; mm1=mm3=p0 | |
1108 | |
1109 movq mm7, mm6 ; mm7=mm6=q0 | |
1110 psubusb mm1, mm7 ; p0-q0 | |
1111 | |
1112 psubusb mm7, mm3 ; q0-p0 | |
1113 por mm1, mm7 ; abs(q0-p0) | |
1114 paddusb mm1, mm1 ; abs(q0-p0)*2 | |
1115 paddusb mm1, mm5 ; abs (p0 - q0) *2 +
abs(p1-q1)/2 | |
1116 | |
1117 psubusb mm1, mm4 ; abs (p0 - q0) *2 +
abs(p1-q1)/2 > blimit | |
1118 por mm1, mm0; ; mask | |
1119 | |
1120 pxor mm0, mm0 | |
1121 pcmpeqb mm1, mm0 | |
1122 | |
1123 ; calculate high edge variance | |
1124 mov rdx, arg(4) ;thresh ; get thresh | |
1125 movq mm7, [rdx] | |
1126 ; | |
1127 movq mm4, t0 ; get abs (q1 - q0) | |
1128 psubusb mm4, mm7 ; abs(q1 - q0) > thresh | |
1129 | |
1130 movq mm3, t1 ; get abs (p1 - p0) | |
1131 psubusb mm3, mm7 ; abs(p1 - p0)> thresh | |
1132 | |
1133 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p
1 - p0) > thresh | |
1134 pcmpeqb mm4, mm0 | |
1135 | |
1136 pcmpeqb mm0, mm0 | |
1137 pxor mm4, mm0 | |
1138 | |
1139 | |
1140 | |
1141 | |
1142 ; start work on filters | |
1143 lea rdx, srct | |
1144 | |
1145 ; start work on filters | |
1146 movq mm2, [rdx+16] ; p1 | |
1147 movq mm7, [rdx+40] ; q1 | |
1148 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed value
s | |
1149 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed value
s | |
1150 psubsb mm2, mm7 ; p1 - q1 | |
1151 | |
1152 movq mm6, [rdx+24] ; p0 | |
1153 movq mm0, [rdx+32] ; q0 | |
1154 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values | |
1155 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values | |
1156 | |
1157 movq mm3, mm0 ; q0 | |
1158 psubsb mm0, mm6 ; q0 - p0 | |
1159 paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) | |
1160 paddsb mm2, mm0 ; 2 * (q0 - p0) | |
1161 paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) | |
1162 pand mm1, mm2 ; mask filter values we don't care about | |
1163 | |
1164 ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 | |
1165 movq mm2, mm1 ; vp8_filter | |
1166 pand mm2, mm4; ; Filter2 = vp8_filter & hev | |
1167 | |
1168 movq mm5, mm2 ; | |
1169 paddsb mm5, [GLOBAL(t3)]; | |
1170 | |
1171 pxor mm0, mm0 ; 0 | |
1172 pxor mm7, mm7 ; 0 | |
1173 | |
1174 punpcklbw mm0, mm5 ; e0f0g0h0 | |
1175 psraw mm0, 11 ; sign extended shift right by 3 | |
1176 punpckhbw mm7, mm5 ; a0b0c0d0 | |
1177 psraw mm7, 11 ; sign extended shift right by 3 | |
1178 packsswb mm0, mm7 ; Filter2 >>=3; | |
1179 | |
1180 movq mm5, mm0 ; Filter2 | |
1181 | |
1182 paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) | |
1183 pxor mm0, mm0 ; 0 | |
1184 pxor mm7, mm7 ; 0 | |
1185 | |
1186 punpcklbw mm0, mm2 ; e0f0g0h0 | |
1187 psraw mm0, 11 ; sign extended shift right by 3 | |
1188 punpckhbw mm7, mm2 ; a0b0c0d0 | |
1189 psraw mm7, 11 ; sign extended shift right by 3 | |
1190 packsswb mm0, mm7 ; Filter2 >>=3; | |
1191 | |
1192 ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 | |
1193 psubsb mm3, mm0 ; qs0 =qs0 - filter1 | |
1194 paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 | |
1195 | |
1196 ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 | |
1197 ; vp8_filter &= ~hev; | |
1198 ; Filter2 = vp8_filter; | |
1199 pandn mm4, mm1 ; vp8_filter&=~hev | |
1200 | |
1201 | |
1202 ; mm3=qs0, mm4=filter2, mm6=ps0 | |
1203 | |
1204 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); | |
1205 ; s = vp8_signed_char_clamp(qs0 - u); | |
1206 ; *oq0 = s^0x80; | |
1207 ; s = vp8_signed_char_clamp(ps0 + u); | |
1208 ; *op0 = s^0x80; | |
1209 pxor mm0, mm0 | |
1210 | |
1211 pxor mm1, mm1 | |
1212 pxor mm2, mm2 | |
1213 punpcklbw mm1, mm4 | |
1214 punpckhbw mm2, mm4 | |
1215 pmulhw mm1, [GLOBAL(s27)] | |
1216 pmulhw mm2, [GLOBAL(s27)] | |
1217 paddw mm1, [GLOBAL(s63)] | |
1218 paddw mm2, [GLOBAL(s63)] | |
1219 psraw mm1, 7 | |
1220 psraw mm2, 7 | |
1221 packsswb mm1, mm2 | |
1222 | |
1223 psubsb mm3, mm1 | |
1224 paddsb mm6, mm1 | |
1225 | |
1226 pxor mm3, [GLOBAL(t80)] | |
1227 pxor mm6, [GLOBAL(t80)] | |
1228 movq [rdx+24], mm6 | |
1229 movq [rdx+32], mm3 | |
1230 | |
1231 ; roughly 2/7th difference across boundary | |
1232 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); | |
1233 ; s = vp8_signed_char_clamp(qs1 - u); | |
1234 ; *oq1 = s^0x80; | |
1235 ; s = vp8_signed_char_clamp(ps1 + u); | |
1236 ; *op1 = s^0x80; | |
1237 pxor mm1, mm1 | |
1238 pxor mm2, mm2 | |
1239 punpcklbw mm1, mm4 | |
1240 punpckhbw mm2, mm4 | |
1241 pmulhw mm1, [GLOBAL(s18)] | |
1242 pmulhw mm2, [GLOBAL(s18)] | |
1243 paddw mm1, [GLOBAL(s63)] | |
1244 paddw mm2, [GLOBAL(s63)] | |
1245 psraw mm1, 7 | |
1246 psraw mm2, 7 | |
1247 packsswb mm1, mm2 | |
1248 | |
1249 movq mm3, [rdx + 40] | |
1250 movq mm6, [rdx + 16] ; p1 | |
1251 pxor mm3, [GLOBAL(t80)] | |
1252 pxor mm6, [GLOBAL(t80)] | |
1253 | |
1254 paddsb mm6, mm1 | |
1255 psubsb mm3, mm1 | |
1256 | |
1257 pxor mm6, [GLOBAL(t80)] | |
1258 pxor mm3, [GLOBAL(t80)] | |
1259 movq [rdx + 40], mm3 | |
1260 movq [rdx + 16], mm6 | |
1261 | |
1262 ; roughly 1/7th difference across boundary | |
1263 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); | |
1264 ; s = vp8_signed_char_clamp(qs2 - u); | |
1265 ; *oq2 = s^0x80; | |
1266 ; s = vp8_signed_char_clamp(ps2 + u); | |
1267 ; *op2 = s^0x80; | |
1268 pxor mm1, mm1 | |
1269 pxor mm2, mm2 | |
1270 punpcklbw mm1, mm4 | |
1271 punpckhbw mm2, mm4 | |
1272 pmulhw mm1, [GLOBAL(s9)] | |
1273 pmulhw mm2, [GLOBAL(s9)] | |
1274 paddw mm1, [GLOBAL(s63)] | |
1275 paddw mm2, [GLOBAL(s63)] | |
1276 psraw mm1, 7 | |
1277 psraw mm2, 7 | |
1278 packsswb mm1, mm2 | |
1279 | |
1280 movq mm6, [rdx+ 8] | |
1281 movq mm3, [rdx+48] | |
1282 | |
1283 pxor mm6, [GLOBAL(t80)] | |
1284 pxor mm3, [GLOBAL(t80)] | |
1285 | |
1286 paddsb mm6, mm1 | |
1287 psubsb mm3, mm1 | |
1288 | |
1289 pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 | |
1290 pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 | |
1291 | |
1292 ; transpose and write back | |
1293 movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 | |
1294 movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 | |
1295 | |
1296 punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 | |
1297 punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 | |
1298 | |
1299 movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 | |
1300 movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 | |
1301 | |
1302 punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 | |
1303 punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 | |
1304 | |
1305 movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 | |
1306 punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 | |
1307 | |
1308 punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 | |
1309 movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 | |
1310 | |
1311 punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 | |
1312 punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 | |
1313 | |
1314 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 | |
1315 punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 | |
1316 | |
1317 movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 | |
1318 punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 | |
1319 | |
1320 movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 | |
1321 punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 | |
1322 | |
1323 punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 | |
1324 movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 | |
1325 | |
1326 punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 | |
1327 punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 | |
1328 | |
1329 movq [rsi+rax*4], mm0 ; write out | |
1330 movq [rdi+rax*4], mm6 ; write out | |
1331 | |
1332 movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 | |
1333 punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 | |
1334 | |
1335 punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 | |
1336 movq [rsi+rax*2], mm0 ; write out | |
1337 | |
1338 movq [rdi+rax*2], mm5 ; write out | |
1339 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 | |
1340 | |
1341 punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 | |
1342 punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 | |
1343 | |
1344 movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 | |
1345 punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 | |
1346 | |
1347 punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 | |
1348 movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40 | |
1349 | |
1350 movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 | |
1351 punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 | |
1352 | |
1353 punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 | |
1354 movq [rsi], mm0 ; write out | |
1355 | |
1356 movq [rdi], mm1 ; write out | |
1357 neg rax | |
1358 | |
1359 punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 | |
1360 punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 | |
1361 | |
1362 movq [rsi+rax*2], mm3 | |
1363 movq [rdi+rax*2], mm4 | |
1364 | |
1365 lea rsi, [rsi+rax*8] | |
1366 dec rcx | |
1367 | |
1368 jnz .next8_mbv | |
1369 | |
1370 add rsp, 96 | |
1371 pop rsp | |
1372 ; begin epilog | |
1373 pop rdi | |
1374 pop rsi | |
1375 RESTORE_GOT | |
1376 UNSHADOW_ARGS | |
1377 pop rbp | |
1378 ret | |
1379 | |
1380 | |
1381 ;void vp8_loop_filter_simple_horizontal_edge_mmx | |
1382 ;( | |
1383 ; unsigned char *src_ptr, | |
1384 ; int src_pixel_step, | |
1385 ; const char *blimit | |
1386 ;) | |
;
; Simple loop filter across a horizontal edge (MMX).
;
; Rows used, relative to src_ptr (step = src_pixel_step):
;   p1 = src_ptr - 2*step, p0 = src_ptr - step, q0 = src_ptr, q1 = src_ptr + step
; 16 bytes along the edge are processed as two passes of 8 bytes.
;
; Per byte (pixels are XORed with 0x80 so signed saturating byte ops apply):
;   mask   = 0xff when abs(p0-q0)*2 + abs(p1-q1)/2 does not exceed blimit, else 0
;   filter = ((p1 - q1) + 3 * (q0 - p0)) & mask
;   q0    -= (filter + 4) >> 3
;   p0    += ((filter + 4) - 1) >> 3
; Only the p0 and q0 rows are written back. 8 bytes are read from blimit.
;
; rax holds the step; it is negated at the top of each pass so the p rows can
; be addressed, and negated back before the next pass.
1387 global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE | |
1388 sym(vp8_loop_filter_simple_horizontal_edge_mmx): | |
1389 push rbp | |
1390 mov rbp, rsp | |
1391 SHADOW_ARGS_TO_STACK 3 | |
1392 GET_GOT rbx | |
1393 push rsi | |
1394 push rdi | |
1395 ; end prolog | |
1396 | |
1397 mov rsi, arg(0) ;src_ptr | |
1398 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitc
h? | |
1399 | |
1400 mov rcx, 2 ; pass count: 2 passes x 8 bytes | |
1401 .nexts8_h: | |
1402 mov rdx, arg(2) ;blimit ; get blimit | |
1403 movq mm3, [rdx] ; mm3 = blimit (8 bytes) | |
1404 | |
1405 mov rdi, rsi ; rdi points to row +1 for indirect ad
dressing | |
1406 add rdi, rax | |
; from here until the matching neg at the bottom of the loop, rax = -step
1407 neg rax | |
1408 | |
1409 ; calculate mask | |
1410 movq mm1, [rsi+2*rax] ; p1 | |
1411 movq mm0, [rdi] ; q1 | |
; keep unmodified copies: mm2 = p1 and mm7 = q1 for the filter, mm4 = q1 for the abs
1412 movq mm2, mm1 | |
1413 movq mm7, mm0 | |
1414 movq mm4, mm0 | |
1415 psubusb mm0, mm1 ; q1-=p1 | |
1416 psubusb mm1, mm4 ; p1-=q1 | |
1417 por mm1, mm0 ; abs(p1-q1) | |
1418 pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero | |
1419 psrlw mm1, 1 ; abs(p1-q1)/2 | |
1420 | |
1421 movq mm5, [rsi+rax] ; p0 | |
1422 movq mm4, [rsi] ; q0 | |
1423 movq mm0, mm4 ; q0 | |
1424 movq mm6, mm5 ; p0 | |
1425 psubusb mm5, mm4 ; p0-=q0 | |
1426 psubusb mm4, mm6 ; q0-=p0 | |
1427 por mm5, mm4 ; abs(p0 - q0) | |
1428 paddusb mm5, mm5 ; abs(p0-q0)*2 | |
1429 paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 | |
1430 | |
1431 psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > bl
imit | |
1432 pxor mm3, mm3 | |
; the saturated difference is zero exactly where the sum did not exceed blimit,
; so the compare with zero leaves mm5 = 0xff in every byte that is to be filtered
1433 pcmpeqb mm5, mm3 | |
1434 | |
1435 ; start work on filters | |
1436 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed value
s | |
1437 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed value
s | |
1438 psubsb mm2, mm7 ; p1 - q1 | |
1439 | |
1440 pxor mm6, [GLOBAL(t80)] ; p0 offset to convert to signed values | |
1441 pxor mm0, [GLOBAL(t80)] ; q0 offset to convert to signed values | |
1442 movq mm3, mm0 ; mm3 = signed q0, kept for the write back | |
1443 psubsb mm0, mm6 ; q0 - p0 | |
1444 paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) | |
1445 paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) | |
1446 paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) | |
1447 pand mm5, mm2 ; mask filter values we don't care abo
ut | |
1448 | |
1449 ; do + 4 side | |
1450 paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 | |
1451 | |
; MMX has no per-byte arithmetic shift, so ">> 3" is done on 16-bit words in
; two halves: mm0 handles the low byte of each word, mm1 the high byte.
1452 movq mm0, mm5 ; get a copy of filters | |
1453 psllw mm0, 8 ; move the low byte of each word into the high byte | |
1454 psraw mm0, 3 ; arithmetic shift right 3 (sign comes from the original low byte) | |
; logical shift right 8 puts the shifted value back in the low byte and clears the high byte
1455 psrlw mm0, 8 | |
1456 movq mm1, mm5 ; get a copy of filters | |
1457 psraw mm1, 11 ; arithmetic shift right 11: high byte >> 3, landing in the low byte | |
1458 psllw mm1, 8 ; shift left 8 to put it back in the high byte | |
1459 | |
1460 por mm0, mm1 ; put the two together to get result | |
1461 | |
1462 psubsb mm3, mm0 ; q0 -= (filter + 4) >> 3 | |
1463 pxor mm3, [GLOBAL(t80)] ; unoffset | |
1464 movq [rsi], mm3 ; write back q0 row | |
1465 | |
1466 | |
1467 ; now do +3 side | |
1468 psubsb mm5, [GLOBAL(t1s)] ; (filter + 4) - 1: +3 instead of +4 | |
1469 | |
; same two-halves per-byte ">> 3" as above; mm5 itself is reused for the high byte
1470 movq mm0, mm5 ; get a copy of filters | |
1471 psllw mm0, 8 ; move the low byte of each word into the high byte | |
1472 psraw mm0, 3 ; arithmetic shift right 3 (sign comes from the original low byte) | |
1473 psrlw mm0, 8 | |
1474 psraw mm5, 11 ; arithmetic shift right 11: high byte >> 3, landing in the low byte | |
1475 psllw mm5, 8 ; shift left 8 to put it back in the high byte | |
1476 por mm0, mm5 ; put the two together to get result | |
1477 | |
1478 | |
1479 paddsb mm6, mm0 ; p0 += (filter + 3) >> 3 | |
1480 pxor mm6, [GLOBAL(t80)] ; unoffset | |
1481 movq [rsi+rax], mm6 ; write back p0 row (rax is still -step) | |
1482 | |
1483 add rsi,8 | |
1484 neg rax | |
1485 dec rcx | |
1486 jnz .nexts8_h | |
1487 | |
1488 ; begin epilog | |
1489 pop rdi | |
1490 pop rsi | |
1491 RESTORE_GOT | |
1492 UNSHADOW_ARGS | |
1493 pop rbp | |
1494 ret | |
1495 | |
1496 | |
1497 ;void vp8_loop_filter_simple_vertical_edge_mmx | |
1498 ;( | |
1499 ; unsigned char *src_ptr, | |
1500 ; int src_pixel_step, | |
1501 ; const char *blimit | |
1502 ;) | |
;
; Simple loop filter across a vertical edge (MMX).
;
; For each row the four bytes at src_ptr-2 .. src_ptr+1 are p1, p0, q0, q1.
; 16 rows are processed as two passes of 8 rows. Each pass:
;   1. loads 4 bytes from each of 8 rows and transposes them so that
;      mm0 = p1, mm1 = p0, mm2 = q0, mm3 = q1 (one byte per row),
;   2. applies the same mask/filter arithmetic as the simple horizontal filter:
;        mask   = 0xff when abs(p0-q0)*2 + abs(p1-q1)/2 does not exceed blimit
;        filter = ((p1 - q1) + 3 * (q0 - p0)) & mask
;        q0    -= (filter + 4) >> 3
;        p0    += ((filter + 4) - 1) >> 3
;   3. transposes back and stores 4 bytes per row; p1 and q1 are stored with
;      their original values (saved in t0/t1).
;
; rsi is set to src_ptr + 4*step - 2, i.e. row 4 of the current group of 8,
; so rows 4..7 are reached with +step offsets and rows 0..3 with -step offsets.
1503 global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE | |
1504 sym(vp8_loop_filter_simple_vertical_edge_mmx): | |
1505 push rbp | |
1506 mov rbp, rsp | |
1507 SHADOW_ARGS_TO_STACK 3 | |
1508 GET_GOT rbx | |
1509 push rsi | |
1510 push rdi | |
1511 ; end prolog | |
1512 | |
; rax is only a scratch register for ALIGN_STACK here; it is loaded with the
; step afterwards. The 32 bytes reserved below hold the t0/t1 scratch slots.
1513 ALIGN_STACK 16, rax | |
1514 sub rsp, 32 ; reserve 32 bytes | |
1515 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; | |
1516 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; | |
1517 | |
1518 mov rsi, arg(0) ;src_ptr | |
1519 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitc
h? | |
1520 | |
1521 lea rsi, [rsi + rax*4- 2]; ; rsi = row 4, two bytes left of the edge | |
1522 mov rcx, 2 ; pass count: 2 passes x 8 rows | |
1523 .nexts8_v: | |
1524 | |
; rax = +step here: rdi = row 5, so [rdi + rax*2] is row 7 and [rsi + rax*2] is row 6
1525 lea rdi, [rsi + rax]; | |
1526 movd mm0, [rdi + rax * 2] ; xx xx xx xx 73
72 71 70 | |
1527 | |
1528 movd mm6, [rsi + rax * 2] ; xx xx xx xx 63
62 61 60 | |
1529 punpcklbw mm6, mm0 ; 73 63 72 62 71
61 70 60 | |
1530 | |
1531 movd mm0, [rsi + rax] ; xx xx xx xx 53
52 51 50 | |
1532 movd mm4, [rsi] ; xx xx xx xx 43
42 41 40 | |
1533 | |
1534 punpcklbw mm4, mm0 ; 53 43 52 42 51
41 50 40 | |
1535 movq mm5, mm4 ; 53 43 52 42 51
41 50 40 | |
1536 | |
1537 punpcklwd mm4, mm6 ; 71 61 51 41 70
60 50 40 | |
1538 punpckhwd mm5, mm6 ; 73 63 53 43 72
62 52 42 | |
1539 | |
; rax = -step from here until the neg near the end of the write back;
; rdi still points at row 5, so [rdi + rax*4] is row 1
1540 neg rax | |
1541 | |
1542 movd mm7, [rsi + rax] ; xx xx xx xx 33
32 31 30 | |
1543 movd mm6, [rsi + rax * 2] ; xx xx xx xx 23
22 21 20 | |
1544 | |
1545 punpcklbw mm6, mm7 ; 33 23 32 22 31
21 30 20 | |
1546 movd mm1, [rdi + rax * 4] ; xx xx xx xx 13
12 11 10 | |
1547 | |
1548 movd mm0, [rsi + rax * 4] ; xx xx xx xx 03
02 01 00 | |
1549 punpcklbw mm0, mm1 ; 13 03 12 02 11
01 10 00 | |
1550 | |
1551 movq mm2, mm0 ; 13 03 12 02 11
01 10 00 | |
1552 punpcklwd mm0, mm6 ; 31 21 11 01 30
20 10 00 | |
1553 | |
1554 punpckhwd mm2, mm6 ; 33 23 13 03 32
22 12 02 | |
; NOTE: the layout written on the next movq is stale. mm0 was just updated by
; punpcklwd, so the value copied into mm1 is 31 21 11 01 30 20 10 00.
1555 movq mm1, mm0 ; 13 03 12 02 11
01 10 00 | |
1556 | |
1557 punpckldq mm0, mm4 ; 70 60 50 40 30
20 10 00 = p1 | |
1558 movq mm3, mm2 ; 33 23 13 03 32
22 12 02 | |
1559 | |
1560 punpckhdq mm1, mm4 ; 71 61 51 41 31
21 11 01 = p0 | |
1561 punpckldq mm2, mm5 ; 72 62 52 42 32
22 12 02 = q0 | |
1562 | |
1563 punpckhdq mm3, mm5 ; 73 63 53 43 33
23 13 03 = q1 | |
1564 | |
1565 | |
1566 ; calculate mask | |
1567 movq mm6, mm0 ; p1 | |
1568 movq mm7, mm3 ; q1 | |
1569 psubusb mm7, mm6 ; q1-=p1 | |
1570 psubusb mm6, mm3 ; p1-=q1 | |
1571 por mm6, mm7 ; abs(p1-q1) | |
1572 pand mm6, [GLOBAL(tfe)] ; set lsb of eac
h byte to zero | |
1573 psrlw mm6, 1 ; abs(p1-q1)/2 | |
1574 | |
1575 movq mm5, mm1 ; p0 | |
1576 movq mm4, mm2 ; q0 | |
1577 | |
1578 psubusb mm5, mm2 ; p0-=q0 | |
1579 psubusb mm4, mm1 ; q0-=p0 | |
1580 | |
1581 por mm5, mm4 ; abs(p0 - q0) | |
1582 paddusb mm5, mm5 ; abs(p0-q0)*2 | |
1583 paddusb mm5, mm6 ; abs (p0 - q0)
*2 + abs(p1-q1)/2 | |
1584 | |
1585 mov rdx, arg(2) ;blimit ; get bl
imit | |
1586 movq mm7, [rdx] | |
1587 | |
1588 psubusb mm5, mm7 ; abs(p0 - q0) *
2 + abs(p1-q1)/2 > blimit | |
1589 pxor mm7, mm7 | |
1590 pcmpeqb mm5, mm7 ; mm5 = mask (0xff where the sum did not exceed blimit) | |
1591 | |
1592 ; start work on filters | |
; save the unmodified p1 and q1 columns; they are reloaded for the write back
1593 movq t0, mm0 | |
1594 movq t1, mm3 | |
1595 | |
1596 pxor mm0, [GLOBAL(t80)] ; p1 offset to c
onvert to signed values | |
1597 pxor mm3, [GLOBAL(t80)] ; q1 offset to c
onvert to signed values | |
1598 | |
1599 psubsb mm0, mm3 ; p1 - q1 | |
1600 movq mm6, mm1 ; p0 | |
1601 | |
1602 movq mm7, mm2 ; q0 | |
1603 pxor mm6, [GLOBAL(t80)] ; offset to conv
ert to signed values | |
1604 | |
1605 pxor mm7, [GLOBAL(t80)] ; offset to conv
ert to signed values | |
1606 movq mm3, mm7 ; mm3 = signed (offset) q0, kept for the final subtract | |
1607 | |
1608 psubsb mm7, mm6 ; q0 - p0 | |
1609 paddsb mm0, mm7 ; p1 - q1 + 1 *
(q0 - p0) | |
1610 | |
1611 paddsb mm0, mm7 ; p1 - q1 + 2 *
(q0 - p0) | |
1612 paddsb mm0, mm7 ; p1 - q1 + 3 *
(q0 - p0) | |
1613 | |
1614 pand mm5, mm0 ; mask filter va
lues we don't care about | |
1615 | |
1616 paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0)
+ (p1 - q1) + 4 | |
1617 | |
; MMX has no per-byte arithmetic shift, so ">> 3" is done on 16-bit words in
; two halves: mm0 handles the low byte of each word, mm7 the high byte.
; NOTE: the psraw on mm0 below shifts by 3, not 11 as its comment says; the
; psllw 8 before it and the psrlw 8 after it move the low byte up and back.
1618 movq mm0, mm5 ; get a copy of
filters | |
1619 psllw mm0, 8 ; shift left 8 | |
1620 psraw mm0, 3 ; arithmetic shi
ft right 11 | |
1621 psrlw mm0, 8 | |
1622 | |
1623 movq mm7, mm5 ; get a copy of
filters | |
1624 psraw mm7, 11 ; arithmetic shi
ft right 11 | |
1625 psllw mm7, 8 ; shift left 8 t
o put it back | |
1626 | |
1627 por mm0, mm7 ; put the two to
gether to get result | |
1628 | |
1629 psubsb mm3, mm0 ; q0 -= (filter + 4) >> 3 | |
1630 pxor mm3, [GLOBAL(t80)] ; unoffset | |
1631 | |
1632 ; now do +3 side | |
1633 psubsb mm5, [GLOBAL(t1s)] ; +3 instead of
+4 | |
1634 | |
; same two-halves per-byte ">> 3"; again the psraw on mm0 shifts by 3, and mm5
; itself is reused for the high-byte half
1635 movq mm0, mm5 ; get a copy of
filters | |
1636 psllw mm0, 8 ; shift left 8 | |
1637 psraw mm0, 3 ; arithmetic shi
ft right 11 | |
1638 psrlw mm0, 8 | |
1639 | |
1640 psraw mm5, 11 ; arithmetic shi
ft right 11 | |
1641 psllw mm5, 8 ; shift left 8 t
o put it back | |
1642 por mm0, mm5 ; put the two to
gether to get result | |
1643 | |
1644 paddsb mm6, mm0 ; p0 += (filter + 3) >> 3 | |
1645 pxor mm6, [GLOBAL(t80)] ; unoffset | |
1646 | |
1647 | |
; reload the original p1 (t0) and q1 (t1) columns for the transpose back
1648 movq mm0, t0 | |
1649 movq mm4, t1 | |
1650 | |
1651 ; mm0 = 70 60 50 40 30 20 10 00 | |
1652 ; mm6 = 71 61 51 41 31 21 11 01 | |
1653 ; mm3 = 72 62 52 42 32 22 12 02 | |
1654 ; mm4 = 73 63 53 43 33 23 13 03 | |
1655 ; transpose back to write out | |
1656 | |
1657 movq mm1, mm0 ; | |
1658 punpcklbw mm0, mm6 ; 31 30 21 20 11 10
01 00 | |
1659 | |
1660 punpckhbw mm1, mm6 ; 71 70 61 60 51 50
41 40 | |
1661 movq mm2, mm3 ; | |
1662 | |
1663 punpcklbw mm2, mm4 ; 33 32 23 22 13 12
03 02 | |
1664 movq mm5, mm1 ; 71 70 61 60 51 50
41 40 | |
1665 | |
1666 punpckhbw mm3, mm4 ; 73 72 63 62 53 52
43 42 | |
1667 movq mm6, mm0 ; 31 30 21 20 11 10
01 00 | |
1668 | |
1669 punpcklwd mm0, mm2 ; 13 12 11 10 03 02
01 00 | |
1670 punpckhwd mm6, mm2 ; 33 32 31 30 23 22
21 20 | |
1671 | |
; rax is still -step: rows 0..3 are stored with negative offsets from rsi (row 4)
; and rdi (row 5); each movd stores the low 4 bytes, psrlq 32 exposes the next row
1672 movd [rsi+rax*4], mm0 ; write 03 02 01 00 | |
1673 punpcklwd mm1, mm3 ; 53 52 51 50 43 42
41 40 | |
1674 | |
1675 psrlq mm0, 32 ; xx xx xx xx 13 12
11 10 | |
1676 punpckhwd mm5, mm3 ; 73 72 71 70 63 62
61 60 | |
1677 | |
1678 movd [rdi+rax*4], mm0 ; write 13 12 11 10 | |
1679 movd [rsi+rax*2], mm6 ; write 23 22 21 20 | |
1680 | |
1681 psrlq mm6, 32 ; 33 32 31 30 | |
1682 movd [rsi], mm1 ; write 43 42 41 40 | |
1683 | |
1684 movd [rsi + rax], mm6 ; write 33 32 31 30 | |
; back to rax = +step for rows 5..7 and for advancing to the next 8 rows
1685 neg rax | |
1686 | |
1687 movd [rsi + rax*2], mm5 ; write 63 62 61 60 | |
1688 psrlq mm1, 32 ; 53 52 51 50 | |
1689 | |
1690 movd [rdi], mm1 ; write out 53 52 51
50 | |
1691 psrlq mm5, 32 ; 73 72 71 70 | |
1692 | |
1693 movd [rdi + rax*2], mm5 ; write 73 72 71 70 | |
1694 | |
1695 lea rsi, [rsi+rax*8] ; next 8 | |
1696 | |
1697 dec rcx | |
1698 jnz .nexts8_v | |
1699 | |
; release the 32-byte t0/t1 area; the pop rsp that follows presumably restores
; the stack pointer saved by ALIGN_STACK (macro body not visible here - confirm)
1700 add rsp, 32 | |
1701 pop rsp | |
1702 ; begin epilog | |
1703 pop rdi | |
1704 pop rsi | |
1705 RESTORE_GOT | |
1706 UNSHADOW_ARGS | |
1707 pop rbp | |
1708 ret | |
1709 | |
1710 | |
1711 | |
1712 ;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, | |
1713 ; int y_stride, | |
1714 ; loop_filter_info *lfi) | |
1715 ;{ | |
1716 ; | |
1717 ; | |
1718 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->
lim,lfi->thr,2); | |
1719 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->
lim,lfi->thr,2); | |
1720 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi-
>lim,lfi->thr,2); | |
1721 ;} | |
1722 | |
; Read-only constants shared by the MMX loop filters in this file.
; Each is 8 bytes (one MMX register) and is 16-byte aligned.
1723 SECTION_RODATA | |
1724 align 16 | |
; 0xfe per byte: clears bit 0 of every byte before a word-wide "psrlw 1", so no
; bit crosses from the high byte of a word into the low byte during the halving.
1725 tfe: | |
1726 times 8 db 0xfe | |
1727 align 16 | |
; 0x80 per byte: XORed with pixels to switch between unsigned and signed form.
1728 t80: | |
1729 times 8 db 0x80 | |
1730 align 16 | |
; 1 per byte: subtracted from (filter + 4) to obtain the "+3" rounding variant.
1731 t1s: | |
1732 times 8 db 0x01 | |
1733 align 16 | |
; 3 per byte: rounding term added to Filter2 before the ">> 3".
1734 t3: | |
1735 times 8 db 0x03 | |
1736 align 16 | |
; 4 per byte: rounding term added to the filter value before the ">> 3".
1737 t4: | |
1738 times 8 db 0x04 | |
1739 align 16 | |
; 1 per 16-bit word. Not referenced in the part of the file visible to this
; review - TODO confirm whether it is still used anywhere.
1740 ones: | |
1741 times 4 dw 0x0001 | |
1742 align 16 | |
; 27, 18 and 9 placed in the high byte of each word (27<<8, 18<<8, 9<<8).
; Used with pmulhw on filter bytes that were unpacked into the high byte of a
; word, so the high 16 bits of the product are filter * 27 / 18 / 9.
1743 s27: | |
1744 times 4 dw 0x1b00 | |
1745 align 16 | |
1746 s18: | |
1747 times 4 dw 0x1200 | |
1748 align 16 | |
1749 s9: | |
1750 times 4 dw 0x0900 | |
1751 align 16 | |
; 63 per 16-bit word: rounding term added before the "psraw 7" in
; (63 + Filter2 * k) >> 7.
1752 s63: | |
1753 times 4 dw 0x003f | |
OLD | NEW |