Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(580)

Side by Side Diff: source/libvpx/vp8/common/x86/loopfilter_mmx.asm

Issue 1255053007: Rename vp8 loopfilter_mmx.asm (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « libvpx_srcs_x86_64.gypi ('k') | source/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14
15 ;void vp8_loop_filter_horizontal_edge_mmx
16 ;(
17 ;    unsigned char *src_ptr,
18 ;    int  src_pixel_step,
19 ;    const char *blimit,
20 ;    const char *limit,
21 ;    const char *thresh,
22 ;    int count
23 ;)
; Applies the VP8 normal loop filter across a horizontal edge, processing
; `count` groups of 8 pixels per iteration.  Builds a per-pixel filter mask
; from blimit/limit, a high-edge-variance (hev) mask from thresh, then
; adjusts p1,p0,q0,q1 in place.
24 global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
25 sym(vp8_loop_filter_horizontal_edge_mmx):
26     push        rbp
27     mov         rbp, rsp
28     SHADOW_ARGS_TO_STACK 6
29     GET_GOT     rbx
30     push        rsi
31     push        rdi
32     ; end prolog
33
34     ALIGN_STACK 16, rax
35     sub         rsp, 32                                      ; reserve 32 bytes
36     %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[8];
37     %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[8];
38
39         mov         rsi, arg(0) ;src_ptr
40         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
41
42         movsxd      rcx, dword ptr arg(5) ;count
43 .next8_h:
44         mov         rdx, arg(3) ;limit
45         movq        mm7, [rdx]
46         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
47         add         rdi, rax
48
49         ; calculate breakout conditions (filter mask from inter-pixel steps vs limit)
50         movq        mm2, [rdi+2*rax]      ; q3
51         movq        mm1, [rsi+2*rax]      ; q2
52         movq        mm6, mm1              ; q2
53         psubusb     mm1, mm2              ; q2-=q3
54         psubusb     mm2, mm6              ; q3-=q2
55         por         mm1, mm2              ; abs(q3-q2)
56         psubusb     mm1, mm7              ; nonzero where abs(q3-q2) > limit
57
58
59         movq        mm4, [rsi+rax]        ; q1
60         movq        mm3, mm4              ; q1
61         psubusb     mm4, mm6              ; q1-=q2
62         psubusb     mm6, mm3              ; q2-=q1
63         por         mm4, mm6              ; abs(q2-q1)
64
65         psubusb     mm4, mm7
66         por        mm1, mm4
67
68         movq        mm4, [rsi]            ; q0
69         movq        mm0, mm4              ; q0
70         psubusb     mm4, mm3              ; q0-=q1
71         psubusb     mm3, mm0              ; q1-=q0
72         por         mm4, mm3              ; abs(q0-q1)
73         movq        t0, mm4               ; save to t0 (reused for hev test below)
74         psubusb     mm4, mm7
75         por        mm1, mm4
76
77
78         neg         rax                   ; negate pitch to deal with above border
79
80         movq        mm2, [rsi+4*rax]      ; p3
81         movq        mm4, [rdi+4*rax]      ; p2
82         movq        mm5, mm4              ; p2
83         psubusb     mm4, mm2              ; p2-=p3
84         psubusb     mm2, mm5              ; p3-=p2
85         por         mm4, mm2              ; abs(p3 - p2)
86         psubusb     mm4, mm7
87         por        mm1, mm4
88
89
90         movq        mm4, [rsi+2*rax]      ; p1
91         movq        mm3, mm4              ; p1
92         psubusb     mm4, mm5              ; p1-=p2
93         psubusb     mm5, mm3              ; p2-=p1
94         por         mm4, mm5              ; abs(p2 - p1)
95         psubusb     mm4, mm7
96         por        mm1, mm4
97
98         movq        mm2, mm3              ; p1
99
100         movq        mm4, [rsi+rax]        ; p0
101         movq        mm5, mm4              ; p0
102         psubusb     mm4, mm3              ; p0-=p1
103         psubusb     mm3, mm5              ; p1-=p0
104         por         mm4, mm3              ; abs(p1 - p0)
105         movq        t1, mm4               ; save to t1 (reused for hev test below)
106         psubusb     mm4, mm7
107         por        mm1, mm4
108
109         movq        mm3, [rdi]            ; q1
110         movq        mm4, mm3              ; q1
111         psubusb     mm3, mm2              ; q1-=p1
112         psubusb     mm2, mm4              ; p1-=q1
113         por         mm2, mm3              ; abs(p1-q1)
114         pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
115         psrlw       mm2, 1                ; abs(p1-q1)/2
116
117         movq        mm6, mm5              ; p0
118         movq        mm3, [rsi]            ; q0
119         psubusb     mm5, mm3              ; p0-=q0
120         psubusb     mm3, mm6              ; q0-=p0
121         por         mm5, mm3              ; abs(p0 - q0)
122         paddusb     mm5, mm5              ; abs(p0-q0)*2
123         paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
124
125         mov         rdx, arg(2) ;blimit           ; get blimit
126         movq        mm7, [rdx]            ; blimit
127
128         psubusb     mm5, mm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
129         por         mm1, mm5
130         pxor        mm5, mm5
131         pcmpeqb     mm1, mm5              ; mask mm1: 0xff where the edge should be filtered
132
133         ; calculate high edge variance
134         mov         rdx, arg(4) ;thresh           ; get thresh
135         movq        mm7, [rdx]            ;
136         movq        mm4, t0               ; get abs (q1 - q0)
137         psubusb     mm4, mm7
138         movq        mm3, t1               ; get abs (p1 - p0)
139         psubusb     mm3, mm7
140         paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
141
142         pcmpeqb     mm4, mm5
143
144         pcmpeqb     mm5, mm5
145         pxor        mm4, mm5              ; mm4 = hev mask (0xff where high edge variance)
146
147
148         ; start work on filters
149         movq        mm2, [rsi+2*rax]      ; p1
150         movq        mm7, [rdi]            ; q1
151         pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
152         pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
153         psubsb      mm2, mm7              ; p1 - q1
154         pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
155         pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
156         pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
157         movq        mm3, mm0              ; q0
158         psubsb      mm0, mm6              ; q0 - p0
159         paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
160         paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
161         paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
162         pand       mm1, mm2              ; mask filter values we don't care about
163         movq        mm2, mm1
164         paddsb      mm1, [GLOBAL(t4)]     ;  3* (q0 - p0) + hvm(p1 - q1) + 4
165         paddsb      mm2, [GLOBAL(t3)]     ;  3* (q0 - p0) + hvm(p1 - q1) + 3
166
167         pxor        mm0, mm0             ;
168         pxor        mm5, mm5
169         punpcklbw   mm0, mm2            ; widen bytes to words in the high halves...
170         punpckhbw   mm5, mm2            ;
171         psraw       mm0, 11             ; ...so psraw by 11 == signed byte >> 3
172         psraw       mm5, 11
173         packsswb    mm0, mm5
174         movq        mm2, mm0             ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
175
176         pxor        mm0, mm0              ; 0
177         movq        mm5, mm1              ; abcdefgh
178         punpcklbw   mm0, mm1              ; e0f0g0h0
179         psraw       mm0, 11               ; sign extended shift right by 3
180         pxor        mm1, mm1              ; 0
181         punpckhbw   mm1, mm5              ; a0b0c0d0
182         psraw       mm1, 11               ; sign extended shift right by 3
183         movq        mm5, mm0              ; save results
184
185         packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
186         paddsw      mm5, [GLOBAL(ones)]
187         paddsw      mm1, [GLOBAL(ones)]
188         psraw       mm5, 1                ; partial shifted one more time for 2nd tap
189         psraw       mm1, 1                ; partial shifted one more time for 2nd tap
190         packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
191         pandn       mm4, mm5              ; high edge variance additive
192
193         paddsb      mm6, mm2              ; p0+= p0 add
194         pxor        mm6, [GLOBAL(t80)]    ; unoffset
195         movq        [rsi+rax], mm6        ; write back
196
197         movq        mm6, [rsi+2*rax]      ; p1
198         pxor        mm6, [GLOBAL(t80)]    ; reoffset
199         paddsb      mm6, mm4              ; p1+= p1 add
200         pxor        mm6, [GLOBAL(t80)]    ; unoffset
201         movq        [rsi+2*rax], mm6      ; write back
202
203         psubsb      mm3, mm0              ; q0-= q0 add
204         pxor        mm3, [GLOBAL(t80)]    ; unoffset
205         movq        [rsi], mm3            ; write back
206
207         psubsb      mm7, mm4              ; q1-= q1 add
208         pxor        mm7, [GLOBAL(t80)]    ; unoffset
209         movq        [rdi], mm7            ; write back
210
211         add         rsi,8
212         neg         rax
213         dec         rcx
214         jnz         .next8_h
215
216     add rsp, 32
217     pop rsp                              ; restore original rsp saved by ALIGN_STACK (presumably pushed by that macro — see x86_abi_support.asm)
218     ; begin epilog
219     pop rdi
220     pop rsi
221     RESTORE_GOT
222     UNSHADOW_ARGS
223     pop         rbp
224     ret
225
226
227 ;void vp8_loop_filter_vertical_edge_mmx
228 ;(
229 ;    unsigned char *src_ptr,
230 ;    int  src_pixel_step,
231 ;    const char *blimit,
232 ;    const char *limit,
233 ;    const char *thresh,
234 ;    int count
235 ;)
; Same filter as the horizontal-edge version, but for a vertical edge:
; each iteration loads an 8x8 pixel block, transposes it in MMX registers
; (spilling p3..q3 rows to the on-stack `srct` scratch area), filters, then
; transposes the four modified rows back and writes them with movd stores.
236 global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
237 sym(vp8_loop_filter_vertical_edge_mmx):
238     push        rbp
239     mov         rbp, rsp
240     SHADOW_ARGS_TO_STACK 6
241     GET_GOT     rbx
242     push        rsi
243     push        rdi
244     ; end prolog
245
246     ALIGN_STACK 16, rax
247     sub          rsp, 64      ; reserve 64 bytes
248     %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
249     %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
250     %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
251
252         mov         rsi, arg(0) ;src_ptr
253         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
254
255         lea         rsi, [rsi + rax*4 - 4]
256
257         movsxd      rcx, dword ptr arg(5) ;count
258 .next8_v:
259         mov         rdi, rsi           ; rdi points to row +1 for indirect addressing
260         add         rdi, rax
261
262
263         ;transpose
264         movq        mm6, [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
265         movq        mm7, mm6                         ; 77 76 75 74 73 72 71 70
266
267         punpckhbw   mm7, [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
268         punpcklbw   mm6, [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
269
270         movq        mm4, [rsi]                       ; 47 46 45 44 43 42 41 40
271         movq        mm5, mm4                         ; 47 46 45 44 43 42 41 40
272
273         punpckhbw   mm5, [rsi+rax]                   ; 57 47 56 46 55 45 54 44
274         punpcklbw   mm4, [rsi+rax]                   ; 53 43 52 42 51 41 50 40
275
276         movq        mm3, mm5                         ; 57 47 56 46 55 45 54 44
277         punpckhwd   mm5, mm7                         ; 77 67 57 47 76 66 56 46
278
279         punpcklwd   mm3, mm7                         ; 75 65 55 45 74 64 54 44
280         movq        mm2, mm4                         ; 53 43 52 42 51 41 50 40
281
282         punpckhwd   mm4, mm6                         ; 73 63 53 43 72 62 52 42
283         punpcklwd   mm2, mm6                         ; 71 61 51 41 70 60 50 40
284
285         neg         rax
286         movq        mm6, [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
287
288         movq        mm1, mm6                         ; 27 26 25 24 23 22 21 20
289         punpckhbw   mm6, [rsi+rax]                   ; 37 27 36 26 35 25 34 24
290
291         punpcklbw   mm1, [rsi+rax]                   ; 33 23 32 22 31 21 30 20
292         movq        mm7, [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
293
294         punpckhbw   mm7, [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
295         movq        mm0, mm7                         ; 17 07 16 06 15 05 14 04
296
297         punpckhwd   mm7, mm6                         ; 37 27 17 07 36 26 16 06
298         punpcklwd   mm0, mm6                         ; 35 25 15 05 34 24 14 04
299
300         movq        mm6, mm7                         ; 37 27 17 07 36 26 16 06
301         punpckhdq   mm7, mm5                         ; 77 67 57 47 37 27 17 07 = q3
302
303         punpckldq   mm6, mm5                         ; 76 66 56 46 36 26 16 06 = q2
304
305         movq        mm5, mm6                         ; 76 66 56 46 36 26 16 06
306         psubusb     mm5, mm7                         ; q2-q3
307
308         psubusb     mm7, mm6                         ; q3-q2
309         por         mm7, mm5;                        ; mm7=abs (q3-q2)
310
311         movq        mm5, mm0                         ; 35 25 15 05 34 24 14 04
312         punpckhdq   mm5, mm3                         ; 75 65 55 45 35 25 15 05 = q1
313
314         punpckldq   mm0, mm3                         ; 74 64 54 44 34 24 14 04 = q0
315         movq        mm3, mm5                         ; 75 65 55 45 35 25 15 05 = q1
316
317         psubusb     mm3, mm6                         ; q1-q2
318         psubusb     mm6, mm5                         ; q2-q1
319
320         por         mm6, mm3                         ; mm6=abs(q2-q1)
321         lea         rdx, srct
322
323         movq        [rdx+24], mm5                    ; save q1
324         movq        [rdx+16], mm0                    ; save q0
325
326         movq        mm3, [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
327         punpcklbw   mm3, [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
328
329         movq        mm0, mm3                         ; 13 03 12 02 11 01 10 00
330         punpcklwd   mm0, mm1                         ; 31 21 11 01 30 20 10 00
331
332         punpckhwd   mm3, mm1                         ; 33 23 13 03 32 22 12 02
333         movq        mm1, mm0                         ; 31 21 11 01 30 20 10 00
334
335         punpckldq   mm0, mm2                         ; 70 60 50 40 30 20 10 00  =p3
336         punpckhdq   mm1, mm2                         ; 71 61 51 41 31 21 11 01  =p2
337
338         movq        mm2, mm1                         ; 71 61 51 41 31 21 11 01  =p2
339         psubusb     mm2, mm0                         ; p2-p3
340
341         psubusb     mm0, mm1                         ; p3-p2
342         por         mm0, mm2                         ; mm0=abs(p3-p2)
343
344         movq        mm2, mm3                         ; 33 23 13 03 32 22 12 02
345         punpckldq   mm2, mm4                         ; 72 62 52 42 32 22 12 02 = p1
346
347         punpckhdq   mm3, mm4                         ; 73 63 53 43 33 23 13 03 = p0
348         movq        [rdx+8], mm3                     ; save p0
349
350         movq        [rdx], mm2                       ; save p1
351         movq        mm5, mm2                         ; mm5 = p1
352
353         psubusb     mm2, mm1                         ; p1-p2
354         psubusb     mm1, mm5                         ; p2-p1
355
356         por         mm1, mm2                         ; mm1=abs(p2-p1)
357         mov         rdx, arg(3) ;limit
358
359         movq        mm4, [rdx]                       ; mm4 = limit
360         psubusb     mm7, mm4
361
362         psubusb     mm0, mm4
363         psubusb     mm1, mm4
364
365         psubusb     mm6, mm4
366         por         mm7, mm6
367
368         por         mm0, mm1
369         por         mm0, mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
370
371         movq        mm1, mm5                         ; p1
372
373         movq        mm7, mm3                         ; mm3=mm7=p0
374         psubusb     mm7, mm5                         ; p0 - p1
375
376         psubusb     mm5, mm3                         ; p1 - p0
377         por         mm5, mm7                         ; abs(p1-p0)
378
379         movq        t0,  mm5                         ; save abs(p1-p0)
380         lea         rdx, srct
381
382         psubusb     mm5, mm4
383         por         mm0, mm5                         ; mm0=mask
384
385         movq        mm5, [rdx+16]                    ; mm5=q0
386         movq        mm7, [rdx+24]                    ; mm7=q1
387
388         movq        mm6, mm5                         ; mm6=q0
389         movq        mm2, mm7                         ; q1
390         psubusb     mm5, mm7                         ; q0-q1
391
392         psubusb     mm7, mm6                         ; q1-q0
393         por         mm7, mm5                         ; abs(q1-q0)
394
395         movq        t1,  mm7                         ; save abs(q1-q0)
396         psubusb     mm7, mm4
397
398         por         mm0, mm7                         ; mask
399
400         movq        mm5, mm2                         ; q1
401         psubusb     mm5, mm1                         ; q1-=p1
402         psubusb     mm1, mm2                         ; p1-=q1
403         por         mm5, mm1                         ; abs(p1-q1)
404         pand        mm5, [GLOBAL(tfe)]               ; set lsb of each byte to zero
405         psrlw       mm5, 1                           ; abs(p1-q1)/2
406
407         mov         rdx, arg(2) ;blimit                      ;
408
409         movq        mm4, [rdx]                       ;blimit
410         movq        mm1, mm3                         ; mm1=mm3=p0
411
412         movq        mm7, mm6                         ; mm7=mm6=q0
413         psubusb     mm1, mm7                         ; p0-q0
414
415         psubusb     mm7, mm3                         ; q0-p0
416         por         mm1, mm7                         ; abs(q0-p0)
417         paddusb     mm1, mm1                         ; abs(q0-p0)*2
418         paddusb     mm1, mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
419
420         psubusb     mm1, mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
421         por         mm1, mm0;                        ; mask
422
423         pxor        mm0, mm0
424         pcmpeqb     mm1, mm0                         ; mm1 = filter mask (0xff = filter this column)
425
426         ; calculate high edge variance
427         mov         rdx, arg(4) ;thresh                      ; get thresh
428         movq        mm7, [rdx]
429         ;
430         movq        mm4, t0                          ; get abs (q1 - q0)
431         psubusb     mm4, mm7
432
433         movq        mm3, t1                          ; get abs (p1 - p0)
434         psubusb     mm3, mm7
435
436         por         mm4, mm3                         ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
437         pcmpeqb     mm4, mm0
438
439         pcmpeqb     mm0, mm0
440         pxor        mm4, mm0                         ; mm4 = hev mask
441
442
443
444         ; start work on filters
445         lea         rdx, srct
446
447         movq        mm2, [rdx]                       ; p1
448         movq        mm7, [rdx+24]                    ; q1
449
450         movq        mm6, [rdx+8]                     ; p0
451         movq        mm0, [rdx+16]                    ; q0
452
453         pxor        mm2, [GLOBAL(t80)]               ; p1 offset to convert to signed values
454         pxor        mm7, [GLOBAL(t80)]               ; q1 offset to convert to signed values
455
456         psubsb      mm2, mm7                         ; p1 - q1
457         pand        mm2, mm4                         ; high var mask (hvm)(p1 - q1)
458
459         pxor        mm6, [GLOBAL(t80)]               ; offset to convert to signed values
460         pxor        mm0, [GLOBAL(t80)]               ; offset to convert to signed values
461
462         movq        mm3, mm0                         ; q0
463         psubsb      mm0, mm6                         ; q0 - p0
464
465         paddsb      mm2, mm0                         ; 1 * (q0 - p0) + hvm(p1 - q1)
466         paddsb      mm2, mm0                         ; 2 * (q0 - p0) + hvm(p1 - q1)
467
468         paddsb      mm2, mm0                         ; 3 * (q0 - p0) + hvm(p1 - q1)
469         pand        mm1, mm2                         ; mask filter values we don't care about
470
471         movq        mm2, mm1
472         paddsb      mm1, [GLOBAL(t4)]                ;  3* (q0 - p0) + hvm(p1 - q1) + 4
473
474         paddsb      mm2, [GLOBAL(t3)]                ;  3* (q0 - p0) + hvm(p1 - q1) + 3
475         pxor        mm0, mm0                         ;
476
477         pxor        mm5, mm5
478         punpcklbw   mm0, mm2                         ; bytes widened to word high halves...
479
480         punpckhbw   mm5, mm2                         ;
481         psraw       mm0, 11                          ; ...so psraw by 11 == signed byte >> 3
482
483         psraw       mm5, 11
484         packsswb    mm0, mm5
485
486         movq        mm2, mm0                         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
487
488         pxor        mm0, mm0                         ; 0
489         movq        mm5, mm1                         ; abcdefgh
490
491         punpcklbw   mm0, mm1                         ; e0f0g0h0
492         psraw       mm0, 11                          ; sign extended shift right by 3
493
494         pxor        mm1, mm1                         ; 0
495         punpckhbw   mm1, mm5                         ; a0b0c0d0
496
497         psraw       mm1, 11                          ; sign extended shift right by 3
498         movq        mm5, mm0                         ; save results
499
500         packsswb    mm0, mm1                         ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >> 3
501         paddsw      mm5, [GLOBAL(ones)]
502
503         paddsw      mm1, [GLOBAL(ones)]
504         psraw       mm5, 1                           ; partial shifted one more time for 2nd tap
505
506         psraw       mm1, 1                           ; partial shifted one more time for 2nd tap
507         packsswb    mm5, mm1                         ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >> 4
508
509         pandn       mm4, mm5                         ; high edge variance additive
510
511         paddsb      mm6, mm2                         ; p0+= p0 add
512         pxor        mm6, [GLOBAL(t80)]               ; unoffset
513
514         ; mm6=p0                                     ;
515         movq        mm1, [rdx]                       ; p1
516         pxor        mm1, [GLOBAL(t80)]               ; reoffset
517
518         paddsb      mm1, mm4                         ; p1+= p1 add
519         pxor        mm1, [GLOBAL(t80)]               ; unoffset
520         ; mm6 = p0 mm1 = p1
521
522         psubsb      mm3, mm0                         ; q0-= q0 add
523         pxor        mm3, [GLOBAL(t80)]               ; unoffset
524
525         ; mm3 = q0
526         psubsb      mm7, mm4                         ; q1-= q1 add
527         pxor        mm7, [GLOBAL(t80)]               ; unoffset
528         ; mm7 = q1
529
530         ; transpose and write back
531         ; mm1 =    72 62 52 42 32 22 12 02
532         ; mm6 =    73 63 53 43 33 23 13 03
533         ; mm3 =    74 64 54 44 34 24 14 04
534         ; mm7 =    75 65 55 45 35 25 15 05
535
536         movq        mm2, mm1                         ; 72 62 52 42 32 22 12 02
537         punpcklbw   mm2, mm6                         ; 33 32 23 22 13 12 03 02
538
539         movq        mm4, mm3                         ; 74 64 54 44 34 24 14 04
540         punpckhbw   mm1, mm6                         ; 73 72 63 62 53 52 43 42
541
542         punpcklbw   mm4, mm7                         ; 35 34 25 24 15 14 05 04
543         punpckhbw   mm3, mm7                         ; 75 74 65 64 55 54 45 44
544
545         movq        mm6, mm2                         ; 33 32 23 22 13 12 03 02
546         punpcklwd   mm2, mm4                         ; 15 14 13 12 05 04 03 02
547
548         punpckhwd   mm6, mm4                         ; 35 34 33 32 25 24 23 22
549         movq        mm5, mm1                         ; 73 72 63 62 53 52 43 42
550
551         punpcklwd   mm1, mm3                         ; 55 54 53 52 45 44 43 42
552         punpckhwd   mm5, mm3                         ; 75 74 73 72 65 64 63 62
553
554
555         ; mm2 = 15 14 13 12 05 04 03 02
556         ; mm6 = 35 34 33 32 25 24 23 22
557         ; mm1 = 55 54 53 52 45 44 43 42
558         ; mm5 = 75 74 73 72 65 64 63 62
559
560
561
562         movd        [rsi+rax*4+2], mm2
563         psrlq       mm2, 32
564
565         movd        [rdi+rax*4+2], mm2
566         movd        [rsi+rax*2+2], mm6
567
568         psrlq       mm6, 32
569         movd        [rsi+rax+2],mm6
570
571         movd        [rsi+2], mm1
572         psrlq       mm1, 32
573
574         movd        [rdi+2], mm1
575         neg         rax
576
577         movd        [rdi+rax+2],mm5
578         psrlq       mm5, 32
579
580         movd        [rdi+rax*2+2], mm5
581
582         lea         rsi, [rsi+rax*8]
583         dec         rcx
584         jnz         .next8_v
585
586     add rsp, 64
587     pop rsp                              ; restore original rsp saved by ALIGN_STACK (presumably pushed by that macro — see x86_abi_support.asm)
588     ; begin epilog
589     pop rdi
590     pop rsi
591     RESTORE_GOT
592     UNSHADOW_ARGS
593     pop         rbp
594     ret
595
596
597 ;void vp8_mbloop_filter_horizontal_edge_mmx
598 ;(
599 ;    unsigned char *src_ptr,
600 ;    int  src_pixel_step,
601 ;    const char *blimit,
602 ;    const char *limit,
603 ;    const char *thresh,
604 ;    int count
605 ;)
; Macroblock-edge variant of the horizontal loop filter: same mask/hev
; computation as the normal filter, but the wider filter writes p2..q2
; (six pixels) using the 27/18/9 taps (approx. 3/7, 2/7, 1/7 of the
; boundary difference).
606 global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
607 sym(vp8_mbloop_filter_horizontal_edge_mmx):
608     push        rbp
609     mov         rbp, rsp
610     SHADOW_ARGS_TO_STACK 6
611     GET_GOT     rbx
612     push        rsi
613     push        rdi
614     ; end prolog
615
616     ALIGN_STACK 16, rax
617     sub          rsp, 32      ; reserve 32 bytes
618     %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
619     %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
620
621         mov         rsi, arg(0) ;src_ptr
622         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
623
624         movsxd      rcx, dword ptr arg(5) ;count
625 .next8_mbh:
626         mov         rdx, arg(3) ;limit
627         movq        mm7, [rdx]
628         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
629         add         rdi, rax
630
631         ; calculate breakout conditions
632         movq        mm2, [rdi+2*rax]      ; q3
633
634         movq        mm1, [rsi+2*rax]      ; q2
635         movq        mm6, mm1              ; q2
636         psubusb     mm1, mm2              ; q2-=q3
637         psubusb     mm2, mm6              ; q3-=q2
638         por         mm1, mm2              ; abs(q3-q2)
639         psubusb     mm1, mm7
640
641
642         ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
643         movq        mm4, [rsi+rax]        ; q1
644         movq        mm3, mm4              ; q1
645         psubusb     mm4, mm6              ; q1-=q2
646         psubusb     mm6, mm3              ; q2-=q1
647         por         mm4, mm6              ; abs(q2-q1)
648         psubusb     mm4, mm7
649         por        mm1, mm4
650
651
652         ; mm1 = mask,      mm3=q1, mm7 = limit
653
654         movq        mm4, [rsi]            ; q0
655         movq        mm0, mm4              ; q0
656         psubusb     mm4, mm3              ; q0-=q1
657         psubusb     mm3, mm0              ; q1-=q0
658         por         mm4, mm3              ; abs(q0-q1)
659         movq        t0, mm4               ; save to t0 (reused for hev test below)
660         psubusb     mm4, mm7
661         por        mm1, mm4
662
663
664         ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
665
666         neg         rax                   ; negate pitch to deal with above border
667
668         movq        mm2, [rsi+4*rax]      ; p3
669         movq        mm4, [rdi+4*rax]      ; p2
670         movq        mm5, mm4              ; p2
671         psubusb     mm4, mm2              ; p2-=p3
672         psubusb     mm2, mm5              ; p3-=p2
673         por         mm4, mm2              ; abs(p3 - p2)
674         psubusb     mm4, mm7
675         por        mm1, mm4
676         ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
677
678         movq        mm4, [rsi+2*rax]      ; p1
679         movq        mm3, mm4              ; p1
680         psubusb     mm4, mm5              ; p1-=p2
681         psubusb     mm5, mm3              ; p2-=p1
682         por         mm4, mm5              ; abs(p2 - p1)
683         psubusb     mm4, mm7
684         por        mm1, mm4
685
686         movq        mm2, mm3              ; p1
687
688
689         ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
690
691         movq        mm4, [rsi+rax]        ; p0
692         movq        mm5, mm4              ; p0
693         psubusb     mm4, mm3              ; p0-=p1
694         psubusb     mm3, mm5              ; p1-=p0
695         por         mm4, mm3              ; abs(p1 - p0)
696         movq        t1, mm4               ; save to t1 (reused for hev test below)
697         psubusb     mm4, mm7
698         por        mm1, mm4
699         ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
700         ; mm5 = p0
701         movq        mm3, [rdi]            ; q1
702         movq        mm4, mm3              ; q1
703         psubusb     mm3, mm2              ; q1-=p1
704         psubusb     mm2, mm4              ; p1-=q1
705         por         mm2, mm3              ; abs(p1-q1)
706         pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
707         psrlw       mm2, 1                ; abs(p1-q1)/2
708
709         movq        mm6, mm5              ; p0
710         movq        mm3, mm0              ; q0
711         psubusb     mm5, mm3              ; p0-=q0
712         psubusb     mm3, mm6              ; q0-=p0
713         por         mm5, mm3              ; abs(p0 - q0)
714         paddusb     mm5, mm5              ; abs(p0-q0)*2
715         paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
716
717         mov         rdx, arg(2) ;blimit           ; get blimit
718         movq        mm7, [rdx]            ; blimit
719
720         psubusb     mm5, mm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
721         por         mm1, mm5
722         pxor        mm5, mm5
723         pcmpeqb     mm1, mm5              ; mask mm1: 0xff where the edge should be filtered
724
725         ; mm1 = mask, mm0=q0,  mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
726         ; mm6 = p0,
727
728         ; calculate high edge variance
729         mov         rdx, arg(4) ;thresh           ; get thresh
730         movq        mm7, [rdx]            ;
731         movq        mm4, t0               ; get abs (q1 - q0)
732         psubusb     mm4, mm7
733         movq        mm3, t1               ; get abs (p1 - p0)
734         psubusb     mm3, mm7
735         paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
736
737         pcmpeqb     mm4, mm5
738
739         pcmpeqb     mm5, mm5
740         pxor        mm4, mm5              ; mm4 = hev mask
741
742
743
744         ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
745         ; mm6 = p0, mm4=hev
746         ; start work on filters
747         movq        mm2, [rsi+2*rax]      ; p1
748         movq        mm7, [rdi]            ; q1
749         pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
750         pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
751         psubsb      mm2, mm7              ; p1 - q1
752
753         pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
754         pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
755         movq        mm3, mm0              ; q0
756         psubsb      mm0, mm6              ; q0 - p0
757         paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
758         paddsb      mm2, mm0              ; 2 * (q0 - p0)
759         paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
760         pand        mm1, mm2              ; mask filter values we don't care about
761
762
763         ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
764         movq        mm2, mm1              ; vp8_filter
765         pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
766
767         movq        mm5, mm2          ;
768         paddsb      mm5, [GLOBAL(t3)];
769
770         pxor        mm0, mm0              ; 0
771         pxor        mm7, mm7              ; 0
772
773         punpcklbw   mm0, mm5              ; e0f0g0h0
774         psraw       mm0, 11               ; sign extended shift right by 3
775         punpckhbw   mm7, mm5              ; a0b0c0d0
776         psraw       mm7, 11               ; sign extended shift right by 3
777         packsswb    mm0, mm7              ; Filter2 >>=3;
778
779         movq        mm5, mm0              ; Filter2
780
781         paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
782         pxor        mm0, mm0              ; 0
783         pxor        mm7, mm7              ; 0
784
785         punpcklbw   mm0, mm2              ; e0f0g0h0
786         psraw       mm0, 11               ; sign extended shift right by 3
787         punpckhbw   mm7, mm2              ; a0b0c0d0
788         psraw       mm7, 11               ; sign extended shift right by 3
789         packsswb    mm0, mm7              ; Filter2 >>=3;
790
791         ; mm0= filter2 mm1 = vp8_filter,  mm3 =qs0 mm5=s mm4 =hev mm6=ps0
792         psubsb      mm3, mm0              ; qs0 =qs0 - filter1
793         paddsb      mm6, mm5              ; ps0 =ps0 + Filter2
794
795         ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
796         ; vp8_filter &= ~hev;
797         ; Filter2 = vp8_filter;
798         pandn       mm4, mm1              ; vp8_filter&=~hev
799
800
801         ; mm3=qs0, mm4=filter2, mm6=ps0
802
803         ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
804         ; s = vp8_signed_char_clamp(qs0 - u);
805         ; *oq0 = s^0x80;
806         ; s = vp8_signed_char_clamp(ps0 + u);
807         ; *op0 = s^0x80;
808         pxor        mm0, mm0
809
810         pxor        mm1, mm1
811         pxor        mm2, mm2
812         punpcklbw   mm1, mm4
813         punpckhbw   mm2, mm4
814         pmulhw      mm1, [GLOBAL(s27)]
815         pmulhw      mm2, [GLOBAL(s27)]
816         paddw       mm1, [GLOBAL(s63)]
817         paddw       mm2, [GLOBAL(s63)]
818         psraw       mm1, 7
819         psraw       mm2, 7
820         packsswb    mm1, mm2
821
822         psubsb      mm3, mm1
823         paddsb      mm6, mm1
824
825         pxor        mm3, [GLOBAL(t80)]
826         pxor        mm6, [GLOBAL(t80)]
827         movq        [rsi+rax], mm6        ; write back op0
828         movq        [rsi], mm3            ; write back oq0
829
830         ; roughly 2/7th difference across boundary
831         ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
832         ; s = vp8_signed_char_clamp(qs1 - u);
833         ; *oq1 = s^0x80;
834         ; s = vp8_signed_char_clamp(ps1 + u);
835         ; *op1 = s^0x80;
836         pxor        mm1, mm1
837         pxor        mm2, mm2
838         punpcklbw   mm1, mm4
839         punpckhbw   mm2, mm4
840         pmulhw      mm1, [GLOBAL(s18)]
841         pmulhw      mm2, [GLOBAL(s18)]
842         paddw       mm1, [GLOBAL(s63)]
843         paddw       mm2, [GLOBAL(s63)]
844         psraw       mm1, 7
845         psraw       mm2, 7
846         packsswb    mm1, mm2
847
848         movq        mm3, [rdi]
849         movq        mm6, [rsi+rax*2]       ; p1
850
851         pxor        mm3, [GLOBAL(t80)]
852         pxor        mm6, [GLOBAL(t80)]
853
854         paddsb      mm6, mm1
855         psubsb      mm3, mm1
856
857         pxor        mm6, [GLOBAL(t80)]
858         pxor        mm3, [GLOBAL(t80)]
859         movq        [rdi], mm3            ; write back oq1
860         movq        [rsi+rax*2], mm6      ; write back op1
861
862         ; roughly 1/7th difference across boundary
863         ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
864         ; s = vp8_signed_char_clamp(qs2 - u);
865         ; *oq2 = s^0x80;
866         ; s = vp8_signed_char_clamp(ps2 + u);
867         ; *op2 = s^0x80;
868         pxor        mm1, mm1
869         pxor        mm2, mm2
870         punpcklbw   mm1, mm4
871         punpckhbw   mm2, mm4
872         pmulhw      mm1, [GLOBAL(s9)]
873         pmulhw      mm2, [GLOBAL(s9)]
874         paddw       mm1, [GLOBAL(s63)]
875         paddw       mm2, [GLOBAL(s63)]
876         psraw       mm1, 7
877         psraw       mm2, 7
878         packsswb    mm1, mm2
879
880
881         movq        mm6, [rdi+rax*4]      ; q2 (rax is still negative here)
882         neg         rax
883         movq        mm3, [rdi+rax  ]      ; p2 (rax positive)
884
885         pxor        mm6, [GLOBAL(t80)]
886         pxor        mm3, [GLOBAL(t80)]
887
888         paddsb      mm6, mm1
889         psubsb      mm3, mm1
890
891         pxor        mm6, [GLOBAL(t80)]
892         pxor        mm3, [GLOBAL(t80)]
893         movq        [rdi+rax  ], mm3
894         neg        rax
895         movq        [rdi+rax*4], mm6
896
897 ;EARLY_BREAK_OUT:
898         neg         rax                   ; restore positive pitch before advancing
899         add         rsi,8
900         dec         rcx
901         jnz         .next8_mbh
902
903     add rsp, 32
904     pop rsp                              ; restore original rsp saved by ALIGN_STACK (presumably pushed by that macro — see x86_abi_support.asm)
905     ; begin epilog
906     pop rdi
907     pop rsi
908     RESTORE_GOT
909     UNSHADOW_ARGS
910     pop         rbp
911     ret
912
913
914 ;void vp8_mbloop_filter_vertical_edge_mmx
915 ;(
916 ; unsigned char *src_ptr,
917 ; int src_pixel_step,
918 ; const char *blimit,
919 ; const char *limit,
920 ; const char *thresh,
921 ; int count
922 ;)
923 global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
924 sym(vp8_mbloop_filter_vertical_edge_mmx):
925 push rbp
926 mov rbp, rsp
927 SHADOW_ARGS_TO_STACK 6
928 GET_GOT rbx
929 push rsi
930 push rdi
931 ; end prolog
932
933 ALIGN_STACK 16, rax
934 sub rsp, 96 ; reserve 96 bytes
935 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
936 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
937 %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
938
939 mov rsi, arg(0) ;src_ptr
940 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destinati on pitch?
941
942 lea rsi, [rsi + rax*4 - 4]
943
944 movsxd rcx, dword ptr arg(5) ;count
945 .next8_mbv:
946 lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
947
948 ;transpose
949 movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70
950 movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
951
952 movq mm7, mm6 ; 77 76 75 74 73 72 71 70
953 punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64
954
955 punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
956 movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50
957
958 movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
959 movq mm5, mm4 ; 47 46 45 44 43 42 41 40
960
961 punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44
962 punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
963
964 movq mm3, mm5 ; 57 47 56 46 55 45 54 44
965 punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
966
967 punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
968 movq mm2, mm4 ; 53 43 52 42 51 41 50 40
969
970 punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
971 punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
972
973 neg rax
974
975 movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30
976 movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
977
978 movq mm1, mm6 ; 27 26 25 24 23 22 21 20
979 punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24
980
981 punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20
982
983 movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
984 punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
985
986 movq mm0, mm7 ; 17 07 16 06 15 05 14 04
987 punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
988
989 punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
990 movq mm6, mm7 ; 37 27 17 07 36 26 16 06
991
992 punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
993 punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
994
995 lea rdx, srct
996 movq mm5, mm6 ; 76 66 56 46 36 26 16 06
997
998 movq [rdx+56], mm7
999 psubusb mm5, mm7 ; q2-q3
1000
1001
1002 movq [rdx+48], mm6
1003 psubusb mm7, mm6 ; q3-q2
1004
1005 por mm7, mm5; ; mm7=abs (q3-q2)
1006 movq mm5, mm0 ; 35 25 15 05 34 24 14 04
1007
1008 punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
1009 punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0
1010
1011 movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
1012 psubusb mm3, mm6 ; q1-q2
1013
1014 psubusb mm6, mm5 ; q2-q1
1015 por mm6, mm3 ; mm6=abs(q2-q1)
1016
1017 movq [rdx+40], mm5 ; save q1
1018 movq [rdx+32], mm0 ; save q0
1019
1020 movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
1021 punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
1022
1023 movq mm0, mm3 ; 13 03 12 02 11 01 10 00
1024 punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
1025
1026 punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
1027 movq mm1, mm0 ; 31 21 11 01 30 20 10 00
1028
1029 punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
1030 punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
1031
1032 movq [rdx], mm0 ; save p3
1033 movq [rdx+8], mm1 ; save p2
1034
1035 movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
1036 psubusb mm2, mm0 ; p2-p3
1037
1038 psubusb mm0, mm1 ; p3-p2
1039 por mm0, mm2 ; mm0=abs(p3-p2)
1040
1041 movq mm2, mm3 ; 33 23 13 03 32 22 12 02
1042 punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
1043
1044 punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
1045 movq [rdx+24], mm3 ; save p0
1046
1047 movq [rdx+16], mm2 ; save p1
1048 movq mm5, mm2 ; mm5 = p1
1049
1050 psubusb mm2, mm1 ; p1-p2
1051 psubusb mm1, mm5 ; p2-p1
1052
1053 por mm1, mm2 ; mm1=abs(p2-p1)
1054 mov rdx, arg(3) ;limit
1055
1056 movq mm4, [rdx] ; mm4 = limit
1057 psubusb mm7, mm4 ; abs(q3-q2) > limit
1058
1059 psubusb mm0, mm4 ; abs(p3-p2) > limit
1060 psubusb mm1, mm4 ; abs(p2-p1) > limit
1061
1062 psubusb mm6, mm4 ; abs(q2-q1) > limit
1063 por mm7, mm6 ; or
1064
1065 por mm0, mm1 ;
1066 por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
1067
1068 movq mm1, mm5 ; p1
1069
1070 movq mm7, mm3 ; mm3=mm7=p0
1071 psubusb mm7, mm5 ; p0 - p1
1072
1073 psubusb mm5, mm3 ; p1 - p0
1074 por mm5, mm7 ; abs(p1-p0)
1075
1076 movq t0, mm5 ; save abs(p1-p0)
1077 lea rdx, srct
1078
1079 psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit
1080 por mm0, mm5 ; mm0=mask
1081
1082 movq mm5, [rdx+32] ; mm5=q0
1083 movq mm7, [rdx+40] ; mm7=q1
1084
1085 movq mm6, mm5 ; mm6=q0
1086 movq mm2, mm7 ; q1
1087 psubusb mm5, mm7 ; q0-q1
1088
1089 psubusb mm7, mm6 ; q1-q0
1090 por mm7, mm5 ; abs(q1-q0)
1091
1092 movq t1, mm7 ; save abs(q1-q0)
1093 psubusb mm7, mm4 ; mm7=abs(q1-q0)> li mit
1094
1095 por mm0, mm7 ; mask
1096
1097 movq mm5, mm2 ; q1
1098 psubusb mm5, mm1 ; q1-=p1
1099 psubusb mm1, mm2 ; p1-=q1
1100 por mm5, mm1 ; abs(p1-q1)
1101 pand mm5, [GLOBAL(tfe)] ; set lsb of each by te to zero
1102 psrlw mm5, 1 ; abs(p1-q1)/2
1103
1104 mov rdx, arg(2) ;blimit ;
1105
1106 movq mm4, [rdx] ;blimit
1107 movq mm1, mm3 ; mm1=mm3=p0
1108
1109 movq mm7, mm6 ; mm7=mm6=q0
1110 psubusb mm1, mm7 ; p0-q0
1111
1112 psubusb mm7, mm3 ; q0-p0
1113 por mm1, mm7 ; abs(q0-p0)
1114 paddusb mm1, mm1 ; abs(q0-p0)*2
1115 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
1116
1117 psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
1118 por mm1, mm0; ; mask
1119
1120 pxor mm0, mm0
1121 pcmpeqb mm1, mm0
1122
1123 ; calculate high edge variance
1124 mov rdx, arg(4) ;thresh ; get thresh
1125 movq mm7, [rdx]
1126 ;
1127 movq mm4, t0 ; get abs (q1 - q0)
1128 psubusb mm4, mm7 ; abs(q1 - q0) > thresh
1129
1130 movq mm3, t1 ; get abs (p1 - p0)
1131 psubusb mm3, mm7 ; abs(p1 - p0)> thresh
1132
1133 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p 1 - p0) > thresh
1134 pcmpeqb mm4, mm0
1135
1136 pcmpeqb mm0, mm0
1137 pxor mm4, mm0
1138
1139
1140
1141
1142 ; start work on filters
1143 lea rdx, srct
1144
1145 ; start work on filters
1146 movq mm2, [rdx+16] ; p1
1147 movq mm7, [rdx+40] ; q1
1148 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed value s
1149 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed value s
1150 psubsb mm2, mm7 ; p1 - q1
1151
1152 movq mm6, [rdx+24] ; p0
1153 movq mm0, [rdx+32] ; q0
1154 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
1155 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
1156
1157 movq mm3, mm0 ; q0
1158 psubsb mm0, mm6 ; q0 - p0
1159 paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
1160 paddsb mm2, mm0 ; 2 * (q0 - p0)
1161 paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
1162 pand mm1, mm2 ; mask filter values we don't care about
1163
1164 ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
1165 movq mm2, mm1 ; vp8_filter
1166 pand mm2, mm4; ; Filter2 = vp8_filter & hev
1167
1168 movq mm5, mm2 ;
1169 paddsb mm5, [GLOBAL(t3)];
1170
1171 pxor mm0, mm0 ; 0
1172 pxor mm7, mm7 ; 0
1173
1174 punpcklbw mm0, mm5 ; e0f0g0h0
1175 psraw mm0, 11 ; sign extended shift right by 3
1176 punpckhbw mm7, mm5 ; a0b0c0d0
1177 psraw mm7, 11 ; sign extended shift right by 3
1178 packsswb mm0, mm7 ; Filter2 >>=3;
1179
1180 movq mm5, mm0 ; Filter2
1181
1182 paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
1183 pxor mm0, mm0 ; 0
1184 pxor mm7, mm7 ; 0
1185
1186 punpcklbw mm0, mm2 ; e0f0g0h0
1187 psraw mm0, 11 ; sign extended shift right by 3
1188 punpckhbw mm7, mm2 ; a0b0c0d0
1189 psraw mm7, 11 ; sign extended shift right by 3
1190 packsswb mm0, mm7 ; Filter2 >>=3;
1191
1192 ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
1193 psubsb mm3, mm0 ; qs0 =qs0 - filter1
1194 paddsb mm6, mm5 ; ps0 =ps0 + Fitler2
1195
1196 ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
1197 ; vp8_filter &= ~hev;
1198 ; Filter2 = vp8_filter;
1199 pandn mm4, mm1 ; vp8_filter&=~hev
1200
1201
1202 ; mm3=qs0, mm4=filter2, mm6=ps0
1203
1204 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
1205 ; s = vp8_signed_char_clamp(qs0 - u);
1206 ; *oq0 = s^0x80;
1207 ; s = vp8_signed_char_clamp(ps0 + u);
1208 ; *op0 = s^0x80;
1209 pxor mm0, mm0
1210
1211 pxor mm1, mm1
1212 pxor mm2, mm2
1213 punpcklbw mm1, mm4
1214 punpckhbw mm2, mm4
1215 pmulhw mm1, [GLOBAL(s27)]
1216 pmulhw mm2, [GLOBAL(s27)]
1217 paddw mm1, [GLOBAL(s63)]
1218 paddw mm2, [GLOBAL(s63)]
1219 psraw mm1, 7
1220 psraw mm2, 7
1221 packsswb mm1, mm2
1222
1223 psubsb mm3, mm1
1224 paddsb mm6, mm1
1225
1226 pxor mm3, [GLOBAL(t80)]
1227 pxor mm6, [GLOBAL(t80)]
1228 movq [rdx+24], mm6
1229 movq [rdx+32], mm3
1230
1231 ; roughly 2/7th difference across boundary
1232 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
1233 ; s = vp8_signed_char_clamp(qs1 - u);
1234 ; *oq1 = s^0x80;
1235 ; s = vp8_signed_char_clamp(ps1 + u);
1236 ; *op1 = s^0x80;
1237 pxor mm1, mm1
1238 pxor mm2, mm2
1239 punpcklbw mm1, mm4
1240 punpckhbw mm2, mm4
1241 pmulhw mm1, [GLOBAL(s18)]
1242 pmulhw mm2, [GLOBAL(s18)]
1243 paddw mm1, [GLOBAL(s63)]
1244 paddw mm2, [GLOBAL(s63)]
1245 psraw mm1, 7
1246 psraw mm2, 7
1247 packsswb mm1, mm2
1248
1249 movq mm3, [rdx + 40]
1250 movq mm6, [rdx + 16] ; p1
1251 pxor mm3, [GLOBAL(t80)]
1252 pxor mm6, [GLOBAL(t80)]
1253
1254 paddsb mm6, mm1
1255 psubsb mm3, mm1
1256
1257 pxor mm6, [GLOBAL(t80)]
1258 pxor mm3, [GLOBAL(t80)]
1259 movq [rdx + 40], mm3
1260 movq [rdx + 16], mm6
1261
1262 ; roughly 1/7th difference across boundary
1263 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
1264 ; s = vp8_signed_char_clamp(qs2 - u);
1265 ; *oq2 = s^0x80;
1266 ; s = vp8_signed_char_clamp(ps2 + u);
1267 ; *op2 = s^0x80;
1268 pxor mm1, mm1
1269 pxor mm2, mm2
1270 punpcklbw mm1, mm4
1271 punpckhbw mm2, mm4
1272 pmulhw mm1, [GLOBAL(s9)]
1273 pmulhw mm2, [GLOBAL(s9)]
1274 paddw mm1, [GLOBAL(s63)]
1275 paddw mm2, [GLOBAL(s63)]
1276 psraw mm1, 7
1277 psraw mm2, 7
1278 packsswb mm1, mm2
1279
1280 movq mm6, [rdx+ 8]
1281 movq mm3, [rdx+48]
1282
1283 pxor mm6, [GLOBAL(t80)]
1284 pxor mm3, [GLOBAL(t80)]
1285
1286 paddsb mm6, mm1
1287 psubsb mm3, mm1
1288
1289 pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
1290 pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
1291
1292 ; transpose and write back
1293 movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
1294 movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
1295
1296 punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00
1297 punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40
1298
1299 movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02
1300 movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02
1301
1302 punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02
1303 punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42
1304
1305 movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00
1306 punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00
1307
1308 punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20
1309 movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40
1310
1311 punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40
1312 punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60
1313
1314 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
1315 punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04
1316
1317 movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06
1318 punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06
1319
1320 movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04
1321 punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04
1322
1323 punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24
1324 movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00
1325
1326 punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00
1327 punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10
1328
1329 movq [rsi+rax*4], mm0 ; write out
1330 movq [rdi+rax*4], mm6 ; write out
1331
1332 movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20
1333 punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20
1334
1335 punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30
1336 movq [rsi+rax*2], mm0 ; write out
1337
1338 movq [rdi+rax*2], mm5 ; write out
1339 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
1340
1341 punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44
1342 punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46
1343
1344 movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44
1345 punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44
1346
1347 punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64
1348 movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40
1349
1350 movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60
1351 punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40
1352
1353 punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50
1354 movq [rsi], mm0 ; write out
1355
1356 movq [rdi], mm1 ; write out
1357 neg rax
1358
1359 punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60
1360 punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60
1361
1362 movq [rsi+rax*2], mm3
1363 movq [rdi+rax*2], mm4
1364
1365 lea rsi, [rsi+rax*8]
1366 dec rcx
1367
1368 jnz .next8_mbv
1369
1370 add rsp, 96
1371 pop rsp
1372 ; begin epilog
1373 pop rdi
1374 pop rsi
1375 RESTORE_GOT
1376 UNSHADOW_ARGS
1377 pop rbp
1378 ret
1379
1380
1381 ;void vp8_loop_filter_simple_horizontal_edge_mmx
1382 ;(
1383 ; unsigned char *src_ptr, ; points at the q0 row (first row below the horizontal edge)
1384 ; int src_pixel_step, ; row stride in bytes
1385 ; const char *blimit ; edge-strength threshold, replicated into all 8 bytes
1386 ;)
1387 global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
1388 sym(vp8_loop_filter_simple_horizontal_edge_mmx):
1389 push rbp ; standard prologue
1390 mov rbp, rsp
1391 SHADOW_ARGS_TO_STACK 3 ; spill the 3 args so arg(n) works
1392 GET_GOT rbx ; GOT base for GLOBAL() constant loads
1393 push rsi ; rsi/rdi are callee-saved on some ABIs
1394 push rdi
1395 ; end prolog
1396
1397 mov rsi, arg(0) ;src_ptr
1398 movsxd rax, dword ptr arg(1) ;src_pixel_step ; row pitch in bytes
1399
1400 mov rcx, 2 ; count: 2 groups of 8 pixels = 16-wide edge
1401 .nexts8_h:
1402 mov rdx, arg(2) ;blimit ; reload blimit pointer each pass
1403 movq mm3, [rdx] ; mm3 = blimit in every byte
1404
1405 mov rdi, rsi ; rdi points to row +1 for indirect addressing
1406 add rdi, rax
1407 neg rax ; rax = -pitch so [rsi+rax] reaches rows above the edge
1408
1409 ; calculate mask
1410 movq mm1, [rsi+2*rax] ; p1
1411 movq mm0, [rdi] ; q1
1412 movq mm2, mm1 ; keep unsigned p1 copy for the filter step
1413 movq mm7, mm0 ; keep unsigned q1 copy for the filter step
1414 movq mm4, mm0
1415 psubusb mm0, mm1 ; q1-=p1 (saturating; 0 where p1>=q1)
1416 psubusb mm1, mm4 ; p1-=q1 (saturating; 0 where q1>=p1)
1417 por mm1, mm0 ; abs(p1-q1)
1418 pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
1419 psrlw mm1, 1 ; abs(p1-q1)/2 (bytes halved pairwise; lsb cleared above)
1420
1421 movq mm5, [rsi+rax] ; p0
1422 movq mm4, [rsi] ; q0
1423 movq mm0, mm4 ; q0
1424 movq mm6, mm5 ; p0
1425 psubusb mm5, mm4 ; p0-=q0
1426 psubusb mm4, mm6 ; q0-=p0
1427 por mm5, mm4 ; abs(p0 - q0)
1428 paddusb mm5, mm5 ; abs(p0-q0)*2
1429 paddusb mm5, mm1 ; abs(p0 - q0)*2 + abs(p1-q1)/2
1430
1431 psubusb mm5, mm3 ; nonzero where abs(p0-q0)*2 + abs(p1-q1)/2 > blimit
1432 pxor mm3, mm3
1433 pcmpeqb mm5, mm3 ; mm5 = 0xFF in bytes that pass the filter test
1434
1435 ; start work on filters
1436 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
1437 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
1438 psubsb mm2, mm7 ; p1 - q1
1439
1440 pxor mm6, [GLOBAL(t80)] ; p0 offset to convert to signed values
1441 pxor mm0, [GLOBAL(t80)] ; q0 offset to convert to signed values
1442 movq mm3, mm0 ; q0 (signed copy, updated below)
1443 psubsb mm0, mm6 ; q0 - p0
1444 paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
1445 paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
1446 paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
1447 pand mm5, mm2 ; mask filter values we don't care about
1448
1449 ; do + 4 side
1450 paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
1451
1452 movq mm0, mm5 ; get a copy of filters
1453 psllw mm0, 8 ; isolate low byte of each word in the high half
1454 psraw mm0, 3 ; arithmetic shift right 3 (signed >>3 of the low bytes)
1455 psrlw mm0, 8 ; move results back down to the low bytes
1456 movq mm1, mm5 ; get a copy of filters
1457 psraw mm1, 11 ; arithmetic shift right 11 (signed >>3 of the high bytes)
1458 psllw mm1, 8 ; shift left 8 to put it back
1459
1460 por mm0, mm1 ; merge low/high byte results: Filter1 = filter >> 3
1461
1462 psubsb mm3, mm0 ; q0 -= Filter1
1463 pxor mm3, [GLOBAL(t80)] ; unoffset back to unsigned pixels
1464 movq [rsi], mm3 ; write back new q0
1465
1466
1467 ; now do +3 side
1468 psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
1469
1470 movq mm0, mm5 ; get a copy of filters
1471 psllw mm0, 8 ; isolate low bytes
1472 psraw mm0, 3 ; arithmetic shift right 3
1473 psrlw mm0, 8 ; move results back down
1474 psraw mm5, 11 ; arithmetic shift right 11 (>>3 of high bytes)
1475 psllw mm5, 8 ; shift left 8 to put it back
1476 por mm0, mm5 ; merge: Filter2 = (filter+3) >> 3
1477
1478
1479 paddsb mm6, mm0 ; p0 += Filter2
1480 pxor mm6, [GLOBAL(t80)] ; unoffset back to unsigned pixels
1481 movq [rsi+rax], mm6 ; write back new p0
1482
1483 add rsi,8 ; advance to the next 8 columns
1484 neg rax ; restore rax to +pitch for the next pass
1485 dec rcx
1486 jnz .nexts8_h
1487
1488 ; begin epilog
1489 pop rdi
1490 pop rsi
1491 RESTORE_GOT
1492 UNSHADOW_ARGS
1493 pop rbp
1494 ret
1496
1497 ;void vp8_loop_filter_simple_vertical_edge_mmx
1498 ;(
1499 ; unsigned char *src_ptr, ; points at the first row containing the vertical edge
1500 ; int src_pixel_step, ; row stride in bytes
1501 ; const char *blimit ; edge-strength threshold, replicated into all 8 bytes
1502 ;)
1503 global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
1504 sym(vp8_loop_filter_simple_vertical_edge_mmx):
1505 push rbp ; standard prologue
1506 mov rbp, rsp
1507 SHADOW_ARGS_TO_STACK 3 ; spill the 3 args so arg(n) works
1508 GET_GOT rbx ; GOT base for GLOBAL() constant loads
1509 push rsi
1510 push rdi
1511 ; end prolog
1512
1513 ALIGN_STACK 16, rax ; saves old rsp; restored by "pop rsp" in the epilog
1514 sub rsp, 32 ; reserve 32 bytes
1515 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; scratch: original p1
1516 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; scratch: original q1
1517
1518 mov rsi, arg(0) ;src_ptr
1519 movsxd rax, dword ptr arg(1) ;src_pixel_step ; row pitch in bytes
1520
1521 lea rsi, [rsi + rax*4- 2]; ; 4 rows down, 2 bytes left of the edge (at p1)
1522 mov rcx, 2 ; count: 2 groups of 8 rows = 16-tall edge
1523 .nexts8_v:
1524
1525 lea rdi, [rsi + rax]; ; rdi = one row below rsi
1526 movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
1527
1528 movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
1529 punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
1530
1531 movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
1532 movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
1533
1534 punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
1535 movq mm5, mm4 ; 53 43 52 42 51 41 50 40
1536
1537 punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
1538 punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
1539
1540 neg rax ; rax = -pitch to reach the 4 rows above
1541
1542 movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
1543 movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
1544
1545 punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
1546 movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
1547
1548 movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
1549 punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
1550
1551 movq mm2, mm0 ; 13 03 12 02 11 01 10 00
1552 punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
1553
1554 punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
1555 movq mm1, mm0 ; 31 21 11 01 30 20 10 00
1556
1557 punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
1558 movq mm3, mm2 ; 33 23 13 03 32 22 12 02
1559
1560 punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
1561 punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
1562
1563 punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
1564
1565
1566 ; calculate mask
1567 movq mm6, mm0 ; p1
1568 movq mm7, mm3 ; q1
1569 psubusb mm7, mm6 ; q1-=p1 (saturating)
1570 psubusb mm6, mm3 ; p1-=q1 (saturating)
1571 por mm6, mm7 ; abs(p1-q1)
1572 pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
1573 psrlw mm6, 1 ; abs(p1-q1)/2
1574
1575 movq mm5, mm1 ; p0
1576 movq mm4, mm2 ; q0
1577
1578 psubusb mm5, mm2 ; p0-=q0
1579 psubusb mm4, mm1 ; q0-=p0
1580
1581 por mm5, mm4 ; abs(p0 - q0)
1582 paddusb mm5, mm5 ; abs(p0-q0)*2
1583 paddusb mm5, mm6 ; abs(p0 - q0)*2 + abs(p1-q1)/2
1584
1585 mov rdx, arg(2) ;blimit ; get blimit
1586 movq mm7, [rdx] ; mm7 = blimit in every byte
1587
1588 psubusb mm5, mm7 ; nonzero where abs(p0-q0)*2 + abs(p1-q1)/2 > blimit
1589 pxor mm7, mm7
1590 pcmpeqb mm5, mm7 ; mm5 = mask (0xFF where the edge is filtered)
1591
1592 ; start work on filters
1593 movq t0, mm0 ; save original p1 for the write-back transpose
1594 movq t1, mm3 ; save original q1 for the write-back transpose
1595
1596 pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
1597 pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
1598
1599 psubsb mm0, mm3 ; p1 - q1
1600 movq mm6, mm1 ; p0
1601
1602 movq mm7, mm2 ; q0
1603 pxor mm6, [GLOBAL(t80)] ; p0 offset to convert to signed values
1604
1605 pxor mm7, [GLOBAL(t80)] ; q0 offset to convert to signed values
1606 movq mm3, mm7 ; signed (offset) copy of q0
1607
1608 psubsb mm7, mm6 ; q0 - p0
1609 paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
1610
1611 paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
1612 paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
1613
1614 pand mm5, mm0 ; mask filter values we don't care about
1615
1616 paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
1617
1618 movq mm0, mm5 ; get a copy of filters
1619 psllw mm0, 8 ; isolate low byte of each word in the high half
1620 psraw mm0, 3 ; arithmetic shift right 3 (signed >>3 of the low bytes)
1621 psrlw mm0, 8 ; move results back down to the low bytes
1622
1623 movq mm7, mm5 ; get a copy of filters
1624 psraw mm7, 11 ; arithmetic shift right 11 (signed >>3 of the high bytes)
1625 psllw mm7, 8 ; shift left 8 to put it back
1626
1627 por mm0, mm7 ; merge low/high byte results: Filter1 = filter >> 3
1628
1629 psubsb mm3, mm0 ; q0 -= Filter1
1630 pxor mm3, [GLOBAL(t80)] ; unoffset back to unsigned pixels
1631
1632 ; now do +3 side
1633 psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
1634
1635 movq mm0, mm5 ; get a copy of filters
1636 psllw mm0, 8 ; isolate low bytes
1637 psraw mm0, 3 ; arithmetic shift right 3
1638 psrlw mm0, 8 ; move results back down
1639
1640 psraw mm5, 11 ; arithmetic shift right 11 (>>3 of high bytes)
1641 psllw mm5, 8 ; shift left 8 to put it back
1642 por mm0, mm5 ; merge: Filter2 = (filter+3) >> 3
1643
1644 paddsb mm6, mm0 ; p0 += Filter2
1645 pxor mm6, [GLOBAL(t80)] ; unoffset back to unsigned pixels
1646
1647
1648 movq mm0, t0 ; reload unmodified p1
1649 movq mm4, t1 ; reload unmodified q1
1650
1651 ; mm0 = 70 60 50 40 30 20 10 00 (p1)
1652 ; mm6 = 71 61 51 41 31 21 11 01 (new p0)
1653 ; mm3 = 72 62 52 42 32 22 12 02 (new q0)
1654 ; mm4 = 73 63 53 43 33 23 13 03 (q1)
1655 ; transpose back to write out
1656
1657 movq mm1, mm0 ;
1658 punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
1659
1660 punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
1661 movq mm2, mm3 ;
1662
1663 punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
1664 movq mm5, mm1 ; 71 70 61 60 51 50 41 40
1665
1666 punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
1667 movq mm6, mm0 ; 31 30 21 20 11 10 01 00
1668
1669 punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
1670 punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
1671
1672 movd [rsi+rax*4], mm0 ; write 03 02 01 00
1673 punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
1674
1675 psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
1676 punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
1677
1678 movd [rdi+rax*4], mm0 ; write 13 12 11 10
1679 movd [rsi+rax*2], mm6 ; write 23 22 21 20
1680
1681 psrlq mm6, 32 ; 33 32 31 30
1682 movd [rsi], mm1 ; write 43 42 41 40
1683
1684 movd [rsi + rax], mm6 ; write 33 32 31 30
1685 neg rax ; back to +pitch for the rows below
1686
1687 movd [rsi + rax*2], mm5 ; write 63 62 61 60
1688 psrlq mm1, 32 ; 53 52 51 50
1689
1690 movd [rdi], mm1 ; write out 53 52 51 50
1691 psrlq mm5, 32 ; 73 72 71 70
1692
1693 movd [rdi + rax*2], mm5 ; write 73 72 71 70
1694
1695 lea rsi, [rsi+rax*8] ; next 8 rows
1696
1697 dec rcx
1698 jnz .nexts8_v
1699
1700 add rsp, 32 ; release scratch
1701 pop rsp ; undo ALIGN_STACK
1702 ; begin epilog
1703 pop rdi
1704 pop rsi
1705 RESTORE_GOT
1706 UNSHADOW_ARGS
1707 pop rbp
1708 ret
1709
1710
1711
1712 ;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
1713 ; int y_stride,
1714 ; loop_filter_info *lfi)
1715 ;{
1716 ;
1717 ;
1718 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
1719 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
1720 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
1721 ;}
1722
1723 SECTION_RODATA
1724 align 16
1725 tfe:
1726 times 8 db 0xfe ; per-byte mask: clears each byte's lsb so psrlw,1 halves bytes pairwise
1727 align 16
1728 t80:
1729 times 8 db 0x80 ; xor bias: converts bytes between unsigned pixels and signed values
1730 align 16
1731 t1s:
1732 times 8 db 0x01 ; per-byte 1: subtracted to turn the "+4" filter into the "+3" filter
1733 align 16
1734 t3:
1735 times 8 db 0x03 ; per-byte rounding constant (+3) for the filter clamp
1736 align 16
1737 t4:
1738 times 8 db 0x04 ; per-byte rounding constant (+4) for the filter clamp
1739 align 16
1740 ones:
1741 times 4 dw 0x0001 ; per-word 1 (not referenced in this portion of the file)
1742 align 16
1743 s27:
1744 times 4 dw 0x1b00 ; 27<<8: pmulhw against high-byte-packed words yields filter*27
1745 align 16
1746 s18:
1747 times 4 dw 0x1200 ; 18<<8: pmulhw multiplier for the ~2/7 tap
1748 align 16
1749 s9:
1750 times 4 dw 0x0900 ; 9<<8: pmulhw multiplier for the ~1/7 tap
1751 align 16
1752 s63:
1753 times 4 dw 0x003f ; 63: rounding term added before the >>7 in (63 + Filter2*n)>>7
OLDNEW
« no previous file with comments | « libvpx_srcs_x86_64.gypi ('k') | source/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698