;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

14 %macro LF_ABS 2 | |
15 ; %1 value not preserved | |
16 ; %2 value preserved | |
17 ; output in %1 | |
18 movdqa scratch1, %2 ; v2 | |
19 | |
20 psubusb scratch1, %1 ; v2 - v1 | |
21 psubusb %1, %2 ; v1 - v2 | |
22 por %1, scratch1 ; abs(v2 - v1) | |
23 %endmacro | |
24 | |
25 %macro LF_FILTER_HEV_MASK 8-9 | |
26 | |
27 LF_ABS %1, %2 ; abs(p3 - p2) | |
28 LF_ABS %2, %3 ; abs(p2 - p1) | |
29 pmaxub %1, %2 ; accumulate mask | |
30 %if %0 == 8 | |
31 movdqa scratch2, %3 ; save p1 | |
32 LF_ABS scratch2, %4 ; abs(p1 - p0) | |
33 %endif | |
34 LF_ABS %4, %5 ; abs(p0 - q0) | |
35 LF_ABS %5, %6 ; abs(q0 - q1) | |
36 %if %0 == 8 | |
37 pmaxub %5, scratch2 ; accumulate hev | |
38 %else | |
39 pmaxub %5, %9 | |
40 %endif | |
41 pmaxub %1, %5 ; accumulate mask | |
42 | |
43 LF_ABS %3, %6 ; abs(p1 - q1) | |
44 LF_ABS %6, %7 ; abs(q1 - q2) | |
45 pmaxub %1, %6 ; accumulate mask | |
46 LF_ABS %7, %8 ; abs(q2 - q3) | |
47 pmaxub %1, %7 ; accumulate mask | |
48 | |
49 paddusb %4, %4 ; 2 * abs(p0 - q0) | |
50 pand %3, [GLOBAL(tfe)] | |
51 psrlw %3, 1 ; abs(p1 - q1) / 2 | |
52 paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) /
2 | |
53 | |
54 psubusb %1, [limit] | |
55 psubusb %4, [blimit] | |
56 por %1, %4 | |
57 pcmpeqb %1, zero ; mask | |
58 | |
59 psubusb %5, [thresh] | |
60 pcmpeqb %5, zero ; ~hev | |
61 %endmacro | |
62 | |
63 %macro LF_FILTER 6 | |
64 ; %1-%4: p1-q1 | |
65 ; %5: mask | |
66 ; %6: hev | |
67 | |
68 movdqa scratch2, %6 ; save hev | |
69 | |
70 pxor %1, [GLOBAL(t80)] ; ps1 | |
71 pxor %4, [GLOBAL(t80)] ; qs1 | |
72 movdqa scratch1, %1 | |
73 psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1) | |
74 pandn scratch2, scratch1 ; vp8_filter &= hev | |
75 | |
76 pxor %2, [GLOBAL(t80)] ; ps0 | |
77 pxor %3, [GLOBAL(t80)] ; qs0 | |
78 movdqa scratch1, %3 | |
79 psubsb scratch1, %2 ; qs0 - ps0 | |
80 paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) | |
81 paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) | |
82 paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) | |
83 pand %5, scratch2 ; &= mask | |
84 | |
85 movdqa scratch2, %5 | |
86 paddsb %5, [GLOBAL(t4)] ; Filter1 | |
87 paddsb scratch2, [GLOBAL(t3)] ; Filter2 | |
88 | |
89 ; Filter1 >> 3 | |
90 movdqa scratch1, zero | |
91 pcmpgtb scratch1, %5 | |
92 psrlw %5, 3 | |
93 pand scratch1, [GLOBAL(te0)] | |
94 pand %5, [GLOBAL(t1f)] | |
95 por %5, scratch1 | |
96 | |
97 psubsb %3, %5 ; qs0 - Filter1 | |
98 pxor %3, [GLOBAL(t80)] | |
99 | |
100 ; Filter2 >> 3 | |
101 movdqa scratch1, zero | |
102 pcmpgtb scratch1, scratch2 | |
103 psrlw scratch2, 3 | |
104 pand scratch1, [GLOBAL(te0)] | |
105 pand scratch2, [GLOBAL(t1f)] | |
106 por scratch2, scratch1 | |
107 | |
108 paddsb %2, scratch2 ; ps0 + Filter2 | |
109 pxor %2, [GLOBAL(t80)] | |
110 | |
111 ; outer tap adjustments | |
112 paddsb %5, [GLOBAL(t1)] | |
113 movdqa scratch1, zero | |
114 pcmpgtb scratch1, %5 | |
115 psrlw %5, 1 | |
116 pand scratch1, [GLOBAL(t80)] | |
117 pand %5, [GLOBAL(t7f)] | |
118 por %5, scratch1 | |
119 pand %5, %6 ; vp8_filter &= ~hev | |
120 | |
121 psubsb %4, %5 ; qs1 - vp8_filter | |
122 pxor %4, [GLOBAL(t80)] | |
123 | |
124 paddsb %1, %5 ; ps1 + vp8_filter | |
125 pxor %1, [GLOBAL(t80)] | |
126 %endmacro | |
127 | |
128 ;void vp8_loop_filter_bh_y_sse2 | |
129 ;( | |
130 ; unsigned char *src_ptr, | |
131 ; int src_pixel_step, | |
132 ; const char *blimit, | |
133 ; const char *limit, | |
134 ; const char *thresh | |
135 ;) | |
136 global sym(vp8_loop_filter_bh_y_sse2) PRIVATE | |
137 sym(vp8_loop_filter_bh_y_sse2): | |
138 | |
139 %if LIBVPX_YASM_WIN64 | |
140 %define src rcx ; src_ptr | |
141 %define stride rdx ; src_pixel_step | |
142 %define blimit r8 | |
143 %define limit r9 | |
144 %define thresh r10 | |
145 | |
146 %define spp rax | |
147 %define stride3 r11 | |
148 %define stride5 r12 | |
149 %define stride7 r13 | |
150 | |
151 push rbp | |
152 mov rbp, rsp | |
153 SAVE_XMM 11 | |
154 push r12 | |
155 push r13 | |
156 mov thresh, arg(4) | |
157 %else | |
158 %define src rdi ; src_ptr | |
159 %define stride rsi ; src_pixel_step | |
160 %define blimit rdx | |
161 %define limit rcx | |
162 %define thresh r8 | |
163 | |
164 %define spp rax | |
165 %define stride3 r9 | |
166 %define stride5 r10 | |
167 %define stride7 r11 | |
168 %endif | |
169 | |
170 %define scratch1 xmm5 | |
171 %define scratch2 xmm6 | |
172 %define zero xmm7 | |
173 | |
174 %define i0 [src] | |
175 %define i1 [spp] | |
176 %define i2 [src + 2 * stride] | |
177 %define i3 [spp + 2 * stride] | |
178 %define i4 [src + 4 * stride] | |
179 %define i5 [spp + 4 * stride] | |
180 %define i6 [src + 2 * stride3] | |
181 %define i7 [spp + 2 * stride3] | |
182 %define i8 [src + 8 * stride] | |
183 %define i9 [spp + 8 * stride] | |
184 %define i10 [src + 2 * stride5] | |
185 %define i11 [spp + 2 * stride5] | |
186 %define i12 [src + 4 * stride3] | |
187 %define i13 [spp + 4 * stride3] | |
188 %define i14 [src + 2 * stride7] | |
189 %define i15 [spp + 2 * stride7] | |
190 | |
191 ; prep work | |
192 lea spp, [src + stride] | |
193 lea stride3, [stride + 2 * stride] | |
194 lea stride5, [stride3 + 2 * stride] | |
195 lea stride7, [stride3 + 4 * stride] | |
196 pxor zero, zero | |
197 | |
198 ; load the first set into registers | |
199 movdqa xmm0, i0 | |
200 movdqa xmm1, i1 | |
201 movdqa xmm2, i2 | |
202 movdqa xmm3, i3 | |
203 movdqa xmm4, i4 | |
204 movdqa xmm8, i5 | |
205 movdqa xmm9, i6 ; q2, will contain abs(p1-p0) | |
206 movdqa xmm10, i7 | |
207 LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10 | |
208 | |
209 movdqa xmm1, i2 | |
210 movdqa xmm2, i3 | |
211 movdqa xmm3, i4 | |
212 movdqa xmm8, i5 | |
213 LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4 | |
214 movdqa i2, xmm1 | |
215 movdqa i3, xmm2 | |
216 | |
217 ; second set | |
218 movdqa i4, xmm3 | |
219 movdqa i5, xmm8 | |
220 | |
221 movdqa xmm0, i6 | |
222 movdqa xmm1, i7 | |
223 movdqa xmm2, i8 | |
224 movdqa xmm4, i9 | |
225 movdqa xmm10, i10 ; q2, will contain abs(p1-p0) | |
226 movdqa xmm11, i11 | |
227 LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9 | |
228 | |
229 movdqa xmm0, i6 | |
230 movdqa xmm1, i7 | |
231 movdqa xmm4, i8 | |
232 movdqa xmm8, i9 | |
233 LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 | |
234 movdqa i6, xmm0 | |
235 movdqa i7, xmm1 | |
236 | |
237 ; last set | |
238 movdqa i8, xmm4 | |
239 movdqa i9, xmm8 | |
240 | |
241 movdqa xmm0, i10 | |
242 movdqa xmm1, i11 | |
243 movdqa xmm2, i12 | |
244 movdqa xmm3, i13 | |
245 movdqa xmm9, i14 ; q2, will contain abs(p1-p0) | |
246 movdqa xmm11, i15 | |
247 LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10 | |
248 | |
249 movdqa xmm0, i10 | |
250 movdqa xmm1, i11 | |
251 movdqa xmm3, i12 | |
252 movdqa xmm8, i13 | |
253 LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2 | |
254 movdqa i10, xmm0 | |
255 movdqa i11, xmm1 | |
256 movdqa i12, xmm3 | |
257 movdqa i13, xmm8 | |
258 | |
259 %if LIBVPX_YASM_WIN64 | |
260 pop r13 | |
261 pop r12 | |
262 RESTORE_XMM | |
263 pop rbp | |
264 %endif | |
265 | |
266 ret | |
267 | |
268 | |
269 ;void vp8_loop_filter_bv_y_sse2 | |
270 ;( | |
271 ; unsigned char *src_ptr, | |
272 ; int src_pixel_step, | |
273 ; const char *blimit, | |
274 ; const char *limit, | |
275 ; const char *thresh | |
276 ;) | |
277 | |
278 global sym(vp8_loop_filter_bv_y_sse2) PRIVATE | |
279 sym(vp8_loop_filter_bv_y_sse2): | |
280 | |
281 %if LIBVPX_YASM_WIN64 | |
282 %define src rcx ; src_ptr | |
283 %define stride rdx ; src_pixel_step | |
284 %define blimit r8 | |
285 %define limit r9 | |
286 %define thresh r10 | |
287 | |
288 %define spp rax | |
289 %define stride3 r11 | |
290 %define stride5 r12 | |
291 %define stride7 r13 | |
292 | |
293 push rbp | |
294 mov rbp, rsp | |
295 SAVE_XMM 15 | |
296 push r12 | |
297 push r13 | |
298 mov thresh, arg(4) | |
299 %else | |
300 %define src rdi | |
301 %define stride rsi | |
302 %define blimit rdx | |
303 %define limit rcx | |
304 %define thresh r8 | |
305 | |
306 %define spp rax | |
307 %define stride3 r9 | |
308 %define stride5 r10 | |
309 %define stride7 r11 | |
310 %endif | |
311 | |
312 %define scratch1 xmm5 | |
313 %define scratch2 xmm6 | |
314 %define zero xmm7 | |
315 | |
316 %define s0 [src] | |
317 %define s1 [spp] | |
318 %define s2 [src + 2 * stride] | |
319 %define s3 [spp + 2 * stride] | |
320 %define s4 [src + 4 * stride] | |
321 %define s5 [spp + 4 * stride] | |
322 %define s6 [src + 2 * stride3] | |
323 %define s7 [spp + 2 * stride3] | |
324 %define s8 [src + 8 * stride] | |
325 %define s9 [spp + 8 * stride] | |
326 %define s10 [src + 2 * stride5] | |
327 %define s11 [spp + 2 * stride5] | |
328 %define s12 [src + 4 * stride3] | |
329 %define s13 [spp + 4 * stride3] | |
330 %define s14 [src + 2 * stride7] | |
331 %define s15 [spp + 2 * stride7] | |
332 | |
333 %define i0 [rsp] | |
334 %define i1 [rsp + 16] | |
335 %define i2 [rsp + 32] | |
336 %define i3 [rsp + 48] | |
337 %define i4 [rsp + 64] | |
338 %define i5 [rsp + 80] | |
339 %define i6 [rsp + 96] | |
340 %define i7 [rsp + 112] | |
341 %define i8 [rsp + 128] | |
342 %define i9 [rsp + 144] | |
343 %define i10 [rsp + 160] | |
344 %define i11 [rsp + 176] | |
345 %define i12 [rsp + 192] | |
346 %define i13 [rsp + 208] | |
347 %define i14 [rsp + 224] | |
348 %define i15 [rsp + 240] | |
349 | |
350 ALIGN_STACK 16, rax | |
351 | |
352 ; reserve stack space | |
353 %define temp_storage 0 ; size is 256 (16*16) | |
354 %define stack_size 256 | |
355 sub rsp, stack_size | |
356 | |
357 ; prep work | |
358 lea spp, [src + stride] | |
359 lea stride3, [stride + 2 * stride] | |
360 lea stride5, [stride3 + 2 * stride] | |
361 lea stride7, [stride3 + 4 * stride] | |
362 | |
363 ; 8-f | |
364 movdqa xmm0, s8 | |
365 movdqa xmm1, xmm0 | |
366 punpcklbw xmm0, s9 ; 80 90 | |
367 punpckhbw xmm1, s9 ; 88 98 | |
368 | |
369 movdqa xmm2, s10 | |
370 movdqa xmm3, xmm2 | |
371 punpcklbw xmm2, s11 ; a0 b0 | |
372 punpckhbw xmm3, s11 ; a8 b8 | |
373 | |
374 movdqa xmm4, xmm0 | |
375 punpcklwd xmm0, xmm2 ; 80 90 a0 b0 | |
376 punpckhwd xmm4, xmm2 ; 84 94 a4 b4 | |
377 | |
378 movdqa xmm2, xmm1 | |
379 punpcklwd xmm1, xmm3 ; 88 98 a8 b8 | |
380 punpckhwd xmm2, xmm3 ; 8c 9c ac bc | |
381 | |
382 ; using xmm[0124] | |
383 ; work on next 4 rows | |
384 | |
385 movdqa xmm3, s12 | |
386 movdqa xmm5, xmm3 | |
387 punpcklbw xmm3, s13 ; c0 d0 | |
388 punpckhbw xmm5, s13 ; c8 d8 | |
389 | |
390 movdqa xmm6, s14 | |
391 movdqa xmm7, xmm6 | |
392 punpcklbw xmm6, s15 ; e0 f0 | |
393 punpckhbw xmm7, s15 ; e8 f8 | |
394 | |
395 movdqa xmm8, xmm3 | |
396 punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 | |
397 punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 | |
398 | |
399 movdqa xmm6, xmm5 | |
400 punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 | |
401 punpckhwd xmm6, xmm7 ; cc dc ec fc | |
402 | |
403 ; pull the third and fourth sets together | |
404 | |
405 movdqa xmm7, xmm0 | |
406 punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 | |
407 punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 | |
408 | |
409 movdqa xmm3, xmm4 | |
410 punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 | |
411 punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 | |
412 | |
413 movdqa xmm8, xmm1 | |
414 punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 | |
415 punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa | |
416 | |
417 movdqa xmm5, xmm2 | |
418 punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc | |
419 punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe | |
420 | |
421 ; save the calculations. we only have 15 registers ... | |
422 movdqa i0, xmm0 | |
423 movdqa i1, xmm7 | |
424 movdqa i2, xmm4 | |
425 movdqa i3, xmm3 | |
426 movdqa i4, xmm1 | |
427 movdqa i5, xmm8 | |
428 movdqa i6, xmm2 | |
429 movdqa i7, xmm5 | |
430 | |
431 ; 0-7 | |
432 movdqa xmm0, s0 | |
433 movdqa xmm1, xmm0 | |
434 punpcklbw xmm0, s1 ; 00 10 | |
435 punpckhbw xmm1, s1 ; 08 18 | |
436 | |
437 movdqa xmm2, s2 | |
438 movdqa xmm3, xmm2 | |
439 punpcklbw xmm2, s3 ; 20 30 | |
440 punpckhbw xmm3, s3 ; 28 38 | |
441 | |
442 movdqa xmm4, xmm0 | |
443 punpcklwd xmm0, xmm2 ; 00 10 20 30 | |
444 punpckhwd xmm4, xmm2 ; 04 14 24 34 | |
445 | |
446 movdqa xmm2, xmm1 | |
447 punpcklwd xmm1, xmm3 ; 08 18 28 38 | |
448 punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c | |
449 | |
450 ; using xmm[0124] | |
451 ; work on next 4 rows | |
452 | |
453 movdqa xmm3, s4 | |
454 movdqa xmm5, xmm3 | |
455 punpcklbw xmm3, s5 ; 40 50 | |
456 punpckhbw xmm5, s5 ; 48 58 | |
457 | |
458 movdqa xmm6, s6 | |
459 movdqa xmm7, xmm6 | |
460 punpcklbw xmm6, s7 ; 60 70 | |
461 punpckhbw xmm7, s7 ; 68 78 | |
462 | |
463 movdqa xmm8, xmm3 | |
464 punpcklwd xmm3, xmm6 ; 40 50 60 70 | |
465 punpckhwd xmm8, xmm6 ; 44 54 64 74 | |
466 | |
467 movdqa xmm6, xmm5 | |
468 punpcklwd xmm5, xmm7 ; 48 58 68 78 | |
469 punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c | |
470 | |
471 ; pull the first two sets together | |
472 | |
473 movdqa xmm7, xmm0 | |
474 punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 | |
475 punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 | |
476 | |
477 movdqa xmm3, xmm4 | |
478 punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 | |
479 punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 | |
480 | |
481 movdqa xmm8, xmm1 | |
482 punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 | |
483 punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a | |
484 | |
485 movdqa xmm5, xmm2 | |
486 punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c | |
487 punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e | |
488 ; final combination | |
489 | |
490 movdqa xmm6, xmm0 | |
491 punpcklqdq xmm0, i0 | |
492 punpckhqdq xmm6, i0 | |
493 | |
494 movdqa xmm9, xmm7 | |
495 punpcklqdq xmm7, i1 | |
496 punpckhqdq xmm9, i1 | |
497 | |
498 movdqa xmm10, xmm4 | |
499 punpcklqdq xmm4, i2 | |
500 punpckhqdq xmm10, i2 | |
501 | |
502 movdqa xmm11, xmm3 | |
503 punpcklqdq xmm3, i3 | |
504 punpckhqdq xmm11, i3 | |
505 | |
506 movdqa xmm12, xmm1 | |
507 punpcklqdq xmm1, i4 | |
508 punpckhqdq xmm12, i4 | |
509 | |
510 movdqa xmm13, xmm8 | |
511 punpcklqdq xmm8, i5 | |
512 punpckhqdq xmm13, i5 | |
513 | |
514 movdqa xmm14, xmm2 | |
515 punpcklqdq xmm2, i6 | |
516 punpckhqdq xmm14, i6 | |
517 | |
518 movdqa xmm15, xmm5 | |
519 punpcklqdq xmm5, i7 | |
520 punpckhqdq xmm15, i7 | |
521 | |
522 movdqa i0, xmm0 | |
523 movdqa i1, xmm6 | |
524 movdqa i2, xmm7 | |
525 movdqa i3, xmm9 | |
526 movdqa i4, xmm4 | |
527 movdqa i5, xmm10 | |
528 movdqa i6, xmm3 | |
529 movdqa i7, xmm11 | |
530 movdqa i8, xmm1 | |
531 movdqa i9, xmm12 | |
532 movdqa i10, xmm8 | |
533 movdqa i11, xmm13 | |
534 movdqa i12, xmm2 | |
535 movdqa i13, xmm14 | |
536 movdqa i14, xmm5 | |
537 movdqa i15, xmm15 | |
538 | |
539 ; TRANSPOSED DATA AVAILABLE ON THE STACK | |
540 | |
541 movdqa xmm12, xmm6 | |
542 movdqa xmm13, xmm7 | |
543 | |
544 pxor zero, zero | |
545 | |
546 LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11 | |
547 | |
548 movdqa xmm1, i2 | |
549 movdqa xmm2, i3 | |
550 movdqa xmm8, i4 | |
551 movdqa xmm9, i5 | |
552 LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4 | |
553 movdqa i2, xmm1 | |
554 movdqa i3, xmm2 | |
555 | |
556 ; second set | |
557 movdqa i4, xmm8 | |
558 movdqa i5, xmm9 | |
559 | |
560 movdqa xmm0, i6 | |
561 movdqa xmm1, i7 | |
562 movdqa xmm2, i8 | |
563 movdqa xmm4, i9 | |
564 movdqa xmm10, i10 ; q2, will contain abs(p1-p0) | |
565 movdqa xmm11, i11 | |
566 LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3 | |
567 | |
568 movdqa xmm0, i6 | |
569 movdqa xmm1, i7 | |
570 movdqa xmm3, i8 | |
571 movdqa xmm4, i9 | |
572 LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2 | |
573 movdqa i6, xmm0 | |
574 movdqa i7, xmm1 | |
575 | |
576 ; last set | |
577 movdqa i8, xmm3 | |
578 movdqa i9, xmm4 | |
579 | |
580 movdqa xmm0, i10 | |
581 movdqa xmm1, i11 | |
582 movdqa xmm2, i12 | |
583 movdqa xmm8, i13 | |
584 movdqa xmm9, i14 ; q2, will contain abs(p1-p0) | |
585 movdqa xmm11, i15 | |
586 LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10 | |
587 | |
588 movdqa xmm0, i10 | |
589 movdqa xmm1, i11 | |
590 movdqa xmm4, i12 | |
591 movdqa xmm8, i13 | |
592 LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 | |
593 movdqa i10, xmm0 | |
594 movdqa i11, xmm1 | |
595 movdqa i12, xmm4 | |
596 movdqa i13, xmm8 | |
597 | |
598 | |
599 ; RESHUFFLE AND WRITE OUT | |
600 ; 8-f | |
601 movdqa xmm0, i8 | |
602 movdqa xmm1, xmm0 | |
603 punpcklbw xmm0, i9 ; 80 90 | |
604 punpckhbw xmm1, i9 ; 88 98 | |
605 | |
606 movdqa xmm2, i10 | |
607 movdqa xmm3, xmm2 | |
608 punpcklbw xmm2, i11 ; a0 b0 | |
609 punpckhbw xmm3, i11 ; a8 b8 | |
610 | |
611 movdqa xmm4, xmm0 | |
612 punpcklwd xmm0, xmm2 ; 80 90 a0 b0 | |
613 punpckhwd xmm4, xmm2 ; 84 94 a4 b4 | |
614 | |
615 movdqa xmm2, xmm1 | |
616 punpcklwd xmm1, xmm3 ; 88 98 a8 b8 | |
617 punpckhwd xmm2, xmm3 ; 8c 9c ac bc | |
618 | |
619 ; using xmm[0124] | |
620 ; work on next 4 rows | |
621 | |
622 movdqa xmm3, i12 | |
623 movdqa xmm5, xmm3 | |
624 punpcklbw xmm3, i13 ; c0 d0 | |
625 punpckhbw xmm5, i13 ; c8 d8 | |
626 | |
627 movdqa xmm6, i14 | |
628 movdqa xmm7, xmm6 | |
629 punpcklbw xmm6, i15 ; e0 f0 | |
630 punpckhbw xmm7, i15 ; e8 f8 | |
631 | |
632 movdqa xmm8, xmm3 | |
633 punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 | |
634 punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 | |
635 | |
636 movdqa xmm6, xmm5 | |
637 punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 | |
638 punpckhwd xmm6, xmm7 ; cc dc ec fc | |
639 | |
640 ; pull the third and fourth sets together | |
641 | |
642 movdqa xmm7, xmm0 | |
643 punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 | |
644 punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 | |
645 | |
646 movdqa xmm3, xmm4 | |
647 punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 | |
648 punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 | |
649 | |
650 movdqa xmm8, xmm1 | |
651 punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 | |
652 punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa | |
653 | |
654 movdqa xmm5, xmm2 | |
655 punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc | |
656 punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe | |
657 | |
658 ; save the calculations. we only have 15 registers ... | |
659 movdqa i8, xmm0 | |
660 movdqa i9, xmm7 | |
661 movdqa i10, xmm4 | |
662 movdqa i11, xmm3 | |
663 movdqa i12, xmm1 | |
664 movdqa i13, xmm8 | |
665 movdqa i14, xmm2 | |
666 movdqa i15, xmm5 | |
667 | |
668 ; 0-7 | |
669 movdqa xmm0, i0 | |
670 movdqa xmm1, xmm0 | |
671 punpcklbw xmm0, i1 ; 00 10 | |
672 punpckhbw xmm1, i1 ; 08 18 | |
673 | |
674 movdqa xmm2, i2 | |
675 movdqa xmm3, xmm2 | |
676 punpcklbw xmm2, i3 ; 20 30 | |
677 punpckhbw xmm3, i3 ; 28 38 | |
678 | |
679 movdqa xmm4, xmm0 | |
680 punpcklwd xmm0, xmm2 ; 00 10 20 30 | |
681 punpckhwd xmm4, xmm2 ; 04 14 24 34 | |
682 | |
683 movdqa xmm2, xmm1 | |
684 punpcklwd xmm1, xmm3 ; 08 18 28 38 | |
685 punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c | |
686 | |
687 ; using xmm[0124] | |
688 ; work on next 4 rows | |
689 | |
690 movdqa xmm3, i4 | |
691 movdqa xmm5, xmm3 | |
692 punpcklbw xmm3, i5 ; 40 50 | |
693 punpckhbw xmm5, i5 ; 48 58 | |
694 | |
695 movdqa xmm6, i6 | |
696 movdqa xmm7, xmm6 | |
697 punpcklbw xmm6, i7 ; 60 70 | |
698 punpckhbw xmm7, i7 ; 68 78 | |
699 | |
700 movdqa xmm8, xmm3 | |
701 punpcklwd xmm3, xmm6 ; 40 50 60 70 | |
702 punpckhwd xmm8, xmm6 ; 44 54 64 74 | |
703 | |
704 movdqa xmm6, xmm5 | |
705 punpcklwd xmm5, xmm7 ; 48 58 68 78 | |
706 punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c | |
707 | |
708 ; pull the first two sets together | |
709 | |
710 movdqa xmm7, xmm0 | |
711 punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 | |
712 punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 | |
713 | |
714 movdqa xmm3, xmm4 | |
715 punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 | |
716 punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 | |
717 | |
718 movdqa xmm8, xmm1 | |
719 punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 | |
720 punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a | |
721 | |
722 movdqa xmm5, xmm2 | |
723 punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c | |
724 punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e | |
725 ; final combination | |
726 | |
727 movdqa xmm6, xmm0 | |
728 punpcklqdq xmm0, i8 | |
729 punpckhqdq xmm6, i8 | |
730 | |
731 movdqa xmm9, xmm7 | |
732 punpcklqdq xmm7, i9 | |
733 punpckhqdq xmm9, i9 | |
734 | |
735 movdqa xmm10, xmm4 | |
736 punpcklqdq xmm4, i10 | |
737 punpckhqdq xmm10, i10 | |
738 | |
739 movdqa xmm11, xmm3 | |
740 punpcklqdq xmm3, i11 | |
741 punpckhqdq xmm11, i11 | |
742 | |
743 movdqa xmm12, xmm1 | |
744 punpcklqdq xmm1, i12 | |
745 punpckhqdq xmm12, i12 | |
746 | |
747 movdqa xmm13, xmm8 | |
748 punpcklqdq xmm8, i13 | |
749 punpckhqdq xmm13, i13 | |
750 | |
751 movdqa xmm14, xmm2 | |
752 punpcklqdq xmm2, i14 | |
753 punpckhqdq xmm14, i14 | |
754 | |
755 movdqa xmm15, xmm5 | |
756 punpcklqdq xmm5, i15 | |
757 punpckhqdq xmm15, i15 | |
758 | |
759 movdqa s0, xmm0 | |
760 movdqa s1, xmm6 | |
761 movdqa s2, xmm7 | |
762 movdqa s3, xmm9 | |
763 movdqa s4, xmm4 | |
764 movdqa s5, xmm10 | |
765 movdqa s6, xmm3 | |
766 movdqa s7, xmm11 | |
767 movdqa s8, xmm1 | |
768 movdqa s9, xmm12 | |
769 movdqa s10, xmm8 | |
770 movdqa s11, xmm13 | |
771 movdqa s12, xmm2 | |
772 movdqa s13, xmm14 | |
773 movdqa s14, xmm5 | |
774 movdqa s15, xmm15 | |
775 | |
776 ; free stack space | |
777 add rsp, stack_size | |
778 | |
779 ; un-ALIGN_STACK | |
780 pop rsp | |
781 | |
782 %if LIBVPX_YASM_WIN64 | |
783 pop r13 | |
784 pop r12 | |
785 RESTORE_XMM | |
786 pop rbp | |
787 %endif | |
788 | |
789 ret | |
790 | |
791 SECTION_RODATA | |
792 align 16 | |
793 te0: | |
794 times 16 db 0xe0 | |
795 align 16 | |
796 t7f: | |
797 times 16 db 0x7f | |
798 align 16 | |
799 tfe: | |
800 times 16 db 0xfe | |
801 align 16 | |
802 t1f: | |
803 times 16 db 0x1f | |
804 align 16 | |
805 t80: | |
806 times 16 db 0x80 | |
807 align 16 | |
808 t1: | |
809 times 16 db 0x01 | |
810 align 16 | |
811 t3: | |
812 times 16 db 0x03 | |
813 align 16 | |
814 t4: | |
815 times 16 db 0x04 | |
OLD | NEW |