Chromium Code Reviews

Side by Side Diff: source/libvpx/vp8/common/x86/loopfilter_block_sse2.asm

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 6 months ago
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 %macro LF_ABS 2
15 ; %1 value not preserved
16 ; %2 value preserved
17 ; output in %1
18 movdqa scratch1, %2 ; v2
19
20 psubusb scratch1, %1 ; v2 - v1
21 psubusb %1, %2 ; v1 - v2
22 por %1, scratch1 ; abs(v2 - v1)
23 %endmacro
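; [annotation] psubusb saturates at zero, so only one of the two differences
; survives; OR-ing them yields the unsigned absolute difference. Rough scalar
; sketch (reader's note, not part of the original source):
;   result = (v1 >= v2) ? (v1 - v2) : (v2 - v1)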
24
25 %macro LF_FILTER_HEV_MASK 8-9
26
27 LF_ABS %1, %2 ; abs(p3 - p2)
28 LF_ABS %2, %3 ; abs(p2 - p1)
29 pmaxub %1, %2 ; accumulate mask
30 %if %0 == 8
31 movdqa scratch2, %3 ; save p1
32 LF_ABS scratch2, %4 ; abs(p1 - p0)
33 %endif
34 LF_ABS %4, %5 ; abs(p0 - q0)
35 LF_ABS %5, %6 ; abs(q0 - q1)
36 %if %0 == 8
37 pmaxub %5, scratch2 ; accumulate hev
38 %else
39 pmaxub %5, %9 ; accumulate hev (abs(p1 - p0) supplied in %9)
40 %endif
41 pmaxub %1, %5 ; accumulate mask
42
43 LF_ABS %3, %6 ; abs(p1 - q1)
44 LF_ABS %6, %7 ; abs(q1 - q2)
45 pmaxub %1, %6 ; accumulate mask
46 LF_ABS %7, %8 ; abs(q2 - q3)
47 pmaxub %1, %7 ; accumulate mask
48
49 paddusb %4, %4 ; 2 * abs(p0 - q0)
50 pand %3, [GLOBAL(tfe)]
51 psrlw %3, 1 ; abs(p1 - q1) / 2
52 paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
53
54 psubusb %1, [limit]
55 psubusb %4, [blimit]
56 por %1, %4
57 pcmpeqb %1, zero ; mask
58
59 psubusb %5, [thresh]
60 pcmpeqb %5, zero ; ~hev
61 %endmacro
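; [annotation] computes the VP8 loop-filter edge mask and the "high edge
; variance" (hev) test for 16 pixels at once. On exit %1 is an all-ones byte
; mask where every neighbouring difference abs(p3-p2) .. abs(q2-q3) is
; <= limit AND abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit: psubusb against the
; thresholds leaves zero only when a value is within bounds, and pcmpeqb
; against zero converts that into the mask. %5 returns ~hev: all-ones where
; max(abs(p1-p0), abs(q1-q0)) <= thresh. The pand with tfe clears each
; byte's low bit so the word-wise psrlw cannot borrow across byte lanes.
; The optional 9th argument supplies an abs(p1-p0) already computed by the
; previous edge's pass, saving the recomputation done in the 8-argument form.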
62
63 %macro LF_FILTER 6
64 ; %1-%4: p1-q1
65 ; %5: mask
66 ; %6: hev
67
68 movdqa scratch2, %6 ; save hev
69
70 pxor %1, [GLOBAL(t80)] ; ps1
71 pxor %4, [GLOBAL(t80)] ; qs1
72 movdqa scratch1, %1
73 psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
74 pandn scratch2, scratch1 ; vp8_filter &= hev
75
76 pxor %2, [GLOBAL(t80)] ; ps0
77 pxor %3, [GLOBAL(t80)] ; qs0
78 movdqa scratch1, %3
79 psubsb scratch1, %2 ; qs0 - ps0
80 paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
81 paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
82 paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
83 pand %5, scratch2 ; &= mask
84
85 movdqa scratch2, %5
86 paddsb %5, [GLOBAL(t4)] ; Filter1
87 paddsb scratch2, [GLOBAL(t3)] ; Filter2
88
89 ; Filter1 >> 3
90 movdqa scratch1, zero
91 pcmpgtb scratch1, %5
92 psrlw %5, 3
93 pand scratch1, [GLOBAL(te0)]
94 pand %5, [GLOBAL(t1f)]
95 por %5, scratch1
96
97 psubsb %3, %5 ; qs0 - Filter1
98 pxor %3, [GLOBAL(t80)]
99
100 ; Filter2 >> 3
101 movdqa scratch1, zero
102 pcmpgtb scratch1, scratch2
103 psrlw scratch2, 3
104 pand scratch1, [GLOBAL(te0)]
105 pand scratch2, [GLOBAL(t1f)]
106 por scratch2, scratch1
107
108 paddsb %2, scratch2 ; ps0 + Filter2
109 pxor %2, [GLOBAL(t80)]
110
111 ; outer tap adjustments
112 paddsb %5, [GLOBAL(t1)]
113 movdqa scratch1, zero
114 pcmpgtb scratch1, %5
115 psrlw %5, 1
116 pand scratch1, [GLOBAL(t80)]
117 pand %5, [GLOBAL(t7f)]
118 por %5, scratch1
119 pand %5, %6 ; vp8_filter &= ~hev
120
121 psubsb %4, %5 ; qs1 - vp8_filter
122 pxor %4, [GLOBAL(t80)]
123
124 paddsb %1, %5 ; ps1 + vp8_filter
125 pxor %1, [GLOBAL(t80)]
126 %endmacro
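; [annotation] the standard VP8 inner-edge filter applied to 16 pixels.
; Rough scalar sketch (reader's note, not part of the original source):
;   a  = clamp(ps1 - qs1) & hev
;   a  = clamp(a + 3 * (qs0 - ps0)) & mask
;   F1 = clamp(a + 4) >> 3;        qs0 = clamp(qs0 - F1)
;   F2 = clamp(a + 3) >> 3;        ps0 = clamp(ps0 + F2)
;   u  = ((F1 + 1) >> 1) & ~hev;   qs1 = clamp(qs1 - u); ps1 = clamp(ps1 + u)
; where clamp() is signed-byte saturation and the pixels are biased by 0x80
; (t80) so signed arithmetic can be used, then un-biased on the way out.
; SSE2 has no arithmetic byte shift, so each shift is a word-wise psrlw
; followed by a fix-up: t1f/t7f mask off the bits that leaked in from the
; neighbouring byte and the pcmpgtb result (ANDed with te0/t80) restores
; the sign bits.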
127
128 ;void vp8_loop_filter_bh_y_sse2
129 ;(
130 ; unsigned char *src_ptr,
131 ; int src_pixel_step,
132 ; const char *blimit,
133 ; const char *limit,
134 ; const char *thresh
135 ;)
136 global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
137 sym(vp8_loop_filter_bh_y_sse2):
138
139 %if LIBVPX_YASM_WIN64
140 %define src rcx ; src_ptr
141 %define stride rdx ; src_pixel_step
142 %define blimit r8
143 %define limit r9
144 %define thresh r10
145
146 %define spp rax
147 %define stride3 r11
148 %define stride5 r12
149 %define stride7 r13
150
151 push rbp
152 mov rbp, rsp
153 SAVE_XMM 11
154 push r12
155 push r13
156 mov thresh, arg(4)
157 %else
158 %define src rdi ; src_ptr
159 %define stride rsi ; src_pixel_step
160 %define blimit rdx
161 %define limit rcx
162 %define thresh r8
163
164 %define spp rax
165 %define stride3 r9
166 %define stride5 r10
167 %define stride7 r11
168 %endif
169
170 %define scratch1 xmm5
171 %define scratch2 xmm6
172 %define zero xmm7
173
174 %define i0 [src]
175 %define i1 [spp]
176 %define i2 [src + 2 * stride]
177 %define i3 [spp + 2 * stride]
178 %define i4 [src + 4 * stride]
179 %define i5 [spp + 4 * stride]
180 %define i6 [src + 2 * stride3]
181 %define i7 [spp + 2 * stride3]
182 %define i8 [src + 8 * stride]
183 %define i9 [spp + 8 * stride]
184 %define i10 [src + 2 * stride5]
185 %define i11 [spp + 2 * stride5]
186 %define i12 [src + 4 * stride3]
187 %define i13 [spp + 4 * stride3]
188 %define i14 [src + 2 * stride7]
189 %define i15 [spp + 2 * stride7]
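; [annotation] i0..i15 address 16 consecutive rows (src + n * stride). SIB
; addressing only scales by 1, 2, 4 or 8, so the odd multiples are reached
; through spp (src + stride) and the stride3/5/7 values computed just below.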
190
191 ; prep work
192 lea spp, [src + stride]
193 lea stride3, [stride + 2 * stride]
194 lea stride5, [stride3 + 2 * stride]
195 lea stride7, [stride3 + 4 * stride]
196 pxor zero, zero
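; [annotation] the three mask/filter passes below handle the three internal
; block edges of the macroblock (between rows 3|4, 7|8 and 11|12). The
; register commented "q2, will contain abs(p1-p0)" is left by the macro
; holding abs(q2 - q3), which equals abs(p1 - p0) for the next edge, so it
; is handed to the following pass as the optional 9th argument.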
197
198 ; load the first set into registers
199 movdqa xmm0, i0
200 movdqa xmm1, i1
201 movdqa xmm2, i2
202 movdqa xmm3, i3
203 movdqa xmm4, i4
204 movdqa xmm8, i5
205 movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
206 movdqa xmm10, i7
207 LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
208
209 movdqa xmm1, i2
210 movdqa xmm2, i3
211 movdqa xmm3, i4
212 movdqa xmm8, i5
213 LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
214 movdqa i2, xmm1
215 movdqa i3, xmm2
216
217 ; second set
218 movdqa i4, xmm3
219 movdqa i5, xmm8
220
221 movdqa xmm0, i6
222 movdqa xmm1, i7
223 movdqa xmm2, i8
224 movdqa xmm4, i9
225 movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
226 movdqa xmm11, i11
227 LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
228
229 movdqa xmm0, i6
230 movdqa xmm1, i7
231 movdqa xmm4, i8
232 movdqa xmm8, i9
233 LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
234 movdqa i6, xmm0
235 movdqa i7, xmm1
236
237 ; last set
238 movdqa i8, xmm4
239 movdqa i9, xmm8
240
241 movdqa xmm0, i10
242 movdqa xmm1, i11
243 movdqa xmm2, i12
244 movdqa xmm3, i13
245 movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
246 movdqa xmm11, i15
247 LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
248
249 movdqa xmm0, i10
250 movdqa xmm1, i11
251 movdqa xmm3, i12
252 movdqa xmm8, i13
253 LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
254 movdqa i10, xmm0
255 movdqa i11, xmm1
256 movdqa i12, xmm3
257 movdqa i13, xmm8
258
259 %if LIBVPX_YASM_WIN64
260 pop r13
261 pop r12
262 RESTORE_XMM
263 pop rbp
264 %endif
265
266 ret
267
268
269 ;void vp8_loop_filter_bv_y_sse2
270 ;(
271 ; unsigned char *src_ptr,
272 ; int src_pixel_step,
273 ; const char *blimit,
274 ; const char *limit,
275 ; const char *thresh
276 ;)
277
278 global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
279 sym(vp8_loop_filter_bv_y_sse2):
280
281 %if LIBVPX_YASM_WIN64
282 %define src rcx ; src_ptr
283 %define stride rdx ; src_pixel_step
284 %define blimit r8
285 %define limit r9
286 %define thresh r10
287
288 %define spp rax
289 %define stride3 r11
290 %define stride5 r12
291 %define stride7 r13
292
293 push rbp
294 mov rbp, rsp
295 SAVE_XMM 15
296 push r12
297 push r13
298 mov thresh, arg(4)
299 %else
300 %define src rdi
301 %define stride rsi
302 %define blimit rdx
303 %define limit rcx
304 %define thresh r8
305
306 %define spp rax
307 %define stride3 r9
308 %define stride5 r10
309 %define stride7 r11
310 %endif
311
312 %define scratch1 xmm5
313 %define scratch2 xmm6
314 %define zero xmm7
315
316 %define s0 [src]
317 %define s1 [spp]
318 %define s2 [src + 2 * stride]
319 %define s3 [spp + 2 * stride]
320 %define s4 [src + 4 * stride]
321 %define s5 [spp + 4 * stride]
322 %define s6 [src + 2 * stride3]
323 %define s7 [spp + 2 * stride3]
324 %define s8 [src + 8 * stride]
325 %define s9 [spp + 8 * stride]
326 %define s10 [src + 2 * stride5]
327 %define s11 [spp + 2 * stride5]
328 %define s12 [src + 4 * stride3]
329 %define s13 [spp + 4 * stride3]
330 %define s14 [src + 2 * stride7]
331 %define s15 [spp + 2 * stride7]
332
333 %define i0 [rsp]
334 %define i1 [rsp + 16]
335 %define i2 [rsp + 32]
336 %define i3 [rsp + 48]
337 %define i4 [rsp + 64]
338 %define i5 [rsp + 80]
339 %define i6 [rsp + 96]
340 %define i7 [rsp + 112]
341 %define i8 [rsp + 128]
342 %define i9 [rsp + 144]
343 %define i10 [rsp + 160]
344 %define i11 [rsp + 176]
345 %define i12 [rsp + 192]
346 %define i13 [rsp + 208]
347 %define i14 [rsp + 224]
348 %define i15 [rsp + 240]
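; [annotation] s0..s15 are the 16 source rows in memory; i0..i15 are sixteen
; 16-byte stack slots that will hold the transposed data so the same
; row-oriented filter code can be applied to the vertical edges.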
349
350 ALIGN_STACK 16, rax
351
352 ; reserve stack space
353 %define temp_storage 0 ; size is 256 (16*16)
354 %define stack_size 256
355 sub rsp, stack_size
356
357 ; prep work
358 lea spp, [src + stride]
359 lea stride3, [stride + 2 * stride]
360 lea stride5, [stride3 + 2 * stride]
361 lea stride7, [stride3 + 4 * stride]
362
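; [annotation] the following two sections perform a 16x16 byte transpose in
; four interleave stages (punpck*bw -> *wd -> *dq -> *qdq): rows 8-15 are
; processed first and their partial results spilled to i0..i7 on the stack,
; then rows 0-7 are processed in registers and the halves are merged with
; punpcklqdq/punpckhqdq in the "final combination" step.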
363 ; 8-f
364 movdqa xmm0, s8
365 movdqa xmm1, xmm0
366 punpcklbw xmm0, s9 ; 80 90
367 punpckhbw xmm1, s9 ; 88 98
368
369 movdqa xmm2, s10
370 movdqa xmm3, xmm2
371 punpcklbw xmm2, s11 ; a0 b0
372 punpckhbw xmm3, s11 ; a8 b8
373
374 movdqa xmm4, xmm0
375 punpcklwd xmm0, xmm2 ; 80 90 a0 b0
376 punpckhwd xmm4, xmm2 ; 84 94 a4 b4
377
378 movdqa xmm2, xmm1
379 punpcklwd xmm1, xmm3 ; 88 98 a8 b8
380 punpckhwd xmm2, xmm3 ; 8c 9c ac bc
381
382 ; using xmm[0124]
383 ; work on next 4 rows
384
385 movdqa xmm3, s12
386 movdqa xmm5, xmm3
387 punpcklbw xmm3, s13 ; c0 d0
388 punpckhbw xmm5, s13 ; c8 d8
389
390 movdqa xmm6, s14
391 movdqa xmm7, xmm6
392 punpcklbw xmm6, s15 ; e0 f0
393 punpckhbw xmm7, s15 ; e8 f8
394
395 movdqa xmm8, xmm3
396 punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
397 punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
398
399 movdqa xmm6, xmm5
400 punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
401 punpckhwd xmm6, xmm7 ; cc dc ec fc
402
403 ; pull the third and fourth sets together
404
405 movdqa xmm7, xmm0
406 punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
407 punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
408
409 movdqa xmm3, xmm4
410 punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
411 punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
412
413 movdqa xmm8, xmm1
414 punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
415 punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
416
417 movdqa xmm5, xmm2
418 punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
419 punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
420
421 ; save the calculations. the 16 transposed rows will not all fit in the xmm registers ...
422 movdqa i0, xmm0
423 movdqa i1, xmm7
424 movdqa i2, xmm4
425 movdqa i3, xmm3
426 movdqa i4, xmm1
427 movdqa i5, xmm8
428 movdqa i6, xmm2
429 movdqa i7, xmm5
430
431 ; 0-7
432 movdqa xmm0, s0
433 movdqa xmm1, xmm0
434 punpcklbw xmm0, s1 ; 00 10
435 punpckhbw xmm1, s1 ; 08 18
436
437 movdqa xmm2, s2
438 movdqa xmm3, xmm2
439 punpcklbw xmm2, s3 ; 20 30
440 punpckhbw xmm3, s3 ; 28 38
441
442 movdqa xmm4, xmm0
443 punpcklwd xmm0, xmm2 ; 00 10 20 30
444 punpckhwd xmm4, xmm2 ; 04 14 24 34
445
446 movdqa xmm2, xmm1
447 punpcklwd xmm1, xmm3 ; 08 18 28 38
448 punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
449
450 ; using xmm[0124]
451 ; work on next 4 rows
452
453 movdqa xmm3, s4
454 movdqa xmm5, xmm3
455 punpcklbw xmm3, s5 ; 40 50
456 punpckhbw xmm5, s5 ; 48 58
457
458 movdqa xmm6, s6
459 movdqa xmm7, xmm6
460 punpcklbw xmm6, s7 ; 60 70
461 punpckhbw xmm7, s7 ; 68 78
462
463 movdqa xmm8, xmm3
464 punpcklwd xmm3, xmm6 ; 40 50 60 70
465 punpckhwd xmm8, xmm6 ; 44 54 64 74
466
467 movdqa xmm6, xmm5
468 punpcklwd xmm5, xmm7 ; 48 58 68 78
469 punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
470
471 ; pull the first two sets together
472
473 movdqa xmm7, xmm0
474 punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
475 punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
476
477 movdqa xmm3, xmm4
478 punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
479 punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
480
481 movdqa xmm8, xmm1
482 punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
483 punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
484
485 movdqa xmm5, xmm2
486 punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
487 punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
488 ; final combination
489
490 movdqa xmm6, xmm0
491 punpcklqdq xmm0, i0
492 punpckhqdq xmm6, i0
493
494 movdqa xmm9, xmm7
495 punpcklqdq xmm7, i1
496 punpckhqdq xmm9, i1
497
498 movdqa xmm10, xmm4
499 punpcklqdq xmm4, i2
500 punpckhqdq xmm10, i2
501
502 movdqa xmm11, xmm3
503 punpcklqdq xmm3, i3
504 punpckhqdq xmm11, i3
505
506 movdqa xmm12, xmm1
507 punpcklqdq xmm1, i4
508 punpckhqdq xmm12, i4
509
510 movdqa xmm13, xmm8
511 punpcklqdq xmm8, i5
512 punpckhqdq xmm13, i5
513
514 movdqa xmm14, xmm2
515 punpcklqdq xmm2, i6
516 punpckhqdq xmm14, i6
517
518 movdqa xmm15, xmm5
519 punpcklqdq xmm5, i7
520 punpckhqdq xmm15, i7
521
522 movdqa i0, xmm0
523 movdqa i1, xmm6
524 movdqa i2, xmm7
525 movdqa i3, xmm9
526 movdqa i4, xmm4
527 movdqa i5, xmm10
528 movdqa i6, xmm3
529 movdqa i7, xmm11
530 movdqa i8, xmm1
531 movdqa i9, xmm12
532 movdqa i10, xmm8
533 movdqa i11, xmm13
534 movdqa i12, xmm2
535 movdqa i13, xmm14
536 movdqa i14, xmm5
537 movdqa i15, xmm15
538
539 ; TRANSPOSED DATA AVAILABLE ON THE STACK
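; [annotation] from here the flow mirrors vp8_loop_filter_bh_y_sse2: i0..i15
; now hold the transposed columns, so the same three mask/filter passes
; handle the three internal vertical block edges, and only the modified
; lines (i2..i13) are written back to the stack before being transposed out.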
540
541 movdqa xmm12, xmm6
542 movdqa xmm13, xmm7
543
544 pxor zero, zero
545
546 LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
547
548 movdqa xmm1, i2
549 movdqa xmm2, i3
550 movdqa xmm8, i4
551 movdqa xmm9, i5
552 LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
553 movdqa i2, xmm1
554 movdqa i3, xmm2
555
556 ; second set
557 movdqa i4, xmm8
558 movdqa i5, xmm9
559
560 movdqa xmm0, i6
561 movdqa xmm1, i7
562 movdqa xmm2, i8
563 movdqa xmm4, i9
564 movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
565 movdqa xmm11, i11
566 LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
567
568 movdqa xmm0, i6
569 movdqa xmm1, i7
570 movdqa xmm3, i8
571 movdqa xmm4, i9
572 LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
573 movdqa i6, xmm0
574 movdqa i7, xmm1
575
576 ; last set
577 movdqa i8, xmm3
578 movdqa i9, xmm4
579
580 movdqa xmm0, i10
581 movdqa xmm1, i11
582 movdqa xmm2, i12
583 movdqa xmm8, i13
584 movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
585 movdqa xmm11, i15
586 LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
587
588 movdqa xmm0, i10
589 movdqa xmm1, i11
590 movdqa xmm4, i12
591 movdqa xmm8, i13
592 LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
593 movdqa i10, xmm0
594 movdqa i11, xmm1
595 movdqa i12, xmm4
596 movdqa i13, xmm8
597
598
599 ; RESHUFFLE AND WRITE OUT
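; [annotation] inverse of the load transpose: the filtered lines i0..i15 are
; transposed back and stored to the original row addresses s0..s15.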
600 ; 8-f
601 movdqa xmm0, i8
602 movdqa xmm1, xmm0
603 punpcklbw xmm0, i9 ; 80 90
604 punpckhbw xmm1, i9 ; 88 98
605
606 movdqa xmm2, i10
607 movdqa xmm3, xmm2
608 punpcklbw xmm2, i11 ; a0 b0
609 punpckhbw xmm3, i11 ; a8 b8
610
611 movdqa xmm4, xmm0
612 punpcklwd xmm0, xmm2 ; 80 90 a0 b0
613 punpckhwd xmm4, xmm2 ; 84 94 a4 b4
614
615 movdqa xmm2, xmm1
616 punpcklwd xmm1, xmm3 ; 88 98 a8 b8
617 punpckhwd xmm2, xmm3 ; 8c 9c ac bc
618
619 ; using xmm[0124]
620 ; work on next 4 rows
621
622 movdqa xmm3, i12
623 movdqa xmm5, xmm3
624 punpcklbw xmm3, i13 ; c0 d0
625 punpckhbw xmm5, i13 ; c8 d8
626
627 movdqa xmm6, i14
628 movdqa xmm7, xmm6
629 punpcklbw xmm6, i15 ; e0 f0
630 punpckhbw xmm7, i15 ; e8 f8
631
632 movdqa xmm8, xmm3
633 punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
634 punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
635
636 movdqa xmm6, xmm5
637 punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
638 punpckhwd xmm6, xmm7 ; cc dc ec fc
639
640 ; pull the third and fourth sets together
641
642 movdqa xmm7, xmm0
643 punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
644 punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
645
646 movdqa xmm3, xmm4
647 punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
648 punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
649
650 movdqa xmm8, xmm1
651 punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
652 punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
653
654 movdqa xmm5, xmm2
655 punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
656 punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
657
658 ; save the calculations. the 16 transposed rows will not all fit in the xmm registers ...
659 movdqa i8, xmm0
660 movdqa i9, xmm7
661 movdqa i10, xmm4
662 movdqa i11, xmm3
663 movdqa i12, xmm1
664 movdqa i13, xmm8
665 movdqa i14, xmm2
666 movdqa i15, xmm5
667
668 ; 0-7
669 movdqa xmm0, i0
670 movdqa xmm1, xmm0
671 punpcklbw xmm0, i1 ; 00 10
672 punpckhbw xmm1, i1 ; 08 18
673
674 movdqa xmm2, i2
675 movdqa xmm3, xmm2
676 punpcklbw xmm2, i3 ; 20 30
677 punpckhbw xmm3, i3 ; 28 38
678
679 movdqa xmm4, xmm0
680 punpcklwd xmm0, xmm2 ; 00 10 20 30
681 punpckhwd xmm4, xmm2 ; 04 14 24 34
682
683 movdqa xmm2, xmm1
684 punpcklwd xmm1, xmm3 ; 08 18 28 38
685 punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
686
687 ; using xmm[0124]
688 ; work on next 4 rows
689
690 movdqa xmm3, i4
691 movdqa xmm5, xmm3
692 punpcklbw xmm3, i5 ; 40 50
693 punpckhbw xmm5, i5 ; 48 58
694
695 movdqa xmm6, i6
696 movdqa xmm7, xmm6
697 punpcklbw xmm6, i7 ; 60 70
698 punpckhbw xmm7, i7 ; 68 78
699
700 movdqa xmm8, xmm3
701 punpcklwd xmm3, xmm6 ; 40 50 60 70
702 punpckhwd xmm8, xmm6 ; 44 54 64 74
703
704 movdqa xmm6, xmm5
705 punpcklwd xmm5, xmm7 ; 48 58 68 78
706 punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
707
708 ; pull the first two sets together
709
710 movdqa xmm7, xmm0
711 punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
712 punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
713
714 movdqa xmm3, xmm4
715 punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
716 punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
717
718 movdqa xmm8, xmm1
719 punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
720 punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
721
722 movdqa xmm5, xmm2
723 punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
724 punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
725 ; final combination
726
727 movdqa xmm6, xmm0
728 punpcklqdq xmm0, i8
729 punpckhqdq xmm6, i8
730
731 movdqa xmm9, xmm7
732 punpcklqdq xmm7, i9
733 punpckhqdq xmm9, i9
734
735 movdqa xmm10, xmm4
736 punpcklqdq xmm4, i10
737 punpckhqdq xmm10, i10
738
739 movdqa xmm11, xmm3
740 punpcklqdq xmm3, i11
741 punpckhqdq xmm11, i11
742
743 movdqa xmm12, xmm1
744 punpcklqdq xmm1, i12
745 punpckhqdq xmm12, i12
746
747 movdqa xmm13, xmm8
748 punpcklqdq xmm8, i13
749 punpckhqdq xmm13, i13
750
751 movdqa xmm14, xmm2
752 punpcklqdq xmm2, i14
753 punpckhqdq xmm14, i14
754
755 movdqa xmm15, xmm5
756 punpcklqdq xmm5, i15
757 punpckhqdq xmm15, i15
758
759 movdqa s0, xmm0
760 movdqa s1, xmm6
761 movdqa s2, xmm7
762 movdqa s3, xmm9
763 movdqa s4, xmm4
764 movdqa s5, xmm10
765 movdqa s6, xmm3
766 movdqa s7, xmm11
767 movdqa s8, xmm1
768 movdqa s9, xmm12
769 movdqa s10, xmm8
770 movdqa s11, xmm13
771 movdqa s12, xmm2
772 movdqa s13, xmm14
773 movdqa s14, xmm5
774 movdqa s15, xmm15
775
776 ; free stack space
777 add rsp, stack_size
778
779 ; un-ALIGN_STACK
780 pop rsp
781
782 %if LIBVPX_YASM_WIN64
783 pop r13
784 pop r12
785 RESTORE_XMM
786 pop rbp
787 %endif
788
789 ret
790
791 SECTION_RODATA
792 align 16
793 te0:
794 times 16 db 0xe0
795 align 16
796 t7f:
797 times 16 db 0x7f
798 align 16
799 tfe:
800 times 16 db 0xfe
801 align 16
802 t1f:
803 times 16 db 0x1f
804 align 16
805 t80:
806 times 16 db 0x80
807 align 16
808 t1:
809 times 16 db 0x01
810 align 16
811 t3:
812 times 16 db 0x03
813 align 16
814 t4:
815 times 16 db 0x04