Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(170)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_idctllm_sse2.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
; vp9_idct_dequant_0_2x_sse2
; DC-only dequantize + reconstruct for two side-by-side 4-wide blocks,
; processed together as one 8-byte-wide, 4-row strip.
;   dc[b] = qcoeff[16*b] * dequant[0]              (b = 0, 1)
;   dst   = saturate_u8(pre + ((dc[b] + 4) >> 3))
; The first two coefficients (4 bytes) of each block are zeroed in qcoeff.
; pre rows are blk_stride bytes apart; dst rows are dst_stride bytes apart.
; Only rax, rcx, rdx and xmm0-xmm5 are written directly by this routine.
14 ;void vp9_idct_dequant_0_2x_sse2
15 ; (
16 ; short *qcoeff - 0
17 ; short *dequant - 1
18 ; unsigned char *pre - 2
19 ; unsigned char *dst - 3
20 ; int dst_stride - 4
21 ; int blk_stride - 5
22 ; )
23
24 global sym(vp9_idct_dequant_0_2x_sse2)
25 sym(vp9_idct_dequant_0_2x_sse2):
26 push rbp
27 mov rbp, rsp
28 SHADOW_ARGS_TO_STACK 6
29 GET_GOT rbx
30 ; end prolog
31
32 mov rdx, arg(1) ; dequant
33 mov rax, arg(0) ; qcoeff
34
35 movd xmm4, [rax] ; words 0-1 = block 0 coefficients 0-1
36 movd xmm5, [rdx] ; words 0-1 = dequant[0..1]
37
38 pinsrw xmm4, [rax+32], 4 ; word 4 = block 1 coefficient 0 (32 bytes after block 0)
39 pinsrw xmm5, [rdx], 4 ; word 4 = dequant[0] again, for block 1
40
41 pmullw xmm4, xmm5 ; dequantized DC of block 0 in word 0, of block 1 in word 4
42
43 ; Zero out xmm5, for use unpacking
44 pxor xmm5, xmm5
45
46 ; clear coeffs (4 bytes, i.e. the first two words, of each block)
47 movd [rax], xmm5
48 movd [rax+32], xmm5
49 ;pshufb
50 pshuflw xmm4, xmm4, 00000000b ; broadcast word 0 over the low four words (block 0)
51 pshufhw xmm4, xmm4, 00000000b ; broadcast word 4 over the high four words (block 1)
52
53 mov rax, arg(2) ; pre
54 paddw xmm4, [GLOBAL(fours)] ; +4 rounding term for the >>3 below
55
56 movsxd rcx, dword ptr arg(5) ; blk_stride
57 psraw xmm4, 3 ; xmm4 = (dc + 4) >> 3, arithmetic shift
58
59 movq xmm0, [rax] ; pre row 0 (8 bytes: 4 per block)
60 movq xmm1, [rax+rcx] ; pre row 1
61 movq xmm2, [rax+2*rcx] ; pre row 2
62 lea rcx, [3*rcx] ; rcx = 3 * blk_stride
63 movq xmm3, [rax+rcx] ; pre row 3
64
65 punpcklbw xmm0, xmm5 ; zero-extend the 8 prediction bytes to words
66 punpcklbw xmm1, xmm5
67 punpcklbw xmm2, xmm5
68 punpcklbw xmm3, xmm5
69
70 mov rax, arg(3) ; dst
71 movsxd rdx, dword ptr arg(4) ; dst_stride
72
73 ; Add to predict buffer
74 paddw xmm0, xmm4
75 paddw xmm1, xmm4
76 paddw xmm2, xmm4
77 paddw xmm3, xmm4
78
79 ; pack up before storing (signed words -> unsigned bytes with saturation)
80 packuswb xmm0, xmm5
81 packuswb xmm1, xmm5
82 packuswb xmm2, xmm5
83 packuswb xmm3, xmm5
84
85 ; store blocks back out (8 bytes per row)
86 movq [rax], xmm0
87 movq [rax + rdx], xmm1
88
89 lea rax, [rax + 2*rdx] ; advance dst by two rows
90
91 movq [rax], xmm2
92 movq [rax + rdx], xmm3
93
94 ; begin epilog
95 RESTORE_GOT
96 UNSHADOW_ARGS
97 pop rbp
98 ret
99
; vp9_idct_dequant_full_2x_sse2
; Dequantize, inverse-transform and reconstruct two side-by-side blocks of
; 16 coefficients each, processed together as one 8-byte-wide, 4-row strip.
; Arguments, as read through arg(n) below:
;   0 qcoeff     - 64 bytes of 16-bit coefficients (block 0, then block 1);
;                  read with movdqa, so it must be 16-byte aligned;
;                  zeroed on return
;   1 dequant    - 16 words of dequantization factors, shared by both blocks;
;                  used as an SSE memory operand, so it must be 16-byte aligned
;   2 pre        - prediction bytes, rows blk_stride bytes apart
;   3 dst        - output bytes, rows dst_stride bytes apart
;   4 dst_stride - read as a signed 32-bit value
;   5 blk_stride - read as a signed 32-bit value
; Multiplies by the x_s1sqr2 / x_c1sqr2less1 constants are done as
; "pmulhw by the constant, then paddw the original value".
; The final results are rounded with (+4) >> 3 before being added to pre.
100 global sym(vp9_idct_dequant_full_2x_sse2)
101 sym(vp9_idct_dequant_full_2x_sse2):
102 push rbp
103 mov rbp, rsp
104 SHADOW_ARGS_TO_STACK 7 ; NOTE(review): only arg(0)..arg(5) are read below - confirm the 7
105 SAVE_XMM 7
106 GET_GOT rbx
107 push rsi
108 push rdi
109 ; end prolog
110
111 ; full transform path: all 16 coefficients of both blocks are loaded
112 ; from qcoeff and dequantized below
113 mov rax, arg(0) ; qcoeff
114 mov rsi, arg(2) ; pre
115 mov rdi, arg(3) ; dst
116 movsxd rcx, dword ptr arg(5) ; blk_stride
117
118 ; Zero out xmm7; it is used below to clear the coefficient buffer
119 pxor xmm7, xmm7
120
121 mov rdx, arg(1) ; dequant
122
123 ; note the swap of xmm1 and xmm2 in the load order, necessary for the
124 ; repacking shuffles below to spit out sensible data
125 movdqa xmm0, [rax] ; block 0, coefficients 0-7
126 movdqa xmm2, [rax+16] ; block 0, coefficients 8-15
127 movdqa xmm1, [rax+32] ; block 1, coefficients 0-7
128 movdqa xmm3, [rax+48] ; block 1, coefficients 8-15
129
130 ; Clear out coeffs
131 movdqa [rax], xmm7
132 movdqa [rax+16], xmm7
133 movdqa [rax+32], xmm7
134 movdqa [rax+48], xmm7
135
136 ; dequantize qcoeff buffer (same 16 factors for both blocks)
137 pmullw xmm0, [rdx]
138 pmullw xmm2, [rdx+16]
139 pmullw xmm1, [rdx]
140 pmullw xmm3, [rdx+16]
141
142 ; repack so block 0 row x and block 1 row x are together
143 movdqa xmm4, xmm0
144 punpckldq xmm0, xmm1
145 punpckhdq xmm4, xmm1
146
147 pshufd xmm0, xmm0, 11011000b ; xmm0 = row 0 of block 0 | row 0 of block 1
148 pshufd xmm1, xmm4, 11011000b ; xmm1 = row 1 of block 0 | row 1 of block 1
149
150 movdqa xmm4, xmm2
151 punpckldq xmm2, xmm3
152 punpckhdq xmm4, xmm3
153
154 pshufd xmm2, xmm2, 11011000b ; xmm2 = row 2 of block 0 | row 2 of block 1
155 pshufd xmm3, xmm4, 11011000b ; xmm3 = row 3 of block 0 | row 3 of block 1
156
157 ; first pass
158 psubw xmm0, xmm2 ; b1 = 0-2
159 paddw xmm2, xmm2 ; 2 * (row 2), so that adding b1 below gives 0+2
160
161 movdqa xmm5, xmm1
162 paddw xmm2, xmm0 ; a1 = 0+2
163
164 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
165 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
166
167 movdqa xmm7, xmm3
168 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
169
170 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
171 psubw xmm7, xmm5 ; c1
172
173 movdqa xmm5, xmm1
174 movdqa xmm4, xmm3 ; keep ip3: xmm3 is overwritten by the pmulhw below
175
176 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
177 paddw xmm5, xmm1
178
179 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
180 paddw xmm3, xmm4
181
182 paddw xmm3, xmm5 ; d1
183 movdqa xmm6, xmm2 ; a1
184
185 movdqa xmm4, xmm0 ; b1
186 paddw xmm2, xmm3 ;0
187
188 paddw xmm4, xmm7 ;1
189 psubw xmm0, xmm7 ;2
190
191 psubw xmm6, xmm3 ;3
192
193 ; transpose for the second pass
; NOTE(review): by the element numbers below, xmm0 (tagged ";2" above) is
; treated as elements 004-007 and xmm4 (tagged ";1") as 008-011 - confirm
; which labelling is intended.
194 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
195 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
196 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
197
198 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
199 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
200 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
201
202
203 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
204 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
205 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
206
207 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
208 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
209 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
210
211
212 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
213 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
214 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
215
216 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
217 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
218 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
219
220 pshufd xmm0, xmm2, 11011000b
221 pshufd xmm2, xmm1, 11011000b
222
223 pshufd xmm1, xmm5, 11011000b
224 pshufd xmm3, xmm7, 11011000b
225
226 ; second pass (same butterfly as the first pass, plus rounding)
227 psubw xmm0, xmm2 ; b1 = 0-2
228 paddw xmm2, xmm2
229
230 movdqa xmm5, xmm1
231 paddw xmm2, xmm0 ; a1 = 0+2
232
233 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
234 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
235
236 movdqa xmm7, xmm3
237 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
238
239 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
240 psubw xmm7, xmm5 ; c1
241
242 movdqa xmm5, xmm1
243 movdqa xmm4, xmm3
244
245 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
246 paddw xmm5, xmm1
247
248 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
249 paddw xmm3, xmm4
250
251 paddw xmm3, xmm5 ; d1
252 paddw xmm0, [GLOBAL(fours)] ; fold the +4 rounding term into b1 ...
253
254 paddw xmm2, [GLOBAL(fours)] ; ... and into a1, so all four outputs carry it
255 movdqa xmm6, xmm2 ; a1
256
257 movdqa xmm4, xmm0 ; b1
258 paddw xmm2, xmm3 ;0
259
260 paddw xmm4, xmm7 ;1
261 psubw xmm0, xmm7 ;2
262
263 psubw xmm6, xmm3 ;3
264 psraw xmm2, 3 ; (x + 4) >> 3 on each of the four outputs
265
266 psraw xmm0, 3
267 psraw xmm4, 3
268
269 psraw xmm6, 3
270
271 ; transpose to save
272 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
273 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
274 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
275
276 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
277 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
278 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
279
280
281 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
282 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
283 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
284
285 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
286 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
287 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
288
289
290 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
291 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
292 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
293
294 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
295 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
296 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
297
298 pshufd xmm0, xmm2, 11011000b
299 pshufd xmm2, xmm1, 11011000b
300
301 pshufd xmm1, xmm5, 11011000b
302 pshufd xmm3, xmm7, 11011000b
303
304 pxor xmm7, xmm7 ; re-zero xmm7 (it was used as a temporary above)
305
306 ; Load up predict blocks; xmm0..xmm3 are added to pre rows 0..3
307 movq xmm4, [rsi]
308 movq xmm5, [rsi+rcx]
309
310 punpcklbw xmm4, xmm7 ; zero-extend prediction bytes to words
311 punpcklbw xmm5, xmm7
312
313 paddw xmm0, xmm4
314 paddw xmm1, xmm5
315
316 movq xmm4, [rsi+2*rcx]
317 lea rcx, [3*rcx] ; rcx = 3 * blk_stride
318 movq xmm5, [rsi+rcx]
319
320 punpcklbw xmm4, xmm7
321 punpcklbw xmm5, xmm7
322
323 paddw xmm2, xmm4
324 paddw xmm3, xmm5
325
326 .finish: ; no jump in this function targets this label; it is reached by fall-through
327
328 ; pack up before storing (signed words -> unsigned bytes with saturation)
329 packuswb xmm0, xmm7
330 packuswb xmm1, xmm7
331 packuswb xmm2, xmm7
332 packuswb xmm3, xmm7
333
334 ; Load destination stride before writing out,
335 ; doesn't need to persist
336 movsxd rdx, dword ptr arg(4) ; dst_stride
337
338 ; store blocks back out (8 bytes per row)
339 movq [rdi], xmm0
340 movq [rdi + rdx], xmm1
341
342 lea rdi, [rdi + 2*rdx] ; advance dst by two rows
343
344 movq [rdi], xmm2
345 movq [rdi + rdx], xmm3
346
347 ; begin epilog
348 pop rdi
349 pop rsi
350 RESTORE_GOT
351 RESTORE_XMM
352 UNSHADOW_ARGS
353 pop rbp
354 ret
355
; vp9_idct_dequant_dc_0_2x_sse2
; DC-only reconstruct for two side-by-side 4-wide blocks, where the two DC
; values are supplied directly through the dc pointer (dc[0] for the left
; block, dc[1] for the right block):
;   dst = saturate_u8(pre + ((dc[b] + 4) >> 3))
; In this routine qcoeff is fetched but never dereferenced, dequant is never
; read, and no dequantization multiply is performed.
; pre rows are read at a fixed pitch of 16 bytes; dst rows are dst_stride
; bytes apart.
356 ;void vp9_idct_dequant_dc_0_2x_sse2
357 ; (
358 ; short *qcoeff - 0
359 ; short *dequant - 1
360 ; unsigned char *pre - 2
361 ; unsigned char *dst - 3
362 ; int dst_stride - 4
363 ; short *dc - 5
364 ; )
365 global sym(vp9_idct_dequant_dc_0_2x_sse2)
366 sym(vp9_idct_dequant_dc_0_2x_sse2):
367 push rbp
368 mov rbp, rsp
369 SHADOW_ARGS_TO_STACK 7 ; NOTE(review): only arg(0)..arg(5) are read below - confirm the 7
370 GET_GOT rbx
371 push rsi
372 push rdi
373 ; end prolog
374
375 ; special case when 2 blocks have 0 or 1 coeffs
376 ; the dc values are taken from the dc array, so qcoeff is not read
377 mov rax, arg(0) ; qcoeff (rax is not referenced again in this function)
378 mov rsi, arg(2) ; pre
379 mov rdi, arg(3) ; dst
380 mov rdx, arg(5) ; dc
381
382 ; Zero out xmm5, for use unpacking
383 pxor xmm5, xmm5
384
385 ; load up 2 dc words here == 2*16 = doubleword
386 movd xmm4, [rdx]
387
388 ; Load up predict blocks (8 bytes per row, rows 16 bytes apart)
389 movq xmm0, [rsi]
390 movq xmm1, [rsi+16]
391 movq xmm2, [rsi+32]
392 movq xmm3, [rsi+48]
393
394 ; Duplicate and expand dc across:
395 punpcklwd xmm4, xmm4 ; words: dc0 dc0 dc1 dc1 ...
396 punpckldq xmm4, xmm4 ; low four words = dc0, high four words = dc1
397
398 ; Round and downshift: (dc + 4) >> 3
399 paddw xmm4, [GLOBAL(fours)]
400 psraw xmm4, 3
401
402 ; Predict buffer needs to be expanded from bytes to words
403 punpcklbw xmm0, xmm5
404 punpcklbw xmm1, xmm5
405 punpcklbw xmm2, xmm5
406 punpcklbw xmm3, xmm5
407
408 ; Add to predict buffer
409 paddw xmm0, xmm4
410 paddw xmm1, xmm4
411 paddw xmm2, xmm4
412 paddw xmm3, xmm4
413
414 ; pack up before storing (signed words -> unsigned bytes with saturation)
415 packuswb xmm0, xmm5
416 packuswb xmm1, xmm5
417 packuswb xmm2, xmm5
418 packuswb xmm3, xmm5
419
420 ; Load destination stride before writing out,
421 ; doesn't need to persist
422 movsxd rdx, dword ptr arg(4) ; dst_stride
423
424 ; store blocks back out (8 bytes per row)
425 movq [rdi], xmm0
426 movq [rdi + rdx], xmm1
427
428 lea rdi, [rdi + 2*rdx] ; advance dst by two rows
429
430 movq [rdi], xmm2
431 movq [rdi + rdx], xmm3
432
433 ; begin epilog
434 pop rdi
435 pop rsi
436 RESTORE_GOT
437 UNSHADOW_ARGS
438 pop rbp
439 ret
440
; vp9_idct_dequant_dc_full_2x_sse2
; Dequantize, inverse-transform and reconstruct two side-by-side blocks of
; 16 coefficients each, with each block's first (DC) coefficient replaced
; after dequantization by a value read through arg(5).
; Arguments, as read through arg(n) below:
;   0 qcoeff     - 64 bytes of 16-bit coefficients (block 0, then block 1);
;                  read with movdqa, so it must be 16-byte aligned;
;                  zeroed on return
;   1 dequant    - 16 words of dequantization factors, shared by both blocks;
;                  used as an SSE memory operand, so it must be 16-byte aligned
;   2 pre        - prediction bytes, rows read at a fixed pitch of 16 bytes
;   3 dst        - output bytes, rows dst_stride bytes apart
;   4 dst_stride - read as a signed 32-bit value
;   5 dc         - two 16-bit DC values: [0] for block 0, [1] for block 1
; Multiplies by the x_s1sqr2 / x_c1sqr2less1 constants are done as
; "pmulhw by the constant, then paddw the original value".
; The final results are rounded with (+4) >> 3 before being added to pre.
441 global sym(vp9_idct_dequant_dc_full_2x_sse2)
442 sym(vp9_idct_dequant_dc_full_2x_sse2):
443 push rbp
444 mov rbp, rsp
445 SHADOW_ARGS_TO_STACK 7 ; NOTE(review): only arg(0)..arg(5) are read below - confirm the 7
446 SAVE_XMM 7
447 GET_GOT rbx
448 push rsi
449 push rdi
450 ; end prolog
451
452 ; full transform path: the coefficients of both blocks are dequantized
453 ; below, then each block's DC term is overwritten from the dc array
454 mov rax, arg(0) ; qcoeff
455 mov rsi, arg(2) ; pre
456 mov rdi, arg(3) ; dst
457
458 ; Zero out xmm7; it is used below to clear the coefficient buffer
459 pxor xmm7, xmm7
460
461 mov rdx, arg(1) ; dequant
462
463 ; note the swap of xmm1 and xmm2 in the load order, necessary for the
464 ; repacking shuffles below to spit out sensible data
465 movdqa xmm0, [rax] ; block 0, coefficients 0-7
466 movdqa xmm2, [rax+16] ; block 0, coefficients 8-15
467 movdqa xmm1, [rax+32] ; block 1, coefficients 0-7
468 movdqa xmm3, [rax+48] ; block 1, coefficients 8-15
469
470 ; Clear out coeffs
471 movdqa [rax], xmm7
472 movdqa [rax+16], xmm7
473 movdqa [rax+32], xmm7
474 movdqa [rax+48], xmm7
475
476 ; dequantize qcoeff buffer (same 16 factors for both blocks)
477 pmullw xmm0, [rdx]
478 pmullw xmm2, [rdx+16]
479 pmullw xmm1, [rdx]
480 pmullw xmm3, [rdx+16]
481
482 ; DC component (rdx is reused: dequant is no longer needed)
483 mov rdx, arg(5)
484
485 ; repack so block 0 row x and block 1 row x are together
486 movdqa xmm4, xmm0
487 punpckldq xmm0, xmm1
488 punpckhdq xmm4, xmm1
489
490 pshufd xmm0, xmm0, 11011000b ; xmm0 = row 0 of block 0 | row 0 of block 1
491 pshufd xmm1, xmm4, 11011000b ; xmm1 = row 1 of block 0 | row 1 of block 1
492
493 movdqa xmm4, xmm2
494 punpckldq xmm2, xmm3
495 punpckhdq xmm4, xmm3
496
497 pshufd xmm2, xmm2, 11011000b ; xmm2 = row 2 of block 0 | row 2 of block 1
498 pshufd xmm3, xmm4, 11011000b ; xmm3 = row 3 of block 0 | row 3 of block 1
499
500 ; insert DC component, replacing the dequantized first coefficient
501 pinsrw xmm0, [rdx], 0 ; word 0 = dc[0] (block 0)
502 pinsrw xmm0, [rdx+2], 4 ; word 4 = dc[1] (block 1)
503
504 ; first pass
505 psubw xmm0, xmm2 ; b1 = 0-2
506 paddw xmm2, xmm2 ; 2 * (row 2), so that adding b1 below gives 0+2
507
508 movdqa xmm5, xmm1
509 paddw xmm2, xmm0 ; a1 = 0+2
510
511 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
512 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
513
514 movdqa xmm7, xmm3
515 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
516
517 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
518 psubw xmm7, xmm5 ; c1
519
520 movdqa xmm5, xmm1
521 movdqa xmm4, xmm3 ; keep ip3: xmm3 is overwritten by the pmulhw below
522
523 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
524 paddw xmm5, xmm1
525
526 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
527 paddw xmm3, xmm4
528
529 paddw xmm3, xmm5 ; d1
530 movdqa xmm6, xmm2 ; a1
531
532 movdqa xmm4, xmm0 ; b1
533 paddw xmm2, xmm3 ;0
534
535 paddw xmm4, xmm7 ;1
536 psubw xmm0, xmm7 ;2
537
538 psubw xmm6, xmm3 ;3
539
540 ; transpose for the second pass
; NOTE(review): by the element numbers below, xmm0 (tagged ";2" above) is
; treated as elements 004-007 and xmm4 (tagged ";1") as 008-011 - confirm
; which labelling is intended.
541 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
542 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
543 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
544
545 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
546 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
547 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
548
549
550 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
551 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
552 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
553
554 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
555 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
556 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
557
558
559 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
560 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
561 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
562
563 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
564 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
565 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
566
567 pshufd xmm0, xmm2, 11011000b
568 pshufd xmm2, xmm1, 11011000b
569
570 pshufd xmm1, xmm5, 11011000b
571 pshufd xmm3, xmm7, 11011000b
572
573 ; second pass (same butterfly as the first pass, plus rounding)
574 psubw xmm0, xmm2 ; b1 = 0-2
575 paddw xmm2, xmm2
576
577 movdqa xmm5, xmm1
578 paddw xmm2, xmm0 ; a1 = 0+2
579
580 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
581 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
582
583 movdqa xmm7, xmm3
584 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
585
586 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
587 psubw xmm7, xmm5 ; c1
588
589 movdqa xmm5, xmm1
590 movdqa xmm4, xmm3
591
592 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
593 paddw xmm5, xmm1
594
595 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
596 paddw xmm3, xmm4
597
598 paddw xmm3, xmm5 ; d1
599 paddw xmm0, [GLOBAL(fours)] ; fold the +4 rounding term into b1 ...
600
601 paddw xmm2, [GLOBAL(fours)] ; ... and into a1, so all four outputs carry it
602 movdqa xmm6, xmm2 ; a1
603
604 movdqa xmm4, xmm0 ; b1
605 paddw xmm2, xmm3 ;0
606
607 paddw xmm4, xmm7 ;1
608 psubw xmm0, xmm7 ;2
609
610 psubw xmm6, xmm3 ;3
611 psraw xmm2, 3 ; (x + 4) >> 3 on each of the four outputs
612
613 psraw xmm0, 3
614 psraw xmm4, 3
615
616 psraw xmm6, 3
617
618 ; transpose to save
619 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
620 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
621 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
622
623 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
624 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
625 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
626
627
628 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
629 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
630 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
631
632 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
633 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
634 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
635
636
637 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
638 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
639 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
640
641 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
642 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
643 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
644
645 pshufd xmm0, xmm2, 11011000b
646 pshufd xmm2, xmm1, 11011000b
647
648 pshufd xmm1, xmm5, 11011000b
649 pshufd xmm3, xmm7, 11011000b
650
651 pxor xmm7, xmm7 ; re-zero xmm7 (it was used as a temporary above)
652
653 ; Load up predict blocks (8 bytes per row, rows 16 bytes apart);
; xmm0..xmm3 are added to pre rows 0..3
654 movq xmm4, [rsi]
655 movq xmm5, [rsi+16]
656
657 punpcklbw xmm4, xmm7 ; zero-extend prediction bytes to words
658 punpcklbw xmm5, xmm7
659
660 paddw xmm0, xmm4
661 paddw xmm1, xmm5
662
663 movq xmm4, [rsi+32]
664 movq xmm5, [rsi+48]
665
666 punpcklbw xmm4, xmm7
667 punpcklbw xmm5, xmm7
668
669 paddw xmm2, xmm4
670 paddw xmm3, xmm5
671
672 .finish: ; no jump in this function targets this label; it is reached by fall-through
673
674 ; pack up before storing (signed words -> unsigned bytes with saturation)
675 packuswb xmm0, xmm7
676 packuswb xmm1, xmm7
677 packuswb xmm2, xmm7
678 packuswb xmm3, xmm7
679
680 ; Load destination stride before writing out,
681 ; doesn't need to persist
682 movsxd rdx, dword ptr arg(4) ; dst_stride
683
684 ; store blocks back out (8 bytes per row)
685 movq [rdi], xmm0
686 movq [rdi + rdx], xmm1
687
688 lea rdi, [rdi + 2*rdx] ; advance dst by two rows
689
690 movq [rdi], xmm2
691 movq [rdi + rdx], xmm3
692
693
694 ; begin epilog
695 pop rdi
696 pop rsi
697 RESTORE_GOT
698 RESTORE_XMM
699 UNSHADOW_ARGS
700 pop rbp
701 ret
702
; Read-only constants, each replicated across all 8 words of a 16-byte
; aligned vector so they can be used directly as SSE2 memory operands.
703 SECTION_RODATA
704 align 16
; Rounding term added before the arithmetic shift right by 3.
705 fours:
706 times 8 dw 0x0004
707 align 16
; 0x8A8C = 35468 ~= sin(pi/8) * sqrt(2) * 65536. The code multiplies with
; pmulhw (signed high half) and then adds the original value back.
708 x_s1sqr2:
709 times 8 dw 0x8A8C
710 align 16
; 0x4E7B = 20091 ~= (cos(pi/8) * sqrt(2) - 1) * 65536. Used the same way:
; pmulhw followed by adding the original value yields x * cos(pi/8) * sqrt(2).
711 x_c1sqr2less1:
712 times 8 dw 0x4E7B
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698