OLD | NEW |
1 ; | 1 ; |
2 ; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2) | 2 ; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
6 ; | 6 ; |
7 ; Based on | 7 ; Based on |
8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
(...skipping 294 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
305 | 305 |
306 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] | 306 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] |
307 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] | 307 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] |
308 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] | 308 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] |
309 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] | 309 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] |
310 | 310 |
311 ; ---- Pass 2: process rows, store into output array. | 311 ; ---- Pass 2: process rows, store into output array. |
312 | 312 |
313 mov rax, [original_rbp] | 313 mov rax, [original_rbp] |
314 mov rdi, r12 ; (JSAMPROW *) | 314 mov rdi, r12 ; (JSAMPROW *) |
315 » mov» rax, r13 | 315 » mov» eax, r13d |
316 | 316 |
317 ; -- Even part | 317 ; -- Even part |
318 | 318 |
319 pxor xmm4,xmm4 | 319 pxor xmm4,xmm4 |
320 punpcklwd xmm4,xmm1 ; xmm4=tmp0 | 320 punpcklwd xmm4,xmm1 ; xmm4=tmp0 |
321 psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 | 321 psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 |
322 | 322 |
323 ; -- Odd part | 323 ; -- Odd part |
324 | 324 |
325 punpckhwd xmm1,xmm0 | 325 punpckhwd xmm1,xmm0 |
(...skipping 188 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
514 ; -- Prefetch the next coefficient block | 514 ; -- Prefetch the next coefficient block |
515 | 515 |
516 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] | 516 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] |
517 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] | 517 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] |
518 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] | 518 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] |
519 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] | 519 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] |
520 | 520 |
521 ; ---- Pass 2: process rows, store into output array. | 521 ; ---- Pass 2: process rows, store into output array. |
522 | 522 |
523 mov rdi, r12 ; (JSAMPROW *) | 523 mov rdi, r12 ; (JSAMPROW *) |
524 » mov» rax, r13 | 524 » mov» eax, r13d |
525 | 525 |
526 ; | input:| result:| | 526 ; | input:| result:| |
527 ; | A0 B0 | | | 527 ; | A0 B0 | | |
528 ; | A1 B1 | C0 C1 | | 528 ; | A1 B1 | C0 C1 | |
529 ; | A3 B3 | D0 D1 | | 529 ; | A3 B3 | D0 D1 | |
530 ; | A5 B5 | | | 530 ; | A5 B5 | | |
531 ; | A7 B7 | | | 531 ; | A7 B7 | | |
532 | 532 |
533 ; -- Odd part | 533 ; -- Odd part |
534 | 534 |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
567 mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx | 567 mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx |
568 | 568 |
569 pop rbx | 569 pop rbx |
570 uncollect_args | 570 uncollect_args |
571 pop rbp | 571 pop rbp |
572 ret | 572 ret |
573 | 573 |
574 ; For some reason, the OS X linker does not honor the request to align the | 574 ; For some reason, the OS X linker does not honor the request to align the |
575 ; segment unless we do this. | 575 ; segment unless we do this. |
576 align 16 | 576 align 16 |
OLD | NEW |