| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2) | 2 ; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2) |
| 3 ; | 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
| 6 ; | 6 ; |
| 7 ; Based on | 7 ; Based on |
| 8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
| 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| (...skipping 294 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 305 | 305 |
| 306 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] | 306 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] |
| 307 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] | 307 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] |
| 308 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] | 308 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] |
| 309 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] | 309 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] |
| 310 | 310 |
| 311 ; ---- Pass 2: process rows, store into output array. | 311 ; ---- Pass 2: process rows, store into output array. |
| 312 | 312 |
| 313 mov rax, [original_rbp] | 313 mov rax, [original_rbp] |
| 314 mov rdi, r12 ; (JSAMPROW *) | 314 mov rdi, r12 ; (JSAMPROW *) |
| 315 » mov» rax, r13 | 315 » mov» eax, r13d |
| 316 | 316 |
| 317 ; -- Even part | 317 ; -- Even part |
| 318 | 318 |
| 319 pxor xmm4,xmm4 | 319 pxor xmm4,xmm4 |
| 320 punpcklwd xmm4,xmm1 ; xmm4=tmp0 | 320 punpcklwd xmm4,xmm1 ; xmm4=tmp0 |
| 321 psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 | 321 psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 |
| 322 | 322 |
| 323 ; -- Odd part | 323 ; -- Odd part |
| 324 | 324 |
| 325 punpckhwd xmm1,xmm0 | 325 punpckhwd xmm1,xmm0 |
| (...skipping 188 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 514 ; -- Prefetch the next coefficient block | 514 ; -- Prefetch the next coefficient block |
| 515 | 515 |
| 516 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] | 516 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] |
| 517 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] | 517 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] |
| 518 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] | 518 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] |
| 519 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] | 519 prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] |
| 520 | 520 |
| 521 ; ---- Pass 2: process rows, store into output array. | 521 ; ---- Pass 2: process rows, store into output array. |
| 522 | 522 |
| 523 mov rdi, r12 ; (JSAMPROW *) | 523 mov rdi, r12 ; (JSAMPROW *) |
| 524 » mov» rax, r13 | 524 » mov» eax, r13d |
| 525 | 525 |
| 526 ; | input:| result:| | 526 ; | input:| result:| |
| 527 ; | A0 B0 | | | 527 ; | A0 B0 | | |
| 528 ; | A1 B1 | C0 C1 | | 528 ; | A1 B1 | C0 C1 | |
| 529 ; | A3 B3 | D0 D1 | | 529 ; | A3 B3 | D0 D1 | |
| 530 ; | A5 B5 | | | 530 ; | A5 B5 | | |
| 531 ; | A7 B7 | | | 531 ; | A7 B7 | | |
| 532 | 532 |
| 533 ; -- Odd part | 533 ; -- Odd part |
| 534 | 534 |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 567 mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx | 567 mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx |
| 568 | 568 |
| 569 pop rbx | 569 pop rbx |
| 570 uncollect_args | 570 uncollect_args |
| 571 pop rbp | 571 pop rbp |
| 572 ret | 572 ret |
| 573 | 573 |
| 574 ; For some reason, the OS X linker does not honor the request to align the | 574 ; For some reason, the OS X linker does not honor the request to align the |
| 575 ; segment unless we do this. | 575 ; segment unless we do this. |
| 576 align 16 | 576 align 16 |
| OLD | NEW |