| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2) | 2 ; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2) |
| 3 ; | 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
| 6 ; | 6 ; |
| 7 ; Based on | 7 ; Based on |
| 8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
| 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| (...skipping 308 matching lines...) |
| 319 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] | 319 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] |
| 320 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] | 320 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] |
| 321 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] | 321 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] |
| 322 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] | 322 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] |
| 323 | 323 |
| 324 ; ---- Pass 2: process rows from work array, store into output array. | 324 ; ---- Pass 2: process rows from work array, store into output array. |
| 325 | 325 |
| 326 mov rax, [original_rbp] | 326 mov rax, [original_rbp] |
| 327 lea rsi, [workspace] ; FAST_FLOAT * wsptr | 327 lea rsi, [workspace] ; FAST_FLOAT * wsptr |
| 328 mov rdi, r12 ; (JSAMPROW *) | 328 mov rdi, r12 ; (JSAMPROW *) |
| 329 mov rax, r13 | 329 mov eax, r13d |
| 330 mov rcx, DCTSIZE/4 ; ctr | 330 mov rcx, DCTSIZE/4 ; ctr |
| 331 .rowloop: | 331 .rowloop: |
| 332 | 332 |
| 333 ; -- Even part | 333 ; -- Even part |
| 334 | 334 |
| 335 movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] | 335 movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] |
| 336 movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] | 336 movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] |
| 337 movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] | 337 movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] |
| 338 movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] | 338 movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] |
| 339 | 339 |
| (...skipping 134 matching lines...) |
| 474 pop rbx | 474 pop rbx |
| 475 uncollect_args | 475 uncollect_args |
| 476 mov rsp,rbp ; rsp <- aligned rbp | 476 mov rsp,rbp ; rsp <- aligned rbp |
| 477 pop rsp ; rsp <- original rbp | 477 pop rsp ; rsp <- original rbp |
| 478 pop rbp | 478 pop rbp |
| 479 ret | 479 ret |
| 480 | 480 |
| 481 ; For some reason, the OS X linker does not honor the request to align the | 481 ; For some reason, the OS X linker does not honor the request to align the |
| 482 ; segment unless we do this. | 482 ; segment unless we do this. |
| 483 align 16 | 483 align 16 |
| OLD | NEW |