| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; jiss2fst-64.asm - fast integer IDCT (64-bit SSE2) | 2 ; jiss2fst-64.asm - fast integer IDCT (64-bit SSE2) |
| 3 ; | 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
| 6 ; | 6 ; |
| 7 ; Based on | 7 ; Based on |
| 8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
| 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 53 ; -------------------------------------------------------------------------- | 53 ; -------------------------------------------------------------------------- |
| 54 SECTION SEG_CONST | 54 SECTION SEG_CONST |
| 55 | 55 |
| 56 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) | 56 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) |
| 57 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) | 57 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) |
| 58 | 58 |
| 59 %define PRE_MULTIPLY_SCALE_BITS 2 | 59 %define PRE_MULTIPLY_SCALE_BITS 2 |
| 60 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | 60 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) |
| 61 | 61 |
| 62 alignz 16 | 62 alignz 16 |
| 63 » global» EXTN(jconst_idct_ifast_sse2) | 63 » global» EXTN(jconst_idct_ifast_sse2) PRIVATE |
| 64 | 64 |
| 65 EXTN(jconst_idct_ifast_sse2): | 65 EXTN(jconst_idct_ifast_sse2): |
| 66 | 66 |
| 67 PW_F1414 times 8 dw F_1_414 << CONST_SHIFT | 67 PW_F1414 times 8 dw F_1_414 << CONST_SHIFT |
| 68 PW_F1847 times 8 dw F_1_847 << CONST_SHIFT | 68 PW_F1847 times 8 dw F_1_847 << CONST_SHIFT |
| 69 PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT | 69 PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT |
| 70 PW_F1082 times 8 dw F_1_082 << CONST_SHIFT | 70 PW_F1082 times 8 dw F_1_082 << CONST_SHIFT |
| 71 PB_CENTERJSAMP times 16 db CENTERJSAMPLE | 71 PB_CENTERJSAMP times 16 db CENTERJSAMPLE |
| 72 | 72 |
| 73 alignz 16 | 73 alignz 16 |
| (...skipping 12 matching lines...) Expand all Loading... |
| 86 ; r10 = jpeg_component_info * compptr | 86 ; r10 = jpeg_component_info * compptr |
| 87 ; r11 = JCOEFPTR coef_block | 87 ; r11 = JCOEFPTR coef_block |
| 88 ; r12 = JSAMPARRAY output_buf | 88 ; r12 = JSAMPARRAY output_buf |
| 89 ; r13 = JDIMENSION output_col | 89 ; r13 = JDIMENSION output_col |
| 90 | 90 |
| 91 %define original_rbp rbp+0 | 91 %define original_rbp rbp+0 |
| 92 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 92 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] |
| 93 %define WK_NUM 2 | 93 %define WK_NUM 2 |
| 94 | 94 |
| 95 align 16 | 95 align 16 |
| 96 » global» EXTN(jsimd_idct_ifast_sse2) | 96 » global» EXTN(jsimd_idct_ifast_sse2) PRIVATE |
| 97 | 97 |
| 98 EXTN(jsimd_idct_ifast_sse2): | 98 EXTN(jsimd_idct_ifast_sse2): |
| 99 push rbp | 99 push rbp |
| 100 mov rax,rsp ; rax = original rbp | 100 mov rax,rsp ; rax = original rbp |
| 101 sub rsp, byte 4 | 101 sub rsp, byte 4 |
| 102 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits | 102 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
| 103 mov [rsp],rax | 103 mov [rsp],rax |
| 104 mov rbp,rsp ; rbp = aligned rbp | 104 mov rbp,rsp ; rbp = aligned rbp |
| 105 lea rsp, [wk(0)] | 105 lea rsp, [wk(0)] |
| 106 collect_args | 106 collect_args |
| (...skipping 376 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 483 uncollect_args | 483 uncollect_args |
| 484 mov rsp,rbp ; rsp <- aligned rbp | 484 mov rsp,rbp ; rsp <- aligned rbp |
| 485 pop rsp ; rsp <- original rbp | 485 pop rsp ; rsp <- original rbp |
| 486 pop rbp | 486 pop rbp |
| 487 ret | 487 ret |
| 488 ret | 488 ret |
| 489 | 489 |
| 490 ; For some reason, the OS X linker does not honor the request to align the | 490 ; For some reason, the OS X linker does not honor the request to align the |
| 491 ; segment unless we do this. | 491 ; segment unless we do this. |
| 492 align 16 | 492 align 16 |
| OLD | NEW |