| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2) | 2 ; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2) |
| 3 ; | 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
| 6 ; | 6 ; |
| 7 ; Based on | 7 ; Based on |
| 8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
| 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| (...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 66 F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) | 66 F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) |
| 67 F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) | 67 F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) |
| 68 F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) | 68 F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) |
| 69 F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) | 69 F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) |
| 70 %endif | 70 %endif |
| 71 | 71 |
| 72 ; -------------------------------------------------------------------------- | 72 ; -------------------------------------------------------------------------- |
| 73 SECTION SEG_CONST | 73 SECTION SEG_CONST |
| 74 | 74 |
| 75 alignz 16 | 75 alignz 16 |
| 76 global EXTN(jconst_idct_red_sse2) | 76 global EXTN(jconst_idct_red_sse2) PRIVATE |
| 77 | 77 |
| 78 EXTN(jconst_idct_red_sse2): | 78 EXTN(jconst_idct_red_sse2): |
| 79 | 79 |
| 80 PW_F184_MF076 times 4 dw F_1_847,-F_0_765 | 80 PW_F184_MF076 times 4 dw F_1_847,-F_0_765 |
| 81 PW_F256_F089 times 4 dw F_2_562, F_0_899 | 81 PW_F256_F089 times 4 dw F_2_562, F_0_899 |
| 82 PW_F106_MF217 times 4 dw F_1_061,-F_2_172 | 82 PW_F106_MF217 times 4 dw F_1_061,-F_2_172 |
| 83 PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 | 83 PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 |
| 84 PW_F145_MF021 times 4 dw F_1_451,-F_0_211 | 84 PW_F145_MF021 times 4 dw F_1_451,-F_0_211 |
| 85 PW_F362_MF127 times 4 dw F_3_624,-F_1_272 | 85 PW_F362_MF127 times 4 dw F_3_624,-F_1_272 |
| 86 PW_F085_MF072 times 4 dw F_0_850,-F_0_720 | 86 PW_F085_MF072 times 4 dw F_0_850,-F_0_720 |
| (...skipping 20 matching lines...) Expand all Loading... |
| 107 ; r10 = void * dct_table | 107 ; r10 = void * dct_table |
| 108 ; r11 = JCOEFPTR coef_block | 108 ; r11 = JCOEFPTR coef_block |
| 109 ; r12 = JSAMPARRAY output_buf | 109 ; r12 = JSAMPARRAY output_buf |
| 110 ; r13 = JDIMENSION output_col | 110 ; r13 = JDIMENSION output_col |
| 111 | 111 |
| 112 %define original_rbp rbp+0 | 112 %define original_rbp rbp+0 |
| 113 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 113 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] |
| 114 %define WK_NUM 2 | 114 %define WK_NUM 2 |
| 115 | 115 |
| 116 align 16 | 116 align 16 |
| 117 global EXTN(jsimd_idct_4x4_sse2) | 117 global EXTN(jsimd_idct_4x4_sse2) PRIVATE |
| 118 | 118 |
| 119 EXTN(jsimd_idct_4x4_sse2): | 119 EXTN(jsimd_idct_4x4_sse2): |
| 120 push rbp | 120 push rbp |
| 121 mov rax,rsp ; rax = original rbp | 121 mov rax,rsp ; rax = original rbp |
| 122 sub rsp, byte 4 | 122 sub rsp, byte 4 |
| 123 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits | 123 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
| 124 mov [rsp],rax | 124 mov [rsp],rax |
| 125 mov rbp,rsp ; rbp = aligned rbp | 125 mov rbp,rsp ; rbp = aligned rbp |
| 126 lea rsp, [wk(0)] | 126 lea rsp, [wk(0)] |
| 127 collect_args | 127 collect_args |
| (...skipping 278 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 406 ; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block, | 406 ; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block, |
| 407 ; JSAMPARRAY output_buf, JDIMENSION output_col) | 407 ; JSAMPARRAY output_buf, JDIMENSION output_col) |
| 408 ; | 408 ; |
| 409 | 409 |
| 410 ; r10 = void * dct_table | 410 ; r10 = void * dct_table |
| 411 ; r11 = JCOEFPTR coef_block | 411 ; r11 = JCOEFPTR coef_block |
| 412 ; r12 = JSAMPARRAY output_buf | 412 ; r12 = JSAMPARRAY output_buf |
| 413 ; r13 = JDIMENSION output_col | 413 ; r13 = JDIMENSION output_col |
| 414 | 414 |
| 415 align 16 | 415 align 16 |
| 416 global EXTN(jsimd_idct_2x2_sse2) | 416 global EXTN(jsimd_idct_2x2_sse2) PRIVATE |
| 417 | 417 |
| 418 EXTN(jsimd_idct_2x2_sse2): | 418 EXTN(jsimd_idct_2x2_sse2): |
| 419 push rbp | 419 push rbp |
| 420 mov rax,rsp | 420 mov rax,rsp |
| 421 mov rbp,rsp | 421 mov rbp,rsp |
| 422 collect_args | 422 collect_args |
| 423 push rbx | 423 push rbx |
| 424 | 424 |
| 425 ; ---- Pass 1: process columns from input. | 425 ; ---- Pass 1: process columns from input. |
| 426 | 426 |
| (...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 567 mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx | 567 mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx |
| 568 | 568 |
| 569 pop rbx | 569 pop rbx |
| 570 uncollect_args | 570 uncollect_args |
| 571 pop rbp | 571 pop rbp |
| 572 ret | 572 ret |
| 573 | 573 |
| 574 ; For some reason, the OS X linker does not honor the request to align the | 574 ; For some reason, the OS X linker does not honor the request to align the |
| 575 ; segment unless we do this. | 575 ; segment unless we do this. |
| 576 align 16 | 576 align 16 |
| OLD | NEW |