OLD | NEW |
1 ; | 1 ; |
2 ; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2) | 2 ; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
6 ; | 6 ; |
7 ; Based on | 7 ; Based on |
8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
66 F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) | 66 F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) |
67 F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) | 67 F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) |
68 F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) | 68 F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) |
69 F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) | 69 F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) |
70 %endif | 70 %endif |
71 | 71 |
72 ; -------------------------------------------------------------------------- | 72 ; -------------------------------------------------------------------------- |
73 SECTION SEG_CONST | 73 SECTION SEG_CONST |
74 | 74 |
75 alignz 16 | 75 alignz 16 |
76 » global» EXTN(jconst_idct_red_sse2) | 76 » global» EXTN(jconst_idct_red_sse2) PRIVATE |
77 | 77 |
78 EXTN(jconst_idct_red_sse2): | 78 EXTN(jconst_idct_red_sse2): |
79 | 79 |
80 PW_F184_MF076 times 4 dw F_1_847,-F_0_765 | 80 PW_F184_MF076 times 4 dw F_1_847,-F_0_765 |
81 PW_F256_F089 times 4 dw F_2_562, F_0_899 | 81 PW_F256_F089 times 4 dw F_2_562, F_0_899 |
82 PW_F106_MF217 times 4 dw F_1_061,-F_2_172 | 82 PW_F106_MF217 times 4 dw F_1_061,-F_2_172 |
83 PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 | 83 PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 |
84 PW_F145_MF021 times 4 dw F_1_451,-F_0_211 | 84 PW_F145_MF021 times 4 dw F_1_451,-F_0_211 |
85 PW_F362_MF127 times 4 dw F_3_624,-F_1_272 | 85 PW_F362_MF127 times 4 dw F_3_624,-F_1_272 |
86 PW_F085_MF072 times 4 dw F_0_850,-F_0_720 | 86 PW_F085_MF072 times 4 dw F_0_850,-F_0_720 |
(...skipping 20 matching lines...) Expand all Loading... |
107 ; r10 = void * dct_table | 107 ; r10 = void * dct_table |
108 ; r11 = JCOEFPTR coef_block | 108 ; r11 = JCOEFPTR coef_block |
109 ; r12 = JSAMPARRAY output_buf | 109 ; r12 = JSAMPARRAY output_buf |
110 ; r13 = JDIMENSION output_col | 110 ; r13 = JDIMENSION output_col |
111 | 111 |
112 %define original_rbp rbp+0 | 112 %define original_rbp rbp+0 |
113 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 113 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] |
114 %define WK_NUM 2 | 114 %define WK_NUM 2 |
115 | 115 |
116 align 16 | 116 align 16 |
117 » global» EXTN(jsimd_idct_4x4_sse2) | 117 » global» EXTN(jsimd_idct_4x4_sse2) PRIVATE |
118 | 118 |
119 EXTN(jsimd_idct_4x4_sse2): | 119 EXTN(jsimd_idct_4x4_sse2): |
120 push rbp | 120 push rbp |
121 mov rax,rsp ; rax = original rbp | 121 mov rax,rsp ; rax = original rbp |
122 sub rsp, byte 4 | 122 sub rsp, byte 4 |
123 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits | 123 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
124 mov [rsp],rax | 124 mov [rsp],rax |
125 mov rbp,rsp ; rbp = aligned rbp | 125 mov rbp,rsp ; rbp = aligned rbp |
126 lea rsp, [wk(0)] | 126 lea rsp, [wk(0)] |
127 collect_args | 127 collect_args |
(...skipping 278 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
406 ; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block, | 406 ; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block, |
407 ; JSAMPARRAY output_buf, JDIMENSION output_col) | 407 ; JSAMPARRAY output_buf, JDIMENSION output_col) |
408 ; | 408 ; |
409 | 409 |
410 ; r10 = void * dct_table | 410 ; r10 = void * dct_table |
411 ; r11 = JCOEFPTR coef_block | 411 ; r11 = JCOEFPTR coef_block |
412 ; r12 = JSAMPARRAY output_buf | 412 ; r12 = JSAMPARRAY output_buf |
413 ; r13 = JDIMENSION output_col | 413 ; r13 = JDIMENSION output_col |
414 | 414 |
415 align 16 | 415 align 16 |
416 » global» EXTN(jsimd_idct_2x2_sse2) | 416 » global» EXTN(jsimd_idct_2x2_sse2) PRIVATE |
417 | 417 |
418 EXTN(jsimd_idct_2x2_sse2): | 418 EXTN(jsimd_idct_2x2_sse2): |
419 push rbp | 419 push rbp |
420 mov rax,rsp | 420 mov rax,rsp |
421 mov rbp,rsp | 421 mov rbp,rsp |
422 collect_args | 422 collect_args |
423 push rbx | 423 push rbx |
424 | 424 |
425 ; ---- Pass 1: process columns from input. | 425 ; ---- Pass 1: process columns from input. |
426 | 426 |
(...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
567 mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx | 567 mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx |
568 | 568 |
569 pop rbx | 569 pop rbx |
570 uncollect_args | 570 uncollect_args |
571 pop rbp | 571 pop rbp |
572 ret | 572 ret |
573 | 573 |
574 ; For some reason, the OS X linker does not honor the request to align the | 574 ; For some reason, the OS X linker does not honor the request to align the |
575 ; segment unless we do this. | 575 ; segment unless we do this. |
576 align 16 | 576 align 16 |
OLD | NEW |