OLD | NEW |
1 ; | 1 ; |
2 ; jiss2fst.asm - fast integer IDCT (SSE2) | 2 ; jiss2fst.asm - fast integer IDCT (SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; | 5 ; |
6 ; Based on | 6 ; Based on |
7 ; x86 SIMD extension for IJG JPEG library | 7 ; x86 SIMD extension for IJG JPEG library |
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
10 ; | 10 ; |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
52 ; -------------------------------------------------------------------------- | 52 ; -------------------------------------------------------------------------- |
53 SECTION SEG_CONST | 53 SECTION SEG_CONST |
54 | 54 |
55 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) | 55 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) |
56 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) | 56 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) |
57 | 57 |
58 %define PRE_MULTIPLY_SCALE_BITS 2 | 58 %define PRE_MULTIPLY_SCALE_BITS 2 |
59 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | 59 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) |
60 | 60 |
61 alignz 16 | 61 alignz 16 |
62 » global» EXTN(jconst_idct_ifast_sse2) | 62 » global» EXTN(jconst_idct_ifast_sse2) PRIVATE |
63 | 63 |
64 EXTN(jconst_idct_ifast_sse2): | 64 EXTN(jconst_idct_ifast_sse2): |
65 | 65 |
66 PW_F1414 times 8 dw F_1_414 << CONST_SHIFT | 66 PW_F1414 times 8 dw F_1_414 << CONST_SHIFT |
67 PW_F1847 times 8 dw F_1_847 << CONST_SHIFT | 67 PW_F1847 times 8 dw F_1_847 << CONST_SHIFT |
68 PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT | 68 PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT |
69 PW_F1082 times 8 dw F_1_082 << CONST_SHIFT | 69 PW_F1082 times 8 dw F_1_082 << CONST_SHIFT |
70 PB_CENTERJSAMP times 16 db CENTERJSAMPLE | 70 PB_CENTERJSAMP times 16 db CENTERJSAMPLE |
71 | 71 |
72 alignz 16 | 72 alignz 16 |
(...skipping 12 matching lines...) Expand all Loading... |
85 %define dct_table(b) (b)+8 ; jpeg_component_info * compptr | 85 %define dct_table(b) (b)+8 ; jpeg_component_info * compptr |
86 %define coef_block(b) (b)+12 ; JCOEFPTR coef_block | 86 %define coef_block(b) (b)+12 ; JCOEFPTR coef_block |
87 %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf | 87 %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf |
88 %define output_col(b) (b)+20 ; JDIMENSION output_col | 88 %define output_col(b) (b)+20 ; JDIMENSION output_col |
89 | 89 |
90 %define original_ebp ebp+0 | 90 %define original_ebp ebp+0 |
91 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 91 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] |
92 %define WK_NUM 2 | 92 %define WK_NUM 2 |
93 | 93 |
94 align 16 | 94 align 16 |
95 » global» EXTN(jsimd_idct_ifast_sse2) | 95 » global» EXTN(jsimd_idct_ifast_sse2) PRIVATE |
96 | 96 |
97 EXTN(jsimd_idct_ifast_sse2): | 97 EXTN(jsimd_idct_ifast_sse2): |
98 push ebp | 98 push ebp |
99 mov eax,esp ; eax = original ebp | 99 mov eax,esp ; eax = original ebp |
100 sub esp, byte 4 | 100 sub esp, byte 4 |
101 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits | 101 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
102 mov [esp],eax | 102 mov [esp],eax |
103 mov ebp,esp ; ebp = aligned ebp | 103 mov ebp,esp ; ebp = aligned ebp |
104 lea esp, [wk(0)] | 104 lea esp, [wk(0)] |
105 pushpic ebx | 105 pushpic ebx |
(...skipping 387 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
493 ; pop ecx ; unused | 493 ; pop ecx ; unused |
494 poppic ebx | 494 poppic ebx |
495 mov esp,ebp ; esp <- aligned ebp | 495 mov esp,ebp ; esp <- aligned ebp |
496 pop esp ; esp <- original ebp | 496 pop esp ; esp <- original ebp |
497 pop ebp | 497 pop ebp |
498 ret | 498 ret |
499 | 499 |
500 ; For some reason, the OS X linker does not honor the request to align the | 500 ; For some reason, the OS X linker does not honor the request to align the |
501 ; segment unless we do this. | 501 ; segment unless we do this. |
502 align 16 | 502 align 16 |
OLD | NEW |