OLD | NEW |
1 ; | 1 ; |
2 ; jidctflt.asm - floating-point IDCT (3DNow! & MMX) | 2 ; jidctflt.asm - floating-point IDCT (3DNow! & MMX) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; | 5 ; |
6 ; Based on | 6 ; Based on |
7 ; x86 SIMD extension for IJG JPEG library | 7 ; x86 SIMD extension for IJG JPEG library |
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
10 ; | 10 ; |
11 ; This file should be assembled with NASM (Netwide Assembler), | 11 ; This file should be assembled with NASM (Netwide Assembler), |
12 ; and can *not* be assembled with Microsoft's MASM or any compatible | 12 ; and can *not* be assembled with Microsoft's MASM or any compatible |
13 ; assembler (including Borland's Turbo Assembler). | 13 ; assembler (including Borland's Turbo Assembler). |
14 ; NASM is available from http://nasm.sourceforge.net/ or | 14 ; NASM is available from http://nasm.sourceforge.net/ or |
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | 15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
16 ; | 16 ; |
17 ; This file contains a floating-point implementation of the inverse DCT | 17 ; This file contains a floating-point implementation of the inverse DCT |
18 ; (Discrete Cosine Transform). The following code is based directly on | 18 ; (Discrete Cosine Transform). The following code is based directly on |
19 ; the IJG's original jidctflt.c; see the jidctflt.c for more details. | 19 ; the IJG's original jidctflt.c; see the jidctflt.c for more details. |
20 ; | 20 ; |
21 ; [TAB8] | 21 ; [TAB8] |
22 | 22 |
23 %include "jsimdext.inc" | 23 %include "jsimdext.inc" |
24 %include "jdct.inc" | 24 %include "jdct.inc" |
25 | 25 |
26 ; -------------------------------------------------------------------------- | 26 ; -------------------------------------------------------------------------- |
27 SECTION SEG_CONST | 27 SECTION SEG_CONST |
28 | 28 |
29 alignz 16 | 29 alignz 16 |
30 global EXTN(jconst_idct_float_3dnow) | 30 global EXTN(jconst_idct_float_3dnow) PRIVATE |
31 | 31 |
32 EXTN(jconst_idct_float_3dnow): | 32 EXTN(jconst_idct_float_3dnow): |
33 | 33 |
34 PD_1_414 times 2 dd 1.414213562373095048801689 | 34 PD_1_414 times 2 dd 1.414213562373095048801689 |
35 PD_1_847 times 2 dd 1.847759065022573512256366 | 35 PD_1_847 times 2 dd 1.847759065022573512256366 |
36 PD_1_082 times 2 dd 1.082392200292393968799446 | 36 PD_1_082 times 2 dd 1.082392200292393968799446 |
37 PD_2_613 times 2 dd 2.613125929752753055713286 | 37 PD_2_613 times 2 dd 2.613125929752753055713286 |
38 PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) | 38 PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) |
39 PB_CENTERJSAMP times 8 db CENTERJSAMPLE | 39 PB_CENTERJSAMP times 8 db CENTERJSAMPLE |
40 | 40 |
(...skipping 15 matching lines...) Expand all Loading... |
56 %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf | 56 %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf |
57 %define output_col(b) (b)+20 ; JDIMENSION output_col | 57 %define output_col(b) (b)+20 ; JDIMENSION output_col |
58 | 58 |
59 %define original_ebp ebp+0 | 59 %define original_ebp ebp+0 |
60 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] | 60 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] |
61 %define WK_NUM 2 | 61 %define WK_NUM 2 |
62 %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT | 62 %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT |
63 ; FAST_FLOAT workspace[DCTSIZE2] | 63 ; FAST_FLOAT workspace[DCTSIZE2] |
64 | 64 |
65 align 16 | 65 align 16 |
66 global EXTN(jsimd_idct_float_3dnow) | 66 global EXTN(jsimd_idct_float_3dnow) PRIVATE |
67 | 67 |
68 EXTN(jsimd_idct_float_3dnow): | 68 EXTN(jsimd_idct_float_3dnow): |
69 push ebp | 69 push ebp |
70 mov eax,esp ; eax = original ebp | 70 mov eax,esp ; eax = original ebp |
71 sub esp, byte 4 | 71 sub esp, byte 4 |
72 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits | 72 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits |
73 mov [esp],eax | 73 mov [esp],eax |
74 mov ebp,esp ; ebp = aligned ebp | 74 mov ebp,esp ; ebp = aligned ebp |
75 lea esp, [workspace] | 75 lea esp, [workspace] |
76 push ebx | 76 push ebx |
(...skipping 366 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
443 ; pop ecx ; need not be preserved | 443 ; pop ecx ; need not be preserved |
444 pop ebx | 444 pop ebx |
445 mov esp,ebp ; esp <- aligned ebp | 445 mov esp,ebp ; esp <- aligned ebp |
446 pop esp ; esp <- original ebp | 446 pop esp ; esp <- original ebp |
447 pop ebp | 447 pop ebp |
448 ret | 448 ret |
449 | 449 |
450 ; For some reason, the OS X linker does not honor the request to align the | 450 ; For some reason, the OS X linker does not honor the request to align the |
451 ; segment unless we do this. | 451 ; segment unless we do this. |
452 align 16 | 452 align 16 |
OLD | NEW |