OLD | NEW |
1 ; | 1 ; |
2 ; jfdctflt.asm - floating-point FDCT (3DNow!) | 2 ; jfdctflt.asm - floating-point FDCT (3DNow!) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; | 5 ; |
6 ; Based on | 6 ; Based on |
7 ; x86 SIMD extension for IJG JPEG library | 7 ; x86 SIMD extension for IJG JPEG library |
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
10 ; | 10 ; |
11 ; This file should be assembled with NASM (Netwide Assembler) and | 11 ; This file should be assembled with NASM (Netwide Assembler) and |
12 ; can *not* be assembled with Microsoft's MASM or any compatible | 12 ; can *not* be assembled with Microsoft's MASM or any compatible |
13 ; assembler (including Borland's Turbo Assembler). | 13 ; assembler (including Borland's Turbo Assembler). |
14 ; NASM is available from http://nasm.sourceforge.net/ or | 14 ; NASM is available from http://nasm.sourceforge.net/ or |
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | 15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
16 ; | 16 ; |
17 ; This file contains a floating-point implementation of the forward DCT | 17 ; This file contains a floating-point implementation of the forward DCT |
18 ; (Discrete Cosine Transform). The following code is based directly on | 18 ; (Discrete Cosine Transform). The following code is based directly on |
19 ; the IJG's original jfdctflt.c; see jfdctflt.c for more details. | 19 ; the IJG's original jfdctflt.c; see jfdctflt.c for more details. |
20 ; | 20 ; |
21 ; [TAB8] | 21 ; [TAB8] |
22 | 22 |
23 %include "jsimdext.inc" | 23 %include "jsimdext.inc" |
24 %include "jdct.inc" | 24 %include "jdct.inc" |
25 | 25 |
26 ; -------------------------------------------------------------------------- | 26 ; -------------------------------------------------------------------------- |
27 SECTION SEG_CONST | 27 SECTION SEG_CONST |
; Constant table exported under the label jconst_fdct_float_3dnow.
; Each PD_* entry is emitted with "times 2 dd": the same 32-bit float
; stored twice, i.e. 8 bytes per entry.
; Numerically the four values equal, in order:
;   PD_0_382 = cos(3*pi/8)
;   PD_0_707 = cos(pi/4)
;   PD_0_541 = cos(pi/8) - cos(3*pi/8)
;   PD_1_306 = cos(pi/8) + cos(3*pi/8)
; NOTE(review): alignz, SEG_CONST, EXTN and (NEW column only) PRIVATE are
; not defined in this view -- presumably macros from jsimdext.inc; confirm
; their exact effect (padding fill, section name, symbol visibility) there.
28 | 28 |
29 alignz 16 | 29 alignz 16 |
30 global EXTN(jconst_fdct_float_3dnow) | 30 global EXTN(jconst_fdct_float_3dnow) PRIVATE |
31 | 31 |
32 EXTN(jconst_fdct_float_3dnow): | 32 EXTN(jconst_fdct_float_3dnow): |
33 | 33 |
34 PD_0_382 times 2 dd 0.382683432365089771728460 | 34 PD_0_382 times 2 dd 0.382683432365089771728460 |
35 PD_0_707 times 2 dd 0.707106781186547524400844 | 35 PD_0_707 times 2 dd 0.707106781186547524400844 |
36 PD_0_541 times 2 dd 0.541196100146196984399723 | 36 PD_0_541 times 2 dd 0.541196100146196984399723 |
37 PD_1_306 times 2 dd 1.306562964876376527856643 | 37 PD_1_306 times 2 dd 1.306562964876376527856643 |
38 | 38 |
; Pad after the table to the next 16-byte boundary (per the alignz operand).
39 alignz 16 | 39 alignz 16 |
40 | 40 |
41 ; -------------------------------------------------------------------------- | 41 ; -------------------------------------------------------------------------- |
42 SECTION SEG_TEXT | 42 SECTION SEG_TEXT |
43 BITS 32 | 43 BITS 32 |
44 ; | 44 ; |
45 ; Perform the forward DCT on one block of samples. | 45 ; Perform the forward DCT on one block of samples. |
46 ; | 46 ; |
47 ; GLOBAL(void) | 47 ; GLOBAL(void) |
48 ; jsimd_fdct_float_3dnow (FAST_FLOAT *data) | 48 ; jsimd_fdct_float_3dnow (FAST_FLOAT *data) |
49 ; | 49 ; |
50 | 50 |
; Stack-frame accessor macros for jsimd_fdct_float_3dnow.
;
; data(b): offset +8 from a base b. With b pointing at the saved ebp
; (as eax does right after "push ebp / mov eax,esp" in the prologue),
; +0 is the saved ebp, +4 the return address and +8 the first stack
; argument -- assuming the argument is passed on the stack; confirm
; against the calling convention used by the C caller.
51 %define data(b) (b)+8 ; FAST_FLOAT *data | 51 %define data(b) (b)+8 ; FAST_FLOAT *data |
52 | 52 |
; original_ebp: the slot at [ebp+0]; the prologue stores the
; pre-alignment frame base there ("mov [esp],eax" then "mov ebp,esp").
53 %define original_ebp ebp+0 | 53 %define original_ebp ebp+0 |
; wk(i): address expression for slot i of a WK_NUM-entry scratch array
; placed directly below ebp:
;   wk(0)        = ebp - WK_NUM*SIZEOF_MMWORD   (lowest address)
;   wk(WK_NUM-1) = ebp - SIZEOF_MMWORD          (highest address)
; NOTE(review): SIZEOF_MMWORD is not defined in this view; the prologue
; comment "align to 64 bits" suggests 8 bytes -- confirm in jsimdext.inc.
54 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] | 54 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] |
; Number of scratch slots reserved by "lea esp,[wk(0)]" in the prologue.
55 %define WK_NUM 2 | 55 %define WK_NUM 2 |
56 | 56 |
57 align 16 | 57 align 16 |
; NOTE(review): the NEW column appends PRIVATE to the global declaration.
; PRIVATE is not defined in this view -- presumably a jsimdext.inc macro
; affecting symbol visibility; confirm there.
58 global EXTN(jsimd_fdct_float_3dnow) | 58 global EXTN(jsimd_fdct_float_3dnow) PRIVATE |
59 | 59 |
60 EXTN(jsimd_fdct_float_3dnow): | 60 EXTN(jsimd_fdct_float_3dnow): |
; Prologue: build an ebp-based frame whose base is aligned to
; SIZEOF_MMWORD, while remembering the unaligned frame base so the
; epilogue can undo the alignment.
; After the push, esp points at the saved ebp; that address is what the
; existing comments call "original ebp".
61 push ebp | 61 push ebp |
62 mov eax,esp ; eax = original ebp | 62 mov eax,esp ; eax = original ebp |
; Reserve one dword, then mask esp with -SIZEOF_MMWORD. The dword at the
; resulting [esp] lies below the saved ebp and receives the unaligned
; frame base from eax; it becomes [original_ebp] once ebp = esp.
63 sub esp, byte 4 | 63 sub esp, byte 4 |
64 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits | 64 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits |
65 mov [esp],eax | 65 mov [esp],eax |
66 mov ebp,esp ; ebp = aligned ebp | 66 mov ebp,esp ; ebp = aligned ebp |
; esp = ebp - WK_NUM*SIZEOF_MMWORD: reserves the wk[] scratch slots.
67 lea esp, [wk(0)] | 67 lea esp, [wk(0)] |
; NOTE(review): pushpic/poppic are not defined in this view -- presumably
; macros that save/restore ebx only in PIC builds; confirm in jsimdext.inc.
68 pushpic ebx | 68 pushpic ebx |
(...skipping 242 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
311 ; pop ecx ; need not be preserved | 311 ; pop ecx ; need not be preserved |
312 poppic ebx | 312 poppic ebx |
; Epilogue: mirror of the prologue. Point esp back at the aligned frame
; base (discarding wk[]), reload the unaligned frame base stored at
; [ebp+0] into esp, then restore the caller's ebp and return.
313 mov esp,ebp ; esp <- aligned ebp | 313 mov esp,ebp ; esp <- aligned ebp |
314 pop esp ; esp <- original ebp | 314 pop esp ; esp <- original ebp |
315 pop ebp | 315 pop ebp |
316 ret | 316 ret |
317 | 317 |
318 ; For some reason, the OS X linker does not honor the request to align the | 318 ; For some reason, the OS X linker does not honor the request to align the |
319 ; segment unless we do this. | 319 ; segment unless we do this. |
320 align 16 | 320 align 16 |
OLD | NEW |