| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; jfdctflt.asm - floating-point FDCT (3DNow!) | 2 ; jfdctflt.asm - floating-point FDCT (3DNow!) |
| 3 ; | 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; | 5 ; |
| 6 ; Based on | 6 ; Based on |
| 7 ; x86 SIMD extension for IJG JPEG library | 7 ; x86 SIMD extension for IJG JPEG library |
| 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| 10 ; | 10 ; |
| 11 ; This file should be assembled with NASM (Netwide Assembler), | 11 ; This file should be assembled with NASM (Netwide Assembler), |
| 12 ; can *not* be assembled with Microsoft's MASM or any compatible | 12 ; can *not* be assembled with Microsoft's MASM or any compatible |
| 13 ; assembler (including Borland's Turbo Assembler). | 13 ; assembler (including Borland's Turbo Assembler). |
| 14 ; NASM is available from http://nasm.sourceforge.net/ or | 14 ; NASM is available from http://nasm.sourceforge.net/ or |
| 15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | 15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
| 16 ; | 16 ; |
| 17 ; This file contains a floating-point implementation of the forward DCT | 17 ; This file contains a floating-point implementation of the forward DCT |
| 18 ; (Discrete Cosine Transform). The following code is based directly on | 18 ; (Discrete Cosine Transform). The following code is based directly on |
| 19 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. | 19 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. |
| 20 ; | 20 ; |
| 21 ; [TAB8] | 21 ; [TAB8] |
| 22 | 22 |
| 23 %include "jsimdext.inc" | 23 %include "jsimdext.inc" |
| 24 %include "jdct.inc" | 24 %include "jdct.inc" |
| 25 | 25 |
| 26 ; -------------------------------------------------------------------------- | 26 ; -------------------------------------------------------------------------- |
| 27 SECTION SEG_CONST | 27 SECTION SEG_CONST |
| 28 | 28 |
| 29 alignz 16 | 29 alignz 16 |
| 30 global EXTN(jconst_fdct_float_3dnow) | 30 global EXTN(jconst_fdct_float_3dnow) PRIVATE |
| 31 | 31 |
| 32 EXTN(jconst_fdct_float_3dnow): | 32 EXTN(jconst_fdct_float_3dnow): |
| 33 | 33 |
| 34 PD_0_382 times 2 dd 0.382683432365089771728460 | 34 PD_0_382 times 2 dd 0.382683432365089771728460 |
| 35 PD_0_707 times 2 dd 0.707106781186547524400844 | 35 PD_0_707 times 2 dd 0.707106781186547524400844 |
| 36 PD_0_541 times 2 dd 0.541196100146196984399723 | 36 PD_0_541 times 2 dd 0.541196100146196984399723 |
| 37 PD_1_306 times 2 dd 1.306562964876376527856643 | 37 PD_1_306 times 2 dd 1.306562964876376527856643 |
| 38 | 38 |
| 39 alignz 16 | 39 alignz 16 |
| 40 | 40 |
| 41 ; -------------------------------------------------------------------------- | 41 ; -------------------------------------------------------------------------- |
| 42 SECTION SEG_TEXT | 42 SECTION SEG_TEXT |
| 43 BITS 32 | 43 BITS 32 |
| 44 ; | 44 ; |
| 45 ; Perform the forward DCT on one block of samples. | 45 ; Perform the forward DCT on one block of samples. |
| 46 ; | 46 ; |
| 47 ; GLOBAL(void) | 47 ; GLOBAL(void) |
| 48 ; jsimd_fdct_float_3dnow (FAST_FLOAT *data) | 48 ; jsimd_fdct_float_3dnow (FAST_FLOAT *data) |
| 49 ; | 49 ; |
| 50 | 50 |
| 51 %define data(b) (b)+8 ; FAST_FLOAT *data | 51 %define data(b) (b)+8 ; FAST_FLOAT *data |
| 52 | 52 |
| 53 %define original_ebp ebp+0 | 53 %define original_ebp ebp+0 |
| 54 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] | 54 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] |
| 55 %define WK_NUM 2 | 55 %define WK_NUM 2 |
| 56 | 56 |
| 57 align 16 | 57 align 16 |
| 58 global EXTN(jsimd_fdct_float_3dnow) | 58 global EXTN(jsimd_fdct_float_3dnow) PRIVATE |
| 59 | 59 |
| 60 EXTN(jsimd_fdct_float_3dnow): | 60 EXTN(jsimd_fdct_float_3dnow): |
| 61 push ebp | 61 push ebp |
| 62 mov eax,esp ; eax = original ebp | 62 mov eax,esp ; eax = original ebp |
| 63 sub esp, byte 4 | 63 sub esp, byte 4 |
| 64 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits | 64 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits |
| 65 mov [esp],eax | 65 mov [esp],eax |
| 66 mov ebp,esp ; ebp = aligned ebp | 66 mov ebp,esp ; ebp = aligned ebp |
| 67 lea esp, [wk(0)] | 67 lea esp, [wk(0)] |
| 68 pushpic ebx | 68 pushpic ebx |
| (... 242 matching lines skipped ...) | (... 242 matching lines skipped ...) |
| 311 ; pop ecx ; need not be preserved | 311 ; pop ecx ; need not be preserved |
| 312 poppic ebx | 312 poppic ebx |
| 313 mov esp,ebp ; esp <- aligned ebp | 313 mov esp,ebp ; esp <- aligned ebp |
| 314 pop esp ; esp <- original ebp | 314 pop esp ; esp <- original ebp |
| 315 pop ebp | 315 pop ebp |
| 316 ret | 316 ret |
| 317 | 317 |
| 318 ; For some reason, the OS X linker does not honor the request to align the | 318 ; For some reason, the OS X linker does not honor the request to align the |
| 319 ; segment unless we do this. | 319 ; segment unless we do this. |
| 320 align 16 | 320 align 16 |
| OLD | NEW |