OLD | NEW |
1 ; | 1 ; |
2 ; jfdctflt.asm - floating-point FDCT (64-bit SSE) | 2 ; jfdctflt.asm - floating-point FDCT (64-bit SSE) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
6 ; | 6 ; |
7 ; Based on | 7 ; Based on |
8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
(...skipping 20 matching lines...) |
31 %endmacro | 31 %endmacro |
32 | 32 |
33 %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7): the upper two elements of %1 followed by the upper two elements of %2, selected by the shufps immediate 0xEE | 33 %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7): the upper two elements of %1 followed by the upper two elements of %2, selected by the shufps immediate 0xEE |
34 shufps %1,%2,0xEE | 34 shufps %1,%2,0xEE |
35 %endmacro | 35 %endmacro |
36 | 36 |
37 ; -------------------------------------------------------------------------- | 37 ; -------------------------------------------------------------------------- |
38 SECTION SEG_CONST | 38 SECTION SEG_CONST |
39 | 39 |
40 alignz 16 | 40 alignz 16 |
41 global EXTN(jconst_fdct_float_sse) | 41 global EXTN(jconst_fdct_float_sse) PRIVATE |
42 | 42 |
43 EXTN(jconst_fdct_float_sse): | 43 EXTN(jconst_fdct_float_sse): |
44 | 44 |
45 PD_0_382 times 4 dd 0.382683432365089771728460 | 45 PD_0_382 times 4 dd 0.382683432365089771728460 |
46 PD_0_707 times 4 dd 0.707106781186547524400844 | 46 PD_0_707 times 4 dd 0.707106781186547524400844 |
47 PD_0_541 times 4 dd 0.541196100146196984399723 | 47 PD_0_541 times 4 dd 0.541196100146196984399723 |
48 PD_1_306 times 4 dd 1.306562964876376527856643 | 48 PD_1_306 times 4 dd 1.306562964876376527856643 |
49 | 49 |
50 alignz 16 | 50 alignz 16 |
51 | 51 |
52 ; -------------------------------------------------------------------------- | 52 ; -------------------------------------------------------------------------- |
53 SECTION SEG_TEXT | 53 SECTION SEG_TEXT |
54 BITS 64 | 54 BITS 64 |
55 ; | 55 ; |
56 ; Perform the forward DCT on one block of samples. | 56 ; Perform the forward DCT on one block of samples. |
57 ; | 57 ; |
58 ; GLOBAL(void) | 58 ; GLOBAL(void) |
59 ; jsimd_fdct_float_sse (FAST_FLOAT *data) | 59 ; jsimd_fdct_float_sse (FAST_FLOAT *data) |
60 ; | 60 ; |
61 | 61 |
62 ; r10 = FAST_FLOAT *data | 62 ; r10 = FAST_FLOAT *data |
63 | 63 |
64 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 64 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] |
65 %define WK_NUM 2 | 65 %define WK_NUM 2 |
66 | 66 |
67 align 16 | 67 align 16 |
68 global EXTN(jsimd_fdct_float_sse) | 68 global EXTN(jsimd_fdct_float_sse) PRIVATE |
69 | 69 |
70 EXTN(jsimd_fdct_float_sse): | 70 EXTN(jsimd_fdct_float_sse): |
71 push rbp | 71 push rbp |
72 mov rax,rsp ; rax = original rbp | 72 mov rax,rsp ; rax = original rbp |
73 sub rsp, byte 4 | 73 sub rsp, byte 4 |
74 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits | 74 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
75 mov [rsp],rax | 75 mov [rsp],rax |
76 mov rbp,rsp ; rbp = aligned rbp | 76 mov rbp,rsp ; rbp = aligned rbp |
77 lea rsp, [wk(0)] | 77 lea rsp, [wk(0)] |
78 collect_args | 78 collect_args |
(...skipping 270 matching lines...) |
349 | 349 |
350 uncollect_args | 350 uncollect_args |
351 mov rsp,rbp ; rsp <- aligned rbp | 351 mov rsp,rbp ; rsp <- aligned rbp |
352 pop rsp ; rsp <- original rbp | 352 pop rsp ; rsp <- original rbp |
353 pop rbp | 353 pop rbp |
354 ret | 354 ret |
355 | 355 |
356 ; For some reason, the OS X linker does not honor the request to align the | 356 ; For some reason, the OS X linker does not honor the request to align the |
357 ; segment unless we do this. | 357 ; segment unless we do this. |
358 align 16 | 358 align 16 |
OLD | NEW |