OLD | NEW |
1 ; | 1 ; |
2 ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2) | 2 ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
6 ; | 6 ; |
7 ; Based on | 7 ; Based on |
8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
(...skipping 20 matching lines...) Expand all Loading... |
31 %endmacro | 31 %endmacro |
32 | 32 |
33 %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) | 33 %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) |
34 shufps %1,%2,0xEE | 34 shufps %1,%2,0xEE |
35 %endmacro | 35 %endmacro |
36 | 36 |
37 ; -------------------------------------------------------------------------- | 37 ; -------------------------------------------------------------------------- |
38 SECTION SEG_CONST | 38 SECTION SEG_CONST |
39 | 39 |
40 alignz 16 | 40 alignz 16 |
41 global EXTN(jconst_idct_float_sse2) | 41 global EXTN(jconst_idct_float_sse2) PRIVATE |
42 | 42 |
43 EXTN(jconst_idct_float_sse2): | 43 EXTN(jconst_idct_float_sse2): |
44 | 44 |
45 PD_1_414 times 4 dd 1.414213562373095048801689 | 45 PD_1_414 times 4 dd 1.414213562373095048801689 |
46 PD_1_847 times 4 dd 1.847759065022573512256366 | 46 PD_1_847 times 4 dd 1.847759065022573512256366 |
47 PD_1_082 times 4 dd 1.082392200292393968799446 | 47 PD_1_082 times 4 dd 1.082392200292393968799446 |
48 PD_M2_613 times 4 dd -2.613125929752753055713286 | 48 PD_M2_613 times 4 dd -2.613125929752753055713286 |
49 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) | 49 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) |
50 PB_CENTERJSAMP times 16 db CENTERJSAMPLE | 50 PB_CENTERJSAMP times 16 db CENTERJSAMPLE |
51 | 51 |
(...skipping 15 matching lines...) Expand all Loading... |
67 ; r12 = JSAMPARRAY output_buf | 67 ; r12 = JSAMPARRAY output_buf |
68 ; r13 = JDIMENSION output_col | 68 ; r13 = JDIMENSION output_col |
69 | 69 |
70 %define original_rbp rbp+0 | 70 %define original_rbp rbp+0 |
71 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 71 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] |
72 %define WK_NUM 2 | 72 %define WK_NUM 2 |
73 %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT | 73 %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT |
74 ; FAST_FLOAT workspace[DCTSIZE2] | 74 ; FAST_FLOAT workspace[DCTSIZE2] |
75 | 75 |
76 align 16 | 76 align 16 |
77 global EXTN(jsimd_idct_float_sse2) | 77 global EXTN(jsimd_idct_float_sse2) PRIVATE |
78 | 78 |
79 EXTN(jsimd_idct_float_sse2): | 79 EXTN(jsimd_idct_float_sse2): |
80 push rbp | 80 push rbp |
81 mov rax,rsp ; rax = original rbp | 81 mov rax,rsp ; rax = original rbp |
82 sub rsp, byte 4 | 82 sub rsp, byte 4 |
83 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits | 83 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
84 mov [rsp],rax | 84 mov [rsp],rax |
85 mov rbp,rsp ; rbp = aligned rbp | 85 mov rbp,rsp ; rbp = aligned rbp |
86 lea rsp, [workspace] | 86 lea rsp, [workspace] |
87 collect_args | 87 collect_args |
(...skipping 386 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
474 pop rbx | 474 pop rbx |
475 uncollect_args | 475 uncollect_args |
476 mov rsp,rbp ; rsp <- aligned rbp | 476 mov rsp,rbp ; rsp <- aligned rbp |
477 pop rsp ; rsp <- original rbp | 477 pop rsp ; rsp <- original rbp |
478 pop rbp | 478 pop rbp |
479 ret | 479 ret |
480 | 480 |
481 ; For some reason, the OS X linker does not honor the request to align the | 481 ; For some reason, the OS X linker does not honor the request to align the |
482 ; segment unless we do this. | 482 ; segment unless we do this. |
483 align 16 | 483 align 16 |
OLD | NEW |