OLD | NEW |
1 ; | 1 ; |
2 ; jfdctfst.asm - fast integer FDCT (64-bit SSE2) | 2 ; jfdctfst.asm - fast integer FDCT (64-bit SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
6 ; | 6 ; |
7 ; Based on | 7 ; Based on |
8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
46 ; -------------------------------------------------------------------------- | 46 ; -------------------------------------------------------------------------- |
47 SECTION SEG_CONST | 47 SECTION SEG_CONST |
48 | 48 |
49 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) | 49 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) |
50 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) | 50 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) |
51 | 51 |
; Fixed-point scaling used by the multiplier table below. CONST_SHIFT is
; derived from the other two quantities so that
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS adds up to 16.
; CONST_BITS is defined outside the lines visible in this excerpt.
52 %define PRE_MULTIPLY_SCALE_BITS 2 | 52 %define PRE_MULTIPLY_SCALE_BITS 2 |
53 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | 53 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) |
54 | 54 |
; alignz is a macro defined outside this excerpt (presumably pads to a
; 16-byte boundary -- confirm in the include file).
55 alignz 16 | 55 alignz 16 |
; Exported label marking the start of the constant table. The NEW column
; appends PRIVATE to the directive; PRIVATE is not defined in the visible
; lines (presumably a symbol-visibility macro -- confirm in the include file).
56 global EXTN(jconst_fdct_ifast_sse2) | 56 global EXTN(jconst_fdct_ifast_sse2) PRIVATE |
57 | 57 |
58 EXTN(jconst_fdct_ifast_sse2): | 58 EXTN(jconst_fdct_ifast_sse2): |
59 | 59 |
; Each PW_* row emits the same 16-bit word eight times (times 8 dw), i.e.
; 16 bytes per row. The F_* multipliers are defined outside this excerpt
; and are shifted left by CONST_SHIFT before being stored.
60 PW_F0707 times 8 dw F_0_707 << CONST_SHIFT | 60 PW_F0707 times 8 dw F_0_707 << CONST_SHIFT |
61 PW_F0382 times 8 dw F_0_382 << CONST_SHIFT | 61 PW_F0382 times 8 dw F_0_382 << CONST_SHIFT |
62 PW_F0541 times 8 dw F_0_541 << CONST_SHIFT | 62 PW_F0541 times 8 dw F_0_541 << CONST_SHIFT |
63 PW_F1306 times 8 dw F_1_306 << CONST_SHIFT | 63 PW_F1306 times 8 dw F_1_306 << CONST_SHIFT |
64 | 64 |
65 alignz 16 | 65 alignz 16 |
66 | 66 |
67 ; -------------------------------------------------------------------------- | 67 ; -------------------------------------------------------------------------- |
68 SECTION SEG_TEXT | 68 SECTION SEG_TEXT |
69 BITS 64 | 69 BITS 64 |
70 ; | 70 ; |
71 ; Perform the forward DCT on one block of samples. | 71 ; Perform the forward DCT on one block of samples. |
72 ; | 72 ; |
73 ; GLOBAL(void) | 73 ; GLOBAL(void) |
74 ; jsimd_fdct_ifast_sse2 (DCTELEM *data) | 74 ; jsimd_fdct_ifast_sse2 (DCTELEM *data) |
75 ; | 75 ; |
76 | 76 |
77 ; r10 = DCTELEM *data | 77 ; r10 = DCTELEM *data |
78 | 78 |
; wk(i) is the address expression for scratch slot i, each slot being
; SIZEOF_XMMWORD bytes. The slots lie directly below rbp: wk(0) expands to
; rbp - WK_NUM*SIZEOF_XMMWORD (the lowest slot) and wk(WK_NUM-1) to
; rbp - SIZEOF_XMMWORD. WK_NUM is the number of slots.
79 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 79 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] |
80 %define WK_NUM 2 | 80 %define WK_NUM 2 |
81 | 81 |
; Entry point for the forward DCT routine. Only the prologue and epilogue
; are visible in this excerpt; the transform body is elided by the review
; tool, so the notes below describe the stack frame handling only.
; The NEW column appends PRIVATE to the global directive; PRIVATE is not
; defined in the visible lines (presumably a symbol-visibility macro --
; confirm in the include file).
82 align 16 | 82 align 16 |
83 global EXTN(jsimd_fdct_ifast_sse2) | 83 global EXTN(jsimd_fdct_ifast_sse2) PRIVATE |
84 | 84 |
85 EXTN(jsimd_fdct_ifast_sse2): |  85 EXTN(jsimd_fdct_ifast_sse2): |
; Prologue: build a frame whose base (rbp) is aligned to SIZEOF_XMMWORD.
;   1. The caller's rbp is pushed, and rax captures rsp as it stands right
;      after that push. The inline comments call this value the
;      "original rbp"; what is actually copied is the stack pointer.
;   2. rsp is lowered by 4 and masked down to a SIZEOF_XMMWORD boundary;
;      the value held in rax is stored at that aligned address.
;   3. rbp is set to the aligned address and rsp is moved down to wk(0),
;      which leaves the WK_NUM scratch slots between rsp and rbp.
; NOTE(review): only 4 bytes are subtracted before masking although an
; 8-byte value is stored at [rsp]. That cannot overlap the pushed rbp as
; long as rsp is at least 8-byte aligned at this point -- confirm that the
; calling convention in use guarantees this.
; collect_args is a macro defined outside this excerpt; presumably it makes
; the data pointer available in r10 -- confirm against the macro.
86 push rbp | 86 push rbp |
87 mov rax,rsp ; rax = original rbp | 87 mov rax,rsp ; rax = original rbp |
88 sub rsp, byte 4 | 88 sub rsp, byte 4 |
89 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits | 89 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
90 mov [rsp],rax | 90 mov [rsp],rax |
91 mov rbp,rsp ; rbp = aligned rbp | 91 mov rbp,rsp ; rbp = aligned rbp |
92 lea rsp, [wk(0)] | 92 lea rsp, [wk(0)] |
93 collect_args | 93 collect_args |
(...skipping 289 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
383 | 383 |
; Epilogue: undo the prologue. uncollect_args is the counterpart macro to
; collect_args (defined outside this excerpt). rsp is pointed back at the
; aligned slot addressed by rbp, "pop rsp" reloads the stack pointer value
; that the prologue stored there, and the final pop restores the caller's
; rbp before returning. This assumes the elided body leaves rbp and the
; quadword at [rbp] untouched -- confirm against the full file.
384 uncollect_args | 384 uncollect_args |
385 mov rsp,rbp ; rsp <- aligned rbp | 385 mov rsp,rbp ; rsp <- aligned rbp |
386 pop rsp ; rsp <- original rbp | 386 pop rsp ; rsp <- original rbp |
387 pop rbp | 387 pop rbp |
388 ret | 388 ret |
389 | 389 |
390 ; For some reason, the OS X linker does not honor the request to align the | 390 ; For some reason, the OS X linker does not honor the request to align the |
391 ; segment unless we do this. | 391 ; segment unless we do this. |
392 align 16 | 392 align 16 |
OLD | NEW |