OLD | NEW |
| (Empty) |
1 ; | |
2 ; jcqnts2f-64.asm - sample data conversion and quantization (64-bit SSE & SSE2) | |
3 ; | |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
5 ; Copyright 2009 D. R. Commander | |
6 ; | |
7 ; Based on | |
8 ; x86 SIMD extension for IJG JPEG library | |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | |
11 ; | |
12 ; This file should be assembled with NASM (Netwide Assembler), | |
13 ; can *not* be assembled with Microsoft's MASM or any compatible | |
14 ; assembler (including Borland's Turbo Assembler). | |
15 ; NASM is available from http://nasm.sourceforge.net/ or | |
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | |
17 ; | |
18 ; [TAB8] | |
19 | |
20 %include "jsimdext.inc" | |
21 %include "jdct.inc" | |
22 | |
23 ; -------------------------------------------------------------------------- | |
; Place everything that follows in the section named by SEG_TEXT and assemble
; it as 64-bit code.  SEG_TEXT is not defined in this file; presumably it
; comes from one of the %include'd files -- confirm there.
24 SECTION SEG_TEXT | |
25 BITS 64 | |
;
; Load sample data into the workspace as floats, applying the
; unsigned->signed conversion on the way.
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT * workspace);
;
; Each loop pass reads eight bytes (one 64-bit load) from each of two
; consecutive row pointers, starting at column start_col, subtracts 0x80 from
; every byte, sign-extends the bytes to 32-bit integers, converts them to
; single-precision floats and stores them with aligned 16-byte stores.
; The loop runs DCTSIZE/2 times.
;
; Arguments after collect_args (a macro defined outside this file):
;   r10 = JSAMPARRAY sample_data
;   r11 = JDIMENSION start_col
;   r12 = FAST_FLOAT * workspace   (written with movaps, so it must be
;                                   16-byte aligned)
;
; Registers used inside the body:
;   rsi = pointer to the current pair of row pointers
;   rax = start_col, zero-extended to 64 bits
;   rdi = current output position in workspace
;   rcx = number of row pairs still to do
;   rbx, rdx = the two row pointers of the current pair (rbx is saved/restored)
;   xmm7 = 0x80 in every byte
;

        align   16
        global  EXTN(jsimd_convsamp_float_sse2) PRIVATE

EXTN(jsimd_convsamp_float_sse2):
        push    rbp
        mov     rax, rsp                ; rax = rsp at this point; NOTE(review):
                                        ; presumably read by collect_args on
                                        ; some targets -- confirm in its
                                        ; definition before moving this line
        mov     rbp, rsp
        collect_args
        push    rbx

        mov     rsi, r10                ; rsi -> sample_data[0]
        mov     eax, r11d               ; 32-bit move clears the upper half of rax
        mov     rdi, r12                ; rdi -> workspace
        mov     rcx, DCTSIZE/2          ; two rows are handled per pass

        ; Build the byte constant 0x80 without touching memory.
        pcmpeqw  xmm7, xmm7             ; every word = 0xFFFF
        psllw    xmm7, 7                ; every word = 0xFF80 (-128)
        packsswb xmm7, xmm7             ; saturating pack: every byte = 0x80

.row_pair:
        ; ---- fetch eight samples from each of the two rows ----
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *) first row
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *) second row

        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

        ; ---- first row: samples 0..7 ----
        psubb     xmm0, xmm7            ; bytes re-centred: xmm0=(01234567)
        punpcklbw xmm0, xmm0            ; each byte doubled into a word
        punpcklwd xmm2, xmm0            ; samples 0-3 land in the top byte of
                                        ; each dword; the low word of each
                                        ; dword is stale xmm2 data, discarded
                                        ; by the shift below
        punpckhwd xmm0, xmm0            ; samples 4-7 in the top byte of each dword
        psrad     xmm2, (DWORD_BIT-BYTE_BIT)    ; arithmetic shift: xmm2=(0123) as int32
        psrad     xmm0, (DWORD_BIT-BYTE_BIT)    ; xmm0=(4567) as int32
        cvtdq2ps  xmm2, xmm2            ; xmm2=(0123) as float
        cvtdq2ps  xmm0, xmm0            ; xmm0=(4567) as float

        ; ---- second row: samples 8..F ----
        psubb     xmm1, xmm7            ; bytes re-centred: xmm1=(89ABCDEF)
        punpcklbw xmm1, xmm1            ; each byte doubled into a word
        punpcklwd xmm3, xmm1            ; samples 8-B in the top byte of each dword
        punpckhwd xmm1, xmm1            ; samples C-F in the top byte of each dword
        psrad     xmm3, (DWORD_BIT-BYTE_BIT)    ; xmm3=(89AB) as int32
        psrad     xmm1, (DWORD_BIT-BYTE_BIT)    ; xmm1=(CDEF) as int32
        cvtdq2ps  xmm3, xmm3            ; xmm3=(89AB) as float
        cvtdq2ps  xmm1, xmm1            ; xmm1=(CDEF) as float

        ; ---- write both converted rows to the workspace ----
        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

        ; ---- step to the next pair of rows ----
        add     rsi, byte 2*SIZEOF_JSAMPROW
        add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        dec     rcx
        jnz     short .row_pair

        pop     rbx
        uncollect_args
        pop     rbp
        ret
97 | |
98 | |
; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                            FAST_FLOAT * workspace);
;
; Each loop pass multiplies 16 workspace floats by the 16 matching entries of
; divisors, converts the products to 32-bit integers (cvtps2dq, which rounds
; according to the current MXCSR rounding mode), narrows them to 16 bits with
; signed saturation and stores them to coef_block.  The loop runs
; DCTSIZE2/16 times.
;
; Arguments after collect_args (a macro defined outside this file):
;   r10 = JCOEFPTR coef_block
;   r11 = FAST_FLOAT * divisors
;   r12 = FAST_FLOAT * workspace
;
; All three buffers are accessed with aligned 16-byte loads/stores
; (movaps / mulps memory operand / movdqa), so each must be 16-byte aligned.
;
; Registers used inside the body:
;   rsi = current read position in workspace
;   rdx = current read position in divisors
;   rdi = current write position in coef_block
;   rax = loop passes still to do
;

        align   16
        global  EXTN(jsimd_quantize_float_sse2) PRIVATE

EXTN(jsimd_quantize_float_sse2):
        push    rbp
        mov     rax, rsp                ; rax = rsp at this point; NOTE(review):
                                        ; presumably read by collect_args on
                                        ; some targets -- confirm in its
                                        ; definition before moving this line
        mov     rbp, rsp
        collect_args

        mov     rsi, r12                ; rsi -> workspace
        mov     rdx, r11                ; rdx -> divisors
        mov     rdi, r10                ; rdi -> coef_block
        mov     rax, DCTSIZE2/16        ; 16 coefficients are handled per pass

.next_rows:
        ; ---- load 16 workspace values and scale them by the divisors ----
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]

        ; ---- first group of eight: float -> int32 -> int16 ----
        cvtps2dq xmm0, xmm0
        cvtps2dq xmm1, xmm1
        packssdw xmm0, xmm1             ; eight saturated 16-bit values in xmm0

        ; ---- second group of eight: float -> int32 -> int16 ----
        cvtps2dq xmm2, xmm2
        cvtps2dq xmm3, xmm3
        packssdw xmm2, xmm3             ; eight saturated 16-bit values in xmm2

        ; ---- store the 16 quantized coefficients ----
        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

        ; ---- advance all three pointers by 16 elements ----
        add     rsi, byte 16*SIZEOF_FAST_FLOAT
        add     rdx, byte 16*SIZEOF_FAST_FLOAT
        add     rdi, byte 16*SIZEOF_JCOEF
        dec     rax
        jnz     short .next_rows

        uncollect_args
        pop     rbp
        ret
155 | |
156 ; For some reason, the OS X linker does not honor the request to align the | |
157 ; segment unless we do this. | |
; The directive below pads the current section out to a 16-byte boundary;
; no code is visible after it in this file.
158 align 16 | |
OLD | NEW |