OLD | NEW |
| (Empty) |
;
; jcqnts2i-64.asm - sample data conversion and quantization (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
19 | |
20 %include "jsimdext.inc" | |
21 %include "jdct.inc" | |
22 | |
; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; jsimd_convsamp_sse2
;
; Copy a block of 8-bit samples into the DCT workspace, widening every
; sample to a 16-bit word and converting it from unsigned to signed by
; adding 0xFF80 (i.e. subtracting 128).
;
; GLOBAL(void)
; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                      DCTELEM * workspace);
;
; Argument registers once collect_args has run:
;   r10 = JSAMPARRAY sample_data   (array of row pointers)
;   r11 = JDIMENSION start_col     (sample offset applied to every row)
;   r12 = DCTELEM * workspace      (written with movdqa, so it has to be
;                                   16-byte aligned)
;
; Each loop pass converts four rows of eight samples; the loop runs
; DCTSIZE/4 times.
;

        align   16
        global  EXTN(jsimd_convsamp_sse2) PRIVATE

EXTN(jsimd_convsamp_sse2):
        push    rbp
        mov     rbp, rsp
        mov     rax, rsp                ; rax = rsp too; set ahead of collect_args
                                        ; (presumably read by that macro)
        collect_args
        push    rbx                     ; rbx serves as a row pointer below

        ; Loop-invariant constants.
        pcmpeqw xmm7, xmm7              ; xmm7 = all 1's
        psllw   xmm7, 7                 ; xmm7 = {0xFF80 0xFF80 0xFF80 0xFF80 ..}
        pxor    xmm6, xmm6              ; xmm6 = all 0's (high bytes when widening)

        mov     rcx, DCTSIZE/4          ; rcx = passes left (4 rows per pass)
        mov     rdi, r12                ; rdi -> next output rows in workspace
        mov     rsi, r10                ; rsi -> next row pointer in sample_data
        mov     eax, r11d               ; rax = start_col, zero-extended

.next_rows:
        ; Fetch eight samples from each of the next four rows.
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]      ; (JSAMPLE *)
        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]      ; (JSAMPLE *)
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
        mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]      ; (JSAMPLE *)
        movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
        mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]      ; (JSAMPLE *)
        movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

        ; Zero-extend bytes to words, then apply the -128 bias.
        punpcklbw xmm0, xmm6            ; xmm0=(01234567) as words
        paddw     xmm0, xmm7
        punpcklbw xmm1, xmm6            ; xmm1=(89ABCDEF) as words
        paddw     xmm1, xmm7
        punpcklbw xmm2, xmm6            ; xmm2=(GHIJKLMN) as words
        paddw     xmm2, xmm7
        punpcklbw xmm3, xmm6            ; xmm3=(OPQRSTUV) as words
        paddw     xmm3, xmm7

        ; Write the four converted rows to the workspace.
        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

        ; Step both cursors forward by four rows.
        add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
        add     rsi, byte 4*SIZEOF_JSAMPROW
        dec     rcx
        jnz     short .next_rows

        pop     rbx
        uncollect_args
        pop     rbp
        ret
92 | |
; --------------------------------------------------------------------------
;
; jsimd_quantize_sse2
;
; Quantize/descale the coefficients in the workspace and store the results
; into coef_block.
;
; The division is replaced by multiplications, following an algorithm
; described in "How to optimize for the Pentium family of microprocessors"
; (http://www.agner.org/assem/).  For every coefficient x:
;
;     sign   = x >> (WORD_BIT-1)            (arithmetic shift: 0 or all 1's)
;     mag    = (x ^ sign) - sign            (absolute value)
;     mag    = high16((mag + correction) * reciprocal)
;     mag    = high16(mag * scale)
;     result = (mag ^ sign) - sign          (sign put back)
;
; GLOBAL(void)
; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
;                      DCTELEM * workspace);
;

; The divisors argument is addressed as three sub-tables whose XMMBLOCK row
; indices start DCTSIZE rows apart: reciprocals, corrections, then scales.
%define RECIPROCAL(m,n,b)  XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b)  XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)       XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

; Argument registers once collect_args has run:
;   r10 = JCOEFPTR coef_block      (output)
;   r11 = DCTELEM * divisors       (tables described above)
;   r12 = DCTELEM * workspace      (input)
; All three are accessed with aligned 16-byte operations.

        align   16
        global  EXTN(jsimd_quantize_sse2) PRIVATE

EXTN(jsimd_quantize_sse2):
        push    rbp
        mov     rbp, rsp
        mov     rax, rsp                ; rax = rsp too; set ahead of collect_args
                                        ; (presumably read by that macro)
        collect_args

        mov     rdi, r10                ; rdi -> coef_block cursor
        mov     rdx, r11                ; rdx -> divisors cursor
        mov     rsi, r12                ; rsi -> workspace cursor
        mov     rax, DCTSIZE2/32        ; rax = passes left (32 elements per pass)

.next_chunk:
        ; Load four rows of coefficients; xmm4-xmm7 will turn into sign masks.
        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]

        ; Row 0: split into sign mask (xmm4) and magnitude (xmm0).
        movdqa  xmm0, xmm4
        psraw   xmm4, (WORD_BIT-1)
        pxor    xmm0, xmm4
        psubw   xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;

        ; Row 1: sign mask in xmm5, magnitude in xmm1.
        movdqa  xmm1, xmm5
        psraw   xmm5, (WORD_BIT-1)
        pxor    xmm1, xmm5
        psubw   xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;

        ; Row 2: sign mask in xmm6, magnitude in xmm2.
        movdqa  xmm2, xmm6
        psraw   xmm6, (WORD_BIT-1)
        pxor    xmm2, xmm6
        psubw   xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;

        ; Row 3: sign mask in xmm7, magnitude in xmm3.
        movdqa  xmm3, xmm7
        psraw   xmm7, (WORD_BIT-1)
        pxor    xmm3, xmm7
        psubw   xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

        ; Divide each magnitude: add the correction (+ roundfactor), then keep
        ; the high word of the product with the reciprocal and with the scale.
        paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]
        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]
        pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]

        paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
        pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]

        paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
        pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]

        paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]
        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
        pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]

        ; Put the original signs back: (q ^ sign) - sign.
        pxor    xmm0, xmm4
        psubw   xmm0, xmm4
        pxor    xmm1, xmm5
        psubw   xmm1, xmm5
        pxor    xmm2, xmm6
        psubw   xmm2, xmm6
        pxor    xmm3, xmm7
        psubw   xmm3, xmm7

        ; Store the four quantized rows.
        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

        ; Advance all three cursors by the 32 elements just handled.
        add     rdi, byte 32*SIZEOF_JCOEF
        add     rdx, byte 32*SIZEOF_DCTELEM
        add     rsi, byte 32*SIZEOF_DCTELEM
        dec     rax
        jnz     near .next_chunk

        uncollect_args
        pop     rbp
        ret

; Keep this trailing directive: for some reason the OS X linker does not
; honor the request to align the segment without it.
        align   16
OLD | NEW |