;
; jcqntmmx.asm - sample data conversion and quantization (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
	SECTION	SEG_TEXT
	BITS	32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM * workspace);
;
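; For reference, a plain-C sketch of the effect of this routine (an
; illustration only: convsamp_ref is not a library function, and the
; sketch assumes the usual IJG definitions, where GETJSAMPLE yields the
; unsigned sample value and CENTERJSAMPLE is 128):
;
; void convsamp_ref (JSAMPARRAY sample_data, JDIMENSION start_col,
;                    DCTELEM * workspace)
; {
;   int row, col;
;
;   for (row = 0; row < DCTSIZE; row++)
;     for (col = 0; col < DCTSIZE; col++)
;       *workspace++ = (DCTELEM)
;         (GETJSAMPLE(sample_data[row][start_col + col]) - CENTERJSAMPLE);
; }
;
; The MMX code gets the same result eight samples at a time by
; zero-extending bytes to words (punpcklbw/punpckhbw against zero) and
; adding 0xFF80 (= -128) to every word.
;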

%define sample_data	ebp+8		; JSAMPARRAY sample_data
%define start_col	ebp+12		; JDIMENSION start_col
%define workspace	ebp+16		; DCTELEM * workspace

	align	16
	global	EXTN(jsimd_convsamp_mmx) PRIVATE

EXTN(jsimd_convsamp_mmx):
	push	ebp
	mov	ebp,esp
	push	ebx
;	push	ecx		; need not be preserved
;	push	edx		; need not be preserved
	push	esi
	push	edi

	pxor	mm6,mm6			; mm6=(all 0's)
	pcmpeqw	mm7,mm7
	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
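					; (pcmpeqw sets every bit of mm7;
					;  the left shift by 7 then leaves
					;  -128 = -CENTERJSAMPLE in each of
					;  the four 16-bit lanes)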

	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
	mov	eax, JDIMENSION [start_col]
	mov	edi, POINTER [workspace]	; (DCTELEM *)
	mov	ecx, DCTSIZE/4
	alignx	16,7
.convloop:
	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)

	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm0=(01234567)
	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm1=(89ABCDEF)

	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)

	movq	mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm2=(GHIJKLMN)
	movq	mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm3=(OPQRSTUV)

	movq	mm4,mm0
	punpcklbw mm0,mm6		; mm0=(0123)
	punpckhbw mm4,mm6		; mm4=(4567)
	movq	mm5,mm1
	punpcklbw mm1,mm6		; mm1=(89AB)
	punpckhbw mm5,mm6		; mm5=(CDEF)

	paddw	mm0,mm7
	paddw	mm4,mm7
	paddw	mm1,mm7
	paddw	mm5,mm7

	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5

	movq	mm0,mm2
	punpcklbw mm2,mm6		; mm2=(GHIJ)
	punpckhbw mm0,mm6		; mm0=(KLMN)
	movq	mm4,mm3
	punpcklbw mm3,mm6		; mm3=(OPQR)
	punpckhbw mm4,mm6		; mm4=(STUV)

	paddw	mm2,mm7
	paddw	mm0,mm7
	paddw	mm3,mm7
	paddw	mm4,mm7

	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4

	add	esi, byte 4*SIZEOF_JSAMPROW
	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
	dec	ecx
	jnz	short .convloop

	emms		; empty MMX state

	pop	edi
	pop	esi
;	pop	edx		; need not be preserved
;	pop	ecx		; need not be preserved
	pop	ebx
	pop	ebp
	ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
; "How to optimize for the Pentium family of microprocessors"
; (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
;                     DCTELEM * workspace);
;
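; As a reading aid, here is a rough scalar sketch of what the loop below
; computes for one coefficient (illustrative only: quantize_one and its
; parameter names are not part of the library, and the divisor tables
; are built elsewhere):
;
; short quantize_one (short coef, unsigned short recip,
;                     unsigned short corr, unsigned short scale)
; {
;   int neg = (coef < 0);
;   unsigned short val = (unsigned short) (neg ? -coef : coef);
;   unsigned long product;
;
;   val += corr;                          /* pre-added rounding term  */
;   product = ((unsigned long) val * recip) >> 16;
;   product = (product * scale) >> 16;    /* multiply stands in for a */
;                                         /* per-element shift        */
;   return neg ? -(short) product : (short) product;
; }
;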

%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
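; (the divisor table thus consists of four consecutive DCTSIZE2-element
;  blocks: reciprocals, corrections, scales, and shift counts; this MMX
;  code uses the scale block instead of the shifts, as explained below)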

%define coef_block	ebp+8		; JCOEFPTR coef_block
%define divisors	ebp+12		; DCTELEM * divisors
%define workspace	ebp+16		; DCTELEM * workspace

	align	16
	global	EXTN(jsimd_quantize_mmx) PRIVATE

EXTN(jsimd_quantize_mmx):
	push	ebp
	mov	ebp,esp
;	push	ebx		; unused
;	push	ecx		; unused
;	push	edx		; need not be preserved
	push	esi
	push	edi

	mov	esi, POINTER [workspace]
	mov	edx, POINTER [divisors]
	mov	edi, JCOEFPTR [coef_block]
	mov	ah, 2
	alignx	16,7
.quantloop1:
	mov	al, DCTSIZE2/8/2
	alignx	16,7
.quantloop2:
	movq	mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
	movq	mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]

	movq	mm0,mm2
	movq	mm1,mm3

	psraw	mm2,(WORD_BIT-1)	; -1 if value < 0, 0 otherwise
	psraw	mm3,(WORD_BIT-1)

	pxor	mm0,mm2			; val = abs(val):
	pxor	mm1,mm3			;  (val ^ mask) - mask negates the
	psubw	mm0,mm2			;  negative values and leaves the
	psubw	mm1,mm3			;  non-negative ones untouched

;
; MMX is an annoyingly crappy instruction set. It has two
; misfeatures that are causing problems here:
;
; - All multiplications are signed.
;
; - The second operand for the shifts is not treated as packed.
;
;
; We work around the first problem by implementing this algorithm:
;
; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
; {
;   enum { SHORT_BIT = 16 };
;   signed short sx = (signed short) x;
;   signed short sy = (signed short) y;
;   signed long sz;
;
;   sz = (long) sx * (long) sy;	/* signed multiply */
;
;   if (sx < 0) sz += (long) sy << SHORT_BIT;
;   if (sy < 0) sz += (long) sx << SHORT_BIT;
;
;   return (unsigned long) sz;
; }
;
; (note that a negative sx adds _sy_ and vice versa)
;
; For the second problem, we replace the shift by a multiplication.
; Unfortunately that means we have to deal with the signed issue again.
;
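; The replacement works because the SCALE entries are chosen so that a
; right shift by n can be computed as the high word of a multiplication
; by 2^(SHORT_BIT - n).  A sketch of the idea, reusing the illustrative
; unsigned_multiply() above:
;
; unsigned short shift_right_by_multiply(unsigned short x, int n)
; {
;   enum { SHORT_BIT = 16 };
;   unsigned short scale = (unsigned short) (1 << (SHORT_BIT - n));
;
;   return (unsigned short) (unsigned_multiply(x, scale) >> SHORT_BIT);
; }
;
; pmulhw is a signed multiply, so the same sign corrections as in
; unsigned_multiply() appear below whenever either operand has its top
; bit set.
;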

	paddw	mm0, MMWORD [CORRECTION(0,0,edx)]	; correction + roundfactor
	paddw	mm1, MMWORD [CORRECTION(0,1,edx)]

	movq	mm4,mm0			; store current value for later
	movq	mm5,mm1
	pmulhw	mm0, MMWORD [RECIPROCAL(0,0,edx)]	; reciprocal
	pmulhw	mm1, MMWORD [RECIPROCAL(0,1,edx)]
	paddw	mm0,mm4		; reciprocal is always negative (MSB=1),
	paddw	mm1,mm5		; so we always need to add the initial value
				; (input value is never negative as we
				; inverted it at the start of this routine)

				; here it gets a bit tricky as both scale
				; and mm0/mm1 can be negative
	movq	mm6, MMWORD [SCALE(0,0,edx)]	; scale
	movq	mm7, MMWORD [SCALE(0,1,edx)]
	movq	mm4,mm0
	movq	mm5,mm1
	pmulhw	mm0,mm6
	pmulhw	mm1,mm7

	psraw	mm6,(WORD_BIT-1)	; determine if scale is negative
	psraw	mm7,(WORD_BIT-1)

	pand	mm6,mm4			; and add input if it is
	pand	mm7,mm5
	paddw	mm0,mm6
	paddw	mm1,mm7

	psraw	mm4,(WORD_BIT-1)	; then check if negative input
	psraw	mm5,(WORD_BIT-1)

	pand	mm4, MMWORD [SCALE(0,0,edx)]	; and add scale if it is
	pand	mm5, MMWORD [SCALE(0,1,edx)]
	paddw	mm0,mm4
	paddw	mm1,mm5

	pxor	mm0,mm2			; restore the original sign:
	pxor	mm1,mm3			;  negate again if the input
	psubw	mm0,mm2			;  was negative
	psubw	mm1,mm3

	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1

	add	esi, byte 8*SIZEOF_DCTELEM
	add	edx, byte 8*SIZEOF_DCTELEM
	add	edi, byte 8*SIZEOF_JCOEF
	dec	al
	jnz	near .quantloop2
	dec	ah
	jnz	near .quantloop1	; to avoid branch misprediction

	emms		; empty MMX state

	pop	edi
	pop	esi
;	pop	edx		; need not be preserved
;	pop	ecx		; unused
;	pop	ebx		; unused
	pop	ebp
	ret
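
;
; Taken together, the two routines above cover the sample-conversion and
; quantization halves of a forward-DCT pass.  As a rough illustration of
; how a caller might chain them around an FDCT (the actual driver logic
; lives in the library's jcdctmgr.c, and jsimd_fdct_ifast_mmx is just one
; of the available transforms):
;
; DCTELEM workspace[DCTSIZE2];
;
; jsimd_convsamp_mmx (sample_data, start_col, workspace);
; jsimd_fdct_ifast_mmx (workspace);
; jsimd_quantize_mmx (coef_block, divisors, workspace);
;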

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
	align	16