;
; jiss2fst.asm - fast integer IDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see jidctfst.c
; for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

%define CONST_BITS      8       ; 14 is also OK.
%define PASS1_BITS      2

%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif

%if CONST_BITS == 8
F_1_082 equ     277             ; FIX(1.082392200)
F_1_414 equ     362             ; FIX(1.414213562)
F_1_847 equ     473             ; FIX(1.847759065)
F_2_613 equ     669             ; FIX(2.613125930)
F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
F_1_613 equ     (F_2_613 - (1 << CONST_BITS))           ; FIX(2.613125930) - FIX(1)
%endif
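
; A FIX(x) constant is x scaled by 2^CONST_BITS and rounded to an integer;
; with CONST_BITS = 8, FIX(1.414213562) = round(1.414213562 * 256) = 362.
; DESCALE(x,n) is a rounding arithmetic right shift: it adds 2^(n-1) before
; shifting right by n, so the 30-bit-scaled constants above round correctly
; down to CONST_BITS fraction bits.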

; --------------------------------------------------------------------------
SECTION SEG_CONST

; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)

%define PRE_MULTIPLY_SCALE_BITS  2
%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
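
; pmulhw computes the high 16 bits of a signed 16x16->32 multiply, i.e.
; (a * b) >> 16.  The data words are pre-shifted left by
; PRE_MULTIPLY_SCALE_BITS and the constants below are stored shifted left
; by CONST_SHIFT, so each pmulhw effectively yields (a * FIX(c)) >> CONST_BITS;
; hence the requirement that
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16.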

        alignz  16
        global  EXTN(jconst_idct_ifast_sse2) PRIVATE

EXTN(jconst_idct_ifast_sse2):

PW_F1414        times 8  dw  F_1_414 << CONST_SHIFT
PW_F1847        times 8  dw  F_1_847 << CONST_SHIFT
PW_MF1613       times 8  dw -F_1_613 << CONST_SHIFT
PW_F1082        times 8  dw  F_1_082 << CONST_SHIFT
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;

%define dct_table(b)    (b)+8           ; void * dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col
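
; The argument offsets above are relative to the caller's stack frame as
; captured in eax immediately after "push ebp" below: [eax] holds the saved
; ebp, [eax+4] the return address, and [eax+8] the first argument.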

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2

        align   16
        global  EXTN(jsimd_idct_ifast_sse2) PRIVATE

EXTN(jsimd_idct_ifast_sse2):
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp],eax
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]
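
        ; The frame pointer ebp now points at a 16-byte-aligned slot holding
        ; the original stack pointer, and esp has been lowered past WK_NUM
        ; xmmword scratch slots (wk(0)..wk(1)) so they can be accessed with
        ; aligned movdqa.
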
        pushpic ebx
;       push    ecx                     ; unused
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address

        ; ---- Pass 1: process columns from input.

;       mov     eax, [original_ebp]
        mov     edx, POINTER [dct_table(eax)]           ; quantptr
        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr

%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz     near .columnDCT

        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        por     xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        por     xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        por     xmm1,xmm0
        packsswb xmm1,xmm1
        packsswb xmm1,xmm1
        movd    eax,xmm1
        test    eax,eax
        jnz     short .columnDCT

        ; -- AC terms all zero
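        ; When every AC coefficient in the block is zero, the 1-D IDCT of
        ; each column is just its dequantized DC value, so the column pass
        ; reduces to broadcasting the eight DC values (cf. the corresponding
        ; shortcut in jidctfst.c).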

        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa  xmm7,xmm0               ; xmm0=in0=(00 01 02 03 04 05 06 07)
        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
        punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)

        pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
        pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
        pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
        pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
        pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
        pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
        pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
        pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)

        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
        jmp     near .column_end
        alignx  16,7
%endif
.columnDCT:

        ; -- Even part
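        ; A rough sketch of this even part in jidctfst.c terms (in0..in6
        ; denote the dequantized coefficients from rows 0, 2, 4 and 6 of
        ; each column):
        ;   tmp10 = in0 + in4
        ;   tmp11 = in0 - in4
        ;   tmp13 = in2 + in6
        ;   tmp12 = (in2 - in6) * 1.414213562 - tmp13
        ;   tmp0 = tmp10 + tmp13    tmp3 = tmp10 - tmp13
        ;   tmp1 = tmp11 + tmp12    tmp2 = tmp11 - tmp12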

        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        psubw   xmm0,xmm2               ; xmm0=tmp11
        psubw   xmm1,xmm3
        paddw   xmm4,xmm2               ; xmm4=tmp10
        paddw   xmm5,xmm3               ; xmm5=tmp13

        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm1,[GOTOFF(ebx,PW_F1414)]
        psubw   xmm1,xmm5               ; xmm1=tmp12

        movdqa  xmm6,xmm4
        movdqa  xmm7,xmm0
        psubw   xmm4,xmm5               ; xmm4=tmp3
        psubw   xmm0,xmm1               ; xmm0=tmp2
        paddw   xmm6,xmm5               ; xmm6=tmp0
        paddw   xmm7,xmm1               ; xmm7=tmp1

        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2

        ; -- Odd part
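        ; A rough sketch of this odd part in jidctfst.c terms (in1..in7
        ; denote the dequantized coefficients from rows 1, 3, 5 and 7 of
        ; each column):
        ;   z13 = in5 + in3    z10 = in5 - in3
        ;   z11 = in1 + in7    z12 = in1 - in7
        ;   tmp7  = z11 + z13
        ;   tmp11 = (z11 - z13) * 1.414213562
        ;   z5    = (z10 + z12) * 1.847759065
        ;   tmp10 = 1.082392200 * z12 - z5
        ;   tmp12 = -2.613125930 * z10 + z5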

        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa  xmm4,xmm2
        movdqa  xmm0,xmm5
        psubw   xmm2,xmm1               ; xmm2=z12
        psubw   xmm5,xmm3               ; xmm5=z10
        paddw   xmm4,xmm1               ; xmm4=z11
        paddw   xmm0,xmm3               ; xmm0=z13

        movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS

        movdqa  xmm3,xmm4
        psubw   xmm4,xmm0
        paddw   xmm3,xmm0               ; xmm3=tmp7

        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm4,[GOTOFF(ebx,PW_F1414)]     ; xmm4=tmp11

        ; To avoid overflow...
        ;
        ; (Original)
        ; tmp12 = -2.613125930 * z10 + z5;
        ;
        ; (This implementation)
        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
        ;       = -1.613125930 * z10 - z10 + z5;
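        ;
        ; (With CONST_BITS = 8, the scaled constant FIX(2.613125930) << CONST_SHIFT
        ; = 669 << 6 = 42816 would not fit in a signed 16-bit word for pmulhw,
        ; whereas FIX(1.613125930) << CONST_SHIFT = 413 << 6 = 26432 does; the
        ; missing 1.0 * z10 is restored with a plain subtraction of the
        ; unscaled z10.)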

        movdqa  xmm0,xmm5
        paddw   xmm5,xmm2
        pmulhw  xmm5,[GOTOFF(ebx,PW_F1847)]     ; xmm5=z5
        pmulhw  xmm0,[GOTOFF(ebx,PW_MF1613)]
        pmulhw  xmm2,[GOTOFF(ebx,PW_F1082)]
        psubw   xmm0,xmm1
        psubw   xmm2,xmm5               ; xmm2=tmp10
        paddw   xmm0,xmm5               ; xmm0=tmp12

        ; -- Final output stage
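
        ; A rough sketch of the final butterflies in jidctfst.c terms
        ; (data0..data7 are the rows of the 1-D IDCT result for this pass):
        ;   tmp6 = tmp12 - tmp7    tmp5 = tmp11 - tmp6    tmp4 = tmp10 + tmp5
        ;   data0 = tmp0 + tmp7    data7 = tmp0 - tmp7
        ;   data1 = tmp1 + tmp6    data6 = tmp1 - tmp6
        ;   data2 = tmp2 + tmp5    data5 = tmp2 - tmp5
        ;   data4 = tmp3 + tmp4    data3 = tmp3 - tmp4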

        psubw   xmm0,xmm3               ; xmm0=tmp6
        movdqa  xmm1,xmm6
        movdqa  xmm5,xmm7
        paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
        paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
        psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
        psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
        psubw   xmm4,xmm0               ; xmm4=tmp5

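        ; The 8x8 block of 16-bit intermediate results is now transposed in
        ; three phases (word, dword, and qword interleaves) so that pass 2
        ; can run the same 1-D IDCT over the rows, eight rows at a time.
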
        movdqa  xmm3,xmm6       ; transpose coefficients(phase 1)
        punpcklwd xmm6,xmm7     ; xmm6=(00 10 01 11 02 12 03 13)
        punpckhwd xmm3,xmm7     ; xmm3=(04 14 05 15 06 16 07 17)
        movdqa  xmm0,xmm5       ; transpose coefficients(phase 1)
        punpcklwd xmm5,xmm1     ; xmm5=(60 70 61 71 62 72 63 73)
        punpckhwd xmm0,xmm1     ; xmm0=(64 74 65 75 66 76 67 77)

        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3

        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)

        paddw   xmm2,xmm4               ; xmm2=tmp4
        movdqa  xmm5,xmm7
        movdqa  xmm0,xmm1
        paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
        paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
        psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
        psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)

        movdqa  xmm4,xmm7       ; transpose coefficients(phase 1)
        punpcklwd xmm7,xmm0     ; xmm7=(20 30 21 31 22 32 23 33)
        punpckhwd xmm4,xmm0     ; xmm4=(24 34 25 35 26 36 27 37)
        movdqa  xmm2,xmm1       ; transpose coefficients(phase 1)
        punpcklwd xmm1,xmm5     ; xmm1=(40 50 41 51 42 52 43 53)
        punpckhwd xmm2,xmm5     ; xmm2=(44 54 45 55 46 56 47 57)

        movdqa  xmm0,xmm3       ; transpose coefficients(phase 2)
        punpckldq xmm3,xmm4     ; xmm3=(04 14 24 34 05 15 25 35)
        punpckhdq xmm0,xmm4     ; xmm0=(06 16 26 36 07 17 27 37)
        movdqa  xmm5,xmm6       ; transpose coefficients(phase 2)
        punpckldq xmm6,xmm7     ; xmm6=(00 10 20 30 01 11 21 31)
        punpckhdq xmm5,xmm7     ; xmm5=(02 12 22 32 03 13 23 33)

        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)

        movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)

        movdqa  xmm3,xmm1       ; transpose coefficients(phase 2)
        punpckldq xmm1,xmm4     ; xmm1=(40 50 60 70 41 51 61 71)
        punpckhdq xmm3,xmm4     ; xmm3=(42 52 62 72 43 53 63 73)
        movdqa  xmm0,xmm2       ; transpose coefficients(phase 2)
        punpckldq xmm2,xmm7     ; xmm2=(44 54 64 74 45 55 65 75)
        punpckhdq xmm0,xmm7     ; xmm0=(46 56 66 76 47 57 67 77)

        movdqa  xmm4,xmm6       ; transpose coefficients(phase 3)
        punpcklqdq xmm6,xmm1    ; xmm6=col0=(00 10 20 30 40 50 60 70)
        punpckhqdq xmm4,xmm1    ; xmm4=col1=(01 11 21 31 41 51 61 71)
        movdqa  xmm7,xmm5       ; transpose coefficients(phase 3)
        punpcklqdq xmm5,xmm3    ; xmm5=col2=(02 12 22 32 42 52 62 72)
        punpckhqdq xmm7,xmm3    ; xmm7=col3=(03 13 23 33 43 53 63 73)

        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)

        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3

        movdqa  xmm4,xmm1       ; transpose coefficients(phase 3)
        punpcklqdq xmm1,xmm2    ; xmm1=col4=(04 14 24 34 44 54 64 74)
        punpckhqdq xmm4,xmm2    ; xmm4=col5=(05 15 25 35 45 55 65 75)
        movdqa  xmm7,xmm3       ; transpose coefficients(phase 3)
        punpcklqdq xmm3,xmm0    ; xmm3=col6=(06 16 26 36 46 56 66 76)
        punpckhqdq xmm7,xmm0    ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:

        ; -- Prefetch the next coefficient block

        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.
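        ; Pass 2 repeats the same even/odd butterflies on the transposed
        ; data (one 1-D IDCT per row, eight rows in parallel) and then
        ; converts the results to output samples.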

        mov     eax, [original_ebp]
        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(eax)]

        ; -- Even part

        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6

        movdqa  xmm2,xmm6
        movdqa  xmm0,xmm5
        psubw   xmm6,xmm1               ; xmm6=tmp11
        psubw   xmm5,xmm3
        paddw   xmm2,xmm1               ; xmm2=tmp10
        paddw   xmm0,xmm3               ; xmm0=tmp13

        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm5,[GOTOFF(ebx,PW_F1414)]
        psubw   xmm5,xmm0               ; xmm5=tmp12

        movdqa  xmm1,xmm2
        movdqa  xmm3,xmm6
        psubw   xmm2,xmm0               ; xmm2=tmp3
        psubw   xmm6,xmm5               ; xmm6=tmp2
        paddw   xmm1,xmm0               ; xmm1=tmp0
        paddw   xmm3,xmm5               ; xmm3=tmp1

        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3

        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2

        ; -- Odd part

        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7

        movdqa  xmm2,xmm0
        movdqa  xmm6,xmm4
        psubw   xmm0,xmm7               ; xmm0=z12
        psubw   xmm4,xmm5               ; xmm4=z10
        paddw   xmm2,xmm7               ; xmm2=z11
        paddw   xmm6,xmm5               ; xmm6=z13

        movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS

        movdqa  xmm5,xmm2
        psubw   xmm2,xmm6
        paddw   xmm5,xmm6               ; xmm5=tmp7

        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm2,[GOTOFF(ebx,PW_F1414)]     ; xmm2=tmp11

        ; To avoid overflow...
        ;
        ; (Original)
        ; tmp12 = -2.613125930 * z10 + z5;
        ;
        ; (This implementation)
        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
        ;       = -1.613125930 * z10 - z10 + z5;

        movdqa  xmm6,xmm4
        paddw   xmm4,xmm0
        pmulhw  xmm4,[GOTOFF(ebx,PW_F1847)]     ; xmm4=z5
        pmulhw  xmm6,[GOTOFF(ebx,PW_MF1613)]
        pmulhw  xmm0,[GOTOFF(ebx,PW_F1082)]
        psubw   xmm6,xmm7
        psubw   xmm0,xmm4               ; xmm0=tmp10
        paddw   xmm6,xmm4               ; xmm6=tmp12

        ; -- Final output stage
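
        ; Each final value is descaled by (PASS1_BITS+3) bits: 3 bits for the
        ; 1/8 factor of the 8-point IDCT plus the PASS1_BITS of extra
        ; precision carried from pass 1.  The words are then saturated to
        ; bytes with packsswb and recentered by adding CENTERJSAMPLE.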

        psubw   xmm6,xmm5               ; xmm6=tmp6
        movdqa  xmm7,xmm1
        movdqa  xmm4,xmm3
        paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
        paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
        psraw   xmm1,(PASS1_BITS+3)     ; descale
        psraw   xmm3,(PASS1_BITS+3)     ; descale
        psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
        psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
        psraw   xmm7,(PASS1_BITS+3)     ; descale
        psraw   xmm4,(PASS1_BITS+3)     ; descale
        psubw   xmm2,xmm6               ; xmm2=tmp5

        packsswb xmm1,xmm4      ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
        packsswb xmm3,xmm7      ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3

        paddw   xmm0,xmm2               ; xmm0=tmp4
        movdqa  xmm4,xmm5
        movdqa  xmm7,xmm6
        paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
        paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
        psraw   xmm5,(PASS1_BITS+3)     ; descale
        psraw   xmm6,(PASS1_BITS+3)     ; descale
        psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
        psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
        psraw   xmm4,(PASS1_BITS+3)     ; descale
        psraw   xmm7,(PASS1_BITS+3)     ; descale

        movdqa  xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]       ; xmm2=[PB_CENTERJSAMP]

        packsswb xmm5,xmm6      ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
        packsswb xmm7,xmm4      ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)

        paddb   xmm1,xmm2
        paddb   xmm3,xmm2
        paddb   xmm5,xmm2
        paddb   xmm7,xmm2

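        ; The bytes are now transposed back to row order; after the three
        ; interleave phases below, each xmm register holds two complete
        ; output rows, and the pshufd swaps (selector 0x4E) move the second
        ; row of each pair into the low qword so it can be stored with movq.
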
        movdqa  xmm0,xmm1       ; transpose coefficients(phase 1)
        punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
        punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
        movdqa  xmm6,xmm5       ; transpose coefficients(phase 1)
        punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
        punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)

        movdqa  xmm4,xmm1       ; transpose coefficients(phase 2)
        punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
        movdqa  xmm2,xmm6       ; transpose coefficients(phase 2)
        punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
        punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)

        movdqa  xmm3,xmm1       ; transpose coefficients(phase 3)
        punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
        movdqa  xmm7,xmm4       ; transpose coefficients(phase 3)
        punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
        punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)

        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
        mov     edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7

        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
        mov     edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2

        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; unused
        poppic  ebx
        mov     esp,ebp                 ; esp <- aligned ebp
        pop     esp                     ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16