;
; jiss2fst-64.asm - fast integer IDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see jidctfst.c
; for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

%define CONST_BITS      8       ; 14 is also OK.
%define PASS1_BITS      2

%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif

%if CONST_BITS == 8
F_1_082 equ     277             ; FIX(1.082392200)
F_1_414 equ     362             ; FIX(1.414213562)
F_1_847 equ     473             ; FIX(1.847759065)
F_2_613 equ     669             ; FIX(2.613125930)
F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
%endif
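
; A worked check of the constants above, with the default CONST_BITS == 8:
; FIX(x) denotes round(x * 2^CONST_BITS), e.g.
;   FIX(1.414213562) = round(1.414213562 * 256) = round(362.04) = 362
;   FIX(2.613125930) = round(2.613125930 * 256) = round(668.96) = 669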

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)

%define PRE_MULTIPLY_SCALE_BITS  2
%define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
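
; The second constraint above is what makes the pmulhw trick work:
; pmulhw computes (a * b) >> 16 per word, so with inputs pre-shifted by
; PRE_MULTIPLY_SCALE_BITS and constants by CONST_SHIFT,
;   ((x << PRE_MULTIPLY_SCALE_BITS) * (F << CONST_SHIFT)) >> 16
;     == (x * F) >> CONST_BITS
; i.e. a multiply by the FIX()ed constant at full working precision.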

        alignz  16
        global  EXTN(jconst_idct_ifast_sse2) PRIVATE

EXTN(jconst_idct_ifast_sse2):

PW_F1414        times 8 dw  F_1_414 << CONST_SHIFT
PW_F1847        times 8 dw  F_1_847 << CONST_SHIFT
PW_MF1613       times 8 dw -F_1_613 << CONST_SHIFT
PW_F1082        times 8 dw  F_1_082 << CONST_SHIFT
PB_CENTERJSAMP  times 16 db CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;

; r10 = void * dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
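; (collect_args, defined in jsimdext.inc, copies the first four argument
; registers of the host ABI into r10-r13, so the body below is the same
; for the SysV and Win64 calling conventions.)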

%define original_rbp    rbp+0
%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2
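; (The wk slots sit at 16-byte offsets below the aligned rbp established
; in the prologue, so movdqa accesses to the work area are safe.)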

        align   16
        global  EXTN(jsimd_idct_ifast_sse2) PRIVATE

EXTN(jsimd_idct_ifast_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = original rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax
        mov     rbp,rsp                         ; rbp = aligned rbp
        lea     rsp, [wk(0)]
        collect_args

        ; ---- Pass 1: process columns from input.

        mov     rdx, r10                        ; quantptr
        mov     rsi, r11                        ; inptr

%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
        jnz     near .columnDCT

        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
        por     xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
        por     xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
        por     xmm1,xmm0
        packsswb xmm1,xmm1
        packsswb xmm1,xmm1
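        ; (The saturating packs preserve non-zero-ness, folding the eight
        ; OR-reduced words into the low four bytes so that the single
        ; movd/test below checks all of rows 1-7 at once.)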
        movd    eax,xmm1
        test    rax,rax
        jnz     short .columnDCT

        ; -- AC terms all zero
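        ; (Each 1-D column IDCT then degenerates to replicating the
        ; dequantized DC term, so pass 1 reduces to splatting in0 across
        ; all eight rows.)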

        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa  xmm7,xmm0               ; xmm0=in0=(00 01 02 03 04 05 06 07)
        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
        punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)

        pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
        pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
        pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
        pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
        pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
        pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
        pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
        pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)

        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
        jmp     near .column_end
%endif
.columnDCT:

        ; -- Even part

        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        psubw   xmm0,xmm2               ; xmm0=tmp11
        psubw   xmm1,xmm3
        paddw   xmm4,xmm2               ; xmm4=tmp10
        paddw   xmm5,xmm3               ; xmm5=tmp13

        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm1,[rel PW_F1414]
        psubw   xmm1,xmm5               ; xmm1=tmp12

        movdqa  xmm6,xmm4
        movdqa  xmm7,xmm0
        psubw   xmm4,xmm5               ; xmm4=tmp3
        psubw   xmm0,xmm1               ; xmm0=tmp2
        paddw   xmm6,xmm5               ; xmm6=tmp0
        paddw   xmm7,xmm1               ; xmm7=tmp1

        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2

        ; -- Odd part

        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa  xmm4,xmm2
        movdqa  xmm0,xmm5
        psubw   xmm2,xmm1               ; xmm2=z12
        psubw   xmm5,xmm3               ; xmm5=z10
        paddw   xmm4,xmm1               ; xmm4=z11
        paddw   xmm0,xmm3               ; xmm0=z13

        movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS

        movdqa  xmm3,xmm4
        psubw   xmm4,xmm0
        paddw   xmm3,xmm0               ; xmm3=tmp7

        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm4,[rel PW_F1414]     ; xmm4=tmp11

        ; To avoid overflow...
        ;
        ; (Original)
        ; tmp12 = -2.613125930 * z10 + z5;
        ;
        ; (This implementation)
        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
        ;       = -1.613125930 * z10 - z10 + z5;
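        ;
        ; (The pre-shifted constant FIX(2.613125930) << CONST_SHIFT =
        ; 669 << 6 = 42816 would not fit in the signed words that pmulhw
        ; multiplies, whereas F_1_613 << CONST_SHIFT = 413 << 6 = 26432
        ; does; hence the split multiply.)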

        movdqa  xmm0,xmm5
        paddw   xmm5,xmm2
        pmulhw  xmm5,[rel PW_F1847]     ; xmm5=z5
        pmulhw  xmm0,[rel PW_MF1613]
        pmulhw  xmm2,[rel PW_F1082]
        psubw   xmm0,xmm1
        psubw   xmm2,xmm5               ; xmm2=tmp10
        paddw   xmm0,xmm5               ; xmm0=tmp12

        ; -- Final output stage

        psubw   xmm0,xmm3               ; xmm0=tmp6
        movdqa  xmm1,xmm6
        movdqa  xmm5,xmm7
        paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
        paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
        psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
        psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
        psubw   xmm4,xmm0               ; xmm4=tmp5

        movdqa  xmm3,xmm6               ; transpose coefficients(phase 1)
        punpcklwd xmm6,xmm7             ; xmm6=(00 10 01 11 02 12 03 13)
        punpckhwd xmm3,xmm7             ; xmm3=(04 14 05 15 06 16 07 17)
        movdqa  xmm0,xmm5               ; transpose coefficients(phase 1)
        punpcklwd xmm5,xmm1             ; xmm5=(60 70 61 71 62 72 63 73)
        punpckhwd xmm0,xmm1             ; xmm0=(64 74 65 75 66 76 67 77)

        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3

        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)

        paddw   xmm2,xmm4               ; xmm2=tmp4
        movdqa  xmm5,xmm7
        movdqa  xmm0,xmm1
        paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
        paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
        psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
        psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)

        movdqa  xmm4,xmm7               ; transpose coefficients(phase 1)
        punpcklwd xmm7,xmm0             ; xmm7=(20 30 21 31 22 32 23 33)
        punpckhwd xmm4,xmm0             ; xmm4=(24 34 25 35 26 36 27 37)
        movdqa  xmm2,xmm1               ; transpose coefficients(phase 1)
        punpcklwd xmm1,xmm5             ; xmm1=(40 50 41 51 42 52 43 53)
        punpckhwd xmm2,xmm5             ; xmm2=(44 54 45 55 46 56 47 57)

        movdqa  xmm0,xmm3               ; transpose coefficients(phase 2)
        punpckldq xmm3,xmm4             ; xmm3=(04 14 24 34 05 15 25 35)
        punpckhdq xmm0,xmm4             ; xmm0=(06 16 26 36 07 17 27 37)
        movdqa  xmm5,xmm6               ; transpose coefficients(phase 2)
        punpckldq xmm6,xmm7             ; xmm6=(00 10 20 30 01 11 21 31)
        punpckhdq xmm5,xmm7             ; xmm5=(02 12 22 32 03 13 23 33)

        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)

        movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)

        movdqa  xmm3,xmm1               ; transpose coefficients(phase 2)
        punpckldq xmm1,xmm4             ; xmm1=(40 50 60 70 41 51 61 71)
        punpckhdq xmm3,xmm4             ; xmm3=(42 52 62 72 43 53 63 73)
        movdqa  xmm0,xmm2               ; transpose coefficients(phase 2)
        punpckldq xmm2,xmm7             ; xmm2=(44 54 64 74 45 55 65 75)
        punpckhdq xmm0,xmm7             ; xmm0=(46 56 66 76 47 57 67 77)

        movdqa  xmm4,xmm6               ; transpose coefficients(phase 3)
        punpcklqdq xmm6,xmm1            ; xmm6=col0=(00 10 20 30 40 50 60 70)
        punpckhqdq xmm4,xmm1            ; xmm4=col1=(01 11 21 31 41 51 61 71)
        movdqa  xmm7,xmm5               ; transpose coefficients(phase 3)
        punpcklqdq xmm5,xmm3            ; xmm5=col2=(02 12 22 32 42 52 62 72)
        punpckhqdq xmm7,xmm3            ; xmm7=col3=(03 13 23 33 43 53 63 73)

        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)

        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3

        movdqa  xmm4,xmm1               ; transpose coefficients(phase 3)
        punpcklqdq xmm1,xmm2            ; xmm1=col4=(04 14 24 34 44 54 64 74)
        punpckhqdq xmm4,xmm2            ; xmm4=col5=(05 15 25 35 45 55 65 75)
        movdqa  xmm7,xmm3               ; transpose coefficients(phase 3)
        punpcklqdq xmm3,xmm0            ; xmm3=col6=(06 16 26 36 46 56 66 76)
        punpckhqdq xmm7,xmm0            ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:

        ; -- Prefetch the next coefficient block

        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
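        ; (DCTSIZE2*SIZEOF_JCOEF = 64*2 = 128 bytes, so the four 32-byte
        ; prefetches cover exactly the following coefficient block.)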

        ; ---- Pass 2: process rows from work array, store into output array.

        mov     rax, [original_rbp]
        mov     rdi, r12                ; (JSAMPROW *)
        mov     eax, r13d

        ; -- Even part

        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6

        movdqa  xmm2,xmm6
        movdqa  xmm0,xmm5
        psubw   xmm6,xmm1               ; xmm6=tmp11
        psubw   xmm5,xmm3
        paddw   xmm2,xmm1               ; xmm2=tmp10
        paddw   xmm0,xmm3               ; xmm0=tmp13

        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm5,[rel PW_F1414]
        psubw   xmm5,xmm0               ; xmm5=tmp12

        movdqa  xmm1,xmm2
        movdqa  xmm3,xmm6
        psubw   xmm2,xmm0               ; xmm2=tmp3
        psubw   xmm6,xmm5               ; xmm6=tmp2
        paddw   xmm1,xmm0               ; xmm1=tmp0
        paddw   xmm3,xmm5               ; xmm3=tmp1

        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3

        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2

        ; -- Odd part

        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7

        movdqa  xmm2,xmm0
        movdqa  xmm6,xmm4
        psubw   xmm0,xmm7               ; xmm0=z12
        psubw   xmm4,xmm5               ; xmm4=z10
        paddw   xmm2,xmm7               ; xmm2=z11
        paddw   xmm6,xmm5               ; xmm6=z13

        movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS

        movdqa  xmm5,xmm2
        psubw   xmm2,xmm6
        paddw   xmm5,xmm6               ; xmm5=tmp7

        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm2,[rel PW_F1414]     ; xmm2=tmp11

        ; To avoid overflow...
        ;
        ; (Original)
        ; tmp12 = -2.613125930 * z10 + z5;
        ;
        ; (This implementation)
        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
        ;       = -1.613125930 * z10 - z10 + z5;

        movdqa  xmm6,xmm4
        paddw   xmm4,xmm0
        pmulhw  xmm4,[rel PW_F1847]     ; xmm4=z5
        pmulhw  xmm6,[rel PW_MF1613]
        pmulhw  xmm0,[rel PW_F1082]
        psubw   xmm6,xmm7
        psubw   xmm0,xmm4               ; xmm0=tmp10
        paddw   xmm6,xmm4               ; xmm6=tmp12

        ; -- Final output stage
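        ; (The results below are descaled by PASS1_BITS+3 bits: PASS1_BITS
        ; undoes the pass-1 scaling, and the extra 3 bits are the 1/8
        ; normalization of the 8-point IDCT, as in jidctfst.c.)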

        psubw   xmm6,xmm5               ; xmm6=tmp6
        movdqa  xmm7,xmm1
        movdqa  xmm4,xmm3
        paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
        paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
        psraw   xmm1,(PASS1_BITS+3)     ; descale
        psraw   xmm3,(PASS1_BITS+3)     ; descale
        psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
        psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
        psraw   xmm7,(PASS1_BITS+3)     ; descale
        psraw   xmm4,(PASS1_BITS+3)     ; descale
        psubw   xmm2,xmm6               ; xmm2=tmp5

        packsswb xmm1,xmm4      ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
        packsswb xmm3,xmm7      ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3

        paddw   xmm0,xmm2               ; xmm0=tmp4
        movdqa  xmm4,xmm5
        movdqa  xmm7,xmm6
        paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
        paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
        psraw   xmm5,(PASS1_BITS+3)     ; descale
        psraw   xmm6,(PASS1_BITS+3)     ; descale
        psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
        psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
        psraw   xmm4,(PASS1_BITS+3)     ; descale
        psraw   xmm7,(PASS1_BITS+3)     ; descale

        movdqa  xmm2,[rel PB_CENTERJSAMP]       ; xmm2=[rel PB_CENTERJSAMP]

        packsswb xmm5,xmm6      ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
        packsswb xmm7,xmm4      ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)

        paddb   xmm1,xmm2
        paddb   xmm3,xmm2
        paddb   xmm5,xmm2
        paddb   xmm7,xmm2

        movdqa  xmm0,xmm1       ; transpose coefficients(phase 1)
        punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
        punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
        movdqa  xmm6,xmm5       ; transpose coefficients(phase 1)
        punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
        punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)

        movdqa  xmm4,xmm1       ; transpose coefficients(phase 2)
        punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
        movdqa  xmm2,xmm6       ; transpose coefficients(phase 2)
        punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
        punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)

        movdqa  xmm3,xmm1       ; transpose coefficients(phase 3)
        punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
        movdqa  xmm7,xmm4       ; transpose coefficients(phase 3)
        punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
        punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)

        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
        mov     rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
        mov     rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
        mov     rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7

        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
        mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
        mov     rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
        mov     rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2

        uncollect_args
        mov     rsp,rbp                 ; rsp <- aligned rbp
        pop     rsp                     ; rsp <- original rbp
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16