OLD | NEW |
1 ; | 1 ; |
2 ; jcsamss2-64.asm - downsampling (64-bit SSE2) | 2 ; jcsamss2-64.asm - downsampling (64-bit SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
6 ; | 6 ; |
7 ; Based on | 7 ; Based on |
8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
42 | 42 |
43 align 16 | 43 align 16 |
44 global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE | 44 global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE |
45 | 45 |
46 EXTN(jsimd_h2v1_downsample_sse2): | 46 EXTN(jsimd_h2v1_downsample_sse2): |
47 push rbp | 47 push rbp |
48 mov rax,rsp | 48 mov rax,rsp |
49 mov rbp,rsp | 49 mov rbp,rsp |
50 collect_args | 50 collect_args |
51 | 51 |
52 » mov rcx, r13 | 52 » mov ecx, r13d |
53 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) | 53 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) |
54 jz near .return | 54 jz near .return |
55 | 55 |
56 » mov rdx, r10 | 56 » mov edx, r10d |
57 | 57 |
58 ; -- expand_right_edge | 58 ; -- expand_right_edge |
59 | 59 |
60 push rcx | 60 push rcx |
61 shl rcx,1 ; output_cols * 2 | 61 shl rcx,1 ; output_cols * 2 |
62 sub rcx,rdx | 62 sub rcx,rdx |
63 jle short .expand_end | 63 jle short .expand_end |
64 | 64 |
65 mov rax, r11 | 65 mov rax, r11 |
66 test rax,rax | 66 test rax,rax |
(...skipping 16 matching lines...) Expand all Loading... |
83 | 83 |
84 add rsi, byte SIZEOF_JSAMPROW | 84 add rsi, byte SIZEOF_JSAMPROW |
85 dec rax | 85 dec rax |
86 jg short .expandloop | 86 jg short .expandloop |
87 | 87 |
88 .expand_end: | 88 .expand_end: |
89 pop rcx ; output_cols | 89 pop rcx ; output_cols |
90 | 90 |
91 ; -- h2v1_downsample | 91 ; -- h2v1_downsample |
92 | 92 |
93 » mov» rax, r12» ; rowctr | 93 » mov» eax, r12d» ; rowctr |
94 test eax,eax | 94 test eax,eax |
95 jle near .return | 95 jle near .return |
96 | 96 |
97 mov rdx, 0x00010000 ; bias pattern | 97 mov rdx, 0x00010000 ; bias pattern |
98 movd xmm7,edx | 98 movd xmm7,edx |
99 pcmpeqw xmm6,xmm6 | 99 pcmpeqw xmm6,xmm6 |
100 pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} | 100 pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} |
101 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} | 101 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} |
102 | 102 |
103 mov rsi, r14 ; input_data | 103 mov rsi, r14 ; input_data |
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
186 | 186 |
187 align 16 | 187 align 16 |
188 global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE | 188 global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE |
189 | 189 |
190 EXTN(jsimd_h2v2_downsample_sse2): | 190 EXTN(jsimd_h2v2_downsample_sse2): |
191 push rbp | 191 push rbp |
192 mov rax,rsp | 192 mov rax,rsp |
193 mov rbp,rsp | 193 mov rbp,rsp |
194 collect_args | 194 collect_args |
195 | 195 |
196 » mov» rcx, r13 | 196 » mov» ecx, r13d |
197 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) | 197 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) |
198 jz near .return | 198 jz near .return |
199 | 199 |
200 mov rdx, r10 | 200 mov rdx, r10 |
201 | 201 |
202 ; -- expand_right_edge | 202 ; -- expand_right_edge |
203 | 203 |
204 push rcx | 204 push rcx |
205 shl rcx,1 ; output_cols * 2 | 205 shl rcx,1 ; output_cols * 2 |
206 sub rcx,rdx | 206 sub rcx,rdx |
(...skipping 20 matching lines...) Expand all Loading... |
227 | 227 |
228 add rsi, byte SIZEOF_JSAMPROW | 228 add rsi, byte SIZEOF_JSAMPROW |
229 dec rax | 229 dec rax |
230 jg short .expandloop | 230 jg short .expandloop |
231 | 231 |
232 .expand_end: | 232 .expand_end: |
233 pop rcx ; output_cols | 233 pop rcx ; output_cols |
234 | 234 |
235 ; -- h2v2_downsample | 235 ; -- h2v2_downsample |
236 | 236 |
237 » mov» rax, r12» ; rowctr | 237 » mov» eax, r12d» ; rowctr |
238 test rax,rax | 238 test rax,rax |
239 jle near .return | 239 jle near .return |
240 | 240 |
241 mov rdx, 0x00020001 ; bias pattern | 241 mov rdx, 0x00020001 ; bias pattern |
242 movd xmm7,edx | 242 movd xmm7,edx |
243 pcmpeqw xmm6,xmm6 | 243 pcmpeqw xmm6,xmm6 |
244 pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} | 244 pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} |
245 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} | 245 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} |
246 | 246 |
247 mov rsi, r14 ; input_data | 247 mov rsi, r14 ; input_data |
(...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
321 jg near .rowloop | 321 jg near .rowloop |
322 | 322 |
323 .return: | 323 .return: |
324 uncollect_args | 324 uncollect_args |
325 pop rbp | 325 pop rbp |
326 ret | 326 ret |
327 | 327 |
328 ; For some reason, the OS X linker does not honor the request to align the | 328 ; For some reason, the OS X linker does not honor the request to align the |
329 ; segment unless we do this. | 329 ; segment unless we do this. |
330 align 16 | 330 align 16 |
OLD | NEW |