| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; jcsamss2-64.asm - downsampling (64-bit SSE2) | 2 ; jcsamss2-64.asm - downsampling (64-bit SSE2) |
| 3 ; | 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
| 6 ; | 6 ; |
| 7 ; Based on | 7 ; Based on |
| 8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
| 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| (...skipping 31 matching lines...) |
| 42 | 42 |
| 43 align 16 | 43 align 16 |
| 44 global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE | 44 global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE |
| 45 | 45 |
| 46 EXTN(jsimd_h2v1_downsample_sse2): | 46 EXTN(jsimd_h2v1_downsample_sse2): |
| 47 push rbp | 47 push rbp |
| 48 mov rax,rsp | 48 mov rax,rsp |
| 49 mov rbp,rsp | 49 mov rbp,rsp |
| 50 collect_args | 50 collect_args |
| 51 | 51 |
| 52 » mov rcx, r13 | 52 » mov ecx, r13d |
| 53 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) | 53 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) |
| 54 jz near .return | 54 jz near .return |
| 55 | 55 |
| 56 » mov rdx, r10 | 56 » mov edx, r10d |
| 57 | 57 |
| 58 ; -- expand_right_edge | 58 ; -- expand_right_edge |
| 59 | 59 |
| 60 push rcx | 60 push rcx |
| 61 shl rcx,1 ; output_cols * 2 | 61 shl rcx,1 ; output_cols * 2 |
| 62 sub rcx,rdx | 62 sub rcx,rdx |
| 63 jle short .expand_end | 63 jle short .expand_end |
| 64 | 64 |
| 65 mov rax, r11 | 65 mov rax, r11 |
| 66 test rax,rax | 66 test rax,rax |
| (...skipping 16 matching lines...) |
| 83 | 83 |
| 84 add rsi, byte SIZEOF_JSAMPROW | 84 add rsi, byte SIZEOF_JSAMPROW |
| 85 dec rax | 85 dec rax |
| 86 jg short .expandloop | 86 jg short .expandloop |
| 87 | 87 |
| 88 .expand_end: | 88 .expand_end: |
| 89 pop rcx ; output_cols | 89 pop rcx ; output_cols |
| 90 | 90 |
| 91 ; -- h2v1_downsample | 91 ; -- h2v1_downsample |
| 92 | 92 |
| 93 » mov» rax, r12» ; rowctr | 93 » mov» eax, r12d» ; rowctr |
| 94 test eax,eax | 94 test eax,eax |
| 95 jle near .return | 95 jle near .return |
| 96 | 96 |
| 97 mov rdx, 0x00010000 ; bias pattern | 97 mov rdx, 0x00010000 ; bias pattern |
| 98 movd xmm7,edx | 98 movd xmm7,edx |
| 99 pcmpeqw xmm6,xmm6 | 99 pcmpeqw xmm6,xmm6 |
| 100 pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} | 100 pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} |
| 101 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} | 101 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} |
| 102 | 102 |
| 103 mov rsi, r14 ; input_data | 103 mov rsi, r14 ; input_data |
| (...skipping 82 matching lines...) |
| 186 | 186 |
| 187 align 16 | 187 align 16 |
| 188 global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE | 188 global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE |
| 189 | 189 |
| 190 EXTN(jsimd_h2v2_downsample_sse2): | 190 EXTN(jsimd_h2v2_downsample_sse2): |
| 191 push rbp | 191 push rbp |
| 192 mov rax,rsp | 192 mov rax,rsp |
| 193 mov rbp,rsp | 193 mov rbp,rsp |
| 194 collect_args | 194 collect_args |
| 195 | 195 |
| 196 » mov» rcx, r13 | 196 » mov» ecx, r13d |
| 197 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) | 197 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) |
| 198 jz near .return | 198 jz near .return |
| 199 | 199 |
| 200 mov rdx, r10 | 200 mov rdx, r10 |
| 201 | 201 |
| 202 ; -- expand_right_edge | 202 ; -- expand_right_edge |
| 203 | 203 |
| 204 push rcx | 204 push rcx |
| 205 shl rcx,1 ; output_cols * 2 | 205 shl rcx,1 ; output_cols * 2 |
| 206 sub rcx,rdx | 206 sub rcx,rdx |
| (...skipping 20 matching lines...) |
| 227 | 227 |
| 228 add rsi, byte SIZEOF_JSAMPROW | 228 add rsi, byte SIZEOF_JSAMPROW |
| 229 dec rax | 229 dec rax |
| 230 jg short .expandloop | 230 jg short .expandloop |
| 231 | 231 |
| 232 .expand_end: | 232 .expand_end: |
| 233 pop rcx ; output_cols | 233 pop rcx ; output_cols |
| 234 | 234 |
| 235 ; -- h2v2_downsample | 235 ; -- h2v2_downsample |
| 236 | 236 |
| 237 » mov» rax, r12» ; rowctr | 237 » mov» eax, r12d» ; rowctr |
| 238 test rax,rax | 238 test rax,rax |
| 239 jle near .return | 239 jle near .return |
| 240 | 240 |
| 241 mov rdx, 0x00020001 ; bias pattern | 241 mov rdx, 0x00020001 ; bias pattern |
| 242 movd xmm7,edx | 242 movd xmm7,edx |
| 243 pcmpeqw xmm6,xmm6 | 243 pcmpeqw xmm6,xmm6 |
| 244 pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} | 244 pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} |
| 245 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} | 245 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} |
| 246 | 246 |
| 247 mov rsi, r14 ; input_data | 247 mov rsi, r14 ; input_data |
| (...skipping 73 matching lines...) |
| 321 jg near .rowloop | 321 jg near .rowloop |
| 322 | 322 |
| 323 .return: | 323 .return: |
| 324 uncollect_args | 324 uncollect_args |
| 325 pop rbp | 325 pop rbp |
| 326 ret | 326 ret |
| 327 | 327 |
| 328 ; For some reason, the OS X linker does not honor the request to align the | 328 ; For some reason, the OS X linker does not honor the request to align the |
| 329 ; segment unless we do this. | 329 ; segment unless we do this. |
| 330 align 16 | 330 align 16 |
| OLD | NEW |