OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; jcsamss2-64.asm - downsampling (64-bit SSE2) |
| 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; Copyright 2009 D. R. Commander |
| 6 ; |
| 7 ; Based on |
| 8 ; x86 SIMD extension for IJG JPEG library |
| 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| 11 ; |
| 12 ; This file should be assembled with NASM (Netwide Assembler), |
| 13 ; can *not* be assembled with Microsoft's MASM or any compatible |
| 14 ; assembler (including Borland's Turbo Assembler). |
| 15 ; NASM is available from http://nasm.sourceforge.net/ or |
| 16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
| 17 ; |
| 18 ; [TAB8] |
| 19 |
| 20 %include "jsimdext.inc" |
| 21 |
| 22 ; -------------------------------------------------------------------------- |
| 23 SECTION SEG_TEXT |
| 24 BITS 64 |
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Steps:
;   1. output_cols = width_blocks * 8.  Return immediately if that is zero.
;   2. expand_right_edge: when output_cols*2 exceeds image_width, each of the
;      first max_v_samp_factor input rows is padded in place by replicating
;      its last real sample (row[image_width-1]) up to column output_cols*2.
;   3. For each of v_samp_factor rows, every two horizontally adjacent input
;      samples a,b produce one output sample (a + b + bias) >> 1, where bias
;      alternates 0,1,0,1,... along the output row.
;
; Visible requirements and side effects:
;   - Rows are accessed with movdqa, which faults on unaligned addresses, so
;     every input/output row pointer must be XMMWORD-aligned.
;   - Step 2 writes into the input rows.
;   - When fewer than SIZEOF_XMMWORD output columns remain, a whole XMMWORD
;     is still stored; the samples past output_cols are written as zero.
;
; Registers modified here: rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm6, xmm7.
; NOTE(review): preservation of rsi/rdi/xmm6/xmm7 on ABIs where they are
; callee-saved is presumably handled by collect_args/uncollect_args (defined
; in jsimdext.inc, not visible here) -- confirm.
;
; Argument width: the four scalar arguments are assumed to be 32-bit C types
; (TODO confirm JDIMENSION in the IJG headers).  The x86-64 calling
; conventions leave bits 63:32 of a register holding a 32-bit argument
; undefined, so only the low halves (r10d-r13d) are read below; a 32-bit
; register write zero-extends into the full 64-bit register.
;

; r10 = JDIMENSION image_width       (only r10d is meaningful)
; r11 = int max_v_samp_factor        (only r11d is meaningful)
; r12 = JDIMENSION v_samp_factor     (only r12d is meaningful)
; r13 = JDIMENSION width_blocks      (only r13d is meaningful)
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; NOTE(review): presumably consumed by
                                        ; collect_args -- confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d               ; width_blocks, zero-extended to rcx
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to produce

        mov     edx, r10d               ; rdx = image_width, zero-extended

        ; -- expand_right_edge

        push    rcx                     ; keep output_cols for the main loop
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = number of padding samples
        jle     short .expand_end       ; input already wide enough

        mov     eax, r11d               ; max_v_samp_factor (row count)
        test    eax, eax
        jle     short .expand_end       ; signed 32-bit test: no rows to pad

        cld                             ; rep stosb must run forward
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is reused as the fill value
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]     ; start of this input row
        add     rdi, rdx                ; first column past image_width
        mov     al, JSAMPLE [rdi-1]     ; last real sample of the row

        rep stosb                       ; replicate it rcx times

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax                     ; rax is a zero-extended positive count
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d               ; rowctr = v_samp_factor
        test    eax, eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx                     ; output_cols is reloaded for each row
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Tail case: fewer than SIZEOF_XMMWORD output columns remain, so only
        ; one XMMWORD of input is loaded; the second half is treated as zero.
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        mov     rcx, SIZEOF_XMMWORD     ; makes the "sub" below leave rcx = 0
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1

        pand    xmm0, xmm6              ; even-numbered samples as words
        psrlw   xmm2, BYTE_BIT          ; odd-numbered samples as words
        pand    xmm1, xmm6
        psrlw   xmm3, BYTE_BIT

        paddw   xmm0, xmm2              ; a + b
        paddw   xmm1, xmm3
        paddw   xmm0, xmm7              ; + alternating bias
        paddw   xmm1, xmm7
        psrlw   xmm0, 1                 ; / 2
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; 16 words -> 16 output samples

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    rcx, rcx
        jnz     short .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
| 167 |
| 168 ; -------------------------------------------------------------------------- |
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Steps:
;   1. output_cols = width_blocks * 8.  Return immediately if that is zero.
;   2. expand_right_edge: when output_cols*2 exceeds image_width, each of the
;      first max_v_samp_factor input rows is padded in place by replicating
;      its last real sample (row[image_width-1]) up to column output_cols*2.
;   3. For each of v_samp_factor output rows, two consecutive input rows are
;      consumed.  Every 2x2 group of input samples a,b (upper row) and c,d
;      (lower row) produces one output sample (a + b + c + d + bias) >> 2,
;      where bias alternates 1,2,1,2,... along the output row.
;
; Visible requirements and side effects:
;   - Rows are accessed with movdqa, which faults on unaligned addresses, so
;     every input/output row pointer must be XMMWORD-aligned.
;   - Step 2 writes into the input rows.
;   - When fewer than SIZEOF_XMMWORD output columns remain, a whole XMMWORD
;     is still stored; the samples past output_cols are written as zero.
;
; Registers modified here: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
; NOTE(review): preservation of rsi/rdi/xmm6/xmm7 on ABIs where they are
; callee-saved is presumably handled by collect_args/uncollect_args (defined
; in jsimdext.inc, not visible here) -- confirm.
;
; Argument width: the four scalar arguments are assumed to be 32-bit C types
; (TODO confirm JDIMENSION in the IJG headers).  The x86-64 calling
; conventions leave bits 63:32 of a register holding a 32-bit argument
; undefined, so only the low halves (r10d-r13d) are read below; a 32-bit
; register write zero-extends into the full 64-bit register.
;

; r10 = JDIMENSION image_width       (only r10d is meaningful)
; r11 = int max_v_samp_factor        (only r11d is meaningful)
; r12 = JDIMENSION v_samp_factor     (only r12d is meaningful)
; r13 = JDIMENSION width_blocks      (only r13d is meaningful)
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; NOTE(review): presumably consumed by
                                        ; collect_args -- confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d               ; width_blocks, zero-extended to rcx
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to produce

        mov     edx, r10d               ; rdx = image_width, zero-extended

        ; -- expand_right_edge

        push    rcx                     ; keep output_cols for the main loop
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = number of padding samples
        jle     short .expand_end       ; input already wide enough

        mov     eax, r11d               ; max_v_samp_factor (row count)
        test    eax, eax
        jle     short .expand_end       ; signed 32-bit test: no rows to pad

        cld                             ; rep stosb must run forward
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is reused as the fill value
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]     ; start of this input row
        add     rdi, rdx                ; first column past image_width
        mov     al, JSAMPLE [rdi-1]     ; last real sample of the row

        rep stosb                       ; replicate it rcx times

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax                     ; rax is a zero-extended positive count
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, r12d               ; rowctr = v_samp_factor
        test    eax, eax
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx                     ; output_cols is reloaded for each row
        push    rdi
        push    rsi

        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     rdi, JSAMPROW [rdi]                     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Tail case: fewer than SIZEOF_XMMWORD output columns remain, so only
        ; one XMMWORD per input row is loaded; the second half is treated as
        ; zero.
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        mov     rcx, SIZEOF_XMMWORD     ; makes the "sub" below leave rcx = 0
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        ; First XMMWORD of each row: horizontal pair sums as words.
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1
        pand    xmm0, xmm6              ; even-numbered samples
        psrlw   xmm4, BYTE_BIT          ; odd-numbered samples
        pand    xmm1, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm0, xmm4              ; upper row: a + b
        paddw   xmm1, xmm5              ; lower row: c + d

        ; Second XMMWORD of each row, same treatment.
        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3
        pand    xmm2, xmm6
        psrlw   xmm4, BYTE_BIT
        pand    xmm3, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm2, xmm4
        paddw   xmm3, xmm5

        paddw   xmm0, xmm1              ; a + b + c + d
        paddw   xmm2, xmm3
        paddw   xmm0, xmm7              ; + alternating bias
        paddw   xmm2, xmm7
        psrlw   xmm0, 2                 ; / 4
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; 16 words -> 16 output samples

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx, rcx
        jnz     near .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data (two rows consumed)
        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
| 327 |
| 328 ; For some reason, the OS X linker does not honor the request to align the |
| 329 ; segment unless we do this. |
| 330 align 16 |
OLD | NEW |