| OLD | NEW |
| (Empty) |
| 1 ; | |
| 2 ; jcsamss2-64.asm - downsampling (64-bit SSE2) | |
| 3 ; | |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
| 5 ; Copyright 2009 D. R. Commander | |
| 6 ; | |
| 7 ; Based on | |
| 8 ; x86 SIMD extension for IJG JPEG library | |
| 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | |
| 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | |
| 11 ; | |
| 12 ; This file should be assembled with NASM (Netwide Assembler), | |
| 13 ; can *not* be assembled with Microsoft's MASM or any compatible | |
| 14 ; assembler (including Borland's Turbo Assembler). | |
| 15 ; NASM is available from http://nasm.sourceforge.net/ or | |
| 16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | |
| 17 ; | |
| 18 ; [TAB8] | |
| 19 | |
| 20 %include "jsimdext.inc" | |
| 21 | |
| 22 ; -------------------------------------------------------------------------- | |
| 23 SECTION SEG_TEXT | |
| 24 BITS 64 | |
| 25 ; | |
| 26 ; Downsample pixel values of a single component. | |
| 27 ; This version handles the common case of 2:1 horizontal and 1:1 vertical, | |
| 28 ; without smoothing: out[i] = (in[2i] + in[2i+1] + bias) >> 1, bias alternating 0, 1, 0, 1, ... | |
| 29 ; | |
| 30 ; GLOBAL(void) | |
| 31 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, | |
| 32 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks, | |
| 33 ; JSAMPARRAY input_data, JSAMPARRAY output_data); | |
| 34 ; | |
| 35 ; Argument registers as used below, after collect_args (macro from jsimdext.inc, not shown here): | |
| 36 ; r10 = JDIMENSION image_width | |
| 37 ; r11 = int max_v_samp_factor | |
| 38 ; r12 = JDIMENSION v_samp_factor | |
| 39 ; r13 = JSAMPARRAY-independent JDIMENSION width_blocks | |
| 40 ; r14 = JSAMPARRAY input_data | |
| 41 ; r15 = JSAMPARRAY output_data | |
| 42 ; r10-r13 are read only through their 32-bit halves, so stale upper bits are never used. | |
| 43 align 16 | |
| 44 global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE | |
| 45 | |
| 46 EXTN(jsimd_h2v1_downsample_sse2): | |
| 47 push rbp | |
| 48 mov rax,rsp ; rax = rsp after the push (presumably consumed by collect_args -- confirm in jsimdext.inc) | |
| 49 mov rbp,rsp | |
| 50 collect_args | |
| 51 | |
| 52 mov ecx, r13d ; width_blocks, zero-extended into rcx | |
| 53 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) | |
| 54 jz near .return ; zero output columns: nothing to do | |
| 55 | |
| 56 mov edx, r10d ; rdx = image_width, zero-extended | |
| 57 | |
| 58 ; -- expand_right_edge | |
| 59 | |
| 60 push rcx ; keep output_cols for the downsample phase | |
| 61 shl rcx,1 ; output_cols * 2 | |
| 62 sub rcx,rdx ; rcx = number of padding columns needed on the right | |
| 63 jle short .expand_end ; input already wide enough | |
| 64 | |
| 65 movsxd rax, r11d ; max_v_samp_factor is a 32-bit int: sign-extend it, never trust the upper half of r11 | |
| 66 test rax,rax | |
| 67 jle short .expand_end ; no rows to pad | |
| 68 | |
| 69 cld ; rep stosb must fill towards higher addresses | |
| 70 mov rsi, r14 ; input_data | |
| 71 .expandloop: | |
| 72 push rax ; al is reused as the fill byte below | |
| 73 push rcx ; rep stosb consumes rcx | |
| 74 | |
| 75 mov rdi, JSAMPROW [rsi] ; rdi = start of this input row | |
| 76 add rdi,rdx ; rdi = first byte past the real samples | |
| 77 mov al, JSAMPLE [rdi-1] ; al = last real sample of the row | |
| 78 | |
| 79 rep stosb ; replicate it rcx times | |
| 80 | |
| 81 pop rcx | |
| 82 pop rax | |
| 83 | |
| 84 add rsi, byte SIZEOF_JSAMPROW ; next row pointer | |
| 85 dec rax | |
| 86 jg short .expandloop | |
| 87 | |
| 88 .expand_end: | |
| 89 pop rcx ; output_cols | |
| 90 | |
| 91 ; -- h2v1_downsample | |
| 92 | |
| 93 mov eax, r12d ; rowctr | |
| 94 test eax,eax | |
| 95 jle near .return | |
| 96 | |
| 97 mov edx, 0x00010000 ; bias pattern (only edx is consumed by movd) | |
| 98 movd xmm7,edx | |
| 99 pcmpeqw xmm6,xmm6 ; xmm6 = all ones | |
| 100 pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} | |
| 101 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} | |
| 102 | |
| 103 mov rsi, r14 ; input_data | |
| 104 mov rdi, r15 ; output_data | |
| 105 .rowloop: | |
| 106 push rcx ; output_cols is reloaded for every row | |
| 107 push rdi | |
| 108 push rsi | |
| 109 | |
| 110 mov rsi, JSAMPROW [rsi] ; inptr | |
| 111 mov rdi, JSAMPROW [rdi] ; outptr | |
| 112 | |
| 113 cmp rcx, byte SIZEOF_XMMWORD | |
| 114 jae short .columnloop ; at least one full XMMWORD of output remains | |
| 115 | |
| 116 .columnloop_r8: ; tail: fewer than SIZEOF_XMMWORD output columns remain | |
| 117 movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] | |
| 118 pxor xmm1,xmm1 ; upper half of the packed result becomes zero | |
| 119 mov rcx, SIZEOF_XMMWORD ; so the sub below leaves rcx = 0 and the loop ends | |
| 120 jmp short .downsample | |
| 121 | |
| 122 .columnloop: | |
| 123 movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; aligned loads: row buffers must be 16-byte aligned | |
| 124 movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD] | |
| 125 | |
| 126 .downsample: | |
| 127 movdqa xmm2,xmm0 | |
| 128 movdqa xmm3,xmm1 | |
| 129 | |
| 130 pand xmm0,xmm6 ; even-indexed samples, one per word | |
| 131 psrlw xmm2,BYTE_BIT ; odd-indexed samples, one per word | |
| 132 pand xmm1,xmm6 | |
| 133 psrlw xmm3,BYTE_BIT | |
| 134 | |
| 135 paddw xmm0,xmm2 ; pairwise sums | |
| 136 paddw xmm1,xmm3 | |
| 137 paddw xmm0,xmm7 ; add alternating bias | |
| 138 paddw xmm1,xmm7 | |
| 139 psrlw xmm0,1 ; divide by 2 | |
| 140 psrlw xmm1,1 | |
| 141 | |
| 142 packuswb xmm0,xmm1 ; words back to bytes: xmm0 results low, xmm1 results high | |
| 143 | |
| 144 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 ; always stores a full XMMWORD, also on the tail path | |
| 145 | |
| 146 sub rcx, byte SIZEOF_XMMWORD ; outcol | |
| 147 add rsi, byte 2*SIZEOF_XMMWORD ; inptr | |
| 148 add rdi, byte 1*SIZEOF_XMMWORD ; outptr | |
| 149 cmp rcx, byte SIZEOF_XMMWORD | |
| 150 jae short .columnloop | |
| 151 test rcx,rcx | |
| 152 jnz short .columnloop_r8 ; a partial XMMWORD of output is left | |
| 153 | |
| 154 pop rsi | |
| 155 pop rdi | |
| 156 pop rcx | |
| 157 | |
| 158 add rsi, byte SIZEOF_JSAMPROW ; input_data | |
| 159 add rdi, byte SIZEOF_JSAMPROW ; output_data | |
| 160 dec rax ; rowctr | |
| 161 jg near .rowloop | |
| 162 | |
| 163 .return: | |
| 164 uncollect_args | |
| 165 pop rbp | |
| 166 ret | |
| 167 | |
| 168 ; -------------------------------------------------------------------------- | |
| 169 ; | |
| 170 ; Downsample pixel values of a single component. | |
| 171 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical, | |
| 172 ; without smoothing: each output sample is (sum of a 2x2 input square + bias) >> 2, bias alternating 1, 2, 1, 2, ... | |
| 173 ; | |
| 174 ; GLOBAL(void) | |
| 175 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, | |
| 176 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks, | |
| 177 ; JSAMPARRAY input_data, JSAMPARRAY output_data); | |
| 178 ; | |
| 179 ; Argument registers as used below, after collect_args (macro from jsimdext.inc, not shown here): | |
| 180 ; r10 = JDIMENSION image_width | |
| 181 ; r11 = int max_v_samp_factor | |
| 182 ; r12 = JDIMENSION v_samp_factor | |
| 183 ; r13 = JDIMENSION width_blocks | |
| 184 ; r14 = JSAMPARRAY input_data | |
| 185 ; r15 = JSAMPARRAY output_data | |
| 186 ; r10-r13 are read only through their 32-bit halves, so stale upper bits are never used. | |
| 187 align 16 | |
| 188 global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE | |
| 189 | |
| 190 EXTN(jsimd_h2v2_downsample_sse2): | |
| 191 push rbp | |
| 192 mov rax,rsp ; rax = rsp after the push (presumably consumed by collect_args -- confirm in jsimdext.inc) | |
| 193 mov rbp,rsp | |
| 194 collect_args | |
| 195 | |
| 196 mov ecx, r13d ; width_blocks, zero-extended into rcx | |
| 197 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) | |
| 198 jz near .return ; zero output columns: nothing to do | |
| 199 | |
| 200 mov edx, r10d ; rdx = image_width, zero-extended | |
| 201 | |
| 202 ; -- expand_right_edge | |
| 203 | |
| 204 push rcx ; keep output_cols for the downsample phase | |
| 205 shl rcx,1 ; output_cols * 2 | |
| 206 sub rcx,rdx ; rcx = number of padding columns needed on the right | |
| 207 jle short .expand_end ; input already wide enough | |
| 208 | |
| 209 movsxd rax, r11d ; max_v_samp_factor is a 32-bit int: sign-extend it, never trust the upper half of r11 | |
| 210 test rax,rax | |
| 211 jle short .expand_end ; no rows to pad | |
| 212 | |
| 213 cld ; rep stosb must fill towards higher addresses | |
| 214 mov rsi, r14 ; input_data | |
| 215 .expandloop: | |
| 216 push rax ; al is reused as the fill byte below | |
| 217 push rcx ; rep stosb consumes rcx | |
| 218 | |
| 219 mov rdi, JSAMPROW [rsi] ; rdi = start of this input row | |
| 220 add rdi,rdx ; rdi = first byte past the real samples | |
| 221 mov al, JSAMPLE [rdi-1] ; al = last real sample of the row | |
| 222 | |
| 223 rep stosb ; replicate it rcx times | |
| 224 | |
| 225 pop rcx | |
| 226 pop rax | |
| 227 | |
| 228 add rsi, byte SIZEOF_JSAMPROW ; next row pointer | |
| 229 dec rax | |
| 230 jg short .expandloop | |
| 231 | |
| 232 .expand_end: | |
| 233 pop rcx ; output_cols | |
| 234 | |
| 235 ; -- h2v2_downsample | |
| 236 | |
| 237 mov eax, r12d ; rowctr | |
| 238 test eax,eax ; 32-bit test, same guard as the h2v1 routine | |
| 239 jle near .return | |
| 240 | |
| 241 mov edx, 0x00020001 ; bias pattern (only edx is consumed by movd) | |
| 242 movd xmm7,edx | |
| 243 pcmpeqw xmm6,xmm6 ; xmm6 = all ones | |
| 244 pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} | |
| 245 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} | |
| 246 | |
| 247 mov rsi, r14 ; input_data | |
| 248 mov rdi, r15 ; output_data | |
| 249 .rowloop: | |
| 250 push rcx ; output_cols is reloaded for every output row | |
| 251 push rdi | |
| 252 push rsi | |
| 253 | |
| 254 mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 | |
| 255 mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 | |
| 256 mov rdi, JSAMPROW [rdi] ; outptr | |
| 257 | |
| 258 cmp rcx, byte SIZEOF_XMMWORD | |
| 259 jae short .columnloop ; at least one full XMMWORD of output remains | |
| 260 | |
| 261 .columnloop_r8: ; tail: fewer than SIZEOF_XMMWORD output columns remain | |
| 262 movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] | |
| 263 movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] | |
| 264 pxor xmm2,xmm2 ; upper half of the packed result becomes zero | |
| 265 pxor xmm3,xmm3 | |
| 266 mov rcx, SIZEOF_XMMWORD ; so the sub below leaves rcx = 0 and the loop ends | |
| 267 jmp short .downsample | |
| 268 | |
| 269 .columnloop: | |
| 270 movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] ; aligned loads: row buffers must be 16-byte aligned | |
| 271 movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] | |
| 272 movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD] | |
| 273 movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD] | |
| 274 | |
| 275 .downsample: | |
| 276 movdqa xmm4,xmm0 | |
| 277 movdqa xmm5,xmm1 | |
| 278 pand xmm0,xmm6 ; even-indexed samples of row 0, one per word | |
| 279 psrlw xmm4,BYTE_BIT ; odd-indexed samples of row 0 | |
| 280 pand xmm1,xmm6 ; even-indexed samples of row 1 | |
| 281 psrlw xmm5,BYTE_BIT ; odd-indexed samples of row 1 | |
| 282 paddw xmm0,xmm4 ; horizontal pair sums, row 0 | |
| 283 paddw xmm1,xmm5 ; horizontal pair sums, row 1 | |
| 284 | |
| 285 movdqa xmm4,xmm2 ; same steps for the second XMMWORD of each row | |
| 286 movdqa xmm5,xmm3 | |
| 287 pand xmm2,xmm6 | |
| 288 psrlw xmm4,BYTE_BIT | |
| 289 pand xmm3,xmm6 | |
| 290 psrlw xmm5,BYTE_BIT | |
| 291 paddw xmm2,xmm4 | |
| 292 paddw xmm3,xmm5 | |
| 293 | |
| 294 paddw xmm0,xmm1 ; 2x2 sums: row 0 + row 1 | |
| 295 paddw xmm2,xmm3 | |
| 296 paddw xmm0,xmm7 ; add alternating bias | |
| 297 paddw xmm2,xmm7 | |
| 298 psrlw xmm0,2 ; divide by 4 | |
| 299 psrlw xmm2,2 | |
| 300 | |
| 301 packuswb xmm0,xmm2 ; words back to bytes: xmm0 results low, xmm2 results high | |
| 302 | |
| 303 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 ; always stores a full XMMWORD, also on the tail path | |
| 304 | |
| 305 sub rcx, byte SIZEOF_XMMWORD ; outcol | |
| 306 add rdx, byte 2*SIZEOF_XMMWORD ; inptr0 | |
| 307 add rsi, byte 2*SIZEOF_XMMWORD ; inptr1 | |
| 308 add rdi, byte 1*SIZEOF_XMMWORD ; outptr | |
| 309 cmp rcx, byte SIZEOF_XMMWORD | |
| 310 jae near .columnloop | |
| 311 test rcx,rcx | |
| 312 jnz near .columnloop_r8 ; a partial XMMWORD of output is left | |
| 313 | |
| 314 pop rsi | |
| 315 pop rdi | |
| 316 pop rcx | |
| 317 | |
| 318 add rsi, byte 2*SIZEOF_JSAMPROW ; input_data | |
| 319 add rdi, byte 1*SIZEOF_JSAMPROW ; output_data | |
| 320 dec rax ; rowctr | |
| 321 jg near .rowloop | |
| 322 | |
| 323 .return: | |
| 324 uncollect_args | |
| 325 pop rbp | |
| 326 ret | |
| 327 | |
| 328 ; For some reason, the OS X linker does not honor the request to align the | |
| 329 ; segment unless we do this. | |
| 330 align 16 | |
| OLD | NEW |