OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; jcsamss2.asm - downsampling (SSE2) |
| 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; |
| 6 ; Based on |
| 7 ; x86 SIMD extension for IJG JPEG library |
| 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| 10 ; |
| 11 ; This file should be assembled with NASM (Netwide Assembler), |
| 12 ; can *not* be assembled with Microsoft's MASM or any compatible |
| 13 ; assembler (including Borland's Turbo Assembler). |
| 14 ; NASM is available from http://nasm.sourceforge.net/ or |
| 15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
| 16 ; |
| 17 ; [TAB8] |
| 18 |
| 19 %include "jsimdext.inc" |
| 20 |
| 21 ; -------------------------------------------------------------------------- |
; Everything below is emitted into the SEG_TEXT section and assembled as
; 32-bit code. SEG_TEXT is not defined in this file; presumably it is
; provided by jsimdext.inc -- confirm.
| 22 SECTION SEG_TEXT |
| 23 BITS 32 |
| 24 ; |
| 25 ; Downsample pixel values of a single component. |
| 26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical, |
| 27 ; without smoothing. |
| 28 ; |
| 29 ; GLOBAL(void) |
| 30 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, |
| 31 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks, |
| 32 ; JSAMPARRAY input_data, JSAMPARRAY output_data); |
| 33 ; |
| 34 |
; Stack offsets of the six arguments listed in the prototype above.
; Each macro takes the frame base register and yields the address expression
; of one argument; the routine passes ebp after "push ebp / mov ebp,esp", so
; offset 8 is the first argument and each following one is 4 bytes further.
| 35 %define img_width(b) (b)+8 ; JDIMENSION image_width |
| 36 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor |
| 37 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor |
| 38 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks |
| 39 %define input_data(b) (b)+24 ; JSAMPARRAY input_data |
| 40 %define output_data(b) (b)+28 ; JSAMPARRAY output_data |
| 41 |
; --------------------------------------------------------------------------
; jsimd_h2v1_downsample_sse2: every output sample is built from two
; horizontally adjacent input samples of the same row.
;
; Flow:
;   1. output_cols = width_blocks * 8; return immediately if that is zero.
;   2. For max_v_samp_factor input rows, replicate the last real sample of
;      the row into the bytes from image_width up to output_cols * 2.
;   3. For v_samp_factor rows, reduce each pair of input samples to one
;      output sample: (a + b + bias) >> 1, bias alternating 0, 1, 0, 1, ...
;
; Register use, as seen in the code below:
;   eax   row counter (padding loop first, downsample loop afterwards)
;   ecx   output_cols; temporarily the byte count for rep stosb
;   edx   image_width while padding, then scratch for the bias constant
;   esi   cursor into input_data; inside .rowloop the current input row
;   edi   cursor into output_data; inside .rowloop the current output row
;   xmm6  per-word mask, xmm7 per-word rounding bias
; ebp, esi and edi are saved and restored. eax, ecx, edx, xmm0-xmm3, xmm6
; and xmm7 are overwritten.
;
; Every row access uses movdqa, which faults on an address that is not
; 16-byte aligned, so the input and output row buffers must be aligned.
; --------------------------------------------------------------------------
| 42 align 16 |
| 43 global EXTN(jsimd_h2v1_downsample_sse2) |
| 44 |
| 45 EXTN(jsimd_h2v1_downsample_sse2): |
; Set up ebp as the frame base that the argument-offset macros expect.
| 46 push ebp |
| 47 mov ebp,esp |
| 48 ; push ebx ; unused |
| 49 ; push ecx ; need not be preserved |
| 50 ; push edx ; need not be preserved |
| 51 push esi |
| 52 push edi |
| 53 |
; The shift leaves ZF set when output_cols is zero; in that case there is
; nothing to do. Only esi/edi/ebp are on the stack here, matching .return.
| 54 mov ecx, JDIMENSION [width_blks(ebp)] |
| 55 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) |
| 56 jz near .return |
| 57 |
| 58 mov edx, JDIMENSION [img_width(ebp)] |
| 59 |
| 60 ; -- expand_right_edge |
| 61 |
; Keep output_cols on the stack; ecx becomes the number of padding bytes per
; row (output_cols * 2 - image_width). The jle is a signed test, so padding
; is skipped when that difference is zero or negative.
| 62 push ecx |
| 63 shl ecx,1 ; output_cols * 2 |
| 64 sub ecx,edx |
| 65 jle short .expand_end |
| 66 |
; eax = number of rows to pad; skip the loop when it is not positive.
| 67 mov eax, INT [max_v_samp(ebp)] |
| 68 test eax,eax |
| 69 jle short .expand_end |
| 70 |
; Clear the direction flag so that rep stosb stores towards higher addresses.
| 71 cld |
| 72 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data |
| 73 alignx 16,7 |
| 74 .expandloop: |
; eax and ecx are saved around the fill: al is about to hold the fill byte
; and rep stosb counts ecx down to zero.
| 75 push eax |
| 76 push ecx |
| 77 |
; edi = first byte past the real samples of this row; al = last real sample.
| 78 mov edi, JSAMPROW [esi] |
| 79 add edi,edx |
| 80 mov al, JSAMPLE [edi-1] |
| 81 |
; Write ecx copies of al starting at edi.
| 82 rep stosb |
| 83 |
| 84 pop ecx |
| 85 pop eax |
| 86 |
; Step to the next row pointer and loop while rows remain.
| 87 add esi, byte SIZEOF_JSAMPROW |
| 88 dec eax |
| 89 jg short .expandloop |
| 90 |
; Reached both when padding was skipped and after the loop: balances the
; "push ecx" above.
| 91 .expand_end: |
| 92 pop ecx ; output_cols |
| 93 |
| 94 ; -- h2v1_downsample |
| 95 |
; eax = number of output rows; the signed test returns when it is <= 0.
| 96 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr |
| 97 test eax,eax |
| 98 jle near .return |
| 99 |
; Build the two constants used by every row:
;   xmm7: movd places 0x00010000 in the low dword and pshufd with selector
;         0x00 copies that dword into all four dword lanes.
;   xmm6: pcmpeqw of a register with itself sets every bit; the logical right
;         shift by BYTE_BIT then keeps only the low part of each word.
;         BYTE_BIT is defined outside this file (presumably 8 -- confirm).
| 100 mov edx, 0x00010000 ; bias pattern |
| 101 movd xmm7,edx |
| 102 pcmpeqw xmm6,xmm6 |
| 103 pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} |
| 104 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} |
| 105 |
| 106 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data |
| 107 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data |
| 108 alignx 16,7 |
| 109 .rowloop: |
; Save output_cols and both row-pointer cursors; ecx, edi and esi are all
; consumed by the column loop and restored after it.
| 110 push ecx |
| 111 push edi |
| 112 push esi |
| 113 |
| 114 mov esi, JSAMPROW [esi] ; inptr |
| 115 mov edi, JSAMPROW [edi] ; outptr |
| 116 |
; At least SIZEOF_XMMWORD output columns left: take the full-width path.
; Otherwise fall through into the tail path.
| 117 cmp ecx, byte SIZEOF_XMMWORD |
| 118 jae short .columnloop |
| 119 alignx 16,7 |
| 120 |
; Tail path: fewer than SIZEOF_XMMWORD (but more than zero) output columns
; remain. Only one XMMWORD of input is loaded; xmm1 is zeroed, so the upper
; half of the packed result is zero ((0 + bias) >> 1 is 0 for bias 0 or 1).
; ecx is forced to SIZEOF_XMMWORD so the subtraction after the store leaves
; zero and the column loop ends. The store still writes a full XMMWORD.
| 121 .columnloop_r8: |
| 122 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] |
| 123 pxor xmm1,xmm1 |
| 124 mov ecx, SIZEOF_XMMWORD |
| 125 jmp short .downsample |
| 126 alignx 16,7 |
| 127 |
; Full-width path: load two XMMWORDs of input for one XMMWORD of output.
| 128 .columnloop: |
| 129 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] |
| 130 movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] |
| 131 |
| 132 .downsample: |
| 133 movdqa xmm2,xmm0 |
| 134 movdqa xmm3,xmm1 |
| 135 |
; Split each 16-bit lane into its two samples: the mask keeps the low byte
; (xmm0/xmm1) and the shift brings the high byte down (xmm2/xmm3).
| 136 pand xmm0,xmm6 |
| 137 psrlw xmm2,BYTE_BIT |
| 138 pand xmm1,xmm6 |
| 139 psrlw xmm3,BYTE_BIT |
| 140 |
; Per lane: (low + high + bias) >> 1.
| 141 paddw xmm0,xmm2 |
| 142 paddw xmm1,xmm3 |
| 143 paddw xmm0,xmm7 |
| 144 paddw xmm1,xmm7 |
| 145 psrlw xmm0,1 |
| 146 psrlw xmm1,1 |
| 147 |
; Narrow the word results to bytes (unsigned saturation); xmm0's lanes
; become the low half and xmm1's lanes the high half of the result.
| 148 packuswb xmm0,xmm1 |
| 149 |
| 150 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 |
| 151 |
; Advance: one XMMWORD of output consumed two XMMWORDs of input. Continue on
; the full-width path while enough columns remain, take the tail path for a
; non-zero remainder, otherwise the row is done.
| 152 sub ecx, byte SIZEOF_XMMWORD ; outcol |
| 153 add esi, byte 2*SIZEOF_XMMWORD ; inptr |
| 154 add edi, byte 1*SIZEOF_XMMWORD ; outptr |
| 155 cmp ecx, byte SIZEOF_XMMWORD |
| 156 jae short .columnloop |
| 157 test ecx,ecx |
| 158 jnz short .columnloop_r8 |
| 159 |
; Restore the row-pointer cursors and output_cols saved at .rowloop.
| 160 pop esi |
| 161 pop edi |
| 162 pop ecx |
| 163 |
; One input row yields one output row.
| 164 add esi, byte SIZEOF_JSAMPROW ; input_data |
| 165 add edi, byte SIZEOF_JSAMPROW ; output_data |
| 166 dec eax ; rowctr |
| 167 jg near .rowloop |
| 168 |
; Common exit: undo the prologue pushes in reverse order.
| 169 .return: |
| 170 pop edi |
| 171 pop esi |
| 172 ; pop edx ; need not be preserved |
| 173 ; pop ecx ; need not be preserved |
| 174 ; pop ebx ; unused |
| 175 pop ebp |
| 176 ret |
| 177 |
| 178 ; -------------------------------------------------------------------------- |
| 179 ; |
| 180 ; Downsample pixel values of a single component. |
| 181 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical, |
| 182 ; without smoothing. |
| 183 ; |
| 184 ; GLOBAL(void) |
| 185 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, |
| 186 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks, |
| 187 ; JSAMPARRAY input_data, JSAMPARRAY output_data); |
| 188 ; |
| 189 |
; Stack offsets of the six arguments listed in the prototype above.
; Each macro takes the frame base register and yields the address expression
; of one argument; the routine passes ebp after "push ebp / mov ebp,esp", so
; offset 8 is the first argument and each following one is 4 bytes further.
| 190 %define img_width(b) (b)+8 ; JDIMENSION image_width |
| 191 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor |
| 192 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor |
| 193 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks |
| 194 %define input_data(b) (b)+24 ; JSAMPARRAY input_data |
| 195 %define output_data(b) (b)+28 ; JSAMPARRAY output_data |
| 196 |
; --------------------------------------------------------------------------
; jsimd_h2v2_downsample_sse2: every output sample is built from a 2x2 group
; of input samples, i.e. two horizontally adjacent samples from each of two
; consecutive input rows.
;
; Flow:
;   1. output_cols = width_blocks * 8; return immediately if that is zero.
;   2. For max_v_samp_factor input rows, replicate the last real sample of
;      the row into the bytes from image_width up to output_cols * 2.
;   3. For v_samp_factor output rows, reduce each 2x2 group to one sample:
;      (a + b + c + d + bias) >> 2, bias alternating 1, 2, 1, 2, ...
;
; Register use, as seen in the code below:
;   eax   row counter (padding loop first, downsample loop afterwards)
;   ecx   output_cols; temporarily the byte count for rep stosb
;   edx   image_width while padding, then scratch for the bias constant,
;         then the first input row inside .rowloop
;   esi   cursor into input_data; inside .rowloop the second input row
;   edi   cursor into output_data; inside .rowloop the current output row
;   xmm6  per-word mask, xmm7 per-word rounding bias
; ebp, esi and edi are saved and restored. eax, ecx, edx and xmm0-xmm7 are
; overwritten.
;
; Every row access uses movdqa, which faults on an address that is not
; 16-byte aligned, so the input and output row buffers must be aligned.
; --------------------------------------------------------------------------
| 197 align 16 |
| 198 global EXTN(jsimd_h2v2_downsample_sse2) |
| 199 |
| 200 EXTN(jsimd_h2v2_downsample_sse2): |
; Set up ebp as the frame base that the argument-offset macros expect.
| 201 push ebp |
| 202 mov ebp,esp |
| 203 ; push ebx ; unused |
| 204 ; push ecx ; need not be preserved |
| 205 ; push edx ; need not be preserved |
| 206 push esi |
| 207 push edi |
| 208 |
; The shift leaves ZF set when output_cols is zero; in that case there is
; nothing to do. Only esi/edi/ebp are on the stack here, matching .return.
| 209 mov ecx, JDIMENSION [width_blks(ebp)] |
| 210 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) |
| 211 jz near .return |
| 212 |
| 213 mov edx, JDIMENSION [img_width(ebp)] |
| 214 |
| 215 ; -- expand_right_edge |
| 216 |
; Keep output_cols on the stack; ecx becomes the number of padding bytes per
; row (output_cols * 2 - image_width). The jle is a signed test, so padding
; is skipped when that difference is zero or negative.
| 217 push ecx |
| 218 shl ecx,1 ; output_cols * 2 |
| 219 sub ecx,edx |
| 220 jle short .expand_end |
| 221 |
; eax = number of rows to pad; skip the loop when it is not positive.
| 222 mov eax, INT [max_v_samp(ebp)] |
| 223 test eax,eax |
| 224 jle short .expand_end |
| 225 |
; Clear the direction flag so that rep stosb stores towards higher addresses.
| 226 cld |
| 227 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data |
| 228 alignx 16,7 |
| 229 .expandloop: |
; eax and ecx are saved around the fill: al is about to hold the fill byte
; and rep stosb counts ecx down to zero.
| 230 push eax |
| 231 push ecx |
| 232 |
; edi = first byte past the real samples of this row; al = last real sample.
| 233 mov edi, JSAMPROW [esi] |
| 234 add edi,edx |
| 235 mov al, JSAMPLE [edi-1] |
| 236 |
; Write ecx copies of al starting at edi.
| 237 rep stosb |
| 238 |
| 239 pop ecx |
| 240 pop eax |
| 241 |
; Step to the next row pointer and loop while rows remain.
| 242 add esi, byte SIZEOF_JSAMPROW |
| 243 dec eax |
| 244 jg short .expandloop |
| 245 |
; Reached both when padding was skipped and after the loop: balances the
; "push ecx" above.
| 246 .expand_end: |
| 247 pop ecx ; output_cols |
| 248 |
| 249 ; -- h2v2_downsample |
| 250 |
; eax = number of output rows; the signed test returns when it is <= 0.
| 251 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr |
| 252 test eax,eax |
| 253 jle near .return |
| 254 |
; Build the two constants used by every row:
;   xmm7: movd places 0x00020001 in the low dword and pshufd with selector
;         0x00 copies that dword into all four dword lanes.
;   xmm6: pcmpeqw of a register with itself sets every bit; the logical right
;         shift by BYTE_BIT then keeps only the low part of each word.
;         BYTE_BIT is defined outside this file (presumably 8 -- confirm).
| 255 mov edx, 0x00020001 ; bias pattern |
| 256 movd xmm7,edx |
| 257 pcmpeqw xmm6,xmm6 |
| 258 pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} |
| 259 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} |
| 260 |
| 261 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data |
| 262 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data |
| 263 alignx 16,7 |
| 264 .rowloop: |
; Save output_cols and both row-pointer cursors; ecx, edi and esi are all
; consumed by the column loop and restored after it.
| 265 push ecx |
| 266 push edi |
| 267 push esi |
| 268 |
; edx must be loaded before esi is overwritten, since both row addresses are
; read through the input_data cursor held in esi.
| 269 mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 |
| 270 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 |
| 271 mov edi, JSAMPROW [edi] ; outptr |
| 272 |
; At least SIZEOF_XMMWORD output columns left: take the full-width path.
; Otherwise fall through into the tail path.
| 273 cmp ecx, byte SIZEOF_XMMWORD |
| 274 jae short .columnloop |
| 275 alignx 16,7 |
| 276 |
; Tail path: fewer than SIZEOF_XMMWORD (but more than zero) output columns
; remain. Only one XMMWORD per input row is loaded; xmm2 and xmm3 are zeroed,
; so the upper half of the packed result is zero ((0 + bias) >> 2 is 0 for
; bias 1 or 2). ecx is forced to SIZEOF_XMMWORD so the subtraction after the
; store leaves zero and the column loop ends. The store still writes a full
; XMMWORD.
| 277 .columnloop_r8: |
| 278 movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] |
| 279 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] |
| 280 pxor xmm2,xmm2 |
| 281 pxor xmm3,xmm3 |
| 282 mov ecx, SIZEOF_XMMWORD |
| 283 jmp short .downsample |
| 284 alignx 16,7 |
| 285 |
; Full-width path: load two XMMWORDs from each of the two input rows
; (xmm0/xmm2 from the first row, xmm1/xmm3 from the second).
| 286 .columnloop: |
| 287 movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] |
| 288 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] |
| 289 movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] |
| 290 movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] |
| 291 |
| 292 .downsample: |
; First XMMWORD of each row: per 16-bit lane, add the low byte (kept by the
; mask) and the high byte (shifted down). xmm0 = row 0 sums, xmm1 = row 1.
| 293 movdqa xmm4,xmm0 |
| 294 movdqa xmm5,xmm1 |
| 295 pand xmm0,xmm6 |
| 296 psrlw xmm4,BYTE_BIT |
| 297 pand xmm1,xmm6 |
| 298 psrlw xmm5,BYTE_BIT |
| 299 paddw xmm0,xmm4 |
| 300 paddw xmm1,xmm5 |
| 301 |
; Same for the second XMMWORD of each row: xmm2 = row 0 sums, xmm3 = row 1.
| 302 movdqa xmm4,xmm2 |
| 303 movdqa xmm5,xmm3 |
| 304 pand xmm2,xmm6 |
| 305 psrlw xmm4,BYTE_BIT |
| 306 pand xmm3,xmm6 |
| 307 psrlw xmm5,BYTE_BIT |
| 308 paddw xmm2,xmm4 |
| 309 paddw xmm3,xmm5 |
| 310 |
; Combine the two rows, add the bias and divide by four.
| 311 paddw xmm0,xmm1 |
| 312 paddw xmm2,xmm3 |
| 313 paddw xmm0,xmm7 |
| 314 paddw xmm2,xmm7 |
| 315 psrlw xmm0,2 |
| 316 psrlw xmm2,2 |
| 317 |
; Narrow the word results to bytes (unsigned saturation); xmm0's lanes
; become the low half and xmm2's lanes the high half of the result.
| 318 packuswb xmm0,xmm2 |
| 319 |
| 320 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 |
| 321 |
; Advance: one XMMWORD of output consumed two XMMWORDs from each input row.
; Continue on the full-width path while enough columns remain, take the tail
; path for a non-zero remainder, otherwise the row pair is done.
| 322 sub ecx, byte SIZEOF_XMMWORD ; outcol |
| 323 add edx, byte 2*SIZEOF_XMMWORD ; inptr0 |
| 324 add esi, byte 2*SIZEOF_XMMWORD ; inptr1 |
| 325 add edi, byte 1*SIZEOF_XMMWORD ; outptr |
| 326 cmp ecx, byte SIZEOF_XMMWORD |
| 327 jae near .columnloop |
| 328 test ecx,ecx |
| 329 jnz near .columnloop_r8 |
| 330 |
; Restore the row-pointer cursors and output_cols saved at .rowloop.
| 331 pop esi |
| 332 pop edi |
| 333 pop ecx |
| 334 |
; Two input rows yield one output row.
| 335 add esi, byte 2*SIZEOF_JSAMPROW ; input_data |
| 336 add edi, byte 1*SIZEOF_JSAMPROW ; output_data |
| 337 dec eax ; rowctr |
| 338 jg near .rowloop |
| 339 |
; Common exit: undo the prologue pushes in reverse order.
| 340 .return: |
| 341 pop edi |
| 342 pop esi |
| 343 ; pop edx ; need not be preserved |
| 344 ; pop ecx ; need not be preserved |
| 345 ; pop ebx ; unused |
| 346 pop ebp |
| 347 ret |
| 348 |
; Trailing padding: rounds the end of the code emitted above up to a 16-byte
; boundary.
| 349 ; For some reason, the OS X linker does not honor the request to align the |
| 350 ; segment unless we do this. |
| 351 align 16 |
OLD | NEW |