OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; jcsample.asm - downsampling (MMX) |
| 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; |
| 6 ; Based on |
| 7 ; x86 SIMD extension for IJG JPEG library |
| 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| 10 ; |
| 11 ; This file should be assembled with NASM (Netwide Assembler), |
| 12 ; can *not* be assembled with Microsoft's MASM or any compatible |
| 13 ; assembler (including Borland's Turbo Assembler). |
| 14 ; NASM is available from http://nasm.sourceforge.net/ or |
| 15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
| 16 ; |
| 17 ; [TAB8] |
| 18 |
| 19 %include "jsimdext.inc" |
| 20 |
| 21 ; -------------------------------------------------------------------------- |
| 22 SECTION SEG_TEXT |
| 23 BITS 32 |
;
; Horizontal 2:1, vertical 1:1 downsampling of one component (no smoothing).
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Phase 1 (right-edge padding): with output_cols = width_blocks * 8, every
;   one of the max_v_samp_factor input rows is extended from image_width to
;   output_cols * 2 samples by repeating its last sample.  Skipped when no
;   padding is needed or max_v_samp_factor <= 0.
; Phase 2 (downsampling): for each of the v_samp_factor rows, each pair of
;   horizontally adjacent input samples (a, b) yields one output sample
;   (a + b + bias) >> 1, where bias alternates 0, 1, 0, 1, ... along the row.
;
; Arguments are read from the stack relative to ebp (first one at ebp+8).
; Registers: ebp, esi and edi are saved and restored; eax, ecx and edx are
;   used as scratch and not restored; ebx is never touched.
;   mm0-mm3, mm6 and mm7 are used; emms is issued before returning whenever
;   the MMX code has run.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols = width_blocks * 8
        jz      near .done              ; nothing to produce

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- Phase 1: pad the right edge of every input row ----

        push    ecx                     ; keep output_cols for phase 2
        shl     ecx, 1                  ; input columns needed
        sub     ecx, edx                ; ecx = number of samples to append
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> row pointers
        cld                             ; stosb must walk upwards
        alignx  16, 7
.pad_row:
        push    ecx                     ; rep stosb consumes ecx
        push    eax                     ; al is about to hold the fill value

        mov     edi, JSAMPROW [esi]
        mov     al, JSAMPLE [edi+edx-1] ; last real sample of this row
        add     edi, edx                ; edi -> first sample past the row

        rep stosb

        pop     eax
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- Phase 2: average horizontal sample pairs ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows left to emit
        test    eax, eax
        jle     near .done

        pcmpeqw mm6, mm6
        psrlw   mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00010000         ; rounding bias, two words: 0 then 1
        movd    mm7, edx
        punpckldq mm7, mm7              ; mm7={0, 1, 0, 1}

        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output row ptrs
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row ptrs
        alignx  16, 7
.next_row:
        push    esi                     ; row-pointer cursors and the column
        push    edi                     ; count are reused inside the loop
        push    ecx

        mov     edi, JSAMPROW [edi]     ; edi = outptr
        mov     esi, JSAMPROW [esi]     ; esi = inptr
        alignx  16, 7
.next_col:
        ; Two MMWORDs of input become one MMWORD of output per pass.

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2, mm0
        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm3, mm1

        psrlw   mm2, BYTE_BIT           ; high byte of each word (2nd of pair)
        pand    mm0, mm6                ; low byte of each word (1st of pair)
        psrlw   mm3, BYTE_BIT
        pand    mm1, mm6

        paddw   mm0, mm2                ; a + b
        paddw   mm0, mm7                ; + bias
        psrlw   mm0, 1                  ; / 2

        paddw   mm1, mm3
        paddw   mm1, mm7
        psrlw   mm1, 1

        packuswb mm0, mm1               ; words back to bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
        sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
        jnz     short .next_col

        pop     ecx
        pop     edi
        pop     esi

        add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        dec     eax
        jg      short .next_row

        emms                            ; leave the FPU usable for the caller

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
| 164 |
| 165 ; -------------------------------------------------------------------------- |
;
; Horizontal 2:1, vertical 2:1 downsampling of one component (no smoothing).
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Phase 1 (right-edge padding): with output_cols = width_blocks * 8, every
;   one of the max_v_samp_factor input rows is extended from image_width to
;   output_cols * 2 samples by repeating its last sample.  Skipped when no
;   padding is needed or max_v_samp_factor <= 0.
; Phase 2 (downsampling): each of the v_samp_factor output rows is built from
;   two consecutive input rows.  Every 2x2 group of input samples yields one
;   output sample (sum of the four + bias) >> 2, where bias alternates
;   1, 2, 1, 2, ... along the row.
;
; Arguments are read from the stack relative to ebp (first one at ebp+8).
; Registers: ebp, esi and edi are saved and restored; eax, ecx and edx are
;   used as scratch and not restored; ebx is never touched.
;   mm0-mm7 are used; emms is issued before returning whenever the MMX code
;   has run.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols = width_blocks * 8
        jz      near .done              ; nothing to produce

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- Phase 1: pad the right edge of every input row ----

        push    ecx                     ; keep output_cols for phase 2
        shl     ecx, 1                  ; input columns needed
        sub     ecx, edx                ; ecx = number of samples to append
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> row pointers
        cld                             ; stosb must walk upwards
        alignx  16, 7
.pad_row:
        push    ecx                     ; rep stosb consumes ecx
        push    eax                     ; al is about to hold the fill value

        mov     edi, JSAMPROW [esi]
        mov     al, JSAMPLE [edi+edx-1] ; last real sample of this row
        add     edi, edx                ; edi -> first sample past the row

        rep stosb

        pop     eax
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- Phase 2: average 2x2 sample groups ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows left
        test    eax, eax
        jle     near .done

        pcmpeqw mm6, mm6
        psrlw   mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00020001         ; rounding bias, two words: 1 then 2
        movd    mm7, edx
        punpckldq mm7, mm7              ; mm7={1, 2, 1, 2}

        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output row ptrs
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row ptrs
        alignx  16, 7
.next_row:
        push    esi                     ; row-pointer cursors and the column
        push    edi                     ; count are reused inside the loop
        push    ecx

        mov     edi, JSAMPROW [edi]                     ; edi = outptr
        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = inptr0 (upper row)
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = inptr1 (lower row)
        alignx  16, 7
.next_col:
        ; Two MMWORDs from each input row become one MMWORD of output.

        ; -- first MMWORD of both rows -> mm0 (low half of the result)
        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm4, mm0
        movq    mm5, mm1
        psrlw   mm4, BYTE_BIT           ; high byte of each word (2nd of pair)
        pand    mm0, mm6                ; low byte of each word (1st of pair)
        psrlw   mm5, BYTE_BIT
        pand    mm1, mm6
        paddw   mm0, mm4                ; upper-row pair sums
        paddw   mm1, mm5                ; lower-row pair sums
        paddw   mm0, mm1                ; 2x2 sums
        paddw   mm0, mm7                ; + bias
        psrlw   mm0, 2                  ; / 4

        ; -- second MMWORD of both rows -> mm2 (high half of the result)
        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm4, mm2
        movq    mm5, mm3
        psrlw   mm4, BYTE_BIT
        pand    mm2, mm6
        psrlw   mm5, BYTE_BIT
        pand    mm3, mm6
        paddw   mm2, mm4
        paddw   mm3, mm5
        paddw   mm2, mm3
        paddw   mm2, mm7
        psrlw   mm2, 2

        packuswb mm0, mm2               ; words back to bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edx, byte 2*SIZEOF_MMWORD       ; inptr0
        add     esi, byte 2*SIZEOF_MMWORD       ; inptr1
        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
        sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
        jnz     near .next_col

        pop     ecx
        pop     edi
        pop     esi

        add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row pointer
        add     esi, byte 2*SIZEOF_JSAMPROW     ; skip the two rows just used
        dec     eax
        jg      near .next_row

        emms                            ; leave the FPU usable for the caller

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
| 321 |
| 322 ; For some reason, the OS X linker does not honor the request to align the |
| 323 ; segment unless we do this. |
| 324 align 16 |
OLD | NEW |