| OLD | NEW |
| (Empty) |
| 1 ; | |
| 2 ; jcsammmx.asm - downsampling (MMX) | |
| 3 ; | |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
| 5 ; | |
| 6 ; Based on | |
| 7 ; x86 SIMD extension for IJG JPEG library | |
| 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | |
| 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | |
| 10 ; | |
| 11 ; This file should be assembled with NASM (Netwide Assembler), | |
| 12 ; can *not* be assembled with Microsoft's MASM or any compatible | |
| 13 ; assembler (including Borland's Turbo Assembler). | |
| 14 ; NASM is available from http://nasm.sourceforge.net/ or | |
| 15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | |
| 16 ; | |
| 17 ; [TAB8] | |
| 18 | |
| 19 %include "jsimdext.inc" | |
| 20 | |
| 21 ; -------------------------------------------------------------------------- | |
; Everything below is assembled as 32-bit code into the section named by
; SEG_TEXT (a macro that is not defined in this file).
        SECTION SEG_TEXT
        BITS    32
| 24 ; | |
| 25 ; Downsample pixel values of a single component. | |
| 26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical, | |
| 27 ; without smoothing. | |
| 28 ; | |
| 29 ; GLOBAL(void) | |
| 30 ; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor, | |
| 31 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks, | |
| 32 ; JSAMPARRAY input_data, JSAMPARRAY output_data); | |
| 33 ; | |
| 34 | |
; Offsets of the stack-passed arguments of jsimd_h2v1_downsample_mmx, relative
; to a base register (b).  The routine passes ebp after "push ebp / mov ebp,esp",
; so the first argument is at (b)+8 and each following one is 4 bytes further.
%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
| 41 | |
; jsimd_h2v1_downsample_mmx: 2:1 horizontal, 1:1 vertical downsampling.
;
; Register roles:
;   eax  row counter (edge padding first, then downsampling)
;   ecx  output_cols = width_blocks * 8; pad length while padding
;   edx  image_width while padding; scratch for the bias constant afterwards
;   esi  cursor into input_data[], or the current input sample pointer
;   edi  cursor into output_data[], or the current output sample pointer
;   mm6  per-word mask that keeps the low byte of every 16-bit word
;   mm7  per-word rounding bias {0, 1, 0, 1}
; Only ebp, esi and edi are saved: ebx is never used, and ecx/edx need not be
; preserved for the caller.

        align   16
        global  EXTN(jsimd_h2v1_downsample_mmx) PRIVATE

EXTN(jsimd_h2v1_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (the original notes
                                        ;  this as "imul ecx,DCTSIZE")
        jz      near .done              ; zero output columns: nothing to do

        ; ---- Pad the right edge of every input row ----------------------
        ; Each row is extended from image_width to output_cols*2 samples by
        ; repeating its last sample.

        push    ecx                     ; keep output_cols for the main loop
        shl     ecx, 1                  ; ecx = output_cols * 2
        mov     edx, JDIMENSION [img_width(ebp)]
        sub     ecx, edx                ; ecx = number of pad bytes per row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]
        test    eax, eax                ; eax = rows to pad
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]
        cld                             ; rep stosb must store forwards
        alignx  16,7
.pad_row:
        push    eax                     ; al is reused as the fill value
        push    ecx                     ; rep stosb consumes ecx

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first byte past the row data
        mov     al, JSAMPLE [edi-1]     ; al = last sample of the row

        rep stosb                       ; write ecx copies of al at edi

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- Downsample -------------------------------------------------

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows to produce
        test    eax, eax
        jle     near .done

        pcmpeqw mm6, mm6                ; all bits set
        psrlw   mm6, BYTE_BIT           ; mm6 = {0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00010000         ; low word 0, high word 1
        movd    mm7, edx
        punpckldq mm7, mm7              ; mm7 = {0, 1, 0, 1}

        mov     edi, JSAMPARRAY [output_data(ebp)]
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16,7
.next_row:
        push    ecx                     ; output_cols, consumed by inner loop
        push    edi                     ; position in output_data[]
        push    esi                     ; position in input_data[]

        mov     esi, JSAMPROW [esi]     ; esi = input sample pointer
        mov     edi, JSAMPROW [edi]     ; edi = output sample pointer
        alignx  16,7
.next_group:
        ; Two quadwords of input samples yield one quadword of output.
        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm2, mm0
        movq    mm3, mm1

        pand    mm0, mm6                ; low byte of each word
        psrlw   mm2, BYTE_BIT           ; high byte of each word
        pand    mm1, mm6
        psrlw   mm3, BYTE_BIT

        paddw   mm0, mm2                ; sum of each adjacent byte pair
        paddw   mm1, mm3
        paddw   mm0, mm7                ; add the alternating bias
        paddw   mm1, mm7
        psrlw   mm0, 1                  ; halve
        psrlw   mm1, 1

        packuswb mm0, mm1               ; words back to bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     esi, byte 2*SIZEOF_MMWORD
        add     edi, byte 1*SIZEOF_MMWORD
        sub     ecx, byte SIZEOF_MMWORD ; columns left in this row
        jnz     short .next_group

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
        dec     eax
        jg      short .next_row

        emms                            ; empty MMX state

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
| 164 | |
| 165 ; -------------------------------------------------------------------------- | |
| 166 ; | |
| 167 ; Downsample pixel values of a single component. | |
| 168 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical, | |
| 169 ; without smoothing. | |
| 170 ; | |
| 171 ; GLOBAL(void) | |
| 172 ; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor, | |
| 173 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks, | |
| 174 ; JSAMPARRAY input_data, JSAMPARRAY output_data); | |
| 175 ; | |
| 176 | |
; Offsets of the stack-passed arguments of jsimd_h2v2_downsample_mmx, relative
; to a base register (b).  The routine passes ebp after "push ebp / mov ebp,esp",
; so the first argument is at (b)+8 and each following one is 4 bytes further.
%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
| 183 | |
; jsimd_h2v2_downsample_mmx: 2:1 horizontal, 2:1 vertical downsampling.
;
; Register roles:
;   eax  row counter (edge padding first, then output rows)
;   ecx  output_cols = width_blocks * 8; pad length while padding
;   edx  image_width while padding; later the bias constant, then the
;        sample pointer for the upper input row of each pair
;   esi  cursor into input_data[], or the sample pointer for the lower row
;   edi  cursor into output_data[], or the current output sample pointer
;   mm6  per-word mask that keeps the low byte of every 16-bit word
;   mm7  per-word rounding bias {1, 2, 1, 2}
; Only ebp, esi and edi are saved: ebx is never used, and ecx/edx need not be
; preserved for the caller.

        align   16
        global  EXTN(jsimd_h2v2_downsample_mmx) PRIVATE

EXTN(jsimd_h2v2_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (the original notes
                                        ;  this as "imul ecx,DCTSIZE")
        jz      near .done              ; zero output columns: nothing to do

        ; ---- Pad the right edge of every input row ----------------------
        ; Each row is extended from image_width to output_cols*2 samples by
        ; repeating its last sample.

        push    ecx                     ; keep output_cols for the main loop
        shl     ecx, 1                  ; ecx = output_cols * 2
        mov     edx, JDIMENSION [img_width(ebp)]
        sub     ecx, edx                ; ecx = number of pad bytes per row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]
        test    eax, eax                ; eax = rows to pad
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]
        cld                             ; rep stosb must store forwards
        alignx  16,7
.pad_row:
        push    eax                     ; al is reused as the fill value
        push    ecx                     ; rep stosb consumes ecx

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first byte past the row data
        mov     al, JSAMPLE [edi-1]     ; al = last sample of the row

        rep stosb                       ; write ecx copies of al at edi

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- Downsample -------------------------------------------------

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows to produce
        test    eax, eax
        jle     near .done

        pcmpeqw mm6, mm6                ; all bits set
        psrlw   mm6, BYTE_BIT           ; mm6 = {0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00020001         ; low word 1, high word 2
        movd    mm7, edx
        punpckldq mm7, mm7              ; mm7 = {1, 2, 1, 2}

        mov     edi, JSAMPARRAY [output_data(ebp)]
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16,7
.next_row:
        push    ecx                     ; output_cols, consumed by inner loop
        push    edi                     ; position in output_data[]
        push    esi                     ; position in input_data[]

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; upper input row
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; lower input row
        mov     edi, JSAMPROW [edi]                     ; output row
        alignx  16,7
.next_group:
        ; Two quadwords from each of the two input rows yield one quadword
        ; of output.
        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]

        ; First quadword of each row: add the two bytes of every word.
        movq    mm4, mm0
        movq    mm5, mm1
        pand    mm0, mm6                ; low byte of each word
        psrlw   mm4, BYTE_BIT           ; high byte of each word
        pand    mm1, mm6
        psrlw   mm5, BYTE_BIT
        paddw   mm0, mm4                ; upper-row pair sums
        paddw   mm1, mm5                ; lower-row pair sums

        ; Second quadword of each row, same treatment.
        movq    mm4, mm2
        movq    mm5, mm3
        pand    mm2, mm6
        psrlw   mm4, BYTE_BIT
        pand    mm3, mm6
        psrlw   mm5, BYTE_BIT
        paddw   mm2, mm4
        paddw   mm3, mm5

        paddw   mm0, mm1                ; four-sample sums (2x2 blocks)
        paddw   mm2, mm3
        paddw   mm0, mm7                ; add the alternating bias
        paddw   mm2, mm7
        psrlw   mm0, 2                  ; divide by four
        psrlw   mm2, 2

        packuswb mm0, mm2               ; words back to bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edx, byte 2*SIZEOF_MMWORD
        add     esi, byte 2*SIZEOF_MMWORD
        add     edi, byte 1*SIZEOF_MMWORD
        sub     ecx, byte SIZEOF_MMWORD ; columns left in this row
        jnz     near .next_group

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte 2*SIZEOF_JSAMPROW     ; skip the two rows consumed
        add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row pointer
        dec     eax
        jg      near .next_row

        emms                            ; empty MMX state

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
| 321 | |
; Trailing alignment directive: per the original note, the OS X linker does
; not honor the requested segment alignment unless this is present.
        align   16
| OLD | NEW |