simd/jdsample-mmx.asm - Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/

Side by Side Diff: simd/jdsample-mmx.asm

Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/ (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 ;

	2 ; jdsample.asm - upsampling (MMX)

	3 ;

	4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

	5 ;

	6 ; Based on

	7 ; x86 SIMD extension for IJG JPEG library

	8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

	9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

	10 ;

	11 ; This file should be assembled with NASM (Netwide Assembler),

	12 ; and cannot be assembled with Microsoft's MASM or any compatible

	13 ; assembler (including Borland's Turbo Assembler).

	14 ; NASM is available from http://nasm.sourceforge.net/ or

	15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

	16 ;

	17 ; [TAB8]

	18

	19 %include "jsimdext.inc"

	20

	21 ; --------------------------------------------------------------------------

	; Packed-word constants for the two "fancy" (triangle filter) upsamplers in
	; this file. Each entry is four identical 16-bit words, i.e. one 64-bit
	; memory operand for pmullw/paddw. They are addressed through
	; GOTOFF(ebx,PW_xxx) in the code below.
	22 SECTION SEG_CONST

	23

	24 alignz 16

	25 global EXTN(jconst_fancy_upsample_mmx)

	26

	27 EXTN(jconst_fancy_upsample_mmx):

	28

	29 PW_ONE times 4 dw 1 ; h2v1 fancy: rounding term, even output samples

	30 PW_TWO times 4 dw 2 ; h2v1 fancy: rounding term, odd output samples

	31 PW_THREE times 4 dw 3 ; both fancy routines: weight of the nearer sample

	32 PW_SEVEN times 4 dw 7 ; h2v2 fancy: rounding term, odd output samples

	33 PW_EIGHT times 4 dw 8 ; h2v2 fancy: rounding term, even output samples

	34

	35 alignz 16

	36

	37 ; --------------------------------------------------------------------------

	; Everything from here on is 32-bit code.
	38 SECTION SEG_TEXT

	39 BITS 32

	40 ;

	41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.

	42 ;

	43 ; The upsampling algorithm is linear interpolation between pixel centers,

	44 ; also known as a "triangle filter". This is a good compromise between

	45 ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4

	46 ; of the way between input pixel centers.

	47 ;

	48 ; GLOBAL(void)

	49 ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,

	50 ; JDIMENSION downsampled_width,

	51 ; JSAMPARRAY input_data,

	52 ; JSAMPARRAY *output_data_ptr);

	53 ;
	; The four arguments are read from the stack relative to ebp after the
	; "push ebp / mov ebp,esp" prologue (offsets in the %defines below).
	; Returns immediately if downsampled_width or max_v_samp_factor is zero.
	;
	; Register roles:
	;   eax = colctr; rounded up to a multiple of SIZEOF_MMWORD before the
	;         column loop and decreased by SIZEOF_MMWORD per block
	;   ecx = rowctr (one input row -> one output row)
	;   ebx = GOT address, used by GOTOFF() to reach the PW_* constants
	;   esi = input_data, and inptr while a row is being processed
	;   edi = output_data, and outptr while a row is being processed
	;   dl  = scratch byte for the dummy sample
	;
	; For every input sample s[i] two output samples are produced:
	;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
	;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
	; At the left edge s[-1] is replaced by s[0]. At the right edge the
	; neighbour is a copy of the last sample: the dummy byte stored at
	; inptr[downsampled_width] when the width is not a multiple of
	; SIZEOF_MMWORD, otherwise the top byte of the last block.
	;
	; NOTE: the dummy byte is written into the INPUT row, one position past its
	; last sample, and whole blocks are always loaded and stored.
	; NOTE(review): this presumably relies on the caller padding both row
	; buffers - confirm against the buffer allocation code.
	;

	54

	55 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

	56 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width

	57 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

	58 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr

	59

	60 align 16

	61 global EXTN(jsimd_h2v1_fancy_upsample_mmx)

	62

	63 EXTN(jsimd_h2v1_fancy_upsample_mmx):

	64 push ebp

	65 mov ebp,esp

	66 pushpic ebx

	67 ; push ecx ; need not be preserved

	68 ; push edx ; need not be preserved

	69 push esi

	70 push edi

	71

	72 get_GOT ebx ; get GOT address

	73

	74 mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr

	75 test eax,eax

	76 jz near .return ; zero width: nothing to do

	77

	78 mov ecx, INT [max_v_samp(ebp)] ; rowctr

	79 test ecx,ecx

	80 jz near .return ; zero rows: nothing to do

	81

	82 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data

	83 mov edi, POINTER [output_data_ptr(ebp)]

	84 mov edi, JSAMPARRAY [edi] ; output_data

	85 alignx 16,7

	86 .rowloop:

	; Save the per-row state; it is restored after the column loop.
	87 push eax ; colctr

	88 push edi

	89 push esi

	90

	91 mov esi, JSAMPROW [esi] ; inptr

	92 mov edi, JSAMPROW [edi] ; outptr

	93

	; If the width is not a whole number of MMWORDs, duplicate the last real
	; sample just past the end of the row so the last sample has a right-hand
	; neighbour.
	94 test eax, SIZEOF_MMWORD-1

	95 jz short .skip

	96 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample

	97 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample

	98 .skip:

	99 pxor mm0,mm0 ; mm0=(all 0's)

	100 pcmpeqb mm7,mm7 ; mm7=(all 1's)

	101 psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=mask for the lowest byte

	102 pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] ; mm7=( 0 - - - - - - -), stands in for sample -1

	103

	; Round colctr up to a multiple of SIZEOF_MMWORD. A row that fits in a
	; single block falls through to .columnloop_last.
	104 add eax, byte SIZEOF_MMWORD-1

	105 and eax, byte -SIZEOF_MMWORD

	106 cmp eax, byte SIZEOF_MMWORD

	107 ja short .columnloop

	108 alignx 16,7

	109

	; Last block of the row: the right-hand neighbour of sample 7 is sample 7.
	110 .columnloop_last:

	111 pcmpeqb mm6,mm6 ; mm6=(all 1's)

	112 psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=mask for the highest byte

	113 pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] ; mm6=( - - - - - - - 7)

	114 jmp short .upsample

	115 alignx 16,7

	116

	; Any other block: the right-hand neighbour of sample 7 is the first
	; sample of the next block.
	117 .columnloop:

	118 movq mm6, MMWORD [esi+1*SIZEOF_MMWORD]

	119 psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=( - - - - - - - 8)

	120

	121 .upsample:

	122 movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]

	123 movq mm2,mm1

	124 movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7)

	125 psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)

	126 psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)

	127

	128 por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6)

	129 por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8)

	130

	; Remember this block's sample 7 as "sample -1" for the next block.
	131 movq mm7,mm1

	132 psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)

	133

	; Widen bytes to words by interleaving with the zero register mm0.
	134 movq mm4,mm1

	135 punpcklbw mm1,mm0 ; mm1=( 0 1 2 3)

	136 punpckhbw mm4,mm0 ; mm4=( 4 5 6 7)

	137 movq mm5,mm2

	138 punpcklbw mm2,mm0 ; mm2=(-1 0 1 2)

	139 punpckhbw mm5,mm0 ; mm5=( 3 4 5 6)

	140 movq mm6,mm3

	141 punpcklbw mm3,mm0 ; mm3=( 1 2 3 4)

	142 punpckhbw mm6,mm0 ; mm6=( 5 6 7 8)

	143

	; mm1/mm4 = 3*current; mm2/mm5 = left+1; mm3/mm6 = right+2
	144 pmullw mm1,[GOTOFF(ebx,PW_THREE)]

	145 pmullw mm4,[GOTOFF(ebx,PW_THREE)]

	146 paddw mm2,[GOTOFF(ebx,PW_ONE)]

	147 paddw mm5,[GOTOFF(ebx,PW_ONE)]

	148 paddw mm3,[GOTOFF(ebx,PW_TWO)]

	149 paddw mm6,[GOTOFF(ebx,PW_TWO)]

	150

	151 paddw mm2,mm1

	152 paddw mm5,mm4

	153 psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6)

	154 psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14)

	155 paddw mm3,mm1

	156 paddw mm6,mm4

	157 psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7)

	158 psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15)

	159

	; Move the odd results into the high byte of each word and merge them with
	; the even results in the low bytes, giving byte-interleaved output.
	160 psllw mm3,BYTE_BIT

	161 psllw mm6,BYTE_BIT

	162 por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)

	163 por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)

	164

	165 movq MMWORD [edi+0*SIZEOF_MMWORD], mm2

	166 movq MMWORD [edi+1*SIZEOF_MMWORD], mm5

	167

	; One input block consumed, two output blocks produced.
	168 sub eax, byte SIZEOF_MMWORD

	169 add esi, byte 1*SIZEOF_MMWORD ; inptr

	170 add edi, byte 2*SIZEOF_MMWORD ; outptr

	171 cmp eax, byte SIZEOF_MMWORD

	172 ja near .columnloop ; more than one block left

	173 test eax,eax

	174 jnz near .columnloop_last ; exactly one block left

	175

	176 pop esi

	177 pop edi

	178 pop eax

	179

	180 add esi, byte SIZEOF_JSAMPROW ; input_data

	181 add edi, byte SIZEOF_JSAMPROW ; output_data

	182 dec ecx ; rowctr

	183 jg near .rowloop

	184

	185 emms ; empty MMX state

	186

	187 .return:

	188 pop edi

	189 pop esi

	190 ; pop edx ; need not be preserved

	191 ; pop ecx ; need not be preserved

	192 poppic ebx

	193 pop ebp

	194 ret

	195

	196 ; --------------------------------------------------------------------------

	197 ;

	198 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.

	199 ; Again a triangle filter; see comments for h2v1 case, above.

	200 ;

	201 ; GLOBAL(void)

	202 ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,

	203 ; JDIMENSION downsampled_width,

	204 ; JSAMPARRAY input_data,

	205 ; JSAMPARRAY *output_data_ptr);

	206 ;
	; Returns immediately if downsampled_width or max_v_samp_factor is zero.
	;
	; Each input row produces two output rows, in two passes per block:
	;  1. vertical:   Int0 = 3*row[0] + row[-1]   (for output row 0)
	;                 Int1 = 3*row[0] + row[+1]   (for output row 1)
	;     The 16-bit intermediates are parked in the output rows themselves.
	;  2. horizontal: out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
	;                 out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
	;     The 8-bit results overwrite the parked intermediates of that block.
	; Int[-1] is replaced by Int[0] at the left edge; in the last block the
	; right-hand neighbour of the top word is that word itself.
	;
	; Register roles inside the row loop:
	;   eax = colctr        ecx = inptr1(above)   ebx = inptr0
	;   esi = inptr1(below) edx = outptr0         edi = outptr1
	; ebx is also needed as the GOT address for GOTOFF(), so the GOT address
	; lives in the gotptr stack slot and is swapped in with pushpic/movpic and
	; swapped out again with poppic around every section that uses PW_*.
	;
	; Stack scratch (addressed from the 8-byte-aligned ebp):
	;   wk(0), wk(1) = left-neighbour word of Int0 / Int1 for the current block
	;   wk(2), wk(3) = right-neighbour word of Int0 / Int1 for the current block
	;
	; NOTE: the row pointers input_data[-1] and input_data[+1] are both read,
	; and a dummy sample is written one position past the last sample of all
	; three input rows when the width is not a multiple of SIZEOF_MMWORD.
	; NOTE(review): this presumably relies on the caller providing context
	; rows and padded buffers - confirm against the calling code.
	;

	207

	208 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

	209 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width

	210 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

	211 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr

	212

	213 %define original_ebp ebp+0

	214 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]

	215 %define WK_NUM 4

	216 %define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr

	217

	218 align 16

	219 global EXTN(jsimd_h2v2_fancy_upsample_mmx)

	220

	221 EXTN(jsimd_h2v2_fancy_upsample_mmx):

	; Build an aligned frame: the pre-alignment stack pointer is stored at
	; [ebp] so the epilogue can restore it with "pop esp".
	222 push ebp

	223 mov eax,esp ; eax = original ebp

	224 sub esp, byte 4

	225 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits

	226 mov [esp],eax

	227 mov ebp,esp ; ebp = aligned ebp

	228 lea esp, [wk(0)]

	229 pushpic eax ; make a room for GOT address

	230 push ebx

	231 ; push ecx ; need not be preserved

	232 ; push edx ; need not be preserved

	233 push esi

	234 push edi

	235

	236 get_GOT ebx ; get GOT address

	237 movpic POINTER [gotptr], ebx ; save GOT address

	238

	; The arguments are addressed through edx, not the realigned ebp.
	239 mov edx,eax ; edx = original ebp

	240 mov eax, JDIMENSION [downsamp_width(edx)] ; colctr

	241 test eax,eax

	242 jz near .return ; zero width: nothing to do

	243

	244 mov ecx, INT [max_v_samp(edx)] ; rowctr

	245 test ecx,ecx

	246 jz near .return ; zero rows: nothing to do

	247

	248 mov esi, JSAMPARRAY [input_data(edx)] ; input_data

	249 mov edi, POINTER [output_data_ptr(edx)]

	250 mov edi, JSAMPARRAY [edi] ; output_data

	251 alignx 16,7

	252 .rowloop:

	; Save the per-row state; it is restored after the column loop.
	253 push eax ; colctr

	254 push ecx

	255 push edi

	256 push esi

	257

	258 mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)

	259 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0

	260 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)

	261 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0

	262 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1

	263

	; If the width is not a whole number of MMWORDs, duplicate the last real
	; sample of each of the three input rows just past its end.
	264 test eax, SIZEOF_MMWORD-1

	265 jz short .skip

	266 push edx ; edx holds outptr0; dl is borrowed as a scratch byte

	267 mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]

	268 mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl

	269 mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]

	270 mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl

	271 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]

	272 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample

	273 pop edx

	274 .skip:

	275 ; -- process the first column block

	276

	277 movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]

	278 movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]

	279 movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]

	280

	; ebx switches from inptr0 to the GOT address until the poppic below.
	281 pushpic ebx

	282 movpic ebx, POINTER [gotptr] ; load GOT address

	283

	284 pxor mm3,mm3 ; mm3=(all 0's)

	285 movq mm4,mm0

	286 punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3)

	287 punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7)

	288 movq mm5,mm1

	289 punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3)

	290 punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7)

	291 movq mm6,mm2

	292 punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3)

	293 punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7)

	294

	295 pmullw mm0,[GOTOFF(ebx,PW_THREE)]

	296 pmullw mm4,[GOTOFF(ebx,PW_THREE)]

	297

	298 pcmpeqb mm7,mm7 ; mm7=(all 1's)

	299 psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm7=mask for the lowest word

	300

	301 paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)

	302 paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)

	303 paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)

	304 paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)

	305

	306 movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save

	307 movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data

	308 movq MMWORD [edi+0*SIZEOF_MMWORD], mm2

	309 movq MMWORD [edi+1*SIZEOF_MMWORD], mm6

	310

	; Left edge: word 0 of each intermediate row stands in for word -1.
	311 pand mm1,mm7 ; mm1=( 0 - - -)

	312 pand mm2,mm7 ; mm2=( 0 - - -)

	313

	314 movq MMWORD [wk(0)], mm1

	315 movq MMWORD [wk(1)], mm2

	316

	317 poppic ebx

	318

	; Round colctr up to a multiple of SIZEOF_MMWORD. A row that fits in a
	; single block falls through to .columnloop_last.
	319 add eax, byte SIZEOF_MMWORD-1

	320 and eax, byte -SIZEOF_MMWORD

	321 cmp eax, byte SIZEOF_MMWORD

	322 ja short .columnloop

	323 alignx 16,7

	324

	325 .columnloop_last:

	326 ; -- process the last column block

	327

	328 pushpic ebx

	329 movpic ebx, POINTER [gotptr] ; load GOT address

	330

	; Right edge: the top word of the current intermediates is reused as the
	; right-hand neighbour.
	331 pcmpeqb mm1,mm1 ; mm1=(all 1's)

	332 psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=mask for the highest word

	333 movq mm2,mm1

	334

	335 pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)

	336 pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)

	337

	338 movq MMWORD [wk(2)], mm1

	339 movq MMWORD [wk(3)], mm2

	340

	341 jmp short .upsample

	342 alignx 16,7

	343

	344 .columnloop:

	345 ; -- process the next column block

	346

	; Vertical pass for the block AFTER the current one; its first word is
	; the right-hand neighbour needed to finish the current block.
	347 movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]

	348 movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]

	349 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]

	350

	351 pushpic ebx

	352 movpic ebx, POINTER [gotptr] ; load GOT address

	353

	354 pxor mm3,mm3 ; mm3=(all 0's)

	355 movq mm4,mm0

	356 punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3)

	357 punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7)

	358 movq mm5,mm1

	359 punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3)

	360 punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7)

	361 movq mm6,mm2

	362 punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3)

	363 punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7)

	364

	365 pmullw mm0,[GOTOFF(ebx,PW_THREE)]

	366 pmullw mm4,[GOTOFF(ebx,PW_THREE)]

	367

	368 paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)

	369 paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)

	370 paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)

	371 paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)

	372

	373 movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save

	374 movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data

	375 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2

	376 movq MMWORD [edi+3*SIZEOF_MMWORD], mm6

	377

	378 psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)

	379 psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)

	380

	381 movq MMWORD [wk(2)], mm1

	382 movq MMWORD [wk(3)], mm2

	383

	; Horizontal pass. Both paths above arrive here with ebx = GOT address and
	; the saved inptr0 on the stack; it is restored by the poppic further down.
	384 .upsample:

	385 ; -- process the upper row

	386

	387 movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)

	388 movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)

	389

	390 movq mm0,mm7

	391 movq mm4,mm3

	392 psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -)

	393 psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)

	394 movq mm5,mm7

	395 movq mm6,mm3

	396 psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)

	397 psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6)

	398

	399 por mm0,mm4 ; mm0=( 1 2 3 4)

	400 por mm5,mm6 ; mm5=( 3 4 5 6)

	401

	402 movq mm1,mm7

	403 movq mm2,mm3

	404 psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)

	405 psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -)

	406 movq mm4,mm3

	407 psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)

	408

	409 por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)

	410 por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)

	411

	; Word 7 of this block becomes word -1 of the next block (upper row).
	412 movq MMWORD [wk(0)], mm4

	413

	; mm7/mm3 = 3*current; mm1/mm5 = left+8; mm0/mm2 = right+7
	414 pmullw mm7,[GOTOFF(ebx,PW_THREE)]

	415 pmullw mm3,[GOTOFF(ebx,PW_THREE)]

	416 paddw mm1,[GOTOFF(ebx,PW_EIGHT)]

	417 paddw mm5,[GOTOFF(ebx,PW_EIGHT)]

	418 paddw mm0,[GOTOFF(ebx,PW_SEVEN)]

	419 paddw mm2,[GOTOFF(ebx,PW_SEVEN)]

	420

	421 paddw mm1,mm7

	422 paddw mm5,mm3

	423 psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6)

	424 psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14)

	425 paddw mm0,mm7

	426 paddw mm2,mm3

	427 psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7)

	428 psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15)

	429

	; Odd results go to the high byte of each word, even results stay low.
	430 psllw mm0,BYTE_BIT

	431 psllw mm2,BYTE_BIT

	432 por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)

	433 por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)

	434

	; Final samples replace the intermediates parked in output row 0.
	435 movq MMWORD [edx+0*SIZEOF_MMWORD], mm1

	436 movq MMWORD [edx+1*SIZEOF_MMWORD], mm5

	437

	438 ; -- process the lower row

	439

	440 movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)

	441 movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)

	442

	443 movq mm7,mm6

	444 movq mm3,mm4

	445 psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -)

	446 psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)

	447 movq mm0,mm6

	448 movq mm2,mm4

	449 psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)

	450 psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6)

	451

	452 por mm7,mm3 ; mm7=( 1 2 3 4)

	453 por mm0,mm2 ; mm0=( 3 4 5 6)

	454

	455 movq mm1,mm6

	456 movq mm5,mm4

	457 psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)

	458 psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -)

	459 movq mm3,mm4

	460 psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)

	461

	462 por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)

	463 por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)

	464

	; Word 7 of this block becomes word -1 of the next block (lower row).
	465 movq MMWORD [wk(1)], mm3

	466

	; mm6/mm4 = 3*current; mm1/mm0 = left+8; mm7/mm5 = right+7
	467 pmullw mm6,[GOTOFF(ebx,PW_THREE)]

	468 pmullw mm4,[GOTOFF(ebx,PW_THREE)]

	469 paddw mm1,[GOTOFF(ebx,PW_EIGHT)]

	470 paddw mm0,[GOTOFF(ebx,PW_EIGHT)]

	471 paddw mm7,[GOTOFF(ebx,PW_SEVEN)]

	472 paddw mm5,[GOTOFF(ebx,PW_SEVEN)]

	473

	474 paddw mm1,mm6

	475 paddw mm0,mm4

	476 psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6)

	477 psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14)

	478 paddw mm7,mm6

	479 paddw mm5,mm4

	480 psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7)

	481 psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15)

	482

	483 psllw mm7,BYTE_BIT

	484 psllw mm5,BYTE_BIT

	485 por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)

	486 por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)

	487

	; Final samples replace the intermediates parked in output row 1.
	488 movq MMWORD [edi+0*SIZEOF_MMWORD], mm1

	489 movq MMWORD [edi+1*SIZEOF_MMWORD], mm0

	490

	; ebx is inptr0 again from here on.
	491 poppic ebx

	492

	; One input block consumed per row, two output blocks produced per row.
	493 sub eax, byte SIZEOF_MMWORD

	494 add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)

	495 add ebx, byte 1*SIZEOF_MMWORD ; inptr0

	496 add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)

	497 add edx, byte 2*SIZEOF_MMWORD ; outptr0

	498 add edi, byte 2*SIZEOF_MMWORD ; outptr1

	499 cmp eax, byte SIZEOF_MMWORD

	500 ja near .columnloop ; more than one block left

	501 test eax,eax

	502 jnz near .columnloop_last ; exactly one block left

	503

	504 pop esi

	505 pop edi

	506 pop ecx

	507 pop eax

	508

	; One input row was consumed and two output rows were produced.
	509 add esi, byte 1*SIZEOF_JSAMPROW ; input_data

	510 add edi, byte 2*SIZEOF_JSAMPROW ; output_data

	511 sub ecx, byte 2 ; rowctr

	512 jg near .rowloop

	513

	514 emms ; empty MMX state

	515

	516 .return:

	517 pop edi

	518 pop esi

	519 ; pop edx ; need not be preserved

	520 ; pop ecx ; need not be preserved

	521 pop ebx

	522 mov esp,ebp ; esp <- aligned ebp

	523 pop esp ; esp <- original ebp

	524 pop ebp

	525 ret

	526

	527 ; --------------------------------------------------------------------------

	528 ;

	529 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.

	530 ; It's still a box filter.

	531 ;

	532 ; GLOBAL(void)

	533 ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,

	534 ; JDIMENSION output_width,

	535 ; JSAMPARRAY input_data,

	536 ; JSAMPARRAY *output_data_ptr);

	537 ;
	; Every input sample is stored twice in succession: unpacking a register
	; with itself (punpcklbw/punpckhbw x,x) duplicates each byte.
	; Returns immediately if the rounded-up width or max_v_samp_factor is zero.
	;
	; Register roles:
	;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
	;   eax = colctr, output bytes still to be written in the current row
	;   ecx = rowctr (one input row -> one output row)
	;   esi = input_data, and inptr while a row is being processed
	;   edi = output_data, and outptr while a row is being processed
	;
	; The column loop is unrolled twice; each half turns one input MMWORD into
	; two output MMWORDs and leaves the loop as soon as colctr reaches zero.
	;
	; NOTE: output is always stored in whole groups of 2*SIZEOF_MMWORD bytes, so
	; bytes beyond output_width may be read and written.
	; NOTE(review): this presumably relies on the caller padding the row
	; buffers - confirm against the buffer allocation code.
	;

	538

	539 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

	540 %define output_width(b) (b)+12 ; JDIMENSION output_width

	541 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

	542 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr

	543

	544 align 16

	545 global EXTN(jsimd_h2v1_upsample_mmx)

	546

	547 EXTN(jsimd_h2v1_upsample_mmx):

	548 push ebp

	549 mov ebp,esp

	550 ; push ebx ; unused

	551 ; push ecx ; need not be preserved

	552 ; push edx ; need not be preserved

	553 push esi

	554 push edi

	555

	; Round the width up; the "and" also sets ZF for the zero-width check.
	556 mov edx, JDIMENSION [output_width(ebp)]

	557 add edx, byte (2*SIZEOF_MMWORD)-1

	558 and edx, byte -(2*SIZEOF_MMWORD)

	559 jz short .return

	560

	561 mov ecx, INT [max_v_samp(ebp)] ; rowctr

	562 test ecx,ecx

	563 jz short .return

	564

	565 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data

	566 mov edi, POINTER [output_data_ptr(ebp)]

	567 mov edi, JSAMPARRAY [edi] ; output_data

	568 alignx 16,7

	569 .rowloop:

	570 push edi

	571 push esi

	572

	573 mov esi, JSAMPROW [esi] ; inptr

	574 mov edi, JSAMPROW [edi] ; outptr

	575 mov eax,edx ; colctr

	576 alignx 16,7

	577 .columnloop:

	578

	; First half: input MMWORD 0 -> output MMWORDs 0 and 1.
	579 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]

	580

	581 movq mm1,mm0

	582 punpcklbw mm0,mm0 ; low 4 samples, each doubled

	583 punpckhbw mm1,mm1 ; high 4 samples, each doubled

	584

	585 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0

	586 movq MMWORD [edi+1*SIZEOF_MMWORD], mm1

	587

	588 sub eax, byte 2*SIZEOF_MMWORD

	589 jz short .nextrow ; row finished

	590

	; Second half: input MMWORD 1 -> output MMWORDs 2 and 3.
	591 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]

	592

	593 movq mm3,mm2

	594 punpcklbw mm2,mm2

	595 punpckhbw mm3,mm3

	596

	597 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2

	598 movq MMWORD [edi+3*SIZEOF_MMWORD], mm3

	599

	600 sub eax, byte 2*SIZEOF_MMWORD

	601 jz short .nextrow ; row finished

	602

	603 add esi, byte 2*SIZEOF_MMWORD ; inptr

	604 add edi, byte 4*SIZEOF_MMWORD ; outptr

	605 jmp short .columnloop

	606 alignx 16,7

	607

	608 .nextrow:

	609 pop esi

	610 pop edi

	611

	612 add esi, byte SIZEOF_JSAMPROW ; input_data

	613 add edi, byte SIZEOF_JSAMPROW ; output_data

	614 dec ecx ; rowctr

	615 jg short .rowloop

	616

	617 emms ; empty MMX state

	618

	619 .return:

	620 pop edi

	621 pop esi

	622 ; pop edx ; need not be preserved

	623 ; pop ecx ; need not be preserved

	624 ; pop ebx ; unused

	625 pop ebp

	626 ret

	627

	628 ; --------------------------------------------------------------------------

	629 ;

	630 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.

	631 ; It's still a box filter.

	632 ;

	633 ; GLOBAL(void)

	634 ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,

	635 ; JDIMENSION output_width,

	636 ; JSAMPARRAY input_data,

	637 ; JSAMPARRAY *output_data_ptr);

	638 ;
	; Every input sample is stored twice in succession, and the resulting row
	; is written to two consecutive output rows.
	; Returns immediately if the rounded-up width or max_v_samp_factor is zero.
	;
	; Register roles:
	;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
	;   eax = colctr, output bytes still to be written per output row
	;   ecx = rowctr, decreased by 2 per input row
	;   esi = input_data, and inptr while a row is being processed
	;   edi = output_data, and outptr1 while a row is being processed
	;   ebx = outptr0 (callee-saved here, hence the push/pop)
	;
	; NOTE: output is always stored in whole groups of 2*SIZEOF_MMWORD bytes, so
	; bytes beyond output_width may be read and written.
	; NOTE(review): this presumably relies on the caller padding the row
	; buffers - confirm against the buffer allocation code.
	;

	639

	640 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

	641 %define output_width(b) (b)+12 ; JDIMENSION output_width

	642 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

	643 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr

	644

	645 align 16

	646 global EXTN(jsimd_h2v2_upsample_mmx)

	647

	648 EXTN(jsimd_h2v2_upsample_mmx):

	649 push ebp

	650 mov ebp,esp

	651 push ebx

	652 ; push ecx ; need not be preserved

	653 ; push edx ; need not be preserved

	654 push esi

	655 push edi

	656

	; Round the width up; the "and" also sets ZF for the zero-width check.
	657 mov edx, JDIMENSION [output_width(ebp)]

	658 add edx, byte (2*SIZEOF_MMWORD)-1

	659 and edx, byte -(2*SIZEOF_MMWORD)

	660 jz near .return

	661

	662 mov ecx, INT [max_v_samp(ebp)] ; rowctr

	663 test ecx,ecx

	664 jz short .return

	665

	666 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data

	667 mov edi, POINTER [output_data_ptr(ebp)]

	668 mov edi, JSAMPARRAY [edi] ; output_data

	669 alignx 16,7

	670 .rowloop:

	671 push edi

	672 push esi

	673

	674 mov esi, JSAMPROW [esi] ; inptr

	675 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0

	676 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1

	677 mov eax,edx ; colctr

	678 alignx 16,7

	679 .columnloop:

	680

	; First half: input MMWORD 0 -> output MMWORDs 0 and 1 of both rows.
	681 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]

	682

	683 movq mm1,mm0

	684 punpcklbw mm0,mm0 ; low 4 samples, each doubled

	685 punpckhbw mm1,mm1 ; high 4 samples, each doubled

	686

	687 movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0

	688 movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1

	689 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0

	690 movq MMWORD [edi+1*SIZEOF_MMWORD], mm1

	691

	692 sub eax, byte 2*SIZEOF_MMWORD

	693 jz short .nextrow ; row finished

	694

	; Second half: input MMWORD 1 -> output MMWORDs 2 and 3 of both rows.
	695 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]

	696

	697 movq mm3,mm2

	698 punpcklbw mm2,mm2

	699 punpckhbw mm3,mm3

	700

	701 movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2

	702 movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3

	703 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2

	704 movq MMWORD [edi+3*SIZEOF_MMWORD], mm3

	705

	706 sub eax, byte 2*SIZEOF_MMWORD

	707 jz short .nextrow ; row finished

	708

	709 add esi, byte 2*SIZEOF_MMWORD ; inptr

	710 add ebx, byte 4*SIZEOF_MMWORD ; outptr0

	711 add edi, byte 4*SIZEOF_MMWORD ; outptr1

	712 jmp short .columnloop

	713 alignx 16,7

	714

	715 .nextrow:

	716 pop esi

	717 pop edi

	718

	; One input row was consumed and two output rows were produced.
	719 add esi, byte 1*SIZEOF_JSAMPROW ; input_data

	720 add edi, byte 2*SIZEOF_JSAMPROW ; output_data

	721 sub ecx, byte 2 ; rowctr

	722 jg short .rowloop

	723

	724 emms ; empty MMX state

	725

	726 .return:

	727 pop edi

	728 pop esi

	729 ; pop edx ; need not be preserved

	730 ; pop ecx ; need not be preserved

	731 pop ebx

	732 pop ebp

	733 ret

	734

	735 ; For some reason, the OS X linker does not honor the request to align the

	736 ; segment unless we do this.

	737 align 16

OLD	NEW

« simd/jccolext-sse2-64.asm ('K') | « simd/jdsample-altivec.c ('k') | simd/jdsample-sse2.asm » ('j') | no next file with comments »