simd/jdsample-sse2.asm - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jdsample-sse2.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 ;

	2 ; jdsample.asm - upsampling (SSE2)

	3 ;

	4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

	5 ;

	6 ; Based on

	7 ; x86 SIMD extension for IJG JPEG library

	8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

	9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

	10 ;

	11 ; This file should be assembled with NASM (Netwide Assembler),

	12 ; can not be assembled with Microsoft's MASM or any compatible

	13 ; assembler (including Borland's Turbo Assembler).

	14 ; NASM is available from http://nasm.sourceforge.net/ or

	15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

	16 ;

	17 ; [TAB8]

	18

	19 %include "jsimdext.inc"

	20

	21 ; --------------------------------------------------------------------------

	22 SECTION SEG_CONST

	23

	24 alignz 16

	25 global EXTN(jconst_fancy_upsample_sse2)

	26

	27 EXTN(jconst_fancy_upsample_sse2): ; start of the word-constant table used by the fancy upsamplers

	28

	29 PW_ONE times 8 dw 1 ; 8 words of 1: added to the left-neighbor term in h2v1 fancy

	30 PW_TWO times 8 dw 2 ; 8 words of 2: added to the right-neighbor term in h2v1 fancy

	31 PW_THREE times 8 dw 3 ; 8 words of 3: multiplier for the center sample (h2v1 and h2v2 fancy)

	32 PW_SEVEN times 8 dw 7 ; 8 words of 7: added to the right-neighbor term in h2v2 fancy

	33 PW_EIGHT times 8 dw 8 ; 8 words of 8: added to the left-neighbor term in h2v2 fancy

	34

	35 alignz 16

	36

	37 ; --------------------------------------------------------------------------

	38 SECTION SEG_TEXT

	39 BITS 32

	40 ;

	41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.

	42 ;

	43 ; The upsampling algorithm is linear interpolation between pixel centers,

	44 ; also known as a "triangle filter". This is a good compromise between

	45 ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4

	46 ; of the way between input pixel centers.

	47 ;

	48 ; GLOBAL(void)

	49 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,

	50 ; JDIMENSION downsampled_width,

	51 ; JSAMPARRAY input_data,

	52 ; JSAMPARRAY *output_data_ptr);

	53 ;

	54

	55 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

	56 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width

	57 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

	58 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr

	59

	60 align 16

	61 global EXTN(jsimd_h2v1_fancy_upsample_sse2)

	62

	63 EXTN(jsimd_h2v1_fancy_upsample_sse2):

	64 push ebp

	65 mov ebp,esp ; ebp = frame base used by the argument macros above

	66 pushpic ebx ; ebx receives the GOT address below

	67 ; push ecx ; need not be preserved

	68 ; push edx ; need not be preserved

	69 push esi

	70 push edi

	71

	72 get_GOT ebx ; get GOT address

	73

	74 mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr

	75 test eax,eax

	76 jz near .return ; nothing to do when the width is 0

	77

	78 mov ecx, INT [max_v_samp(ebp)] ; rowctr

	79 test ecx,ecx

	80 jz near .return ; nothing to do when there are no rows

	81

	82 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data

	83 mov edi, POINTER [output_data_ptr(ebp)]

	84 mov edi, JSAMPARRAY [edi] ; output_data

	85 alignx 16,7

	86 .rowloop:

	87 push eax ; colctr

	88 push edi ; output_data (row pointer array position)

	89 push esi ; input_data (row pointer array position)

	90

	91 mov esi, JSAMPROW [esi] ; inptr

	92 mov edi, JSAMPROW [edi] ; outptr

	93

	94 test eax, SIZEOF_XMMWORD-1 ; is the width a multiple of SIZEOF_XMMWORD?

	95 jz short .skip ; yes: no dummy sample is written

	96 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row

	97 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (copy of the last one) just past the end

	98 .skip:

	99 pxor xmm0,xmm0 ; xmm0=(all 0's)

	100 pcmpeqb xmm7,xmm7 ; xmm7=(all 1's)

	101 psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=byte mask selecting only the lowest byte

	102 pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm7=( 0 -- -- ... -- -- --): sample 0 doubles as its own left neighbor

	103

	104 add eax, byte SIZEOF_XMMWORD-1 ; round colctr up to

	105 and eax, byte -SIZEOF_XMMWORD ; a multiple of SIZEOF_XMMWORD

	106 cmp eax, byte SIZEOF_XMMWORD

	107 ja short .columnloop ; more than one block: the right neighbor comes from the next block

	108 alignx 16,7

	109

	110 .columnloop_last: ; last block of the row: no next block to read

	111 pcmpeqb xmm6,xmm6 ; xmm6=(all 1's)

	112 pslldq xmm6,(SIZEOF_XMMWORD-1) ; xmm6=byte mask selecting only the highest byte

	113 pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm6=(-- -- -- ... -- -- 15): sample 15 doubles as its own right neighbor

	114 jmp short .upsample

	115 alignx 16,7

	116

	117 .columnloop:

	118 movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] ; load the next input block

	119 pslldq xmm6,(SIZEOF_XMMWORD-1) ; xmm6=(-- -- -- ... -- -- 16): its first sample, moved to the highest byte

	120

	121 .upsample:

	122 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] ; load the current input block

	123 movdqa xmm2,xmm1

	124 movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)

	125 pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)

	126 psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)

	127

	128 por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) left neighbors

	129 por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) right neighbors

	130

	131 movdqa xmm7,xmm1

	132 psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) becomes "-1" for the next block

	133

	134 movdqa xmm4,xmm1

	135 punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)

	136 punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)

	137 movdqa xmm5,xmm2

	138 punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)

	139 punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)

	140 movdqa xmm6,xmm3

	141 punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)

	142 punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)

	143

	144 pmullw xmm1,[GOTOFF(ebx,PW_THREE)] ; xmm1=center( 0.. 7)*3

	145 pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; xmm4=center( 8..15)*3

	146 paddw xmm2,[GOTOFF(ebx,PW_ONE)] ; left neighbors + 1

	147 paddw xmm5,[GOTOFF(ebx,PW_ONE)]

	148 paddw xmm3,[GOTOFF(ebx,PW_TWO)] ; right neighbors + 2

	149 paddw xmm6,[GOTOFF(ebx,PW_TWO)]

	150

	151 paddw xmm2,xmm1 ; 3*center + left + 1

	152 paddw xmm5,xmm4

	153 psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)

	154 psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)

	155 paddw xmm3,xmm1 ; 3*center + right + 2

	156 paddw xmm6,xmm4

	157 psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)

	158 psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

	159

	160 psllw xmm3,BYTE_BIT ; shift the odd outputs left by BYTE_BIT bits within each word

	161 psllw xmm6,BYTE_BIT

	162 por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)

	163 por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)

	164

	165 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 ; one input block yields two output blocks

	166 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5

	167

	168 sub eax, byte SIZEOF_XMMWORD ; colctr -= samples consumed

	169 add esi, byte 1*SIZEOF_XMMWORD ; inptr

	170 add edi, byte 2*SIZEOF_XMMWORD ; outptr

	171 cmp eax, byte SIZEOF_XMMWORD

	172 ja near .columnloop ; more than one block left

	173 test eax,eax

	174 jnz near .columnloop_last ; exactly one block left (colctr is a multiple of SIZEOF_XMMWORD)

	175

	176 pop esi ; restore input_data

	177 pop edi ; restore output_data

	178 pop eax ; restore colctr

	179

	180 add esi, byte SIZEOF_JSAMPROW ; input_data

	181 add edi, byte SIZEOF_JSAMPROW ; output_data

	182 dec ecx ; rowctr

	183 jg near .rowloop ; loop while rowctr > 0

	184

	185 .return:

	186 pop edi

	187 pop esi

	188 ; pop edx ; need not be preserved

	189 ; pop ecx ; need not be preserved

	190 poppic ebx

	191 pop ebp

	192 ret

	193

	194 ; --------------------------------------------------------------------------

	195 ;

	196 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.

	197 ; Again a triangle filter; see comments for h2v1 case, above.

	198 ;

	199 ; GLOBAL(void)

	200 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,

	201 ; JDIMENSION downsampled_width,

	202 ; JSAMPARRAY input_data,

	203 ; JSAMPARRAY *output_data_ptr);

	204 ;

	205

	206 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

	207 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width

	208 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

	209 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr

	210

	211 %define original_ebp ebp+0 ; slot at the aligned ebp where the prologue stores the original ebp

	212 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]

	213 %define WK_NUM 4 ; number of xmmword work slots

	214 %define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr

	215

	216 align 16

	217 global EXTN(jsimd_h2v2_fancy_upsample_sse2)

	218

	219 EXTN(jsimd_h2v2_fancy_upsample_sse2):

	220 push ebp

	221 mov eax,esp ; eax = original ebp

	222 sub esp, byte 4 ; room for the saved original ebp

	223 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits

	224 mov [esp],eax ; store original ebp at the aligned address

	225 mov ebp,esp ; ebp = aligned ebp

	226 lea esp, [wk(0)] ; move esp below the wk[] work area

	227 pushpic eax ; make room for the GOT address

	228 push ebx

	229 ; push ecx ; need not be preserved

	230 ; push edx ; need not be preserved

	231 push esi

	232 push edi

	233

	234 get_GOT ebx ; get GOT address

	235 movpic POINTER [gotptr], ebx ; save GOT address

	236

	237 mov edx,eax ; edx = original ebp

	238 mov eax, JDIMENSION [downsamp_width(edx)] ; colctr

	239 test eax,eax

	240 jz near .return ; nothing to do when the width is 0

	241

	242 mov ecx, INT [max_v_samp(edx)] ; rowctr

	243 test ecx,ecx

	244 jz near .return ; nothing to do when there are no rows

	245

	246 mov esi, JSAMPARRAY [input_data(edx)] ; input_data

	247 mov edi, POINTER [output_data_ptr(edx)]

	248 mov edi, JSAMPARRAY [edi] ; output_data

	249 alignx 16,7

	250 .rowloop:

	251 push eax ; colctr

	252 push ecx ; rowctr (ecx is reused as a row pointer below)

	253 push edi ; output_data

	254 push esi ; input_data

	255

	256 mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)

	257 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0

	258 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)

	259 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0

	260 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1

	261

	262 test eax, SIZEOF_XMMWORD-1 ; is the width a multiple of SIZEOF_XMMWORD?

	263 jz short .skip ; yes: no dummy samples are written

	264 push edx ; save outptr0 while dl is used as scratch

	265 mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] ; last real sample of the row above

	266 mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl ; copy it just past the end

	267 mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] ; same for the current row

	268 mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl

	269 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; same for the row below

	270 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample

	271 pop edx ; restore outptr0

	272 .skip:

	273 ; -- process the first column block

	274

	275 movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]

	276 movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]

	277 movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]

	278

	279 pushpic ebx ; preserve inptr0 while ebx holds the GOT address

	280 movpic ebx, POINTER [gotptr] ; load GOT address

	281

	282 pxor xmm3,xmm3 ; xmm3=(all 0's)

	283 movdqa xmm4,xmm0

	284 punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)

	285 punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)

	286 movdqa xmm5,xmm1

	287 punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)

	288 punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)

	289 movdqa xmm6,xmm2

	290 punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)

	291 punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)

	292

	293 pmullw xmm0,[GOTOFF(ebx,PW_THREE)] ; xmm0=row[ 0]( 0.. 7)*3

	294 pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; xmm4=row[ 0]( 8..15)*3

	295

	296 pcmpeqb xmm7,xmm7 ; xmm7=(all 1's)

	297 psrldq xmm7,(SIZEOF_XMMWORD-2) ; xmm7=mask selecting only the lowest word

	298

	299 paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)

	300 paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)

	301 paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)

	302 paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)

	303

	304 movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save

	305 movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data

	306 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 ; (the output rows double as scratch space;

	307 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 ; .upsample overwrites them with final samples)

	308

	309 pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)

	310 pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)

	311

	312 movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=left neighbor for the upper row (column 0 stands in for column -1)

	313 movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=left neighbor for the lower row

	314

	315 poppic ebx ; restore inptr0

	316

	317 add eax, byte SIZEOF_XMMWORD-1 ; round colctr up to

	318 and eax, byte -SIZEOF_XMMWORD ; a multiple of SIZEOF_XMMWORD

	319 cmp eax, byte SIZEOF_XMMWORD

	320 ja short .columnloop ; more than one block: the right neighbor comes from the next block

	321 alignx 16,7

	322

	323 .columnloop_last:

	324 ; -- process the last column block (its intermediate data is already stored at [edx+0..1] and [edi+0..1])

	325

	326 pushpic ebx ; preserve inptr0 while ebx holds the GOT address

	327 movpic ebx, POINTER [gotptr] ; load GOT address

	328

	329 pcmpeqb xmm1,xmm1 ; xmm1=(all 1's)

	330 pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=mask selecting only the highest word

	331 movdqa xmm2,xmm1

	332

	333 pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] ; keep word 15 of Int0H

	334 pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] ; keep word 15 of Int1H

	335

	336 movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)

	337 movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)

	338

	339 jmp near .upsample ; column 15 stands in for column 16 in the last block

	340 alignx 16,7

	341

	342 .columnloop:

	343 ; -- process the next column block

	344

	345 movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]

	346 movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]

	347 movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]

	348

	349 pushpic ebx ; preserve inptr0 while ebx holds the GOT address

	350 movpic ebx, POINTER [gotptr] ; load GOT address

	351

	352 pxor xmm3,xmm3 ; xmm3=(all 0's)

	353 movdqa xmm4,xmm0

	354 punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)

	355 punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)

	356 movdqa xmm5,xmm1

	357 punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)

	358 punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)

	359 movdqa xmm6,xmm2

	360 punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)

	361 punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)

	362

	363 pmullw xmm0,[GOTOFF(ebx,PW_THREE)] ; xmm0=row[ 0]( 0.. 7)*3

	364 pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; xmm4=row[ 0]( 8..15)*3

	365

	366 paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)

	367 paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)

	368 paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)

	369 paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)

	370

	371 movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save

	372 movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data

	373 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 ; (for the next block; it is read back at +0/+1

	374 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 ; once outptr0/outptr1 have advanced)

	375

	376 pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)

	377 pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)

	378

	379 movdqa XMMWORD [wk(2)], xmm1 ; wk(2)=right neighbor for the upper row (word 0 of the next block)

	380 movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=right neighbor for the lower row

	381

	382 .upsample:

	383 ; -- process the upper row

	384

	385 movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] ; reload Int0L saved earlier

	386 movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] ; reload Int0H saved earlier

	387

	388 movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)

	389 movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)

	390 psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)

	391 pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)

	392 movdqa xmm5,xmm7

	393 movdqa xmm6,xmm3

	394 psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)

	395 pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)

	396

	397 por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)

	398 por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)

	399

	400 movdqa xmm1,xmm7

	401 movdqa xmm2,xmm3

	402 pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)

	403 psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)

	404 movdqa xmm4,xmm3

	405 psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)

	406

	407 por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)

	408 por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)

	409

	410 movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=left neighbor for the next block (word 15 of this one)

	411

	412 pmullw xmm7,[GOTOFF(ebx,PW_THREE)] ; xmm7=Int0L*3

	413 pmullw xmm3,[GOTOFF(ebx,PW_THREE)] ; xmm3=Int0H*3

	414 paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] ; left neighbors + 8

	415 paddw xmm5,[GOTOFF(ebx,PW_EIGHT)]

	416 paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] ; right neighbors + 7

	417 paddw xmm2,[GOTOFF(ebx,PW_SEVEN)]

	418

	419 paddw xmm1,xmm7 ; 3*Int0 + left + 8

	420 paddw xmm5,xmm3

	421 psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)

	422 psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)

	423 paddw xmm0,xmm7 ; 3*Int0 + right + 7

	424 paddw xmm2,xmm3

	425 psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)

	426 psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

	427

	428 psllw xmm0,BYTE_BIT ; shift the odd outputs left by BYTE_BIT bits within each word

	429 psllw xmm2,BYTE_BIT

	430 por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)

	431 por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)

	432

	433 movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; final upper-row output replaces the intermediate data

	434 movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5

	435

	436 ; -- process the lower row

	437

	438 movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] ; reload Int1L saved earlier

	439 movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] ; reload Int1H saved earlier

	440

	441 movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)

	442 movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)

	443 psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)

	444 pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)

	445 movdqa xmm0,xmm6

	446 movdqa xmm2,xmm4

	447 psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)

	448 pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)

	449

	450 por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)

	451 por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)

	452

	453 movdqa xmm1,xmm6

	454 movdqa xmm5,xmm4

	455 pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)

	456 psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)

	457 movdqa xmm3,xmm4

	458 psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)

	459

	460 por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)

	461 por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)

	462

	463 movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=left neighbor for the next block (word 15 of this one)

	464

	465 pmullw xmm6,[GOTOFF(ebx,PW_THREE)] ; xmm6=Int1L*3

	466 pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; xmm4=Int1H*3

	467 paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] ; left neighbors + 8

	468 paddw xmm0,[GOTOFF(ebx,PW_EIGHT)]

	469 paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] ; right neighbors + 7

	470 paddw xmm5,[GOTOFF(ebx,PW_SEVEN)]

	471

	472 paddw xmm1,xmm6 ; 3*Int1 + left + 8

	473 paddw xmm0,xmm4

	474 psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)

	475 psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)

	476 paddw xmm7,xmm6 ; 3*Int1 + right + 7

	477 paddw xmm5,xmm4

	478 psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)

	479 psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

	480

	481 psllw xmm7,BYTE_BIT ; shift the odd outputs left by BYTE_BIT bits within each word

	482 psllw xmm5,BYTE_BIT

	483 por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)

	484 por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)

	485

	486 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 ; final lower-row output replaces the intermediate data

	487 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0

	488

	489 poppic ebx ; restore inptr0

	490

	491 sub eax, byte SIZEOF_XMMWORD ; colctr -= samples consumed

	492 add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)

	493 add ebx, byte 1*SIZEOF_XMMWORD ; inptr0

	494 add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)

	495 add edx, byte 2*SIZEOF_XMMWORD ; outptr0

	496 add edi, byte 2*SIZEOF_XMMWORD ; outptr1

	497 cmp eax, byte SIZEOF_XMMWORD

	498 ja near .columnloop ; more than one block left

	499 test eax,eax

	500 jnz near .columnloop_last ; exactly one block left (colctr is a multiple of SIZEOF_XMMWORD)

	501

	502 pop esi ; restore input_data

	503 pop edi ; restore output_data

	504 pop ecx ; restore rowctr

	505 pop eax ; restore colctr

	506

	507 add esi, byte 1*SIZEOF_JSAMPROW ; input_data

	508 add edi, byte 2*SIZEOF_JSAMPROW ; output_data

	509 sub ecx, byte 2 ; rowctr

	510 jg near .rowloop ; loop while rowctr > 0

	511

	512 .return:

	513 pop edi

	514 pop esi

	515 ; pop edx ; need not be preserved

	516 ; pop ecx ; need not be preserved

	517 pop ebx

	518 mov esp,ebp ; esp <- aligned ebp

	519 pop esp ; esp <- original ebp

	520 pop ebp

	521 ret

	522

	523 ; --------------------------------------------------------------------------

	524 ;

	525 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.

	526 ; It's still a box filter.

	527 ;

	528 ; GLOBAL(void)

	529 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,

	530 ; JDIMENSION output_width,

	531 ; JSAMPARRAY input_data,

	532 ; JSAMPARRAY *output_data_ptr);

	533 ;

	534

	535 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

	536 %define output_width(b) (b)+12 ; JDIMENSION output_width

	537 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

	538 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr

	539

	540 align 16

	541 global EXTN(jsimd_h2v1_upsample_sse2)

	542

	543 EXTN(jsimd_h2v1_upsample_sse2):

	544 push ebp

	545 mov ebp,esp ; ebp = frame base used by the argument macros above

	546 ; push ebx ; unused

	547 ; push ecx ; need not be preserved

	548 ; push edx ; need not be preserved

	549 push esi

	550 push edi

	551

	552 mov edx, JDIMENSION [output_width(ebp)] ; edx = output_width

	553 add edx, byte (2*SIZEOF_XMMWORD)-1 ; round up to

	554 and edx, byte -(2*SIZEOF_XMMWORD) ; a multiple of 2*SIZEOF_XMMWORD

	555 jz short .return ; nothing to do when the rounded width is 0

	556

	557 mov ecx, INT [max_v_samp(ebp)] ; rowctr

	558 test ecx,ecx

	559 jz short .return ; nothing to do when there are no rows

	560

	561 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data

	562 mov edi, POINTER [output_data_ptr(ebp)]

	563 mov edi, JSAMPARRAY [edi] ; output_data

	564 alignx 16,7

	565 .rowloop:

	566 push edi ; output_data

	567 push esi ; input_data

	568

	569 mov esi, JSAMPROW [esi] ; inptr

	570 mov edi, JSAMPROW [edi] ; outptr

	571 mov eax,edx ; colctr

	572 alignx 16,7

	573 .columnloop: ; unrolled 2x: each pass handles up to two input blocks

	574

	575 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; load one block of input samples

	576

	577 movdqa xmm1,xmm0

	578 punpcklbw xmm0,xmm0 ; xmm0 = low half of the block, every byte doubled

	579 punpckhbw xmm1,xmm1 ; xmm1 = high half of the block, every byte doubled

	580

	581 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 ; one input block yields two output blocks

	582 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

	583

	584 sub eax, byte 2*SIZEOF_XMMWORD ; colctr -= output samples written

	585 jz short .nextrow ; colctr is a multiple of 2*SIZEOF_XMMWORD, so it reaches exactly 0

	586

	587 movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; load the second input block

	588

	589 movdqa xmm3,xmm2

	590 punpcklbw xmm2,xmm2 ; low half doubled

	591 punpckhbw xmm3,xmm3 ; high half doubled

	592

	593 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2

	594 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

	595

	596 sub eax, byte 2*SIZEOF_XMMWORD ; colctr -= output samples written

	597 jz short .nextrow

	598

	599 add esi, byte 2*SIZEOF_XMMWORD ; inptr

	600 add edi, byte 4*SIZEOF_XMMWORD ; outptr

	601 jmp short .columnloop

	602 alignx 16,7

	603

	604 .nextrow:

	605 pop esi ; restore input_data

	606 pop edi ; restore output_data

	607

	608 add esi, byte SIZEOF_JSAMPROW ; input_data

	609 add edi, byte SIZEOF_JSAMPROW ; output_data

	610 dec ecx ; rowctr

	611 jg short .rowloop ; loop while rowctr > 0

	612

	613 .return:

	614 pop edi

	615 pop esi

	616 ; pop edx ; need not be preserved

	617 ; pop ecx ; need not be preserved

	618 ; pop ebx ; unused

	619 pop ebp

	620 ret

	621

	622 ; --------------------------------------------------------------------------

	623 ;

	624 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.

	625 ; It's still a box filter.

	626 ;

	627 ; GLOBAL(void)

	628 ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,

	629 ; JDIMENSION output_width,

	630 ; JSAMPARRAY input_data,

	631 ; JSAMPARRAY *output_data_ptr);

	632 ;

	633

	634 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

	635 %define output_width(b) (b)+12 ; JDIMENSION output_width

	636 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

	637 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr

	638

	639 align 16

	640 global EXTN(jsimd_h2v2_upsample_sse2)

	641

	642 EXTN(jsimd_h2v2_upsample_sse2):

	643 push ebp

	644 mov ebp,esp ; ebp = frame base used by the argument macros above

	645 push ebx ; ebx is used as outptr0 below

	646 ; push ecx ; need not be preserved

	647 ; push edx ; need not be preserved

	648 push esi

	649 push edi

	650

	651 mov edx, JDIMENSION [output_width(ebp)] ; edx = output_width

	652 add edx, byte (2*SIZEOF_XMMWORD)-1 ; round up to

	653 and edx, byte -(2*SIZEOF_XMMWORD) ; a multiple of 2*SIZEOF_XMMWORD

	654 jz near .return ; nothing to do when the rounded width is 0

	655

	656 mov ecx, INT [max_v_samp(ebp)] ; rowctr

	657 test ecx,ecx

	658 jz near .return ; nothing to do when there are no rows

	659

	660 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data

	661 mov edi, POINTER [output_data_ptr(ebp)]

	662 mov edi, JSAMPARRAY [edi] ; output_data

	663 alignx 16,7

	664 .rowloop:

	665 push edi ; output_data

	666 push esi ; input_data

	667

	668 mov esi, JSAMPROW [esi] ; inptr

	669 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0

	670 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1

	671 mov eax,edx ; colctr

	672 alignx 16,7

	673 .columnloop: ; unrolled 2x: each pass handles up to two input blocks

	674

	675 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; load one block of input samples

	676

	677 movdqa xmm1,xmm0

	678 punpcklbw xmm0,xmm0 ; xmm0 = low half of the block, every byte doubled

	679 punpckhbw xmm1,xmm1 ; xmm1 = high half of the block, every byte doubled

	680

	681 movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 ; the same doubled samples go to

	682 movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 ; both output rows

	683 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

	684 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

	685

	686 sub eax, byte 2*SIZEOF_XMMWORD ; colctr -= output samples written per row

	687 jz short .nextrow ; colctr is a multiple of 2*SIZEOF_XMMWORD, so it reaches exactly 0

	688

	689 movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; load the second input block

	690

	691 movdqa xmm3,xmm2

	692 punpcklbw xmm2,xmm2 ; low half doubled

	693 punpckhbw xmm3,xmm3 ; high half doubled

	694

	695 movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2

	696 movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3

	697 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2

	698 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

	699

	700 sub eax, byte 2*SIZEOF_XMMWORD ; colctr -= output samples written per row

	701 jz short .nextrow

	702

	703 add esi, byte 2*SIZEOF_XMMWORD ; inptr

	704 add ebx, byte 4*SIZEOF_XMMWORD ; outptr0

	705 add edi, byte 4*SIZEOF_XMMWORD ; outptr1

	706 jmp short .columnloop

	707 alignx 16,7

	708

	709 .nextrow:

	710 pop esi ; restore input_data

	711 pop edi ; restore output_data

	712

	713 add esi, byte 1*SIZEOF_JSAMPROW ; input_data

	714 add edi, byte 2*SIZEOF_JSAMPROW ; output_data

	715 sub ecx, byte 2 ; rowctr

	716 jg short .rowloop ; loop while rowctr > 0

	717

	718 .return:

	719 pop edi

	720 pop esi

	721 ; pop edx ; need not be preserved

	722 ; pop ecx ; need not be preserved

	723 pop ebx

	724 pop ebp

	725 ret

	726

	727 ; For some reason, the OS X linker does not honor the request to align the

	728 ; segment unless we do this.

	729 align 16

OLD	NEW

« no previous file with comments | « simd/jdsample-mmx.asm ('k') | simd/jdsample-sse2-64.asm » ('j') | no next file with comments »