simd/jidctflt-sse2.asm - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jidctflt-sse2.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 ;

	2 ; jidctflt.asm - floating-point IDCT (SSE & SSE2)

	3 ;

	4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

	5 ;

	6 ; Based on

	7 ; x86 SIMD extension for IJG JPEG library

	8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

	9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

	10 ;

	11 ; This file should be assembled with NASM (Netwide Assembler),

	12 ; can not be assembled with Microsoft's MASM or any compatible

	13 ; assembler (including Borland's Turbo Assembler).

	14 ; NASM is available from http://nasm.sourceforge.net/ or

	15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

	16 ;

	17 ; This file contains a floating-point implementation of the inverse DCT

	18 ; (Discrete Cosine Transform). The following code is based directly on

	19 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.

	20 ;

	21 ; [TAB8]

	22

	23 %include "jsimdext.inc"

	24 %include "jdct.inc"

	25

	26 ; --------------------------------------------------------------------------

	27

	28 %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5): low float pair of %1 followed by low float pair of %2

	29 shufps %1,%2,0x44 ; imm8 0x44 = 01_00_01_00b: result lanes = (%1[0] %1[1] %2[0] %2[1])

	30 %endmacro

	31

	32 %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7): high float pair of %1 followed by high float pair of %2

	33 shufps %1,%2,0xEE ; imm8 0xEE = 11_10_11_10b: result lanes = (%1[2] %1[3] %2[2] %2[3])

	34 %endmacro

	35

	36 ; --------------------------------------------------------------------------

	37 SECTION SEG_CONST

	38

	39 alignz 16

	40 global EXTN(jconst_idct_float_sse2)

	41

	42 EXTN(jconst_idct_float_sse2):

	43

	44 PD_1_414 times 4 dd 1.414213562373095048801689 ; sqrt(2), replicated to all 4 float lanes

	45 PD_1_847 times 4 dd 1.847759065022573512256366 ; 2*cos(pi/8); multiplies (z10+z12) to form z5

	46 PD_1_082 times 4 dd 1.082392200292393968799446 ; 2*(cos(pi/8)-cos(3*pi/8)); multiplies z12

	47 PD_M2_613 times 4 dd -2.613125929752753055713286 ; -2*(cos(pi/8)+cos(3*pi/8)); multiplies z10

	48 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) = 1.5*2^26; float spacing at this magnitude is 8, so x+magic leaves roundint(x/8) in the low mantissa bits

	49 PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; byte-replicated bias, added with paddb to the packed samples in pass 2

	50

	51 alignz 16

	52

	53 ; --------------------------------------------------------------------------

	54 SECTION SEG_TEXT

	55 BITS 32

	56 ;

	57 ; Perform dequantization and inverse DCT on one block of coefficients.

	58 ;

	59 ; GLOBAL(void)

	60 ; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,

	61 ; JSAMPARRAY output_buf, JDIMENSION output_col)

	62 ;

	63

	64 %define dct_table(b) (b)+8 ; void *dct_table (b points at the saved ebp; +4 is the return address, +8 the first argument)

	65 %define coef_block(b) (b)+12 ; JCOEFPTR coef_block

	66 %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf

	67 %define output_col(b) (b)+20 ; JDIMENSION output_col

	68

	69 %define original_ebp ebp+0 ; slot at the aligned ebp holding the pre-alignment frame address (stored by "mov [esp],eax" below)

	70 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]: spill slots just below the aligned ebp

	71 %define WK_NUM 2

	72 %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT

	73 ; FAST_FLOAT workspace[DCTSIZE2] (placed directly below wk[]; written by pass 1, read by pass 2)

	74

	75 align 16

	76 global EXTN(jsimd_idct_float_sse2)

	77

	78 EXTN(jsimd_idct_float_sse2):

	79 push ebp

	80 mov eax,esp ; eax = original ebp (i.e. address of the just-saved ebp; the argument macros above are relative to it)

	81 sub esp, byte 4 ; reserve room for the saved frame address before aligning

	82 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits

	83 mov [esp],eax ; store the unaligned frame address at [original_ebp]

	84 mov ebp,esp ; ebp = aligned ebp

	85 lea esp, [workspace] ; allocate wk[] and workspace[] below the aligned ebp

	86 push ebx

	87 ; push ecx ; need not be preserved

	88 ; push edx ; need not be preserved

	89 push esi

	90 push edi

	91

	92 get_GOT ebx ; get GOT address (ebx is the base for every GOTOFF() constant reference below)

	93

	94 ; ---- Pass 1: process columns from input, store into work array.

	95

	96 ; mov eax, [original_ebp] ; (left disabled: eax was set in the prologue -- NOTE(review): relies on get_GOT not clobbering eax)

	97 mov edx, POINTER [dct_table(eax)] ; quantptr

	98 mov esi, JCOEFPTR [coef_block(eax)] ; inptr

	99 lea edi, [workspace] ; FAST_FLOAT *wsptr

	100 mov ecx, DCTSIZE/4 ; ctr: each iteration handles 4 columns

	101 alignx 16,7

	102 .columnloop:

	103 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE

	104 mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; quick scalar pre-check on rows 1 and 2; clobbers eax, so pass 2 reloads it from [original_ebp]

	105 or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]

	106 jnz near .columnDCT ; something nonzero already: skip the full test

	107

	108 movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; load rows 1..7 of this 4-column group

	109 movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]

	110 movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]

	111 movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]

	112 movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]

	113 movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

	114 movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

	115 por xmm1,xmm2 ; OR-reduce all seven AC rows into xmm1

	116 por xmm3,xmm4

	117 por xmm5,xmm6

	118 por xmm1,xmm3

	119 por xmm5,xmm7

	120 por xmm1,xmm5

	121 packsswb xmm1,xmm1 ; saturating pack keeps nonzero words nonzero, so the low dword summarises the loaded coefficients

	122 movd eax,xmm1

	123 test eax,eax

	124 jnz short .columnDCT

	125

	126 ; -- AC terms all zero: each column's output is just its dequantized DC term

	127

	128 movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

	129

	130 punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)

	131 psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) (arithmetic shift = sign extension)

	132 cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)

	133

	134 mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize row 0

	135

	136 movaps xmm1,xmm0

	137 movaps xmm2,xmm0

	138 movaps xmm3,xmm0

	139

	140 shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)

	141 shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)

	142 shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)

	143 shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)

	144

	145 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 ; each broadcast DC fills both halves of one workspace row

	146 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0

	147 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1

	148 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

	149 movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2

	150 movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2

	151 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3

	152 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

	153 jmp near .nextcolumn

	154 alignx 16,7

	155 %endif

	156 .columnDCT:

	157

	158 ; -- Even part (rows 0,2,4,6 of the current 4-column group)

	159

	160 movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

	161 movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]

	162 movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]

	163 movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

	164

	165 punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)

	166 punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)

	167 psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)

	168 psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)

	169 cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)

	170 cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)

	171

	172 punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)

	173 punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)

	174 psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)

	175 psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)

	176 cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)

	177 cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)

	178

	179 mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize: multiply each row by its quantptr row

	180 mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

	181 mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

	182 mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

	183

	184 movaps xmm4,xmm0

	185 movaps xmm5,xmm1

	186 subps xmm0,xmm2 ; xmm0=tmp11

	187 subps xmm1,xmm3 ; xmm1=in2-in6

	188 addps xmm4,xmm2 ; xmm4=tmp10

	189 addps xmm5,xmm3 ; xmm5=tmp13

	190

	191 mulps xmm1,[GOTOFF(ebx,PD_1_414)] ; xmm1=(in2-in6)*1.414213562

	192 subps xmm1,xmm5 ; xmm1=tmp12

	193

	194 movaps xmm6,xmm4

	195 movaps xmm7,xmm0

	196 subps xmm4,xmm5 ; xmm4=tmp3

	197 subps xmm0,xmm1 ; xmm0=tmp2

	198 addps xmm6,xmm5 ; xmm6=tmp0

	199 addps xmm7,xmm1 ; xmm7=tmp1

	200

	201 movaps XMMWORD [wk(1)], xmm4 ; tmp3 (spilled: all eight xmm registers are needed for the odd part)

	202 movaps XMMWORD [wk(0)], xmm0 ; tmp2

	203

	204 ; -- Odd part (rows 1,3,5,7)

	205

	206 movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]

	207 movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]

	208 movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]

	209 movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

	210

	211 punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)

	212 punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)

	213 psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)

	214 psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)

	215 cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)

	216 cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)

	217

	218 punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)

	219 punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)

	220 psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)

	221 psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)

	222 cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)

	223 cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)

	224

	225 mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

	226 mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

	227 mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

	228 mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

	229

	230 movaps xmm4,xmm2

	231 movaps xmm0,xmm5

	232 addps xmm2,xmm1 ; xmm2=z11

	233 addps xmm5,xmm3 ; xmm5=z13

	234 subps xmm4,xmm1 ; xmm4=z12

	235 subps xmm0,xmm3 ; xmm0=z10

	236

	237 movaps xmm1,xmm2

	238 subps xmm2,xmm5 ; xmm2=z11-z13

	239 addps xmm1,xmm5 ; xmm1=tmp7

	240

	241 mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11

	242

	243 movaps xmm3,xmm0

	244 addps xmm0,xmm4 ; xmm0=z10+z12

	245 mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5

	246 mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)

	247 mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)

	248 addps xmm3,xmm0 ; xmm3=tmp12

	249 subps xmm4,xmm0 ; xmm4=tmp10

	250

	251 ; -- Final output stage (combine even/odd parts and transpose 4x8 -> 8x4 into the workspace)

	252

	253 subps xmm3,xmm1 ; xmm3=tmp6

	254 movaps xmm5,xmm6

	255 movaps xmm0,xmm7

	256 addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)

	257 addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)

	258 subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)

	259 subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)

	260 subps xmm2,xmm3 ; xmm2=tmp5

	261

	262 movaps xmm1,xmm6 ; transpose coefficients(phase 1)

	263 unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)

	264 unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)

	265 movaps xmm3,xmm0 ; transpose coefficients(phase 1)

	266 unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)

	267 unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)

	268

	269 movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2

	270 movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3

	271

	272 movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) (slots reused now that tmp2/tmp3 are back in registers)

	273 movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)

	274

	275 addps xmm4,xmm2 ; xmm4=tmp4

	276 movaps xmm0,xmm7

	277 movaps xmm3,xmm5

	278 addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)

	279 addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)

	280 subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)

	281 subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)

	282

	283 movaps xmm2,xmm7 ; transpose coefficients(phase 1)

	284 unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)

	285 unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)

	286 movaps xmm4,xmm5 ; transpose coefficients(phase 1)

	287 unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)

	288 unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)

	289

	290 movaps xmm3,xmm6 ; transpose coefficients(phase 2)

	291 unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)

	292 unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)

	293 movaps xmm0,xmm1 ; transpose coefficients(phase 2)

	294 unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)

	295 unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)

	296

	297 movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)

	298 movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)

	299

	300 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 ; left halves of the four workspace rows (input rows 0..3 of each column)

	301 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3

	302 movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1

	303 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

	304

	305 movaps xmm6,xmm5 ; transpose coefficients(phase 2)

	306 unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)

	307 unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)

	308 movaps xmm3,xmm4 ; transpose coefficients(phase 2)

	309 unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)

	310 unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)

	311

	312 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 ; right halves of the same four workspace rows (input rows 4..7)

	313 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

	314 movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4

	315 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

	316

	317 .nextcolumn:

	318 add esi, byte 4*SIZEOF_JCOEF ; coef_block: advance to the next group of 4 columns

	319 add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr: advance by the matching 4 multipliers

	320 add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr: skip the 4 workspace rows (XMMBLOCK rows 0..3, both halves) just written

	321 dec ecx ; ctr

	322 jnz near .columnloop

	323

	324 ; -- Prefetch the next coefficient block (esi has advanced 4*SIZEOF_JCOEF per iteration; the -8 presumably cancels that, i.e. assumes DCTSIZE == 8)

	325

	326 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] ; four prefetches at a 32-byte stride

	327 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]

	328 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]

	329 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

	330

	331 ; ---- Pass 2: process rows from work array, store into output array.

	332

	333 mov eax, [original_ebp] ; reload the frame address (eax was reused as scratch by the zero-column test in pass 1)

	334 lea esi, [workspace] ; FAST_FLOAT *wsptr

	335 mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)

	336 mov eax, JDIMENSION [output_col(eax)] ; eax = output_col from here on

	337 mov ecx, DCTSIZE/4 ; ctr: each iteration produces 4 output rows

	338 alignx 16,7

	339 .rowloop:

	340

	341 ; -- Even part

	342

	343 movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]

	344 movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]

	345 movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]

	346 movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

	347

	348 movaps xmm4,xmm0

	349 movaps xmm5,xmm1

	350 subps xmm0,xmm2 ; xmm0=tmp11

	351 subps xmm1,xmm3 ; xmm1=(workspace row 2 - workspace row 6)

	352 addps xmm4,xmm2 ; xmm4=tmp10

	353 addps xmm5,xmm3 ; xmm5=tmp13

	354

	355 mulps xmm1,[GOTOFF(ebx,PD_1_414)] ; scale the difference by 1.414213562

	356 subps xmm1,xmm5 ; xmm1=tmp12

	357

	358 movaps xmm6,xmm4

	359 movaps xmm7,xmm0

	360 subps xmm4,xmm5 ; xmm4=tmp3

	361 subps xmm0,xmm1 ; xmm0=tmp2

	362 addps xmm6,xmm5 ; xmm6=tmp0

	363 addps xmm7,xmm1 ; xmm7=tmp1

	364

	365 movaps XMMWORD [wk(1)], xmm4 ; tmp3 (spilled across the odd part)

	366 movaps XMMWORD [wk(0)], xmm0 ; tmp2

	367

	368 ; -- Odd part

	369

	370 movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]

	371 movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]

	372 movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]

	373 movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

	374

	375 movaps xmm4,xmm2

	376 movaps xmm0,xmm5

	377 addps xmm2,xmm1 ; xmm2=z11

	378 addps xmm5,xmm3 ; xmm5=z13

	379 subps xmm4,xmm1 ; xmm4=z12

	380 subps xmm0,xmm3 ; xmm0=z10

	381

	382 movaps xmm1,xmm2

	383 subps xmm2,xmm5 ; xmm2=z11-z13

	384 addps xmm1,xmm5 ; xmm1=tmp7

	385

	386 mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11

	387

	388 movaps xmm3,xmm0

	389 addps xmm0,xmm4 ; xmm0=z10+z12

	390 mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5

	391 mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)

	392 mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)

	393 addps xmm3,xmm0 ; xmm3=tmp12

	394 subps xmm4,xmm0 ; xmm4=tmp10

	395

	396 ; -- Final output stage

	397

	398 subps xmm3,xmm1 ; xmm3=tmp6

	399 movaps xmm5,xmm6

	400 movaps xmm0,xmm7

	401 addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)

	402 addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)

	403 subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)

	404 subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)

	405 subps xmm2,xmm3 ; xmm2=tmp5

	406

	407 movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]

	408 pcmpeqd xmm3,xmm3

	409 psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

	410

	411 addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)

	412 addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)

	413 addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)

	414 addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

	415

	416 pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)

	417 pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)

	418 pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)

	419 pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)

	420 por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)

	421 por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)

	422

	423 movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2

	424 movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3

	425

	426 addps xmm4,xmm2 ; xmm4=tmp4

	427 movaps xmm7,xmm1

	428 movaps xmm5,xmm3

	429 addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)

	430 addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)

	431 subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)

	432 subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)

	433

	434 movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]

	435 pcmpeqd xmm4,xmm4

	436 psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

	437

	438 addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)

	439 addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)

	440 addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)

	441 addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

	442

	443 pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)

	444 pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)

	445 pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)

	446 pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)

	447 por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)

	448 por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)

	449

	450 movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]

	451

	452 packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)

	453 packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)

	454 paddb xmm6,xmm2 ; add the CENTERJSAMPLE bias to every packed byte

	455 paddb xmm1,xmm2

	456

	457 movdqa xmm4,xmm6 ; transpose coefficients(phase 2)

	458 punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)

	459 punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

	460

	461 movdqa xmm7,xmm6 ; transpose coefficients(phase 3)

	462 punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)

	463 punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

	464

	465 pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)

	466 pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

	467

	468 pushpic ebx ; save GOT address (ebx is reused as a row pointer below)

	469

	470 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]

	471 mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]

	472 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ; output row 0: low 8 bytes of xmm6

	473 movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 ; output row 2: low 8 bytes of xmm7

	474 mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]

	475 mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]

	476 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 ; output row 1 (halves swapped into the low qword by pshufd)

	477 movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 ; output row 3

	478

	479 poppic ebx ; restore GOT address

	480

	481 add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr

	482 add edi, byte 4*SIZEOF_JSAMPROW ; next 4 output row pointers

	483 dec ecx ; ctr

	484 jnz near .rowloop

	485

	486 pop edi ; restore the registers pushed in the prologue, in reverse order

	487 pop esi

	488 ; pop edx ; need not be preserved

	489 ; pop ecx ; need not be preserved

	490 pop ebx

	491 mov esp,ebp ; esp <- aligned ebp (discards wk[] and workspace[])

	492 pop esp ; esp <- original ebp (the value stored at [original_ebp] by the prologue)

	493 pop ebp ; restore the caller's ebp

	494 ret

	495

	496 ; For some reason, the OS X linker does not honor the request to align the

	497 ; segment unless we do this.

	498 align 16

OLD	NEW

« no previous file with comments | « simd/jidctflt-sse.asm ('k') | simd/jidctflt-sse2-64.asm » ('j') | no next file with comments »