simd/jidctflt-sse2-64.asm - Issue 1939823002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jidctflt-sse2-64.asm

Issue 1939823002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Response to comments Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 ;

	2 ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)

	3 ;

	4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

	5 ; Copyright 2009 D. R. Commander

	6 ;

	7 ; Based on

	8 ; x86 SIMD extension for IJG JPEG library

	9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

	10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

	11 ;

	12 ; This file should be assembled with NASM (Netwide Assembler),

	13 ; can not be assembled with Microsoft's MASM or any compatible

	14 ; assembler (including Borland's Turbo Assembler).

	15 ; NASM is available from http://nasm.sourceforge.net/ or

	16 ; http://sourceforge.net/project/showfiles.php?group_id=6208

	17 ;

	18 ; This file contains a floating-point implementation of the inverse DCT

	19 ; (Discrete Cosine Transform). The following code is based directly on

	20 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.

	21 ;

	22 ; [TAB8]

	23

	24 %include "jsimdext.inc"

	25 %include "jdct.inc"

	26

	27 ; --------------------------------------------------------------------------

	28

	; unpcklps2 dst, src
	;   Combine the low halves of two 4-float vectors.
	;   In : %1 = (d0 d1 d2 d3), %2 = (s0 s1 s2 s3)
	;   Out: %1 = (d0 d1 s0 s1)
%macro	unpcklps2 2
	; shufps selector, two bits per result lane (lane 3 .. lane 0):
	;   01 00 from %2, 01 00 from %1
	shufps	%1, %2, 01000100b
%endmacro

	32

	; unpckhps2 dst, src
	;   Combine the high halves of two 4-float vectors.
	;   In : %1 = (d0 d1 d2 d3), %2 = (s0 s1 s2 s3)
	;   Out: %1 = (d2 d3 s2 s3)
%macro	unpckhps2 2
	; shufps selector, two bits per result lane (lane 3 .. lane 0):
	;   11 10 from %2, 11 10 from %1
	shufps	%1, %2, 11101110b
%endmacro

	36

	37 ; --------------------------------------------------------------------------

	; Constant pool for the floating-point IDCT.  Every entry is one
	; 16-byte vector (the same value replicated in each lane); the pool
	; starts on a 16-byte boundary and is padded to one at the end.
	SECTION	SEG_CONST

	alignz	16
	global	EXTN(jconst_idct_float_sse2)

EXTN(jconst_idct_float_sse2):

	; Butterfly multipliers (4 x float each).  PD_1_414 is sqrt(2).
PD_1_414	times 4 dd	1.414213562373095048801689
PD_1_847	times 4 dd	1.847759065022573512256366
PD_1_082	times 4 dd	1.082392200292393968799446
PD_M2_613	times 4 dd	-2.613125929752753055713286

	; Rounding constant: (float)(0x00C00000 << 3) = 1.5 * 2^26.
	; Floats of this magnitude are spaced 8 apart, so adding it to x
	; rounds x to a multiple of 8 and leaves round(x/8) in the low
	; 16 bits of the result's bit pattern (pass 2 extracts those bits).
PD_RNDINT_MAGIC	times 4 dd	100663296.0

	; 16 bytes of CENTERJSAMPLE, added to the packed output samples.
PB_CENTERJSAMP	times 16 db	CENTERJSAMPLE

	alignz	16

	53

	54 ; --------------------------------------------------------------------------

	; --------------------------------------------------------------------
	; GLOBAL(void)
	; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
	;                        JSAMPARRAY output_buf, JDIMENSION output_col)
	;
	; Dequantize one block of coefficients and run the inverse DCT on it.
	;
	; Register roles once collect_args has run:
	;   r10 = void *dct_table
	;   r11 = JCOEFPTR coef_block
	;   r12 = JSAMPARRAY output_buf
	;   r13 = JDIMENSION output_col
	; (collect_args / uncollect_args come from an include file; presumably
	;  they copy the ABI argument registers into r10-r13 -- confirm there.)
	;
	; Stack frame, addressed through the 16-byte-aligned rbp:
	;   [rbp+0]        value rsp had right after "push rbp"
	;   wk(0), wk(1)   two xmmword scratch slots just below rbp
	;   workspace      FAST_FLOAT workspace[DCTSIZE2], just below wk(0)
	; --------------------------------------------------------------------
	SECTION	SEG_TEXT
	BITS	64

%define	WK_NUM		2
%define	original_rbp	rbp+0
	; xmmword wk[WK_NUM]
%define	wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD
	; FAST_FLOAT workspace[DCTSIZE2]
%define	workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT

	align	16
	global	EXTN(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
	push	rbp
	mov	rax, rsp			; rax = rsp, i.e. address of the saved rbp
	sub	rsp, byte 4
	and	rsp, byte (-SIZEOF_XMMWORD)	; round rsp down to a 128-bit boundary
	mov	[rsp], rax			; keep the unaligned rsp for the epilogue
	mov	rbp, rsp			; rbp = aligned frame base
	lea	rsp, [workspace]		; reserve wk[] and workspace[]
	collect_args
	push	rbx				; rbx is used as a row pointer in pass 2

	89

	; ---- Pass 1: process columns from input, store into work array.
	;
	; Each iteration handles four columns (one 4-float vector per input
	; row) and writes them transposed into the workspace.

	mov	rdx, r10			; rdx = quantptr
	mov	rsi, r11			; rsi = inptr
	lea	rdi, [workspace]		; rdi = FAST_FLOAT *wsptr
	mov	rcx, DCTSIZE/4			; rcx = ctr
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
	; Cheap early-out: look at the first dword of rows 1 and 2 only.
	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
	jnz	near .columnDCT

	; Full test: OR together rows 1..7 of these four columns.
	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
	por	xmm1, xmm2
	por	xmm3, xmm4
	por	xmm5, xmm6
	por	xmm1, xmm3
	por	xmm5, xmm7
	por	xmm1, xmm5
	packsswb xmm1, xmm1			; squeeze the four words into 32 bits
	movd	eax, xmm1
	test	rax, rax
	jnz	short .columnDCT

	; -- AC terms all zero: each column's output is its dequantized
	;    DC term repeated down the whole column.

	movq	xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

	punpcklwd xmm0, xmm0			; xmm0=(00 00 01 01 02 02 03 03)
	psrad	xmm0, (DWORD_BIT-WORD_BIT)	; sign-extend: xmm0=in0=(00 01 02 03)
	cvtdq2ps xmm0, xmm0			; int32 -> float

	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

	movaps	xmm1, xmm0
	movaps	xmm2, xmm0
	movaps	xmm3, xmm0

	shufps	xmm0, xmm0, 0x00		; xmm0=(00 00 00 00)
	shufps	xmm1, xmm1, 0x55		; xmm1=(01 01 01 01)
	shufps	xmm2, xmm2, 0xAA		; xmm2=(02 02 02 02)
	shufps	xmm3, xmm3, 0xFF		; xmm3=(03 03 03 03)

	; Workspace row k receives column k's value in all eight slots.
	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
	jmp	near .nextcolumn
%endif

	; Full 1-D IDCT on four columns at once.  Lane comments use
	; "rc" = row r, column c of the current 8x4 input strip.
.columnDCT:

	; -- Even part: load rows 0/2/4/6, widen to float, dequantize.

	movq	xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
	movq	xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
	movq	xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
	movq	xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

	punpcklwd xmm0, xmm0			; xmm0=(00 00 01 01 02 02 03 03)
	punpcklwd xmm1, xmm1			; xmm1=(20 20 21 21 22 22 23 23)
	psrad	xmm0, (DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
	psrad	xmm1, (DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
	cvtdq2ps xmm0, xmm0			; xmm0=in0 as float
	cvtdq2ps xmm1, xmm1			; xmm1=in2 as float

	punpcklwd xmm2, xmm2			; xmm2=(40 40 41 41 42 42 43 43)
	punpcklwd xmm3, xmm3			; xmm3=(60 60 61 61 62 62 63 63)
	psrad	xmm2, (DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
	psrad	xmm3, (DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
	cvtdq2ps xmm2, xmm2			; xmm2=in4 as float
	cvtdq2ps xmm3, xmm3			; xmm3=in6 as float

	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
	mulps	xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
	mulps	xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
	mulps	xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

	movaps	xmm4, xmm0
	movaps	xmm5, xmm1
	subps	xmm0, xmm2			; xmm0=tmp11
	subps	xmm1, xmm3
	addps	xmm4, xmm2			; xmm4=tmp10
	addps	xmm5, xmm3			; xmm5=tmp13

	mulps	xmm1, [rel PD_1_414]
	subps	xmm1, xmm5			; xmm1=tmp12

	movaps	xmm6, xmm4
	movaps	xmm7, xmm0
	subps	xmm4, xmm5			; xmm4=tmp3
	subps	xmm0, xmm1			; xmm0=tmp2
	addps	xmm6, xmm5			; xmm6=tmp0
	addps	xmm7, xmm1			; xmm7=tmp1

	; Spill tmp2/tmp3: the odd part needs every register but xmm6/xmm7.
	movaps	XMMWORD [wk(1)], xmm4		; wk(1)=tmp3
	movaps	XMMWORD [wk(0)], xmm0		; wk(0)=tmp2

	; -- Odd part: load rows 1/3/5/7, widen to float, dequantize.

	movq	xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
	movq	xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

	punpcklwd xmm2, xmm2			; xmm2=(10 10 11 11 12 12 13 13)
	punpcklwd xmm3, xmm3			; xmm3=(30 30 31 31 32 32 33 33)
	psrad	xmm2, (DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
	psrad	xmm3, (DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
	cvtdq2ps xmm2, xmm2			; xmm2=in1 as float
	cvtdq2ps xmm3, xmm3			; xmm3=in3 as float

	punpcklwd xmm5, xmm5			; xmm5=(50 50 51 51 52 52 53 53)
	punpcklwd xmm1, xmm1			; xmm1=(70 70 71 71 72 72 73 73)
	psrad	xmm5, (DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
	psrad	xmm1, (DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
	cvtdq2ps xmm5, xmm5			; xmm5=in5 as float
	cvtdq2ps xmm1, xmm1			; xmm1=in7 as float

	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
	mulps	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
	mulps	xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
	mulps	xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

	movaps	xmm4, xmm2
	movaps	xmm0, xmm5
	addps	xmm2, xmm1			; xmm2=z11
	addps	xmm5, xmm3			; xmm5=z13
	subps	xmm4, xmm1			; xmm4=z12
	subps	xmm0, xmm3			; xmm0=z10

	movaps	xmm1, xmm2
	subps	xmm2, xmm5
	addps	xmm1, xmm5			; xmm1=tmp7

	mulps	xmm2, [rel PD_1_414]		; xmm2=tmp11

	movaps	xmm3, xmm0
	addps	xmm0, xmm4
	mulps	xmm0, [rel PD_1_847]		; xmm0=z5
	mulps	xmm3, [rel PD_M2_613]		; xmm3=(z10 * -2.613125930)
	mulps	xmm4, [rel PD_1_082]		; xmm4=(z12 * 1.082392200)
	addps	xmm3, xmm0			; xmm3=tmp12
	subps	xmm4, xmm0			; xmm4=tmp10

	; -- Final output stage

	subps	xmm3, xmm1			; xmm3=tmp6
	movaps	xmm5, xmm6
	movaps	xmm0, xmm7
	addps	xmm6, xmm1			; xmm6=data0=(00 01 02 03)
	addps	xmm7, xmm3			; xmm7=data1=(10 11 12 13)
	subps	xmm5, xmm1			; xmm5=data7=(70 71 72 73)
	subps	xmm0, xmm3			; xmm0=data6=(60 61 62 63)
	subps	xmm2, xmm3			; xmm2=tmp5

	; Transpose, phase 1: interleave row pairs 0/1 and 6/7.
	movaps	xmm1, xmm6
	unpcklps xmm6, xmm7			; xmm6=(00 10 01 11)
	unpckhps xmm1, xmm7			; xmm1=(02 12 03 13)
	movaps	xmm3, xmm0
	unpcklps xmm0, xmm5			; xmm0=(60 70 61 71)
	unpckhps xmm3, xmm5			; xmm3=(62 72 63 73)

	; Swap the spilled tmp2/tmp3 with the interleaved 6/7 pairs.
	movaps	xmm7, XMMWORD [wk(0)]		; xmm7=tmp2
	movaps	xmm5, XMMWORD [wk(1)]		; xmm5=tmp3

	movaps	XMMWORD [wk(0)], xmm0		; wk(0)=(60 70 61 71)
	movaps	XMMWORD [wk(1)], xmm3		; wk(1)=(62 72 63 73)

	addps	xmm4, xmm2			; xmm4=tmp4
	movaps	xmm0, xmm7
	movaps	xmm3, xmm5
	addps	xmm7, xmm2			; xmm7=data2=(20 21 22 23)
	addps	xmm5, xmm4			; xmm5=data4=(40 41 42 43)
	subps	xmm0, xmm2			; xmm0=data5=(50 51 52 53)
	subps	xmm3, xmm4			; xmm3=data3=(30 31 32 33)

	; Transpose, phase 1: interleave row pairs 2/3 and 4/5.
	movaps	xmm2, xmm7
	unpcklps xmm7, xmm3			; xmm7=(20 30 21 31)
	unpckhps xmm2, xmm3			; xmm2=(22 32 23 33)
	movaps	xmm4, xmm5
	unpcklps xmm5, xmm0			; xmm5=(40 50 41 51)
	unpckhps xmm4, xmm0			; xmm4=(42 52 43 53)

	; Transpose, phase 2: rows 0..3 of each column.
	movaps	xmm3, xmm6
	unpcklps2 xmm6, xmm7			; xmm6=(00 10 20 30)
	unpckhps2 xmm3, xmm7			; xmm3=(01 11 21 31)
	movaps	xmm0, xmm1
	unpcklps2 xmm1, xmm2			; xmm1=(02 12 22 32)
	unpckhps2 xmm0, xmm2			; xmm0=(03 13 23 33)

	movaps	xmm7, XMMWORD [wk(0)]		; xmm7=(60 70 61 71)
	movaps	xmm2, XMMWORD [wk(1)]		; xmm2=(62 72 63 73)

	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

	; Transpose, phase 2: rows 4..7 of each column.
	movaps	xmm6, xmm5
	unpcklps2 xmm5, xmm7			; xmm5=(40 50 60 70)
	unpckhps2 xmm6, xmm7			; xmm6=(41 51 61 71)
	movaps	xmm3, xmm4
	unpcklps2 xmm4, xmm2			; xmm4=(42 52 62 72)
	unpckhps2 xmm3, xmm2			; xmm3=(43 53 63 73)

	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

	309

	; End of one pass-1 iteration: step every pointer past the four
	; columns just processed and loop until ctr (rcx) reaches zero.
.nextcolumn:
	add	rsi, byte 4*SIZEOF_JCOEF		; coef_block: next 4 columns
	add	rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr:   next 4 columns
	; Four workspace rows were written per iteration.  No "byte"
	; qualifier here, unlike the two adds above.
	add	rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr: next 4 rows
	dec	rcx					; ctr
	jnz	near .columnloop

	; -- Prefetch the next coefficient block
	;
	; rsi has advanced 4 coefficients per iteration; assuming DCTSIZE is
	; 8 (two iterations), rsi + (DCTSIZE2-8)*SIZEOF_JCOEF is the first
	; byte after this block.  Four prefetches at a 32-byte stride follow.

	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

	323

	; ---- Pass 2: process rows from work array, store into output array.
	;
	; Register roles for the row loop:
	;   rsi = FAST_FLOAT *wsptr (start of workspace)
	;   rdi = output_buf, walked as an array of JSAMPROW pointers
	;   rax = output_col, used as the index into each output row
	;   rcx = loop counter
	; Writing eax zero-extends into rax, so rax needs no earlier load.

	lea	rsi, [workspace]		; FAST_FLOAT *wsptr
	mov	rdi, r12			; (JSAMPROW *)
	mov	eax, r13d			; output_col
	mov	rcx, DCTSIZE/4			; ctr

	; One iteration transforms four image rows and emits 8 samples for
	; each.  Lane comments use "rc" = output row r, column c.
.rowloop:

	; -- Even part

	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]

	movaps	xmm4, xmm0
	movaps	xmm5, xmm1
	subps	xmm0, xmm2			; xmm0=tmp11
	subps	xmm1, xmm3
	addps	xmm4, xmm2			; xmm4=tmp10
	addps	xmm5, xmm3			; xmm5=tmp13

	mulps	xmm1, [rel PD_1_414]
	subps	xmm1, xmm5			; xmm1=tmp12

	movaps	xmm6, xmm4
	movaps	xmm7, xmm0
	subps	xmm4, xmm5			; xmm4=tmp3
	subps	xmm0, xmm1			; xmm0=tmp2
	addps	xmm6, xmm5			; xmm6=tmp0
	addps	xmm7, xmm1			; xmm7=tmp1

	; Spill tmp2/tmp3 until the second half of the output stage.
	movaps	XMMWORD [wk(1)], xmm4		; wk(1)=tmp3
	movaps	XMMWORD [wk(0)], xmm0		; wk(0)=tmp2

	; -- Odd part

	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]

	movaps	xmm4, xmm2
	movaps	xmm0, xmm5
	addps	xmm2, xmm1			; xmm2=z11
	addps	xmm5, xmm3			; xmm5=z13
	subps	xmm4, xmm1			; xmm4=z12
	subps	xmm0, xmm3			; xmm0=z10

	movaps	xmm1, xmm2
	subps	xmm2, xmm5
	addps	xmm1, xmm5			; xmm1=tmp7

	mulps	xmm2, [rel PD_1_414]		; xmm2=tmp11

	movaps	xmm3, xmm0
	addps	xmm0, xmm4
	mulps	xmm0, [rel PD_1_847]		; xmm0=z5
	mulps	xmm3, [rel PD_M2_613]		; xmm3=(z10 * -2.613125930)
	mulps	xmm4, [rel PD_1_082]		; xmm4=(z12 * 1.082392200)
	addps	xmm3, xmm0			; xmm3=tmp12
	subps	xmm4, xmm0			; xmm4=tmp10

	; -- Final output stage

	subps	xmm3, xmm1			; xmm3=tmp6
	movaps	xmm5, xmm6
	movaps	xmm0, xmm7
	addps	xmm6, xmm1			; xmm6=data0=(00 10 20 30)
	addps	xmm7, xmm3			; xmm7=data1=(01 11 21 31)
	subps	xmm5, xmm1			; xmm5=data7=(07 17 27 37)
	subps	xmm0, xmm3			; xmm0=data6=(06 16 26 36)
	subps	xmm2, xmm3			; xmm2=tmp5

	; Round and descale by adding the magic constant: afterwards the
	; low word of every dword holds the integer result and the high
	; word ("**") is don't-care.
	movaps	xmm1, [rel PD_RNDINT_MAGIC]	; xmm1=[rel PD_RNDINT_MAGIC]
	pcmpeqd	xmm3, xmm3
	psrld	xmm3, WORD_BIT			; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

	addps	xmm6, xmm1			; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
	addps	xmm7, xmm1			; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
	addps	xmm0, xmm1			; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
	addps	xmm5, xmm1			; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

	; Merge two columns per register as interleaved 16-bit words.
	pand	xmm6, xmm3			; xmm6=(00 -- 10 -- 20 -- 30 --)
	pslld	xmm7, WORD_BIT			; xmm7=(-- 01 -- 11 -- 21 -- 31)
	pand	xmm0, xmm3			; xmm0=(06 -- 16 -- 26 -- 36 --)
	pslld	xmm5, WORD_BIT			; xmm5=(-- 07 -- 17 -- 27 -- 37)
	por	xmm6, xmm7			; xmm6=(00 01 10 11 20 21 30 31)
	por	xmm0, xmm5			; xmm0=(06 07 16 17 26 27 36 37)

	movaps	xmm1, XMMWORD [wk(0)]		; xmm1=tmp2
	movaps	xmm3, XMMWORD [wk(1)]		; xmm3=tmp3

	addps	xmm4, xmm2			; xmm4=tmp4
	movaps	xmm7, xmm1
	movaps	xmm5, xmm3
	addps	xmm1, xmm2			; xmm1=data2=(02 12 22 32)
	addps	xmm3, xmm4			; xmm3=data4=(04 14 24 34)
	subps	xmm7, xmm2			; xmm7=data5=(05 15 25 35)
	subps	xmm5, xmm4			; xmm5=data3=(03 13 23 33)

	; Same round/descale/merge sequence for columns 2..5.
	movaps	xmm2, [rel PD_RNDINT_MAGIC]	; xmm2=[rel PD_RNDINT_MAGIC]
	pcmpeqd	xmm4, xmm4
	psrld	xmm4, WORD_BIT			; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

	addps	xmm3, xmm2			; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
	addps	xmm7, xmm2			; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
	addps	xmm1, xmm2			; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
	addps	xmm5, xmm2			; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

	pand	xmm3, xmm4			; xmm3=(04 -- 14 -- 24 -- 34 --)
	pslld	xmm7, WORD_BIT			; xmm7=(-- 05 -- 15 -- 25 -- 35)
	pand	xmm1, xmm4			; xmm1=(02 -- 12 -- 22 -- 32 --)
	pslld	xmm5, WORD_BIT			; xmm5=(-- 03 -- 13 -- 23 -- 33)
	por	xmm3, xmm7			; xmm3=(04 05 14 15 24 25 34 35)
	por	xmm1, xmm5			; xmm1=(02 03 12 13 22 23 32 33)

	movdqa	xmm2, [rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]

	; Saturate words to signed bytes, then add the centering bias.
	packsswb xmm6, xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
	packsswb xmm1, xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
	paddb	xmm6, xmm2
	paddb	xmm1, xmm2

	; Transpose, phase 2: gather each row's bytes four at a time.
	movdqa	xmm4, xmm6
	punpcklwd xmm6, xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
	punpckhwd xmm4, xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

	; Transpose, phase 3: one complete 8-byte row per 64-bit half.
	movdqa	xmm7, xmm6
	punpckldq xmm6, xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
	punpckhdq xmm7, xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

	; Swap the halves so rows 1 and 3 sit in the low quadword for movq.
	pshufd	xmm5, xmm6, 0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
	pshufd	xmm3, xmm7, 0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

	; Store 8 samples to each of the four output rows at output_col.
	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
	mov	rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6	; row 0
	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7	; row 2
	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
	mov	rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5	; row 1
	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3	; row 3

	add	rsi, byte 4*SIZEOF_FAST_FLOAT	; wsptr: next 4 workspace columns
	add	rdi, byte 4*SIZEOF_JSAMPROW	; next 4 output row pointers
	dec	rcx				; ctr
	jnz	near .rowloop

	473

	; Epilogue: unwind in the reverse order of the prologue.
	pop	rbx
	uncollect_args
	mov	rsp, rbp		; rsp = aligned frame base
	pop	rsp			; rsp = value saved at [rbp] (rsp after "push rbp")
	pop	rbp			; restore the caller's rbp
	ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
	align	16

OLD	NEW

« jdhuff.c ('K') | « simd/jidctflt-sse2.asm ('k') | simd/jidctfst-altivec.c » ('j') | no next file with comments »