simd/jquant-mmx.asm - Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/

Side by Side Diff: simd/jquant-mmx.asm

Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/ (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 ;

	2 ; jquant.asm - sample data conversion and quantization (MMX)

	3 ;

	4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

	5 ;

	6 ; Based on

	7 ; x86 SIMD extension for IJG JPEG library

	8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

	9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

	10 ;

	11 ; This file should be assembled with NASM (Netwide Assembler),

	12 ; can not be assembled with Microsoft's MASM or any compatible

	13 ; assembler (including Borland's Turbo Assembler).

	14 ; NASM is available from http://nasm.sourceforge.net/ or

	15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

	16 ;

	17 ; [TAB8]

	18

	19 %include "jsimdext.inc"

	20 %include "jdct.inc"

	21

	22 ; --------------------------------------------------------------------------

	23 SECTION SEG_TEXT

	24 BITS 32

	25 ;

	26 ; Load data into workspace, applying unsigned->signed conversion

	27 ; Each sample byte is zero-extended to a 16-bit word and 0xFF80 (-128) is added to it.

	28 ; GLOBAL(void)

	29 ; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,

	30 ; DCTELEM *workspace);

	31 ; Arguments are read from the stack via ebp; ebx/esi/edi are saved and restored; emms is issued before returning.

	32

	33 %define sample_data ebp+8 ; JSAMPARRAY sample_data

	34 %define start_col ebp+12 ; JDIMENSION start_col

	35 %define workspace ebp+16 ; DCTELEM *workspace

	36

	37 align 16

	38 global EXTN(jsimd_convsamp_mmx)

	39

	40 EXTN(jsimd_convsamp_mmx):

	41 push ebp

	42 mov ebp,esp

	43 push ebx ; saved because ebx is used as a row pointer below

	44 ; push ecx ; need not be preserved

	45 ; push edx ; need not be preserved

	46 push esi

	47 push edi

	48

	49 pxor mm6,mm6 ; mm6=(all 0's)

	50 pcmpeqw mm7,mm7 ; mm7=(all 1's)

	51 psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}

	52

	53 mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)

	54 mov eax, JDIMENSION [start_col] ; eax = column index applied to every row

	55 mov edi, POINTER [workspace] ; (DCTELEM *)

	56 mov ecx, DCTSIZE/4 ; loop counter: 4 rows are converted per iteration

	57 alignx 16,7

	58 .convloop:

	59 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)

	60 mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)

	61

	62 movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)

	63 movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)

	64

	65 mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)

	66 mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)

	67

	68 movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)

	69 movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)

	70

	71 movq mm4,mm0 ; copy so both halves of the row can be unpacked

	72 punpcklbw mm0,mm6 ; mm0=(0123)

	73 punpckhbw mm4,mm6 ; mm4=(4567)

	74 movq mm5,mm1

	75 punpcklbw mm1,mm6 ; mm1=(89AB)

	76 punpckhbw mm5,mm6 ; mm5=(CDEF)

	77

	78 paddw mm0,mm7 ; add 0xFF80 (-128) to every word

	79 paddw mm4,mm7

	80 paddw mm1,mm7

	81 paddw mm5,mm7

	82

	83 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0

	84 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4

	85 movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1

	86 movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5

	87

	88 movq mm0,mm2 ; mm0/mm4 are free again after the stores above

	89 punpcklbw mm2,mm6 ; mm2=(GHIJ)

	90 punpckhbw mm0,mm6 ; mm0=(KLMN)

	91 movq mm4,mm3

	92 punpcklbw mm3,mm6 ; mm3=(OPQR)

	93 punpckhbw mm4,mm6 ; mm4=(STUV)

	94

	95 paddw mm2,mm7 ; add 0xFF80 (-128) to every word

	96 paddw mm0,mm7

	97 paddw mm3,mm7

	98 paddw mm4,mm7

	99

	100 movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2

	101 movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0

	102 movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3

	103 movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4

	104

	105 add esi, byte 4*SIZEOF_JSAMPROW ; step to the next 4 row pointers

	106 add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; step the workspace pointer by 4 rows of DCTSIZE elements

	107 dec ecx

	108 jnz short .convloop

	109

	110 emms ; empty MMX state

	111

	112 pop edi

	113 pop esi

	114 ; pop edx ; need not be preserved

	115 ; pop ecx ; need not be preserved

	116 pop ebx

	117 pop ebp

	118 ret

	119

	120 ; --------------------------------------------------------------------------

	121 ;

	122 ; Quantize/descale the coefficients, and store into coef_block

	123 ;

	124 ; This implementation is based on an algorithm described in

	125 ; "How to optimize for the Pentium family of microprocessors"

	126 ; (http://www.agner.org/assem/).

	127 ; Per element: abs(value) + correction, high-word multiply by reciprocal, then by scale (unsigned products emulated with pmulhw), then the sign is reapplied.

	128 ; GLOBAL(void)

	129 ; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM *divisors,

	130 ; DCTELEM *workspace);

	131 ; Arguments are read from the stack via ebp; esi/edi are saved and restored; emms is issued before returning.

	132

	133 %define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)

	134 %define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)

	135 %define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

	136 %define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)

	137

	138 %define coef_block ebp+8 ; JCOEFPTR coef_block

	139 %define divisors ebp+12 ; DCTELEM *divisors

	140 %define workspace ebp+16 ; DCTELEM *workspace

	141

	142 align 16

	143 global EXTN(jsimd_quantize_mmx)

	144

	145 EXTN(jsimd_quantize_mmx):

	146 push ebp

	147 mov ebp,esp

	148 ; push ebx ; unused

	149 ; push ecx ; unused

	150 ; push edx ; need not be preserved

	151 push esi

	152 push edi

	153

	154 mov esi, POINTER [workspace] ; esi = input coefficients

	155 mov edx, POINTER [divisors] ; edx = base for the RECIPROCAL/CORRECTION/SCALE lookups

	156 mov edi, JCOEFPTR [coef_block] ; edi = output

	157 mov ah, 2 ; outer loop counter

	158 alignx 16,7

	159 .quantloop1:

	160 mov al, DCTSIZE2/8/2 ; inner loop counter, reloaded on each outer pass

	161 alignx 16,7

	162 .quantloop2:

	163 movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] ; load two MMWORDs of input

	164 movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]

	165

	166 movq mm0,mm2 ; mm0/mm1 = working values; mm2/mm3 become sign masks

	167 movq mm1,mm3

	168

	169 psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise

	170 psraw mm3,(WORD_BIT-1)

	171

	172 pxor mm0,mm2 ; val = abs(val): (val ^ mask) - mask negates only where mask = -1

	173 pxor mm1,mm3

	174 psubw mm0,mm2

	175 psubw mm1,mm3

	176

	177 ;

	178 ; MMX is an annoyingly crappy instruction set. It has two

	179 ; misfeatures that are causing problems here:

	180 ;

	181 ; - All multiplications are signed.

	182 ;

	183 ; - The second operand for the shifts is not treated as packed.

	184 ;

	185 ;

	186 ; We work around the first problem by implementing this algorithm:

	187 ;

	188 ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)

	189 ; {

	190 ; enum { SHORT_BIT = 16 };

	191 ; signed short sx = (signed short) x;

	192 ; signed short sy = (signed short) y;

	193 ; signed long sz;

	194 ;

	195 ; sz = (long) sx * (long) sy; /* signed multiply */

	196 ;

	197 ; if (sx < 0) sz += (long) sy << SHORT_BIT;

	198 ; if (sy < 0) sz += (long) sx << SHORT_BIT;

	199 ;

	200 ; return (unsigned long) sz;

	201 ; }

	202 ;

	203 ; (note that a negative sx adds _sy_ and vice versa)

	204 ;

	205 ; For the second problem, we replace the shift by a multiplication.

	206 ; Unfortunately that means we have to deal with the signed issue again.

	207 ;

	208

	209 paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor

	210 paddw mm1, MMWORD [CORRECTION(0,1,edx)]

	211

	212 movq mm4,mm0 ; store current value for later

	213 movq mm5,mm1

	214 pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal

	215 pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]

	216 paddw mm0,mm4 ; reciprocal is always negative (MSB=1),

	217 paddw mm1,mm5 ; so we always need to add the initial value

	218 ; (input value is never negative as we

	219 ; inverted it at the start of this routine)

	220

	221 ; here it gets a bit tricky as both scale

	222 ; and mm0/mm1 can be negative

	223 movq mm6, MMWORD [SCALE(0,0,edx)] ; scale

	224 movq mm7, MMWORD [SCALE(0,1,edx)]

	225 movq mm4,mm0 ; keep the pre-multiply value for the two corrections below

	226 movq mm5,mm1

	227 pmulhw mm0,mm6 ; signed high-word product; corrected to unsigned below

	228 pmulhw mm1,mm7

	229

	230 psraw mm6,(WORD_BIT-1) ; determine if scale is negative

	231 psraw mm7,(WORD_BIT-1)

	232

	233 pand mm6,mm4 ; and add input if it is

	234 pand mm7,mm5

	235 paddw mm0,mm6

	236 paddw mm1,mm7

	237

	238 psraw mm4,(WORD_BIT-1) ; then check if negative input

	239 psraw mm5,(WORD_BIT-1)

	240

	241 pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is

	242 pand mm5, MMWORD [SCALE(0,1,edx)]

	243 paddw mm0,mm4

	244 paddw mm1,mm5

	245

	246 pxor mm0,mm2 ; reapply the original sign using the masks kept in mm2/mm3

	247 pxor mm1,mm3

	248 psubw mm0,mm2

	249 psubw mm1,mm3

	250

	251 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 ; store the two result MMWORDs to coef_block

	252 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1

	253

	254 add esi, byte 8*SIZEOF_DCTELEM ; advance input, divisor and output pointers by 8 elements

	255 add edx, byte 8*SIZEOF_DCTELEM

	256 add edi, byte 8*SIZEOF_JCOEF

	257 dec al

	258 jnz near .quantloop2

	259 dec ah

	260 jnz near .quantloop1 ; to avoid branch misprediction

	261

	262 emms ; empty MMX state

	263

	264 pop edi

	265 pop esi

	266 ; pop edx ; need not be preserved

	267 ; pop ecx ; unused

	268 ; pop ebx ; unused

	269 pop ebp

	270 ret

	271

	272 ; For some reason, the OS X linker does not honor the request to align the

	273 ; segment unless we do this.

	274 align 16

OLD	NEW

« simd/jccolext-sse2-64.asm ('K') | « simd/jquant-3dn.asm ('k') | simd/jquant-sse.asm » ('j') | no next file with comments »