simd/jcqntmmx.asm - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jcqntmmx.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 ;

2 ; jcqntmmx.asm - sample data conversion and quantization (MMX)

3 ;

4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

5 ;

6 ; Based on

7 ; x86 SIMD extension for IJG JPEG library

8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

10 ;

11 ; This file should be assembled with NASM (Netwide Assembler),

12 ; can not be assembled with Microsoft's MASM or any compatible

13 ; assembler (including Borland's Turbo Assembler).

14 ; NASM is available from http://nasm.sourceforge.net/ or

15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

16 ;

17 ; [TAB8]

18

19 %include "jsimdext.inc"

20 %include "jdct.inc"

21

22 ; --------------------------------------------------------------------------

23 SECTION SEG_TEXT

24 BITS 32

25 ;

26 ; Load data into workspace, applying unsigned->signed conversion

27 ;

28 ; GLOBAL(void)

29 ; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,

30 ; DCTELEM * workspace);

31 ;

32

33 %define sample_data ebp+8 ; JSAMPARRAY sample_data

34 %define start_col ebp+12 ; JDIMENSION start_col

35 %define workspace ebp+16 ; DCTELEM * workspace

36

37 align 16

38 global EXTN(jsimd_convsamp_mmx) PRIVATE

39

40 EXTN(jsimd_convsamp_mmx):

41 push ebp ; args are read from the stack via ebp (see the %defines above)

42 mov ebp,esp

43 push ebx ; ebx is used as a row pointer below, so save it

44 ; push ecx ; need not be preserved

45 ; push edx ; need not be preserved

46 push esi

47 push edi

48

49 pxor mm6,mm6 ; mm6=(all 0's), zero source for the byte->word unpacks

50 pcmpeqw mm7,mm7 ; mm7=(all 1's)

51 psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} (-128 as a signed word)

52

53 mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)

54 mov eax, JDIMENSION [start_col]

55 mov edi, POINTER [workspace] ; (DCTELEM *)

56 mov ecx, DCTSIZE/4 ; loop counter: each iteration converts 4 rows

57 alignx 16,7

58 .convloop:

59 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)

60 mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)

61

62 movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)

63 movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)

64

65 mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)

66 mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)

67

68 movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)

69 movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)

70

71 movq mm4,mm0 ; copy, because each unpack overwrites its destination

72 punpcklbw mm0,mm6 ; mm0=(0123)

73 punpckhbw mm4,mm6 ; mm4=(4567)

74 movq mm5,mm1

75 punpcklbw mm1,mm6 ; mm1=(89AB)

76 punpckhbw mm5,mm6 ; mm5=(CDEF)

77

78 paddw mm0,mm7 ; add 0xFF80 to every word, i.e. subtract 128 (unsigned->signed)

79 paddw mm4,mm7

80 paddw mm1,mm7

81 paddw mm5,mm7

82

83 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 ; rows 0 and 1 of this group

84 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4

85 movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1

86 movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5

87

88 movq mm0,mm2 ; mm0/mm4 are free again after the stores above

89 punpcklbw mm2,mm6 ; mm2=(GHIJ)

90 punpckhbw mm0,mm6 ; mm0=(KLMN)

91 movq mm4,mm3

92 punpcklbw mm3,mm6 ; mm3=(OPQR)

93 punpckhbw mm4,mm6 ; mm4=(STUV)

94

95 paddw mm2,mm7 ; same -128 bias for rows 2 and 3

96 paddw mm0,mm7

97 paddw mm3,mm7

98 paddw mm4,mm7

99

100 movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2 ; rows 2 and 3 of this group

101 movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0

102 movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3

103 movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4

104

105 add esi, byte 4*SIZEOF_JSAMPROW ; advance to the next 4 row pointers

106 add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; advance the workspace by 4 rows

107 dec ecx

108 jnz short .convloop

109

110 emms ; empty MMX state

111

112 pop edi

113 pop esi

114 ; pop edx ; need not be preserved

115 ; pop ecx ; need not be preserved

116 pop ebx

117 pop ebp

118 ret

119

120 ; --------------------------------------------------------------------------

121 ;

122 ; Quantize/descale the coefficients, and store into coef_block

123 ;

124 ; This implementation is based on an algorithm described in

125 ; "How to optimize for the Pentium family of microprocessors"

126 ; (http://www.agner.org/assem/).

127 ;

128 ; GLOBAL(void)

129 ; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,

130 ; DCTELEM * workspace);

131 ;

132

133 %define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)

134 %define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)

135 %define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

136 %define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)

137

138 %define coef_block ebp+8 ; JCOEFPTR coef_block

139 %define divisors ebp+12 ; DCTELEM * divisors

140 %define workspace ebp+16 ; DCTELEM * workspace

141

142 align 16

143 global EXTN(jsimd_quantize_mmx) PRIVATE

144

145 EXTN(jsimd_quantize_mmx):

146 push ebp ; args are read from the stack via ebp (see the %defines above)

147 mov ebp,esp

148 ; push ebx ; unused

149 ; push ecx ; unused

150 ; push edx ; need not be preserved

151 push esi

152 push edi

153

154 mov esi, POINTER [workspace] ; esi = input coefficients

155 mov edx, POINTER [divisors] ; edx = divisor tables (RECIPROCAL/CORRECTION/SCALE blocks)

156 mov edi, JCOEFPTR [coef_block] ; edi = output block

157 mov ah, 2 ; outer loop counter

158 alignx 16,7

159 .quantloop1:

160 mov al, DCTSIZE2/8/2 ; inner loop counter; each pass handles 8 coefficients

161 alignx 16,7

162 .quantloop2:

163 movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]

164 movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]

165

166 movq mm0,mm2

167 movq mm1,mm3

168

169 psraw mm2,(WORD_BIT-1) ; sign mask: -1 if value < 0, 0 otherwise

170 psraw mm3,(WORD_BIT-1) ; (mm2/mm3 stay live until the sign is restored below)

171

172 pxor mm0,mm2 ; val = abs(val): (val ^ mask) - mask negates

173 pxor mm1,mm3 ; only the words whose mask is -1

174 psubw mm0,mm2

175 psubw mm1,mm3

176

177 ;

178 ; The MMX instruction set has two limitations that cause

179 ; problems here:

180 ;

181 ; - All multiplications are signed.

182 ;

183 ; - The second operand for the shifts is not treated as packed.

184 ;

185 ;

186 ; We work around the first problem by implementing this algorithm:

187 ;

188 ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)

189 ; {

190 ; enum { SHORT_BIT = 16 };

191 ; signed short sx = (signed short) x;

192 ; signed short sy = (signed short) y;

193 ; signed long sz;

194 ;

195 ; sz = (long) sx * (long) sy; /* signed multiply */

196 ;

197 ; if (sx < 0) sz += (long) sy << SHORT_BIT;

198 ; if (sy < 0) sz += (long) sx << SHORT_BIT;

199 ;

200 ; return (unsigned long) sz;

201 ; }

202 ;

203 ; (note that a negative sx adds _sy_ and vice versa)

204 ;

205 ; For the second problem, we replace the shift by a multiplication.

206 ; Unfortunately that means we have to deal with the signed issue again.

207 ;

208

209 paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor

210 paddw mm1, MMWORD [CORRECTION(0,1,edx)]

211

212 movq mm4,mm0 ; store current value for later

213 movq mm5,mm1

214 pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal (high word of signed product)

215 pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]

216 paddw mm0,mm4 ; reciprocal is always negative (MSB=1),

217 paddw mm1,mm5 ; so we always need to add the initial value

218 ; (input value is never negative as we

219 ; took its absolute value at the start of the loop)

220

221 ; here it gets a bit tricky as both scale

222 ; and mm0/mm1 can be negative

223 movq mm6, MMWORD [SCALE(0,0,edx)] ; scale

224 movq mm7, MMWORD [SCALE(0,1,edx)]

225 movq mm4,mm0 ; keep the pre-multiply value for both fix-ups below

226 movq mm5,mm1

227 pmulhw mm0,mm6 ; signed multiply stands in for the per-element shift

228 pmulhw mm1,mm7

229

230 psraw mm6,(WORD_BIT-1) ; determine if scale is negative

231 psraw mm7,(WORD_BIT-1)

232

233 pand mm6,mm4 ; and add input if it is

234 pand mm7,mm5

235 paddw mm0,mm6

236 paddw mm1,mm7

237

238 psraw mm4,(WORD_BIT-1) ; then check if negative input

239 psraw mm5,(WORD_BIT-1)

240

241 pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is

242 pand mm5, MMWORD [SCALE(0,1,edx)]

243 paddw mm0,mm4

244 paddw mm1,mm5

245

246 pxor mm0,mm2 ; restore the sign: negate the result where the

247 pxor mm1,mm3 ; original input was negative (mm2/mm3 = sign masks)

248 psubw mm0,mm2

249 psubw mm1,mm3

250

251 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0

252 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1

253

254 add esi, byte 8*SIZEOF_DCTELEM ; advance input by the 8 coefficients just processed

255 add edx, byte 8*SIZEOF_DCTELEM ; advance the divisor tables in step with the input

256 add edi, byte 8*SIZEOF_JCOEF ; advance output by 8 coefficients

257 dec al

258 jnz near .quantloop2

259 dec ah

260 jnz near .quantloop1 ; to avoid branch misprediction

261

262 emms ; empty MMX state

263

264 pop edi

265 pop esi

266 ; pop edx ; need not be preserved

267 ; pop ecx ; unused

268 ; pop ebx ; unused

269 pop ebp

270 ret

271

272 ; For some reason, the OS X linker does not honor the request to align the

273 ; segment unless we do this.

274 align 16

OLD	NEW

« no previous file with comments | « simd/jcqnt3dn.asm ('k') | simd/jcqnts2f.asm » ('j') | no next file with comments »