simd/jiss2flt.asm - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jiss2flt.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 ;

2 ; jiss2flt.asm - floating-point IDCT (SSE & SSE2)

3 ;

4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

5 ;

6 ; Based on

7 ; x86 SIMD extension for IJG JPEG library

8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

10 ;

11 ; This file should be assembled with NASM (Netwide Assembler),

12 ; can not be assembled with Microsoft's MASM or any compatible

13 ; assembler (including Borland's Turbo Assembler).

14 ; NASM is available from http://nasm.sourceforge.net/ or

15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

16 ;

17 ; This file contains a floating-point implementation of the inverse DCT

18 ; (Discrete Cosine Transform). The following code is based directly on

19 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.

20 ;

21 ; [TAB8]

22

23 %include "jsimdext.inc"

24 %include "jdct.inc"

25

26 ; --------------------------------------------------------------------------

27

28 %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)

29 shufps %1,%2,0x44 ; imm8 0x44: dwords 0,1 of %1 followed by dwords 0,1 of %2

30 %endmacro

31

32 %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)

33 shufps %1,%2,0xEE ; imm8 0xEE: dwords 2,3 of %1 followed by dwords 2,3 of %2

34 %endmacro

35

36 ; --------------------------------------------------------------------------

37 SECTION SEG_CONST

38

39 alignz 16

40 global EXTN(jconst_idct_float_sse2) PRIVATE

41

42 EXTN(jconst_idct_float_sse2):

43

44 PD_1_414 times 4 dd 1.414213562373095048801689 ; scales (in2-in6) and (z11-z13) in both passes

45 PD_1_847 times 4 dd 1.847759065022573512256366 ; scales (z10+z12) to form z5

46 PD_1_082 times 4 dd 1.082392200292393968799446 ; scales z12 (see "z12 * 1.082392200")

47 PD_M2_613 times 4 dd -2.613125929752753055713286 ; scales z10 (see "z10 * -2.613125930")

48 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3); added in pass 2 before the low 16 bits are extracted

49 PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; added (paddb) to every packed output byte in pass 2

50

51 alignz 16

52

53 ; --------------------------------------------------------------------------

54 SECTION SEG_TEXT

55 BITS 32

56 ;

57 ; Perform dequantization and inverse DCT on one block of coefficients.

58 ;

59 ; GLOBAL(void)

60 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,

61 ; JSAMPARRAY output_buf, JDIMENSION output_col)

62 ;

63

64 %define dct_table(b) (b)+8 ; void * dct_table (b = address of the saved ebp; +4 is the return address)

65 %define coef_block(b) (b)+12 ; JCOEFPTR coef_block

66 %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf

67 %define output_col(b) (b)+20 ; JDIMENSION output_col

68

69 %define original_ebp ebp+0 ; slot where the prologue stores the pre-alignment stack pointer

70 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]

71 %define WK_NUM 2 ; number of xmmword scratch slots in wk[]

72 %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT

73 ; FAST_FLOAT workspace[DCTSIZE2] (located directly below wk[0])

74

75 align 16

76 global EXTN(jsimd_idct_float_sse2) PRIVATE

77

78 EXTN(jsimd_idct_float_sse2):

79 push ebp

80 mov eax,esp ; eax = esp, i.e. the address of the saved ebp

81 sub esp, byte 4 ; make room for the [original_ebp] slot

82 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits

83 mov [esp],eax ; remember the unaligned stack pointer at [original_ebp]

84 mov ebp,esp ; ebp = aligned ebp

85 lea esp, [workspace] ; reserve wk[] and workspace[] below the aligned ebp

86 push ebx

87 ; push ecx ; need not be preserved

88 ; push edx ; need not be preserved

89 push esi

90 push edi

91

92 get_GOT ebx ; get GOT address

93

94 ; ---- Pass 1: process columns from input, store into work array.

95

96 ; mov eax, [original_ebp] ; disabled: eax still holds this value from the prologue (assumes get_GOT preserves eax)

97 mov edx, POINTER [dct_table(eax)] ; quantptr

98 mov esi, JCOEFPTR [coef_block(eax)] ; inptr

99 lea edi, [workspace] ; FAST_FLOAT * wsptr

100 mov ecx, DCTSIZE/4 ; ctr: each iteration handles 4 columns

101 alignx 16,7

102 .columnloop:

103 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE

104 mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap pre-check: one dword each from rows 1 and 2

105 or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]

106 jnz near .columnDCT ; any nonzero bit => do the full IDCT

107

108 movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; load 64 bits from each of rows 1..7

109 movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]

110 movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]

111 movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]

112 movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]

113 movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

114 movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

115 por xmm1,xmm2 ; OR all seven rows together into xmm1

116 por xmm3,xmm4

117 por xmm5,xmm6

118 por xmm1,xmm3

119 por xmm5,xmm7

120 por xmm1,xmm5

121 packsswb xmm1,xmm1 ; squeeze the four words into the low dword; a nonzero word stays a nonzero byte

122 movd eax,xmm1

123 test eax,eax

124 jnz short .columnDCT ; some AC term is nonzero

125

126 ; -- AC terms all zero

127

128 movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

129

130 punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)

131 psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)

132 cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)

133

134 mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; scale by the quantptr table entries

135

136 movaps xmm1,xmm0

137 movaps xmm2,xmm0

138 movaps xmm3,xmm0

139

140 shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)

141 shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)

142 shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)

143 shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)

144

145 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 ; each broadcast value fills both xmmword slots of its row index

146 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0

147 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1

148 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

149 movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2

150 movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2

151 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3

152 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

153 jmp near .nextcolumn ; skip the full IDCT for these 4 columns

154 alignx 16,7

155 %endif

156 .columnDCT:

157

158 ; -- Even part

159

160 movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

161 movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]

162 movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]

163 movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

164

165 punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)

166 punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)

167 psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)

168 psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)

169 cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)

170 cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)

171

172 punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)

173 punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)

174 psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)

175 psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)

176 cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)

177 cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)

178

179 mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; scale rows 0,2,4,6 by the quantptr table

180 mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

181 mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

182 mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

183

184 movaps xmm4,xmm0

185 movaps xmm5,xmm1

186 subps xmm0,xmm2 ; xmm0=tmp11

187 subps xmm1,xmm3 ; xmm1=(in2-in6)

188 addps xmm4,xmm2 ; xmm4=tmp10

189 addps xmm5,xmm3 ; xmm5=tmp13

190

191 mulps xmm1,[GOTOFF(ebx,PD_1_414)]

192 subps xmm1,xmm5 ; xmm1=tmp12

193

194 movaps xmm6,xmm4

195 movaps xmm7,xmm0

196 subps xmm4,xmm5 ; xmm4=tmp3

197 subps xmm0,xmm1 ; xmm0=tmp2

198 addps xmm6,xmm5 ; xmm6=tmp0

199 addps xmm7,xmm1 ; xmm7=tmp1

200

201 movaps XMMWORD [wk(1)], xmm4 ; tmp3

202 movaps XMMWORD [wk(0)], xmm0 ; tmp2

203

204 ; -- Odd part

205

206 movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]

207 movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]

208 movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]

209 movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

210

211 punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)

212 punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)

213 psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)

214 psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)

215 cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)

216 cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)

217

218 punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)

219 punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)

220 psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)

221 psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)

222 cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)

223 cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)

224

225 mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; scale rows 1,3,5,7 by the quantptr table

226 mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

227 mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

228 mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

229

230 movaps xmm4,xmm2

231 movaps xmm0,xmm5

232 addps xmm2,xmm1 ; xmm2=z11

233 addps xmm5,xmm3 ; xmm5=z13

234 subps xmm4,xmm1 ; xmm4=z12

235 subps xmm0,xmm3 ; xmm0=z10

236

237 movaps xmm1,xmm2

238 subps xmm2,xmm5 ; xmm2=(z11-z13)

239 addps xmm1,xmm5 ; xmm1=tmp7

240

241 mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11

242

243 movaps xmm3,xmm0

244 addps xmm0,xmm4 ; xmm0=(z10+z12)

245 mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5

246 mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)

247 mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)

248 addps xmm3,xmm0 ; xmm3=tmp12

249 subps xmm4,xmm0 ; xmm4=tmp10

250

251 ; -- Final output stage

252

253 subps xmm3,xmm1 ; xmm3=tmp6

254 movaps xmm5,xmm6

255 movaps xmm0,xmm7

256 addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)

257 addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)

258 subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)

259 subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)

260 subps xmm2,xmm3 ; xmm2=tmp5

261

262 movaps xmm1,xmm6 ; transpose coefficients(phase 1)

263 unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)

264 unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)

265 movaps xmm3,xmm0 ; transpose coefficients(phase 1)

266 unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)

267 unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)

268

269 movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2

270 movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3

271

272 movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)

273 movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)

274

275 addps xmm4,xmm2 ; xmm4=tmp4

276 movaps xmm0,xmm7

277 movaps xmm3,xmm5

278 addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)

279 addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)

280 subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)

281 subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)

282

283 movaps xmm2,xmm7 ; transpose coefficients(phase 1)

284 unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)

285 unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)

286 movaps xmm4,xmm5 ; transpose coefficients(phase 1)

287 unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)

288 unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)

289

290 movaps xmm3,xmm6 ; transpose coefficients(phase 2)

291 unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)

292 unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)

293 movaps xmm0,xmm1 ; transpose coefficients(phase 2)

294 unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)

295 unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)

296

297 movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)

298 movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)

299

300 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 ; store the transposed results for inputs 0..3

301 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3

302 movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1

303 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

304

305 movaps xmm6,xmm5 ; transpose coefficients(phase 2)

306 unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)

307 unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)

308 movaps xmm3,xmm4 ; transpose coefficients(phase 2)

309 unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)

310 unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)

311

312 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 ; second xmmword slot: results for inputs 4..7

313 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

314 movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4

315 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

316

317 .nextcolumn:

318 add esi, byte 4*SIZEOF_JCOEF ; coef_block: step to the next 4 columns

319 add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr: step to the matching 4 multipliers

320 add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr: advance by 4*DCTSIZE FAST_FLOAT elements

321 dec ecx ; ctr

322 jnz near .columnloop

323

324 ; -- Prefetch the next coefficient block

325

326 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] ; four addresses 32 bytes apart, starting (DCTSIZE2-8) coefficients past esi

327 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]

328 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]

329 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

330

331 ; ---- Pass 2: process rows from work array, store into output array.

332

333 mov eax, [original_ebp] ; reload the pointer to the saved ebp / argument area

334 lea esi, [workspace] ; FAST_FLOAT * wsptr

335 mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)

336 mov eax, JDIMENSION [output_col(eax)] ; eax is reused as the output column offset from here on

337 mov ecx, DCTSIZE/4 ; ctr: each iteration produces 4 output rows

338 alignx 16,7

339 .rowloop:

340

341 ; -- Even part

342

343 movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]

344 movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]

345 movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]

346 movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

347

348 movaps xmm4,xmm0

349 movaps xmm5,xmm1

350 subps xmm0,xmm2 ; xmm0=tmp11

351 subps xmm1,xmm3 ; xmm1=(block 2 - block 6)

352 addps xmm4,xmm2 ; xmm4=tmp10

353 addps xmm5,xmm3 ; xmm5=tmp13

354

355 mulps xmm1,[GOTOFF(ebx,PD_1_414)]

356 subps xmm1,xmm5 ; xmm1=tmp12

357

358 movaps xmm6,xmm4

359 movaps xmm7,xmm0

360 subps xmm4,xmm5 ; xmm4=tmp3

361 subps xmm0,xmm1 ; xmm0=tmp2

362 addps xmm6,xmm5 ; xmm6=tmp0

363 addps xmm7,xmm1 ; xmm7=tmp1

364

365 movaps XMMWORD [wk(1)], xmm4 ; tmp3

366 movaps XMMWORD [wk(0)], xmm0 ; tmp2

367

368 ; -- Odd part

369

370 movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]

371 movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]

372 movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]

373 movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

374

375 movaps xmm4,xmm2

376 movaps xmm0,xmm5

377 addps xmm2,xmm1 ; xmm2=z11

378 addps xmm5,xmm3 ; xmm5=z13

379 subps xmm4,xmm1 ; xmm4=z12

380 subps xmm0,xmm3 ; xmm0=z10

381

382 movaps xmm1,xmm2

383 subps xmm2,xmm5 ; xmm2=(z11-z13)

384 addps xmm1,xmm5 ; xmm1=tmp7

385

386 mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11

387

388 movaps xmm3,xmm0

389 addps xmm0,xmm4 ; xmm0=(z10+z12)

390 mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5

391 mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)

392 mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)

393 addps xmm3,xmm0 ; xmm3=tmp12

394 subps xmm4,xmm0 ; xmm4=tmp10

395

396 ; -- Final output stage

397

398 subps xmm3,xmm1 ; xmm3=tmp6

399 movaps xmm5,xmm6

400 movaps xmm0,xmm7

401 addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)

402 addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)

403 subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)

404 subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)

405 subps xmm2,xmm3 ; xmm2=tmp5

406

407 movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]

408 pcmpeqd xmm3,xmm3

409 psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

410

411 addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)

412 addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)

413 addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)

414 addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

415

416 pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)

417 pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)

418 pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)

419 pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)

420 por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)

421 por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)

422

423 movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2

424 movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3

425

426 addps xmm4,xmm2 ; xmm4=tmp4

427 movaps xmm7,xmm1

428 movaps xmm5,xmm3

429 addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)

430 addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)

431 subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)

432 subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)

433

434 movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]

435 pcmpeqd xmm4,xmm4

436 psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

437

438 addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)

439 addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)

440 addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)

441 addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

442

443 pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)

444 pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)

445 pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)

446 pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)

447 por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)

448 por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)

449

450 movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]

451

452 packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)

453 packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)

454 paddb xmm6,xmm2 ; add CENTERJSAMPLE to every byte

455 paddb xmm1,xmm2

456

457 movdqa xmm4,xmm6 ; transpose coefficients(phase 2)

458 punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)

459 punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

460

461 movdqa xmm7,xmm6 ; transpose coefficients(phase 3)

462 punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)

463 punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

464

465 pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)

466 pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

467

468 pushpic ebx ; save GOT address

469

470 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]

471 mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; ebx is borrowed as a second row pointer

472 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ; low 8 bytes of xmm6 -> row pointer 0 at column eax

473 movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 ; low 8 bytes of xmm7 -> row pointer 2

474 mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]

475 mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]

476 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 ; low 8 bytes of xmm5 -> row pointer 1

477 movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 ; low 8 bytes of xmm3 -> row pointer 3

478

479 poppic ebx ; restore GOT address

480

481 add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr

482 add edi, byte 4*SIZEOF_JSAMPROW ; step past the 4 row pointers just used

483 dec ecx ; ctr

484 jnz near .rowloop

485

486 pop edi

487 pop esi

488 ; pop edx ; need not be preserved

489 ; pop ecx ; need not be preserved

490 pop ebx

491 mov esp,ebp ; esp <- aligned ebp

492 pop esp ; esp <- original ebp

493 pop ebp ; restore the caller's ebp

494 ret

495

496 ; For some reason, the OS X linker does not honor the request to align the

497 ; segment unless we do this.

498 align 16

OLD	NEW

« no previous file with comments | « simd/jimmxred.asm ('k') | simd/jiss2flt-64.asm » ('j') | no next file with comments »