simd/jiss2flt-64.asm - Issue 1939823002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jiss2flt-64.asm

Issue 1939823002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Response to comments Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 ;

2 ; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2)

3 ;

4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

5 ; Copyright 2009 D. R. Commander

6 ;

7 ; Based on

8 ; x86 SIMD extension for IJG JPEG library

9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

11 ;

12 ; This file should be assembled with NASM (Netwide Assembler),

13 ; can not be assembled with Microsoft's MASM or any compatible

14 ; assembler (including Borland's Turbo Assembler).

15 ; NASM is available from http://nasm.sourceforge.net/ or

16 ; http://sourceforge.net/project/showfiles.php?group_id=6208

17 ;

18 ; This file contains a floating-point implementation of the inverse DCT

19 ; (Discrete Cosine Transform). The following code is based directly on

20 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.

21 ;

22 ; [TAB8]

23

24 %include "jsimdext.inc"

25 %include "jdct.inc"

26

27 ; --------------------------------------------------------------------------

28

29 %macro unpcklps2 2 ; in: %1=(0 1 2 3), %2=(4 5 6 7) / out: %1=(0 1 4 5)

30 shufps %1, %2, 01000100b ; keep low two floats of %1, append low two floats of %2 (imm = 0x44)

31 %endmacro

32

33 %macro unpckhps2 2 ; in: %1=(0 1 2 3), %2=(4 5 6 7) / out: %1=(2 3 6 7)

34 shufps %1, %2, 11101110b ; keep high two floats of %1, append high two floats of %2 (imm = 0xEE)

35 %endmacro

36

37 ; --------------------------------------------------------------------------

38 SECTION SEG_CONST

39 ; Constants used by jsimd_idct_float_sse2; each value is replicated to fill one 16-byte vector.

40 alignz 16

41 global EXTN(jconst_idct_float_sse2) PRIVATE

42

43 EXTN(jconst_idct_float_sse2):

44

45 PD_1_414 times 4 dd 1.414213562373095048801689 ; sqrt(2)

46 PD_1_847 times 4 dd 1.847759065022573512256366 ; 2*cos(pi/8)

47 PD_1_082 times 4 dd 1.082392200292393968799446 ; 2*(cos(pi/8) - cos(3*pi/8))

48 PD_M2_613 times 4 dd -2.613125929752753055713286 ; -2*(cos(pi/8) + cos(3*pi/8))

49 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3); added in pass 2 so the low mantissa bits hold roundint(x/8)

50 PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; per-byte bias added to the packed samples in pass 2

51

52 alignz 16

53

54 ; --------------------------------------------------------------------------

55 SECTION SEG_TEXT

56 BITS 64

57 ;

58 ; Perform dequantization and inverse DCT on one block of coefficients.

59 ;

60 ; GLOBAL(void)

61 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,

62 ; JSAMPARRAY output_buf, JDIMENSION output_col)

63 ;

64

65 ; r10 = void * dct_table

66 ; r11 = JCOEFPTR coef_block

67 ; r12 = JSAMPARRAY output_buf

68 ; r13 = JDIMENSION output_col

69

70 %define original_rbp rbp+0 ; slot at the aligned rbp; the prologue stores the pre-alignment rsp here

71 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]

72 %define WK_NUM 2 ; number of xmmword scratch slots addressed through wk(i)

73 %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT

74 ; FAST_FLOAT workspace[DCTSIZE2], located directly below wk(0)

75

76 align 16

77 global EXTN(jsimd_idct_float_sse2) PRIVATE

78 ; Entry: build a 16-byte-aligned frame holding wk[] and the float workspace.

79 EXTN(jsimd_idct_float_sse2):

80 push rbp ; save caller's rbp

81 mov rax,rsp ; rax = rsp after the push (address of the saved rbp)

82 sub rsp, byte 4

83 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits

84 mov [rsp],rax ; remember the pre-alignment rsp; the epilogue restores it with "pop rsp"

85 mov rbp,rsp ; rbp = aligned rbp

86 lea rsp, [workspace] ; rsp = start of workspace[], i.e. below wk[] and the workspace array

87 collect_args ; macro defined outside this file; presumably loads r10-r13 as listed above -- TODO confirm in jsimdext.inc

88 push rbx ; rbx is used as a row pointer in pass 2; restored by "pop rbx" in the epilogue

89

90 ; ---- Pass 1: process columns from input, store into work array.

91 ; Each .columnloop iteration handles 4 columns (the pointers advance by 4 elements in .nextcolumn).

92 mov rdx, r10 ; quantptr

93 mov rsi, r11 ; inptr

94 lea rdi, [workspace] ; FAST_FLOAT * wsptr

95 mov rcx, DCTSIZE/4 ; ctr = number of 4-column groups

96 .columnloop:

97 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE

98 mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]

99 or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]

100 jnz near .columnDCT ; cheap early-out: a nonzero bit in the first dword of rows 1/2 means the full DCT is needed

101 ; Full test: OR together rows 1..7 for the 4 columns handled in this iteration.

102 movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]

103 movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]

104 movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]

105 movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]

106 movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]

107 movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

108 movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

109 por xmm1,xmm2

110 por xmm3,xmm4

111 por xmm5,xmm6

112 por xmm1,xmm3

113 por xmm5,xmm7

114 por xmm1,xmm5 ; xmm1 = OR of rows 1..7

115 packsswb xmm1,xmm1 ; squeeze the 4 words into 4 bytes; signed saturation keeps a nonzero word nonzero

116 movd eax,xmm1 ; eax = those 4 bytes (upper half of rax is cleared)

117 test rax,rax

118 jnz short .columnDCT ; some AC coefficient is nonzero

119

120 ; -- AC terms all zero

121 ; Only row 0 contributes: dequantize it and replicate each column's value across its 8 workspace entries.

122 movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

123

124 punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)

125 psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)

126 cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)

127

128 mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize with row 0 of the multiplier table

129

130 movaps xmm1,xmm0

131 movaps xmm2,xmm0

132 movaps xmm3,xmm0

133

134 shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)

135 shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)

136 shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)

137 shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)

138

139 movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

140 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0

141 movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1

142 movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

143 movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2

144 movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2

145 movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3

146 movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

147 jmp near .nextcolumn ; skip the full column DCT

148 %endif

149 .columnDCT:

150 ; Full 1-D IDCT on 4 columns at once; results are transposed before being stored to the workspace.

151 ; -- Even part

152

153 movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

154 movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]

155 movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]

156 movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

157 ; Duplicate each word, then arithmetic-shift right: sign-extends words to dwords before the float conversion.

158 punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)

159 punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)

160 psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)

161 psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)

162 cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)

163 cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)

164

165 punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)

166 punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)

167 psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)

168 psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)

169 cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)

170 cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)

171 ; Dequantize: multiply by the matching rows of the multiplier table at rdx.

172 mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

173 mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

174 mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

175 mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

176

177 movaps xmm4,xmm0

178 movaps xmm5,xmm1

179 subps xmm0,xmm2 ; xmm0=tmp11

180 subps xmm1,xmm3

181 addps xmm4,xmm2 ; xmm4=tmp10

182 addps xmm5,xmm3 ; xmm5=tmp13

183

184 mulps xmm1,[rel PD_1_414]

185 subps xmm1,xmm5 ; xmm1=tmp12

186

187 movaps xmm6,xmm4

188 movaps xmm7,xmm0

189 subps xmm4,xmm5 ; xmm4=tmp3

190 subps xmm0,xmm1 ; xmm0=tmp2

191 addps xmm6,xmm5 ; xmm6=tmp0

192 addps xmm7,xmm1 ; xmm7=tmp1

193 ; Spill tmp2/tmp3 to the wk[] scratch slots; the odd part below reuses xmm0 and xmm4.

194 movaps XMMWORD [wk(1)], xmm4 ; tmp3

195 movaps XMMWORD [wk(0)], xmm0 ; tmp2

196

197 ; -- Odd part

198

199 movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]

200 movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]

201 movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]

202 movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

203

204 punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)

205 punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)

206 psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)

207 psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)

208 cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)

209 cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)

210

211 punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)

212 punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)

213 psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)

214 psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)

215 cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)

216 cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)

217

218 mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

219 mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

220 mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

221 mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

222

223 movaps xmm4,xmm2

224 movaps xmm0,xmm5

225 addps xmm2,xmm1 ; xmm2=z11

226 addps xmm5,xmm3 ; xmm5=z13

227 subps xmm4,xmm1 ; xmm4=z12

228 subps xmm0,xmm3 ; xmm0=z10

229

230 movaps xmm1,xmm2

231 subps xmm2,xmm5

232 addps xmm1,xmm5 ; xmm1=tmp7

233

234 mulps xmm2,[rel PD_1_414] ; xmm2=tmp11

235

236 movaps xmm3,xmm0

237 addps xmm0,xmm4

238 mulps xmm0,[rel PD_1_847] ; xmm0=z5

239 mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)

240 mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)

241 addps xmm3,xmm0 ; xmm3=tmp12

242 subps xmm4,xmm0 ; xmm4=tmp10

243

244 ; -- Final output stage

245

246 subps xmm3,xmm1 ; xmm3=tmp6

247 movaps xmm5,xmm6

248 movaps xmm0,xmm7

249 addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)

250 addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)

251 subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)

252 subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)

253 subps xmm2,xmm3 ; xmm2=tmp5

254

255 movaps xmm1,xmm6 ; transpose coefficients(phase 1)

256 unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)

257 unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)

258 movaps xmm3,xmm0 ; transpose coefficients(phase 1)

259 unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)

260 unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)

261 ; Swap: reload tmp2/tmp3 from wk[] and park the half-transposed rows 6/7 there instead.

262 movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2

263 movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3

264

265 movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)

266 movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)

267

268 addps xmm4,xmm2 ; xmm4=tmp4

269 movaps xmm0,xmm7

270 movaps xmm3,xmm5

271 addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)

272 addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)

273 subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)

274 subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)

275

276 movaps xmm2,xmm7 ; transpose coefficients(phase 1)

277 unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)

278 unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)

279 movaps xmm4,xmm5 ; transpose coefficients(phase 1)

280 unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)

281 unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)

282

283 movaps xmm3,xmm6 ; transpose coefficients(phase 2)

284 unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)

285 unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)

286 movaps xmm0,xmm1 ; transpose coefficients(phase 2)

287 unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)

288 unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)

289

290 movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)

291 movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)

292 ; Store the first half (outputs 0..3) of each of the 4 transposed rows.

293 movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6

294 movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3

295 movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1

296 movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

297

298 movaps xmm6,xmm5 ; transpose coefficients(phase 2)

299 unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)

300 unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)

301 movaps xmm3,xmm4 ; transpose coefficients(phase 2)

302 unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)

303 unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)

304 ; Store the second half (outputs 4..7) of the same 4 rows.

305 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5

306 movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6

307 movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4

308 movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

309

310 .nextcolumn:

311 add rsi, byte 4*SIZEOF_JCOEF ; coef_block: step to the next 4 input columns

312 add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr: step to the next 4 multiplier columns

313 add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr: pass 1 writes 4 transposed rows of DCTSIZE floats per iteration

314 dec rcx ; ctr

315 jnz near .columnloop ; loop until all DCTSIZE/4 column groups are done

316

317 ; -- Prefetch the next coefficient block

318 ; rsi has advanced by 4 coefficients per column-loop iteration; with DCTSIZE=8 (assumed) that is 8 in total, so rsi + (DCTSIZE2-8)*SIZEOF_JCOEF is the end of this block. Four hints, 32 bytes apart.

319 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]

320 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]

321 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]

322 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

323

324 ; ---- Pass 2: process rows from work array, store into output array.

325 ; Each .rowloop iteration produces 4 output rows of 8 samples.

326 mov rax, [original_rbp] ; NOTE(review): overwritten by "mov eax, r13d" below before any use

327 lea rsi, [workspace] ; FAST_FLOAT * wsptr

328 mov rdi, r12 ; (JSAMPROW *)

329 mov eax, r13d ; rax = output_col (a 32-bit write clears the upper half of rax)

330 mov rcx, DCTSIZE/4 ; ctr = number of 4-row groups

331 .rowloop:

332 ; 1-D IDCT across the workspace for 4 rows at once, then round, pack to bytes, recentre and store.

333 ; -- Even part

334

335 movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]

336 movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]

337 movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]

338 movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]

339

340 movaps xmm4,xmm0

341 movaps xmm5,xmm1

342 subps xmm0,xmm2 ; xmm0=tmp11

343 subps xmm1,xmm3

344 addps xmm4,xmm2 ; xmm4=tmp10

345 addps xmm5,xmm3 ; xmm5=tmp13

346

347 mulps xmm1,[rel PD_1_414]

348 subps xmm1,xmm5 ; xmm1=tmp12

349

350 movaps xmm6,xmm4

351 movaps xmm7,xmm0

352 subps xmm4,xmm5 ; xmm4=tmp3

353 subps xmm0,xmm1 ; xmm0=tmp2

354 addps xmm6,xmm5 ; xmm6=tmp0

355 addps xmm7,xmm1 ; xmm7=tmp1

356 ; Spill tmp2/tmp3 to wk[]; the odd part below reuses xmm0 and xmm4.

357 movaps XMMWORD [wk(1)], xmm4 ; tmp3

358 movaps XMMWORD [wk(0)], xmm0 ; tmp2

359

360 ; -- Odd part

361

362 movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]

363 movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]

364 movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]

365 movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]

366

367 movaps xmm4,xmm2

368 movaps xmm0,xmm5

369 addps xmm2,xmm1 ; xmm2=z11

370 addps xmm5,xmm3 ; xmm5=z13

371 subps xmm4,xmm1 ; xmm4=z12

372 subps xmm0,xmm3 ; xmm0=z10

373

374 movaps xmm1,xmm2

375 subps xmm2,xmm5

376 addps xmm1,xmm5 ; xmm1=tmp7

377

378 mulps xmm2,[rel PD_1_414] ; xmm2=tmp11

379

380 movaps xmm3,xmm0

381 addps xmm0,xmm4

382 mulps xmm0,[rel PD_1_847] ; xmm0=z5

383 mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)

384 mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)

385 addps xmm3,xmm0 ; xmm3=tmp12

386 subps xmm4,xmm0 ; xmm4=tmp10

387

388 ; -- Final output stage

389

390 subps xmm3,xmm1 ; xmm3=tmp6

391 movaps xmm5,xmm6

392 movaps xmm0,xmm7

393 addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)

394 addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)

395 subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)

396 subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)

397 subps xmm2,xmm3 ; xmm2=tmp5

398

399 movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]

400 pcmpeqd xmm3,xmm3

401 psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

402 ; Adding the magic constant leaves the rounded integer in the low word of each dword ("**" = don't-care high word).

403 addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)

404 addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)

405 addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)

406 addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

407 ; Interleave pairs of results as words: mask one to the low word, shift the other into the high word, then OR.

408 pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)

409 pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)

410 pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)

411 pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)

412 por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)

413 por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)

414

415 movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2

416 movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3

417

418 addps xmm4,xmm2 ; xmm4=tmp4

419 movaps xmm7,xmm1

420 movaps xmm5,xmm3

421 addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)

422 addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)

423 subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)

424 subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)

425

426 movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]

427 pcmpeqd xmm4,xmm4

428 psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

429

430 addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)

431 addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)

432 addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)

433 addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

434

435 pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)

436 pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)

437 pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)

438 pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)

439 por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)

440 por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)

441

442 movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]

443 ; Saturate the words to signed bytes, then add CENTERJSAMPLE to every byte.

444 packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)

445 packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)

446 paddb xmm6,xmm2

447 paddb xmm1,xmm2

448

449 movdqa xmm4,xmm6 ; transpose coefficients(phase 2)

450 punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)

451 punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

452

453 movdqa xmm7,xmm6 ; transpose coefficients(phase 3)

454 punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)

455 punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

456 ; Swap the 64-bit halves so rows 1 and 3 sit in the low quadword for the movq stores below.

457 pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)

458 pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

459 ; Fetch each output row pointer from the array at rdi and write 8 samples at column offset rax.

460 mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]

461 mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]

462 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6

463 movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7

464 mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]

465 mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]

466 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5

467 movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3

468

469 add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr

470 add rdi, byte 4*SIZEOF_JSAMPROW ; advance to the next 4 output row pointers

471 dec rcx ; ctr

472 jnz near .rowloop

473

474 pop rbx ; undo the "push rbx" from the prologue

475 uncollect_args ; macro defined outside this file; presumably the counterpart of collect_args -- TODO confirm

476 mov rsp,rbp ; rsp <- aligned rbp

477 pop rsp ; rsp <- value saved at [aligned rbp] by the prologue (rsp right after "push rbp")

478 pop rbp ; restore caller's rbp

479 ret

480

481 ; For some reason, the OS X linker does not honor the request to align the

482 ; segment unless we do this.

483 align 16

OLD	NEW

« jdhuff.c ('K') | « simd/jiss2flt.asm ('k') | simd/jiss2fst.asm » ('j') | no next file with comments »