simd/jdsammmx.asm - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jdsammmx.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 ;

2 ; jdsammmx.asm - upsampling (MMX)

3 ;

4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

5 ;

6 ; Based on

7 ; x86 SIMD extension for IJG JPEG library

8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

10 ;

11 ; This file should be assembled with NASM (Netwide Assembler); it

12 ; cannot be assembled with Microsoft's MASM or any compatible

13 ; assembler (including Borland's Turbo Assembler).

14 ; NASM is available from http://nasm.sourceforge.net/ or

15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

16 ;

17 ; [TAB8]

18

19 %include "jsimdext.inc"

20

21 ; --------------------------------------------------------------------------

22 SECTION SEG_CONST ; word constants used by the two fancy (triangle-filter) upsamplers in this file

23

24 alignz 16

25 global EXTN(jconst_fancy_upsample_mmx) PRIVATE

26

27 EXTN(jconst_fancy_upsample_mmx):

28

29 PW_ONE times 4 dw 1 ; h2v1 fancy: added to the even-output sums before the >>2

30 PW_TWO times 4 dw 2 ; h2v1 fancy: added to the odd-output sums before the >>2

31 PW_THREE times 4 dw 3 ; weight of the nearer sample/row in both fancy routines (pmullw operand)

32 PW_SEVEN times 4 dw 7 ; h2v2 fancy: added to the odd-output sums before the >>4

33 PW_EIGHT times 4 dw 8 ; h2v2 fancy: added to the even-output sums before the >>4

34

35 alignz 16

36

37 ; --------------------------------------------------------------------------

38 SECTION SEG_TEXT

39 BITS 32

40 ;

41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.

42 ;

43 ; The upsampling algorithm is linear interpolation between pixel centers,

44 ; also known as a "triangle filter". This is a good compromise between

45 ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4

46 ; of the way between input pixel centers.

47 ; As coded: out[2i] = (3*in[i] + in[i-1] + 1) >> 2, out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2.

48 ; GLOBAL(void)

49 ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,

50 ; JDIMENSION downsampled_width,

51 ; JSAMPARRAY input_data,

52 ; JSAMPARRAY * output_data_ptr);

53 ; Arguments are read from the stack at ebp+8..ebp+20. eax=colctr, ecx=rowctr, esi/edi=input/output cursors, mm0=zero.

54

55 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

56 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width

57 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

58 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr

59

60 align 16

61 global EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE

62

63 EXTN(jsimd_h2v1_fancy_upsample_mmx):

64 push ebp

65 mov ebp,esp ; arguments are addressed relative to ebp from here on

66 pushpic ebx ; macro from jsimdext.inc (not shown here); presumably saves ebx only in PIC builds

67 ; push ecx ; need not be preserved

68 ; push edx ; need not be preserved

69 push esi

70 push edi

71

72 get_GOT ebx ; get GOT address

73

74 mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr

75 test eax,eax

76 jz near .return ; zero width: nothing to do (no MMX state touched yet)

77

78 mov ecx, INT [max_v_samp(ebp)] ; rowctr

79 test ecx,ecx

80 jz near .return ; zero rows: nothing to do

81

82 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data

83 mov edi, POINTER [output_data_ptr(ebp)]

84 mov edi, JSAMPARRAY [edi] ; output_data

85 alignx 16,7

86 .rowloop:

87 push eax ; colctr (consumed by the column loop, restored at the end of the row)

88 push edi ; save the output row-array cursor; edi becomes a sample pointer below

89 push esi ; save the input row-array cursor; esi becomes a sample pointer below

90

91 mov esi, JSAMPROW [esi] ; inptr

92 mov edi, JSAMPROW [edi] ; outptr

93

94 test eax, SIZEOF_MMWORD-1

95 jz short .skip ; width is a multiple of SIZEOF_MMWORD: no padding sample needed

96 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row

97 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (writes one sample past the row width; assumes the row buffer has room - confirm with caller)

98 .skip:

99 pxor mm0,mm0 ; mm0=(all 0's)

100 pcmpeqb mm7,mm7 ; mm7=(all 1's)

101 psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=mask keeping only the lowest byte

102 pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] ; mm7=first sample of the row, reused as its own left neighbour

103

104 add eax, byte SIZEOF_MMWORD-1

105 and eax, byte -SIZEOF_MMWORD ; round colctr up to a whole number of mmwords

106 cmp eax, byte SIZEOF_MMWORD

107 ja short .columnloop ; more than one block left: the next block supplies the right neighbour

108 alignx 16,7

109

110 .columnloop_last:

111 pcmpeqb mm6,mm6 ; mm6=(all 1's)

112 psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=mask keeping only the highest byte

113 pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] ; mm6=sample 7 of this block, reused as its own right neighbour

114 jmp short .upsample

115 alignx 16,7

116

117 .columnloop:

118 movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] ; mm6=next block of input samples

119 psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=first sample of the next block, moved to the highest byte

120

121 .upsample:

122 movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]

123 movq mm2,mm1

124 movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7)

125 psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)

126 psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)

127

128 por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6)

129 por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8)

130

131 movq mm7,mm1

132 psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -) carried into the next block as its left neighbour

133

134 movq mm4,mm1

135 punpcklbw mm1,mm0 ; mm1=( 0 1 2 3)

136 punpckhbw mm4,mm0 ; mm4=( 4 5 6 7)

137 movq mm5,mm2

138 punpcklbw mm2,mm0 ; mm2=(-1 0 1 2)

139 punpckhbw mm5,mm0 ; mm5=( 3 4 5 6)

140 movq mm6,mm3

141 punpcklbw mm3,mm0 ; mm3=( 1 2 3 4)

142 punpckhbw mm6,mm0 ; mm6=( 5 6 7 8)

143

144 pmullw mm1,[GOTOFF(ebx,PW_THREE)] ; 3 * centre samples (low half)

145 pmullw mm4,[GOTOFF(ebx,PW_THREE)] ; 3 * centre samples (high half)

146 paddw mm2,[GOTOFF(ebx,PW_ONE)] ; left neighbours + 1

147 paddw mm5,[GOTOFF(ebx,PW_ONE)]

148 paddw mm3,[GOTOFF(ebx,PW_TWO)] ; right neighbours + 2

149 paddw mm6,[GOTOFF(ebx,PW_TWO)]

150

151 paddw mm2,mm1

152 paddw mm5,mm4

153 psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6)

154 psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14)

155 paddw mm3,mm1

156 paddw mm6,mm4

157 psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7)

158 psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15)

159

160 psllw mm3,BYTE_BIT ; move the odd outputs into the high byte of each word

161 psllw mm6,BYTE_BIT

162 por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)

163 por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)

164

165 movq MMWORD [edi+0*SIZEOF_MMWORD], mm2

166 movq MMWORD [edi+1*SIZEOF_MMWORD], mm5

167

168 sub eax, byte SIZEOF_MMWORD ; one input mmword consumed

169 add esi, byte 1*SIZEOF_MMWORD ; inptr

170 add edi, byte 2*SIZEOF_MMWORD ; outptr

171 cmp eax, byte SIZEOF_MMWORD

172 ja near .columnloop ; more than one block left

173 test eax,eax

174 jnz near .columnloop_last ; exactly one block left

175

176 pop esi ; restore the row-array cursors and colctr saved at .rowloop

177 pop edi

178 pop eax

179

180 add esi, byte SIZEOF_JSAMPROW ; input_data

181 add edi, byte SIZEOF_JSAMPROW ; output_data

182 dec ecx ; rowctr

183 jg near .rowloop

184

185 emms ; empty MMX state

186

187 .return:

188 pop edi

189 pop esi

190 ; pop edx ; need not be preserved

191 ; pop ecx ; need not be preserved

192 poppic ebx ; counterpart of the pushpic in the prologue

193 pop ebp

194 ret

195

196 ; --------------------------------------------------------------------------

197 ;

198 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.

199 ; Again a triangle filter; see comments for h2v1 case, above.

200 ; Vertical pass: Int0 = 3*row[0] + row[-1], Int1 = 3*row[0] + row[+1]; horizontal: even = (3*Int[i] + Int[i-1] + 8) >> 4, odd = (3*Int[i] + Int[i+1] + 7) >> 4.

201 ; GLOBAL(void)

202 ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,

203 ; JDIMENSION downsampled_width,

204 ; JSAMPARRAY input_data,

205 ; JSAMPARRAY * output_data_ptr);

206 ; Reads input_data[-1] and input_data[+1] for every processed row, and uses the output rows as scratch for the 16-bit intermediates.

207

208 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

209 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width

210 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

211 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr

212

213 %define original_ebp ebp+0

214 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]

215 %define WK_NUM 4

216 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr

217

218 align 16

219 global EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE

220

221 EXTN(jsimd_h2v2_fancy_upsample_mmx):

222 push ebp

223 mov eax,esp ; eax = original ebp

224 sub esp, byte 4

225 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits

226 mov [esp],eax ; keep the unaligned frame address at [aligned ebp] for the epilogue

227 mov ebp,esp ; ebp = aligned ebp

228 lea esp, [wk(0)] ; reserve the wk[WK_NUM] scratch mmwords below ebp

229 pushpic eax ; reserve the stack slot addressed as gotptr (pushpic comes from jsimdext.inc, not shown here)

230 push ebx

231 ; push ecx ; need not be preserved

232 ; push edx ; need not be preserved

233 push esi

234 push edi

235

236 get_GOT ebx ; get GOT address

237 movpic POINTER [gotptr], ebx ; save GOT address

238

239 mov edx,eax ; edx = original ebp

240 mov eax, JDIMENSION [downsamp_width(edx)] ; colctr

241 test eax,eax

242 jz near .return ; zero width: nothing to do

243

244 mov ecx, INT [max_v_samp(edx)] ; rowctr

245 test ecx,ecx

246 jz near .return ; zero rows: nothing to do

247

248 mov esi, JSAMPARRAY [input_data(edx)] ; input_data

249 mov edi, POINTER [output_data_ptr(edx)]

250 mov edi, JSAMPARRAY [edi] ; output_data

251 alignx 16,7

252 .rowloop:

253 push eax ; colctr

254 push ecx ; rowctr (ecx is reused as a row pointer below)

255 push edi ; output row-array cursor

256 push esi ; input row-array cursor

257

258 mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)

259 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0

260 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)

261 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0

262 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1

263

264 test eax, SIZEOF_MMWORD-1

265 jz short .skip ; width is a multiple of SIZEOF_MMWORD: no padding samples needed

266 push edx ; dl is used as a byte temporary below

267 mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]

268 mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl ; duplicate the last sample of the row above

269 mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]

270 mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl ; duplicate the last sample of the current row

271 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]

272 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (each of the three writes lands one sample past the row width)

273 pop edx

274 .skip:

275 ; -- process the first column block

276

277 movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]

278 movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]

279 movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]

280

281 pushpic ebx ; ebx doubles as inptr0 and as the GOTOFF base; save inptr0 (macro not shown here)

282 movpic ebx, POINTER [gotptr] ; load GOT address

283

284 pxor mm3,mm3 ; mm3=(all 0's)

285 movq mm4,mm0

286 punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3)

287 punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7)

288 movq mm5,mm1

289 punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3)

290 punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7)

291 movq mm6,mm2

292 punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3)

293 punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7)

294

295 pmullw mm0,[GOTOFF(ebx,PW_THREE)] ; 3 * current row (low half)

296 pmullw mm4,[GOTOFF(ebx,PW_THREE)] ; 3 * current row (high half)

297

298 pcmpeqb mm7,mm7 ; mm7=(all 1's)

299 psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm7=mask keeping only the lowest word

300

301 paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)

302 paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)

303 paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)

304 paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)

305

306 movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save

307 movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data

308 movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 ; (output rows double as scratch for the

309 movq MMWORD [edi+1*SIZEOF_MMWORD], mm6 ; 16-bit intermediates until .upsample overwrites them)

310

311 pand mm1,mm7 ; mm1=( 0 - - -)

312 pand mm2,mm7 ; mm2=( 0 - - -)

313

314 movq MMWORD [wk(0)], mm1 ; wk(0)=left neighbour for the upper row's first block

315 movq MMWORD [wk(1)], mm2 ; wk(1)=left neighbour for the lower row's first block

316

317 poppic ebx ; ebx = inptr0 again

318

319 add eax, byte SIZEOF_MMWORD-1

320 and eax, byte -SIZEOF_MMWORD ; round colctr up to a whole number of mmwords

321 cmp eax, byte SIZEOF_MMWORD

322 ja short .columnloop ; more than one block left

323 alignx 16,7

324

325 .columnloop_last:

326 ; -- process the last column block

327

328 pushpic ebx ; matched by the poppic after .upsample

329 movpic ebx, POINTER [gotptr] ; load GOT address

330

331 pcmpeqb mm1,mm1 ; mm1=(all 1's)

332 psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=mask keeping only the highest word

333 movq mm2,mm1

334

335 pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)

336 pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)

337

338 movq MMWORD [wk(2)], mm1 ; wk(2)=right neighbour for the upper row (word 7 reused)

339 movq MMWORD [wk(3)], mm2 ; wk(3)=right neighbour for the lower row (word 7 reused)

340

341 jmp short .upsample

342 alignx 16,7

343

344 .columnloop:

345 ; -- process the next column block

346

347 movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]

348 movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]

349 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]

350

351 pushpic ebx ; matched by the poppic after .upsample

352 movpic ebx, POINTER [gotptr] ; load GOT address

353

354 pxor mm3,mm3 ; mm3=(all 0's)

355 movq mm4,mm0

356 punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3)

357 punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7)

358 movq mm5,mm1

359 punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3)

360 punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7)

361 movq mm6,mm2

362 punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3)

363 punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7)

364

365 pmullw mm0,[GOTOFF(ebx,PW_THREE)]

366 pmullw mm4,[GOTOFF(ebx,PW_THREE)]

367

368 paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)

369 paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)

370 paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)

371 paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)

372

373 movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save

374 movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data

375 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 ; (next block's intermediates, one block ahead

376 movq MMWORD [edi+3*SIZEOF_MMWORD], mm6 ; of the block finished in .upsample)

377

378 psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)

379 psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)

380

381 movq MMWORD [wk(2)], mm1 ; wk(2)=right neighbour for the upper row (word 0 of the next block)

382 movq MMWORD [wk(3)], mm2 ; wk(3)=right neighbour for the lower row (word 0 of the next block)

383

384 .upsample:

385 ; -- process the upper row

386

387 movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)

388 movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)

389

390 movq mm0,mm7

391 movq mm4,mm3

392 psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -)

393 psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)

394 movq mm5,mm7

395 movq mm6,mm3

396 psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)

397 psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6)

398

399 por mm0,mm4 ; mm0=( 1 2 3 4)

400 por mm5,mm6 ; mm5=( 3 4 5 6)

401

402 movq mm1,mm7

403 movq mm2,mm3

404 psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)

405 psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -)

406 movq mm4,mm3

407 psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)

408

409 por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)

410 por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)

411

412 movq MMWORD [wk(0)], mm4 ; word 7 becomes the left neighbour of the next block

413

414 pmullw mm7,[GOTOFF(ebx,PW_THREE)] ; 3 * centre intermediates (low half)

415 pmullw mm3,[GOTOFF(ebx,PW_THREE)] ; 3 * centre intermediates (high half)

416 paddw mm1,[GOTOFF(ebx,PW_EIGHT)] ; left neighbours + 8

417 paddw mm5,[GOTOFF(ebx,PW_EIGHT)]

418 paddw mm0,[GOTOFF(ebx,PW_SEVEN)] ; right neighbours + 7

419 paddw mm2,[GOTOFF(ebx,PW_SEVEN)]

420

421 paddw mm1,mm7

422 paddw mm5,mm3

423 psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6)

424 psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14)

425 paddw mm0,mm7

426 paddw mm2,mm3

427 psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7)

428 psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15)

429

430 psllw mm0,BYTE_BIT ; move the odd outputs into the high byte of each word

431 psllw mm2,BYTE_BIT

432 por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)

433 por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)

434

435 movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; final samples replace the intermediates

436 movq MMWORD [edx+1*SIZEOF_MMWORD], mm5

437

438 ; -- process the lower row

439

440 movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)

441 movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)

442

443 movq mm7,mm6

444 movq mm3,mm4

445 psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -)

446 psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)

447 movq mm0,mm6

448 movq mm2,mm4

449 psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)

450 psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6)

451

452 por mm7,mm3 ; mm7=( 1 2 3 4)

453 por mm0,mm2 ; mm0=( 3 4 5 6)

454

455 movq mm1,mm6

456 movq mm5,mm4

457 psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)

458 psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -)

459 movq mm3,mm4

460 psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)

461

462 por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)

463 por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)

464

465 movq MMWORD [wk(1)], mm3 ; word 7 becomes the left neighbour of the next block

466

467 pmullw mm6,[GOTOFF(ebx,PW_THREE)]

468 pmullw mm4,[GOTOFF(ebx,PW_THREE)]

469 paddw mm1,[GOTOFF(ebx,PW_EIGHT)]

470 paddw mm0,[GOTOFF(ebx,PW_EIGHT)]

471 paddw mm7,[GOTOFF(ebx,PW_SEVEN)]

472 paddw mm5,[GOTOFF(ebx,PW_SEVEN)]

473

474 paddw mm1,mm6

475 paddw mm0,mm4

476 psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6)

477 psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14)

478 paddw mm7,mm6

479 paddw mm5,mm4

480 psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7)

481 psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15)

482

483 psllw mm7,BYTE_BIT

484 psllw mm5,BYTE_BIT

485 por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)

486 por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)

487

488 movq MMWORD [edi+0*SIZEOF_MMWORD], mm1

489 movq MMWORD [edi+1*SIZEOF_MMWORD], mm0

490

491 poppic ebx ; ebx = inptr0 again (pairs with the pushpic in .columnloop / .columnloop_last)

492

493 sub eax, byte SIZEOF_MMWORD ; one input mmword consumed

494 add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)

495 add ebx, byte 1*SIZEOF_MMWORD ; inptr0

496 add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)

497 add edx, byte 2*SIZEOF_MMWORD ; outptr0

498 add edi, byte 2*SIZEOF_MMWORD ; outptr1

499 cmp eax, byte SIZEOF_MMWORD

500 ja near .columnloop ; more than one block left

501 test eax,eax

502 jnz near .columnloop_last ; exactly one block left

503

504 pop esi ; restore the values saved at .rowloop

505 pop edi

506 pop ecx

507 pop eax

508

509 add esi, byte 1*SIZEOF_JSAMPROW ; input_data

510 add edi, byte 2*SIZEOF_JSAMPROW ; output_data

511 sub ecx, byte 2 ; rowctr

512 jg near .rowloop

513

514 emms ; empty MMX state

515

516 .return:

517 pop edi

518 pop esi

519 ; pop edx ; need not be preserved

520 ; pop ecx ; need not be preserved

521 pop ebx

522 mov esp,ebp ; esp <- aligned ebp

523 pop esp ; esp <- original ebp

524 pop ebp

525 ret

526

527 ; --------------------------------------------------------------------------

528 ;

529 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.

530 ; It's still a box filter.

531 ; Each input sample is stored twice in the output row (punpcklbw/punpckhbw of a register with itself).

532 ; GLOBAL(void)

533 ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,

534 ; JDIMENSION output_width,

535 ; JSAMPARRAY input_data,

536 ; JSAMPARRAY * output_data_ptr);

537 ; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD, so whole groups are read and written past the nominal width.

538

539 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

540 %define output_width(b) (b)+12 ; JDIMENSION output_width

541 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

542 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr

543

544 align 16

545 global EXTN(jsimd_h2v1_upsample_mmx) PRIVATE

546

547 EXTN(jsimd_h2v1_upsample_mmx):

548 push ebp

549 mov ebp,esp

550 ; push ebx ; unused

551 ; push ecx ; need not be preserved

552 ; push edx ; need not be preserved

553 push esi

554 push edi

555

556 mov edx, JDIMENSION [output_width(ebp)]

557 add edx, byte (2*SIZEOF_MMWORD)-1

558 and edx, byte -(2*SIZEOF_MMWORD) ; edx = output width rounded up to a multiple of 2*SIZEOF_MMWORD

559 jz short .return ; rounded width is zero: nothing to do

560

561 mov ecx, INT [max_v_samp(ebp)] ; rowctr

562 test ecx,ecx

563 jz short .return ; zero rows: nothing to do

564

565 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data

566 mov edi, POINTER [output_data_ptr(ebp)]

567 mov edi, JSAMPARRAY [edi] ; output_data

568 alignx 16,7

569 .rowloop:

570 push edi ; save the row-array cursors; esi/edi become sample pointers below

571 push esi

572

573 mov esi, JSAMPROW [esi] ; inptr

574 mov edi, JSAMPROW [edi] ; outptr

575 mov eax,edx ; colctr

576 alignx 16,7

577 .columnloop:

578

579 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; one mmword of input samples

580

581 movq mm1,mm0

582 punpcklbw mm0,mm0 ; low four samples, each duplicated

583 punpckhbw mm1,mm1 ; high four samples, each duplicated

584

585 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0

586 movq MMWORD [edi+1*SIZEOF_MMWORD], mm1

587

588 sub eax, byte 2*SIZEOF_MMWORD ; two output mmwords produced

589 jz short .nextrow

590

591 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; second half of the unrolled iteration

592

593 movq mm3,mm2

594 punpcklbw mm2,mm2

595 punpckhbw mm3,mm3

596

597 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2

598 movq MMWORD [edi+3*SIZEOF_MMWORD], mm3

599

600 sub eax, byte 2*SIZEOF_MMWORD

601 jz short .nextrow

602

603 add esi, byte 2*SIZEOF_MMWORD ; inptr

604 add edi, byte 4*SIZEOF_MMWORD ; outptr

605 jmp short .columnloop

606 alignx 16,7

607

608 .nextrow:

609 pop esi ; restore the row-array cursors

610 pop edi

611

612 add esi, byte SIZEOF_JSAMPROW ; input_data

613 add edi, byte SIZEOF_JSAMPROW ; output_data

614 dec ecx ; rowctr

615 jg short .rowloop

616

617 emms ; empty MMX state

618

619 .return:

620 pop edi

621 pop esi

622 ; pop edx ; need not be preserved

623 ; pop ecx ; need not be preserved

624 ; pop ebx ; unused

625 pop ebp

626 ret

627

628 ; --------------------------------------------------------------------------

629 ;

630 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.

631 ; It's still a box filter.

632 ; Each input sample is duplicated horizontally and the doubled row is stored to two consecutive output rows.

633 ; GLOBAL(void)

634 ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,

635 ; JDIMENSION output_width,

636 ; JSAMPARRAY input_data,

637 ; JSAMPARRAY * output_data_ptr);

638 ; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD, so whole groups are read and written past the nominal width.

639

640 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor

641 %define output_width(b) (b)+12 ; JDIMENSION output_width

642 %define input_data(b) (b)+16 ; JSAMPARRAY input_data

643 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr

644

645 align 16

646 global EXTN(jsimd_h2v2_upsample_mmx) PRIVATE

647

648 EXTN(jsimd_h2v2_upsample_mmx):

649 push ebp

650 mov ebp,esp

651 push ebx ; ebx is used as outptr0 below

652 ; push ecx ; need not be preserved

653 ; push edx ; need not be preserved

654 push esi

655 push edi

656

657 mov edx, JDIMENSION [output_width(ebp)]

658 add edx, byte (2*SIZEOF_MMWORD)-1

659 and edx, byte -(2*SIZEOF_MMWORD) ; edx = output width rounded up to a multiple of 2*SIZEOF_MMWORD

660 jz near .return ; rounded width is zero: nothing to do

661

662 mov ecx, INT [max_v_samp(ebp)] ; rowctr

663 test ecx,ecx

664 jz short .return ; zero rows: nothing to do

665

666 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data

667 mov edi, POINTER [output_data_ptr(ebp)]

668 mov edi, JSAMPARRAY [edi] ; output_data

669 alignx 16,7

670 .rowloop:

671 push edi ; save the row-array cursors; esi/edi become sample pointers below

672 push esi

673

674 mov esi, JSAMPROW [esi] ; inptr

675 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0

676 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1

677 mov eax,edx ; colctr

678 alignx 16,7

679 .columnloop:

680

681 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; one mmword of input samples

682

683 movq mm1,mm0

684 punpcklbw mm0,mm0 ; low four samples, each duplicated

685 punpckhbw mm1,mm1 ; high four samples, each duplicated

686

687 movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 ; same data goes to both output rows

688 movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1

689 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0

690 movq MMWORD [edi+1*SIZEOF_MMWORD], mm1

691

692 sub eax, byte 2*SIZEOF_MMWORD ; two output mmwords produced per row

693 jz short .nextrow

694

695 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; second half of the unrolled iteration

696

697 movq mm3,mm2

698 punpcklbw mm2,mm2

699 punpckhbw mm3,mm3

700

701 movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2

702 movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3

703 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2

704 movq MMWORD [edi+3*SIZEOF_MMWORD], mm3

705

706 sub eax, byte 2*SIZEOF_MMWORD

707 jz short .nextrow

708

709 add esi, byte 2*SIZEOF_MMWORD ; inptr

710 add ebx, byte 4*SIZEOF_MMWORD ; outptr0

711 add edi, byte 4*SIZEOF_MMWORD ; outptr1

712 jmp short .columnloop

713 alignx 16,7

714

715 .nextrow:

716 pop esi ; restore the row-array cursors

717 pop edi

718

719 add esi, byte 1*SIZEOF_JSAMPROW ; input_data

720 add edi, byte 2*SIZEOF_JSAMPROW ; output_data

721 sub ecx, byte 2 ; rowctr

722 jg short .rowloop

723

724 emms ; empty MMX state

725

726 .return:

727 pop edi

728 pop esi

729 ; pop edx ; need not be preserved

730 ; pop ecx ; need not be preserved

731 pop ebx

732 pop ebp

733 ret

734

735 ; For some reason, the OS X linker does not honor the request to align the

736 ; segment unless we do this.

737 align 16

OLD	NEW

« no previous file with comments | « simd/jdmrgss2-64.asm ('k') | simd/jdsample-altivec.c » ('j') | no next file with comments »