source/libvpx/vpx_scale/win32/scaleopt.c - Issue 11555023: libvpx: Add VP9 decoder.

Side by Side Diff: source/libvpx/vpx_scale/win32/scaleopt.c

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11	11

12 /****************************************************************************	12 /****************************************************************************

13 *	13 *

14 * Module Title : scaleopt.cpp	14 * Module Title : scaleopt.cpp

15 *	15 *

16 * Description : Optimized scaling functions	16 * Description : Optimized scaling functions

17 *	17 *

18 ****************************************************************************/	18 ****************************************************************************/

19 #include "pragmas.h"	19 #include "pragmas.h"

20	20

21

22

23 /****************************************************************************	21 /****************************************************************************

24 * Module Statics	22 * Module Statics

25 ****************************************************************************/	23 ****************************************************************************/

26 __declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 };

27 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };

28 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };

29 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };

30 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };	24 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };

31 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};

32 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 };

33 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };

34 __declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};

35 __declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 };

36 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 };

37

38

39	25

40 #include "vpx_scale/vpxscale.h"	26 #include "vpx_scale/vpxscale.h"

41 #include "vpx_mem/vpx_mem.h"	27 #include "vpx_mem/vpx_mem.h"

42	28

43 /****************************************************************************

44 *

45 * ROUTINE : horizontal_line_3_5_scale_mmx

46 *

47 * INPUTS : const unsigned char *source :

48 * unsigned int source_width :

49 * unsigned char *dest :

50 * unsigned int dest_width :

51 *

52 * OUTPUTS : None.

53 *

54 * RETURNS : void

55 *

56 * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.

57 *

58 * SPECIAL NOTES : None.

59 *

60 ****************************************************************************/

61 static

62 void horizontal_line_3_5_scale_mmx

63 (

64 const unsigned char *source,

65 unsigned int source_width,

66 unsigned char *dest,

67 unsigned int dest_width

68 ) {

69 (void) dest_width;

70

71 __asm {

72

73 push ebx

74

75 mov esi, source

76 mov edi, dest

77

78 mov ecx, source_width

79 lea edx, [esi+ecx-3];

80

81 movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx

82 movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx

83

84 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx

85 pxor mm7, mm7 // clear mm7

86

87 horiz_line_3_5_loop:

88

89 mov eax, DWORD PTR [esi] // eax = 00 01 02 03

90 mov ebx, eax

91

92 and ebx, 0xffff00 // ebx = xx 01 02 xx

93 mov ecx, eax // ecx = 00 01 02 03

94

95 and eax, 0xffff0000 // eax = xx xx 02 03

96 xor ecx, eax // ecx = 00 01 xx xx

97

98 shr ebx, 8 // ebx = 01 02 xx xx

99 or eax, ebx // eax = 01 02 02 03

100

101 shl ebx, 16 // ebx = xx xx 01 02

102 movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx

103

104 or ebx, ecx // ebx = 00 01 01 02

105 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx

106

107 movd mm0, ebx // mm0 = 00 01 01 02

108 pmullw mm1, mm6 //

109

110 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx

111 pmullw mm0, mm5 //

112

113 mov [edi], ebx // write output 00 xx xx xx

114 add esi, 3

115

116 add edi, 5

117 paddw mm0, mm1

118

119 paddw mm0, mm4

120 psrlw mm0, 8

121

122 cmp esi, edx

123 packuswb mm0, mm7

124

125 movd DWORD Ptr [edi-4], mm0

126 jl horiz_line_3_5_loop

127

128 // Exit:

129 mov eax, DWORD PTR [esi] // eax = 00 01 02 03

130 mov ebx, eax

131

132 and ebx, 0xffff00 // ebx = xx 01 02 xx

133 mov ecx, eax // ecx = 00 01 02 03

134

135 and eax, 0xffff0000 // eax = xx xx 02 03

136 xor ecx, eax // ecx = 00 01 xx xx

137

138 shr ebx, 8 // ebx = 01 02 xx xx

139 or eax, ebx // eax = 01 02 02 03

140

141 shl eax, 8 // eax = xx 01 02 02

142 and eax, 0xffff0000 // eax = xx xx 02 02

143

144 or eax, ebx // eax = 01 02 02 02

145

146 shl ebx, 16 // ebx = xx xx 01 02

147 movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx

148

149 or ebx, ecx // ebx = 00 01 01 02

150 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx

151

152 movd mm0, ebx // mm0 = 00 01 01 02

153 pmullw mm1, mm6 //

154

155 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx

156 pmullw mm0, mm5 //

157

158 mov [edi], ebx // write output 00 xx xx xx

159 paddw mm0, mm1

160

161 paddw mm0, mm4

162 psrlw mm0, 8

163

164 packuswb mm0, mm7

165 movd DWORD Ptr [edi+1], mm0

166

167 pop ebx

168

169 }

170

171 }

172

173

174 /****************************************************************************

175 *

176 * ROUTINE : horizontal_line_4_5_scale_mmx

177 *

178 * INPUTS : const unsigned char *source :

179 * unsigned int source_width :

180 * unsigned char *dest :

181 * unsigned int dest_width :

182 *

183 * OUTPUTS : None.

184 *

185 * RETURNS : void

186 *

187 * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.

188 *

189 * SPECIAL NOTES : None.

190 *

191 ****************************************************************************/

192 static

193 void horizontal_line_4_5_scale_mmx

194 (

195 const unsigned char *source,

196 unsigned int source_width,

197 unsigned char *dest,

198 unsigned int dest_width

199 ) {

200 (void)dest_width;

201

202 __asm {

203

204 mov esi, source

205 mov edi, dest

206

207 mov ecx, source_width

208 lea edx, [esi+ecx-8];

209

210 movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx

211 movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx

212

213 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx

214 pxor mm7, mm7 // clear mm7

215

216 horiz_line_4_5_loop:

217

218 movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07

219 movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08

220

221 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07

222 movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08

223

224 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx

225 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx

226

227 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx

228 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205

229

230 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51

231 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx

232

233 movd DWORD PTR [edi+5], mm2 // write output 05 xx xx xx

234 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205

235

236 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx

237 pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51

238

239 paddw mm0, mm1 // added round values

240 paddw mm0, mm4

241

242 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx

243 packuswb mm0, mm7

244

245 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04

246 add edi, 10

247

248 add esi, 8

249 paddw mm2, mm3 //

250

251 paddw mm2, mm4 // added round values

252 cmp esi, edx

253

254 psrlw mm2, 8

255 packuswb mm2, mm7

256

257 movd DWORD PTR [edi-4], mm2 // write output 06 07 08 09

258 jl horiz_line_4_5_loop

259

260 // Exit:

261 movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07

262 movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07

263

264 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07

265 psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00

266

267 movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00

268 pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00

269

270 psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07

271 por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07

272

273 movq mm3, mm1

274

275 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx

276 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx

277

278 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx

279 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205

280

281 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51

282 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx

283

284 movd DWORD PTR [edi+5], mm2 // write output 05 xx xx xx

285 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205

286

287 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx

288 pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51

289

290 paddw mm0, mm1 // added round values

291 paddw mm0, mm4

292

293 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx

294 packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx

295

296 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04

297 paddw mm2, mm3 //

298

299 paddw mm2, mm4 // added round values

300 psrlw mm2, 8

301

302 packuswb mm2, mm7

303 movd DWORD PTR [edi+6], mm2 // write output 06 07 08 09

304

305

306 }

307 }

308

309 /****************************************************************************

310 *

311 * ROUTINE : vertical_band_4_5_scale_mmx

312 *

313 * INPUTS : unsigned char *dest :

314 * unsigned int dest_pitch :

315 * unsigned int dest_width :

316 *

317 * OUTPUTS : None.

318 *

319 * RETURNS : void

320 *

321 * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.

322 *

323 * SPECIAL NOTES : The routine uses the first line of the band below

324 * the current band. The function also has a "C" only

325 * version.

326 *

327 ****************************************************************************/

328 static

329 void vertical_band_4_5_scale_mmx

330 (

331 unsigned char *dest,

332 unsigned int dest_pitch,

333 unsigned int dest_width

334 ) {

335 __asm {

336

337 mov esi, dest // Get the source and destination pointer

338 mov ecx, dest_pitch // Get the pitch size

339

340 lea edi, [esi+ecx*2] // two lines below

341 add edi, ecx // three lines below

342

343 pxor mm7, mm7 // clear out mm7

344 mov edx, dest_width // Loop counter

345

346 vs_4_5_loop:

347

348 movq mm0, QWORD ptr [esi] // src[0];

349 movq mm1, QWORD ptr [esi+ecx] // src[1];

350

351 movq mm2, mm0 // Make a copy

352 punpcklbw mm0, mm7 // unpack low to word

353

354 movq mm5, one_fifth

355 punpckhbw mm2, mm7 // unpack high to word

356

357 pmullw mm0, mm5 // a * 1/5

358

359 movq mm3, mm1 // make a copy

360 punpcklbw mm1, mm7 // unpack low to word

361

362 pmullw mm2, mm5 // a * 1/5

363 movq mm6, four_fifths // constant

364

365 movq mm4, mm1 // copy of low b

366 pmullw mm4, mm6 // b * 4/5

367

368 punpckhbw mm3, mm7 // unpack high to word

369 movq mm5, mm3 // copy of high b

370

371 pmullw mm5, mm6 // b * 4/5

372 paddw mm0, mm4 // a * 1/5 + b * 4/5

373

374 paddw mm2, mm5 // a * 1/5 + b * 4/5

375 paddw mm0, round_values // + 128

376

377 paddw mm2, round_values // + 128

378 psrlw mm0, 8

379

380 psrlw mm2, 8

381 packuswb mm0, mm2 // des [1]

382

383 movq QWORD ptr [esi+ecx], mm0 // write des[1]

384 movq mm0, [esi+ecx*2] // mm0 = src[2]

385

386 // mm1, mm3 --- Src[1]

387 // mm0 --- Src[2]

388 // mm7 for unpacking

389

390 movq mm5, two_fifths

391 movq mm2, mm0 // make a copy

392

393 pmullw mm1, mm5 // b * 2/5

394 movq mm6, three_fifths

395

396

397 punpcklbw mm0, mm7 // unpack low to word

398 pmullw mm3, mm5 // b * 2/5

399

400 movq mm4, mm0 // make copy of c

401 punpckhbw mm2, mm7 // unpack high to word

402

403 pmullw mm4, mm6 // c * 3/5

404 movq mm5, mm2

405

406 pmullw mm5, mm6 // c * 3/5

407 paddw mm1, mm4 // b * 2/5 + c * 3/5

408

409 paddw mm3, mm5 // b * 2/5 + c * 3/5

410 paddw mm1, round_values // + 128

411

412 paddw mm3, round_values // + 128

413 psrlw mm1, 8

414

415 psrlw mm3, 8

416 packuswb mm1, mm3 // des[2]

417

418 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]

419 movq mm1, [edi] // mm1=Src[3];

420

421 // mm0, mm2 --- Src[2]

422 // mm1 --- Src[3]

423 // mm6 --- 3/5

424 // mm7 for unpacking

425

426 pmullw mm0, mm6 // c * 3/5

427 movq mm5, two_fifths // mm5 = 2/5

428

429 movq mm3, mm1 // make a copy

430 pmullw mm2, mm6 // c * 3/5

431

432 punpcklbw mm1, mm7 // unpack low

433 movq mm4, mm1 // make a copy

434

435 punpckhbw mm3, mm7 // unpack high

436 pmullw mm4, mm5 // d * 2/5

437

438 movq mm6, mm3 // make a copy

439 pmullw mm6, mm5 // d * 2/5

440

441 paddw mm0, mm4 // c * 3/5 + d * 2/5

442 paddw mm2, mm6 // c * 3/5 + d * 2/5

443

444 paddw mm0, round_values // + 128

445 paddw mm2, round_values // + 128

446

447 psrlw mm0, 8

448 psrlw mm2, 8

449

450 packuswb mm0, mm2 // des[3]

451 movq QWORD ptr [edi], mm0 // write des[3]

452

453 // mm1, mm3 --- Src[3]

454 // mm7 -- cleared for unpacking

455

456 movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group

457

458 movq mm5, four_fifths // mm5 = 4/5

459 pmullw mm1, mm5 // d * 4/5

460

461 movq mm6, one_fifth // mm6 = 1/5

462 movq mm2, mm0 // make a copy

463

464 pmullw mm3, mm5 // d * 4/5

465 punpcklbw mm0, mm7 // unpack low

466

467 pmullw mm0, mm6 // an * 1/5

468 punpckhbw mm2, mm7 // unpack high

469

470 paddw mm1, mm0 // d * 4/5 + an * 1/5

471 pmullw mm2, mm6 // an * 1/5

472

473 paddw mm3, mm2 // d * 4/5 + an * 1/5

474 paddw mm1, round_values // + 128

475

476 paddw mm3, round_values // + 128

477 psrlw mm1, 8

478

479 psrlw mm3, 8

480 packuswb mm1, mm3 // des[4]

481

482 movq QWORD ptr [edi+ecx], mm1 // write des[4]

483

484 add edi, 8

485 add esi, 8

486

487 sub edx, 8

488 jg vs_4_5_loop

489 }

490 }

491

492 /****************************************************************************

493 *

494 * ROUTINE : last_vertical_band_4_5_scale_mmx

495 *

496 * INPUTS : unsigned char *dest :

497 * unsigned int dest_pitch :

498 * unsigned int dest_width :

499 *

500 * OUTPUTS : None.

501 *

502 * RETURNS : None

503 *

504 * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.

505 *

506 * SPECIAL NOTES : The routine uses the first line of the band below

507 * the current band. The function also has a "C" only

508 * version.

509 *

510 ****************************************************************************/

511 static

512 void last_vertical_band_4_5_scale_mmx

513 (

514 unsigned char *dest,

515 unsigned int dest_pitch,

516 unsigned int dest_width

517 ) {

518 __asm {

519 mov esi, dest // Get the source and destination pointer

520 mov ecx, dest_pitch // Get the pitch size

521

522 lea edi, [esi+ecx*2] // two lines below

523 add edi, ecx // three lines below

524

525 pxor mm7, mm7 // clear out mm7

526 mov edx, dest_width // Loop counter

527

528 last_vs_4_5_loop:

529

530 movq mm0, QWORD ptr [esi] // src[0];

531 movq mm1, QWORD ptr [esi+ecx] // src[1];

532

533 movq mm2, mm0 // Make a copy

534 punpcklbw mm0, mm7 // unpack low to word

535

536 movq mm5, one_fifth

537 punpckhbw mm2, mm7 // unpack high to word

538

539 pmullw mm0, mm5 // a * 1/5

540

541 movq mm3, mm1 // make a copy

542 punpcklbw mm1, mm7 // unpack low to word

543

544 pmullw mm2, mm5 // a * 1/5

545 movq mm6, four_fifths // constant

546

547 movq mm4, mm1 // copy of low b

548 pmullw mm4, mm6 // b * 4/5

549

550 punpckhbw mm3, mm7 // unpack high to word

551 movq mm5, mm3 // copy of high b

552

553 pmullw mm5, mm6 // b * 4/5

554 paddw mm0, mm4 // a * 1/5 + b * 4/5

555

556 paddw mm2, mm5 // a * 1/5 + b * 4/5

557 paddw mm0, round_values // + 128

558

559 paddw mm2, round_values // + 128

560 psrlw mm0, 8

561

562 psrlw mm2, 8

563 packuswb mm0, mm2 // des [1]

564

565 movq QWORD ptr [esi+ecx], mm0 // write des[1]

566 movq mm0, [esi+ecx*2] // mm0 = src[2]

567

568 // mm1, mm3 --- Src[1]

569 // mm0 --- Src[2]

570 // mm7 for unpacking

571

572 movq mm5, two_fifths

573 movq mm2, mm0 // make a copy

574

575 pmullw mm1, mm5 // b * 2/5

576 movq mm6, three_fifths

577

578

579 punpcklbw mm0, mm7 // unpack low to word

580 pmullw mm3, mm5 // b * 2/5

581

582 movq mm4, mm0 // make copy of c

583 punpckhbw mm2, mm7 // unpack high to word

584

585 pmullw mm4, mm6 // c * 3/5

586 movq mm5, mm2

587

588 pmullw mm5, mm6 // c * 3/5

589 paddw mm1, mm4 // b * 2/5 + c * 3/5

590

591 paddw mm3, mm5 // b * 2/5 + c * 3/5

592 paddw mm1, round_values // + 128

593

594 paddw mm3, round_values // + 128

595 psrlw mm1, 8

596

597 psrlw mm3, 8

598 packuswb mm1, mm3 // des[2]

599

600 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]

601 movq mm1, [edi] // mm1=Src[3];

602

603 movq QWORD ptr [edi+ecx], mm1 // write des[4];

604

605 // mm0, mm2 --- Src[2]

606 // mm1 --- Src[3]

607 // mm6 --- 3/5

608 // mm7 for unpacking

609

610 pmullw mm0, mm6 // c * 3/5

611 movq mm5, two_fifths // mm5 = 2/5

612

613 movq mm3, mm1 // make a copy

614 pmullw mm2, mm6 // c * 3/5

615

616 punpcklbw mm1, mm7 // unpack low

617 movq mm4, mm1 // make a copy

618

619 punpckhbw mm3, mm7 // unpack high

620 pmullw mm4, mm5 // d * 2/5

621

622 movq mm6, mm3 // make a copy

623 pmullw mm6, mm5 // d * 2/5

624

625 paddw mm0, mm4 // c * 3/5 + d * 2/5

626 paddw mm2, mm6 // c * 3/5 + d * 2/5

627

628 paddw mm0, round_values // + 128

629 paddw mm2, round_values // + 128

630

631 psrlw mm0, 8

632 psrlw mm2, 8

633

634 packuswb mm0, mm2 // des[3]

635 movq QWORD ptr [edi], mm0 // write des[3]

636

637 // mm1, mm3 --- Src[3]

638 // mm7 -- cleared for unpacking

639 add edi, 8

640 add esi, 8

641

642 sub edx, 8

643 jg last_vs_4_5_loop

644 }

645 }

646

647 /****************************************************************************

648 *

649 * ROUTINE : vertical_band_3_5_scale_mmx

650 *

651 * INPUTS : unsigned char *dest :

652 * unsigned int dest_pitch :

653 * unsigned int dest_width :

654 *

655 * OUTPUTS : None.

656 *

657 * RETURNS : void

658 *

659 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.

660 *

661 * SPECIAL NOTES : The routine uses the first line of the band below

662 * the current band. The function also has a "C" only

663 * version.

664 *

665 ****************************************************************************/

666 static

667 void vertical_band_3_5_scale_mmx

668 (

669 unsigned char *dest,

670 unsigned int dest_pitch,

671 unsigned int dest_width

672 ) {

673 __asm {

674 mov esi, dest // Get the source and destination pointer

675 mov ecx, dest_pitch // Get the pitch size

676

677 lea edi, [esi+ecx*2] // two lines below

678 add edi, ecx // three lines below

679

680 pxor mm7, mm7 // clear out mm7

681 mov edx, dest_width // Loop counter

682

683 vs_3_5_loop:

684

685 movq mm0, QWORD ptr [esi] // src[0];

686 movq mm1, QWORD ptr [esi+ecx] // src[1];

687

688 movq mm2, mm0 // Make a copy

689 punpcklbw mm0, mm7 // unpack low to word

690

691 movq mm5, two_fifths // mm5 = 2/5

692 punpckhbw mm2, mm7 // unpack high to word

693

694 pmullw mm0, mm5 // a * 2/5

695

696 movq mm3, mm1 // make a copy

697 punpcklbw mm1, mm7 // unpack low to word

698

699 pmullw mm2, mm5 // a * 2/5

700 movq mm6, three_fifths // mm6 = 3/5

701

702 movq mm4, mm1 // copy of low b

703 pmullw mm4, mm6 // b * 3/5

704

705 punpckhbw mm3, mm7 // unpack high to word

706 movq mm5, mm3 // copy of high b

707

708 pmullw mm5, mm6 // b * 3/5

709 paddw mm0, mm4 // a * 2/5 + b * 3/5

710

711 paddw mm2, mm5 // a * 2/5 + b * 3/5

712 paddw mm0, round_values // + 128

713

714 paddw mm2, round_values // + 128

715 psrlw mm0, 8

716

717 psrlw mm2, 8

718 packuswb mm0, mm2 // des [1]

719

720 movq QWORD ptr [esi+ecx], mm0 // write des[1]

721 movq mm0, [esi+ecx*2] // mm0 = src[2]

722

723 // mm1, mm3 --- Src[1]

724 // mm0 --- Src[2]

725 // mm7 for unpacking

726

727 movq mm4, mm1 // b low

728 pmullw mm1, four_fifths // b * 4/5 low

729

730 movq mm5, mm3 // b high

731 pmullw mm3, four_fifths // b * 4/5 high

732

733 movq mm2, mm0 // c

734 pmullw mm4, one_fifth // b * 1/5

735

736 punpcklbw mm0, mm7 // c low

737 pmullw mm5, one_fifth // b * 1/5

738

739 movq mm6, mm0 // make copy of c low

740 punpckhbw mm2, mm7 // c high

741

742 pmullw mm6, one_fifth // c * 1/5 low

743 movq mm7, mm2 // make copy of c high

744

745 pmullw mm7, one_fifth // c * 1/5 high

746 paddw mm1, mm6 // b * 4/5 + c * 1/5 low

747

748 paddw mm3, mm7 // b * 4/5 + c * 1/5 high

749 movq mm6, mm0 // make copy of c low

750

751 pmullw mm6, four_fifths // c * 4/5 low

752 movq mm7, mm2 // make copy of c high

753

754 pmullw mm7, four_fifths // c * 4/5 high

755

756 paddw mm4, mm6 // b * 1/5 + c * 4/5 low

757 paddw mm5, mm7 // b * 1/5 + c * 4/5 high

758

759 paddw mm1, round_values // + 128

760 paddw mm3, round_values // + 128

761

762 psrlw mm1, 8

763 psrlw mm3, 8

764

765 packuswb mm1, mm3 // des[2]

766 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]

767

768 paddw mm4, round_values // + 128

769 paddw mm5, round_values // + 128

770

771 psrlw mm4, 8

772 psrlw mm5, 8

773

774 packuswb mm4, mm5 // des[3]

775 movq QWORD ptr [edi], mm4 // write des[3]

776

777 // mm0, mm2 --- Src[3]

778

779 pxor mm7, mm7 // clear mm7 for unpacking

780 movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next group

781

782 movq mm5, three_fifths // mm5 = 3/5

783 pmullw mm0, mm5 // d * 3/5

784

785 movq mm6, two_fifths // mm6 = 2/5

786 movq mm3, mm1 // make a copy

787

788 pmullw mm2, mm5 // d * 3/5

789 punpcklbw mm1, mm7 // unpack low

790

791 pmullw mm1, mm6 // an * 2/5

792 punpckhbw mm3, mm7 // unpack high

793

794 paddw mm0, mm1 // d * 3/5 + an * 2/5

795 pmullw mm3, mm6 // an * 2/5

796

797 paddw mm2, mm3 // d * 3/5 + an * 2/5

798 paddw mm0, round_values // + 128

799

800 paddw mm2, round_values // + 128

801 psrlw mm0, 8

802

803 psrlw mm2, 8

804 packuswb mm0, mm2 // des[4]

805

806 movq QWORD ptr [edi+ecx], mm0 // write des[4]

807

808 add edi, 8

809 add esi, 8

810

811 sub edx, 8

812 jg vs_3_5_loop

813 }

814 }

815

816 /****************************************************************************

817 *

818 * ROUTINE : last_vertical_band_3_5_scale_mmx

819 *

820 * INPUTS : unsigned char *dest :

821 * unsigned int dest_pitch :

822 * unsigned int dest_width :

823 *

824 * OUTPUTS : None.

825 *

826 * RETURNS : void

827 *

828 * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band in an image.

829 *

830 * SPECIAL NOTES : The routine uses the first line of the band below

831 * the current band. The function also has a "C" only

832 * version.

833 *

834 ****************************************************************************/

835 static

836 void last_vertical_band_3_5_scale_mmx

837 (

838 unsigned char *dest,

839 unsigned int dest_pitch,

840 unsigned int dest_width

841 ) {

842 __asm {

843 mov esi, dest // Get the source and destination pointer

844 mov ecx, dest_pitch // Get the pitch size

845

846 lea edi, [esi+ecx*2] // two lines below

847 add edi, ecx // three lines below

848

849 pxor mm7, mm7 // clear out mm7

850 mov edx, dest_width // Loop counter

851

852

853 last_vs_3_5_loop:

854

855 movq mm0, QWORD ptr [esi] // src[0];

856 movq mm1, QWORD ptr [esi+ecx] // src[1];

857

858 movq mm2, mm0 // Make a copy

859 punpcklbw mm0, mm7 // unpack low to word

860

861 movq mm5, two_fifths // mm5 = 2/5

862 punpckhbw mm2, mm7 // unpack high to word

863

864 pmullw mm0, mm5 // a * 2/5

865

866 movq mm3, mm1 // make a copy

867 punpcklbw mm1, mm7 // unpack low to word

868

869 pmullw mm2, mm5 // a * 2/5

870 movq mm6, three_fifths // mm6 = 3/5

871

872 movq mm4, mm1 // copy of low b

873 pmullw mm4, mm6 // b * 3/5

874

875 punpckhbw mm3, mm7 // unpack high to word

876 movq mm5, mm3 // copy of high b

877

878 pmullw mm5, mm6 // b * 3/5

879 paddw mm0, mm4 // a * 2/5 + b * 3/5

880

881 paddw mm2, mm5 // a * 2/5 + b * 3/5

882 paddw mm0, round_values // + 128

883

884 paddw mm2, round_values // + 128

885 psrlw mm0, 8

886

887 psrlw mm2, 8

888 packuswb mm0, mm2 // des [1]

889

890 movq QWORD ptr [esi+ecx], mm0 // write des[1]

891 movq mm0, [esi+ecx*2] // mm0 = src[2]

892

893

894

895 // mm1, mm3 --- Src[1]

896 // mm0 --- Src[2]

897 // mm7 for unpacking

898

899 movq mm4, mm1 // b low

900 pmullw mm1, four_fifths // b * 4/5 low

901

902 movq QWORD ptr [edi+ecx], mm0 // write des[4]

903

904 movq mm5, mm3 // b high

905 pmullw mm3, four_fifths // b * 4/5 high

906

907 movq mm2, mm0 // c

908 pmullw mm4, one_fifth // b * 1/5

909

910 punpcklbw mm0, mm7 // c low

911 pmullw mm5, one_fifth // b * 1/5

912

913 movq mm6, mm0 // make copy of c low

914 punpckhbw mm2, mm7 // c high

915

916 pmullw mm6, one_fifth // c * 1/5 low

917 movq mm7, mm2 // make copy of c high

918

919 pmullw mm7, one_fifth // c * 1/5 high

920 paddw mm1, mm6 // b * 4/5 + c * 1/5 low

921

922 paddw mm3, mm7 // b * 4/5 + c * 1/5 high

923 movq mm6, mm0 // make copy of c low

924

925 pmullw mm6, four_fifths // c * 4/5 low

926 movq mm7, mm2 // make copy of c high

927

928 pmullw mm7, four_fifths // c * 4/5 high

929

930 paddw mm4, mm6 // b * 1/5 + c * 4/5 low

931 paddw mm5, mm7 // b * 1/5 + c * 4/5 high

932

933 paddw mm1, round_values // + 128

934 paddw mm3, round_values // + 128

935

936 psrlw mm1, 8

937 psrlw mm3, 8

938

939 packuswb mm1, mm3 // des[2]

940 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]

941

942 paddw mm4, round_values // + 128

943 paddw mm5, round_values // + 128

944

945 psrlw mm4, 8

946 psrlw mm5, 8

947

948 packuswb mm4, mm5 // des[3]

949 movq QWORD ptr [edi], mm4 // write des[3]

950

951 // mm0, mm2 --- Src[3]

952

953 add edi, 8

954 add esi, 8

955

956 sub edx, 8

957 jg last_vs_3_5_loop

958 }

959 }

960

961 /****************************************************************************

962 *

963 * ROUTINE : vertical_band_1_2_scale_mmx

964 *

965 * INPUTS : unsigned char *dest :

966 * unsigned int dest_pitch :

967 * unsigned int dest_width :

968 *

969 * OUTPUTS : None.

970 *

971 * RETURNS : void

972 *

973 * FUNCTION : 1 to 2 up-scaling of a band of pixels.

974 *

975 * SPECIAL NOTES : The routine uses the first line of the band below

976 * the current band. The function also has a "C" only

977 * version.

978 *

979 ****************************************************************************/

980 static

981 void vertical_band_1_2_scale_mmx

982 (

983 unsigned char *dest,

984 unsigned int dest_pitch,

985 unsigned int dest_width

986 ) {

987 __asm {

988

989 mov esi, dest // Get the source and destination pointer

990 mov ecx, dest_pitch // Get the pitch size

991

992 pxor mm7, mm7 // clear out mm7

993 mov edx, dest_width // Loop counter

994

995 vs_1_2_loop:

996

997 movq mm0, [esi] // get Src[0]

998 movq mm1, [esi + ecx * 2] // get Src[1]

999

1000 movq mm2, mm0 // make copy before unpack

1001 movq mm3, mm1 // make copy before unpack

1002

1003 punpcklbw mm0, mm7 // low Src[0]

1004 movq mm6, four_ones // mm6= 1, 1, 1, 1

1005

1006 punpcklbw mm1, mm7 // low Src[1]

1007 paddw mm0, mm1 // low (a + b)

1008

1009 punpckhbw mm2, mm7 // high Src[0]

1010 paddw mm0, mm6 // low (a + b + 1)

1011

1012 punpckhbw mm3, mm7

1013 paddw mm2, mm3 // high (a + b )

1014

1015 psraw mm0, 1 // low (a + b +1 )/2

1016 paddw mm2, mm6 // high (a + b + 1)

1017

1018 psraw mm2, 1 // high (a + b + 1)/2

1019 packuswb mm0, mm2 // pack results

1020

1021 movq [esi+ecx], mm0 // write out eight bytes

1022 add esi, 8

1023

1024 sub edx, 8

1025 jg vs_1_2_loop

1026 }

1027

1028 }

1029

1030 /****************************************************************************

1031 *

1032 * ROUTINE : last_vertical_band_1_2_scale_mmx

1033 *

1034 * INPUTS : unsigned char *dest :

1035 * unsigned int dest_pitch :

1036 * unsigned int dest_width :

1037 *

1038 * OUTPUTS : None.

1039 *

1040 * RETURNS : void

1041 *

1042 * FUNCTION : 1 to 2 up-scaling of band of pixels.

1043 *

1044 * SPECIAL NOTES : The routine uses the first line of the band below

1045 * the current band. The function also has a "C" only

1046 * version.

1047 *

1048 ****************************************************************************/

1049 static

1050 void last_vertical_band_1_2_scale_mmx

1051 (

1052 unsigned char *dest,

1053 unsigned int dest_pitch,

1054 unsigned int dest_width

1055 ) {

1056 __asm {

1057 mov esi, dest // Get the source and destination pointer

1058 mov ecx, dest_pitch // Get the pitch size

1059

1060 mov edx, dest_width // Loop counter

1061

1062 last_vs_1_2_loop:

1063

1064 movq mm0, [esi] // get Src[0]

1065 movq [esi+ecx], mm0 // write out eight bytes

1066

1067 add esi, 8

1068 sub edx, 8

1069

1070 jg last_vs_1_2_loop

1071 }

1072 }

1073

1074 /****************************************************************************

1075 *

1076 * ROUTINE : horizontal_line_1_2_scale_mmx

1077 *

1078 * INPUTS : const unsigned char *source :

1079 * unsigned int source_width :

1080 * unsigned char *dest :

1081 * unsigned int dest_width :

1082 *

1083 * OUTPUTS : None.

1084 *

1085 * RETURNS : void

1086 *

1087 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.

1088 *

1089 * SPECIAL NOTES : None.

1090 *

1091 ****************************************************************************/

1092 static

1093 void horizontal_line_1_2_scale_mmx

1094 (

1095 const unsigned char *source,

1096 unsigned int source_width,

1097 unsigned char *dest,

1098 unsigned int dest_width

1099 ) {

1100 (void) dest_width;

1101

1102 __asm {

1103 mov esi, source

1104 mov edi, dest

1105

1106 pxor mm7, mm7

1107 movq mm6, four_ones

1108

1109 mov ecx, source_width

1110

1111 hs_1_2_loop:

1112

1113 movq mm0, [esi]

1114 movq mm1, [esi+1]

1115

1116 movq mm2, mm0

1117 movq mm3, mm1

1118

1119 movq mm4, mm0

1120 punpcklbw mm0, mm7

1121

1122 punpcklbw mm1, mm7

1123 paddw mm0, mm1

1124

1125 paddw mm0, mm6

1126 punpckhbw mm2, mm7

1127

1128 punpckhbw mm3, mm7

1129 paddw mm2, mm3

1130

1131 paddw mm2, mm6

1132 psraw mm0, 1

1133

1134 psraw mm2, 1

1135 packuswb mm0, mm2

1136

1137 movq mm2, mm4

1138 punpcklbw mm2, mm0

1139

1140 movq [edi], mm2

1141 punpckhbw mm4, mm0

1142

1143 movq [edi+8], mm4

1144 add esi, 8

1145

1146 add edi, 16

1147 sub ecx, 8

1148

1149 cmp ecx, 8

1150 jg hs_1_2_loop

1151

1152 // last eight pixel

1153

1154 movq mm0, [esi]

1155 movq mm1, mm0

1156

1157 movq mm2, mm0

1158 movq mm3, mm1

1159

1160 psrlq mm1, 8

1161 psrlq mm3, 56

1162

1163 psllq mm3, 56

1164 por mm1, mm3

1165

1166 movq mm3, mm1

1167 movq mm4, mm0

1168

1169 punpcklbw mm0, mm7

1170 punpcklbw mm1, mm7

1171

1172 paddw mm0, mm1

1173 paddw mm0, mm6

1174

1175 punpckhbw mm2, mm7

1176 punpckhbw mm3, mm7

1177

1178 paddw mm2, mm3

1179 paddw mm2, mm6

1180

1181 psraw mm0, 1

1182 psraw mm2, 1

1183

1184 packuswb mm0, mm2

1185 movq mm2, mm4

1186

1187 punpcklbw mm2, mm0

1188 movq [edi], mm2

1189

1190 punpckhbw mm4, mm0

1191 movq [edi+8], mm4

1192 }

1193 }

1194

1195

1196

1197

1198

1199 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };	29 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };

1200 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };	30 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };

1201	31

1202	32

1203 /****************************************************************************	33 /****************************************************************************

1204 *	34 *

1205 * ROUTINE : horizontal_line_5_4_scale_mmx	35 * ROUTINE : horizontal_line_5_4_scale_mmx

1206 *	36 *

1207 * INPUTS : const unsigned char *source : Pointer to source data.	37 * INPUTS : const unsigned char *source : Pointer to source data.

1208 * unsigned int source_width : Stride of source.	38 * unsigned int source_width : Stride of source.

(...skipping 469 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1678 cmp esi, ecx	508 cmp esi, ecx

1679 jl vs_2_1_i_loop	509 jl vs_2_1_i_loop

1680	510

1681 }	511 }

1682 }	512 }

1683	513

1684	514

1685	515

1686 void	516 void

1687 register_mmxscalers(void) {	517 register_mmxscalers(void) {

1688 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;

1689 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;

1690 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;

1691 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;

1692 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;

1693 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;

1694 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;

1695 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;

1696 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;

1697

1698 vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c;

1699 vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c;

1700 vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;

1701 vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c;

1702 vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c;

1703 vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;

1704

1705

1706

1707 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;	518 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;

1708 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;	519 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;

1709 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;	520 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;

1710 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;	521 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;

1711 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;	522 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;

1712 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;	523 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;

1713 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;	524 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;

1714

1715

1716

1717

1718 }	525 }

OLD	NEW

« libvpx.gyp ('K') | « source/libvpx/vpx_scale/vpxscale.h ('k') | source/libvpx/vpxdec.c » ('j') | no next file with comments »