;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    .globl sixtap_predict_ppc
    .globl sixtap_predict8x4_ppc
    .globl sixtap_predict8x8_ppc
    .globl sixtap_predict16x16_ppc

.macro load_c V, LABEL, OFF, R0, R1
    lis \R0, \LABEL@ha
    la \R1, \LABEL@l(\R0)
    lvx \V, \OFF, \R1
.endm

.macro load_hfilter V0, V1
    load_c \V0, HFilter, r5, r9, r10

    addi r5, r5, 16
    lvx \V1, r5, r10
.endm
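
;# Callers scale x_offset by 32 (slwi r5, r5, 5) before using load_hfilter:
;# each HFilter entry is 32 bytes, split across two vectors so the 4-tap and
;# 2-tap groups can be fed to vmsummbm separately.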

;# Vertical filtering
.macro Vprolog
    load_c v0, VFilter, r6, r3, r10

    vspltish v5, 8
    vspltish v6, 3
    vslh v6, v5, v6    ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb v1, v0, 1
    vspltb v2, v0, 2
    vspltb v3, v0, 3
    vspltb v4, v0, 4
    vspltb v5, v0, 5
    vspltb v0, v0, 0
.endm
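
;# After Vprolog, v0..v5 hold the six vertical taps splatted across every
;# byte lane and v6 holds the 0x0040 rounding bias added before the final
;# shift right by 7.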

.macro vpre_load
    Vprolog
    li r10, 16
    lvx v10, 0, r9    ;# v10..v14 = first 5 rows
    lvx v11, r10, r9
    addi r9, r9, 32
    lvx v12, 0, r9
    lvx v13, r10, r9
    addi r9, r9, 32
    lvx v14, 0, r9
.endm
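
;# vpre_load expects r9 to point at the temporary row buffer filled by the
;# first pass (rows stored 16 bytes apart); it loads the first five rows into
;# v10..v14 and leaves r9 on the last row loaded so luma_v can step forward.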

.macro Msum Re, Ro, V, T, TMP
    ;# (Re,Ro) += (V*T)
    vmuleub \TMP, \V, \T    ;# \TMP is clobbered
    vadduhm \Re, \Re, \TMP    ;# Re = evens, saturation unnecessary
    vmuloub \TMP, \V, \T
    vadduhm \Ro, \Ro, \TMP    ;# Ro = odds
.endm
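
;# The vertical filter keeps even- and odd-numbered pixels in separate 16-bit
;# accumulators (vmuleub/vmuloub produce the even- and odd-element products);
;# the two halves are re-interleaved with vmrghh/vmrglh after the taps are
;# summed.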

.macro vinterp_no_store P0 P1 P2 P3 P4 P5
    vmuleub v8, \P0, v0    ;# 64 + 4 positive taps
    vadduhm v16, v6, v8
    vmuloub v8, \P0, v0
    vadduhm v17, v6, v8
    Msum v16, v17, \P2, v2, v8
    Msum v16, v17, \P3, v3, v8
    Msum v16, v17, \P5, v5, v8

    vmuleub v18, \P1, v1    ;# 2 negative taps
    vmuloub v19, \P1, v1
    Msum v18, v19, \P4, v4, v8

    vsubuhs v16, v16, v18    ;# subtract neg from pos
    vsubuhs v17, v17, v19
    vsrh v16, v16, v7    ;# divide by 128
    vsrh v17, v17, v7    ;# v16 v17 = evens, odds
    vmrghh v18, v16, v17    ;# v18 v19 = 16-bit result in order
    vmrglh v19, v16, v17
    vpkuhus \P0, v18, v19    ;# P0 = 8-bit result
.endm

.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
    vmuleub v24, \P0, v13    ;# 64 + 4 positive taps
    vadduhm v21, v20, v24
    vmuloub v24, \P0, v13
    vadduhm v22, v20, v24
    Msum v21, v22, \P2, v15, v25
    Msum v21, v22, \P3, v16, v25
    Msum v21, v22, \P5, v18, v25

    vmuleub v23, \P1, v14    ;# 2 negative taps
    vmuloub v24, \P1, v14
    Msum v23, v24, \P4, v17, v25

    vsubuhs v21, v21, v23    ;# subtract neg from pos
    vsubuhs v22, v22, v24
    vsrh v21, v21, v19    ;# divide by 128
    vsrh v22, v22, v19    ;# v21 v22 = evens, odds
    vmrghh v23, v21, v22    ;# v23 v24 = 16-bit result in order
    vmrglh v24, v21, v22
    vpkuhus \P0, v23, v24    ;# P0 = 8-bit result
.endm


.macro Vinterp P0 P1 P2 P3 P4 P5
    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
    stvx \P0, 0, r7
    add r7, r7, r8    ;# 33 ops per 16 pels
.endm


.macro luma_v P0, P1, P2, P3, P4, P5
    addi r9, r9, 16    ;# P5 = newest input row
    lvx \P5, 0, r9
    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
.endm

.macro luma_vtwo
    luma_v v10, v11, v12, v13, v14, v15
    luma_v v11, v12, v13, v14, v15, v10
.endm

.macro luma_vfour
    luma_vtwo
    luma_v v12, v13, v14, v15, v10, v11
    luma_v v13, v14, v15, v10, v11, v12
.endm

.macro luma_vsix
    luma_vfour
    luma_v v14, v15, v10, v11, v12, v13
    luma_v v15, v10, v11, v12, v13, v14
.endm

.macro Interp4 R I I4
    vmsummbm \R, v13, \I, v15
    vmsummbm \R, v14, \I4, \R
.endm
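
;# Interp4 computes four 32-bit dot products at once: vmsummbm multiplies the
;# signed taps in v13/v14 by the unsigned pixel windows in \I and \I4 and sums
;# them into each word, starting from the 0x40 rounding bias kept in v15.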

.macro Read8x8 VD, RS, RP, increment_counter
    lvsl v21, 0, \RS    ;# permute value for alignment

    ;# the source row is not necessarily aligned, so gather it from the two
    ;# 16-byte vectors it can span and left-justify it with vperm.
    lvx \VD, 0, \RS
    lvx v20, r10, \RS

.if \increment_counter
    add \RS, \RS, \RP
.endif

    vperm \VD, \VD, v20, v21
.endm

.macro interp_8x8 R
    vperm v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
    vperm v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
    Interp4 v20, v20, v21    ;# v20 = result 0 1 2 3
    vperm \R, \R, \R, v18    ;# R = 89AB 9ABC ABCx BCxx
    Interp4 v21, v21, \R    ;# v21 = result 4 5 6 7

    vpkswus \R, v20, v21    ;# R = 0 1 2 3 4 5 6 7
    vsrh \R, \R, v19

    vpkuhus \R, \R, \R    ;# saturate and pack
.endm
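
;# interp_8x8 horizontally filters one row of eight output pixels: the
;# B_0123/B_4567/B_89AB permutes in v16/v17/v18 build the sliding 4-byte
;# windows each pixel needs, Interp4 applies the taps, and the results are
;# packed, shifted right by 7 (v19) and saturated back to bytes.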

.macro Read4x4 VD, RS, RP, increment_counter
    lvsl v21, 0, \RS    ;# permute value for alignment

    ;# left-justify the (possibly unaligned) source bytes within the
    ;# 16-byte vector that holds them.
    lvx v20, 0, \RS

.if \increment_counter
    add \RS, \RS, \RP
.endif

    vperm \VD, v20, v20, v21
.endm
    .text

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
sixtap_predict_ppc:
    mfspr r11, 256    ;# get old VRSAVE
    oris r12, r11, 0xff87
    ori r12, r12, 0xffc0
    mtspr 256, r12    ;# set VRSAVE

    stwu r1, -32(r1)    ;# create space on the stack

    slwi. r5, r5, 5    ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;# just skip to the second pass.
    beq- vertical_only_4x4

    ;# load up horizontal filter
    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw v15, v16, v15    ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi r3, r3, -2

    addi r9, r3, 0
    li r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi. r6, r6, 4    ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block. If there is no
    ;# vertical filtering, jump to storing the data. Otherwise
    ;# load up and filter the additional 5 lines that are needed
    ;# for the vertical filter.
    beq- store_4x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r9, r9, r4
    sub r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b second_pass_4x4

vertical_only_4x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r3, r3, r4
    sub r3, r3, r4
    li r10, 16

    Read8x8 v0, r3, r4, 1
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    slwi r6, r6, 4    ;# index into vertical filter array

second_pass_4x4:
    load_c v20, b_hilo_4x4, 0, r9, r10
    load_c v21, b_hilo, 0, r9, r10

    ;# reposition input so that it can go through the
    ;# filtering phase with one pass.
    vperm v0, v0, v1, v20    ;# 0 1 x x
    vperm v2, v2, v3, v20    ;# 2 3 x x
    vperm v4, v4, v5, v20    ;# 4 5 x x
    vperm v6, v6, v7, v20    ;# 6 7 x x

    vperm v0, v0, v2, v21    ;# 0 1 2 3
    vperm v4, v4, v6, v21    ;# 4 5 6 7

    vsldoi v1, v0, v4, 4
    vsldoi v2, v0, v4, 8
    vsldoi v3, v0, v4, 12

    vsldoi v5, v4, v8, 4

    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh v20, v15, v20    ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb v14, v13, 1
    vspltb v15, v13, 2
    vspltb v16, v13, 3
    vspltb v17, v13, 4
    vspltb v18, v13, 5
    vspltb v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5

    stvx v0, 0, r1

    lwz r0, 0(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    lwz r0, 4(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    lwz r0, 8(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    lwz r0, 12(r1)
    stw r0, 0(r7)

    b exit_4x4

store_4x4:

    stvx v2, 0, r1
    lwz r0, 0(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    stvx v3, 0, r1
    lwz r0, 0(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    stvx v4, 0, r1
    lwz r0, 0(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    stvx v5, 0, r1
    lwz r0, 0(r1)
    stw r0, 0(r7)

exit_4x4:

    addi r1, r1, 32    ;# recover stack

    mtspr 256, r11    ;# reset old VRSAVE

    blr

.macro w_8x8 V, D, R, P
    stvx \V, 0, r1
    lwz \R, 0(r1)
    stw \R, 0(r7)
    lwz \R, 4(r1)
    stw \R, 4(r7)
    add \D, \D, \P
.endm
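
;# w_8x8 writes the first 8 bytes of \V to the current destination row: the
;# vector is staged through the stack and copied as two word stores, so the
;# destination does not have to be 16-byte aligned; the row pointer is then
;# advanced by the pitch.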

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

sixtap_predict8x4_ppc:
    mfspr r11, 256    ;# get old VRSAVE
    oris r12, r11, 0xffff
    ori r12, r12, 0xffc0
    mtspr 256, r12    ;# set VRSAVE

    stwu r1, -32(r1)    ;# create space on the stack

    slwi. r5, r5, 5    ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;# just skip to the second pass.
    beq- second_pass_pre_copy_8x4

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw v15, v16, v15    ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi r3, r3, -2

    addi r9, r3, 0
    li r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi. r6, r6, 4    ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block. If there is no
    ;# vertical filtering, jump to storing the data. Otherwise
    ;# load up and filter the additional 5 lines that are needed
    ;# for the vertical filter.
    beq- store_8x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r9, r9, r4
    sub r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b second_pass_8x4

second_pass_pre_copy_8x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r3, r3, r4
    sub r3, r3, r4
    li r10, 16

    Read8x8 v0, r3, r4, 1
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1

    slwi r6, r6, 4    ;# index into vertical filter array

second_pass_8x4:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh v20, v15, v20    ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb v14, v13, 1
    vspltb v15, v13, 2
    vspltb v16, v13, 3
    vspltb v17, v13, 4
    vspltb v18, v13, 5
    vspltb v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
    vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
    vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
    vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8

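    ;# if the destination pitch is 8, two consecutive 8-byte rows are
    ;# contiguous, so pairs of rows can be merged with the b_hilo permute and
    ;# written with single 16-byte stores.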
    cmpi cr0, r8, 8
    beq cr0, store_aligned_8x4

    w_8x8 v0, r7, r0, r8
    w_8x8 v1, r7, r0, r8
    w_8x8 v2, r7, r0, r8
    w_8x8 v3, r7, r0, r8

    b exit_8x4

store_aligned_8x4:

    load_c v10, b_hilo, 0, r9, r10

    vperm v0, v0, v1, v10
    vperm v2, v2, v3, v10

    stvx v0, 0, r7
    addi r7, r7, 16
    stvx v2, 0, r7

    b exit_8x4

store_8x4:
    cmpi cr0, r8, 8
    beq cr0, store_aligned2_8x4

    w_8x8 v2, r7, r0, r8
    w_8x8 v3, r7, r0, r8
    w_8x8 v4, r7, r0, r8
    w_8x8 v5, r7, r0, r8

    b exit_8x4

store_aligned2_8x4:
    load_c v10, b_hilo, 0, r9, r10

    vperm v2, v2, v3, v10
    vperm v4, v4, v5, v10

    stvx v2, 0, r7
    addi r7, r7, 16
    stvx v4, 0, r7

exit_8x4:

    addi r1, r1, 32    ;# recover stack

    mtspr 256, r11    ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Because the width that needs to be filtered will fit in a single AltiVec
;# register, there is no need to loop. Everything can stay in registers.
sixtap_predict8x8_ppc:
    mfspr r11, 256    ;# get old VRSAVE
    oris r12, r11, 0xffff
    ori r12, r12, 0xffc0
    mtspr 256, r12    ;# set VRSAVE

    stwu r1, -32(r1)    ;# create space on the stack

    slwi. r5, r5, 5    ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;# just skip to the second pass.
    beq- second_pass_pre_copy_8x8

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw v15, v16, v15    ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi r3, r3, -2

    addi r9, r3, 0
    li r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1
    Read8x8 v9, r3, r4, 1

    slwi. r6, r6, 4    ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8
    interp_8x8 v9

    ;# Finished filtering main horizontal block. If there is no
    ;# vertical filtering, jump to storing the data. Otherwise
    ;# load up and filter the additional 5 lines that are needed
    ;# for the vertical filter.
    beq- store_8x8

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r9, r9, r4
    sub r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v10
    interp_8x8 v11
    interp_8x8 v12

    b second_pass_8x8

second_pass_pre_copy_8x8:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r3, r3, r4
    sub r3, r3, r4
    li r10, 16

    Read8x8 v0, r3, r4, 1
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1
    Read8x8 v9, r3, r4, 1
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    slwi r6, r6, 4    ;# index into vertical filter array

second_pass_8x8:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh v20, v15, v20    ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb v14, v13, 1
    vspltb v15, v13, 2
    vspltb v16, v13, 3
    vspltb v17, v13, 4
    vspltb v18, v13, 5
    vspltb v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
    vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
    vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
    vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
    vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
    vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
    vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12

    cmpi cr0, r8, 8
    beq cr0, store_aligned_8x8

    w_8x8 v0, r7, r0, r8
    w_8x8 v1, r7, r0, r8
    w_8x8 v2, r7, r0, r8
    w_8x8 v3, r7, r0, r8
    w_8x8 v4, r7, r0, r8
    w_8x8 v5, r7, r0, r8
    w_8x8 v6, r7, r0, r8
    w_8x8 v7, r7, r0, r8

    b exit_8x8

store_aligned_8x8:

    load_c v10, b_hilo, 0, r9, r10

    vperm v0, v0, v1, v10
    vperm v2, v2, v3, v10
    vperm v4, v4, v5, v10
    vperm v6, v6, v7, v10

    stvx v0, 0, r7
    addi r7, r7, 16
    stvx v2, 0, r7
    addi r7, r7, 16
    stvx v4, 0, r7
    addi r7, r7, 16
    stvx v6, 0, r7

    b exit_8x8

store_8x8:
    cmpi cr0, r8, 8
    beq cr0, store_aligned2_8x8

    w_8x8 v2, r7, r0, r8
    w_8x8 v3, r7, r0, r8
    w_8x8 v4, r7, r0, r8
    w_8x8 v5, r7, r0, r8
    w_8x8 v6, r7, r0, r8
    w_8x8 v7, r7, r0, r8
    w_8x8 v8, r7, r0, r8
    w_8x8 v9, r7, r0, r8

    b exit_8x8

store_aligned2_8x8:
    load_c v10, b_hilo, 0, r9, r10

    vperm v2, v2, v3, v10
    vperm v4, v4, v5, v10
    vperm v6, v6, v7, v10
    vperm v8, v8, v9, v10

    stvx v2, 0, r7
    addi r7, r7, 16
    stvx v4, 0, r7
    addi r7, r7, 16
    stvx v6, 0, r7
    addi r7, r7, 16
    stvx v8, 0, r7

exit_8x8:

    addi r1, r1, 32    ;# recover stack

    mtspr 256, r11    ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Two pass filtering. First pass is Horizontal edges, second pass is vertical
;# edges. One of the filters can be null, but both won't be. Needs to use a
;# temporary buffer because the source buffer can't be modified and the buffer
;# for the destination is not large enough to hold the temporary data.
sixtap_predict16x16_ppc:
    mfspr r11, 256    ;# get old VRSAVE
    oris r12, r11, 0xffff
    ori r12, r12, 0xf000
    mtspr 256, r12    ;# set VRSAVE

    stwu r1, -416(r1)    ;# create space on the stack

    ;# Three possibilities
    ;# 1. First filter is null. Don't use a temp buffer.
    ;# 2. Second filter is null. Don't use a temp buffer.
    ;# 3. Neither is null; use the temp buffer.

    ;# First Pass (horizontal edge)
    ;# set up pointers for src
    ;# if possibility (1), set up the src pointer to be the original and jump
    ;# to the second pass. This is based on whether x_offset is 0.

    ;# load up horizontal filter
    slwi. r5, r5, 5    ;# index into horizontal filter array

    load_hfilter v4, v5

    beq- copy_horizontal_16x21

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi r3, r3, -2

    slwi. r6, r6, 4    ;# index into vertical filter array

    ;# set up constants
    ;# v14 permutation value for alignment
    load_c v14, b_hperm, 0, r9, r10

    ;# These statements assume there won't be a second pass; if there is one,
    ;# they are overridden by the block just before no_vertical_filter_bypass.
    li r0, 16    ;# prepare for no vertical filter

    ;# Change the output pointer and pitch to be the actual
    ;# destination instead of a temporary buffer.
    addi r9, r7, 0
    addi r5, r8, 0

    ;# no vertical filter, so write the output from the first pass
    ;# directly into the output buffer.
    beq- no_vertical_filter_bypass

    ;# if the second filter is not null then need to back off by 2*pitch
    sub r3, r3, r4
    sub r3, r3, r4

    ;# set up the counter for the number of lines that are going to be filtered
    li r0, 21

    ;# use the stack as temporary storage
    la r9, 48(r1)
    li r5, 16

no_vertical_filter_bypass:

    mtctr r0

    ;# rounding added in on the multiply
    vspltisw v10, 8
    vspltisw v12, 3
    vslw v12, v10, v12    ;# 0x00000040000000400000004000000040

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v13, 7

    ;# index to the next set of vectors in the row.
    li r10, 16
    li r12, 32

horizontal_loop_16x16:

    lvsl v15, 0, r3    ;# permute value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;# input can span three vectors if not aligned correctly.
    lvx v1, 0, r3
    lvx v2, r10, r3
    lvx v3, r12, r3

    vperm v8, v1, v2, v15
    vperm v9, v2, v3, v15    ;# v8 v9 = 21 input pixels left-justified

    vsldoi v11, v8, v9, 4

    ;# set 0
    vmsummbm v6, v4, v8, v12    ;# taps times elements
    vmsummbm v0, v5, v11, v6

    ;# set 1
    vsldoi v10, v8, v9, 1
    vsldoi v11, v8, v9, 5

    vmsummbm v6, v4, v10, v12
    vmsummbm v1, v5, v11, v6

    ;# set 2
    vsldoi v10, v8, v9, 2
    vsldoi v11, v8, v9, 6

    vmsummbm v6, v4, v10, v12
    vmsummbm v2, v5, v11, v6

    ;# set 3
    vsldoi v10, v8, v9, 3
    vsldoi v11, v8, v9, 7

    vmsummbm v6, v4, v10, v12
    vmsummbm v3, v5, v11, v6

    vpkswus v0, v0, v1    ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v1, v2, v3    ;# v1 = 2 6 A E 3 7 B F

    vsrh v0, v0, v13    ;# divide v0, v1 by 128
    vsrh v1, v1, v13

    vpkuhus v0, v0, v1    ;# v0 = scrambled 8-bit result
    vperm v0, v0, v0, v14    ;# v0 = correctly-ordered result

    stvx v0, 0, r9
    add r9, r9, r5

    add r3, r3, r4

    bdnz horizontal_loop_16x16

    ;# check again to see if the vertical filter needs to be done.
    cmpi cr0, r6, 0
    beq cr0, end_16x16

    ;# yes there is, so go to the second pass
    b second_pass_16x16

copy_horizontal_16x21:
    li r10, 21
    mtctr r10

    li r10, 16

    sub r3, r3, r4
    sub r3, r3, r4

    ;# this is done above if there is a horizontal filter,
    ;# if not, it needs to be done down here.
    slwi r6, r6, 4    ;# index into vertical filter array

    ;# always write to the stack when doing a horizontal copy
    la r9, 48(r1)

copy_horizontal_loop_16x21:
    lvsl v15, 0, r3    ;# permute value for alignment

    lvx v1, 0, r3
    lvx v2, r10, r3

    vperm v8, v1, v2, v15

    stvx v8, 0, r9
    addi r9, r9, 16

    add r3, r3, r4

    bdnz copy_horizontal_loop_16x21

second_pass_16x16:

    ;# always read from the stack when doing a vertical filter
    la r9, 48(r1)

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v7, 7

    vpre_load

    luma_vsix
    luma_vsix
    luma_vfour

end_16x16:

    addi r1, r1, 416    ;# recover stack

    mtspr 256, r11    ;# reset old VRSAVE

    blr

    .data

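;# HFilter holds one 32-byte entry per x_offset: the first 16 bytes repeat
;# taps 0-3 four times and the next 16 bytes repeat taps 4-5, matching the two
;# vmsummbm passes in Interp4 (negative taps are stored negative because
;# vmsummbm treats the filter operand as signed).
;# VFilter holds one 16-byte entry per y_offset with the six taps in bytes
;# 0-5, magnitudes only, since the vertical path subtracts the negative-tap
;# contributions explicitly in vinterp_no_store.
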
    .align 4
HFilter:
    .byte 0, 0, 128, 0, 0, 0, 128, 0, 0, 0, 128, 0, 0, 0, 128, 0
    .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 0, -6, 123, 12, 0, -6, 123, 12, 0, -6, 123, 12, 0, -6, 123, 12
    .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
    .byte 2, -11, 108, 36, 2, -11, 108, 36, 2, -11, 108, 36, 2, -11, 108, 36
    .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
    .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
    .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
    .byte 3, -16, 77, 77, 3, -16, 77, 77, 3, -16, 77, 77, 3, -16, 77, 77
    .byte -16, 3, 0, 0, -16, 3, 0, 0, -16, 3, 0, 0, -16, 3, 0, 0
    .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
    .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
    .byte 1, -8, 36, 108, 1, -8, 36, 108, 1, -8, 36, 108, 1, -8, 36, 108
    .byte -11, 2, 0, 0, -11, 2, 0, 0, -11, 2, 0, 0, -11, 2, 0, 0
    .byte 0, -1, 12, 123, 0, -1, 12, 123, 0, -1, 12, 123, 0, -1, 12, 123
    .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0

    .align 4
VFilter:
    .byte 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 0, 6, 123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 2, 11, 108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 1, 8, 36, 108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 0, 1, 12, 123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

    .align 4
b_hperm:
    .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15

    .align 4
B_0123:
    .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6

    .align 4
B_4567:
    .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10

    .align 4
B_89AB:
    .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14

    .align 4
b_hilo:
    .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23

    .align 4
b_hilo_4x4:
    .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0