source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm - Issue 812033011: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 ;

2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.

3 ;

4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.

9 ;

10

; Make the entry point visible to the linker.
11 EXPORT \|vp9_iht8x8_64_add_neon\|

; Assemble what follows as ARM (not Thumb) instructions.
12 ARM

; Declare that this code requires, and preserves, 8-byte stack alignment.
13 REQUIRE8

14 PRESERVE8

15

; Read-only code section; ALIGN=2 requests 2^2 = 4-byte alignment.
16 AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2

17

18 ; Load the IADST constants into r0-r10 and r12 (r11 is not written).
; Each cosine constant is assembled in two steps: mov supplies the high
; byte and add supplies the low byte. IADST8X8_1D reads these registers.
; Clobbers: r0-r10, r12. The caller must save any live values beforehand.

19 MACRO

20 GENERATE_IADST_CONSTANTS

21 ; r0 = cospi_2_64 = 16305 (0x3fb1)

22 mov r0, #0x3f00

23 add r0, #0xb1

24

25 ; r1 = cospi_30_64 = 1606 (0x0646)

26 mov r1, #0x600

27 add r1, #0x46

28

29 ; r2 = cospi_10_64 = 14449 (0x3871)

30 mov r2, #0x3800

31 add r2, #0x71

32

33 ; r3 = cospi_22_64 = 7723 (0x1e2b)

34 mov r3, #0x1e00

35 add r3, #0x2b

36

37 ; r4 = cospi_18_64 = 10394 (0x289a)

38 mov r4, #0x2800

39 add r4, #0x9a

40

41 ; r5 = cospi_14_64 = 12665 (0x3179)

42 mov r5, #0x3100

43 add r5, #0x79

44

45 ; r6 = cospi_26_64 = 4756 (0x1294)

46 mov r6, #0x1200

47 add r6, #0x94

48

49 ; r7 = cospi_6_64 = 15679 (0x3d3f)

50 mov r7, #0x3d00

51 add r7, #0x3f

52

53 ; r8 = cospi_8_64 = 15137 (0x3b21)

54 mov r8, #0x3b00

55 add r8, #0x21

56

57 ; r9 = cospi_24_64 = 6270 (0x187e)

58 mov r9, #0x1800

59 add r9, #0x7e

60

61 ; r10 = 0 (used by IADST8X8_1D to negate outputs via 0 - x)

62 mov r10, #0

63

64 ; r12 = cospi_16_64 = 11585 (0x2d41)

65 mov r12, #0x2d00

66 add r12, #0x41

67 MEND

68

69 ; Load the seven IDCT constants into r3-r9 for IDCT8x8_1D.
; Each constant is assembled in two steps: mov supplies the high byte and
; add supplies the low byte.
; Clobbers: r3-r9 only; r0-r2 are left untouched by this macro.

70 MACRO

71 GENERATE_IDCT_CONSTANTS

72 ; r3 = cospi_28_64 = 3196 (0x0c7c)

73 mov r3, #0x0c00

74 add r3, #0x7c

75

76 ; r4 = cospi_4_64 = 16069 (0x3ec5)

77 mov r4, #0x3e00

78 add r4, #0xc5

79

80 ; r5 = cospi_12_64 = 13623 (0x3537)

81 mov r5, #0x3500

82 add r5, #0x37

83

84 ; r6 = cospi_20_64 = 9102 (0x238e)

85 mov r6, #0x2300

86 add r6, #0x8e

87

88 ; r7 = cospi_16_64 = 11585 (0x2d41)

89 mov r7, #0x2d00

90 add r7, #0x41

91

92 ; r8 = cospi_24_64 = 6270 (0x187e)

93 mov r8, #0x1800

94 add r8, #0x7e

95

96 ; r9 = cospi_8_64 = 15137 (0x3b21)

97 mov r9, #0x3b00

98 add r9, #0x21

99 MEND

100

101 ; Transpose an 8x8 matrix of 16-bit values in place. The rows are held in
; q8-q15 (one row per q register) and the result is left in q8-q15.
; Only q8-q15 are touched; no scratch registers are used.

102 MACRO

103 TRANSPOSE8X8

; Step 1: exchange the off-diagonal 4x4 quadrants. Each vswp swaps the
; high half of a row in q8-q11 with the low half of the row four below it.
104 vswp d17, d24

105 vswp d23, d30

106 vswp d21, d28

107 vswp d19, d26

; Step 2: transpose at 32-bit granularity between rows two apart.
108 vtrn.32 q8, q10

109 vtrn.32 q9, q11

110 vtrn.32 q12, q14

111 vtrn.32 q13, q15

; Step 3: transpose at 16-bit granularity between adjacent rows.
112 vtrn.16 q8, q9

113 vtrn.16 q10, q11

114 vtrn.16 q12, q13

115 vtrn.16 q14, q15

116 MEND

117

118 ; Parallel 1D IDCT on all the columns of an 8x8 matrix of 16-bit values

119 ; held in q8-q15 (q8 = input[0] ... q15 = input[7]). The constants are read

120 ; from r3-r9: r3 = cospi_28_64, r4 = cospi_4_64, r5 = cospi_12_64,

121 ; r6 = cospi_20_64, r7 = cospi_16_64, r8 = cospi_24_64, r9 = cospi_8_64.
; The output is written back to q8-q15. q0-q7 are used as scratch and are
; overwritten. Each product is widened to 32 bits (vmull/vmlal/vmlsl) and
; then narrowed with a rounding, saturating shift right by 14 (vqrshrn),
; which is what the comments call dct_const_round_shift.

122 MACRO

123 IDCT8x8_1D

124 ; stage 1

125 vdup.16 d0, r3 ; duplicate cospi_28_64

126 vdup.16 d1, r4 ; duplicate cospi_4_64

127 vdup.16 d2, r5 ; duplicate cospi_12_64

128 vdup.16 d3, r6 ; duplicate cospi_20_64

129

130 ; input[1] * cospi_28_64

131 vmull.s16 q2, d18, d0

132 vmull.s16 q3, d19, d0

133

134 ; input[5] * cospi_12_64

135 vmull.s16 q5, d26, d2

136 vmull.s16 q6, d27, d2

137

138 ; input[1] * cospi_28_64 - input[7] * cospi_4_64

139 vmlsl.s16 q2, d30, d1

140 vmlsl.s16 q3, d31, d1

141

142 ; input[5] * cospi_12_64 - input[3] * cospi_20_64

143 vmlsl.s16 q5, d22, d3

144 vmlsl.s16 q6, d23, d3

145

146 ; step1[4] (q4) = dct_const_round_shift(input[1] * cospi_28_64 - input[7] * cospi_4_64)

147 vqrshrn.s32 d8, q2, #14 ; >> 14

148 vqrshrn.s32 d9, q3, #14 ; >> 14

149

150 ; step1[5] (q5) = dct_const_round_shift(input[5] * cospi_12_64 - input[3] * cospi_20_64)

151 vqrshrn.s32 d10, q5, #14 ; >> 14

152 vqrshrn.s32 d11, q6, #14 ; >> 14

153

154 ; input[1] * cospi_4_64

155 vmull.s16 q2, d18, d1

156 vmull.s16 q3, d19, d1

157

158 ; input[5] * cospi_20_64 (q9 and q13 are overwritten here; this is their last read)

159 vmull.s16 q9, d26, d3

160 vmull.s16 q13, d27, d3

161

162 ; input[1] * cospi_4_64 + input[7] * cospi_28_64

163 vmlal.s16 q2, d30, d0

164 vmlal.s16 q3, d31, d0

165

166 ; input[5] * cospi_20_64 + input[3] * cospi_12_64

167 vmlal.s16 q9, d22, d2

168 vmlal.s16 q13, d23, d2

169

170 ; step1[7] (q7) = dct_const_round_shift(input[1] * cospi_4_64 + input[7] * cospi_28_64)

171 vqrshrn.s32 d14, q2, #14 ; >> 14

172 vqrshrn.s32 d15, q3, #14 ; >> 14

173

174 ; stage 2 & stage 3 - even half

175 vdup.16 d0, r7 ; duplicate cospi_16_64

176

177 ; step1[6] (q6) = dct_const_round_shift(input[5] * cospi_20_64 + input[3] * cospi_12_64)

178 vqrshrn.s32 d12, q9, #14 ; >> 14

179 vqrshrn.s32 d13, q13, #14 ; >> 14

180

181 ; row 0 (q8) * cospi_16_64, accumulator for the sum

182 vmull.s16 q2, d16, d0

183 vmull.s16 q3, d17, d0

184

185 ; row 0 (q8) * cospi_16_64, accumulator for the difference

186 vmull.s16 q13, d16, d0

187 vmull.s16 q15, d17, d0

188

189 ; (row 0 + row 4) * cospi_16_64, row 4 being q12

190 vmlal.s16 q2, d24, d0

191 vmlal.s16 q3, d25, d0

192

193 ; (row 0 - row 4) * cospi_16_64

194 vmlsl.s16 q13, d24, d0

195 vmlsl.s16 q15, d25, d0

196

197 vdup.16 d0, r8 ; duplicate cospi_24_64

198 vdup.16 d1, r9 ; duplicate cospi_8_64

199

200 ; step[0] (q9) = dct_const_round_shift((row 0 + row 4) * cospi_16_64)

201 vqrshrn.s32 d18, q2, #14 ; >> 14

202 vqrshrn.s32 d19, q3, #14 ; >> 14

203

204 ; step[1] (q11) = dct_const_round_shift((row 0 - row 4) * cospi_16_64)

205 vqrshrn.s32 d22, q13, #14 ; >> 14

206 vqrshrn.s32 d23, q15, #14 ; >> 14

207

208 ; row 2 (q10) * cospi_24_64

209 vmull.s16 q2, d20, d0

210 vmull.s16 q3, d21, d0

211

212 ; row 2 (q10) * cospi_8_64

213 vmull.s16 q8, d20, d1

214 vmull.s16 q12, d21, d1

215

216 ; row 2 * cospi_24_64 - row 6 * cospi_8_64, row 6 being q14

217 vmlsl.s16 q2, d28, d1

218 vmlsl.s16 q3, d29, d1

219

220 ; row 2 * cospi_8_64 + row 6 * cospi_24_64

221 vmlal.s16 q8, d28, d0

222 vmlal.s16 q12, d29, d0

223

224 ; step[2] (q13) = dct_const_round_shift(row 2 * cospi_24_64 - row 6 * cospi_8_64)

225 vqrshrn.s32 d26, q2, #14 ; >> 14

226 vqrshrn.s32 d27, q3, #14 ; >> 14

227

228 ; step[3] (q15) = dct_const_round_shift(row 2 * cospi_8_64 + row 6 * cospi_24_64)

229 vqrshrn.s32 d30, q8, #14 ; >> 14

230 vqrshrn.s32 d31, q12, #14 ; >> 14

231

232 vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]

233 vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]

234 vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]

235 vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]

236

237 ; stage 3 - odd half (d16 is free to hold the constant; q8 is not read again before being rewritten in stage 4)

238 vdup.16 d16, r7 ; duplicate cospi_16_64

239

240 ; stage 2 - odd half

241 vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]

242 vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]

243 vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]

244 vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]

245

246 ; step2[6] * cospi_16_64, accumulator for the difference

247 vmull.s16 q9, d28, d16

248 vmull.s16 q10, d29, d16

249

250 ; step2[6] * cospi_16_64, accumulator for the sum

251 vmull.s16 q11, d28, d16

252 vmull.s16 q12, d29, d16

253

254 ; (step2[6] - step2[5]) * cospi_16_64

255 vmlsl.s16 q9, d26, d16

256 vmlsl.s16 q10, d27, d16

257

258 ; (step2[5] + step2[6]) * cospi_16_64

259 vmlal.s16 q11, d26, d16

260 vmlal.s16 q12, d27, d16

261

262 ; step1[5] (q5) = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)

263 vqrshrn.s32 d10, q9, #14 ; >> 14

264 vqrshrn.s32 d11, q10, #14 ; >> 14

265

266 ; step1[6] (q6) = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)

267 vqrshrn.s32 d12, q11, #14 ; >> 14

268 vqrshrn.s32 d13, q12, #14 ; >> 14

269

270 ; stage 4: butterfly of the even half (q0-q3) with the odd half (q4-q7)

271 vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];

272 vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];

273 vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];

274 vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];

275 vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];

276 vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];

277 vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];

278 vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];

279 MEND

280

281 ; Parallel 1D IADST on all the columns of an 8x8 matrix of 16-bit values

282 ; which are held in q8-q15. The constants are read from r0-r10 and r12 as

283 ; set up by GENERATE_IADST_CONSTANTS. The output is written back to q8-q15.

284 ; q0-q7 are used as scratch and are overwritten.
; Input mapping used by the comments below: x0 = q15, x1 = q8, x2 = q13,
; x3 = q10, x4 = q11, x5 = q12, x6 = q9, x7 = q14.
; Output mapping: q8 = x0, q9 = -x4, q10 = x6, q11 = -x2, q12 = x3,
; q13 = -x7, q14 = x5, q15 = -x1.
; Input registers are reused for constants and temporaries as soon as their
; last read has been issued, so the instruction order here is significant.

285 MACRO

286 IADST8X8_1D

287 vdup.16 d14, r0 ; duplicate cospi_2_64

288 vdup.16 d15, r1 ; duplicate cospi_30_64

289

290 ; cospi_2_64 * x0

291 vmull.s16 q1, d30, d14

292 vmull.s16 q2, d31, d14

293

294 ; cospi_30_64 * x0

295 vmull.s16 q3, d30, d15

296 vmull.s16 q4, d31, d15

297

; x0 (q15) has been fully consumed, so d30/d31 are reused for constants.
298 vdup.16 d30, r4 ; duplicate cospi_18_64

299 vdup.16 d31, r5 ; duplicate cospi_14_64

300

301 ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;

302 vmlal.s16 q1, d16, d15

303 vmlal.s16 q2, d17, d15

304

305 ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1

306 vmlsl.s16 q3, d16, d14

307 vmlsl.s16 q4, d17, d14

308

309 ; cospi_18_64 * x4

310 vmull.s16 q5, d22, d30

311 vmull.s16 q6, d23, d30

312

313 ; cospi_14_64 * x4 (q8 is overwritten; x1 was last read just above)

314 vmull.s16 q7, d22, d31

315 vmull.s16 q8, d23, d31

316

317 ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

318 vmlal.s16 q5, d24, d31

319 vmlal.s16 q6, d25, d31

320

321 ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5

322 vmlsl.s16 q7, d24, d30

323 vmlsl.s16 q8, d25, d30

324

325 ; (s0 + s4)

326 vadd.s32 q11, q1, q5

327 vadd.s32 q12, q2, q6

328

329 vdup.16 d0, r2 ; duplicate cospi_10_64

330 vdup.16 d1, r3 ; duplicate cospi_22_64

331

332 ; (s0 - s4)

333 vsub.s32 q1, q1, q5

334 vsub.s32 q2, q2, q6

335

336 ; x0 = dct_const_round_shift(s0 + s4);

337 vqrshrn.s32 d22, q11, #14 ; >> 14

338 vqrshrn.s32 d23, q12, #14 ; >> 14

339

340 ; (s1 + s5)

341 vadd.s32 q12, q3, q7

342 vadd.s32 q15, q4, q8

343

344 ; (s1 - s5)

345 vsub.s32 q3, q3, q7

346 vsub.s32 q4, q4, q8

347

348 ; x4 = dct_const_round_shift(s0 - s4);

349 vqrshrn.s32 d2, q1, #14 ; >> 14

350 vqrshrn.s32 d3, q2, #14 ; >> 14

351

352 ; x1 = dct_const_round_shift(s1 + s5);

353 vqrshrn.s32 d24, q12, #14 ; >> 14

354 vqrshrn.s32 d25, q15, #14 ; >> 14

355

356 ; x5 = dct_const_round_shift(s1 - s5);

357 vqrshrn.s32 d6, q3, #14 ; >> 14

358 vqrshrn.s32 d7, q4, #14 ; >> 14

359

360 ; cospi_10_64 * x2

361 vmull.s16 q4, d26, d0

362 vmull.s16 q5, d27, d0

363

364 ; cospi_22_64 * x2

365 vmull.s16 q2, d26, d1

366 vmull.s16 q6, d27, d1

367

368 vdup.16 d30, r6 ; duplicate cospi_26_64

369 vdup.16 d31, r7 ; duplicate cospi_6_64

370

371 ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

372 vmlal.s16 q4, d20, d1

373 vmlal.s16 q5, d21, d1

374

375 ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

376 vmlsl.s16 q2, d20, d0

377 vmlsl.s16 q6, d21, d0

378

379 ; cospi_26_64 * x6

380 vmull.s16 q0, d18, d30

381 vmull.s16 q13, d19, d30

382

383 ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;

384 vmlal.s16 q0, d28, d31

385 vmlal.s16 q13, d29, d31

386

387 ; cospi_6_64 * x6 (the second multiply overwrites q9 while reading d19, its own high half)

388 vmull.s16 q10, d18, d31

389 vmull.s16 q9, d19, d31

390

391 ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

392 vmlsl.s16 q10, d28, d30

393 vmlsl.s16 q9, d29, d30

394

395 ; (s3 + s7)

396 vadd.s32 q14, q2, q10

397 vadd.s32 q15, q6, q9

398

399 ; (s3 - s7)

400 vsub.s32 q2, q2, q10

401 vsub.s32 q6, q6, q9

402

403 ; x3 = dct_const_round_shift(s3 + s7);

404 vqrshrn.s32 d28, q14, #14 ; >> 14

405 vqrshrn.s32 d29, q15, #14 ; >> 14

406

407 ; x7 = dct_const_round_shift(s3 - s7);

408 vqrshrn.s32 d4, q2, #14 ; >> 14

409 vqrshrn.s32 d5, q6, #14 ; >> 14

410

411 ; (s2 + s6)

412 vadd.s32 q9, q4, q0

413 vadd.s32 q10, q5, q13

414

415 ; (s2 - s6)

416 vsub.s32 q4, q4, q0

417 vsub.s32 q5, q5, q13

418

419 vdup.16 d30, r8 ; duplicate cospi_8_64

420 vdup.16 d31, r9 ; duplicate cospi_24_64

421

422 ; x2 = dct_const_round_shift(s2 + s6);

423 vqrshrn.s32 d18, q9, #14 ; >> 14

424 vqrshrn.s32 d19, q10, #14 ; >> 14

425

426 ; x6 = dct_const_round_shift(s2 - s6);

427 vqrshrn.s32 d8, q4, #14 ; >> 14

428 vqrshrn.s32 d9, q5, #14 ; >> 14

429

430 ; cospi_8_64 * x4

431 vmull.s16 q5, d2, d30

432 vmull.s16 q6, d3, d30

433

434 ; cospi_24_64 * x4

435 vmull.s16 q7, d2, d31

436 vmull.s16 q0, d3, d31

437

438 ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;

439 vmlal.s16 q5, d6, d31

440 vmlal.s16 q6, d7, d31

441

442 ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;

443 vmlsl.s16 q7, d6, d30

444 vmlsl.s16 q0, d7, d30

445

446 ; cospi_8_64 * x7

447 vmull.s16 q1, d4, d30

448 vmull.s16 q3, d5, d30

449

450 ; cospi_24_64 * x7 (the second multiply overwrites q2 while reading d5, its own high half)

451 vmull.s16 q10, d4, d31

452 vmull.s16 q2, d5, d31

453

454 ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;

455 vmlsl.s16 q1, d8, d31

456 vmlsl.s16 q3, d9, d31

457

458 ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

459 vmlal.s16 q10, d8, d30

460 vmlal.s16 q2, d9, d30

461

462 vadd.s16 q8, q11, q9 ; x0 = s0 + s2;

463

464 vsub.s16 q11, q11, q9 ; x2 = s0 - s2;

465

466 vadd.s16 q4, q12, q14 ; x1 = s1 + s3;

467

468 vsub.s16 q12, q12, q14 ; x3 = s1 - s3;

469

470 ; (s4 + s6)

471 vadd.s32 q14, q5, q1

472 vadd.s32 q15, q6, q3

473

474 ; (s4 - s6)

475 vsub.s32 q5, q5, q1

476 vsub.s32 q6, q6, q3

477

478 ; x4 = dct_const_round_shift(s4 + s6);

479 vqrshrn.s32 d18, q14, #14 ; >> 14

480 vqrshrn.s32 d19, q15, #14 ; >> 14

481

482 ; x6 = dct_const_round_shift(s4 - s6);

483 vqrshrn.s32 d10, q5, #14 ; >> 14

484 vqrshrn.s32 d11, q6, #14 ; >> 14

485

486 ; (s5 + s7)

487 vadd.s32 q1, q7, q10

488 vadd.s32 q3, q0, q2

489

490 ; (s5 - s7)

491 vsub.s32 q7, q7, q10

492 vsub.s32 q0, q0, q2

493

494 ; x5 = dct_const_round_shift(s5 + s7);

495 vqrshrn.s32 d28, q1, #14 ; >> 14

496 vqrshrn.s32 d29, q3, #14 ; >> 14

497

498 ; x7 = dct_const_round_shift(s5 - s7);

499 vqrshrn.s32 d14, q7, #14 ; >> 14

500 vqrshrn.s32 d15, q0, #14 ; >> 14

501

502 vdup.16 d30, r12 ; duplicate cospi_16_64

503

504 ; cospi_16_64 * x2, accumulator for the sum

505 vmull.s16 q2, d22, d30

506 vmull.s16 q3, d23, d30

507

508 ; cospi_16_64 * x2, accumulator for the difference

509 vmull.s16 q13, d22, d30

510 vmull.s16 q1, d23, d30

511

512 ; cospi_16_64 * x2 + cospi_16_64 * x3;

513 vmlal.s16 q2, d24, d30

514 vmlal.s16 q3, d25, d30

515

516 ; cospi_16_64 * x2 - cospi_16_64 * x3;

517 vmlsl.s16 q13, d24, d30

518 vmlsl.s16 q1, d25, d30

519

520 ; x2 = dct_const_round_shift(s2);

521 vqrshrn.s32 d4, q2, #14 ; >> 14

522 vqrshrn.s32 d5, q3, #14 ; >> 14

523

524 ; x3 = dct_const_round_shift(s3);

525 vqrshrn.s32 d24, q13, #14 ; >> 14

526 vqrshrn.s32 d25, q1, #14 ; >> 14

527

528 ; cospi_16_64 * x6, accumulator for the sum

529 vmull.s16 q13, d10, d30

530 vmull.s16 q1, d11, d30

531

532 ; cospi_16_64 * x6, accumulator for the difference

533 vmull.s16 q11, d10, d30

534 vmull.s16 q0, d11, d30

535

536 ; cospi_16_64 * x6 + cospi_16_64 * x7;

537 vmlal.s16 q13, d14, d30

538 vmlal.s16 q1, d15, d30

539

540 ; cospi_16_64 * x6 - cospi_16_64 * x7;

541 vmlsl.s16 q11, d14, d30

542 vmlsl.s16 q0, d15, d30

543

544 ; x6 = dct_const_round_shift(s6);

545 vqrshrn.s32 d20, q13, #14 ; >> 14

546 vqrshrn.s32 d21, q1, #14 ; >> 14

547

548 ; x7 = dct_const_round_shift(s7);

549 vqrshrn.s32 d12, q11, #14 ; >> 14

550 vqrshrn.s32 d13, q0, #14 ; >> 14

551

552 vdup.16 q5, r10 ; duplicate 0

553

; Negate the odd outputs by subtracting from zero. q8, q10, q12 and q14
; already hold output[0], output[2], output[4] and output[6].
554 vsub.s16 q9, q5, q9 ; output[1] = -x4;

555 vsub.s16 q11, q5, q2 ; output[3] = -x2;

556 vsub.s16 q13, q5, q6 ; output[5] = -x7;

557 vsub.s16 q15, q5, q4 ; output[7] = -x1;

558 MEND

559

560

561 AREA Block, CODE, READONLY ; name this block of code

562 ;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,

563 ; int dest_stride, int tx_type)

564 ;

565 ; r0 int16_t *input

566 ; r1 uint8_t *dest

567 ; r2 int dest_stride

568 ; r3 int tx_type

569 ; This function will only handle tx_type of 1,2,3.
; Dispatch: tx_type 2 -> idct_iadst, tx_type 3 -> iadst_iadst, any other
; value falls through to iadst_idct. tx_type 0 is therefore not rejected
; here; it would take the iadst_idct path.
; Flow: load 64 coefficients, run two 1D passes with a transpose before
; each, round by 5 bits, add to the 8x8 destination block, clip to 8 bits
; and store.

570 \|vp9_iht8x8_64_add_neon\| PROC

571

572 ; load the 8x8 input into q8-q15 (d16-d31), one row per q register

573 vld1.s16 {q8,q9}, [r0]!

574 vld1.s16 {q10,q11}, [r0]!

575 vld1.s16 {q12,q13}, [r0]!

576 vld1.s16 {q14,q15}, [r0]!

577

; Save r0-r10 and d8-d15. The constant macros overwrite r0-r12, and dest
; (r1) and dest_stride (r2) are still needed after the transforms. Both 1D
; macros use q4-q7 (d8-d15) as scratch.
; NOTE(review): push {r0-r10} stores 11 words, so sp is only 4-byte aligned
; until the matching pop. No calls are made in between. Confirm this is
; acceptable alongside the PRESERVE8 declaration.
578 push {r0-r10}

579 vpush {d8-d15}

580

581 ; transpose the input data

582 TRANSPOSE8X8

583

584 ; decide the type of transform (r3 must be tested before any constant macro overwrites it)

585 cmp r3, #2

586 beq idct_iadst

587 cmp r3, #3

588 beq iadst_iadst

589

; Fall-through path: IDCT pass first, then IADST pass.
590 iadst_idct

591 ; generate IDCT constants

592 GENERATE_IDCT_CONSTANTS

593

594 ; first pass: 1D IDCT

595 IDCT8x8_1D

596

597 ; transpose the matrix

598 TRANSPOSE8X8

599

600 ; generate IADST constants

601 GENERATE_IADST_CONSTANTS

602

603 ; second pass: 1D IADST

604 IADST8X8_1D

605

606 b end_vp9_iht8x8_64_add_neon

607

; tx_type == 2: IADST pass first, then IDCT pass.
608 idct_iadst

609 ; generate IADST constants

610 GENERATE_IADST_CONSTANTS

611

612 ; first pass: 1D IADST

613 IADST8X8_1D

614

615 ; transpose the matrix

616 TRANSPOSE8X8

617

618 ; generate IDCT constants

619 GENERATE_IDCT_CONSTANTS

620

621 ; second pass: 1D IDCT

622 IDCT8x8_1D

623

624 b end_vp9_iht8x8_64_add_neon

625

; tx_type == 3: IADST in both passes.
626 iadst_iadst

627 ; generate IADST constants (IADST8X8_1D only reads them, so they stay valid for the second pass)

628 GENERATE_IADST_CONSTANTS

629

630 ; first pass: 1D IADST

631 IADST8X8_1D

632

633 ; transpose the matrix

634 TRANSPOSE8X8

635

636 ; second pass: 1D IADST

637 IADST8X8_1D

638

639 end_vp9_iht8x8_64_add_neon

; Restore the saved registers; r1 = dest and r2 = dest_stride are valid again.
640 vpop {d8-d15}

641 pop {r0-r10}

642

643 ; ROUND_POWER_OF_TWO(temp_out[j], 5)

644 vrshr.s16 q8, q8, #5

645 vrshr.s16 q9, q9, #5

646 vrshr.s16 q10, q10, #5

647 vrshr.s16 q11, q11, #5

648 vrshr.s16 q12, q12, #5

649 vrshr.s16 q13, q13, #5

650 vrshr.s16 q14, q14, #5

651 vrshr.s16 q15, q15, #5

652

653 ; save dest pointer (r1 is advanced by the loads below; r0 is used for the stores)

654 mov r0, r1

655

656 ; load destination data: eight rows of 8 bytes, stepping by dest_stride

657 vld1.64 {d0}, [r1], r2

658 vld1.64 {d1}, [r1], r2

659 vld1.64 {d2}, [r1], r2

660 vld1.64 {d3}, [r1], r2

661 vld1.64 {d4}, [r1], r2

662 vld1.64 {d5}, [r1], r2

663 vld1.64 {d6}, [r1], r2

664 vld1.64 {d7}, [r1]

665

666 ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]

667 vaddw.u8 q8, q8, d0

668 vaddw.u8 q9, q9, d1

669 vaddw.u8 q10, q10, d2

670 vaddw.u8 q11, q11, d3

671 vaddw.u8 q12, q12, d4

672 vaddw.u8 q13, q13, d5

673 vaddw.u8 q14, q14, d6

674 vaddw.u8 q15, q15, d7

675

676 ; clip_pixel: saturating narrow from signed 16-bit to unsigned 8-bit

677 vqmovun.s16 d0, q8

678 vqmovun.s16 d1, q9

679 vqmovun.s16 d2, q10

680 vqmovun.s16 d3, q11

681 vqmovun.s16 d4, q12

682 vqmovun.s16 d5, q13

683 vqmovun.s16 d6, q14

684 vqmovun.s16 d7, q15

685

686 ; store the data back to the same eight destination rows

687 vst1.64 {d0}, [r0], r2

688 vst1.64 {d1}, [r0], r2

689 vst1.64 {d2}, [r0], r2

690 vst1.64 {d3}, [r0], r2

691 vst1.64 {d4}, [r0], r2

692 vst1.64 {d5}, [r0], r2

693 vst1.64 {d6}, [r0], r2

694 vst1.64 {d7}, [r0], r2

695 bx lr

696 ENDP ; \|vp9_iht8x8_64_add_neon\|

697

698 END

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c ('k') | source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c » ('j') | no next file with comments »