source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c - Issue 1302353004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <assert.h>	11 #include <assert.h>

12 #include <stdio.h>	12 #include <stdio.h>

13	13

14 #include "./vpx_config.h"	14 #include "./vpx_config.h"

15 #include "./vp9_rtcd.h"	15 #include "./vp9_rtcd.h"

16 #include "vp9/common/vp9_common.h"	16 #include "vp9/common/vp9_common.h"

17 #include "vp9/common/vp9_blockd.h"	17 #include "vp9/common/vp9_blockd.h"

18 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"	18 #include "vpx_dsp/mips/inv_txfm_dspr2.h"

19 #include "vpx_dsp/txfm_common.h"	19 #include "vpx_dsp/txfm_common.h"

20 #include "vpx_ports/mem.h"	20 #include "vpx_ports/mem.h"

21	21

22 #if HAVE_DSPR2	22 #if HAVE_DSPR2

23 static void idct8_rows_dspr2(const int16_t input, int16_t output,

24 uint32_t no_rows) {

25 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;

26 const int const_2_power_13 = 8192;

27 int Temp0, Temp1, Temp2, Temp3, Temp4;

28 int i;

29

30 for (i = no_rows; i--; ) {

31 __asm__ __volatile__ (

32 /*

33 temp_1 = (input[0] + input[4]) * cospi_16_64;

34 step2_0 = dct_const_round_shift(temp_1);

35

36 temp_2 = (input[0] - input[4]) * cospi_16_64;

37 step2_1 = dct_const_round_shift(temp_2);

38 */

39 "lh %[Temp0], 0(%[input]) \n\t"

40 "lh %[Temp1], 8(%[input]) \n\t"

41 "mtlo %[const_2_power_13], $ac0 \n\t"

42 "mthi $zero, $ac0 \n\t"

43 "mtlo %[const_2_power_13], $ac1 \n\t"

44 "mthi $zero, $ac1 \n\t"

45 "add %[Temp2], %[Temp0], %[Temp1] \n\t"

46 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"

47 "extp %[Temp4], $ac0, 31 \n\t"

48

49 "sub %[Temp3], %[Temp0], %[Temp1] \n\t"

50 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"

51 "mtlo %[const_2_power_13], $ac0 \n\t"

52 "mthi $zero, $ac0 \n\t"

53 "extp %[Temp2], $ac1, 31 \n\t"

54

55 /*

56 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;

57 step2_2 = dct_const_round_shift(temp_1);

58 */

59 "lh %[Temp0], 4(%[input]) \n\t"

60 "lh %[Temp1], 12(%[input]) \n\t"

61 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"

62 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"

63 "mtlo %[const_2_power_13], $ac1 \n\t"

64 "mthi $zero, $ac1 \n\t"

65 "extp %[Temp3], $ac0, 31 \n\t"

66

67 /*

68 step1_1 = step2_1 + step2_2;

69 step1_2 = step2_1 - step2_2;

70 */

71 "add %[step1_1], %[Temp2], %[Temp3] \n\t"

72 "sub %[step1_2], %[Temp2], %[Temp3] \n\t"

73

74 /*

75 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;

76 step2_3 = dct_const_round_shift(temp_2);

77 */

78 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"

79 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"

80 "extp %[Temp1], $ac1, 31 \n\t"

81

82 "mtlo %[const_2_power_13], $ac0 \n\t"

83 "mthi $zero, $ac0 \n\t"

84

85 /*

86 step1_0 = step2_0 + step2_3;

87 step1_3 = step2_0 - step2_3;

88 */

89 "add %[step1_0], %[Temp4], %[Temp1] \n\t"

90 "sub %[step1_3], %[Temp4], %[Temp1] \n\t"

91

92 /*

93 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;

94 step1_4 = dct_const_round_shift(temp_1);

95 */

96 "lh %[Temp0], 2(%[input]) \n\t"

97 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"

98 "mtlo %[const_2_power_13], $ac1 \n\t"

99 "mthi $zero, $ac1 \n\t"

100 "lh %[Temp1], 14(%[input]) \n\t"

101 "lh %[Temp0], 2(%[input]) \n\t"

102 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"

103 "extp %[step1_4], $ac0, 31 \n\t"

104

105 /*

106 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;

107 step1_7 = dct_const_round_shift(temp_2);

108 */

109 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"

110 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"

111 "extp %[step1_7], $ac1, 31 \n\t"

112

113 /*

114 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;

115 step1_5 = dct_const_round_shift(temp_1);

116 */

117 "mtlo %[const_2_power_13], $ac0 \n\t"

118 "mthi $zero, $ac0 \n\t"

119 "lh %[Temp0], 10(%[input]) \n\t"

120 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"

121 "lh %[Temp1], 6(%[input]) \n\t"

122 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"

123 "extp %[step1_5], $ac0, 31 \n\t"

124

125 /*

126 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;

127 step1_6 = dct_const_round_shift(temp_2);

128 */

129 "mtlo %[const_2_power_13], $ac1 \n\t"

130 "mthi $zero, $ac1 \n\t"

131 "lh %[Temp0], 10(%[input]) \n\t"

132 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"

133 "lh %[Temp1], 6(%[input]) \n\t"

134 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"

135 "extp %[step1_6], $ac1, 31 \n\t"

136

137 /*

138 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;

139 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;

140 */

141 "sub %[Temp0], %[step1_7], %[step1_6] \n\t"

142 "sub %[Temp0], %[Temp0], %[step1_4] \n\t"

143 "add %[Temp0], %[Temp0], %[step1_5] \n\t"

144 "sub %[Temp1], %[step1_4], %[step1_5] \n\t"

145 "sub %[Temp1], %[Temp1], %[step1_6] \n\t"

146 "add %[Temp1], %[Temp1], %[step1_7] \n\t"

147

148 "mtlo %[const_2_power_13], $ac0 \n\t"

149 "mthi $zero, $ac0 \n\t"

150 "mtlo %[const_2_power_13], $ac1 \n\t"

151 "mthi $zero, $ac1 \n\t"

152

153 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"

154 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"

155

156 /*

157 step1_4 = step1_4 + step1_5;

158 step1_7 = step1_6 + step1_7;

159 */

160 "add %[step1_4], %[step1_4], %[step1_5] \n\t"

161 "add %[step1_7], %[step1_7], %[step1_6] \n\t"

162

163 "extp %[step1_5], $ac0, 31 \n\t"

164 "extp %[step1_6], $ac1, 31 \n\t"

165

166 "add %[Temp0], %[step1_0], %[step1_7] \n\t"

167 "sh %[Temp0], 0(%[output]) \n\t"

168 "add %[Temp1], %[step1_1], %[step1_6] \n\t"

169 "sh %[Temp1], 16(%[output]) \n\t"

170 "add %[Temp0], %[step1_2], %[step1_5] \n\t"

171 "sh %[Temp0], 32(%[output]) \n\t"

172 "add %[Temp1], %[step1_3], %[step1_4] \n\t"

173 "sh %[Temp1], 48(%[output]) \n\t"

174

175 "sub %[Temp0], %[step1_3], %[step1_4] \n\t"

176 "sh %[Temp0], 64(%[output]) \n\t"

177 "sub %[Temp1], %[step1_2], %[step1_5] \n\t"

178 "sh %[Temp1], 80(%[output]) \n\t"

179 "sub %[Temp0], %[step1_1], %[step1_6] \n\t"

180 "sh %[Temp0], 96(%[output]) \n\t"

181 "sub %[Temp1], %[step1_0], %[step1_7] \n\t"

182 "sh %[Temp1], 112(%[output]) \n\t"

183

184 : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),

185 [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),

186 [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),

187 [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),

188 [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),

189 [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),

190 [Temp4] "=&r" (Temp4)

191 : [const_2_power_13] "r" (const_2_power_13),

192 [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),

193 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),

194 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),

195 [cospi_24_64] "r" (cospi_24_64),

196 [output] "r" (output), [input] "r" (input)

197 );

198

199 input += 8;

200 output += 1;

201 }

202 }

203

204 static void idct8_columns_add_blk_dspr2(int16_t input, uint8_t dest,

205 int dest_stride) {

206 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;

207 int Temp0, Temp1, Temp2, Temp3;

208 int i;

209 const int const_2_power_13 = 8192;

210 uint8_t *dest_pix;

211 uint8_t *cm = vpx_ff_cropTbl;

212

213 /* prefetch vpx_ff_cropTbl */

214 prefetch_load(vpx_ff_cropTbl);

215 prefetch_load(vpx_ff_cropTbl + 32);

216 prefetch_load(vpx_ff_cropTbl + 64);

217 prefetch_load(vpx_ff_cropTbl + 96);

218 prefetch_load(vpx_ff_cropTbl + 128);

219 prefetch_load(vpx_ff_cropTbl + 160);

220 prefetch_load(vpx_ff_cropTbl + 192);

221 prefetch_load(vpx_ff_cropTbl + 224);

222

223 for (i = 0; i < 8; ++i) {

224 dest_pix = (dest + i);

225

226 __asm__ __volatile__ (

227 /*

228 temp_1 = (input[0] + input[4]) * cospi_16_64;

229 step2_0 = dct_const_round_shift(temp_1);

230

231 temp_2 = (input[0] - input[4]) * cospi_16_64;

232 step2_1 = dct_const_round_shift(temp_2);

233 */

234 "lh %[Temp0], 0(%[input]) \n\t"

235 "lh %[Temp1], 8(%[input]) \n\t"

236 "mtlo %[const_2_power_13], $ac0 \n\t"

237 "mthi $zero, $ac0 \n\t"

238 "mtlo %[const_2_power_13], $ac1 \n\t"

239 "mthi $zero, $ac1 \n\t"

240 "add %[Temp2], %[Temp0], %[Temp1] \n\t"

241 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"

242 "extp %[step1_6], $ac0, 31 \n\t"

243

244 "sub %[Temp3], %[Temp0], %[Temp1] \n\t"

245 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"

246 "mtlo %[const_2_power_13], $ac0 \n\t"

247 "mthi $zero, $ac0 \n\t"

248 "extp %[Temp2], $ac1, 31 \n\t"

249

250 /*

251 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;

252 step2_2 = dct_const_round_shift(temp_1);

253 */

254 "lh %[Temp0], 4(%[input]) \n\t"

255 "lh %[Temp1], 12(%[input]) \n\t"

256 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"

257 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"

258 "mtlo %[const_2_power_13], $ac1 \n\t"

259 "mthi $zero, $ac1 \n\t"

260 "extp %[Temp3], $ac0, 31 \n\t"

261

262 /*

263 step1_1 = step2_1 + step2_2;

264 step1_2 = step2_1 - step2_2;

265 */

266 "add %[step1_1], %[Temp2], %[Temp3] \n\t"

267 "sub %[step1_2], %[Temp2], %[Temp3] \n\t"

268

269 /*

270 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;

271 step2_3 = dct_const_round_shift(temp_2);

272 */

273 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"

274 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"

275 "extp %[Temp1], $ac1, 31 \n\t"

276

277 "mtlo %[const_2_power_13], $ac0 \n\t"

278 "mthi $zero, $ac0 \n\t"

279

280 /*

281 step1_0 = step2_0 + step2_3;

282 step1_3 = step2_0 - step2_3;

283 */

284 "add %[step1_0], %[step1_6], %[Temp1] \n\t"

285 "sub %[step1_3], %[step1_6], %[Temp1] \n\t"

286

287 /*

288 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;

289 step1_4 = dct_const_round_shift(temp_1);

290 */

291 "lh %[Temp0], 2(%[input]) \n\t"

292 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"

293 "mtlo %[const_2_power_13], $ac1 \n\t"

294 "mthi $zero, $ac1 \n\t"

295 "lh %[Temp1], 14(%[input]) \n\t"

296 "lh %[Temp0], 2(%[input]) \n\t"

297 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"

298 "extp %[step1_4], $ac0, 31 \n\t"

299

300 /*

301 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;

302 step1_7 = dct_const_round_shift(temp_2);

303 */

304 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"

305 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"

306 "extp %[step1_7], $ac1, 31 \n\t"

307

308 /*

309 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;

310 step1_5 = dct_const_round_shift(temp_1);

311 */

312 "mtlo %[const_2_power_13], $ac0 \n\t"

313 "mthi $zero, $ac0 \n\t"

314 "lh %[Temp0], 10(%[input]) \n\t"

315 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"

316 "lh %[Temp1], 6(%[input]) \n\t"

317 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"

318 "extp %[step1_5], $ac0, 31 \n\t"

319

320 /*

321 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;

322 step1_6 = dct_const_round_shift(temp_2);

323 */

324 "mtlo %[const_2_power_13], $ac1 \n\t"

325 "mthi $zero, $ac1 \n\t"

326 "lh %[Temp0], 10(%[input]) \n\t"

327 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"

328 "lh %[Temp1], 6(%[input]) \n\t"

329 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"

330 "extp %[step1_6], $ac1, 31 \n\t"

331

332 /*

333 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;

334 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;

335 */

336 "sub %[Temp0], %[step1_7], %[step1_6] \n\t"

337 "sub %[Temp0], %[Temp0], %[step1_4] \n\t"

338 "add %[Temp0], %[Temp0], %[step1_5] \n\t"

339 "sub %[Temp1], %[step1_4], %[step1_5] \n\t"

340 "sub %[Temp1], %[Temp1], %[step1_6] \n\t"

341 "add %[Temp1], %[Temp1], %[step1_7] \n\t"

342

343 "mtlo %[const_2_power_13], $ac0 \n\t"

344 "mthi $zero, $ac0 \n\t"

345 "mtlo %[const_2_power_13], $ac1 \n\t"

346 "mthi $zero, $ac1 \n\t"

347

348 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"

349 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"

350

351 /*

352 step1_4 = step1_4 + step1_5;

353 step1_7 = step1_6 + step1_7;

354 */

355 "add %[step1_4], %[step1_4], %[step1_5] \n\t"

356 "add %[step1_7], %[step1_7], %[step1_6] \n\t"

357

358 "extp %[step1_5], $ac0, 31 \n\t"

359 "extp %[step1_6], $ac1, 31 \n\t"

360

361 /* add block */

362 "lbu %[Temp1], 0(%[dest_pix]) \n\t"

363 "add %[Temp0], %[step1_0], %[step1_7] \n\t"

364 "addi %[Temp0], %[Temp0], 16 \n\t"

365 "sra %[Temp0], %[Temp0], 5 \n\t"

366 "add %[Temp1], %[Temp1], %[Temp0] \n\t"

367 "add %[Temp0], %[step1_1], %[step1_6] \n\t"

368 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"

369 "sb %[Temp2], 0(%[dest_pix]) \n\t"

370 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

371

372 "lbu %[Temp1], 0(%[dest_pix]) \n\t"

373 "addi %[Temp0], %[Temp0], 16 \n\t"

374 "sra %[Temp0], %[Temp0], 5 \n\t"

375 "add %[Temp1], %[Temp1], %[Temp0] \n\t"

376 "add %[Temp0], %[step1_2], %[step1_5] \n\t"

377 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"

378 "sb %[Temp2], 0(%[dest_pix]) \n\t"

379 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

380

381 "lbu %[Temp1], 0(%[dest_pix]) \n\t"

382 "addi %[Temp0], %[Temp0], 16 \n\t"

383 "sra %[Temp0], %[Temp0], 5 \n\t"

384 "add %[Temp1], %[Temp1], %[Temp0] \n\t"

385 "add %[Temp0], %[step1_3], %[step1_4] \n\t"

386 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"

387 "sb %[Temp2], 0(%[dest_pix]) \n\t"

388 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

389

390 "lbu %[Temp1], 0(%[dest_pix]) \n\t"

391 "addi %[Temp0], %[Temp0], 16 \n\t"

392 "sra %[Temp0], %[Temp0], 5 \n\t"

393 "add %[Temp1], %[Temp1], %[Temp0] \n\t"

394 "sub %[Temp0], %[step1_3], %[step1_4] \n\t"

395 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"

396 "sb %[Temp2], 0(%[dest_pix]) \n\t"

397 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

398

399 "lbu %[Temp1], 0(%[dest_pix]) \n\t"

400 "addi %[Temp0], %[Temp0], 16 \n\t"

401 "sra %[Temp0], %[Temp0], 5 \n\t"

402 "add %[Temp1], %[Temp1], %[Temp0] \n\t"

403 "sub %[Temp0], %[step1_2], %[step1_5] \n\t"

404 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"

405 "sb %[Temp2], 0(%[dest_pix]) \n\t"

406 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

407

408 "lbu %[Temp1], 0(%[dest_pix]) \n\t"

409 "addi %[Temp0], %[Temp0], 16 \n\t"

410 "sra %[Temp0], %[Temp0], 5 \n\t"

411 "add %[Temp1], %[Temp1], %[Temp0] \n\t"

412 "sub %[Temp0], %[step1_1], %[step1_6] \n\t"

413 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"

414 "sb %[Temp2], 0(%[dest_pix]) \n\t"

415 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

416

417 "lbu %[Temp1], 0(%[dest_pix]) \n\t"

418 "addi %[Temp0], %[Temp0], 16 \n\t"

419 "sra %[Temp0], %[Temp0], 5 \n\t"

420 "add %[Temp1], %[Temp1], %[Temp0] \n\t"

421 "sub %[Temp0], %[step1_0], %[step1_7] \n\t"

422 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"

423 "sb %[Temp2], 0(%[dest_pix]) \n\t"

424 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

425

426 "lbu %[Temp1], 0(%[dest_pix]) \n\t"

427 "addi %[Temp0], %[Temp0], 16 \n\t"

428 "sra %[Temp0], %[Temp0], 5 \n\t"

429 "add %[Temp1], %[Temp1], %[Temp0] \n\t"

430 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"

431 "sb %[Temp2], 0(%[dest_pix]) \n\t"

432

433 : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),

434 [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),

435 [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),

436 [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),

437 [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),

438 [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),

439 [dest_pix] "+r" (dest_pix)

440 : [const_2_power_13] "r" (const_2_power_13),

441 [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),

442 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),

443 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),

444 [cospi_24_64] "r" (cospi_24_64),

445 [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)

446 );

447

448 input += 8;

449 }

450 }

451

452 void vp9_idct8x8_64_add_dspr2(const int16_t input, uint8_t dest,

453 int dest_stride) {

454 DECLARE_ALIGNED(32, int16_t, out[8 * 8]);

455 int16_t *outptr = out;

456 uint32_t pos = 45;

457

458 /* bit positon for extract from acc */

459 __asm__ __volatile__ (

460 "wrdsp %[pos], 1 \n\t"

461 :

462 : [pos] "r" (pos)

463 );

464

465 // First transform rows

466 idct8_rows_dspr2(input, outptr, 8);

467

468 // Then transform columns and add to dest

469 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);

470 }

471

472 static void iadst8_dspr2(const int16_t input, int16_t output) {

473 int s0, s1, s2, s3, s4, s5, s6, s7;

474 int x0, x1, x2, x3, x4, x5, x6, x7;

475

476 x0 = input[7];

477 x1 = input[0];

478 x2 = input[5];

479 x3 = input[2];

480 x4 = input[3];

481 x5 = input[4];

482 x6 = input[1];

483 x7 = input[6];

484

485 if (!(x0 \| x1 \| x2 \| x3 \| x4 \| x5 \| x6 \| x7)) {

486 output[0] = output[1] = output[2] = output[3] = output[4]

487 = output[5] = output[6] = output[7] = 0;

488 return;

489 }

490

491 // stage 1

492 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;

493 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;

494 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

495 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

496 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

497 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;

498 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;

499 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

500

501 x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);

502 x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);

503 x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);

504 x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);

505 x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);

506 x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);

507 x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);

508 x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);

509

510 // stage 2

511 s0 = x0;

512 s1 = x1;

513 s2 = x2;

514 s3 = x3;

515 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;

516 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;

517 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;

518 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

519

520 x0 = s0 + s2;

521 x1 = s1 + s3;

522 x2 = s0 - s2;

523 x3 = s1 - s3;

524 x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);

525 x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);

526 x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);

527 x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);

528

529 // stage 3

530 s2 = cospi_16_64 * (x2 + x3);

531 s3 = cospi_16_64 * (x2 - x3);

532 s6 = cospi_16_64 * (x6 + x7);

533 s7 = cospi_16_64 * (x6 - x7);

534

535 x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);

536 x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);

537 x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);

538 x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);

539

540 output[0] = x0;

541 output[1] = -x4;

542 output[2] = x6;

543 output[3] = -x2;

544 output[4] = x3;

545 output[5] = -x7;

546 output[6] = x5;

547 output[7] = -x1;

548 }

549

550 void vp9_iht8x8_64_add_dspr2(const int16_t input, uint8_t dest,	23 void vp9_iht8x8_64_add_dspr2(const int16_t input, uint8_t dest,

551 int dest_stride, int tx_type) {	24 int dest_stride, int tx_type) {

552 int i, j;	25 int i, j;

553 DECLARE_ALIGNED(32, int16_t, out[8 * 8]);	26 DECLARE_ALIGNED(32, int16_t, out[8 * 8]);

554 int16_t *outptr = out;	27 int16_t *outptr = out;

555 int16_t temp_in[8 * 8], temp_out[8];	28 int16_t temp_in[8 * 8], temp_out[8];

556 uint32_t pos = 45;	29 uint32_t pos = 45;

557	30

558 /* bit position for extract from acc */	31 /* bit position for extract from acc */

559 __asm__ __volatile__ (	32 __asm__ __volatile__ (

(...skipping 50 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
610 dest[j * dest_stride + i] =	83 dest[j * dest_stride + i] =

611 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)	84 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)

612 + dest[j * dest_stride + i]);	85 + dest[j * dest_stride + i]);

613 }	86 }

614 break;	87 break;

615 default:	88 default:

616 printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n");	89 printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n");

617 break;	90 break;

618 }	91 }

619 }	92 }

620

621 void vp9_idct8x8_12_add_dspr2(const int16_t input, uint8_t dest,

622 int dest_stride) {

623 DECLARE_ALIGNED(32, int16_t, out[8 * 8]);

624 int16_t *outptr = out;

625 uint32_t pos = 45;

626

627 /* bit positon for extract from acc */

628 __asm__ __volatile__ (

629 "wrdsp %[pos], 1 \n\t"

630 :

631 : [pos] "r" (pos)

632 );

633

634 // First transform rows

635 idct8_rows_dspr2(input, outptr, 4);

636

637 outptr += 4;

638

639 __asm__ __volatile__ (

640 "sw $zero, 0(%[outptr]) \n\t"

641 "sw $zero, 4(%[outptr]) \n\t"

642 "sw $zero, 16(%[outptr]) \n\t"

643 "sw $zero, 20(%[outptr]) \n\t"

644 "sw $zero, 32(%[outptr]) \n\t"

645 "sw $zero, 36(%[outptr]) \n\t"

646 "sw $zero, 48(%[outptr]) \n\t"

647 "sw $zero, 52(%[outptr]) \n\t"

648 "sw $zero, 64(%[outptr]) \n\t"

649 "sw $zero, 68(%[outptr]) \n\t"

650 "sw $zero, 80(%[outptr]) \n\t"

651 "sw $zero, 84(%[outptr]) \n\t"

652 "sw $zero, 96(%[outptr]) \n\t"

653 "sw $zero, 100(%[outptr]) \n\t"

654 "sw $zero, 112(%[outptr]) \n\t"

655 "sw $zero, 116(%[outptr]) \n\t"

656

657 :

658 : [outptr] "r" (outptr)

659 );

660

661

662 // Then transform columns and add to dest

663 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);

664 }

665

/*
 * 8x8 inverse DCT + add for the DC-only case: a single value derived from
 * input[0] is added to (or subtracted from) every pixel of the 8x8 block.
 *
 * input       - coefficients; only input[0] is read.
 * dest        - destination pixels, updated in place. Each row is accessed
 *               with two 32-bit loads/stores, so every row start must be
 *               four-byte aligned.
 * dest_stride - distance in bytes between consecutive destination rows.
 */
void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t t1, t2, vector_a1, vector_1, vector_2;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"

    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

  /* a1 = (out + 16) >> 5 */
  __asm__ __volatile__ (
    "addi %[out], %[out], 16 \n\t"
    "sra %[a1], %[out], 5 \n\t"

    : [out] "+r" (out), [a1] "=r" (a1)
    :
  );

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    /* negative offset: replicate |a1| into all four bytes and subtract */
    __asm__ __volatile__ (
      "abs %[absa1], %[a1] \n\t"
      "replv.qb %[vector_a1], %[absa1] \n\t"

      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
        "lw %[t1], 0(%[dest]) \n\t"
        "lw %[t2], 4(%[dest]) \n\t"
        "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
        "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
        "sw %[vector_1], 0(%[dest]) \n\t"
        "sw %[vector_2], 4(%[dest]) \n\t"
        "add %[dest], %[dest], %[dest_stride] \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [dest] "+r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
        /* destination pixels are read and written through a plain register
         * operand */
        : "memory"
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    /* non-negative offset: replicate a1 into all four bytes and add */
    __asm__ __volatile__ (
      "replv.qb %[vector_a1], %[a1] \n\t"

      : [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
        "lw %[t1], 0(%[dest]) \n\t"
        "lw %[t2], 4(%[dest]) \n\t"
        "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
        "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
        "sw %[vector_1], 0(%[dest]) \n\t"
        "sw %[vector_2], 4(%[dest]) \n\t"
        "add %[dest], %[dest], %[dest_stride] \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [dest] "+r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
        /* destination pixels are read and written through a plain register
         * operand */
        : "memory"
      );
    }
  }
}

746 #endif // #if HAVE_DSPR2	93 #endif // #if HAVE_DSPR2

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c ('k') | source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c » ('j') | no next file with comments »