source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c - Issue 1302353004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 /*

2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.

3 *

4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.

9 */

10

11 #include <assert.h>

12 #include <stdio.h>

13

14 #include "./vpx_config.h"

15 #include "./vp9_rtcd.h"

16 #include "vp9/common/vp9_common.h"

17 #include "vp9/common/vp9_blockd.h"

18 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

19 #include "vpx_dsp/txfm_common.h"

20

21 #if HAVE_DSPR2

/*
 * Row pass of the 32-point inverse transform (the name suggests an IDCT).
 *
 * For each of no_rows iterations the function reads one row of 32 int16_t
 * coefficients starting at input (halfword loads at byte offsets 0..62)
 * and writes 32 results to output[0 * 32] .. output[31 * 32]. It then
 * advances input by 32 elements and output by 1 element, so the results
 * belonging to one input row are stored 32 elements apart.
 *
 * Every product sum is formed in a DSP accumulator that is preloaded with
 * const_2_power_13 (8192) in LO and zero in HI and is read back with
 * "extp ..., 31". This function never programs the extract position
 * itself; the callers visible in this file write pos = 45 with wrdsp
 * before calling it.
 *
 * NOTE(review): input and output are used as pointers below (cast to a
 * const int32_t pointer, indexed, used as asm base addresses), yet the
 * declarators show no '*'. The missing '*' tokens and the "\|" sequences
 * in the zero test (presumably a plain '|') look like artifacts of how
 * this listing was rendered - confirm against the upstream file.
 * NOTE(review): none of the asm statements lists clobbers although they
 * modify $ac0..$ac3, and the zero-fill asm stores through output without
 * a "memory" clobber - confirm this is acceptable for the toolchains used.
 */
22 static void idct32_rows_dspr2(const int16_t input, int16_t output,

23 uint32_t no_rows) {

24 int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;

25 int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;

26 int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;

27 int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;

28 int16_t step1_28, step1_29, step1_30, step1_31;

29 int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;

30 int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;

31 int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;

32 int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;

33 int16_t step2_28, step2_29, step2_30, step2_31;

34 int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;

35 int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;

36 int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;

37 int16_t step3_29, step3_30, step3_31;

38 int temp0, temp1, temp2, temp3;

39 int load1, load2, load3, load4;

40 int result1, result2;

41 int temp21;

42 int i;

/* Rounding term loaded into LO of each accumulator before the multiply-adds. */
43 const int const_2_power_13 = 8192;

44 const int32_t *input_int;

45

/* Counts down from no_rows; one input row is handled per iteration. */
46 for (i = no_rows; i--; ) {

47 input_int = (const int32_t *)input;

48

/* Fast path: the 16 32-bit words OR-ed here span the row's 32 int16_t
   coefficients, so the branch is taken only when the whole row is zero. */
49 if (!(input_int[0] \| input_int[1] \| input_int[2] \| input_int[3] \|

50 input_int[4] \| input_int[5] \| input_int[6] \| input_int[7] \|

51 input_int[8] \| input_int[9] \| input_int[10] \| input_int[11] \|

52 input_int[12] \| input_int[13] \| input_int[14] \| input_int[15])) {

53 input += 32;

54

/* Store zero halfwords at byte offsets 0, 64, ..., 1984 from output
   (32 stores, 64 bytes apart), i.e. the same slots the final stage
   below writes as output[k * 32]. */
55 __asm__ __volatile__ (

56 "sh $zero, 0(%[output]) \n\t"

57 "sh $zero, 64(%[output]) \n\t"

58 "sh $zero, 128(%[output]) \n\t"

59 "sh $zero, 192(%[output]) \n\t"

60 "sh $zero, 256(%[output]) \n\t"

61 "sh $zero, 320(%[output]) \n\t"

62 "sh $zero, 384(%[output]) \n\t"

63 "sh $zero, 448(%[output]) \n\t"

64 "sh $zero, 512(%[output]) \n\t"

65 "sh $zero, 576(%[output]) \n\t"

66 "sh $zero, 640(%[output]) \n\t"

67 "sh $zero, 704(%[output]) \n\t"

68 "sh $zero, 768(%[output]) \n\t"

69 "sh $zero, 832(%[output]) \n\t"

70 "sh $zero, 896(%[output]) \n\t"

71 "sh $zero, 960(%[output]) \n\t"

72 "sh $zero, 1024(%[output]) \n\t"

73 "sh $zero, 1088(%[output]) \n\t"

74 "sh $zero, 1152(%[output]) \n\t"

75 "sh $zero, 1216(%[output]) \n\t"

76 "sh $zero, 1280(%[output]) \n\t"

77 "sh $zero, 1344(%[output]) \n\t"

78 "sh $zero, 1408(%[output]) \n\t"

79 "sh $zero, 1472(%[output]) \n\t"

80 "sh $zero, 1536(%[output]) \n\t"

81 "sh $zero, 1600(%[output]) \n\t"

82 "sh $zero, 1664(%[output]) \n\t"

83 "sh $zero, 1728(%[output]) \n\t"

84 "sh $zero, 1792(%[output]) \n\t"

85 "sh $zero, 1856(%[output]) \n\t"

86 "sh $zero, 1920(%[output]) \n\t"

87 "sh $zero, 1984(%[output]) \n\t"

88

89 :

90 : [output] "r" (output)

91 );

92

93 output += 1;

94

95 continue;

96 }

97

98 /* prefetch row */

99 prefetch_load((const uint8_t *)(input + 32));

100 prefetch_load((const uint8_t *)(input + 48));

101

/* Loads the halfwords at byte offsets 2, 62, 34, 30 and produces
   step1_16, step1_17, step1_30, step1_31. */
102 __asm__ __volatile__ (

103 "lh %[load1], 2(%[input]) \n\t"

104 "lh %[load2], 62(%[input]) \n\t"

105 "lh %[load3], 34(%[input]) \n\t"

106 "lh %[load4], 30(%[input]) \n\t"

107

108 "mtlo %[const_2_power_13], $ac1 \n\t"

109 "mthi $zero, $ac1 \n\t"

110 "mtlo %[const_2_power_13], $ac3 \n\t"

111 "mthi $zero, $ac3 \n\t"

112

113 "madd $ac1, %[load1], %[cospi_31_64] \n\t"

114 "msub $ac1, %[load2], %[cospi_1_64] \n\t"

115 "extp %[temp0], $ac1, 31 \n\t"

116

117 "madd $ac3, %[load1], %[cospi_1_64] \n\t"

118 "madd $ac3, %[load2], %[cospi_31_64] \n\t"

119 "extp %[temp3], $ac3, 31 \n\t"

120

121 "mtlo %[const_2_power_13], $ac1 \n\t"

122 "mthi $zero, $ac1 \n\t"

123 "mtlo %[const_2_power_13], $ac2 \n\t"

124 "mthi $zero, $ac2 \n\t"

125

126 "madd $ac2, %[load3], %[cospi_15_64] \n\t"

127 "msub $ac2, %[load4], %[cospi_17_64] \n\t"

128 "extp %[temp1], $ac2, 31 \n\t"

129

130 "madd $ac1, %[load3], %[cospi_17_64] \n\t"

131 "madd $ac1, %[load4], %[cospi_15_64] \n\t"

132 "extp %[temp2], $ac1, 31 \n\t"

133

134 "mtlo %[const_2_power_13], $ac1 \n\t"

135 "mthi $zero, $ac1 \n\t"

136 "mtlo %[const_2_power_13], $ac3 \n\t"

137 "mthi $zero, $ac3 \n\t"

138

139 "sub %[load1], %[temp3], %[temp2] \n\t"

140 "sub %[load2], %[temp0], %[temp1] \n\t"

141

142 "madd $ac1, %[load1], %[cospi_28_64] \n\t"

143 "msub $ac1, %[load2], %[cospi_4_64] \n\t"

144 "madd $ac3, %[load1], %[cospi_4_64] \n\t"

145 "madd $ac3, %[load2], %[cospi_28_64] \n\t"

146

147 "extp %[step1_17], $ac1, 31 \n\t"

148 "extp %[step1_30], $ac3, 31 \n\t"

149 "add %[step1_16], %[temp0], %[temp1] \n\t"

150 "add %[step1_31], %[temp2], %[temp3] \n\t"

151

152 : [load1] "=&r" (load1), [load2] "=&r" (load2),

153 [load3] "=&r" (load3), [load4] "=&r" (load4),

154 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

155 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

156 [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),

157 [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)

158 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

159 [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),

160 [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),

161 [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)

162 );

163

/* Loads the halfwords at byte offsets 18, 46, 50, 14 and produces
   step1_18, step1_19, step1_28, step1_29. */
164 __asm__ __volatile__ (

165 "lh %[load1], 18(%[input]) \n\t"

166 "lh %[load2], 46(%[input]) \n\t"

167 "lh %[load3], 50(%[input]) \n\t"

168 "lh %[load4], 14(%[input]) \n\t"

169

170 "mtlo %[const_2_power_13], $ac1 \n\t"

171 "mthi $zero, $ac1 \n\t"

172 "mtlo %[const_2_power_13], $ac3 \n\t"

173 "mthi $zero, $ac3 \n\t"

174

175 "madd $ac1, %[load1], %[cospi_23_64] \n\t"

176 "msub $ac1, %[load2], %[cospi_9_64] \n\t"

177 "extp %[temp0], $ac1, 31 \n\t"

178

179 "madd $ac3, %[load1], %[cospi_9_64] \n\t"

180 "madd $ac3, %[load2], %[cospi_23_64] \n\t"

181 "extp %[temp3], $ac3, 31 \n\t"

182

183 "mtlo %[const_2_power_13], $ac1 \n\t"

184 "mthi $zero, $ac1 \n\t"

185 "mtlo %[const_2_power_13], $ac2 \n\t"

186 "mthi $zero, $ac2 \n\t"

187

188 "madd $ac2, %[load3], %[cospi_7_64] \n\t"

189 "msub $ac2, %[load4], %[cospi_25_64] \n\t"

190 "extp %[temp1], $ac2, 31 \n\t"

191

192 "madd $ac1, %[load3], %[cospi_25_64] \n\t"

193 "madd $ac1, %[load4], %[cospi_7_64] \n\t"

194 "extp %[temp2], $ac1, 31 \n\t"

195

196 "mtlo %[const_2_power_13], $ac1 \n\t"

197 "mthi $zero, $ac1 \n\t"

198 "mtlo %[const_2_power_13], $ac3 \n\t"

199 "mthi $zero, $ac3 \n\t"

200

201 "sub %[load1], %[temp1], %[temp0] \n\t"

202 "sub %[load2], %[temp2], %[temp3] \n\t"

203

204 "msub $ac1, %[load1], %[cospi_28_64] \n\t"

205 "msub $ac1, %[load2], %[cospi_4_64] \n\t"

206 "msub $ac3, %[load1], %[cospi_4_64] \n\t"

207 "madd $ac3, %[load2], %[cospi_28_64] \n\t"

208

209 "extp %[step1_18], $ac1, 31 \n\t"

210 "extp %[step1_29], $ac3, 31 \n\t"

211 "add %[step1_19], %[temp0], %[temp1] \n\t"

212 "add %[step1_28], %[temp2], %[temp3] \n\t"

213

214 : [load1] "=&r" (load1), [load2] "=&r" (load2),

215 [load3] "=&r" (load3), [load4] "=&r" (load4),

216 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

217 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

218 [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),

219 [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)

220 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

221 [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),

222 [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),

223 [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)

224 );

225

/* Loads the halfwords at byte offsets 10, 54, 42, 22 and produces
   step1_20, step1_21, step1_26, step1_27. */
226 __asm__ __volatile__ (

227 "lh %[load1], 10(%[input]) \n\t"

228 "lh %[load2], 54(%[input]) \n\t"

229 "lh %[load3], 42(%[input]) \n\t"

230 "lh %[load4], 22(%[input]) \n\t"

231

232 "mtlo %[const_2_power_13], $ac1 \n\t"

233 "mthi $zero, $ac1 \n\t"

234 "mtlo %[const_2_power_13], $ac3 \n\t"

235 "mthi $zero, $ac3 \n\t"

236

237 "madd $ac1, %[load1], %[cospi_27_64] \n\t"

238 "msub $ac1, %[load2], %[cospi_5_64] \n\t"

239 "extp %[temp0], $ac1, 31 \n\t"

240

241 "madd $ac3, %[load1], %[cospi_5_64] \n\t"

242 "madd $ac3, %[load2], %[cospi_27_64] \n\t"

243 "extp %[temp3], $ac3, 31 \n\t"

244

245 "mtlo %[const_2_power_13], $ac1 \n\t"

246 "mthi $zero, $ac1 \n\t"

247 "mtlo %[const_2_power_13], $ac2 \n\t"

248 "mthi $zero, $ac2 \n\t"

249

250 "madd $ac2, %[load3], %[cospi_11_64] \n\t"

251 "msub $ac2, %[load4], %[cospi_21_64] \n\t"

252 "extp %[temp1], $ac2, 31 \n\t"

253

254 "madd $ac1, %[load3], %[cospi_21_64] \n\t"

255 "madd $ac1, %[load4], %[cospi_11_64] \n\t"

256 "extp %[temp2], $ac1, 31 \n\t"

257

258 "mtlo %[const_2_power_13], $ac1 \n\t"

259 "mthi $zero, $ac1 \n\t"

260 "mtlo %[const_2_power_13], $ac3 \n\t"

261 "mthi $zero, $ac3 \n\t"

262

263 "sub %[load1], %[temp0], %[temp1] \n\t"

264 "sub %[load2], %[temp3], %[temp2] \n\t"

265

266 "madd $ac1, %[load2], %[cospi_12_64] \n\t"

267 "msub $ac1, %[load1], %[cospi_20_64] \n\t"

268 "madd $ac3, %[load1], %[cospi_12_64] \n\t"

269 "madd $ac3, %[load2], %[cospi_20_64] \n\t"

270

271 "extp %[step1_21], $ac1, 31 \n\t"

272 "extp %[step1_26], $ac3, 31 \n\t"

273 "add %[step1_20], %[temp0], %[temp1] \n\t"

274 "add %[step1_27], %[temp2], %[temp3] \n\t"

275

276 : [load1] "=&r" (load1), [load2] "=&r" (load2),

277 [load3] "=&r" (load3), [load4] "=&r" (load4),

278 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

279 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

280 [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),

281 [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)

282 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

283 [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),

284 [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),

285 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

286 );

287

/* Loads the halfwords at byte offsets 26, 38, 58, 6 and produces
   step1_22, step1_23, step1_24, step1_25. */
288 __asm__ __volatile__ (

289 "lh %[load1], 26(%[input]) \n\t"

290 "lh %[load2], 38(%[input]) \n\t"

291 "lh %[load3], 58(%[input]) \n\t"

292 "lh %[load4], 6(%[input]) \n\t"

293

294 "mtlo %[const_2_power_13], $ac1 \n\t"

295 "mthi $zero, $ac1 \n\t"

296 "mtlo %[const_2_power_13], $ac3 \n\t"

297 "mthi $zero, $ac3 \n\t"

298

299 "madd $ac1, %[load1], %[cospi_19_64] \n\t"

300 "msub $ac1, %[load2], %[cospi_13_64] \n\t"

301 "extp %[temp0], $ac1, 31 \n\t"

302

303 "madd $ac3, %[load1], %[cospi_13_64] \n\t"

304 "madd $ac3, %[load2], %[cospi_19_64] \n\t"

305 "extp %[temp3], $ac3, 31 \n\t"

306

307 "mtlo %[const_2_power_13], $ac1 \n\t"

308 "mthi $zero, $ac1 \n\t"

309 "mtlo %[const_2_power_13], $ac2 \n\t"

310 "mthi $zero, $ac2 \n\t"

311

312 "madd $ac2, %[load3], %[cospi_3_64] \n\t"

313 "msub $ac2, %[load4], %[cospi_29_64] \n\t"

314 "extp %[temp1], $ac2, 31 \n\t"

315

316 "madd $ac1, %[load3], %[cospi_29_64] \n\t"

317 "madd $ac1, %[load4], %[cospi_3_64] \n\t"

318 "extp %[temp2], $ac1, 31 \n\t"

319

320 "mtlo %[const_2_power_13], $ac1 \n\t"

321 "mthi $zero, $ac1 \n\t"

322 "mtlo %[const_2_power_13], $ac3 \n\t"

323 "mthi $zero, $ac3 \n\t"

324

325 "sub %[load1], %[temp1], %[temp0] \n\t"

326 "sub %[load2], %[temp2], %[temp3] \n\t"

327

328 "msub $ac1, %[load1], %[cospi_12_64] \n\t"

329 "msub $ac1, %[load2], %[cospi_20_64] \n\t"

330 "msub $ac3, %[load1], %[cospi_20_64] \n\t"

331 "madd $ac3, %[load2], %[cospi_12_64] \n\t"

332

333 "extp %[step1_22], $ac1, 31 \n\t"

334 "extp %[step1_25], $ac3, 31 \n\t"

335 "add %[step1_23], %[temp0], %[temp1] \n\t"

336 "add %[step1_24], %[temp2], %[temp3] \n\t"

337

338 : [load1] "=&r" (load1), [load2] "=&r" (load2),

339 [load3] "=&r" (load3), [load4] "=&r" (load4),

340 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

341 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

342 [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),

343 [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)

344 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

345 [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),

346 [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),

347 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

348 );

349

/* Loads the halfwords at byte offsets 4, 60, 36, 28 and produces
   step2_8, step2_9, step2_14, step2_15. */
350 __asm__ __volatile__ (

351 "lh %[load1], 4(%[input]) \n\t"

352 "lh %[load2], 60(%[input]) \n\t"

353 "lh %[load3], 36(%[input]) \n\t"

354 "lh %[load4], 28(%[input]) \n\t"

355

356 "mtlo %[const_2_power_13], $ac1 \n\t"

357 "mthi $zero, $ac1 \n\t"

358 "mtlo %[const_2_power_13], $ac3 \n\t"

359 "mthi $zero, $ac3 \n\t"

360

361 "madd $ac1, %[load1], %[cospi_30_64] \n\t"

362 "msub $ac1, %[load2], %[cospi_2_64] \n\t"

363 "extp %[temp0], $ac1, 31 \n\t"

364

365 "madd $ac3, %[load1], %[cospi_2_64] \n\t"

366 "madd $ac3, %[load2], %[cospi_30_64] \n\t"

367 "extp %[temp3], $ac3, 31 \n\t"

368

369 "mtlo %[const_2_power_13], $ac1 \n\t"

370 "mthi $zero, $ac1 \n\t"

371 "mtlo %[const_2_power_13], $ac2 \n\t"

372 "mthi $zero, $ac2 \n\t"

373

374 "madd $ac2, %[load3], %[cospi_14_64] \n\t"

375 "msub $ac2, %[load4], %[cospi_18_64] \n\t"

376 "extp %[temp1], $ac2, 31 \n\t"

377

378 "madd $ac1, %[load3], %[cospi_18_64] \n\t"

379 "madd $ac1, %[load4], %[cospi_14_64] \n\t"

380 "extp %[temp2], $ac1, 31 \n\t"

381

382 "mtlo %[const_2_power_13], $ac1 \n\t"

383 "mthi $zero, $ac1 \n\t"

384 "mtlo %[const_2_power_13], $ac3 \n\t"

385 "mthi $zero, $ac3 \n\t"

386

387 "sub %[load1], %[temp0], %[temp1] \n\t"

388 "sub %[load2], %[temp3], %[temp2] \n\t"

389

390 "msub $ac1, %[load1], %[cospi_8_64] \n\t"

391 "madd $ac1, %[load2], %[cospi_24_64] \n\t"

392 "madd $ac3, %[load1], %[cospi_24_64] \n\t"

393 "madd $ac3, %[load2], %[cospi_8_64] \n\t"

394

395 "extp %[step2_9], $ac1, 31 \n\t"

396 "extp %[step2_14], $ac3, 31 \n\t"

397 "add %[step2_8], %[temp0], %[temp1] \n\t"

398 "add %[step2_15], %[temp2], %[temp3] \n\t"

399

400 : [load1] "=&r" (load1), [load2] "=&r" (load2),

401 [load3] "=&r" (load3), [load4] "=&r" (load4),

402 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

403 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

404 [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),

405 [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)

406 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

407 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

408 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

409 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

410 );

411

/* Loads the halfwords at byte offsets 20, 44, 52, 12 and produces
   step2_10, step2_11, step2_12, step2_13. */
412 __asm__ __volatile__ (

413 "lh %[load1], 20(%[input]) \n\t"

414 "lh %[load2], 44(%[input]) \n\t"

415 "lh %[load3], 52(%[input]) \n\t"

416 "lh %[load4], 12(%[input]) \n\t"

417

418 "mtlo %[const_2_power_13], $ac1 \n\t"

419 "mthi $zero, $ac1 \n\t"

420 "mtlo %[const_2_power_13], $ac3 \n\t"

421 "mthi $zero, $ac3 \n\t"

422

423 "madd $ac1, %[load1], %[cospi_22_64] \n\t"

424 "msub $ac1, %[load2], %[cospi_10_64] \n\t"

425 "extp %[temp0], $ac1, 31 \n\t"

426

427 "madd $ac3, %[load1], %[cospi_10_64] \n\t"

428 "madd $ac3, %[load2], %[cospi_22_64] \n\t"

429 "extp %[temp3], $ac3, 31 \n\t"

430

431 "mtlo %[const_2_power_13], $ac1 \n\t"

432 "mthi $zero, $ac1 \n\t"

433 "mtlo %[const_2_power_13], $ac2 \n\t"

434 "mthi $zero, $ac2 \n\t"

435

436 "madd $ac2, %[load3], %[cospi_6_64] \n\t"

437 "msub $ac2, %[load4], %[cospi_26_64] \n\t"

438 "extp %[temp1], $ac2, 31 \n\t"

439

440 "madd $ac1, %[load3], %[cospi_26_64] \n\t"

441 "madd $ac1, %[load4], %[cospi_6_64] \n\t"

442 "extp %[temp2], $ac1, 31 \n\t"

443

444 "mtlo %[const_2_power_13], $ac1 \n\t"

445 "mthi $zero, $ac1 \n\t"

446 "mtlo %[const_2_power_13], $ac3 \n\t"

447 "mthi $zero, $ac3 \n\t"

448

449 "sub %[load1], %[temp1], %[temp0] \n\t"

450 "sub %[load2], %[temp2], %[temp3] \n\t"

451

452 "msub $ac1, %[load1], %[cospi_24_64] \n\t"

453 "msub $ac1, %[load2], %[cospi_8_64] \n\t"

454 "madd $ac3, %[load2], %[cospi_24_64] \n\t"

455 "msub $ac3, %[load1], %[cospi_8_64] \n\t"

456

457 "extp %[step2_10], $ac1, 31 \n\t"

458 "extp %[step2_13], $ac3, 31 \n\t"

459 "add %[step2_11], %[temp0], %[temp1] \n\t"

460 "add %[step2_12], %[temp2], %[temp3] \n\t"

461

462 : [load1] "=&r" (load1), [load2] "=&r" (load2),

463 [load3] "=&r" (load3), [load4] "=&r" (load4),

464 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

465 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

466 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

467 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

468 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

469 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

470 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

471 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

472 );

473

/* No loads here: combines step2_8..step2_15 into step3_8..step3_15.
   step3_8, 9, 14, 15 are plain sums; step3_10..13 are signed
   combinations of four step2 values scaled by cospi_16_64, one per
   accumulator ($ac0..$ac3). */
474 __asm__ __volatile__ (

475 "mtlo %[const_2_power_13], $ac0 \n\t"

476 "mthi $zero, $ac0 \n\t"

477 "sub %[temp0], %[step2_14], %[step2_13] \n\t"

478 "sub %[temp0], %[temp0], %[step2_9] \n\t"

479 "add %[temp0], %[temp0], %[step2_10] \n\t"

480 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

481

482 "mtlo %[const_2_power_13], $ac1 \n\t"

483 "mthi $zero, $ac1 \n\t"

484 "sub %[temp1], %[step2_14], %[step2_13] \n\t"

485 "add %[temp1], %[temp1], %[step2_9] \n\t"

486 "sub %[temp1], %[temp1], %[step2_10] \n\t"

487 "madd $ac1, %[temp1], %[cospi_16_64] \n\t"

488

489 "mtlo %[const_2_power_13], $ac2 \n\t"

490 "mthi $zero, $ac2 \n\t"

491 "sub %[temp0], %[step2_15], %[step2_12] \n\t"

492 "sub %[temp0], %[temp0], %[step2_8] \n\t"

493 "add %[temp0], %[temp0], %[step2_11] \n\t"

494 "madd $ac2, %[temp0], %[cospi_16_64] \n\t"

495

496 "mtlo %[const_2_power_13], $ac3 \n\t"

497 "mthi $zero, $ac3 \n\t"

498 "sub %[temp1], %[step2_15], %[step2_12] \n\t"

499 "add %[temp1], %[temp1], %[step2_8] \n\t"

500 "sub %[temp1], %[temp1], %[step2_11] \n\t"

501 "madd $ac3, %[temp1], %[cospi_16_64] \n\t"

502

503 "add %[step3_8], %[step2_8], %[step2_11] \n\t"

504 "add %[step3_9], %[step2_9], %[step2_10] \n\t"

505 "add %[step3_14], %[step2_13], %[step2_14] \n\t"

506 "add %[step3_15], %[step2_12], %[step2_15] \n\t"

507

508 "extp %[step3_10], $ac0, 31 \n\t"

509 "extp %[step3_13], $ac1, 31 \n\t"

510 "extp %[step3_11], $ac2, 31 \n\t"

511 "extp %[step3_12], $ac3, 31 \n\t"

512

513 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

514 [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),

515 [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),

516 [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),

517 [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)

518 : [const_2_power_13] "r" (const_2_power_13),

519 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),

520 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),

521 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),

522 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),

523 [cospi_16_64] "r" (cospi_16_64)

524 );

525

/* step3_18 is formed in $ac0 by the asm below; its partner step3_29 is
   computed from the same two inputs in plain C right after it. The same
   asm-plus-C pairing repeats for (19, 28), (20, 27) and (21, 26). */
526 step2_18 = step1_17 - step1_18;

527 step2_29 = step1_30 - step1_29;

528

529 __asm__ __volatile__ (

530 "mtlo %[const_2_power_13], $ac0 \n\t"

531 "mthi $zero, $ac0 \n\t"

532 "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"

533 "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"

534 "extp %[step3_18], $ac0, 31 \n\t"

535

536 : [step3_18] "=r" (step3_18)

537 : [const_2_power_13] "r" (const_2_power_13),

538 [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),

539 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

540 );

541

542 temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;

543 step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

544

545 step2_19 = step1_16 - step1_19;

546 step2_28 = step1_31 - step1_28;

547

548 __asm__ __volatile__ (

549 "mtlo %[const_2_power_13], $ac0 \n\t"

550 "mthi $zero, $ac0 \n\t"

551 "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"

552 "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"

553 "extp %[step3_19], $ac0, 31 \n\t"

554

555 : [step3_19] "=r" (step3_19)

556 : [const_2_power_13] "r" (const_2_power_13),

557 [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),

558 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

559 );

560

561 temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;

562 step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

563

564 step3_16 = step1_16 + step1_19;

565 step3_17 = step1_17 + step1_18;

566 step3_30 = step1_29 + step1_30;

567 step3_31 = step1_28 + step1_31;

568

569 step2_20 = step1_23 - step1_20;

570 step2_27 = step1_24 - step1_27;

571

572 __asm__ __volatile__ (

573 "mtlo %[const_2_power_13], $ac0 \n\t"

574 "mthi $zero, $ac0 \n\t"

575 "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"

576 "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"

577 "extp %[step3_20], $ac0, 31 \n\t"

578

579 : [step3_20] "=r" (step3_20)

580 : [const_2_power_13] "r" (const_2_power_13),

581 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

582 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

583 );

584

585 temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;

586 step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

587

588 step2_21 = step1_22 - step1_21;

589 step2_26 = step1_25 - step1_26;

590

591 __asm__ __volatile__ (

592 "mtlo %[const_2_power_13], $ac1 \n\t"

593 "mthi $zero, $ac1 \n\t"

594 "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"

595 "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"

596 "extp %[step3_21], $ac1, 31 \n\t"

597

598 : [step3_21] "=r" (step3_21)

599 : [const_2_power_13] "r" (const_2_power_13),

600 [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),

601 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

602 );

603

604 temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;

605 step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

606

607 step3_22 = step1_21 + step1_22;

608 step3_23 = step1_20 + step1_23;

609 step3_24 = step1_24 + step1_27;

610 step3_25 = step1_25 + step1_26;

611

/* Sums and differences of step3_16..step3_31; note that step2_18..21 and
   step2_26..29, used as scratch above, are overwritten here. */
612 step2_16 = step3_16 + step3_23;

613 step2_17 = step3_17 + step3_22;

614 step2_18 = step3_18 + step3_21;

615 step2_19 = step3_19 + step3_20;

616 step2_20 = step3_19 - step3_20;

617 step2_21 = step3_18 - step3_21;

618 step2_22 = step3_17 - step3_22;

619 step2_23 = step3_16 - step3_23;

620

621 step2_24 = step3_31 - step3_24;

622 step2_25 = step3_30 - step3_25;

623 step2_26 = step3_29 - step3_26;

624 step2_27 = step3_28 - step3_27;

625 step2_28 = step3_28 + step3_27;

626 step2_29 = step3_29 + step3_26;

627 step2_30 = step3_30 + step3_25;

628 step2_31 = step3_31 + step3_24;

629

/* Loads the halfwords at byte offsets 0, 32, 16, 48 and produces
   step1_0, step1_1, step1_2, step1_3. */
630 __asm__ __volatile__ (

631 "lh %[load1], 0(%[input]) \n\t"

632 "lh %[load2], 32(%[input]) \n\t"

633 "lh %[load3], 16(%[input]) \n\t"

634 "lh %[load4], 48(%[input]) \n\t"

635

636 "mtlo %[const_2_power_13], $ac1 \n\t"

637 "mthi $zero, $ac1 \n\t"

638 "mtlo %[const_2_power_13], $ac2 \n\t"

639 "mthi $zero, $ac2 \n\t"

640 "add %[result1], %[load1], %[load2] \n\t"

641 "sub %[result2], %[load1], %[load2] \n\t"

642 "madd $ac1, %[result1], %[cospi_16_64] \n\t"

643 "madd $ac2, %[result2], %[cospi_16_64] \n\t"

644 "extp %[temp0], $ac1, 31 \n\t"

645 "extp %[temp1], $ac2, 31 \n\t"

646

647 "mtlo %[const_2_power_13], $ac3 \n\t"

648 "mthi $zero, $ac3 \n\t"

649 "madd $ac3, %[load3], %[cospi_24_64] \n\t"

650 "msub $ac3, %[load4], %[cospi_8_64] \n\t"

651 "extp %[temp2], $ac3, 31 \n\t"

652

653 "mtlo %[const_2_power_13], $ac1 \n\t"

654 "mthi $zero, $ac1 \n\t"

655 "madd $ac1, %[load3], %[cospi_8_64] \n\t"

656 "madd $ac1, %[load4], %[cospi_24_64] \n\t"

657 "extp %[temp3], $ac1, 31 \n\t"

658

659 "add %[step1_0], %[temp0], %[temp3] \n\t"

660 "add %[step1_1], %[temp1], %[temp2] \n\t"

661 "sub %[step1_2], %[temp1], %[temp2] \n\t"

662 "sub %[step1_3], %[temp0], %[temp3] \n\t"

663

664 : [load1] "=&r" (load1), [load2] "=&r" (load2),

665 [load3] "=&r" (load3), [load4] "=&r" (load4),

666 [result1] "=&r" (result1), [result2] "=&r" (result2),

667 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

668 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

669 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

670 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

671 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

672 [cospi_16_64] "r" (cospi_16_64),

673 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

674

675 );

676

/* Loads the halfwords at byte offsets 8, 56, 40, 24 and produces
   step1_4, step1_5, step1_6, step1_7. */
677 __asm__ __volatile__ (

678 "lh %[load1], 8(%[input]) \n\t"

679 "lh %[load2], 56(%[input]) \n\t"

680 "lh %[load3], 40(%[input]) \n\t"

681 "lh %[load4], 24(%[input]) \n\t"

682

683 "mtlo %[const_2_power_13], $ac1 \n\t"

684 "mthi $zero, $ac1 \n\t"

685 "mtlo %[const_2_power_13], $ac3 \n\t"

686 "mthi $zero, $ac3 \n\t"

687

688 "madd $ac1, %[load1], %[cospi_28_64] \n\t"

689 "msub $ac1, %[load2], %[cospi_4_64] \n\t"

690 "extp %[temp0], $ac1, 31 \n\t"

691

692 "madd $ac3, %[load1], %[cospi_4_64] \n\t"

693 "madd $ac3, %[load2], %[cospi_28_64] \n\t"

694 "extp %[temp3], $ac3, 31 \n\t"

695

696 "mtlo %[const_2_power_13], $ac1 \n\t"

697 "mthi $zero, $ac1 \n\t"

698 "mtlo %[const_2_power_13], $ac2 \n\t"

699 "mthi $zero, $ac2 \n\t"

700

701 "madd $ac2, %[load3], %[cospi_12_64] \n\t"

702 "msub $ac2, %[load4], %[cospi_20_64] \n\t"

703 "extp %[temp1], $ac2, 31 \n\t"

704

705 "madd $ac1, %[load3], %[cospi_20_64] \n\t"

706 "madd $ac1, %[load4], %[cospi_12_64] \n\t"

707 "extp %[temp2], $ac1, 31 \n\t"

708

709 "mtlo %[const_2_power_13], $ac1 \n\t"

710 "mthi $zero, $ac1 \n\t"

711 "mtlo %[const_2_power_13], $ac3 \n\t"

712 "mthi $zero, $ac3 \n\t"

713

714 "sub %[load1], %[temp3], %[temp2] \n\t"

715 "sub %[load1], %[load1], %[temp0] \n\t"

716 "add %[load1], %[load1], %[temp1] \n\t"

717

718 "sub %[load2], %[temp0], %[temp1] \n\t"

719 "sub %[load2], %[load2], %[temp2] \n\t"

720 "add %[load2], %[load2], %[temp3] \n\t"

721

722 "madd $ac1, %[load1], %[cospi_16_64] \n\t"

723 "madd $ac3, %[load2], %[cospi_16_64] \n\t"

724

725 "extp %[step1_5], $ac1, 31 \n\t"

726 "extp %[step1_6], $ac3, 31 \n\t"

727 "add %[step1_4], %[temp0], %[temp1] \n\t"

728 "add %[step1_7], %[temp3], %[temp2] \n\t"

729

730 : [load1] "=&r" (load1), [load2] "=&r" (load2),

731 [load3] "=&r" (load3), [load4] "=&r" (load4),

732 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

733 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

734 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

735 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

736 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

737 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

738 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

739 [cospi_16_64] "r" (cospi_16_64)

740 );

741

/* Sums and differences of step1_0..step1_7 give step2_0..step2_7. */
742 step2_0 = step1_0 + step1_7;

743 step2_1 = step1_1 + step1_6;

744 step2_2 = step1_2 + step1_5;

745 step2_3 = step1_3 + step1_4;

746 step2_4 = step1_3 - step1_4;

747 step2_5 = step1_2 - step1_5;

748 step2_6 = step1_1 - step1_6;

749 step2_7 = step1_0 - step1_7;

750

/* step2_0..step2_7 combined with step3_8..step3_15 give step1_0..step1_15
   (step1_0..step1_7 are reused as destinations here). */
751 step1_0 = step2_0 + step3_15;

752 step1_1 = step2_1 + step3_14;

753 step1_2 = step2_2 + step3_13;

754 step1_3 = step2_3 + step3_12;

755 step1_4 = step2_4 + step3_11;

756 step1_5 = step2_5 + step3_10;

757 step1_6 = step2_6 + step3_9;

758 step1_7 = step2_7 + step3_8;

759 step1_8 = step2_7 - step3_8;

760 step1_9 = step2_6 - step3_9;

761 step1_10 = step2_5 - step3_10;

762 step1_11 = step2_4 - step3_11;

763 step1_12 = step2_3 - step3_12;

764 step1_13 = step2_2 - step3_13;

765 step1_14 = step2_1 - step3_14;

766 step1_15 = step2_0 - step3_15;

767

/* For each pair (20, 27), (21, 26), (22, 25), (23, 24): the difference
   scaled by cospi_16_64 is formed in $ac0 by asm and stored in the lower
   step1 index; the sum scaled by cospi_16_64 is formed in plain C and
   stored in the higher step1 index. */
768 __asm__ __volatile__ (

769 "sub %[temp0], %[step2_27], %[step2_20] \n\t"

770 "mtlo %[const_2_power_13], $ac0 \n\t"

771 "mthi $zero, $ac0 \n\t"

772 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

773 "extp %[step1_20], $ac0, 31 \n\t"

774

775 : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)

776 : [const_2_power_13] "r" (const_2_power_13),

777 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

778 [cospi_16_64] "r" (cospi_16_64)

779 );

780

781 temp21 = (step2_20 + step2_27) * cospi_16_64;

782 step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

783

784 __asm__ __volatile__ (

785 "sub %[temp0], %[step2_26], %[step2_21] \n\t"

786 "mtlo %[const_2_power_13], $ac0 \n\t"

787 "mthi $zero, $ac0 \n\t"

788 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

789 "extp %[step1_21], $ac0, 31 \n\t"

790

791 : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)

792 : [const_2_power_13] "r" (const_2_power_13),

793 [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),

794 [cospi_16_64] "r" (cospi_16_64)

795 );

796

797 temp21 = (step2_21 + step2_26) * cospi_16_64;

798 step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

799

800 __asm__ __volatile__ (

801 "sub %[temp0], %[step2_25], %[step2_22] \n\t"

802 "mtlo %[const_2_power_13], $ac0 \n\t"

803 "mthi $zero, $ac0 \n\t"

804 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

805 "extp %[step1_22], $ac0, 31 \n\t"

806

807 : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)

808 : [const_2_power_13] "r" (const_2_power_13),

809 [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),

810 [cospi_16_64] "r" (cospi_16_64)

811 );

812

813 temp21 = (step2_22 + step2_25) * cospi_16_64;

814 step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

815

816 __asm__ __volatile__ (

817 "sub %[temp0], %[step2_24], %[step2_23] \n\t"

818 "mtlo %[const_2_power_13], $ac0 \n\t"

819 "mthi $zero, $ac0 \n\t"

820 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

821 "extp %[step1_23], $ac0, 31 \n\t"

822

823 : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)

824 : [const_2_power_13] "r" (const_2_power_13),

825 [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),

826 [cospi_16_64] "r" (cospi_16_64)

827 );

828

829 temp21 = (step2_23 + step2_24) * cospi_16_64;

830 step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

831

/* Outputs k and 31 - k are the sum and the difference of the same pair of
   intermediate values; each is stored 32 elements apart in output. */
832 // final stage

833 output[0 * 32] = step1_0 + step2_31;

834 output[1 * 32] = step1_1 + step2_30;

835 output[2 * 32] = step1_2 + step2_29;

836 output[3 * 32] = step1_3 + step2_28;

837 output[4 * 32] = step1_4 + step1_27;

838 output[5 * 32] = step1_5 + step1_26;

839 output[6 * 32] = step1_6 + step1_25;

840 output[7 * 32] = step1_7 + step1_24;

841 output[8 * 32] = step1_8 + step1_23;

842 output[9 * 32] = step1_9 + step1_22;

843 output[10 * 32] = step1_10 + step1_21;

844 output[11 * 32] = step1_11 + step1_20;

845 output[12 * 32] = step1_12 + step2_19;

846 output[13 * 32] = step1_13 + step2_18;

847 output[14 * 32] = step1_14 + step2_17;

848 output[15 * 32] = step1_15 + step2_16;

849 output[16 * 32] = step1_15 - step2_16;

850 output[17 * 32] = step1_14 - step2_17;

851 output[18 * 32] = step1_13 - step2_18;

852 output[19 * 32] = step1_12 - step2_19;

853 output[20 * 32] = step1_11 - step1_20;

854 output[21 * 32] = step1_10 - step1_21;

855 output[22 * 32] = step1_9 - step1_22;

856 output[23 * 32] = step1_8 - step1_23;

857 output[24 * 32] = step1_7 - step1_24;

858 output[25 * 32] = step1_6 - step1_25;

859 output[26 * 32] = step1_5 - step1_26;

860 output[27 * 32] = step1_4 - step1_27;

861 output[28 * 32] = step1_3 - step2_28;

862 output[29 * 32] = step1_2 - step2_29;

863 output[30 * 32] = step1_1 - step2_30;

864 output[31 * 32] = step1_0 - step2_31;

865

/* Next input row; next output column. */
866 input += 32;

867 output += 1;

868 }

869 }

870

/*
 * Two-pass 32x32 inverse transform entry point: runs the row pass over all
 * 32 rows of `input` into a 32x32 int16_t scratch buffer, then hands that
 * buffer to the column pass together with `dest` and `dest_stride`.
 *
 * input       - coefficient block, forwarded unchanged to the row pass.
 * dest        - destination forwarded to the column pass (the helper's name
 *               suggests the result is added into it; the helper is not
 *               visible here - confirm).
 * dest_stride - forwarded to the column pass.
 *
 * NOTE(review): `input` and `dest` are used as pointers by the callees; the
 * `*` declarators seem to be missing from this copy of the signature -
 * confirm against the upstream file.
 */
871 void vp9_idct32x32_1024_add_dspr2(const int16_t input, uint8_t dest,

872 int dest_stride) {

// Intermediate buffer for the row-pass result, declared with 32-byte
// alignment.
873 DECLARE_ALIGNED(32, int16_t, out[32 * 32]);

874 int16_t *outptr = out;

875 uint32_t pos = 45;

876

877 /* bit position for extract from acc */

// Must run before the passes below: wrdsp with mask 1 writes `pos` (45) to
// the DSP control register, which the extp instructions in the row pass
// depend on.
878 __asm__ __volatile__ (

879 "wrdsp %[pos], 1 \n\t"

880 :

881 : [pos] "r" (pos)

882 );

883

884 // Rows

885 idct32_rows_dspr2(input, outptr, 32);

886

887 // Columns

888 vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride);

889 }

890

/*
 * Reduced variant of the 32x32 inverse transform: only the first 8 rows of
 * `input` go through the row pass; the remaining part of the scratch buffer
 * is cleared to zero by hand before the column pass runs on all of it.
 *
 * input  - coefficient block, forwarded to the row pass (8 rows only).
 * dest   - destination forwarded to the column pass (the helper's name
 *          suggests the result is added into it - confirm).
 * stride - forwarded to the column pass.
 *
 * NOTE(review): `input` and `dest` are used as pointers by the callees; the
 * `*` declarators seem to be missing from this copy of the signature -
 * confirm against the upstream file.
 */
891 void vp9_idct32x32_34_add_dspr2(const int16_t input, uint8_t dest,

892 int stride) {

// Intermediate buffer for the row-pass result, declared with 32-byte
// alignment.
893 DECLARE_ALIGNED(32, int16_t, out[32 * 32]);

894 int16_t *outptr = out;

895 uint32_t i;

896 uint32_t pos = 45;

897

898 /* bit position for extract from acc */

// Must run before the row pass: wrdsp with mask 1 writes `pos` (45) to the
// DSP control register used by the extp instructions.
899 __asm__ __volatile__ (

900 "wrdsp %[pos], 1 \n\t"

901 :

902 : [pos] "r" (pos)

903 );

904

905 // Rows

// Only 8 rows are transformed. Presumably each processed row fills one
// int16_t entry in every 32-entry line of `out`, so entries 0..7 of each
// line are written here - confirm against idct32_rows_dspr2.
906 idct32_rows_dspr2(input, outptr, 8);

907

// Skip the 8 entries produced above, then clear the rest of the first
// 32-entry line: twelve 32-bit stores = 48 bytes = 24 int16_t entries
// (entries 8..31).
// NOTE(review): the asm stores through outptr but lists no output operand
// and no "memory" clobber, so the compiler is not told that `out` is
// modified here - confirm this is intentional.
908 outptr += 8;

909 __asm__ __volatile__ (

910 "sw $zero, 0(%[outptr]) \n\t"

911 "sw $zero, 4(%[outptr]) \n\t"

912 "sw $zero, 8(%[outptr]) \n\t"

913 "sw $zero, 12(%[outptr]) \n\t"

914 "sw $zero, 16(%[outptr]) \n\t"

915 "sw $zero, 20(%[outptr]) \n\t"

916 "sw $zero, 24(%[outptr]) \n\t"

917 "sw $zero, 28(%[outptr]) \n\t"

918 "sw $zero, 32(%[outptr]) \n\t"

919 "sw $zero, 36(%[outptr]) \n\t"

920 "sw $zero, 40(%[outptr]) \n\t"

921 "sw $zero, 44(%[outptr]) \n\t"

922

923 :

924 : [outptr] "r" (outptr)

925 );

926

// Same clearing for the remaining 31 lines: step one 32-entry line forward
// each iteration and zero entries 8..31 of that line.
927 for (i = 0; i < 31; ++i) {

928 outptr += 32;

929

930 __asm__ __volatile__ (

931 "sw $zero, 0(%[outptr]) \n\t"

932 "sw $zero, 4(%[outptr]) \n\t"

933 "sw $zero, 8(%[outptr]) \n\t"

934 "sw $zero, 12(%[outptr]) \n\t"

935 "sw $zero, 16(%[outptr]) \n\t"

936 "sw $zero, 20(%[outptr]) \n\t"

937 "sw $zero, 24(%[outptr]) \n\t"

938 "sw $zero, 28(%[outptr]) \n\t"

939 "sw $zero, 32(%[outptr]) \n\t"

940 "sw $zero, 36(%[outptr]) \n\t"

941 "sw $zero, 40(%[outptr]) \n\t"

942 "sw $zero, 44(%[outptr]) \n\t"

943

944 :

945 : [outptr] "r" (outptr)

946 );

947 }

948

949 // Columns

950 vp9_idct32_cols_add_blk_dspr2(out, dest, stride);

951 }

952

/*
 * DC-only variant: derives a single value a1 from input[0] and applies it to
 * a 32-row by 32-byte region of `dest`, using per-byte unsigned saturating
 * arithmetic (subtract |a1| when a1 is negative, add a1 otherwise).
 *
 * input  - only input[0] is read.
 * dest   - region updated in place, accessed with 32-bit loads/stores.
 * stride - added to the dest address after each row.
 *
 * NOTE(review): `input` and `dest` are used as pointers (input[0], and dest
 * as a load/store base address); the `*` declarators seem to be missing from
 * this copy of the signature - confirm against the upstream file.
 */
953 void vp9_idct32x32_1_add_dspr2(const int16_t input, uint8_t dest,

954 int stride) {

955 int r, out;

956 int32_t a1, absa1;

957 int32_t vector_a1;

958 int32_t t1, t2, t3, t4;

959 int32_t vector_1, vector_2, vector_3, vector_4;

960 uint32_t pos = 45;

961

962 /* bit position for extract from acc */

// wrdsp with mask 1 writes `pos` (45) to the DSP control register. No extp
// is visible in this function; presumably the macro used below relies on
// it - confirm.
963 __asm__ __volatile__ (

964 "wrdsp %[pos], 1 \n\t"

965

966 :

967 : [pos] "r" (pos)

968 );

969

// Scale the DC coefficient. The macro is defined outside this chunk; its
// name suggests two rounded multiplications by cospi_16_64 - confirm.
970 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

// a1 = (out + 32) >> 6 using an arithmetic shift; `out` itself is updated
// to out + 32 as a side effect.
971 __asm__ __volatile__ (

972 "addi %[out], %[out], 32 \n\t"

973 "sra %[a1], %[out], 6 \n\t"

974

975 : [out] "+r" (out), [a1] "=r" (a1)

976 :

977 );

978

979 if (a1 < 0) {

980 /* use quad-byte

981 * input and output memory are four byte aligned */

// Negative a1: take |a1| and replicate its low byte into all four byte
// lanes of vector_a1 so four bytes can be processed per instruction.
// NOTE(review): replv.qb only replicates the low 8 bits, so |a1| > 255
// would be truncated rather than saturated - confirm the value range.
982 __asm__ __volatile__ (

983 "abs %[absa1], %[a1] \n\t"

984 "replv.qb %[vector_a1], %[absa1] \n\t"

985

986 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)

987 : [a1] "r" (a1)

988 );

989

// 32 rows. Each iteration handles 32 bytes of dest as two groups of four
// 32-bit words: load, per-byte unsigned saturating subtract of vector_a1,
// store back; then advance dest by stride.
990 for (r = 32; r--;) {

991 __asm__ __volatile__ (

992 "lw %[t1], 0(%[dest]) \n\t"

993 "lw %[t2], 4(%[dest]) \n\t"

994 "lw %[t3], 8(%[dest]) \n\t"

995 "lw %[t4], 12(%[dest]) \n\t"

996 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"

997 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"

998 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"

999 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"

1000 "sw %[vector_1], 0(%[dest]) \n\t"

1001 "sw %[vector_2], 4(%[dest]) \n\t"

1002 "sw %[vector_3], 8(%[dest]) \n\t"

1003 "sw %[vector_4], 12(%[dest]) \n\t"

1004

1005 "lw %[t1], 16(%[dest]) \n\t"

1006 "lw %[t2], 20(%[dest]) \n\t"

1007 "lw %[t3], 24(%[dest]) \n\t"

1008 "lw %[t4], 28(%[dest]) \n\t"

1009 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"

1010 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"

1011 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"

1012 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"

1013 "sw %[vector_1], 16(%[dest]) \n\t"

1014 "sw %[vector_2], 20(%[dest]) \n\t"

1015 "sw %[vector_3], 24(%[dest]) \n\t"

1016 "sw %[vector_4], 28(%[dest]) \n\t"

1017

1018 "add %[dest], %[dest], %[stride] \n\t"

1019

1020 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

1021 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

1022 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

1023 [dest] "+&r" (dest)

1024 : [stride] "r" (stride), [vector_a1] "r" (vector_a1)

1025 );

1026 }

1027 } else {

1028 /* use quad-byte

1029 * input and output memory are four byte aligned */

// Non-negative a1: replicate its low byte into all four byte lanes.
// NOTE(review): as above, only the low 8 bits of a1 are replicated, so
// a1 > 255 would be truncated - confirm the value range.
1030 __asm__ __volatile__ (

1031 "replv.qb %[vector_a1], %[a1] \n\t"

1032

1033 : [vector_a1] "=r" (vector_a1)

1034 : [a1] "r" (a1)

1035 );

1036

// Same row loop as the negative branch, but with a per-byte unsigned
// saturating add of vector_a1.
1037 for (r = 32; r--;) {

1038 __asm__ __volatile__ (

1039 "lw %[t1], 0(%[dest]) \n\t"

1040 "lw %[t2], 4(%[dest]) \n\t"

1041 "lw %[t3], 8(%[dest]) \n\t"

1042 "lw %[t4], 12(%[dest]) \n\t"

1043 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"

1044 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"

1045 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"

1046 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"

1047 "sw %[vector_1], 0(%[dest]) \n\t"

1048 "sw %[vector_2], 4(%[dest]) \n\t"

1049 "sw %[vector_3], 8(%[dest]) \n\t"

1050 "sw %[vector_4], 12(%[dest]) \n\t"

1051

1052 "lw %[t1], 16(%[dest]) \n\t"

1053 "lw %[t2], 20(%[dest]) \n\t"

1054 "lw %[t3], 24(%[dest]) \n\t"

1055 "lw %[t4], 28(%[dest]) \n\t"

1056 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"

1057 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"

1058 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"

1059 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"

1060 "sw %[vector_1], 16(%[dest]) \n\t"

1061 "sw %[vector_2], 20(%[dest]) \n\t"

1062 "sw %[vector_3], 24(%[dest]) \n\t"

1063 "sw %[vector_4], 28(%[dest]) \n\t"

1064

1065 "add %[dest], %[dest], %[stride] \n\t"

1066

1067 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

1068 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

1069 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

1070 [dest] "+&r" (dest)

1071 : [stride] "r" (stride), [vector_a1] "r" (vector_a1)

1072 );

1073 }

1074 }

1075 }

1076 #endif // #if HAVE_DSPR2

OLD	NEW