OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 #include <stdio.h> | 12 #include <stdio.h> |
13 | 13 |
14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" |
16 #include "vp9/common/vp9_common.h" | 16 #include "vp9/common/vp9_common.h" |
17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
18 #include "vp9/common/vp9_idct.h" | 18 #include "vp9/common/vp9_idct.h" |
19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | 19 #include "vpx_dsp/mips/inv_txfm_dspr2.h" |
20 #include "vpx_dsp/txfm_common.h" | 20 #include "vpx_dsp/txfm_common.h" |
21 #include "vpx_ports/mem.h" | 21 #include "vpx_ports/mem.h" |
22 | 22 |
23 #if HAVE_DSPR2 | 23 #if HAVE_DSPR2 |
/* Apply a 1-D 16-point inverse DCT to each of `no_rows` rows of `input`
 * (16 int16 coefficients per row, rows contiguous) using MIPS DSPR2
 * accumulator instructions.
 *
 * Results are written TRANSPOSED: each input row becomes one column of the
 * output (stores use a 32-byte / 16-element stride and `output` advances by
 * one element per row), so a second pass over `output` operates on columns.
 *
 * Fixed-point scheme: each accumulator is pre-loaded with 2^13 via
 * mtlo/mthi, products are accumulated with madd/msub, and `extp ..., 31`
 * extracts the rounded result (the usual DCT_CONST_ROUND >> DCT_CONST_BITS
 * step done in one instruction).  NOTE(review): exact extp bit-extraction
 * semantics per the MIPS DSP ASE spec — confirm against that manual, not
 * inferred here. */
static void idct16_rows_dspr2(const int16_t *input, int16_t *output,
                              uint32_t no_rows) {
  int i;
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  int step1_10, step1_11, step1_12, step1_13;
  int step2_0, step2_1, step2_2, step2_3;
  int step2_8, step2_9, step2_10, step2_11;
  int step2_12, step2_13, step2_14, step2_15;
  int load1, load2, load3, load4, load5, load6, load7, load8;
  int result1, result2, result3, result4;
  const int const_2_power_13 = 8192;  /* rounding bias, 2^(DCT_CONST_BITS-1) */

  for (i = no_rows; i--; ) {
    /* prefetch row */
    prefetch_load((const uint8_t *)(input + 16));

    /* Even half, stage 1: coefficients input[0], input[8], input[4],
     * input[12] (byte offsets 0/16/8/24) -> step1_0..step1_3. */
    __asm__ __volatile__ (
      "lh %[load1], 0(%[input]) \n\t"
      "lh %[load2], 16(%[input]) \n\t"
      "lh %[load3], 8(%[input]) \n\t"
      "lh %[load4], 24(%[input]) \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac2 \n\t"
      "mthi $zero, $ac2 \n\t"
      /* DC/Nyquist butterfly: (c0 +/- c8) * cospi_16_64 */
      "add %[result1], %[load1], %[load2] \n\t"
      "sub %[result2], %[load1], %[load2] \n\t"
      "madd $ac1, %[result1], %[cospi_16_64] \n\t"
      "madd $ac2, %[result2], %[cospi_16_64] \n\t"
      "extp %[step2_0], $ac1, 31 \n\t"
      "extp %[step2_1], $ac2, 31 \n\t"

      /* c4 * cospi_24 - c12 * cospi_8 */
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"
      "madd $ac3, %[load3], %[cospi_24_64] \n\t"
      "msub $ac3, %[load4], %[cospi_8_64] \n\t"
      "extp %[step2_2], $ac3, 31 \n\t"

      /* c4 * cospi_8 + c12 * cospi_24 */
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "madd $ac1, %[load3], %[cospi_8_64] \n\t"
      "madd $ac1, %[load4], %[cospi_24_64] \n\t"
      "extp %[step2_3], $ac1, 31 \n\t"

      /* stage-2 butterfly of the four even intermediates */
      "add %[step1_0], %[step2_0], %[step2_3] \n\t"
      "add %[step1_1], %[step2_1], %[step2_2] \n\t"
      "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
      "sub %[step1_3], %[step2_0], %[step2_3] \n\t"

      : [load1] "=&r" (load1), [load2] "=&r" (load2),
        [load3] "=&r" (load3), [load4] "=&r" (load4),
        [result1] "=&r" (result1), [result2] "=&r" (result2),
        [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
        [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
        [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
        [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
      : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
        [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
        [cospi_16_64] "r" (cospi_16_64)
    );

    /* Odd half, part A: input[1], input[15], input[9], input[7]
     * (byte offsets 2/30/18/14) -> step2_8, step2_9, step2_14, step2_15. */
    __asm__ __volatile__ (
      "lh %[load5], 2(%[input]) \n\t"
      "lh %[load6], 30(%[input]) \n\t"
      "lh %[load7], 18(%[input]) \n\t"
      "lh %[load8], 14(%[input]) \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      /* c1 * cospi_30 - c15 * cospi_2 */
      "madd $ac1, %[load5], %[cospi_30_64] \n\t"
      "msub $ac1, %[load6], %[cospi_2_64] \n\t"
      "extp %[result1], $ac1, 31 \n\t"

      /* c9 * cospi_14 - c7 * cospi_18 */
      "madd $ac3, %[load7], %[cospi_14_64] \n\t"
      "msub $ac3, %[load8], %[cospi_18_64] \n\t"
      "extp %[result2], $ac3, 31 \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac2 \n\t"
      "mthi $zero, $ac2 \n\t"

      /* c9 * cospi_18 + c7 * cospi_14 */
      "madd $ac1, %[load7], %[cospi_18_64] \n\t"
      "madd $ac1, %[load8], %[cospi_14_64] \n\t"
      "extp %[result3], $ac1, 31 \n\t"

      /* c1 * cospi_2 + c15 * cospi_30 */
      "madd $ac2, %[load5], %[cospi_2_64] \n\t"
      "madd $ac2, %[load6], %[cospi_30_64] \n\t"
      "extp %[result4], $ac2, 31 \n\t"

      /* differences feed the cospi_24/cospi_8 rotation below */
      "sub %[load5], %[result1], %[result2] \n\t"
      "sub %[load6], %[result4], %[result3] \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      "madd $ac1, %[load6], %[cospi_24_64] \n\t"
      "msub $ac1, %[load5], %[cospi_8_64] \n\t"
      "madd $ac3, %[load5], %[cospi_24_64] \n\t"
      "madd $ac3, %[load6], %[cospi_8_64] \n\t"

      "extp %[step2_9], $ac1, 31 \n\t"
      "extp %[step2_14], $ac3, 31 \n\t"
      "add %[step2_8], %[result1], %[result2] \n\t"
      "add %[step2_15], %[result4], %[result3] \n\t"

      : [load5] "=&r" (load5), [load6] "=&r" (load6),
        [load7] "=&r" (load7), [load8] "=&r" (load8),
        [result1] "=&r" (result1), [result2] "=&r" (result2),
        [result3] "=&r" (result3), [result4] "=&r" (result4),
        [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
        [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
      : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
        [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
        [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
        [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );

    /* Odd half, part B: input[5], input[11], input[13], input[3]
     * (byte offsets 10/22/26/6) -> step2_10..step2_13. */
    __asm__ __volatile__ (
      "lh %[load1], 10(%[input]) \n\t"
      "lh %[load2], 22(%[input]) \n\t"
      "lh %[load3], 26(%[input]) \n\t"
      "lh %[load4], 6(%[input]) \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      /* c5 * cospi_22 - c11 * cospi_10 */
      "madd $ac1, %[load1], %[cospi_22_64] \n\t"
      "msub $ac1, %[load2], %[cospi_10_64] \n\t"
      "extp %[result1], $ac1, 31 \n\t"

      /* c13 * cospi_6 - c3 * cospi_26 */
      "madd $ac3, %[load3], %[cospi_6_64] \n\t"
      "msub $ac3, %[load4], %[cospi_26_64] \n\t"
      "extp %[result2], $ac3, 31 \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac2 \n\t"
      "mthi $zero, $ac2 \n\t"

      /* c5 * cospi_10 + c11 * cospi_22 */
      "madd $ac1, %[load1], %[cospi_10_64] \n\t"
      "madd $ac1, %[load2], %[cospi_22_64] \n\t"
      "extp %[result3], $ac1, 31 \n\t"

      /* c13 * cospi_26 + c3 * cospi_6 */
      "madd $ac2, %[load3], %[cospi_26_64] \n\t"
      "madd $ac2, %[load4], %[cospi_6_64] \n\t"
      "extp %[result4], $ac2, 31 \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      /* note the operand order differs from part A (result2 - result1):
       * this branch rotates by -cospi_24/-cospi_8 via msub */
      "sub %[load1], %[result2], %[result1] \n\t"
      "sub %[load2], %[result4], %[result3] \n\t"

      "msub $ac1, %[load1], %[cospi_24_64] \n\t"
      "msub $ac1, %[load2], %[cospi_8_64] \n\t"
      "madd $ac3, %[load2], %[cospi_24_64] \n\t"
      "msub $ac3, %[load1], %[cospi_8_64] \n\t"

      "extp %[step2_10], $ac1, 31 \n\t"
      "extp %[step2_13], $ac3, 31 \n\t"
      "add %[step2_11], %[result1], %[result2] \n\t"
      "add %[step2_12], %[result4], %[result3] \n\t"

      : [load1] "=&r" (load1), [load2] "=&r" (load2),
        [load3] "=&r" (load3), [load4] "=&r" (load4),
        [result1] "=&r" (result1), [result2] "=&r" (result2),
        [result3] "=&r" (result3), [result4] "=&r" (result4),
        [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
        [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
      : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
        [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
        [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
        [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );

    /* Even half, stage 2: input[2], input[14], input[10], input[6]
     * (byte offsets 4/28/20/12) -> step1_4..step1_7; the middle pair
     * (step1_5/step1_6) gets the final cospi_16 rotation. */
    __asm__ __volatile__ (
      "lh %[load5], 4(%[input]) \n\t"
      "lh %[load6], 28(%[input]) \n\t"
      "lh %[load7], 20(%[input]) \n\t"
      "lh %[load8], 12(%[input]) \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      /* c2 * cospi_28 - c14 * cospi_4 */
      "madd $ac1, %[load5], %[cospi_28_64] \n\t"
      "msub $ac1, %[load6], %[cospi_4_64] \n\t"
      "extp %[result1], $ac1, 31 \n\t"

      /* c10 * cospi_12 - c6 * cospi_20 */
      "madd $ac3, %[load7], %[cospi_12_64] \n\t"
      "msub $ac3, %[load8], %[cospi_20_64] \n\t"
      "extp %[result2], $ac3, 31 \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac2 \n\t"
      "mthi $zero, $ac2 \n\t"

      /* c10 * cospi_20 + c6 * cospi_12 */
      "madd $ac1, %[load7], %[cospi_20_64] \n\t"
      "madd $ac1, %[load8], %[cospi_12_64] \n\t"
      "extp %[result3], $ac1, 31 \n\t"

      /* c2 * cospi_4 + c14 * cospi_28 */
      "madd $ac2, %[load5], %[cospi_4_64] \n\t"
      "madd $ac2, %[load6], %[cospi_28_64] \n\t"
      "extp %[result4], $ac2, 31 \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      /* (r4 - r3 - r1 + r2) and (r1 - r2 - r3 + r4), each * cospi_16 */
      "sub %[load5], %[result4], %[result3] \n\t"
      "sub %[load5], %[load5], %[result1] \n\t"
      "add %[load5], %[load5], %[result2] \n\t"

      "sub %[load6], %[result1], %[result2] \n\t"
      "sub %[load6], %[load6], %[result3] \n\t"
      "add %[load6], %[load6], %[result4] \n\t"

      "madd $ac1, %[load5], %[cospi_16_64] \n\t"
      "madd $ac3, %[load6], %[cospi_16_64] \n\t"

      "extp %[step1_5], $ac1, 31 \n\t"
      "extp %[step1_6], $ac3, 31 \n\t"
      "add %[step1_4], %[result1], %[result2] \n\t"
      "add %[step1_7], %[result4], %[result3] \n\t"

      : [load5] "=&r" (load5), [load6] "=&r" (load6),
        [load7] "=&r" (load7), [load8] "=&r" (load8),
        [result1] "=&r" (result1), [result2] "=&r" (result2),
        [result3] "=&r" (result3), [result4] "=&r" (result4),
        [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
        [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
      : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
        [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
        [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
        [cospi_16_64] "r" (cospi_16_64)
    );

    /* Odd half, stage 3: cross-combine step2_8..step2_15 with cospi_16
     * to form the middle odd outputs step1_10..step1_13 (uses all four
     * accumulators $ac0..$ac3 in parallel). */
    __asm__ __volatile__ (
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"

      "sub %[load5], %[step2_14], %[step2_13] \n\t"
      "sub %[load5], %[load5], %[step2_9] \n\t"
      "add %[load5], %[load5], %[step2_10] \n\t"

      "madd $ac0, %[load5], %[cospi_16_64] \n\t"

      "sub %[load6], %[step2_14], %[step2_13] \n\t"
      "sub %[load6], %[load6], %[step2_10] \n\t"
      "add %[load6], %[load6], %[step2_9] \n\t"

      "madd $ac1, %[load6], %[cospi_16_64] \n\t"

      "mtlo %[const_2_power_13], $ac2 \n\t"
      "mthi $zero, $ac2 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      "sub %[load5], %[step2_15], %[step2_12] \n\t"
      "sub %[load5], %[load5], %[step2_8] \n\t"
      "add %[load5], %[load5], %[step2_11] \n\t"

      "madd $ac2, %[load5], %[cospi_16_64] \n\t"

      "sub %[load6], %[step2_15], %[step2_12] \n\t"
      "sub %[load6], %[load6], %[step2_11] \n\t"
      "add %[load6], %[load6], %[step2_8] \n\t"

      "madd $ac3, %[load6], %[cospi_16_64] \n\t"

      "extp %[step1_10], $ac0, 31 \n\t"
      "extp %[step1_13], $ac1, 31 \n\t"
      "extp %[step1_11], $ac2, 31 \n\t"
      "extp %[step1_12], $ac3, 31 \n\t"

      : [load5] "=&r" (load5), [load6] "=&r" (load6),
        [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
        [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
      : [const_2_power_13] "r" (const_2_power_13),
        [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
        [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
        [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
        [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
        [cospi_16_64] "r" (cospi_16_64)
    );

    /* Final butterfly + store, outer rows: writes output elements
     * 0,16,96,112,128,144,224,240 (byte offsets 0..480, stride 32 bytes),
     * i.e. transposed rows 0,1,6,7,8,9,14,15. */
    __asm__ __volatile__ (
      "add %[load5], %[step1_0], %[step1_7] \n\t"
      "add %[load5], %[load5], %[step2_12] \n\t"
      "add %[load5], %[load5], %[step2_15] \n\t"
      "add %[load6], %[step1_1], %[step1_6] \n\t"
      "add %[load6], %[load6], %[step2_13] \n\t"
      "add %[load6], %[load6], %[step2_14] \n\t"
      "sh %[load5], 0(%[output]) \n\t"
      "sh %[load6], 32(%[output]) \n\t"
      "sub %[load5], %[step1_1], %[step1_6] \n\t"
      "add %[load5], %[load5], %[step2_9] \n\t"
      "add %[load5], %[load5], %[step2_10] \n\t"
      "sub %[load6], %[step1_0], %[step1_7] \n\t"
      "add %[load6], %[load6], %[step2_8] \n\t"
      "add %[load6], %[load6], %[step2_11] \n\t"
      "sh %[load5], 192(%[output]) \n\t"
      "sh %[load6], 224(%[output]) \n\t"
      "sub %[load5], %[step1_0], %[step1_7] \n\t"
      "sub %[load5], %[load5], %[step2_8] \n\t"
      "sub %[load5], %[load5], %[step2_11] \n\t"
      "sub %[load6], %[step1_1], %[step1_6] \n\t"
      "sub %[load6], %[load6], %[step2_9] \n\t"
      "sub %[load6], %[load6], %[step2_10] \n\t"
      "sh %[load5], 256(%[output]) \n\t"
      "sh %[load6], 288(%[output]) \n\t"
      "add %[load5], %[step1_1], %[step1_6] \n\t"
      "sub %[load5], %[load5], %[step2_13] \n\t"
      "sub %[load5], %[load5], %[step2_14] \n\t"
      "add %[load6], %[step1_0], %[step1_7] \n\t"
      "sub %[load6], %[load6], %[step2_12] \n\t"
      "sub %[load6], %[load6], %[step2_15] \n\t"
      "sh %[load5], 448(%[output]) \n\t"
      "sh %[load6], 480(%[output]) \n\t"

      : [load5] "=&r" (load5), [load6] "=&r" (load6)
      : [output] "r" (output),
        [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
        [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
        [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
        [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
        [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
        [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
    );

    /* Final butterfly + store, inner rows: transposed rows
     * 2,3,4,5,10,11,12,13 (byte offsets 64..416). */
    __asm__ __volatile__ (
      "add %[load5], %[step1_2], %[step1_5] \n\t"
      "add %[load5], %[load5], %[step1_13] \n\t"
      "add %[load6], %[step1_3], %[step1_4] \n\t"
      "add %[load6], %[load6], %[step1_12] \n\t"
      "sh %[load5], 64(%[output]) \n\t"
      "sh %[load6], 96(%[output]) \n\t"
      "sub %[load5], %[step1_3], %[step1_4] \n\t"
      "add %[load5], %[load5], %[step1_11] \n\t"
      "sub %[load6], %[step1_2], %[step1_5] \n\t"
      "add %[load6], %[load6], %[step1_10] \n\t"
      "sh %[load5], 128(%[output]) \n\t"
      "sh %[load6], 160(%[output]) \n\t"
      "sub %[load5], %[step1_2], %[step1_5] \n\t"
      "sub %[load5], %[load5], %[step1_10] \n\t"
      "sub %[load6], %[step1_3], %[step1_4] \n\t"
      "sub %[load6], %[load6], %[step1_11] \n\t"
      "sh %[load5], 320(%[output]) \n\t"
      "sh %[load6], 352(%[output]) \n\t"
      "add %[load5], %[step1_3], %[step1_4] \n\t"
      "sub %[load5], %[load5], %[step1_12] \n\t"
      "add %[load6], %[step1_2], %[step1_5] \n\t"
      "sub %[load6], %[load6], %[step1_13] \n\t"
      "sh %[load5], 384(%[output]) \n\t"
      "sh %[load6], 416(%[output]) \n\t"

      : [load5] "=&r" (load5), [load6] "=&r" (load6)
      : [output] "r" (output),
        [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
        [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
        [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
        [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
    );

    input += 16;   /* next input row */
    output += 1;   /* next output column (transposed layout) */
  }
}
408 | |
409 static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, | |
410 int dest_stride) { | |
411 int i; | |
412 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | |
413 int step1_8, step1_9, step1_10, step1_11; | |
414 int step1_12, step1_13, step1_14, step1_15; | |
415 int step2_0, step2_1, step2_2, step2_3; | |
416 int step2_8, step2_9, step2_10, step2_11; | |
417 int step2_12, step2_13, step2_14, step2_15; | |
418 int load1, load2, load3, load4, load5, load6, load7, load8; | |
419 int result1, result2, result3, result4; | |
420 const int const_2_power_13 = 8192; | |
421 uint8_t *dest_pix; | |
422 uint8_t *cm = vpx_ff_cropTbl; | |
423 | |
424 /* prefetch vpx_ff_cropTbl */ | |
425 prefetch_load(vpx_ff_cropTbl); | |
426 prefetch_load(vpx_ff_cropTbl + 32); | |
427 prefetch_load(vpx_ff_cropTbl + 64); | |
428 prefetch_load(vpx_ff_cropTbl + 96); | |
429 prefetch_load(vpx_ff_cropTbl + 128); | |
430 prefetch_load(vpx_ff_cropTbl + 160); | |
431 prefetch_load(vpx_ff_cropTbl + 192); | |
432 prefetch_load(vpx_ff_cropTbl + 224); | |
433 | |
434 for (i = 0; i < 16; ++i) { | |
435 dest_pix = (dest + i); | |
436 __asm__ __volatile__ ( | |
437 "lh %[load1], 0(%[input]) \n\t" | |
438 "lh %[load2], 16(%[input]) \n\t" | |
439 "lh %[load3], 8(%[input]) \n\t" | |
440 "lh %[load4], 24(%[input]) \n\t" | |
441 | |
442 "mtlo %[const_2_power_13], $ac1 \n\t" | |
443 "mthi $zero, $ac1 \n\t" | |
444 "mtlo %[const_2_power_13], $ac2 \n\t" | |
445 "mthi $zero, $ac2 \n\t" | |
446 "add %[result1], %[load1], %[load2] \n\t" | |
447 "sub %[result2], %[load1], %[load2] \n\t" | |
448 "madd $ac1, %[result1], %[cospi_16_64] \n\t" | |
449 "madd $ac2, %[result2], %[cospi_16_64] \n\t" | |
450 "extp %[step2_0], $ac1, 31 \n\t" | |
451 "extp %[step2_1], $ac2, 31 \n\t" | |
452 | |
453 "mtlo %[const_2_power_13], $ac3 \n\t" | |
454 "mthi $zero, $ac3 \n\t" | |
455 "madd $ac3, %[load3], %[cospi_24_64] \n\t" | |
456 "msub $ac3, %[load4], %[cospi_8_64] \n\t" | |
457 "extp %[step2_2], $ac3, 31 \n\t" | |
458 | |
459 "mtlo %[const_2_power_13], $ac1 \n\t" | |
460 "mthi $zero, $ac1 \n\t" | |
461 "madd $ac1, %[load3], %[cospi_8_64] \n\t" | |
462 "madd $ac1, %[load4], %[cospi_24_64] \n\t" | |
463 "extp %[step2_3], $ac1, 31 \n\t" | |
464 | |
465 "add %[step1_0], %[step2_0], %[step2_3] \n\t" | |
466 "add %[step1_1], %[step2_1], %[step2_2] \n\t" | |
467 "sub %[step1_2], %[step2_1], %[step2_2] \n\t" | |
468 "sub %[step1_3], %[step2_0], %[step2_3] \n\t" | |
469 | |
470 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
471 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
472 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
473 [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), | |
474 [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), | |
475 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), | |
476 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) | |
477 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
478 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), | |
479 [cospi_16_64] "r" (cospi_16_64) | |
480 ); | |
481 | |
482 __asm__ __volatile__ ( | |
483 "lh %[load5], 2(%[input]) \n\t" | |
484 "lh %[load6], 30(%[input]) \n\t" | |
485 "lh %[load7], 18(%[input]) \n\t" | |
486 "lh %[load8], 14(%[input]) \n\t" | |
487 | |
488 "mtlo %[const_2_power_13], $ac1 \n\t" | |
489 "mthi $zero, $ac1 \n\t" | |
490 "mtlo %[const_2_power_13], $ac3 \n\t" | |
491 "mthi $zero, $ac3 \n\t" | |
492 | |
493 "madd $ac1, %[load5], %[cospi_30_64] \n\t" | |
494 "msub $ac1, %[load6], %[cospi_2_64] \n\t" | |
495 "extp %[result1], $ac1, 31 \n\t" | |
496 | |
497 "madd $ac3, %[load7], %[cospi_14_64] \n\t" | |
498 "msub $ac3, %[load8], %[cospi_18_64] \n\t" | |
499 "extp %[result2], $ac3, 31 \n\t" | |
500 | |
501 "mtlo %[const_2_power_13], $ac1 \n\t" | |
502 "mthi $zero, $ac1 \n\t" | |
503 "mtlo %[const_2_power_13], $ac2 \n\t" | |
504 "mthi $zero, $ac2 \n\t" | |
505 | |
506 "madd $ac1, %[load7], %[cospi_18_64] \n\t" | |
507 "madd $ac1, %[load8], %[cospi_14_64] \n\t" | |
508 "extp %[result3], $ac1, 31 \n\t" | |
509 | |
510 "madd $ac2, %[load5], %[cospi_2_64] \n\t" | |
511 "madd $ac2, %[load6], %[cospi_30_64] \n\t" | |
512 "extp %[result4], $ac2, 31 \n\t" | |
513 | |
514 "sub %[load5], %[result1], %[result2] \n\t" | |
515 "sub %[load6], %[result4], %[result3] \n\t" | |
516 | |
517 "mtlo %[const_2_power_13], $ac1 \n\t" | |
518 "mthi $zero, $ac1 \n\t" | |
519 "mtlo %[const_2_power_13], $ac3 \n\t" | |
520 "mthi $zero, $ac3 \n\t" | |
521 | |
522 "madd $ac1, %[load6], %[cospi_24_64] \n\t" | |
523 "msub $ac1, %[load5], %[cospi_8_64] \n\t" | |
524 "madd $ac3, %[load5], %[cospi_24_64] \n\t" | |
525 "madd $ac3, %[load6], %[cospi_8_64] \n\t" | |
526 | |
527 "extp %[step2_9], $ac1, 31 \n\t" | |
528 "extp %[step2_14], $ac3, 31 \n\t" | |
529 "add %[step2_8], %[result1], %[result2] \n\t" | |
530 "add %[step2_15], %[result4], %[result3] \n\t" | |
531 | |
532 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
533 [load7] "=&r" (load7), [load8] "=&r" (load8), | |
534 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
535 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
536 [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), | |
537 [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) | |
538 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
539 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), | |
540 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), | |
541 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
542 ); | |
543 | |
544 __asm__ __volatile__ ( | |
545 "lh %[load1], 10(%[input]) \n\t" | |
546 "lh %[load2], 22(%[input]) \n\t" | |
547 "lh %[load3], 26(%[input]) \n\t" | |
548 "lh %[load4], 6(%[input]) \n\t" | |
549 | |
550 "mtlo %[const_2_power_13], $ac1 \n\t" | |
551 "mthi $zero, $ac1 \n\t" | |
552 "mtlo %[const_2_power_13], $ac3 \n\t" | |
553 "mthi $zero, $ac3 \n\t" | |
554 | |
555 "madd $ac1, %[load1], %[cospi_22_64] \n\t" | |
556 "msub $ac1, %[load2], %[cospi_10_64] \n\t" | |
557 "extp %[result1], $ac1, 31 \n\t" | |
558 | |
559 "madd $ac3, %[load3], %[cospi_6_64] \n\t" | |
560 "msub $ac3, %[load4], %[cospi_26_64] \n\t" | |
561 "extp %[result2], $ac3, 31 \n\t" | |
562 | |
563 "mtlo %[const_2_power_13], $ac1 \n\t" | |
564 "mthi $zero, $ac1 \n\t" | |
565 "mtlo %[const_2_power_13], $ac2 \n\t" | |
566 "mthi $zero, $ac2 \n\t" | |
567 | |
568 "madd $ac1, %[load1], %[cospi_10_64] \n\t" | |
569 "madd $ac1, %[load2], %[cospi_22_64] \n\t" | |
570 "extp %[result3], $ac1, 31 \n\t" | |
571 | |
572 "madd $ac2, %[load3], %[cospi_26_64] \n\t" | |
573 "madd $ac2, %[load4], %[cospi_6_64] \n\t" | |
574 "extp %[result4], $ac2, 31 \n\t" | |
575 | |
576 "mtlo %[const_2_power_13], $ac1 \n\t" | |
577 "mthi $zero, $ac1 \n\t" | |
578 "mtlo %[const_2_power_13], $ac3 \n\t" | |
579 "mthi $zero, $ac3 \n\t" | |
580 | |
581 "sub %[load1], %[result2], %[result1] \n\t" | |
582 "sub %[load2], %[result4], %[result3] \n\t" | |
583 | |
584 "msub $ac1, %[load1], %[cospi_24_64] \n\t" | |
585 "msub $ac1, %[load2], %[cospi_8_64] \n\t" | |
586 "madd $ac3, %[load2], %[cospi_24_64] \n\t" | |
587 "msub $ac3, %[load1], %[cospi_8_64] \n\t" | |
588 | |
589 "extp %[step2_10], $ac1, 31 \n\t" | |
590 "extp %[step2_13], $ac3, 31 \n\t" | |
591 "add %[step2_11], %[result1], %[result2] \n\t" | |
592 "add %[step2_12], %[result4], %[result3] \n\t" | |
593 | |
594 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
595 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
596 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
597 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
598 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), | |
599 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) | |
600 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
601 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), | |
602 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), | |
603 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
604 ); | |
605 | |
606 __asm__ __volatile__ ( | |
607 "lh %[load5], 4(%[input]) \n\t" | |
608 "lh %[load6], 28(%[input]) \n\t" | |
609 "lh %[load7], 20(%[input]) \n\t" | |
610 "lh %[load8], 12(%[input]) \n\t" | |
611 | |
612 "mtlo %[const_2_power_13], $ac1 \n\t" | |
613 "mthi $zero, $ac1 \n\t" | |
614 "mtlo %[const_2_power_13], $ac3 \n\t" | |
615 "mthi $zero, $ac3 \n\t" | |
616 | |
617 "madd $ac1, %[load5], %[cospi_28_64] \n\t" | |
618 "msub $ac1, %[load6], %[cospi_4_64] \n\t" | |
619 "extp %[result1], $ac1, 31 \n\t" | |
620 | |
621 "madd $ac3, %[load7], %[cospi_12_64] \n\t" | |
622 "msub $ac3, %[load8], %[cospi_20_64] \n\t" | |
623 "extp %[result2], $ac3, 31 \n\t" | |
624 | |
625 "mtlo %[const_2_power_13], $ac1 \n\t" | |
626 "mthi $zero, $ac1 \n\t" | |
627 "mtlo %[const_2_power_13], $ac2 \n\t" | |
628 "mthi $zero, $ac2 \n\t" | |
629 | |
630 "madd $ac1, %[load7], %[cospi_20_64] \n\t" | |
631 "madd $ac1, %[load8], %[cospi_12_64] \n\t" | |
632 "extp %[result3], $ac1, 31 \n\t" | |
633 | |
634 "madd $ac2, %[load5], %[cospi_4_64] \n\t" | |
635 "madd $ac2, %[load6], %[cospi_28_64] \n\t" | |
636 "extp %[result4], $ac2, 31 \n\t" | |
637 | |
638 "mtlo %[const_2_power_13], $ac1 \n\t" | |
639 "mthi $zero, $ac1 \n\t" | |
640 "mtlo %[const_2_power_13], $ac3 \n\t" | |
641 "mthi $zero, $ac3 \n\t" | |
642 | |
643 "sub %[load5], %[result4], %[result3] \n\t" | |
644 "sub %[load5], %[load5], %[result1] \n\t" | |
645 "add %[load5], %[load5], %[result2] \n\t" | |
646 | |
647 "sub %[load6], %[result1], %[result2] \n\t" | |
648 "sub %[load6], %[load6], %[result3] \n\t" | |
649 "add %[load6], %[load6], %[result4] \n\t" | |
650 | |
651 "madd $ac1, %[load5], %[cospi_16_64] \n\t" | |
652 "madd $ac3, %[load6], %[cospi_16_64] \n\t" | |
653 | |
654 "extp %[step1_5], $ac1, 31 \n\t" | |
655 "extp %[step1_6], $ac3, 31 \n\t" | |
656 | |
657 "add %[step1_4], %[result1], %[result2] \n\t" | |
658 "add %[step1_7], %[result4], %[result3] \n\t" | |
659 | |
660 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
661 [load7] "=&r" (load7), [load8] "=&r" (load8), | |
662 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
663 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
664 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), | |
665 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) | |
666 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
667 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), | |
668 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), | |
669 [cospi_16_64] "r" (cospi_16_64) | |
670 ); | |
671 | |
672 __asm__ __volatile__ ( | |
673 "mtlo %[const_2_power_13], $ac0 \n\t" | |
674 "mthi $zero, $ac0 \n\t" | |
675 "mtlo %[const_2_power_13], $ac1 \n\t" | |
676 "mthi $zero, $ac1 \n\t" | |
677 | |
678 "sub %[load5], %[step2_14], %[step2_13] \n\t" | |
679 "sub %[load5], %[load5], %[step2_9] \n\t" | |
680 "add %[load5], %[load5], %[step2_10] \n\t" | |
681 | |
682 "madd $ac0, %[load5], %[cospi_16_64] \n\t" | |
683 | |
684 "sub %[load6], %[step2_14], %[step2_13] \n\t" | |
685 "sub %[load6], %[load6], %[step2_10] \n\t" | |
686 "add %[load6], %[load6], %[step2_9] \n\t" | |
687 | |
688 "madd $ac1, %[load6], %[cospi_16_64] \n\t" | |
689 | |
690 "mtlo %[const_2_power_13], $ac2 \n\t" | |
691 "mthi $zero, $ac2 \n\t" | |
692 "mtlo %[const_2_power_13], $ac3 \n\t" | |
693 "mthi $zero, $ac3 \n\t" | |
694 | |
695 "sub %[load5], %[step2_15], %[step2_12] \n\t" | |
696 "sub %[load5], %[load5], %[step2_8] \n\t" | |
697 "add %[load5], %[load5], %[step2_11] \n\t" | |
698 | |
699 "madd $ac2, %[load5], %[cospi_16_64] \n\t" | |
700 | |
701 "sub %[load6], %[step2_15], %[step2_12] \n\t" | |
702 "sub %[load6], %[load6], %[step2_11] \n\t" | |
703 "add %[load6], %[load6], %[step2_8] \n\t" | |
704 | |
705 "madd $ac3, %[load6], %[cospi_16_64] \n\t" | |
706 | |
707 "extp %[step1_10], $ac0, 31 \n\t" | |
708 "extp %[step1_13], $ac1, 31 \n\t" | |
709 "extp %[step1_11], $ac2, 31 \n\t" | |
710 "extp %[step1_12], $ac3, 31 \n\t" | |
711 | |
712 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
713 [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), | |
714 [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) | |
715 : [const_2_power_13] "r" (const_2_power_13), | |
716 [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), | |
717 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), | |
718 [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), | |
719 [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), | |
720 [cospi_16_64] "r" (cospi_16_64) | |
721 ); | |
722 | |
723 step1_8 = step2_8 + step2_11; | |
724 step1_9 = step2_9 + step2_10; | |
725 step1_14 = step2_13 + step2_14; | |
726 step1_15 = step2_12 + step2_15; | |
727 | |
728 __asm__ __volatile__ ( | |
729 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
730 "add %[load5], %[step1_0], %[step1_7] \n\t" | |
731 "add %[load5], %[load5], %[step1_15] \n\t" | |
732 "addi %[load5], %[load5], 32 \n\t" | |
733 "sra %[load5], %[load5], 6 \n\t" | |
734 "add %[load7], %[load7], %[load5] \n\t" | |
735 "lbux %[load5], %[load7](%[cm]) \n\t" | |
736 "add %[load6], %[step1_1], %[step1_6] \n\t" | |
737 "add %[load6], %[load6], %[step1_14] \n\t" | |
738 "sb %[load5], 0(%[dest_pix]) \n\t" | |
739 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
740 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
741 "addi %[load6], %[load6], 32 \n\t" | |
742 "sra %[load6], %[load6], 6 \n\t" | |
743 "add %[load8], %[load8], %[load6] \n\t" | |
744 "lbux %[load6], %[load8](%[cm]) \n\t" | |
745 "sb %[load6], 0(%[dest_pix]) \n\t" | |
746 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
747 | |
748 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
749 "add %[load5], %[step1_2], %[step1_5] \n\t" | |
750 "add %[load5], %[load5], %[step1_13] \n\t" | |
751 "addi %[load5], %[load5], 32 \n\t" | |
752 "sra %[load5], %[load5], 6 \n\t" | |
753 "add %[load7], %[load7], %[load5] \n\t" | |
754 "lbux %[load5], %[load7](%[cm]) \n\t" | |
755 "add %[load6], %[step1_3], %[step1_4] \n\t" | |
756 "add %[load6], %[load6], %[step1_12] \n\t" | |
757 "sb %[load5], 0(%[dest_pix]) \n\t" | |
758 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
759 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
760 "addi %[load6], %[load6], 32 \n\t" | |
761 "sra %[load6], %[load6], 6 \n\t" | |
762 "add %[load8], %[load8], %[load6] \n\t" | |
763 "lbux %[load6], %[load8](%[cm]) \n\t" | |
764 "sb %[load6], 0(%[dest_pix]) \n\t" | |
765 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
766 | |
767 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
768 "sub %[load5], %[step1_3], %[step1_4] \n\t" | |
769 "add %[load5], %[load5], %[step1_11] \n\t" | |
770 "addi %[load5], %[load5], 32 \n\t" | |
771 "sra %[load5], %[load5], 6 \n\t" | |
772 "add %[load7], %[load7], %[load5] \n\t" | |
773 "lbux %[load5], %[load7](%[cm]) \n\t" | |
774 "sub %[load6], %[step1_2], %[step1_5] \n\t" | |
775 "add %[load6], %[load6], %[step1_10] \n\t" | |
776 "sb %[load5], 0(%[dest_pix]) \n\t" | |
777 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
778 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
779 "addi %[load6], %[load6], 32 \n\t" | |
780 "sra %[load6], %[load6], 6 \n\t" | |
781 "add %[load8], %[load8], %[load6] \n\t" | |
782 "lbux %[load6], %[load8](%[cm]) \n\t" | |
783 "sb %[load6], 0(%[dest_pix]) \n\t" | |
784 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
785 | |
786 "sub %[load5], %[step1_1], %[step1_6] \n\t" | |
787 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
788 "add %[load5], %[load5], %[step1_9] \n\t" | |
789 "addi %[load5], %[load5], 32 \n\t" | |
790 "sra %[load5], %[load5], 6 \n\t" | |
791 "add %[load7], %[load7], %[load5] \n\t" | |
792 "lbux %[load5], %[load7](%[cm]) \n\t" | |
793 "sub %[load6], %[step1_0], %[step1_7] \n\t" | |
794 "add %[load6], %[load6], %[step1_8] \n\t" | |
795 "sb %[load5], 0(%[dest_pix]) \n\t" | |
796 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
797 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
798 "addi %[load6], %[load6], 32 \n\t" | |
799 "sra %[load6], %[load6], 6 \n\t" | |
800 "add %[load8], %[load8], %[load6] \n\t" | |
801 "lbux %[load6], %[load8](%[cm]) \n\t" | |
802 "sb %[load6], 0(%[dest_pix]) \n\t" | |
803 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
804 | |
805 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
806 "sub %[load5], %[step1_0], %[step1_7] \n\t" | |
807 "sub %[load5], %[load5], %[step1_8] \n\t" | |
808 "addi %[load5], %[load5], 32 \n\t" | |
809 "sra %[load5], %[load5], 6 \n\t" | |
810 "add %[load7], %[load7], %[load5] \n\t" | |
811 "lbux %[load5], %[load7](%[cm]) \n\t" | |
812 "sub %[load6], %[step1_1], %[step1_6] \n\t" | |
813 "sub %[load6], %[load6], %[step1_9] \n\t" | |
814 "sb %[load5], 0(%[dest_pix]) \n\t" | |
815 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
816 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
817 "addi %[load6], %[load6], 32 \n\t" | |
818 "sra %[load6], %[load6], 6 \n\t" | |
819 "add %[load8], %[load8], %[load6] \n\t" | |
820 "lbux %[load6], %[load8](%[cm]) \n\t" | |
821 "sb %[load6], 0(%[dest_pix]) \n\t" | |
822 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
823 | |
824 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
825 "sub %[load5], %[step1_2], %[step1_5] \n\t" | |
826 "sub %[load5], %[load5], %[step1_10] \n\t" | |
827 "addi %[load5], %[load5], 32 \n\t" | |
828 "sra %[load5], %[load5], 6 \n\t" | |
829 "add %[load7], %[load7], %[load5] \n\t" | |
830 "lbux %[load5], %[load7](%[cm]) \n\t" | |
831 "sub %[load6], %[step1_3], %[step1_4] \n\t" | |
832 "sub %[load6], %[load6], %[step1_11] \n\t" | |
833 "sb %[load5], 0(%[dest_pix]) \n\t" | |
834 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
835 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
836 "addi %[load6], %[load6], 32 \n\t" | |
837 "sra %[load6], %[load6], 6 \n\t" | |
838 "add %[load8], %[load8], %[load6] \n\t" | |
839 "lbux %[load6], %[load8](%[cm]) \n\t" | |
840 "sb %[load6], 0(%[dest_pix]) \n\t" | |
841 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
842 | |
843 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
844 "add %[load5], %[step1_3], %[step1_4] \n\t" | |
845 "sub %[load5], %[load5], %[step1_12] \n\t" | |
846 "addi %[load5], %[load5], 32 \n\t" | |
847 "sra %[load5], %[load5], 6 \n\t" | |
848 "add %[load7], %[load7], %[load5] \n\t" | |
849 "lbux %[load5], %[load7](%[cm]) \n\t" | |
850 "add %[load6], %[step1_2], %[step1_5] \n\t" | |
851 "sub %[load6], %[load6], %[step1_13] \n\t" | |
852 "sb %[load5], 0(%[dest_pix]) \n\t" | |
853 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
854 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
855 "addi %[load6], %[load6], 32 \n\t" | |
856 "sra %[load6], %[load6], 6 \n\t" | |
857 "add %[load8], %[load8], %[load6] \n\t" | |
858 "lbux %[load6], %[load8](%[cm]) \n\t" | |
859 "sb %[load6], 0(%[dest_pix]) \n\t" | |
860 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
861 | |
862 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
863 "add %[load5], %[step1_1], %[step1_6] \n\t" | |
864 "sub %[load5], %[load5], %[step1_14] \n\t" | |
865 "addi %[load5], %[load5], 32 \n\t" | |
866 "sra %[load5], %[load5], 6 \n\t" | |
867 "add %[load7], %[load7], %[load5] \n\t" | |
868 "lbux %[load5], %[load7](%[cm]) \n\t" | |
869 "add %[load6], %[step1_0], %[step1_7] \n\t" | |
870 "sub %[load6], %[load6], %[step1_15] \n\t" | |
871 "sb %[load5], 0(%[dest_pix]) \n\t" | |
872 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
873 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
874 "addi %[load6], %[load6], 32 \n\t" | |
875 "sra %[load6], %[load6], 6 \n\t" | |
876 "add %[load8], %[load8], %[load6] \n\t" | |
877 "lbux %[load6], %[load8](%[cm]) \n\t" | |
878 "sb %[load6], 0(%[dest_pix]) \n\t" | |
879 | |
880 : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7), | |
881 [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) | |
882 : [cm] "r" (cm), [dest_stride] "r" (dest_stride), | |
883 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), | |
884 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), | |
885 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), | |
886 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), | |
887 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), | |
888 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | |
889 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), | |
890 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) | |
891 ); | |
892 | |
893 input += 16; | |
894 } | |
895 } | |
896 | |
/* Full 16x16 inverse DCT with reconstruction for the MIPS DSPr2 path:
 * dest is updated in place with the clipped sum of its current pixels
 * and the inverse-transformed residual.
 *
 * input       - 16x16 block of dequantized transform coefficients.
 * dest        - 8-bit reconstruction buffer, updated in place.
 * dest_stride - stride (in bytes) between rows of dest.
 */
void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
                                 int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);  /* intermediate row-pass output */
  uint32_t pos = 45;

  /* Bit position for "extract from accumulator": program the DSPControl
   * pos field (wrdsp mask 1) before running the transform helpers, which
   * presumably use extp-style extraction — order matters here. */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  // First pass: 1-D idct on all 16 rows into 'out'.
  idct16_rows_dspr2(input, out, 16);

  // Second pass: 1-D idct on columns, adding the result into dest.
  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
}
915 | |
916 static void iadst16(const int16_t *input, int16_t *output) { | |
917 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | |
918 | |
919 int x0 = input[15]; | |
920 int x1 = input[0]; | |
921 int x2 = input[13]; | |
922 int x3 = input[2]; | |
923 int x4 = input[11]; | |
924 int x5 = input[4]; | |
925 int x6 = input[9]; | |
926 int x7 = input[6]; | |
927 int x8 = input[7]; | |
928 int x9 = input[8]; | |
929 int x10 = input[5]; | |
930 int x11 = input[10]; | |
931 int x12 = input[3]; | |
932 int x13 = input[12]; | |
933 int x14 = input[1]; | |
934 int x15 = input[14]; | |
935 | |
936 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | |
937 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { | |
938 output[0] = output[1] = output[2] = output[3] = output[4] | |
939 = output[5] = output[6] = output[7] = output[8] | |
940 = output[9] = output[10] = output[11] = output[12] | |
941 = output[13] = output[14] = output[15] = 0; | |
942 return; | |
943 } | |
944 | |
945 // stage 1 | |
946 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; | |
947 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; | |
948 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; | |
949 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; | |
950 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; | |
951 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; | |
952 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; | |
953 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; | |
954 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; | |
955 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; | |
956 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; | |
957 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; | |
958 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; | |
959 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; | |
960 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; | |
961 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; | |
962 | |
963 x0 = dct_const_round_shift(s0 + s8); | |
964 x1 = dct_const_round_shift(s1 + s9); | |
965 x2 = dct_const_round_shift(s2 + s10); | |
966 x3 = dct_const_round_shift(s3 + s11); | |
967 x4 = dct_const_round_shift(s4 + s12); | |
968 x5 = dct_const_round_shift(s5 + s13); | |
969 x6 = dct_const_round_shift(s6 + s14); | |
970 x7 = dct_const_round_shift(s7 + s15); | |
971 x8 = dct_const_round_shift(s0 - s8); | |
972 x9 = dct_const_round_shift(s1 - s9); | |
973 x10 = dct_const_round_shift(s2 - s10); | |
974 x11 = dct_const_round_shift(s3 - s11); | |
975 x12 = dct_const_round_shift(s4 - s12); | |
976 x13 = dct_const_round_shift(s5 - s13); | |
977 x14 = dct_const_round_shift(s6 - s14); | |
978 x15 = dct_const_round_shift(s7 - s15); | |
979 | |
980 // stage 2 | |
981 s0 = x0; | |
982 s1 = x1; | |
983 s2 = x2; | |
984 s3 = x3; | |
985 s4 = x4; | |
986 s5 = x5; | |
987 s6 = x6; | |
988 s7 = x7; | |
989 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; | |
990 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; | |
991 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; | |
992 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; | |
993 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; | |
994 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; | |
995 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; | |
996 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; | |
997 | |
998 x0 = s0 + s4; | |
999 x1 = s1 + s5; | |
1000 x2 = s2 + s6; | |
1001 x3 = s3 + s7; | |
1002 x4 = s0 - s4; | |
1003 x5 = s1 - s5; | |
1004 x6 = s2 - s6; | |
1005 x7 = s3 - s7; | |
1006 x8 = dct_const_round_shift(s8 + s12); | |
1007 x9 = dct_const_round_shift(s9 + s13); | |
1008 x10 = dct_const_round_shift(s10 + s14); | |
1009 x11 = dct_const_round_shift(s11 + s15); | |
1010 x12 = dct_const_round_shift(s8 - s12); | |
1011 x13 = dct_const_round_shift(s9 - s13); | |
1012 x14 = dct_const_round_shift(s10 - s14); | |
1013 x15 = dct_const_round_shift(s11 - s15); | |
1014 | |
1015 // stage 3 | |
1016 s0 = x0; | |
1017 s1 = x1; | |
1018 s2 = x2; | |
1019 s3 = x3; | |
1020 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; | |
1021 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; | |
1022 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; | |
1023 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; | |
1024 s8 = x8; | |
1025 s9 = x9; | |
1026 s10 = x10; | |
1027 s11 = x11; | |
1028 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; | |
1029 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; | |
1030 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; | |
1031 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; | |
1032 | |
1033 x0 = s0 + s2; | |
1034 x1 = s1 + s3; | |
1035 x2 = s0 - s2; | |
1036 x3 = s1 - s3; | |
1037 x4 = dct_const_round_shift(s4 + s6); | |
1038 x5 = dct_const_round_shift(s5 + s7); | |
1039 x6 = dct_const_round_shift(s4 - s6); | |
1040 x7 = dct_const_round_shift(s5 - s7); | |
1041 x8 = s8 + s10; | |
1042 x9 = s9 + s11; | |
1043 x10 = s8 - s10; | |
1044 x11 = s9 - s11; | |
1045 x12 = dct_const_round_shift(s12 + s14); | |
1046 x13 = dct_const_round_shift(s13 + s15); | |
1047 x14 = dct_const_round_shift(s12 - s14); | |
1048 x15 = dct_const_round_shift(s13 - s15); | |
1049 | |
1050 // stage 4 | |
1051 s2 = (- cospi_16_64) * (x2 + x3); | |
1052 s3 = cospi_16_64 * (x2 - x3); | |
1053 s6 = cospi_16_64 * (x6 + x7); | |
1054 s7 = cospi_16_64 * (- x6 + x7); | |
1055 s10 = cospi_16_64 * (x10 + x11); | |
1056 s11 = cospi_16_64 * (- x10 + x11); | |
1057 s14 = (- cospi_16_64) * (x14 + x15); | |
1058 s15 = cospi_16_64 * (x14 - x15); | |
1059 | |
1060 x2 = dct_const_round_shift(s2); | |
1061 x3 = dct_const_round_shift(s3); | |
1062 x6 = dct_const_round_shift(s6); | |
1063 x7 = dct_const_round_shift(s7); | |
1064 x10 = dct_const_round_shift(s10); | |
1065 x11 = dct_const_round_shift(s11); | |
1066 x14 = dct_const_round_shift(s14); | |
1067 x15 = dct_const_round_shift(s15); | |
1068 | |
1069 output[0] = x0; | |
1070 output[1] = -x8; | |
1071 output[2] = x12; | |
1072 output[3] = -x4; | |
1073 output[4] = x6; | |
1074 output[5] = x14; | |
1075 output[6] = x10; | |
1076 output[7] = x2; | |
1077 output[8] = x3; | |
1078 output[9] = x11; | |
1079 output[10] = x15; | |
1080 output[11] = x7; | |
1081 output[12] = x5; | |
1082 output[13] = -x13; | |
1083 output[14] = x9; | |
1084 output[15] = -x1; | |
1085 } | |
1086 | |
1087 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, | 24 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
1088 int pitch, int tx_type) { | 25 int pitch, int tx_type) { |
1089 int i, j; | 26 int i, j; |
1090 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | 27 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
1091 int16_t *outptr = out; | 28 int16_t *outptr = out; |
1092 int16_t temp_out[16]; | 29 int16_t temp_out[16]; |
1093 uint32_t pos = 45; | 30 uint32_t pos = 45; |
1094 | 31 |
1095 /* bit positon for extract from acc */ | 32 /* bit positon for extract from acc */ |
1096 __asm__ __volatile__ ( | 33 __asm__ __volatile__ ( |
1097 "wrdsp %[pos], 1 \n\t" | 34 "wrdsp %[pos], 1 \n\t" |
1098 : | 35 : |
1099 : [pos] "r" (pos) | 36 : [pos] "r" (pos) |
1100 ); | 37 ); |
1101 | 38 |
1102 switch (tx_type) { | 39 switch (tx_type) { |
1103 case DCT_DCT: // DCT in both horizontal and vertical | 40 case DCT_DCT: // DCT in both horizontal and vertical |
1104 idct16_rows_dspr2(input, outptr, 16); | 41 idct16_rows_dspr2(input, outptr, 16); |
1105 idct16_cols_add_blk_dspr2(out, dest, pitch); | 42 idct16_cols_add_blk_dspr2(out, dest, pitch); |
1106 break; | 43 break; |
1107 case ADST_DCT: // ADST in vertical, DCT in horizontal | 44 case ADST_DCT: // ADST in vertical, DCT in horizontal |
1108 idct16_rows_dspr2(input, outptr, 16); | 45 idct16_rows_dspr2(input, outptr, 16); |
1109 | 46 |
1110 outptr = out; | 47 outptr = out; |
1111 | 48 |
1112 for (i = 0; i < 16; ++i) { | 49 for (i = 0; i < 16; ++i) { |
1113 iadst16(outptr, temp_out); | 50 iadst16_dspr2(outptr, temp_out); |
1114 | 51 |
1115 for (j = 0; j < 16; ++j) | 52 for (j = 0; j < 16; ++j) |
1116 dest[j * pitch + i] = | 53 dest[j * pitch + i] = |
1117 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 54 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
1118 + dest[j * pitch + i]); | 55 + dest[j * pitch + i]); |
1119 outptr += 16; | 56 outptr += 16; |
1120 } | 57 } |
1121 break; | 58 break; |
1122 case DCT_ADST: // DCT in vertical, ADST in horizontal | 59 case DCT_ADST: // DCT in vertical, ADST in horizontal |
1123 { | 60 { |
1124 int16_t temp_in[16 * 16]; | 61 int16_t temp_in[16 * 16]; |
1125 | 62 |
1126 for (i = 0; i < 16; ++i) { | 63 for (i = 0; i < 16; ++i) { |
1127 /* prefetch row */ | 64 /* prefetch row */ |
1128 prefetch_load((const uint8_t *)(input + 16)); | 65 prefetch_load((const uint8_t *)(input + 16)); |
1129 | 66 |
1130 iadst16(input, outptr); | 67 iadst16_dspr2(input, outptr); |
1131 input += 16; | 68 input += 16; |
1132 outptr += 16; | 69 outptr += 16; |
1133 } | 70 } |
1134 | 71 |
1135 for (i = 0; i < 16; ++i) | 72 for (i = 0; i < 16; ++i) |
1136 for (j = 0; j < 16; ++j) | 73 for (j = 0; j < 16; ++j) |
1137 temp_in[j * 16 + i] = out[i * 16 + j]; | 74 temp_in[j * 16 + i] = out[i * 16 + j]; |
1138 | 75 |
1139 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); | 76 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); |
1140 } | 77 } |
1141 break; | 78 break; |
1142 case ADST_ADST: // ADST in both directions | 79 case ADST_ADST: // ADST in both directions |
1143 { | 80 { |
1144 int16_t temp_in[16]; | 81 int16_t temp_in[16]; |
1145 | 82 |
1146 for (i = 0; i < 16; ++i) { | 83 for (i = 0; i < 16; ++i) { |
1147 /* prefetch row */ | 84 /* prefetch row */ |
1148 prefetch_load((const uint8_t *)(input + 16)); | 85 prefetch_load((const uint8_t *)(input + 16)); |
1149 | 86 |
1150 iadst16(input, outptr); | 87 iadst16_dspr2(input, outptr); |
1151 input += 16; | 88 input += 16; |
1152 outptr += 16; | 89 outptr += 16; |
1153 } | 90 } |
1154 | 91 |
1155 for (i = 0; i < 16; ++i) { | 92 for (i = 0; i < 16; ++i) { |
1156 for (j = 0; j < 16; ++j) | 93 for (j = 0; j < 16; ++j) |
1157 temp_in[j] = out[j * 16 + i]; | 94 temp_in[j] = out[j * 16 + i]; |
1158 iadst16(temp_in, temp_out); | 95 iadst16_dspr2(temp_in, temp_out); |
1159 for (j = 0; j < 16; ++j) | 96 for (j = 0; j < 16; ++j) |
1160 dest[j * pitch + i] = | 97 dest[j * pitch + i] = |
1161 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 98 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
1162 + dest[j * pitch + i]); | 99 + dest[j * pitch + i]); |
1163 } | 100 } |
1164 } | 101 } |
1165 break; | 102 break; |
1166 default: | 103 default: |
1167 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); | 104 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); |
1168 break; | 105 break; |
1169 } | 106 } |
1170 } | 107 } |
1171 | |
/* 16x16 inverse DCT for sparsely-populated blocks: only the upper-left
 * 4x4 corner of the coefficient matrix is non-zero, so the row pass is
 * computed for 4 rows and the rest of the intermediate buffer is zeroed
 * before the full column pass adds the result into dest.
 *
 * input       - coefficients (non-zero only in the upper-left 4x4).
 * dest        - 8-bit reconstruction buffer, updated in place.
 * dest_stride - stride (in bytes) between rows of dest.
 */
void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
                                int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
  int16_t *outptr = out;
  uint32_t i;
  uint32_t pos = 45;  /* DSPControl pos value expected by the helpers */

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  idct16_rows_dspr2(input, outptr, 4);

  /* Zero the untouched 12 trailing elements of every 16-element row of
   * 'out'.  Each 'sw $zero' clears two int16_t values (4 bytes); the
   * offsets advance 32 bytes (one row) at a time, so a single asm block
   * clears the same element pair in all 16 rows, and the six iterations
   * (outptr += 2 each) cover element positions 4..15. */
  outptr += 4;
  for (i = 0; i < 6; ++i) {
    __asm__ __volatile__ (
      "sw     $zero,      0(%[outptr])     \n\t"
      "sw     $zero,     32(%[outptr])     \n\t"
      "sw     $zero,     64(%[outptr])     \n\t"
      "sw     $zero,     96(%[outptr])     \n\t"
      "sw     $zero,    128(%[outptr])     \n\t"
      "sw     $zero,    160(%[outptr])     \n\t"
      "sw     $zero,    192(%[outptr])     \n\t"
      "sw     $zero,    224(%[outptr])     \n\t"
      "sw     $zero,    256(%[outptr])     \n\t"
      "sw     $zero,    288(%[outptr])     \n\t"
      "sw     $zero,    320(%[outptr])     \n\t"
      "sw     $zero,    352(%[outptr])     \n\t"
      "sw     $zero,    384(%[outptr])     \n\t"
      "sw     $zero,    416(%[outptr])     \n\t"
      "sw     $zero,    448(%[outptr])     \n\t"
      "sw     $zero,    480(%[outptr])     \n\t"

      :
      : [outptr] "r" (outptr)
    );

    outptr += 2;
  }

  // Then transform columns, adding the result into dest.
  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
}
1220 | |
/* 16x16 inverse DCT for a DC-only block.  With a single non-zero (DC)
 * coefficient every residual sample is the same value a1, so the whole
 * transform collapses to one rounded constant that is added to (or, when
 * negative, subtracted from) all 256 destination pixels using saturating
 * quad-byte SIMD operations.
 *
 * input       - coefficient block; only input[0] is used.
 * dest        - 8-bit reconstruction buffer, updated in place; rows are
 *               assumed 4-byte aligned (word loads/stores below).
 * dest_stride - stride (in bytes) between rows of dest.
 */
void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
  uint32_t pos = 45;
  int32_t out;                 /* DC value after both transform passes */
  int32_t r;                   /* row counter */
  int32_t a1, absa1;           /* per-pixel residual and its magnitude */
  int32_t vector_a1;           /* |a1| replicated into all 4 byte lanes */
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  /* DC scaled by cospi_16_64 twice (row pass + column pass), with dct
   * rounding applied after each multiply. */
  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  /* Final reconstruction rounding: a1 = (out + 32) >> 6. */
  __asm__ __volatile__ (
    "addi     %[out],     %[out],    32      \n\t"
    "sra      %[a1],      %[out],    6       \n\t"

    : [out] "+r" (out), [a1] "=r" (a1)
    :
  );

  if (a1 < 0) {
    /* Negative residual: replicate |a1| into each byte lane and subtract
     * with unsigned saturation (clamps at 0).
     * use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
      "abs        %[absa1],       %[a1]       \n\t"
      "replv.qb   %[vector_a1],   %[absa1]    \n\t"

      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    /* 16 rows of 16 pixels, handled four bytes per lw/sw pair. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
        "lw             %[t1],          0(%[dest])                      \n\t"
        "lw             %[t2],          4(%[dest])                      \n\t"
        "lw             %[t3],          8(%[dest])                      \n\t"
        "lw             %[t4],          12(%[dest])                     \n\t"
        "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
        "sw             %[vector_1],    0(%[dest])                      \n\t"
        "sw             %[vector_2],    4(%[dest])                      \n\t"
        "sw             %[vector_3],    8(%[dest])                      \n\t"
        "sw             %[vector_4],    12(%[dest])                     \n\t"
        "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* Non-negative residual: replicate a1 and add with unsigned
     * saturation (clamps at 255).
     * use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
      "replv.qb   %[vector_a1],   %[a1]   \n\t"

      : [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    /* 16 rows of 16 pixels, handled four bytes per lw/sw pair. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
        "lw             %[t1],          0(%[dest])                      \n\t"
        "lw             %[t2],          4(%[dest])                      \n\t"
        "lw             %[t3],          8(%[dest])                      \n\t"
        "lw             %[t4],          12(%[dest])                     \n\t"
        "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
        "sw             %[vector_1],    0(%[dest])                      \n\t"
        "sw             %[vector_2],    4(%[dest])                      \n\t"
        "sw             %[vector_3],    8(%[dest])                      \n\t"
        "sw             %[vector_4],    12(%[dest])                     \n\t"
        "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}
1317 #endif // #if HAVE_DSPR2 | 108 #endif // #if HAVE_DSPR2 |
OLD | NEW |