Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(49)

Side by Side Diff: source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <assert.h> 11 #include <assert.h>
12 #include <stdio.h> 12 #include <stdio.h>
13 13
14 #include "./vpx_config.h" 14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h" 15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h" 16 #include "vp9/common/vp9_common.h"
17 #include "vp9/common/vp9_blockd.h" 17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_idct.h" 18 #include "vp9/common/vp9_idct.h"
19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" 19 #include "vpx_dsp/mips/inv_txfm_dspr2.h"
20 #include "vpx_dsp/txfm_common.h" 20 #include "vpx_dsp/txfm_common.h"
21 #include "vpx_ports/mem.h" 21 #include "vpx_ports/mem.h"
22 22
23 #if HAVE_DSPR2 23 #if HAVE_DSPR2
/*
 * Row pass of the 16-point inverse transform, written with MIPS DSP
 * inline assembly.
 *
 * For each of the no_rows iterations the function reads 16 consecutive
 * int16_t values from input (halfword loads at byte offsets 0..30),
 * computes 16 results and stores them as halfwords at byte offsets
 * 0, 32, 64, ..., 480 from output, i.e. 16 int16_t elements apart.
 * input then advances by 16 elements and output by 1 element, so the
 * results of one input row land in one column of a 16-wide output
 * buffer (the output is written transposed).
 *
 * input   - source coefficients, 16 int16_t per row; only read.
 * output  - destination buffer; elements up to index 240 + (no_rows - 1)
 *           are written.
 * no_rows - number of rows to process.
 *
 * Every multiply-accumulate starts from an accumulator preloaded with
 * const_2_power_13 (8192) in LO and zero in HI, and the result is taken
 * with "extp ..., 31".
 * NOTE(review): extp extracts relative to the DSPControl pos field, which
 * is not set anywhere in this function - presumably the caller configures
 * it so that 8192 acts as a rounding term; confirm against the callers.
 * NOTE(review): none of the asm statements below declares clobbers even
 * though they write $ac0-$ac3 and store through output - confirm this is
 * acceptable for the toolchain in use.
 */
24 static void idct16_rows_dspr2(const int16_t *input, int16_t *output,
25 uint32_t no_rows) {
26 int i;
27 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
28 int step1_10, step1_11, step1_12, step1_13;
29 int step2_0, step2_1, step2_2, step2_3;
30 int step2_8, step2_9, step2_10, step2_11;
31 int step2_12, step2_13, step2_14, step2_15;
32 int load1, load2, load3, load4, load5, load6, load7, load8;
33 int result1, result2, result3, result4;
34 const int const_2_power_13 = 8192;
35
/* Counts i down from no_rows; the body runs exactly no_rows times. */
36 for (i = no_rows; i--; ) {
37 /* prefetch row */
/* The prefetched address is input + 16, i.e. the row after the current one. */
38 prefetch_load((const uint8_t *)(input + 16));
39
/*
 * Stage A: loads input[0], input[8], input[4], input[12] (byte offsets
 * 0, 16, 8, 24).
 *   step2_0 <- (in0 + in8) * cospi_16_64
 *   step2_1 <- (in0 - in8) * cospi_16_64
 *   step2_2 <- in4 * cospi_24_64 - in12 * cospi_8_64
 *   step2_3 <- in4 * cospi_8_64  + in12 * cospi_24_64
 * (each taken from the accumulator with extp), then the butterflies
 *   step1_0 = step2_0 + step2_3, step1_1 = step2_1 + step2_2,
 *   step1_2 = step2_1 - step2_2, step1_3 = step2_0 - step2_3.
 */
40 __asm__ __volatile__ (
41 "lh %[load1], 0(%[input]) \n\t"
42 "lh %[load2], 16(%[input]) \n\t"
43 "lh %[load3], 8(%[input]) \n\t"
44 "lh %[load4], 24(%[input]) \n\t"
45
46 "mtlo %[const_2_power_13], $ac1 \n\t"
47 "mthi $zero, $ac1 \n\t"
48 "mtlo %[const_2_power_13], $ac2 \n\t"
49 "mthi $zero, $ac2 \n\t"
50 "add %[result1], %[load1], %[load2] \n\t"
51 "sub %[result2], %[load1], %[load2] \n\t"
52 "madd $ac1, %[result1], %[cospi_16_64] \n\t"
53 "madd $ac2, %[result2], %[cospi_16_64] \n\t"
54 "extp %[step2_0], $ac1, 31 \n\t"
55 "extp %[step2_1], $ac2, 31 \n\t"
56
57 "mtlo %[const_2_power_13], $ac3 \n\t"
58 "mthi $zero, $ac3 \n\t"
59 "madd $ac3, %[load3], %[cospi_24_64] \n\t"
60 "msub $ac3, %[load4], %[cospi_8_64] \n\t"
61 "extp %[step2_2], $ac3, 31 \n\t"
62
63 "mtlo %[const_2_power_13], $ac1 \n\t"
64 "mthi $zero, $ac1 \n\t"
65 "madd $ac1, %[load3], %[cospi_8_64] \n\t"
66 "madd $ac1, %[load4], %[cospi_24_64] \n\t"
67 "extp %[step2_3], $ac1, 31 \n\t"
68
69 "add %[step1_0], %[step2_0], %[step2_3] \n\t"
70 "add %[step1_1], %[step2_1], %[step2_2] \n\t"
71 "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
72 "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
73
74 : [load1] "=&r" (load1), [load2] "=&r" (load2),
75 [load3] "=&r" (load3), [load4] "=&r" (load4),
76 [result1] "=&r" (result1), [result2] "=&r" (result2),
77 [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
78 [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
79 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
80 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
81 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
82 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
83 [cospi_16_64] "r" (cospi_16_64)
84 );
85
/*
 * Stage B: loads input[1], input[15], input[9], input[7] (byte offsets
 * 2, 30, 18, 14).
 *   result1 <- in1 * cospi_30_64 - in15 * cospi_2_64
 *   result2 <- in9 * cospi_14_64 - in7  * cospi_18_64
 *   result3 <- in9 * cospi_18_64 + in7  * cospi_14_64
 *   result4 <- in1 * cospi_2_64  + in15 * cospi_30_64
 * load5/load6 are then reused as scratch for (result1 - result2) and
 * (result4 - result3), which are rotated by cospi_24_64 / cospi_8_64 to
 * give step2_9 and step2_14; step2_8 and step2_15 are the plain sums.
 */
86 __asm__ __volatile__ (
87 "lh %[load5], 2(%[input]) \n\t"
88 "lh %[load6], 30(%[input]) \n\t"
89 "lh %[load7], 18(%[input]) \n\t"
90 "lh %[load8], 14(%[input]) \n\t"
91
92 "mtlo %[const_2_power_13], $ac1 \n\t"
93 "mthi $zero, $ac1 \n\t"
94 "mtlo %[const_2_power_13], $ac3 \n\t"
95 "mthi $zero, $ac3 \n\t"
96
97 "madd $ac1, %[load5], %[cospi_30_64] \n\t"
98 "msub $ac1, %[load6], %[cospi_2_64] \n\t"
99 "extp %[result1], $ac1, 31 \n\t"
100
101 "madd $ac3, %[load7], %[cospi_14_64] \n\t"
102 "msub $ac3, %[load8], %[cospi_18_64] \n\t"
103 "extp %[result2], $ac3, 31 \n\t"
104
105 "mtlo %[const_2_power_13], $ac1 \n\t"
106 "mthi $zero, $ac1 \n\t"
107 "mtlo %[const_2_power_13], $ac2 \n\t"
108 "mthi $zero, $ac2 \n\t"
109
110 "madd $ac1, %[load7], %[cospi_18_64] \n\t"
111 "madd $ac1, %[load8], %[cospi_14_64] \n\t"
112 "extp %[result3], $ac1, 31 \n\t"
113
114 "madd $ac2, %[load5], %[cospi_2_64] \n\t"
115 "madd $ac2, %[load6], %[cospi_30_64] \n\t"
116 "extp %[result4], $ac2, 31 \n\t"
117
118 "sub %[load5], %[result1], %[result2] \n\t"
119 "sub %[load6], %[result4], %[result3] \n\t"
120
121 "mtlo %[const_2_power_13], $ac1 \n\t"
122 "mthi $zero, $ac1 \n\t"
123 "mtlo %[const_2_power_13], $ac3 \n\t"
124 "mthi $zero, $ac3 \n\t"
125
126 "madd $ac1, %[load6], %[cospi_24_64] \n\t"
127 "msub $ac1, %[load5], %[cospi_8_64] \n\t"
128 "madd $ac3, %[load5], %[cospi_24_64] \n\t"
129 "madd $ac3, %[load6], %[cospi_8_64] \n\t"
130
131 "extp %[step2_9], $ac1, 31 \n\t"
132 "extp %[step2_14], $ac3, 31 \n\t"
133 "add %[step2_8], %[result1], %[result2] \n\t"
134 "add %[step2_15], %[result4], %[result3] \n\t"
135
136 : [load5] "=&r" (load5), [load6] "=&r" (load6),
137 [load7] "=&r" (load7), [load8] "=&r" (load8),
138 [result1] "=&r" (result1), [result2] "=&r" (result2),
139 [result3] "=&r" (result3), [result4] "=&r" (result4),
140 [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
141 [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
142 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
143 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
144 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
145 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
146 );
147
/*
 * Stage C: loads input[5], input[11], input[13], input[3] (byte offsets
 * 10, 22, 26, 6).
 *   result1 <- in5  * cospi_22_64 - in11 * cospi_10_64
 *   result2 <- in13 * cospi_6_64  - in3  * cospi_26_64
 *   result3 <- in5  * cospi_10_64 + in11 * cospi_22_64
 *   result4 <- in13 * cospi_26_64 + in3  * cospi_6_64
 * load1/load2 are reused as scratch for (result2 - result1) and
 * (result4 - result3); these feed step2_10 and step2_13. step2_11 and
 * step2_12 are the plain sums.
 */
148 __asm__ __volatile__ (
149 "lh %[load1], 10(%[input]) \n\t"
150 "lh %[load2], 22(%[input]) \n\t"
151 "lh %[load3], 26(%[input]) \n\t"
152 "lh %[load4], 6(%[input]) \n\t"
153
154 "mtlo %[const_2_power_13], $ac1 \n\t"
155 "mthi $zero, $ac1 \n\t"
156 "mtlo %[const_2_power_13], $ac3 \n\t"
157 "mthi $zero, $ac3 \n\t"
158
159 "madd $ac1, %[load1], %[cospi_22_64] \n\t"
160 "msub $ac1, %[load2], %[cospi_10_64] \n\t"
161 "extp %[result1], $ac1, 31 \n\t"
162
163 "madd $ac3, %[load3], %[cospi_6_64] \n\t"
164 "msub $ac3, %[load4], %[cospi_26_64] \n\t"
165 "extp %[result2], $ac3, 31 \n\t"
166
167 "mtlo %[const_2_power_13], $ac1 \n\t"
168 "mthi $zero, $ac1 \n\t"
169 "mtlo %[const_2_power_13], $ac2 \n\t"
170 "mthi $zero, $ac2 \n\t"
171
172 "madd $ac1, %[load1], %[cospi_10_64] \n\t"
173 "madd $ac1, %[load2], %[cospi_22_64] \n\t"
174 "extp %[result3], $ac1, 31 \n\t"
175
176 "madd $ac2, %[load3], %[cospi_26_64] \n\t"
177 "madd $ac2, %[load4], %[cospi_6_64] \n\t"
178 "extp %[result4], $ac2, 31 \n\t"
179
180 "mtlo %[const_2_power_13], $ac1 \n\t"
181 "mthi $zero, $ac1 \n\t"
182 "mtlo %[const_2_power_13], $ac3 \n\t"
183 "mthi $zero, $ac3 \n\t"
184
185 "sub %[load1], %[result2], %[result1] \n\t"
186 "sub %[load2], %[result4], %[result3] \n\t"
187
188 "msub $ac1, %[load1], %[cospi_24_64] \n\t"
189 "msub $ac1, %[load2], %[cospi_8_64] \n\t"
190 "madd $ac3, %[load2], %[cospi_24_64] \n\t"
191 "msub $ac3, %[load1], %[cospi_8_64] \n\t"
192
193 "extp %[step2_10], $ac1, 31 \n\t"
194 "extp %[step2_13], $ac3, 31 \n\t"
195 "add %[step2_11], %[result1], %[result2] \n\t"
196 "add %[step2_12], %[result4], %[result3] \n\t"
197
198 : [load1] "=&r" (load1), [load2] "=&r" (load2),
199 [load3] "=&r" (load3), [load4] "=&r" (load4),
200 [result1] "=&r" (result1), [result2] "=&r" (result2),
201 [result3] "=&r" (result3), [result4] "=&r" (result4),
202 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
203 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
204 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
205 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
206 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
207 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
208 );
209
/*
 * Stage D: loads input[2], input[14], input[10], input[6] (byte offsets
 * 4, 28, 20, 12).
 *   result1 <- in2  * cospi_28_64 - in14 * cospi_4_64
 *   result2 <- in10 * cospi_12_64 - in6  * cospi_20_64
 *   result3 <- in10 * cospi_20_64 + in6  * cospi_12_64
 *   result4 <- in2  * cospi_4_64  + in14 * cospi_28_64
 * step1_5 <- (result4 - result3 - result1 + result2) * cospi_16_64
 * step1_6 <- (result1 - result2 - result3 + result4) * cospi_16_64
 * step1_4 = result1 + result2, step1_7 = result4 + result3.
 */
210 __asm__ __volatile__ (
211 "lh %[load5], 4(%[input]) \n\t"
212 "lh %[load6], 28(%[input]) \n\t"
213 "lh %[load7], 20(%[input]) \n\t"
214 "lh %[load8], 12(%[input]) \n\t"
215
216 "mtlo %[const_2_power_13], $ac1 \n\t"
217 "mthi $zero, $ac1 \n\t"
218 "mtlo %[const_2_power_13], $ac3 \n\t"
219 "mthi $zero, $ac3 \n\t"
220
221 "madd $ac1, %[load5], %[cospi_28_64] \n\t"
222 "msub $ac1, %[load6], %[cospi_4_64] \n\t"
223 "extp %[result1], $ac1, 31 \n\t"
224
225 "madd $ac3, %[load7], %[cospi_12_64] \n\t"
226 "msub $ac3, %[load8], %[cospi_20_64] \n\t"
227 "extp %[result2], $ac3, 31 \n\t"
228
229 "mtlo %[const_2_power_13], $ac1 \n\t"
230 "mthi $zero, $ac1 \n\t"
231 "mtlo %[const_2_power_13], $ac2 \n\t"
232 "mthi $zero, $ac2 \n\t"
233
234 "madd $ac1, %[load7], %[cospi_20_64] \n\t"
235 "madd $ac1, %[load8], %[cospi_12_64] \n\t"
236 "extp %[result3], $ac1, 31 \n\t"
237
238 "madd $ac2, %[load5], %[cospi_4_64] \n\t"
239 "madd $ac2, %[load6], %[cospi_28_64] \n\t"
240 "extp %[result4], $ac2, 31 \n\t"
241
242 "mtlo %[const_2_power_13], $ac1 \n\t"
243 "mthi $zero, $ac1 \n\t"
244 "mtlo %[const_2_power_13], $ac3 \n\t"
245 "mthi $zero, $ac3 \n\t"
246
247 "sub %[load5], %[result4], %[result3] \n\t"
248 "sub %[load5], %[load5], %[result1] \n\t"
249 "add %[load5], %[load5], %[result2] \n\t"
250
251 "sub %[load6], %[result1], %[result2] \n\t"
252 "sub %[load6], %[load6], %[result3] \n\t"
253 "add %[load6], %[load6], %[result4] \n\t"
254
255 "madd $ac1, %[load5], %[cospi_16_64] \n\t"
256 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
257
258 "extp %[step1_5], $ac1, 31 \n\t"
259 "extp %[step1_6], $ac3, 31 \n\t"
260 "add %[step1_4], %[result1], %[result2] \n\t"
261 "add %[step1_7], %[result4], %[result3] \n\t"
262
263 : [load5] "=&r" (load5), [load6] "=&r" (load6),
264 [load7] "=&r" (load7), [load8] "=&r" (load8),
265 [result1] "=&r" (result1), [result2] "=&r" (result2),
266 [result3] "=&r" (result3), [result4] "=&r" (result4),
267 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
268 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
269 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
270 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
271 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
272 [cospi_16_64] "r" (cospi_16_64)
273 );
274
/*
 * Stage E: no loads; combines the step2_8..step2_15 values from stages B
 * and C, using all four accumulators $ac0..$ac3.
 *   step1_10 <- (step2_14 - step2_13 - step2_9  + step2_10) * cospi_16_64
 *   step1_13 <- (step2_14 - step2_13 - step2_10 + step2_9)  * cospi_16_64
 *   step1_11 <- (step2_15 - step2_12 - step2_8  + step2_11) * cospi_16_64
 *   step1_12 <- (step2_15 - step2_12 - step2_11 + step2_8)  * cospi_16_64
 */
275 __asm__ __volatile__ (
276 "mtlo %[const_2_power_13], $ac0 \n\t"
277 "mthi $zero, $ac0 \n\t"
278 "mtlo %[const_2_power_13], $ac1 \n\t"
279 "mthi $zero, $ac1 \n\t"
280
281 "sub %[load5], %[step2_14], %[step2_13] \n\t"
282 "sub %[load5], %[load5], %[step2_9] \n\t"
283 "add %[load5], %[load5], %[step2_10] \n\t"
284
285 "madd $ac0, %[load5], %[cospi_16_64] \n\t"
286
287 "sub %[load6], %[step2_14], %[step2_13] \n\t"
288 "sub %[load6], %[load6], %[step2_10] \n\t"
289 "add %[load6], %[load6], %[step2_9] \n\t"
290
291 "madd $ac1, %[load6], %[cospi_16_64] \n\t"
292
293 "mtlo %[const_2_power_13], $ac2 \n\t"
294 "mthi $zero, $ac2 \n\t"
295 "mtlo %[const_2_power_13], $ac3 \n\t"
296 "mthi $zero, $ac3 \n\t"
297
298 "sub %[load5], %[step2_15], %[step2_12] \n\t"
299 "sub %[load5], %[load5], %[step2_8] \n\t"
300 "add %[load5], %[load5], %[step2_11] \n\t"
301
302 "madd $ac2, %[load5], %[cospi_16_64] \n\t"
303
304 "sub %[load6], %[step2_15], %[step2_12] \n\t"
305 "sub %[load6], %[load6], %[step2_11] \n\t"
306 "add %[load6], %[load6], %[step2_8] \n\t"
307
308 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
309
310 "extp %[step1_10], $ac0, 31 \n\t"
311 "extp %[step1_13], $ac1, 31 \n\t"
312 "extp %[step1_11], $ac2, 31 \n\t"
313 "extp %[step1_12], $ac3, 31 \n\t"
314
315 : [load5] "=&r" (load5), [load6] "=&r" (load6),
316 [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
317 [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
318 : [const_2_power_13] "r" (const_2_power_13),
319 [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
320 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
321 [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
322 [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
323 [cospi_16_64] "r" (cospi_16_64)
324 );
325
/*
 * Output group 1: stores the results at byte offsets 0, 32, 192, 224,
 * 256, 288, 448 and 480 (output elements 0, 16, 96, 112, 128, 144, 224,
 * 240). Each value is step1_0 +/- step1_7 or step1_1 +/- step1_6 combined
 * with a pair of step2_8..step2_15 terms; load5/load6 are scratch only.
 */
326 __asm__ __volatile__ (
327 "add %[load5], %[step1_0], %[step1_7] \n\t"
328 "add %[load5], %[load5], %[step2_12] \n\t"
329 "add %[load5], %[load5], %[step2_15] \n\t"
330 "add %[load6], %[step1_1], %[step1_6] \n\t"
331 "add %[load6], %[load6], %[step2_13] \n\t"
332 "add %[load6], %[load6], %[step2_14] \n\t"
333 "sh %[load5], 0(%[output]) \n\t"
334 "sh %[load6], 32(%[output]) \n\t"
335 "sub %[load5], %[step1_1], %[step1_6] \n\t"
336 "add %[load5], %[load5], %[step2_9] \n\t"
337 "add %[load5], %[load5], %[step2_10] \n\t"
338 "sub %[load6], %[step1_0], %[step1_7] \n\t"
339 "add %[load6], %[load6], %[step2_8] \n\t"
340 "add %[load6], %[load6], %[step2_11] \n\t"
341 "sh %[load5], 192(%[output]) \n\t"
342 "sh %[load6], 224(%[output]) \n\t"
343 "sub %[load5], %[step1_0], %[step1_7] \n\t"
344 "sub %[load5], %[load5], %[step2_8] \n\t"
345 "sub %[load5], %[load5], %[step2_11] \n\t"
346 "sub %[load6], %[step1_1], %[step1_6] \n\t"
347 "sub %[load6], %[load6], %[step2_9] \n\t"
348 "sub %[load6], %[load6], %[step2_10] \n\t"
349 "sh %[load5], 256(%[output]) \n\t"
350 "sh %[load6], 288(%[output]) \n\t"
351 "add %[load5], %[step1_1], %[step1_6] \n\t"
352 "sub %[load5], %[load5], %[step2_13] \n\t"
353 "sub %[load5], %[load5], %[step2_14] \n\t"
354 "add %[load6], %[step1_0], %[step1_7] \n\t"
355 "sub %[load6], %[load6], %[step2_12] \n\t"
356 "sub %[load6], %[load6], %[step2_15] \n\t"
357 "sh %[load5], 448(%[output]) \n\t"
358 "sh %[load6], 480(%[output]) \n\t"
359
360 : [load5] "=&r" (load5), [load6] "=&r" (load6)
361 : [output] "r" (output),
362 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
363 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
364 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
365 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
366 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
367 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
368 );
369
/*
 * Output group 2: stores the remaining eight results at byte offsets 64,
 * 96, 128, 160, 320, 352, 384 and 416 (output elements 32, 48, 64, 80,
 * 160, 176, 192, 208). Each value is step1_2 +/- step1_5 or
 * step1_3 +/- step1_4 plus or minus one of step1_10..step1_13.
 */
370 __asm__ __volatile__ (
371 "add %[load5], %[step1_2], %[step1_5] \n\t"
372 "add %[load5], %[load5], %[step1_13] \n\t"
373 "add %[load6], %[step1_3], %[step1_4] \n\t"
374 "add %[load6], %[load6], %[step1_12] \n\t"
375 "sh %[load5], 64(%[output]) \n\t"
376 "sh %[load6], 96(%[output]) \n\t"
377 "sub %[load5], %[step1_3], %[step1_4] \n\t"
378 "add %[load5], %[load5], %[step1_11] \n\t"
379 "sub %[load6], %[step1_2], %[step1_5] \n\t"
380 "add %[load6], %[load6], %[step1_10] \n\t"
381 "sh %[load5], 128(%[output]) \n\t"
382 "sh %[load6], 160(%[output]) \n\t"
383 "sub %[load5], %[step1_2], %[step1_5] \n\t"
384 "sub %[load5], %[load5], %[step1_10] \n\t"
385 "sub %[load6], %[step1_3], %[step1_4] \n\t"
386 "sub %[load6], %[load6], %[step1_11] \n\t"
387 "sh %[load5], 320(%[output]) \n\t"
388 "sh %[load6], 352(%[output]) \n\t"
389 "add %[load5], %[step1_3], %[step1_4] \n\t"
390 "sub %[load5], %[load5], %[step1_12] \n\t"
391 "add %[load6], %[step1_2], %[step1_5] \n\t"
392 "sub %[load6], %[load6], %[step1_13] \n\t"
393 "sh %[load5], 384(%[output]) \n\t"
394 "sh %[load6], 416(%[output]) \n\t"
395
396 : [load5] "=&r" (load5), [load6] "=&r" (load6)
397 : [output] "r" (output),
398 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
399 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
400 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
401 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
402 );
403
/*
 * Advance to the next input row (16 coefficients) and to the next output
 * column (one element), which is what makes the stores above transposed.
 */
404 input += 16;
405 output += 1;
406 }
407 }
408
409 static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
410 int dest_stride) {
411 int i;
412 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
413 int step1_8, step1_9, step1_10, step1_11;
414 int step1_12, step1_13, step1_14, step1_15;
415 int step2_0, step2_1, step2_2, step2_3;
416 int step2_8, step2_9, step2_10, step2_11;
417 int step2_12, step2_13, step2_14, step2_15;
418 int load1, load2, load3, load4, load5, load6, load7, load8;
419 int result1, result2, result3, result4;
420 const int const_2_power_13 = 8192;
421 uint8_t *dest_pix;
422 uint8_t *cm = vpx_ff_cropTbl;
423
424 /* prefetch vpx_ff_cropTbl */
425 prefetch_load(vpx_ff_cropTbl);
426 prefetch_load(vpx_ff_cropTbl + 32);
427 prefetch_load(vpx_ff_cropTbl + 64);
428 prefetch_load(vpx_ff_cropTbl + 96);
429 prefetch_load(vpx_ff_cropTbl + 128);
430 prefetch_load(vpx_ff_cropTbl + 160);
431 prefetch_load(vpx_ff_cropTbl + 192);
432 prefetch_load(vpx_ff_cropTbl + 224);
433
434 for (i = 0; i < 16; ++i) {
435 dest_pix = (dest + i);
436 __asm__ __volatile__ (
437 "lh %[load1], 0(%[input]) \n\t"
438 "lh %[load2], 16(%[input]) \n\t"
439 "lh %[load3], 8(%[input]) \n\t"
440 "lh %[load4], 24(%[input]) \n\t"
441
442 "mtlo %[const_2_power_13], $ac1 \n\t"
443 "mthi $zero, $ac1 \n\t"
444 "mtlo %[const_2_power_13], $ac2 \n\t"
445 "mthi $zero, $ac2 \n\t"
446 "add %[result1], %[load1], %[load2] \n\t"
447 "sub %[result2], %[load1], %[load2] \n\t"
448 "madd $ac1, %[result1], %[cospi_16_64] \n\t"
449 "madd $ac2, %[result2], %[cospi_16_64] \n\t"
450 "extp %[step2_0], $ac1, 31 \n\t"
451 "extp %[step2_1], $ac2, 31 \n\t"
452
453 "mtlo %[const_2_power_13], $ac3 \n\t"
454 "mthi $zero, $ac3 \n\t"
455 "madd $ac3, %[load3], %[cospi_24_64] \n\t"
456 "msub $ac3, %[load4], %[cospi_8_64] \n\t"
457 "extp %[step2_2], $ac3, 31 \n\t"
458
459 "mtlo %[const_2_power_13], $ac1 \n\t"
460 "mthi $zero, $ac1 \n\t"
461 "madd $ac1, %[load3], %[cospi_8_64] \n\t"
462 "madd $ac1, %[load4], %[cospi_24_64] \n\t"
463 "extp %[step2_3], $ac1, 31 \n\t"
464
465 "add %[step1_0], %[step2_0], %[step2_3] \n\t"
466 "add %[step1_1], %[step2_1], %[step2_2] \n\t"
467 "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
468 "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
469
470 : [load1] "=&r" (load1), [load2] "=&r" (load2),
471 [load3] "=&r" (load3), [load4] "=&r" (load4),
472 [result1] "=&r" (result1), [result2] "=&r" (result2),
473 [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
474 [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
475 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
476 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
477 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
478 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
479 [cospi_16_64] "r" (cospi_16_64)
480 );
481
482 __asm__ __volatile__ (
483 "lh %[load5], 2(%[input]) \n\t"
484 "lh %[load6], 30(%[input]) \n\t"
485 "lh %[load7], 18(%[input]) \n\t"
486 "lh %[load8], 14(%[input]) \n\t"
487
488 "mtlo %[const_2_power_13], $ac1 \n\t"
489 "mthi $zero, $ac1 \n\t"
490 "mtlo %[const_2_power_13], $ac3 \n\t"
491 "mthi $zero, $ac3 \n\t"
492
493 "madd $ac1, %[load5], %[cospi_30_64] \n\t"
494 "msub $ac1, %[load6], %[cospi_2_64] \n\t"
495 "extp %[result1], $ac1, 31 \n\t"
496
497 "madd $ac3, %[load7], %[cospi_14_64] \n\t"
498 "msub $ac3, %[load8], %[cospi_18_64] \n\t"
499 "extp %[result2], $ac3, 31 \n\t"
500
501 "mtlo %[const_2_power_13], $ac1 \n\t"
502 "mthi $zero, $ac1 \n\t"
503 "mtlo %[const_2_power_13], $ac2 \n\t"
504 "mthi $zero, $ac2 \n\t"
505
506 "madd $ac1, %[load7], %[cospi_18_64] \n\t"
507 "madd $ac1, %[load8], %[cospi_14_64] \n\t"
508 "extp %[result3], $ac1, 31 \n\t"
509
510 "madd $ac2, %[load5], %[cospi_2_64] \n\t"
511 "madd $ac2, %[load6], %[cospi_30_64] \n\t"
512 "extp %[result4], $ac2, 31 \n\t"
513
514 "sub %[load5], %[result1], %[result2] \n\t"
515 "sub %[load6], %[result4], %[result3] \n\t"
516
517 "mtlo %[const_2_power_13], $ac1 \n\t"
518 "mthi $zero, $ac1 \n\t"
519 "mtlo %[const_2_power_13], $ac3 \n\t"
520 "mthi $zero, $ac3 \n\t"
521
522 "madd $ac1, %[load6], %[cospi_24_64] \n\t"
523 "msub $ac1, %[load5], %[cospi_8_64] \n\t"
524 "madd $ac3, %[load5], %[cospi_24_64] \n\t"
525 "madd $ac3, %[load6], %[cospi_8_64] \n\t"
526
527 "extp %[step2_9], $ac1, 31 \n\t"
528 "extp %[step2_14], $ac3, 31 \n\t"
529 "add %[step2_8], %[result1], %[result2] \n\t"
530 "add %[step2_15], %[result4], %[result3] \n\t"
531
532 : [load5] "=&r" (load5), [load6] "=&r" (load6),
533 [load7] "=&r" (load7), [load8] "=&r" (load8),
534 [result1] "=&r" (result1), [result2] "=&r" (result2),
535 [result3] "=&r" (result3), [result4] "=&r" (result4),
536 [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
537 [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
538 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
539 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
540 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
541 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
542 );
543
544 __asm__ __volatile__ (
545 "lh %[load1], 10(%[input]) \n\t"
546 "lh %[load2], 22(%[input]) \n\t"
547 "lh %[load3], 26(%[input]) \n\t"
548 "lh %[load4], 6(%[input]) \n\t"
549
550 "mtlo %[const_2_power_13], $ac1 \n\t"
551 "mthi $zero, $ac1 \n\t"
552 "mtlo %[const_2_power_13], $ac3 \n\t"
553 "mthi $zero, $ac3 \n\t"
554
555 "madd $ac1, %[load1], %[cospi_22_64] \n\t"
556 "msub $ac1, %[load2], %[cospi_10_64] \n\t"
557 "extp %[result1], $ac1, 31 \n\t"
558
559 "madd $ac3, %[load3], %[cospi_6_64] \n\t"
560 "msub $ac3, %[load4], %[cospi_26_64] \n\t"
561 "extp %[result2], $ac3, 31 \n\t"
562
563 "mtlo %[const_2_power_13], $ac1 \n\t"
564 "mthi $zero, $ac1 \n\t"
565 "mtlo %[const_2_power_13], $ac2 \n\t"
566 "mthi $zero, $ac2 \n\t"
567
568 "madd $ac1, %[load1], %[cospi_10_64] \n\t"
569 "madd $ac1, %[load2], %[cospi_22_64] \n\t"
570 "extp %[result3], $ac1, 31 \n\t"
571
572 "madd $ac2, %[load3], %[cospi_26_64] \n\t"
573 "madd $ac2, %[load4], %[cospi_6_64] \n\t"
574 "extp %[result4], $ac2, 31 \n\t"
575
576 "mtlo %[const_2_power_13], $ac1 \n\t"
577 "mthi $zero, $ac1 \n\t"
578 "mtlo %[const_2_power_13], $ac3 \n\t"
579 "mthi $zero, $ac3 \n\t"
580
581 "sub %[load1], %[result2], %[result1] \n\t"
582 "sub %[load2], %[result4], %[result3] \n\t"
583
584 "msub $ac1, %[load1], %[cospi_24_64] \n\t"
585 "msub $ac1, %[load2], %[cospi_8_64] \n\t"
586 "madd $ac3, %[load2], %[cospi_24_64] \n\t"
587 "msub $ac3, %[load1], %[cospi_8_64] \n\t"
588
589 "extp %[step2_10], $ac1, 31 \n\t"
590 "extp %[step2_13], $ac3, 31 \n\t"
591 "add %[step2_11], %[result1], %[result2] \n\t"
592 "add %[step2_12], %[result4], %[result3] \n\t"
593
594 : [load1] "=&r" (load1), [load2] "=&r" (load2),
595 [load3] "=&r" (load3), [load4] "=&r" (load4),
596 [result1] "=&r" (result1), [result2] "=&r" (result2),
597 [result3] "=&r" (result3), [result4] "=&r" (result4),
598 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
599 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
600 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
601 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
602 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
603 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
604 );
605
606 __asm__ __volatile__ (
607 "lh %[load5], 4(%[input]) \n\t"
608 "lh %[load6], 28(%[input]) \n\t"
609 "lh %[load7], 20(%[input]) \n\t"
610 "lh %[load8], 12(%[input]) \n\t"
611
612 "mtlo %[const_2_power_13], $ac1 \n\t"
613 "mthi $zero, $ac1 \n\t"
614 "mtlo %[const_2_power_13], $ac3 \n\t"
615 "mthi $zero, $ac3 \n\t"
616
617 "madd $ac1, %[load5], %[cospi_28_64] \n\t"
618 "msub $ac1, %[load6], %[cospi_4_64] \n\t"
619 "extp %[result1], $ac1, 31 \n\t"
620
621 "madd $ac3, %[load7], %[cospi_12_64] \n\t"
622 "msub $ac3, %[load8], %[cospi_20_64] \n\t"
623 "extp %[result2], $ac3, 31 \n\t"
624
625 "mtlo %[const_2_power_13], $ac1 \n\t"
626 "mthi $zero, $ac1 \n\t"
627 "mtlo %[const_2_power_13], $ac2 \n\t"
628 "mthi $zero, $ac2 \n\t"
629
630 "madd $ac1, %[load7], %[cospi_20_64] \n\t"
631 "madd $ac1, %[load8], %[cospi_12_64] \n\t"
632 "extp %[result3], $ac1, 31 \n\t"
633
634 "madd $ac2, %[load5], %[cospi_4_64] \n\t"
635 "madd $ac2, %[load6], %[cospi_28_64] \n\t"
636 "extp %[result4], $ac2, 31 \n\t"
637
638 "mtlo %[const_2_power_13], $ac1 \n\t"
639 "mthi $zero, $ac1 \n\t"
640 "mtlo %[const_2_power_13], $ac3 \n\t"
641 "mthi $zero, $ac3 \n\t"
642
643 "sub %[load5], %[result4], %[result3] \n\t"
644 "sub %[load5], %[load5], %[result1] \n\t"
645 "add %[load5], %[load5], %[result2] \n\t"
646
647 "sub %[load6], %[result1], %[result2] \n\t"
648 "sub %[load6], %[load6], %[result3] \n\t"
649 "add %[load6], %[load6], %[result4] \n\t"
650
651 "madd $ac1, %[load5], %[cospi_16_64] \n\t"
652 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
653
654 "extp %[step1_5], $ac1, 31 \n\t"
655 "extp %[step1_6], $ac3, 31 \n\t"
656
657 "add %[step1_4], %[result1], %[result2] \n\t"
658 "add %[step1_7], %[result4], %[result3] \n\t"
659
660 : [load5] "=&r" (load5), [load6] "=&r" (load6),
661 [load7] "=&r" (load7), [load8] "=&r" (load8),
662 [result1] "=&r" (result1), [result2] "=&r" (result2),
663 [result3] "=&r" (result3), [result4] "=&r" (result4),
664 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
665 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
666 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
667 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
668 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
669 [cospi_16_64] "r" (cospi_16_64)
670 );
671
672 __asm__ __volatile__ (
673 "mtlo %[const_2_power_13], $ac0 \n\t"
674 "mthi $zero, $ac0 \n\t"
675 "mtlo %[const_2_power_13], $ac1 \n\t"
676 "mthi $zero, $ac1 \n\t"
677
678 "sub %[load5], %[step2_14], %[step2_13] \n\t"
679 "sub %[load5], %[load5], %[step2_9] \n\t"
680 "add %[load5], %[load5], %[step2_10] \n\t"
681
682 "madd $ac0, %[load5], %[cospi_16_64] \n\t"
683
684 "sub %[load6], %[step2_14], %[step2_13] \n\t"
685 "sub %[load6], %[load6], %[step2_10] \n\t"
686 "add %[load6], %[load6], %[step2_9] \n\t"
687
688 "madd $ac1, %[load6], %[cospi_16_64] \n\t"
689
690 "mtlo %[const_2_power_13], $ac2 \n\t"
691 "mthi $zero, $ac2 \n\t"
692 "mtlo %[const_2_power_13], $ac3 \n\t"
693 "mthi $zero, $ac3 \n\t"
694
695 "sub %[load5], %[step2_15], %[step2_12] \n\t"
696 "sub %[load5], %[load5], %[step2_8] \n\t"
697 "add %[load5], %[load5], %[step2_11] \n\t"
698
699 "madd $ac2, %[load5], %[cospi_16_64] \n\t"
700
701 "sub %[load6], %[step2_15], %[step2_12] \n\t"
702 "sub %[load6], %[load6], %[step2_11] \n\t"
703 "add %[load6], %[load6], %[step2_8] \n\t"
704
705 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
706
707 "extp %[step1_10], $ac0, 31 \n\t"
708 "extp %[step1_13], $ac1, 31 \n\t"
709 "extp %[step1_11], $ac2, 31 \n\t"
710 "extp %[step1_12], $ac3, 31 \n\t"
711
712 : [load5] "=&r" (load5), [load6] "=&r" (load6),
713 [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
714 [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
715 : [const_2_power_13] "r" (const_2_power_13),
716 [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
717 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
718 [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
719 [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
720 [cospi_16_64] "r" (cospi_16_64)
721 );
722
723 step1_8 = step2_8 + step2_11;
724 step1_9 = step2_9 + step2_10;
725 step1_14 = step2_13 + step2_14;
726 step1_15 = step2_12 + step2_15;
727
728 __asm__ __volatile__ (
729 "lbu %[load7], 0(%[dest_pix]) \n\t"
730 "add %[load5], %[step1_0], %[step1_7] \n\t"
731 "add %[load5], %[load5], %[step1_15] \n\t"
732 "addi %[load5], %[load5], 32 \n\t"
733 "sra %[load5], %[load5], 6 \n\t"
734 "add %[load7], %[load7], %[load5] \n\t"
735 "lbux %[load5], %[load7](%[cm]) \n\t"
736 "add %[load6], %[step1_1], %[step1_6] \n\t"
737 "add %[load6], %[load6], %[step1_14] \n\t"
738 "sb %[load5], 0(%[dest_pix]) \n\t"
739 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
740 "lbu %[load8], 0(%[dest_pix]) \n\t"
741 "addi %[load6], %[load6], 32 \n\t"
742 "sra %[load6], %[load6], 6 \n\t"
743 "add %[load8], %[load8], %[load6] \n\t"
744 "lbux %[load6], %[load8](%[cm]) \n\t"
745 "sb %[load6], 0(%[dest_pix]) \n\t"
746 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
747
748 "lbu %[load7], 0(%[dest_pix]) \n\t"
749 "add %[load5], %[step1_2], %[step1_5] \n\t"
750 "add %[load5], %[load5], %[step1_13] \n\t"
751 "addi %[load5], %[load5], 32 \n\t"
752 "sra %[load5], %[load5], 6 \n\t"
753 "add %[load7], %[load7], %[load5] \n\t"
754 "lbux %[load5], %[load7](%[cm]) \n\t"
755 "add %[load6], %[step1_3], %[step1_4] \n\t"
756 "add %[load6], %[load6], %[step1_12] \n\t"
757 "sb %[load5], 0(%[dest_pix]) \n\t"
758 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
759 "lbu %[load8], 0(%[dest_pix]) \n\t"
760 "addi %[load6], %[load6], 32 \n\t"
761 "sra %[load6], %[load6], 6 \n\t"
762 "add %[load8], %[load8], %[load6] \n\t"
763 "lbux %[load6], %[load8](%[cm]) \n\t"
764 "sb %[load6], 0(%[dest_pix]) \n\t"
765 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
766
767 "lbu %[load7], 0(%[dest_pix]) \n\t"
768 "sub %[load5], %[step1_3], %[step1_4] \n\t"
769 "add %[load5], %[load5], %[step1_11] \n\t"
770 "addi %[load5], %[load5], 32 \n\t"
771 "sra %[load5], %[load5], 6 \n\t"
772 "add %[load7], %[load7], %[load5] \n\t"
773 "lbux %[load5], %[load7](%[cm]) \n\t"
774 "sub %[load6], %[step1_2], %[step1_5] \n\t"
775 "add %[load6], %[load6], %[step1_10] \n\t"
776 "sb %[load5], 0(%[dest_pix]) \n\t"
777 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
778 "lbu %[load8], 0(%[dest_pix]) \n\t"
779 "addi %[load6], %[load6], 32 \n\t"
780 "sra %[load6], %[load6], 6 \n\t"
781 "add %[load8], %[load8], %[load6] \n\t"
782 "lbux %[load6], %[load8](%[cm]) \n\t"
783 "sb %[load6], 0(%[dest_pix]) \n\t"
784 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
785
786 "sub %[load5], %[step1_1], %[step1_6] \n\t"
787 "lbu %[load7], 0(%[dest_pix]) \n\t"
788 "add %[load5], %[load5], %[step1_9] \n\t"
789 "addi %[load5], %[load5], 32 \n\t"
790 "sra %[load5], %[load5], 6 \n\t"
791 "add %[load7], %[load7], %[load5] \n\t"
792 "lbux %[load5], %[load7](%[cm]) \n\t"
793 "sub %[load6], %[step1_0], %[step1_7] \n\t"
794 "add %[load6], %[load6], %[step1_8] \n\t"
795 "sb %[load5], 0(%[dest_pix]) \n\t"
796 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
797 "lbu %[load8], 0(%[dest_pix]) \n\t"
798 "addi %[load6], %[load6], 32 \n\t"
799 "sra %[load6], %[load6], 6 \n\t"
800 "add %[load8], %[load8], %[load6] \n\t"
801 "lbux %[load6], %[load8](%[cm]) \n\t"
802 "sb %[load6], 0(%[dest_pix]) \n\t"
803 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
804
805 "lbu %[load7], 0(%[dest_pix]) \n\t"
806 "sub %[load5], %[step1_0], %[step1_7] \n\t"
807 "sub %[load5], %[load5], %[step1_8] \n\t"
808 "addi %[load5], %[load5], 32 \n\t"
809 "sra %[load5], %[load5], 6 \n\t"
810 "add %[load7], %[load7], %[load5] \n\t"
811 "lbux %[load5], %[load7](%[cm]) \n\t"
812 "sub %[load6], %[step1_1], %[step1_6] \n\t"
813 "sub %[load6], %[load6], %[step1_9] \n\t"
814 "sb %[load5], 0(%[dest_pix]) \n\t"
815 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
816 "lbu %[load8], 0(%[dest_pix]) \n\t"
817 "addi %[load6], %[load6], 32 \n\t"
818 "sra %[load6], %[load6], 6 \n\t"
819 "add %[load8], %[load8], %[load6] \n\t"
820 "lbux %[load6], %[load8](%[cm]) \n\t"
821 "sb %[load6], 0(%[dest_pix]) \n\t"
822 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
823
824 "lbu %[load7], 0(%[dest_pix]) \n\t"
825 "sub %[load5], %[step1_2], %[step1_5] \n\t"
826 "sub %[load5], %[load5], %[step1_10] \n\t"
827 "addi %[load5], %[load5], 32 \n\t"
828 "sra %[load5], %[load5], 6 \n\t"
829 "add %[load7], %[load7], %[load5] \n\t"
830 "lbux %[load5], %[load7](%[cm]) \n\t"
831 "sub %[load6], %[step1_3], %[step1_4] \n\t"
832 "sub %[load6], %[load6], %[step1_11] \n\t"
833 "sb %[load5], 0(%[dest_pix]) \n\t"
834 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
835 "lbu %[load8], 0(%[dest_pix]) \n\t"
836 "addi %[load6], %[load6], 32 \n\t"
837 "sra %[load6], %[load6], 6 \n\t"
838 "add %[load8], %[load8], %[load6] \n\t"
839 "lbux %[load6], %[load8](%[cm]) \n\t"
840 "sb %[load6], 0(%[dest_pix]) \n\t"
841 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
842
843 "lbu %[load7], 0(%[dest_pix]) \n\t"
844 "add %[load5], %[step1_3], %[step1_4] \n\t"
845 "sub %[load5], %[load5], %[step1_12] \n\t"
846 "addi %[load5], %[load5], 32 \n\t"
847 "sra %[load5], %[load5], 6 \n\t"
848 "add %[load7], %[load7], %[load5] \n\t"
849 "lbux %[load5], %[load7](%[cm]) \n\t"
850 "add %[load6], %[step1_2], %[step1_5] \n\t"
851 "sub %[load6], %[load6], %[step1_13] \n\t"
852 "sb %[load5], 0(%[dest_pix]) \n\t"
853 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
854 "lbu %[load8], 0(%[dest_pix]) \n\t"
855 "addi %[load6], %[load6], 32 \n\t"
856 "sra %[load6], %[load6], 6 \n\t"
857 "add %[load8], %[load8], %[load6] \n\t"
858 "lbux %[load6], %[load8](%[cm]) \n\t"
859 "sb %[load6], 0(%[dest_pix]) \n\t"
860 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
861
862 "lbu %[load7], 0(%[dest_pix]) \n\t"
863 "add %[load5], %[step1_1], %[step1_6] \n\t"
864 "sub %[load5], %[load5], %[step1_14] \n\t"
865 "addi %[load5], %[load5], 32 \n\t"
866 "sra %[load5], %[load5], 6 \n\t"
867 "add %[load7], %[load7], %[load5] \n\t"
868 "lbux %[load5], %[load7](%[cm]) \n\t"
869 "add %[load6], %[step1_0], %[step1_7] \n\t"
870 "sub %[load6], %[load6], %[step1_15] \n\t"
871 "sb %[load5], 0(%[dest_pix]) \n\t"
872 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
873 "lbu %[load8], 0(%[dest_pix]) \n\t"
874 "addi %[load6], %[load6], 32 \n\t"
875 "sra %[load6], %[load6], 6 \n\t"
876 "add %[load8], %[load8], %[load6] \n\t"
877 "lbux %[load6], %[load8](%[cm]) \n\t"
878 "sb %[load6], 0(%[dest_pix]) \n\t"
879
880 : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
881 [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
882 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
883 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
884 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
885 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
886 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
887 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
888 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
889 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
890 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
891 );
892
893 input += 16;
894 }
895 }
896
897 void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
898 int dest_stride) {
899 DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
900 uint32_t pos = 45;
901
902 /* bit positon for extract from acc */
903 __asm__ __volatile__ (
904 "wrdsp %[pos], 1 \n\t"
905 :
906 : [pos] "r" (pos)
907 );
908
909 // First transform rows
910 idct16_rows_dspr2(input, out, 16);
911
912 // Then transform columns and add to dest
913 idct16_cols_add_blk_dspr2(out, dest, dest_stride);
914 }
915
916 static void iadst16(const int16_t *input, int16_t *output) {
917 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
918
919 int x0 = input[15];
920 int x1 = input[0];
921 int x2 = input[13];
922 int x3 = input[2];
923 int x4 = input[11];
924 int x5 = input[4];
925 int x6 = input[9];
926 int x7 = input[6];
927 int x8 = input[7];
928 int x9 = input[8];
929 int x10 = input[5];
930 int x11 = input[10];
931 int x12 = input[3];
932 int x13 = input[12];
933 int x14 = input[1];
934 int x15 = input[14];
935
936 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
937 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
938 output[0] = output[1] = output[2] = output[3] = output[4]
939 = output[5] = output[6] = output[7] = output[8]
940 = output[9] = output[10] = output[11] = output[12]
941 = output[13] = output[14] = output[15] = 0;
942 return;
943 }
944
945 // stage 1
946 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
947 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
948 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
949 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
950 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
951 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
952 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
953 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
954 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
955 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
956 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
957 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
958 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
959 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
960 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
961 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
962
963 x0 = dct_const_round_shift(s0 + s8);
964 x1 = dct_const_round_shift(s1 + s9);
965 x2 = dct_const_round_shift(s2 + s10);
966 x3 = dct_const_round_shift(s3 + s11);
967 x4 = dct_const_round_shift(s4 + s12);
968 x5 = dct_const_round_shift(s5 + s13);
969 x6 = dct_const_round_shift(s6 + s14);
970 x7 = dct_const_round_shift(s7 + s15);
971 x8 = dct_const_round_shift(s0 - s8);
972 x9 = dct_const_round_shift(s1 - s9);
973 x10 = dct_const_round_shift(s2 - s10);
974 x11 = dct_const_round_shift(s3 - s11);
975 x12 = dct_const_round_shift(s4 - s12);
976 x13 = dct_const_round_shift(s5 - s13);
977 x14 = dct_const_round_shift(s6 - s14);
978 x15 = dct_const_round_shift(s7 - s15);
979
980 // stage 2
981 s0 = x0;
982 s1 = x1;
983 s2 = x2;
984 s3 = x3;
985 s4 = x4;
986 s5 = x5;
987 s6 = x6;
988 s7 = x7;
989 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
990 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
991 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
992 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
993 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
994 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
995 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
996 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
997
998 x0 = s0 + s4;
999 x1 = s1 + s5;
1000 x2 = s2 + s6;
1001 x3 = s3 + s7;
1002 x4 = s0 - s4;
1003 x5 = s1 - s5;
1004 x6 = s2 - s6;
1005 x7 = s3 - s7;
1006 x8 = dct_const_round_shift(s8 + s12);
1007 x9 = dct_const_round_shift(s9 + s13);
1008 x10 = dct_const_round_shift(s10 + s14);
1009 x11 = dct_const_round_shift(s11 + s15);
1010 x12 = dct_const_round_shift(s8 - s12);
1011 x13 = dct_const_round_shift(s9 - s13);
1012 x14 = dct_const_round_shift(s10 - s14);
1013 x15 = dct_const_round_shift(s11 - s15);
1014
1015 // stage 3
1016 s0 = x0;
1017 s1 = x1;
1018 s2 = x2;
1019 s3 = x3;
1020 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
1021 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
1022 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
1023 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
1024 s8 = x8;
1025 s9 = x9;
1026 s10 = x10;
1027 s11 = x11;
1028 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
1029 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
1030 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
1031 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
1032
1033 x0 = s0 + s2;
1034 x1 = s1 + s3;
1035 x2 = s0 - s2;
1036 x3 = s1 - s3;
1037 x4 = dct_const_round_shift(s4 + s6);
1038 x5 = dct_const_round_shift(s5 + s7);
1039 x6 = dct_const_round_shift(s4 - s6);
1040 x7 = dct_const_round_shift(s5 - s7);
1041 x8 = s8 + s10;
1042 x9 = s9 + s11;
1043 x10 = s8 - s10;
1044 x11 = s9 - s11;
1045 x12 = dct_const_round_shift(s12 + s14);
1046 x13 = dct_const_round_shift(s13 + s15);
1047 x14 = dct_const_round_shift(s12 - s14);
1048 x15 = dct_const_round_shift(s13 - s15);
1049
1050 // stage 4
1051 s2 = (- cospi_16_64) * (x2 + x3);
1052 s3 = cospi_16_64 * (x2 - x3);
1053 s6 = cospi_16_64 * (x6 + x7);
1054 s7 = cospi_16_64 * (- x6 + x7);
1055 s10 = cospi_16_64 * (x10 + x11);
1056 s11 = cospi_16_64 * (- x10 + x11);
1057 s14 = (- cospi_16_64) * (x14 + x15);
1058 s15 = cospi_16_64 * (x14 - x15);
1059
1060 x2 = dct_const_round_shift(s2);
1061 x3 = dct_const_round_shift(s3);
1062 x6 = dct_const_round_shift(s6);
1063 x7 = dct_const_round_shift(s7);
1064 x10 = dct_const_round_shift(s10);
1065 x11 = dct_const_round_shift(s11);
1066 x14 = dct_const_round_shift(s14);
1067 x15 = dct_const_round_shift(s15);
1068
1069 output[0] = x0;
1070 output[1] = -x8;
1071 output[2] = x12;
1072 output[3] = -x4;
1073 output[4] = x6;
1074 output[5] = x14;
1075 output[6] = x10;
1076 output[7] = x2;
1077 output[8] = x3;
1078 output[9] = x11;
1079 output[10] = x15;
1080 output[11] = x7;
1081 output[12] = x5;
1082 output[13] = -x13;
1083 output[14] = x9;
1084 output[15] = -x1;
1085 }
1086
1087 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, 24 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
1088 int pitch, int tx_type) { 25 int pitch, int tx_type) {
1089 int i, j; 26 int i, j;
1090 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); 27 DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
1091 int16_t *outptr = out; 28 int16_t *outptr = out;
1092 int16_t temp_out[16]; 29 int16_t temp_out[16];
1093 uint32_t pos = 45; 30 uint32_t pos = 45;
1094 31
1095 /* bit positon for extract from acc */ 32 /* bit positon for extract from acc */
1096 __asm__ __volatile__ ( 33 __asm__ __volatile__ (
1097 "wrdsp %[pos], 1 \n\t" 34 "wrdsp %[pos], 1 \n\t"
1098 : 35 :
1099 : [pos] "r" (pos) 36 : [pos] "r" (pos)
1100 ); 37 );
1101 38
1102 switch (tx_type) { 39 switch (tx_type) {
1103 case DCT_DCT: // DCT in both horizontal and vertical 40 case DCT_DCT: // DCT in both horizontal and vertical
1104 idct16_rows_dspr2(input, outptr, 16); 41 idct16_rows_dspr2(input, outptr, 16);
1105 idct16_cols_add_blk_dspr2(out, dest, pitch); 42 idct16_cols_add_blk_dspr2(out, dest, pitch);
1106 break; 43 break;
1107 case ADST_DCT: // ADST in vertical, DCT in horizontal 44 case ADST_DCT: // ADST in vertical, DCT in horizontal
1108 idct16_rows_dspr2(input, outptr, 16); 45 idct16_rows_dspr2(input, outptr, 16);
1109 46
1110 outptr = out; 47 outptr = out;
1111 48
1112 for (i = 0; i < 16; ++i) { 49 for (i = 0; i < 16; ++i) {
1113 iadst16(outptr, temp_out); 50 iadst16_dspr2(outptr, temp_out);
1114 51
1115 for (j = 0; j < 16; ++j) 52 for (j = 0; j < 16; ++j)
1116 dest[j * pitch + i] = 53 dest[j * pitch + i] =
1117 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 54 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
1118 + dest[j * pitch + i]); 55 + dest[j * pitch + i]);
1119 outptr += 16; 56 outptr += 16;
1120 } 57 }
1121 break; 58 break;
1122 case DCT_ADST: // DCT in vertical, ADST in horizontal 59 case DCT_ADST: // DCT in vertical, ADST in horizontal
1123 { 60 {
1124 int16_t temp_in[16 * 16]; 61 int16_t temp_in[16 * 16];
1125 62
1126 for (i = 0; i < 16; ++i) { 63 for (i = 0; i < 16; ++i) {
1127 /* prefetch row */ 64 /* prefetch row */
1128 prefetch_load((const uint8_t *)(input + 16)); 65 prefetch_load((const uint8_t *)(input + 16));
1129 66
1130 iadst16(input, outptr); 67 iadst16_dspr2(input, outptr);
1131 input += 16; 68 input += 16;
1132 outptr += 16; 69 outptr += 16;
1133 } 70 }
1134 71
1135 for (i = 0; i < 16; ++i) 72 for (i = 0; i < 16; ++i)
1136 for (j = 0; j < 16; ++j) 73 for (j = 0; j < 16; ++j)
1137 temp_in[j * 16 + i] = out[i * 16 + j]; 74 temp_in[j * 16 + i] = out[i * 16 + j];
1138 75
1139 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); 76 idct16_cols_add_blk_dspr2(temp_in, dest, pitch);
1140 } 77 }
1141 break; 78 break;
1142 case ADST_ADST: // ADST in both directions 79 case ADST_ADST: // ADST in both directions
1143 { 80 {
1144 int16_t temp_in[16]; 81 int16_t temp_in[16];
1145 82
1146 for (i = 0; i < 16; ++i) { 83 for (i = 0; i < 16; ++i) {
1147 /* prefetch row */ 84 /* prefetch row */
1148 prefetch_load((const uint8_t *)(input + 16)); 85 prefetch_load((const uint8_t *)(input + 16));
1149 86
1150 iadst16(input, outptr); 87 iadst16_dspr2(input, outptr);
1151 input += 16; 88 input += 16;
1152 outptr += 16; 89 outptr += 16;
1153 } 90 }
1154 91
1155 for (i = 0; i < 16; ++i) { 92 for (i = 0; i < 16; ++i) {
1156 for (j = 0; j < 16; ++j) 93 for (j = 0; j < 16; ++j)
1157 temp_in[j] = out[j * 16 + i]; 94 temp_in[j] = out[j * 16 + i];
1158 iadst16(temp_in, temp_out); 95 iadst16_dspr2(temp_in, temp_out);
1159 for (j = 0; j < 16; ++j) 96 for (j = 0; j < 16; ++j)
1160 dest[j * pitch + i] = 97 dest[j * pitch + i] =
1161 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 98 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
1162 + dest[j * pitch + i]); 99 + dest[j * pitch + i]);
1163 } 100 }
1164 } 101 }
1165 break; 102 break;
1166 default: 103 default:
1167 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); 104 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n");
1168 break; 105 break;
1169 } 106 }
1170 } 107 }
1171
1172 void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
1173 int dest_stride) {
1174 DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
1175 int16_t *outptr = out;
1176 uint32_t i;
1177 uint32_t pos = 45;
1178
1179 /* bit positon for extract from acc */
1180 __asm__ __volatile__ (
1181 "wrdsp %[pos], 1 \n\t"
1182 :
1183 : [pos] "r" (pos)
1184 );
1185
1186 // First transform rows. Since all non-zero dct coefficients are in
1187 // upper-left 4x4 area, we only need to calculate first 4 rows here.
1188 idct16_rows_dspr2(input, outptr, 4);
1189
1190 outptr += 4;
1191 for (i = 0; i < 6; ++i) {
1192 __asm__ __volatile__ (
1193 "sw $zero, 0(%[outptr]) \n\t"
1194 "sw $zero, 32(%[outptr]) \n\t"
1195 "sw $zero, 64(%[outptr]) \n\t"
1196 "sw $zero, 96(%[outptr]) \n\t"
1197 "sw $zero, 128(%[outptr]) \n\t"
1198 "sw $zero, 160(%[outptr]) \n\t"
1199 "sw $zero, 192(%[outptr]) \n\t"
1200 "sw $zero, 224(%[outptr]) \n\t"
1201 "sw $zero, 256(%[outptr]) \n\t"
1202 "sw $zero, 288(%[outptr]) \n\t"
1203 "sw $zero, 320(%[outptr]) \n\t"
1204 "sw $zero, 352(%[outptr]) \n\t"
1205 "sw $zero, 384(%[outptr]) \n\t"
1206 "sw $zero, 416(%[outptr]) \n\t"
1207 "sw $zero, 448(%[outptr]) \n\t"
1208 "sw $zero, 480(%[outptr]) \n\t"
1209
1210 :
1211 : [outptr] "r" (outptr)
1212 );
1213
1214 outptr += 2;
1215 }
1216
1217 // Then transform columns
1218 idct16_cols_add_blk_dspr2(out, dest, dest_stride);
1219 }
1220
/*
 * 16x16 inverse DCT with add for a block that only has a DC coefficient.
 *
 * The DC term input[0] is scaled, rounded ((x + 32) >> 6) and the resulting
 * constant a1 is added to (or, when negative, its magnitude subtracted from)
 * all 16x16 destination pixels using saturating quad-byte operations.
 *
 * input:       coefficient block; only input[0] is read.
 * dest:        destination pixels, read and written 16 bytes per row with
 *              word accesses (the original notes say the memory is four-byte
 *              aligned).
 * dest_stride: distance in bytes between destination rows.
 */
void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"

      :
      : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

  /* a1 = (out + 32) >> 6 */
  __asm__ __volatile__ (
      "addi     %[out],     %[out],     32      \n\t"
      "sra      %[a1],      %[out],     6       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  /* NOTE(review): replv.qb replicates only the low byte of its source, so
   * both branches presumably rely on |a1| fitting in 8 bits -- confirm the
   * range of a1 for extreme DC values. */
  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "abs        %[absa1],       %[a1]       \n\t"
        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    /* Negative DC: subtract |a1| from every pixel, one 16-byte row per
     * iteration. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"
          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          /* The pixels behind dest are read and written but are not
           * operands, so memory has to be declared as clobbered. */
          : "memory"
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1],   %[a1]   \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    /* Non-negative DC: add a1 to every pixel, one 16-byte row per
     * iteration. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"
          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          /* The pixels behind dest are read and written but are not
           * operands, so memory has to be declared as clobbered. */
          : "memory"
      );
    }
  }
}
1317 #endif // #if HAVE_DSPR2 108 #endif // #if HAVE_DSPR2
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.c ('k') | source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698