OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 #include <assert.h> | |
12 #include <stdio.h> | |
13 | |
14 #include "./vpx_config.h" | |
15 #include "./vp9_rtcd.h" | |
16 #include "vp9/common/vp9_common.h" | |
17 #include "vp9/common/vp9_blockd.h" | |
18 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | |
19 #include "vpx_dsp/txfm_common.h" | |
20 | |
21 #if HAVE_DSPR2 | |
22 static void idct32_rows_dspr2(const int16_t *input, int16_t *output, | |
23 uint32_t no_rows) { | |
24 int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; | |
25 int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; | |
26 int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; | |
27 int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; | |
28 int16_t step1_28, step1_29, step1_30, step1_31; | |
29 int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; | |
30 int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; | |
31 int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; | |
32 int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; | |
33 int16_t step2_28, step2_29, step2_30, step2_31; | |
34 int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; | |
35 int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; | |
36 int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; | |
37 int16_t step3_29, step3_30, step3_31; | |
38 int temp0, temp1, temp2, temp3; | |
39 int load1, load2, load3, load4; | |
40 int result1, result2; | |
41 int temp21; | |
42 int i; | |
43 const int const_2_power_13 = 8192; | |
44 const int32_t *input_int; | |
45 | |
46 for (i = no_rows; i--; ) { | |
47 input_int = (const int32_t *)input; | |
48 | |
49 if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | | |
50 input_int[4] | input_int[5] | input_int[6] | input_int[7] | | |
51 input_int[8] | input_int[9] | input_int[10] | input_int[11] | | |
52 input_int[12] | input_int[13] | input_int[14] | input_int[15])) { | |
53 input += 32; | |
54 | |
55 __asm__ __volatile__ ( | |
56 "sh $zero, 0(%[output]) \n\t" | |
57 "sh $zero, 64(%[output]) \n\t" | |
58 "sh $zero, 128(%[output]) \n\t" | |
59 "sh $zero, 192(%[output]) \n\t" | |
60 "sh $zero, 256(%[output]) \n\t" | |
61 "sh $zero, 320(%[output]) \n\t" | |
62 "sh $zero, 384(%[output]) \n\t" | |
63 "sh $zero, 448(%[output]) \n\t" | |
64 "sh $zero, 512(%[output]) \n\t" | |
65 "sh $zero, 576(%[output]) \n\t" | |
66 "sh $zero, 640(%[output]) \n\t" | |
67 "sh $zero, 704(%[output]) \n\t" | |
68 "sh $zero, 768(%[output]) \n\t" | |
69 "sh $zero, 832(%[output]) \n\t" | |
70 "sh $zero, 896(%[output]) \n\t" | |
71 "sh $zero, 960(%[output]) \n\t" | |
72 "sh $zero, 1024(%[output]) \n\t" | |
73 "sh $zero, 1088(%[output]) \n\t" | |
74 "sh $zero, 1152(%[output]) \n\t" | |
75 "sh $zero, 1216(%[output]) \n\t" | |
76 "sh $zero, 1280(%[output]) \n\t" | |
77 "sh $zero, 1344(%[output]) \n\t" | |
78 "sh $zero, 1408(%[output]) \n\t" | |
79 "sh $zero, 1472(%[output]) \n\t" | |
80 "sh $zero, 1536(%[output]) \n\t" | |
81 "sh $zero, 1600(%[output]) \n\t" | |
82 "sh $zero, 1664(%[output]) \n\t" | |
83 "sh $zero, 1728(%[output]) \n\t" | |
84 "sh $zero, 1792(%[output]) \n\t" | |
85 "sh $zero, 1856(%[output]) \n\t" | |
86 "sh $zero, 1920(%[output]) \n\t" | |
87 "sh $zero, 1984(%[output]) \n\t" | |
88 | |
89 : | |
90 : [output] "r" (output) | |
91 ); | |
92 | |
93 output += 1; | |
94 | |
95 continue; | |
96 } | |
97 | |
98 /* prefetch row */ | |
99 prefetch_load((const uint8_t *)(input + 32)); | |
100 prefetch_load((const uint8_t *)(input + 48)); | |
101 | |
102 __asm__ __volatile__ ( | |
103 "lh %[load1], 2(%[input]) \n\t" | |
104 "lh %[load2], 62(%[input]) \n\t" | |
105 "lh %[load3], 34(%[input]) \n\t" | |
106 "lh %[load4], 30(%[input]) \n\t" | |
107 | |
108 "mtlo %[const_2_power_13], $ac1 \n\t" | |
109 "mthi $zero, $ac1 \n\t" | |
110 "mtlo %[const_2_power_13], $ac3 \n\t" | |
111 "mthi $zero, $ac3 \n\t" | |
112 | |
113 "madd $ac1, %[load1], %[cospi_31_64] \n\t" | |
114 "msub $ac1, %[load2], %[cospi_1_64] \n\t" | |
115 "extp %[temp0], $ac1, 31 \n\t" | |
116 | |
117 "madd $ac3, %[load1], %[cospi_1_64] \n\t" | |
118 "madd $ac3, %[load2], %[cospi_31_64] \n\t" | |
119 "extp %[temp3], $ac3, 31 \n\t" | |
120 | |
121 "mtlo %[const_2_power_13], $ac1 \n\t" | |
122 "mthi $zero, $ac1 \n\t" | |
123 "mtlo %[const_2_power_13], $ac2 \n\t" | |
124 "mthi $zero, $ac2 \n\t" | |
125 | |
126 "madd $ac2, %[load3], %[cospi_15_64] \n\t" | |
127 "msub $ac2, %[load4], %[cospi_17_64] \n\t" | |
128 "extp %[temp1], $ac2, 31 \n\t" | |
129 | |
130 "madd $ac1, %[load3], %[cospi_17_64] \n\t" | |
131 "madd $ac1, %[load4], %[cospi_15_64] \n\t" | |
132 "extp %[temp2], $ac1, 31 \n\t" | |
133 | |
134 "mtlo %[const_2_power_13], $ac1 \n\t" | |
135 "mthi $zero, $ac1 \n\t" | |
136 "mtlo %[const_2_power_13], $ac3 \n\t" | |
137 "mthi $zero, $ac3 \n\t" | |
138 | |
139 "sub %[load1], %[temp3], %[temp2] \n\t" | |
140 "sub %[load2], %[temp0], %[temp1] \n\t" | |
141 | |
142 "madd $ac1, %[load1], %[cospi_28_64] \n\t" | |
143 "msub $ac1, %[load2], %[cospi_4_64] \n\t" | |
144 "madd $ac3, %[load1], %[cospi_4_64] \n\t" | |
145 "madd $ac3, %[load2], %[cospi_28_64] \n\t" | |
146 | |
147 "extp %[step1_17], $ac1, 31 \n\t" | |
148 "extp %[step1_30], $ac3, 31 \n\t" | |
149 "add %[step1_16], %[temp0], %[temp1] \n\t" | |
150 "add %[step1_31], %[temp2], %[temp3] \n\t" | |
151 | |
152 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
153 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
154 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
155 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
156 [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), | |
157 [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) | |
158 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
159 [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), | |
160 [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), | |
161 [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) | |
162 ); | |
163 | |
164 __asm__ __volatile__ ( | |
165 "lh %[load1], 18(%[input]) \n\t" | |
166 "lh %[load2], 46(%[input]) \n\t" | |
167 "lh %[load3], 50(%[input]) \n\t" | |
168 "lh %[load4], 14(%[input]) \n\t" | |
169 | |
170 "mtlo %[const_2_power_13], $ac1 \n\t" | |
171 "mthi $zero, $ac1 \n\t" | |
172 "mtlo %[const_2_power_13], $ac3 \n\t" | |
173 "mthi $zero, $ac3 \n\t" | |
174 | |
175 "madd $ac1, %[load1], %[cospi_23_64] \n\t" | |
176 "msub $ac1, %[load2], %[cospi_9_64] \n\t" | |
177 "extp %[temp0], $ac1, 31 \n\t" | |
178 | |
179 "madd $ac3, %[load1], %[cospi_9_64] \n\t" | |
180 "madd $ac3, %[load2], %[cospi_23_64] \n\t" | |
181 "extp %[temp3], $ac3, 31 \n\t" | |
182 | |
183 "mtlo %[const_2_power_13], $ac1 \n\t" | |
184 "mthi $zero, $ac1 \n\t" | |
185 "mtlo %[const_2_power_13], $ac2 \n\t" | |
186 "mthi $zero, $ac2 \n\t" | |
187 | |
188 "madd $ac2, %[load3], %[cospi_7_64] \n\t" | |
189 "msub $ac2, %[load4], %[cospi_25_64] \n\t" | |
190 "extp %[temp1], $ac2, 31 \n\t" | |
191 | |
192 "madd $ac1, %[load3], %[cospi_25_64] \n\t" | |
193 "madd $ac1, %[load4], %[cospi_7_64] \n\t" | |
194 "extp %[temp2], $ac1, 31 \n\t" | |
195 | |
196 "mtlo %[const_2_power_13], $ac1 \n\t" | |
197 "mthi $zero, $ac1 \n\t" | |
198 "mtlo %[const_2_power_13], $ac3 \n\t" | |
199 "mthi $zero, $ac3 \n\t" | |
200 | |
201 "sub %[load1], %[temp1], %[temp0] \n\t" | |
202 "sub %[load2], %[temp2], %[temp3] \n\t" | |
203 | |
204 "msub $ac1, %[load1], %[cospi_28_64] \n\t" | |
205 "msub $ac1, %[load2], %[cospi_4_64] \n\t" | |
206 "msub $ac3, %[load1], %[cospi_4_64] \n\t" | |
207 "madd $ac3, %[load2], %[cospi_28_64] \n\t" | |
208 | |
209 "extp %[step1_18], $ac1, 31 \n\t" | |
210 "extp %[step1_29], $ac3, 31 \n\t" | |
211 "add %[step1_19], %[temp0], %[temp1] \n\t" | |
212 "add %[step1_28], %[temp2], %[temp3] \n\t" | |
213 | |
214 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
215 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
216 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
217 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
218 [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), | |
219 [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) | |
220 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
221 [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), | |
222 [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), | |
223 [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) | |
224 ); | |
225 | |
226 __asm__ __volatile__ ( | |
227 "lh %[load1], 10(%[input]) \n\t" | |
228 "lh %[load2], 54(%[input]) \n\t" | |
229 "lh %[load3], 42(%[input]) \n\t" | |
230 "lh %[load4], 22(%[input]) \n\t" | |
231 | |
232 "mtlo %[const_2_power_13], $ac1 \n\t" | |
233 "mthi $zero, $ac1 \n\t" | |
234 "mtlo %[const_2_power_13], $ac3 \n\t" | |
235 "mthi $zero, $ac3 \n\t" | |
236 | |
237 "madd $ac1, %[load1], %[cospi_27_64] \n\t" | |
238 "msub $ac1, %[load2], %[cospi_5_64] \n\t" | |
239 "extp %[temp0], $ac1, 31 \n\t" | |
240 | |
241 "madd $ac3, %[load1], %[cospi_5_64] \n\t" | |
242 "madd $ac3, %[load2], %[cospi_27_64] \n\t" | |
243 "extp %[temp3], $ac3, 31 \n\t" | |
244 | |
245 "mtlo %[const_2_power_13], $ac1 \n\t" | |
246 "mthi $zero, $ac1 \n\t" | |
247 "mtlo %[const_2_power_13], $ac2 \n\t" | |
248 "mthi $zero, $ac2 \n\t" | |
249 | |
250 "madd $ac2, %[load3], %[cospi_11_64] \n\t" | |
251 "msub $ac2, %[load4], %[cospi_21_64] \n\t" | |
252 "extp %[temp1], $ac2, 31 \n\t" | |
253 | |
254 "madd $ac1, %[load3], %[cospi_21_64] \n\t" | |
255 "madd $ac1, %[load4], %[cospi_11_64] \n\t" | |
256 "extp %[temp2], $ac1, 31 \n\t" | |
257 | |
258 "mtlo %[const_2_power_13], $ac1 \n\t" | |
259 "mthi $zero, $ac1 \n\t" | |
260 "mtlo %[const_2_power_13], $ac3 \n\t" | |
261 "mthi $zero, $ac3 \n\t" | |
262 | |
263 "sub %[load1], %[temp0], %[temp1] \n\t" | |
264 "sub %[load2], %[temp3], %[temp2] \n\t" | |
265 | |
266 "madd $ac1, %[load2], %[cospi_12_64] \n\t" | |
267 "msub $ac1, %[load1], %[cospi_20_64] \n\t" | |
268 "madd $ac3, %[load1], %[cospi_12_64] \n\t" | |
269 "madd $ac3, %[load2], %[cospi_20_64] \n\t" | |
270 | |
271 "extp %[step1_21], $ac1, 31 \n\t" | |
272 "extp %[step1_26], $ac3, 31 \n\t" | |
273 "add %[step1_20], %[temp0], %[temp1] \n\t" | |
274 "add %[step1_27], %[temp2], %[temp3] \n\t" | |
275 | |
276 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
277 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
278 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
279 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
280 [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), | |
281 [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) | |
282 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
283 [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), | |
284 [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), | |
285 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) | |
286 ); | |
287 | |
288 __asm__ __volatile__ ( | |
289 "lh %[load1], 26(%[input]) \n\t" | |
290 "lh %[load2], 38(%[input]) \n\t" | |
291 "lh %[load3], 58(%[input]) \n\t" | |
292 "lh %[load4], 6(%[input]) \n\t" | |
293 | |
294 "mtlo %[const_2_power_13], $ac1 \n\t" | |
295 "mthi $zero, $ac1 \n\t" | |
296 "mtlo %[const_2_power_13], $ac3 \n\t" | |
297 "mthi $zero, $ac3 \n\t" | |
298 | |
299 "madd $ac1, %[load1], %[cospi_19_64] \n\t" | |
300 "msub $ac1, %[load2], %[cospi_13_64] \n\t" | |
301 "extp %[temp0], $ac1, 31 \n\t" | |
302 | |
303 "madd $ac3, %[load1], %[cospi_13_64] \n\t" | |
304 "madd $ac3, %[load2], %[cospi_19_64] \n\t" | |
305 "extp %[temp3], $ac3, 31 \n\t" | |
306 | |
307 "mtlo %[const_2_power_13], $ac1 \n\t" | |
308 "mthi $zero, $ac1 \n\t" | |
309 "mtlo %[const_2_power_13], $ac2 \n\t" | |
310 "mthi $zero, $ac2 \n\t" | |
311 | |
312 "madd $ac2, %[load3], %[cospi_3_64] \n\t" | |
313 "msub $ac2, %[load4], %[cospi_29_64] \n\t" | |
314 "extp %[temp1], $ac2, 31 \n\t" | |
315 | |
316 "madd $ac1, %[load3], %[cospi_29_64] \n\t" | |
317 "madd $ac1, %[load4], %[cospi_3_64] \n\t" | |
318 "extp %[temp2], $ac1, 31 \n\t" | |
319 | |
320 "mtlo %[const_2_power_13], $ac1 \n\t" | |
321 "mthi $zero, $ac1 \n\t" | |
322 "mtlo %[const_2_power_13], $ac3 \n\t" | |
323 "mthi $zero, $ac3 \n\t" | |
324 | |
325 "sub %[load1], %[temp1], %[temp0] \n\t" | |
326 "sub %[load2], %[temp2], %[temp3] \n\t" | |
327 | |
328 "msub $ac1, %[load1], %[cospi_12_64] \n\t" | |
329 "msub $ac1, %[load2], %[cospi_20_64] \n\t" | |
330 "msub $ac3, %[load1], %[cospi_20_64] \n\t" | |
331 "madd $ac3, %[load2], %[cospi_12_64] \n\t" | |
332 | |
333 "extp %[step1_22], $ac1, 31 \n\t" | |
334 "extp %[step1_25], $ac3, 31 \n\t" | |
335 "add %[step1_23], %[temp0], %[temp1] \n\t" | |
336 "add %[step1_24], %[temp2], %[temp3] \n\t" | |
337 | |
338 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
339 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
340 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
341 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
342 [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), | |
343 [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) | |
344 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
345 [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), | |
346 [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), | |
347 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) | |
348 ); | |
349 | |
350 __asm__ __volatile__ ( | |
351 "lh %[load1], 4(%[input]) \n\t" | |
352 "lh %[load2], 60(%[input]) \n\t" | |
353 "lh %[load3], 36(%[input]) \n\t" | |
354 "lh %[load4], 28(%[input]) \n\t" | |
355 | |
356 "mtlo %[const_2_power_13], $ac1 \n\t" | |
357 "mthi $zero, $ac1 \n\t" | |
358 "mtlo %[const_2_power_13], $ac3 \n\t" | |
359 "mthi $zero, $ac3 \n\t" | |
360 | |
361 "madd $ac1, %[load1], %[cospi_30_64] \n\t" | |
362 "msub $ac1, %[load2], %[cospi_2_64] \n\t" | |
363 "extp %[temp0], $ac1, 31 \n\t" | |
364 | |
365 "madd $ac3, %[load1], %[cospi_2_64] \n\t" | |
366 "madd $ac3, %[load2], %[cospi_30_64] \n\t" | |
367 "extp %[temp3], $ac3, 31 \n\t" | |
368 | |
369 "mtlo %[const_2_power_13], $ac1 \n\t" | |
370 "mthi $zero, $ac1 \n\t" | |
371 "mtlo %[const_2_power_13], $ac2 \n\t" | |
372 "mthi $zero, $ac2 \n\t" | |
373 | |
374 "madd $ac2, %[load3], %[cospi_14_64] \n\t" | |
375 "msub $ac2, %[load4], %[cospi_18_64] \n\t" | |
376 "extp %[temp1], $ac2, 31 \n\t" | |
377 | |
378 "madd $ac1, %[load3], %[cospi_18_64] \n\t" | |
379 "madd $ac1, %[load4], %[cospi_14_64] \n\t" | |
380 "extp %[temp2], $ac1, 31 \n\t" | |
381 | |
382 "mtlo %[const_2_power_13], $ac1 \n\t" | |
383 "mthi $zero, $ac1 \n\t" | |
384 "mtlo %[const_2_power_13], $ac3 \n\t" | |
385 "mthi $zero, $ac3 \n\t" | |
386 | |
387 "sub %[load1], %[temp0], %[temp1] \n\t" | |
388 "sub %[load2], %[temp3], %[temp2] \n\t" | |
389 | |
390 "msub $ac1, %[load1], %[cospi_8_64] \n\t" | |
391 "madd $ac1, %[load2], %[cospi_24_64] \n\t" | |
392 "madd $ac3, %[load1], %[cospi_24_64] \n\t" | |
393 "madd $ac3, %[load2], %[cospi_8_64] \n\t" | |
394 | |
395 "extp %[step2_9], $ac1, 31 \n\t" | |
396 "extp %[step2_14], $ac3, 31 \n\t" | |
397 "add %[step2_8], %[temp0], %[temp1] \n\t" | |
398 "add %[step2_15], %[temp2], %[temp3] \n\t" | |
399 | |
400 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
401 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
402 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
403 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
404 [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), | |
405 [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) | |
406 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
407 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), | |
408 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), | |
409 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) | |
410 ); | |
411 | |
412 __asm__ __volatile__ ( | |
413 "lh %[load1], 20(%[input]) \n\t" | |
414 "lh %[load2], 44(%[input]) \n\t" | |
415 "lh %[load3], 52(%[input]) \n\t" | |
416 "lh %[load4], 12(%[input]) \n\t" | |
417 | |
418 "mtlo %[const_2_power_13], $ac1 \n\t" | |
419 "mthi $zero, $ac1 \n\t" | |
420 "mtlo %[const_2_power_13], $ac3 \n\t" | |
421 "mthi $zero, $ac3 \n\t" | |
422 | |
423 "madd $ac1, %[load1], %[cospi_22_64] \n\t" | |
424 "msub $ac1, %[load2], %[cospi_10_64] \n\t" | |
425 "extp %[temp0], $ac1, 31 \n\t" | |
426 | |
427 "madd $ac3, %[load1], %[cospi_10_64] \n\t" | |
428 "madd $ac3, %[load2], %[cospi_22_64] \n\t" | |
429 "extp %[temp3], $ac3, 31 \n\t" | |
430 | |
431 "mtlo %[const_2_power_13], $ac1 \n\t" | |
432 "mthi $zero, $ac1 \n\t" | |
433 "mtlo %[const_2_power_13], $ac2 \n\t" | |
434 "mthi $zero, $ac2 \n\t" | |
435 | |
436 "madd $ac2, %[load3], %[cospi_6_64] \n\t" | |
437 "msub $ac2, %[load4], %[cospi_26_64] \n\t" | |
438 "extp %[temp1], $ac2, 31 \n\t" | |
439 | |
440 "madd $ac1, %[load3], %[cospi_26_64] \n\t" | |
441 "madd $ac1, %[load4], %[cospi_6_64] \n\t" | |
442 "extp %[temp2], $ac1, 31 \n\t" | |
443 | |
444 "mtlo %[const_2_power_13], $ac1 \n\t" | |
445 "mthi $zero, $ac1 \n\t" | |
446 "mtlo %[const_2_power_13], $ac3 \n\t" | |
447 "mthi $zero, $ac3 \n\t" | |
448 | |
449 "sub %[load1], %[temp1], %[temp0] \n\t" | |
450 "sub %[load2], %[temp2], %[temp3] \n\t" | |
451 | |
452 "msub $ac1, %[load1], %[cospi_24_64] \n\t" | |
453 "msub $ac1, %[load2], %[cospi_8_64] \n\t" | |
454 "madd $ac3, %[load2], %[cospi_24_64] \n\t" | |
455 "msub $ac3, %[load1], %[cospi_8_64] \n\t" | |
456 | |
457 "extp %[step2_10], $ac1, 31 \n\t" | |
458 "extp %[step2_13], $ac3, 31 \n\t" | |
459 "add %[step2_11], %[temp0], %[temp1] \n\t" | |
460 "add %[step2_12], %[temp2], %[temp3] \n\t" | |
461 | |
462 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
463 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
464 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
465 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
466 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), | |
467 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) | |
468 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
469 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), | |
470 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), | |
471 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) | |
472 ); | |
473 | |
474 __asm__ __volatile__ ( | |
475 "mtlo %[const_2_power_13], $ac0 \n\t" | |
476 "mthi $zero, $ac0 \n\t" | |
477 "sub %[temp0], %[step2_14], %[step2_13] \n\t" | |
478 "sub %[temp0], %[temp0], %[step2_9] \n\t" | |
479 "add %[temp0], %[temp0], %[step2_10] \n\t" | |
480 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" | |
481 | |
482 "mtlo %[const_2_power_13], $ac1 \n\t" | |
483 "mthi $zero, $ac1 \n\t" | |
484 "sub %[temp1], %[step2_14], %[step2_13] \n\t" | |
485 "add %[temp1], %[temp1], %[step2_9] \n\t" | |
486 "sub %[temp1], %[temp1], %[step2_10] \n\t" | |
487 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" | |
488 | |
489 "mtlo %[const_2_power_13], $ac2 \n\t" | |
490 "mthi $zero, $ac2 \n\t" | |
491 "sub %[temp0], %[step2_15], %[step2_12] \n\t" | |
492 "sub %[temp0], %[temp0], %[step2_8] \n\t" | |
493 "add %[temp0], %[temp0], %[step2_11] \n\t" | |
494 "madd $ac2, %[temp0], %[cospi_16_64] \n\t" | |
495 | |
496 "mtlo %[const_2_power_13], $ac3 \n\t" | |
497 "mthi $zero, $ac3 \n\t" | |
498 "sub %[temp1], %[step2_15], %[step2_12] \n\t" | |
499 "add %[temp1], %[temp1], %[step2_8] \n\t" | |
500 "sub %[temp1], %[temp1], %[step2_11] \n\t" | |
501 "madd $ac3, %[temp1], %[cospi_16_64] \n\t" | |
502 | |
503 "add %[step3_8], %[step2_8], %[step2_11] \n\t" | |
504 "add %[step3_9], %[step2_9], %[step2_10] \n\t" | |
505 "add %[step3_14], %[step2_13], %[step2_14] \n\t" | |
506 "add %[step3_15], %[step2_12], %[step2_15] \n\t" | |
507 | |
508 "extp %[step3_10], $ac0, 31 \n\t" | |
509 "extp %[step3_13], $ac1, 31 \n\t" | |
510 "extp %[step3_11], $ac2, 31 \n\t" | |
511 "extp %[step3_12], $ac3, 31 \n\t" | |
512 | |
513 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
514 [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), | |
515 [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), | |
516 [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), | |
517 [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) | |
518 : [const_2_power_13] "r" (const_2_power_13), | |
519 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), | |
520 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), | |
521 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), | |
522 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), | |
523 [cospi_16_64] "r" (cospi_16_64) | |
524 ); | |
525 | |
526 step2_18 = step1_17 - step1_18; | |
527 step2_29 = step1_30 - step1_29; | |
528 | |
529 __asm__ __volatile__ ( | |
530 "mtlo %[const_2_power_13], $ac0 \n\t" | |
531 "mthi $zero, $ac0 \n\t" | |
532 "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" | |
533 "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" | |
534 "extp %[step3_18], $ac0, 31 \n\t" | |
535 | |
536 : [step3_18] "=r" (step3_18) | |
537 : [const_2_power_13] "r" (const_2_power_13), | |
538 [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), | |
539 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
540 ); | |
541 | |
542 temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; | |
543 step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
544 | |
545 step2_19 = step1_16 - step1_19; | |
546 step2_28 = step1_31 - step1_28; | |
547 | |
548 __asm__ __volatile__ ( | |
549 "mtlo %[const_2_power_13], $ac0 \n\t" | |
550 "mthi $zero, $ac0 \n\t" | |
551 "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" | |
552 "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" | |
553 "extp %[step3_19], $ac0, 31 \n\t" | |
554 | |
555 : [step3_19] "=r" (step3_19) | |
556 : [const_2_power_13] "r" (const_2_power_13), | |
557 [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), | |
558 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
559 ); | |
560 | |
561 temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; | |
562 step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
563 | |
564 step3_16 = step1_16 + step1_19; | |
565 step3_17 = step1_17 + step1_18; | |
566 step3_30 = step1_29 + step1_30; | |
567 step3_31 = step1_28 + step1_31; | |
568 | |
569 step2_20 = step1_23 - step1_20; | |
570 step2_27 = step1_24 - step1_27; | |
571 | |
572 __asm__ __volatile__ ( | |
573 "mtlo %[const_2_power_13], $ac0 \n\t" | |
574 "mthi $zero, $ac0 \n\t" | |
575 "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" | |
576 "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" | |
577 "extp %[step3_20], $ac0, 31 \n\t" | |
578 | |
579 : [step3_20] "=r" (step3_20) | |
580 : [const_2_power_13] "r" (const_2_power_13), | |
581 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), | |
582 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
583 ); | |
584 | |
585 temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; | |
586 step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
587 | |
588 step2_21 = step1_22 - step1_21; | |
589 step2_26 = step1_25 - step1_26; | |
590 | |
591 __asm__ __volatile__ ( | |
592 "mtlo %[const_2_power_13], $ac1 \n\t" | |
593 "mthi $zero, $ac1 \n\t" | |
594 "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" | |
595 "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" | |
596 "extp %[step3_21], $ac1, 31 \n\t" | |
597 | |
598 : [step3_21] "=r" (step3_21) | |
599 : [const_2_power_13] "r" (const_2_power_13), | |
600 [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), | |
601 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
602 ); | |
603 | |
604 temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; | |
605 step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
606 | |
607 step3_22 = step1_21 + step1_22; | |
608 step3_23 = step1_20 + step1_23; | |
609 step3_24 = step1_24 + step1_27; | |
610 step3_25 = step1_25 + step1_26; | |
611 | |
612 step2_16 = step3_16 + step3_23; | |
613 step2_17 = step3_17 + step3_22; | |
614 step2_18 = step3_18 + step3_21; | |
615 step2_19 = step3_19 + step3_20; | |
616 step2_20 = step3_19 - step3_20; | |
617 step2_21 = step3_18 - step3_21; | |
618 step2_22 = step3_17 - step3_22; | |
619 step2_23 = step3_16 - step3_23; | |
620 | |
621 step2_24 = step3_31 - step3_24; | |
622 step2_25 = step3_30 - step3_25; | |
623 step2_26 = step3_29 - step3_26; | |
624 step2_27 = step3_28 - step3_27; | |
625 step2_28 = step3_28 + step3_27; | |
626 step2_29 = step3_29 + step3_26; | |
627 step2_30 = step3_30 + step3_25; | |
628 step2_31 = step3_31 + step3_24; | |
629 | |
630 __asm__ __volatile__ ( | |
631 "lh %[load1], 0(%[input]) \n\t" | |
632 "lh %[load2], 32(%[input]) \n\t" | |
633 "lh %[load3], 16(%[input]) \n\t" | |
634 "lh %[load4], 48(%[input]) \n\t" | |
635 | |
636 "mtlo %[const_2_power_13], $ac1 \n\t" | |
637 "mthi $zero, $ac1 \n\t" | |
638 "mtlo %[const_2_power_13], $ac2 \n\t" | |
639 "mthi $zero, $ac2 \n\t" | |
640 "add %[result1], %[load1], %[load2] \n\t" | |
641 "sub %[result2], %[load1], %[load2] \n\t" | |
642 "madd $ac1, %[result1], %[cospi_16_64] \n\t" | |
643 "madd $ac2, %[result2], %[cospi_16_64] \n\t" | |
644 "extp %[temp0], $ac1, 31 \n\t" | |
645 "extp %[temp1], $ac2, 31 \n\t" | |
646 | |
647 "mtlo %[const_2_power_13], $ac3 \n\t" | |
648 "mthi $zero, $ac3 \n\t" | |
649 "madd $ac3, %[load3], %[cospi_24_64] \n\t" | |
650 "msub $ac3, %[load4], %[cospi_8_64] \n\t" | |
651 "extp %[temp2], $ac3, 31 \n\t" | |
652 | |
653 "mtlo %[const_2_power_13], $ac1 \n\t" | |
654 "mthi $zero, $ac1 \n\t" | |
655 "madd $ac1, %[load3], %[cospi_8_64] \n\t" | |
656 "madd $ac1, %[load4], %[cospi_24_64] \n\t" | |
657 "extp %[temp3], $ac1, 31 \n\t" | |
658 | |
659 "add %[step1_0], %[temp0], %[temp3] \n\t" | |
660 "add %[step1_1], %[temp1], %[temp2] \n\t" | |
661 "sub %[step1_2], %[temp1], %[temp2] \n\t" | |
662 "sub %[step1_3], %[temp0], %[temp3] \n\t" | |
663 | |
664 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
665 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
666 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
667 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
668 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
669 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), | |
670 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) | |
671 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
672 [cospi_16_64] "r" (cospi_16_64), | |
673 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
674 | |
675 ); | |
676 | |
677 __asm__ __volatile__ ( | |
678 "lh %[load1], 8(%[input]) \n\t" | |
679 "lh %[load2], 56(%[input]) \n\t" | |
680 "lh %[load3], 40(%[input]) \n\t" | |
681 "lh %[load4], 24(%[input]) \n\t" | |
682 | |
683 "mtlo %[const_2_power_13], $ac1 \n\t" | |
684 "mthi $zero, $ac1 \n\t" | |
685 "mtlo %[const_2_power_13], $ac3 \n\t" | |
686 "mthi $zero, $ac3 \n\t" | |
687 | |
688 "madd $ac1, %[load1], %[cospi_28_64] \n\t" | |
689 "msub $ac1, %[load2], %[cospi_4_64] \n\t" | |
690 "extp %[temp0], $ac1, 31 \n\t" | |
691 | |
692 "madd $ac3, %[load1], %[cospi_4_64] \n\t" | |
693 "madd $ac3, %[load2], %[cospi_28_64] \n\t" | |
694 "extp %[temp3], $ac3, 31 \n\t" | |
695 | |
696 "mtlo %[const_2_power_13], $ac1 \n\t" | |
697 "mthi $zero, $ac1 \n\t" | |
698 "mtlo %[const_2_power_13], $ac2 \n\t" | |
699 "mthi $zero, $ac2 \n\t" | |
700 | |
701 "madd $ac2, %[load3], %[cospi_12_64] \n\t" | |
702 "msub $ac2, %[load4], %[cospi_20_64] \n\t" | |
703 "extp %[temp1], $ac2, 31 \n\t" | |
704 | |
705 "madd $ac1, %[load3], %[cospi_20_64] \n\t" | |
706 "madd $ac1, %[load4], %[cospi_12_64] \n\t" | |
707 "extp %[temp2], $ac1, 31 \n\t" | |
708 | |
709 "mtlo %[const_2_power_13], $ac1 \n\t" | |
710 "mthi $zero, $ac1 \n\t" | |
711 "mtlo %[const_2_power_13], $ac3 \n\t" | |
712 "mthi $zero, $ac3 \n\t" | |
713 | |
714 "sub %[load1], %[temp3], %[temp2] \n\t" | |
715 "sub %[load1], %[load1], %[temp0] \n\t" | |
716 "add %[load1], %[load1], %[temp1] \n\t" | |
717 | |
718 "sub %[load2], %[temp0], %[temp1] \n\t" | |
719 "sub %[load2], %[load2], %[temp2] \n\t" | |
720 "add %[load2], %[load2], %[temp3] \n\t" | |
721 | |
722 "madd $ac1, %[load1], %[cospi_16_64] \n\t" | |
723 "madd $ac3, %[load2], %[cospi_16_64] \n\t" | |
724 | |
725 "extp %[step1_5], $ac1, 31 \n\t" | |
726 "extp %[step1_6], $ac3, 31 \n\t" | |
727 "add %[step1_4], %[temp0], %[temp1] \n\t" | |
728 "add %[step1_7], %[temp3], %[temp2] \n\t" | |
729 | |
730 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
731 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
732 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
733 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
734 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), | |
735 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) | |
736 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
737 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), | |
738 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), | |
739 [cospi_16_64] "r" (cospi_16_64) | |
740 ); | |
741 | |
742 step2_0 = step1_0 + step1_7; | |
743 step2_1 = step1_1 + step1_6; | |
744 step2_2 = step1_2 + step1_5; | |
745 step2_3 = step1_3 + step1_4; | |
746 step2_4 = step1_3 - step1_4; | |
747 step2_5 = step1_2 - step1_5; | |
748 step2_6 = step1_1 - step1_6; | |
749 step2_7 = step1_0 - step1_7; | |
750 | |
751 step1_0 = step2_0 + step3_15; | |
752 step1_1 = step2_1 + step3_14; | |
753 step1_2 = step2_2 + step3_13; | |
754 step1_3 = step2_3 + step3_12; | |
755 step1_4 = step2_4 + step3_11; | |
756 step1_5 = step2_5 + step3_10; | |
757 step1_6 = step2_6 + step3_9; | |
758 step1_7 = step2_7 + step3_8; | |
759 step1_8 = step2_7 - step3_8; | |
760 step1_9 = step2_6 - step3_9; | |
761 step1_10 = step2_5 - step3_10; | |
762 step1_11 = step2_4 - step3_11; | |
763 step1_12 = step2_3 - step3_12; | |
764 step1_13 = step2_2 - step3_13; | |
765 step1_14 = step2_1 - step3_14; | |
766 step1_15 = step2_0 - step3_15; | |
767 | |
768 __asm__ __volatile__ ( | |
769 "sub %[temp0], %[step2_27], %[step2_20] \n\t" | |
770 "mtlo %[const_2_power_13], $ac0 \n\t" | |
771 "mthi $zero, $ac0 \n\t" | |
772 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" | |
773 "extp %[step1_20], $ac0, 31 \n\t" | |
774 | |
775 : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) | |
776 : [const_2_power_13] "r" (const_2_power_13), | |
777 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), | |
778 [cospi_16_64] "r" (cospi_16_64) | |
779 ); | |
780 | |
781 temp21 = (step2_20 + step2_27) * cospi_16_64; | |
782 step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
783 | |
784 __asm__ __volatile__ ( | |
785 "sub %[temp0], %[step2_26], %[step2_21] \n\t" | |
786 "mtlo %[const_2_power_13], $ac0 \n\t" | |
787 "mthi $zero, $ac0 \n\t" | |
788 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" | |
789 "extp %[step1_21], $ac0, 31 \n\t" | |
790 | |
791 : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) | |
792 : [const_2_power_13] "r" (const_2_power_13), | |
793 [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), | |
794 [cospi_16_64] "r" (cospi_16_64) | |
795 ); | |
796 | |
797 temp21 = (step2_21 + step2_26) * cospi_16_64; | |
798 step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
799 | |
800 __asm__ __volatile__ ( | |
801 "sub %[temp0], %[step2_25], %[step2_22] \n\t" | |
802 "mtlo %[const_2_power_13], $ac0 \n\t" | |
803 "mthi $zero, $ac0 \n\t" | |
804 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" | |
805 "extp %[step1_22], $ac0, 31 \n\t" | |
806 | |
807 : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) | |
808 : [const_2_power_13] "r" (const_2_power_13), | |
809 [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), | |
810 [cospi_16_64] "r" (cospi_16_64) | |
811 ); | |
812 | |
813 temp21 = (step2_22 + step2_25) * cospi_16_64; | |
814 step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
815 | |
816 __asm__ __volatile__ ( | |
817 "sub %[temp0], %[step2_24], %[step2_23] \n\t" | |
818 "mtlo %[const_2_power_13], $ac0 \n\t" | |
819 "mthi $zero, $ac0 \n\t" | |
820 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" | |
821 "extp %[step1_23], $ac0, 31 \n\t" | |
822 | |
823 : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) | |
824 : [const_2_power_13] "r" (const_2_power_13), | |
825 [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), | |
826 [cospi_16_64] "r" (cospi_16_64) | |
827 ); | |
828 | |
829 temp21 = (step2_23 + step2_24) * cospi_16_64; | |
830 step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
831 | |
832 // final stage | |
833 output[0 * 32] = step1_0 + step2_31; | |
834 output[1 * 32] = step1_1 + step2_30; | |
835 output[2 * 32] = step1_2 + step2_29; | |
836 output[3 * 32] = step1_3 + step2_28; | |
837 output[4 * 32] = step1_4 + step1_27; | |
838 output[5 * 32] = step1_5 + step1_26; | |
839 output[6 * 32] = step1_6 + step1_25; | |
840 output[7 * 32] = step1_7 + step1_24; | |
841 output[8 * 32] = step1_8 + step1_23; | |
842 output[9 * 32] = step1_9 + step1_22; | |
843 output[10 * 32] = step1_10 + step1_21; | |
844 output[11 * 32] = step1_11 + step1_20; | |
845 output[12 * 32] = step1_12 + step2_19; | |
846 output[13 * 32] = step1_13 + step2_18; | |
847 output[14 * 32] = step1_14 + step2_17; | |
848 output[15 * 32] = step1_15 + step2_16; | |
849 output[16 * 32] = step1_15 - step2_16; | |
850 output[17 * 32] = step1_14 - step2_17; | |
851 output[18 * 32] = step1_13 - step2_18; | |
852 output[19 * 32] = step1_12 - step2_19; | |
853 output[20 * 32] = step1_11 - step1_20; | |
854 output[21 * 32] = step1_10 - step1_21; | |
855 output[22 * 32] = step1_9 - step1_22; | |
856 output[23 * 32] = step1_8 - step1_23; | |
857 output[24 * 32] = step1_7 - step1_24; | |
858 output[25 * 32] = step1_6 - step1_25; | |
859 output[26 * 32] = step1_5 - step1_26; | |
860 output[27 * 32] = step1_4 - step1_27; | |
861 output[28 * 32] = step1_3 - step2_28; | |
862 output[29 * 32] = step1_2 - step2_29; | |
863 output[30 * 32] = step1_1 - step2_30; | |
864 output[31 * 32] = step1_0 - step2_31; | |
865 | |
866 input += 32; | |
867 output += 1; | |
868 } | |
869 } | |
870 | |
871 void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, | |
872 int dest_stride) { | |
873 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); | |
874 int16_t *outptr = out; | |
875 uint32_t pos = 45; | |
876 | |
877 /* bit positon for extract from acc */ | |
878 __asm__ __volatile__ ( | |
879 "wrdsp %[pos], 1 \n\t" | |
880 : | |
881 : [pos] "r" (pos) | |
882 ); | |
883 | |
884 // Rows | |
885 idct32_rows_dspr2(input, outptr, 32); | |
886 | |
887 // Columns | |
888 vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride); | |
889 } | |
890 | |
891 void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, | |
892 int stride) { | |
893 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); | |
894 int16_t *outptr = out; | |
895 uint32_t i; | |
896 uint32_t pos = 45; | |
897 | |
898 /* bit positon for extract from acc */ | |
899 __asm__ __volatile__ ( | |
900 "wrdsp %[pos], 1 \n\t" | |
901 : | |
902 : [pos] "r" (pos) | |
903 ); | |
904 | |
905 // Rows | |
906 idct32_rows_dspr2(input, outptr, 8); | |
907 | |
908 outptr += 8; | |
909 __asm__ __volatile__ ( | |
910 "sw $zero, 0(%[outptr]) \n\t" | |
911 "sw $zero, 4(%[outptr]) \n\t" | |
912 "sw $zero, 8(%[outptr]) \n\t" | |
913 "sw $zero, 12(%[outptr]) \n\t" | |
914 "sw $zero, 16(%[outptr]) \n\t" | |
915 "sw $zero, 20(%[outptr]) \n\t" | |
916 "sw $zero, 24(%[outptr]) \n\t" | |
917 "sw $zero, 28(%[outptr]) \n\t" | |
918 "sw $zero, 32(%[outptr]) \n\t" | |
919 "sw $zero, 36(%[outptr]) \n\t" | |
920 "sw $zero, 40(%[outptr]) \n\t" | |
921 "sw $zero, 44(%[outptr]) \n\t" | |
922 | |
923 : | |
924 : [outptr] "r" (outptr) | |
925 ); | |
926 | |
927 for (i = 0; i < 31; ++i) { | |
928 outptr += 32; | |
929 | |
930 __asm__ __volatile__ ( | |
931 "sw $zero, 0(%[outptr]) \n\t" | |
932 "sw $zero, 4(%[outptr]) \n\t" | |
933 "sw $zero, 8(%[outptr]) \n\t" | |
934 "sw $zero, 12(%[outptr]) \n\t" | |
935 "sw $zero, 16(%[outptr]) \n\t" | |
936 "sw $zero, 20(%[outptr]) \n\t" | |
937 "sw $zero, 24(%[outptr]) \n\t" | |
938 "sw $zero, 28(%[outptr]) \n\t" | |
939 "sw $zero, 32(%[outptr]) \n\t" | |
940 "sw $zero, 36(%[outptr]) \n\t" | |
941 "sw $zero, 40(%[outptr]) \n\t" | |
942 "sw $zero, 44(%[outptr]) \n\t" | |
943 | |
944 : | |
945 : [outptr] "r" (outptr) | |
946 ); | |
947 } | |
948 | |
949 // Columns | |
950 vp9_idct32_cols_add_blk_dspr2(out, dest, stride); | |
951 } | |
952 | |
/*
 * 32x32 inverse transform for a block where only input[0] is used.
 *
 * A single value a1 is derived from input[0] and then added to (a1 >= 0) or
 * subtracted from (a1 < 0) every byte of a 32-byte-wide, 32-row region of
 * dest, using saturating unsigned quad-byte arithmetic.
 *
 * input  - coefficient pointer; only input[0] is read
 * dest   - destination bytes, updated in place; accessed with word
 *          loads/stores, so each row start must be 4-byte aligned
 * stride - byte distance between consecutive rows of dest
 */
void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int stride) {
  int r, out;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"

      :
      : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  // a1 = (out + 32) >> 6, i.e. out divided by 64 with rounding.
  __asm__ __volatile__ (
      "addi     %[out],    %[out],    32      \n\t"
      "sra      %[a1],     %[out],    6       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  if (a1 < 0) {
    // a1 is the result of a right shift by 6, so negating it cannot overflow.
    absa1 = -a1;

    // replv.qb replicates only the low 8 bits of its source. Without the
    // clamp a magnitude of 256 would become 0 and leave dest unchanged.
    // Saturating subtraction of 255 drives every byte to 0, which is the
    // same result any larger magnitude must produce.
    if (absa1 > 255) absa1 = 255;

    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1], %[absa1]      \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [absa1] "r" (absa1)
    );

    for (r = 32; r--;) {
      __asm__ __volatile__ (
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"

          "lw             %[t1],          16(%[dest])                     \n\t"
          "lw             %[t2],          20(%[dest])                     \n\t"
          "lw             %[t3],          24(%[dest])                     \n\t"
          "lw             %[t4],          28(%[dest])                     \n\t"
          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    16(%[dest])                     \n\t"
          "sw             %[vector_2],    20(%[dest])                     \n\t"
          "sw             %[vector_3],    24(%[dest])                     \n\t"
          "sw             %[vector_4],    28(%[dest])                     \n\t"

          "add            %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
            [dest] "+&r" (dest)
          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
          // dest[] is read and written inside the asm.
          : "memory"
      );
    }
  } else {
    // Same low-byte truncation hazard as above: saturating addition of 255
    // drives every byte to 255, matching what any larger a1 must produce.
    if (a1 > 255) a1 = 255;

    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb       %[vector_a1],   %[a1]     \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    for (r = 32; r--;) {
      __asm__ __volatile__ (
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"

          "lw             %[t1],          16(%[dest])                     \n\t"
          "lw             %[t2],          20(%[dest])                     \n\t"
          "lw             %[t3],          24(%[dest])                     \n\t"
          "lw             %[t4],          28(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    16(%[dest])                     \n\t"
          "sw             %[vector_2],    20(%[dest])                     \n\t"
          "sw             %[vector_3],    24(%[dest])                     \n\t"
          "sw             %[vector_4],    28(%[dest])                     \n\t"

          "add            %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
            [dest] "+&r" (dest)
          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
          // dest[] is read and written inside the asm.
          : "memory"
      );
    }
  }
}
1076 #endif // #if HAVE_DSPR2 | |
OLD | NEW |