OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 #include <stdio.h> | 12 #include <stdio.h> |
13 | 13 |
14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" |
16 #include "vp9/common/vp9_common.h" | 16 #include "vp9/common/vp9_common.h" |
17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
18 #include "vp9/common/vp9_idct.h" | 18 #include "vp9/common/vp9_idct.h" |
19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | 19 #include "vpx_dsp/mips/inv_txfm_dspr2.h" |
20 #include "vpx_dsp/txfm_common.h" | 20 #include "vpx_dsp/txfm_common.h" |
21 #include "vpx_ports/mem.h" | 21 #include "vpx_ports/mem.h" |
22 | 22 |
23 #if HAVE_DSPR2 | 23 #if HAVE_DSPR2 |
/* Apply a 1-D 16-point inverse DCT to each of `no_rows` rows of `input`
 * (16 int16 coefficients per row, rows contiguous) using MIPS DSPR2
 * accumulator instructions.
 *
 * Results are written TRANSPOSED: each input row becomes one column of the
 * output (stores use a 32-byte / 16-element stride and `output` advances by
 * one element per row), so a second pass over `output` operates on columns.
 *
 * Fixed-point scheme: each accumulator is pre-loaded with 2^13 via
 * mtlo/mthi, products are accumulated with madd/msub, and `extp ..., 31`
 * extracts the rounded result (the usual DCT_CONST_ROUND >> DCT_CONST_BITS
 * step done in one instruction).  NOTE(review): exact extp bit-extraction
 * semantics per the MIPS DSP ASE spec — confirm against that manual, not
 * inferred here. */
static void idct16_rows_dspr2(const int16_t *input, int16_t *output,
                              uint32_t no_rows) {
  int i;
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  int step1_10, step1_11, step1_12, step1_13;
  int step2_0, step2_1, step2_2, step2_3;
  int step2_8, step2_9, step2_10, step2_11;
  int step2_12, step2_13, step2_14, step2_15;
  int load1, load2, load3, load4, load5, load6, load7, load8;
  int result1, result2, result3, result4;
  const int const_2_power_13 = 8192;  /* rounding bias, 2^(DCT_CONST_BITS-1) */

  for (i = no_rows; i--; ) {
    /* prefetch row */
    prefetch_load((const uint8_t *)(input + 16));

    /* Even half, stage 1: coefficients input[0], input[8], input[4],
     * input[12] (byte offsets 0/16/8/24) -> step1_0..step1_3. */
    __asm__ __volatile__ (
      "lh %[load1], 0(%[input]) \n\t"
      "lh %[load2], 16(%[input]) \n\t"
      "lh %[load3], 8(%[input]) \n\t"
      "lh %[load4], 24(%[input]) \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac2 \n\t"
      "mthi $zero, $ac2 \n\t"
      /* DC/Nyquist butterfly: (c0 +/- c8) * cospi_16_64 */
      "add %[result1], %[load1], %[load2] \n\t"
      "sub %[result2], %[load1], %[load2] \n\t"
      "madd $ac1, %[result1], %[cospi_16_64] \n\t"
      "madd $ac2, %[result2], %[cospi_16_64] \n\t"
      "extp %[step2_0], $ac1, 31 \n\t"
      "extp %[step2_1], $ac2, 31 \n\t"

      /* c4 * cospi_24 - c12 * cospi_8 */
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"
      "madd $ac3, %[load3], %[cospi_24_64] \n\t"
      "msub $ac3, %[load4], %[cospi_8_64] \n\t"
      "extp %[step2_2], $ac3, 31 \n\t"

      /* c4 * cospi_8 + c12 * cospi_24 */
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "madd $ac1, %[load3], %[cospi_8_64] \n\t"
      "madd $ac1, %[load4], %[cospi_24_64] \n\t"
      "extp %[step2_3], $ac1, 31 \n\t"

      /* stage-2 butterfly of the four even intermediates */
      "add %[step1_0], %[step2_0], %[step2_3] \n\t"
      "add %[step1_1], %[step2_1], %[step2_2] \n\t"
      "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
      "sub %[step1_3], %[step2_0], %[step2_3] \n\t"

      : [load1] "=&r" (load1), [load2] "=&r" (load2),
        [load3] "=&r" (load3), [load4] "=&r" (load4),
        [result1] "=&r" (result1), [result2] "=&r" (result2),
        [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
        [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
        [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
        [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
      : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
        [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
        [cospi_16_64] "r" (cospi_16_64)
    );

    /* Odd half, part A: input[1], input[15], input[9], input[7]
     * (byte offsets 2/30/18/14) -> step2_8, step2_9, step2_14, step2_15. */
    __asm__ __volatile__ (
      "lh %[load5], 2(%[input]) \n\t"
      "lh %[load6], 30(%[input]) \n\t"
      "lh %[load7], 18(%[input]) \n\t"
      "lh %[load8], 14(%[input]) \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      /* c1 * cospi_30 - c15 * cospi_2 */
      "madd $ac1, %[load5], %[cospi_30_64] \n\t"
      "msub $ac1, %[load6], %[cospi_2_64] \n\t"
      "extp %[result1], $ac1, 31 \n\t"

      /* c9 * cospi_14 - c7 * cospi_18 */
      "madd $ac3, %[load7], %[cospi_14_64] \n\t"
      "msub $ac3, %[load8], %[cospi_18_64] \n\t"
      "extp %[result2], $ac3, 31 \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac2 \n\t"
      "mthi $zero, $ac2 \n\t"

      /* c9 * cospi_18 + c7 * cospi_14 */
      "madd $ac1, %[load7], %[cospi_18_64] \n\t"
      "madd $ac1, %[load8], %[cospi_14_64] \n\t"
      "extp %[result3], $ac1, 31 \n\t"

      /* c1 * cospi_2 + c15 * cospi_30 */
      "madd $ac2, %[load5], %[cospi_2_64] \n\t"
      "madd $ac2, %[load6], %[cospi_30_64] \n\t"
      "extp %[result4], $ac2, 31 \n\t"

      /* differences feed the cospi_24/cospi_8 rotation below */
      "sub %[load5], %[result1], %[result2] \n\t"
      "sub %[load6], %[result4], %[result3] \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      "madd $ac1, %[load6], %[cospi_24_64] \n\t"
      "msub $ac1, %[load5], %[cospi_8_64] \n\t"
      "madd $ac3, %[load5], %[cospi_24_64] \n\t"
      "madd $ac3, %[load6], %[cospi_8_64] \n\t"

      "extp %[step2_9], $ac1, 31 \n\t"
      "extp %[step2_14], $ac3, 31 \n\t"
      "add %[step2_8], %[result1], %[result2] \n\t"
      "add %[step2_15], %[result4], %[result3] \n\t"

      : [load5] "=&r" (load5), [load6] "=&r" (load6),
        [load7] "=&r" (load7), [load8] "=&r" (load8),
        [result1] "=&r" (result1), [result2] "=&r" (result2),
        [result3] "=&r" (result3), [result4] "=&r" (result4),
        [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
        [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
      : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
        [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
        [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
        [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );

    /* Odd half, part B: input[5], input[11], input[13], input[3]
     * (byte offsets 10/22/26/6) -> step2_10..step2_13. */
    __asm__ __volatile__ (
      "lh %[load1], 10(%[input]) \n\t"
      "lh %[load2], 22(%[input]) \n\t"
      "lh %[load3], 26(%[input]) \n\t"
      "lh %[load4], 6(%[input]) \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      /* c5 * cospi_22 - c11 * cospi_10 */
      "madd $ac1, %[load1], %[cospi_22_64] \n\t"
      "msub $ac1, %[load2], %[cospi_10_64] \n\t"
      "extp %[result1], $ac1, 31 \n\t"

      /* c13 * cospi_6 - c3 * cospi_26 */
      "madd $ac3, %[load3], %[cospi_6_64] \n\t"
      "msub $ac3, %[load4], %[cospi_26_64] \n\t"
      "extp %[result2], $ac3, 31 \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac2 \n\t"
      "mthi $zero, $ac2 \n\t"

      /* c5 * cospi_10 + c11 * cospi_22 */
      "madd $ac1, %[load1], %[cospi_10_64] \n\t"
      "madd $ac1, %[load2], %[cospi_22_64] \n\t"
      "extp %[result3], $ac1, 31 \n\t"

      /* c13 * cospi_26 + c3 * cospi_6 */
      "madd $ac2, %[load3], %[cospi_26_64] \n\t"
      "madd $ac2, %[load4], %[cospi_6_64] \n\t"
      "extp %[result4], $ac2, 31 \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      /* note the operand order differs from part A (result2 - result1):
       * this branch rotates by -cospi_24/-cospi_8 via msub */
      "sub %[load1], %[result2], %[result1] \n\t"
      "sub %[load2], %[result4], %[result3] \n\t"

      "msub $ac1, %[load1], %[cospi_24_64] \n\t"
      "msub $ac1, %[load2], %[cospi_8_64] \n\t"
      "madd $ac3, %[load2], %[cospi_24_64] \n\t"
      "msub $ac3, %[load1], %[cospi_8_64] \n\t"

      "extp %[step2_10], $ac1, 31 \n\t"
      "extp %[step2_13], $ac3, 31 \n\t"
      "add %[step2_11], %[result1], %[result2] \n\t"
      "add %[step2_12], %[result4], %[result3] \n\t"

      : [load1] "=&r" (load1), [load2] "=&r" (load2),
        [load3] "=&r" (load3), [load4] "=&r" (load4),
        [result1] "=&r" (result1), [result2] "=&r" (result2),
        [result3] "=&r" (result3), [result4] "=&r" (result4),
        [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
        [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
      : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
        [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
        [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
        [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );

    /* Even half, stage 2: input[2], input[14], input[10], input[6]
     * (byte offsets 4/28/20/12) -> step1_4..step1_7; the middle pair
     * (step1_5/step1_6) gets the final cospi_16 rotation. */
    __asm__ __volatile__ (
      "lh %[load5], 4(%[input]) \n\t"
      "lh %[load6], 28(%[input]) \n\t"
      "lh %[load7], 20(%[input]) \n\t"
      "lh %[load8], 12(%[input]) \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      /* c2 * cospi_28 - c14 * cospi_4 */
      "madd $ac1, %[load5], %[cospi_28_64] \n\t"
      "msub $ac1, %[load6], %[cospi_4_64] \n\t"
      "extp %[result1], $ac1, 31 \n\t"

      /* c10 * cospi_12 - c6 * cospi_20 */
      "madd $ac3, %[load7], %[cospi_12_64] \n\t"
      "msub $ac3, %[load8], %[cospi_20_64] \n\t"
      "extp %[result2], $ac3, 31 \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac2 \n\t"
      "mthi $zero, $ac2 \n\t"

      /* c10 * cospi_20 + c6 * cospi_12 */
      "madd $ac1, %[load7], %[cospi_20_64] \n\t"
      "madd $ac1, %[load8], %[cospi_12_64] \n\t"
      "extp %[result3], $ac1, 31 \n\t"

      /* c2 * cospi_4 + c14 * cospi_28 */
      "madd $ac2, %[load5], %[cospi_4_64] \n\t"
      "madd $ac2, %[load6], %[cospi_28_64] \n\t"
      "extp %[result4], $ac2, 31 \n\t"

      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      /* (r4 - r3 - r1 + r2) and (r1 - r2 - r3 + r4), each * cospi_16 */
      "sub %[load5], %[result4], %[result3] \n\t"
      "sub %[load5], %[load5], %[result1] \n\t"
      "add %[load5], %[load5], %[result2] \n\t"

      "sub %[load6], %[result1], %[result2] \n\t"
      "sub %[load6], %[load6], %[result3] \n\t"
      "add %[load6], %[load6], %[result4] \n\t"

      "madd $ac1, %[load5], %[cospi_16_64] \n\t"
      "madd $ac3, %[load6], %[cospi_16_64] \n\t"

      "extp %[step1_5], $ac1, 31 \n\t"
      "extp %[step1_6], $ac3, 31 \n\t"
      "add %[step1_4], %[result1], %[result2] \n\t"
      "add %[step1_7], %[result4], %[result3] \n\t"

      : [load5] "=&r" (load5), [load6] "=&r" (load6),
        [load7] "=&r" (load7), [load8] "=&r" (load8),
        [result1] "=&r" (result1), [result2] "=&r" (result2),
        [result3] "=&r" (result3), [result4] "=&r" (result4),
        [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
        [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
      : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
        [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
        [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
        [cospi_16_64] "r" (cospi_16_64)
    );

    /* Odd half, stage 3: cross-combine step2_8..step2_15 with cospi_16
     * to form the middle odd outputs step1_10..step1_13 (uses all four
     * accumulators $ac0..$ac3 in parallel). */
    __asm__ __volatile__ (
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"

      "sub %[load5], %[step2_14], %[step2_13] \n\t"
      "sub %[load5], %[load5], %[step2_9] \n\t"
      "add %[load5], %[load5], %[step2_10] \n\t"

      "madd $ac0, %[load5], %[cospi_16_64] \n\t"

      "sub %[load6], %[step2_14], %[step2_13] \n\t"
      "sub %[load6], %[load6], %[step2_10] \n\t"
      "add %[load6], %[load6], %[step2_9] \n\t"

      "madd $ac1, %[load6], %[cospi_16_64] \n\t"

      "mtlo %[const_2_power_13], $ac2 \n\t"
      "mthi $zero, $ac2 \n\t"
      "mtlo %[const_2_power_13], $ac3 \n\t"
      "mthi $zero, $ac3 \n\t"

      "sub %[load5], %[step2_15], %[step2_12] \n\t"
      "sub %[load5], %[load5], %[step2_8] \n\t"
      "add %[load5], %[load5], %[step2_11] \n\t"

      "madd $ac2, %[load5], %[cospi_16_64] \n\t"

      "sub %[load6], %[step2_15], %[step2_12] \n\t"
      "sub %[load6], %[load6], %[step2_11] \n\t"
      "add %[load6], %[load6], %[step2_8] \n\t"

      "madd $ac3, %[load6], %[cospi_16_64] \n\t"

      "extp %[step1_10], $ac0, 31 \n\t"
      "extp %[step1_13], $ac1, 31 \n\t"
      "extp %[step1_11], $ac2, 31 \n\t"
      "extp %[step1_12], $ac3, 31 \n\t"

      : [load5] "=&r" (load5), [load6] "=&r" (load6),
        [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
        [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
      : [const_2_power_13] "r" (const_2_power_13),
        [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
        [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
        [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
        [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
        [cospi_16_64] "r" (cospi_16_64)
    );

    /* Final butterfly + store, outer rows: writes output elements
     * 0,16,96,112,128,144,224,240 (byte offsets 0..480, stride 32 bytes),
     * i.e. transposed rows 0,1,6,7,8,9,14,15. */
    __asm__ __volatile__ (
      "add %[load5], %[step1_0], %[step1_7] \n\t"
      "add %[load5], %[load5], %[step2_12] \n\t"
      "add %[load5], %[load5], %[step2_15] \n\t"
      "add %[load6], %[step1_1], %[step1_6] \n\t"
      "add %[load6], %[load6], %[step2_13] \n\t"
      "add %[load6], %[load6], %[step2_14] \n\t"
      "sh %[load5], 0(%[output]) \n\t"
      "sh %[load6], 32(%[output]) \n\t"
      "sub %[load5], %[step1_1], %[step1_6] \n\t"
      "add %[load5], %[load5], %[step2_9] \n\t"
      "add %[load5], %[load5], %[step2_10] \n\t"
      "sub %[load6], %[step1_0], %[step1_7] \n\t"
      "add %[load6], %[load6], %[step2_8] \n\t"
      "add %[load6], %[load6], %[step2_11] \n\t"
      "sh %[load5], 192(%[output]) \n\t"
      "sh %[load6], 224(%[output]) \n\t"
      "sub %[load5], %[step1_0], %[step1_7] \n\t"
      "sub %[load5], %[load5], %[step2_8] \n\t"
      "sub %[load5], %[load5], %[step2_11] \n\t"
      "sub %[load6], %[step1_1], %[step1_6] \n\t"
      "sub %[load6], %[load6], %[step2_9] \n\t"
      "sub %[load6], %[load6], %[step2_10] \n\t"
      "sh %[load5], 256(%[output]) \n\t"
      "sh %[load6], 288(%[output]) \n\t"
      "add %[load5], %[step1_1], %[step1_6] \n\t"
      "sub %[load5], %[load5], %[step2_13] \n\t"
      "sub %[load5], %[load5], %[step2_14] \n\t"
      "add %[load6], %[step1_0], %[step1_7] \n\t"
      "sub %[load6], %[load6], %[step2_12] \n\t"
      "sub %[load6], %[load6], %[step2_15] \n\t"
      "sh %[load5], 448(%[output]) \n\t"
      "sh %[load6], 480(%[output]) \n\t"

      : [load5] "=&r" (load5), [load6] "=&r" (load6)
      : [output] "r" (output),
        [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
        [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
        [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
        [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
        [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
        [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
    );

    /* Final butterfly + store, inner rows: transposed rows
     * 2,3,4,5,10,11,12,13 (byte offsets 64..416). */
    __asm__ __volatile__ (
      "add %[load5], %[step1_2], %[step1_5] \n\t"
      "add %[load5], %[load5], %[step1_13] \n\t"
      "add %[load6], %[step1_3], %[step1_4] \n\t"
      "add %[load6], %[load6], %[step1_12] \n\t"
      "sh %[load5], 64(%[output]) \n\t"
      "sh %[load6], 96(%[output]) \n\t"
      "sub %[load5], %[step1_3], %[step1_4] \n\t"
      "add %[load5], %[load5], %[step1_11] \n\t"
      "sub %[load6], %[step1_2], %[step1_5] \n\t"
      "add %[load6], %[load6], %[step1_10] \n\t"
      "sh %[load5], 128(%[output]) \n\t"
      "sh %[load6], 160(%[output]) \n\t"
      "sub %[load5], %[step1_2], %[step1_5] \n\t"
      "sub %[load5], %[load5], %[step1_10] \n\t"
      "sub %[load6], %[step1_3], %[step1_4] \n\t"
      "sub %[load6], %[load6], %[step1_11] \n\t"
      "sh %[load5], 320(%[output]) \n\t"
      "sh %[load6], 352(%[output]) \n\t"
      "add %[load5], %[step1_3], %[step1_4] \n\t"
      "sub %[load5], %[load5], %[step1_12] \n\t"
      "add %[load6], %[step1_2], %[step1_5] \n\t"
      "sub %[load6], %[load6], %[step1_13] \n\t"
      "sh %[load5], 384(%[output]) \n\t"
      "sh %[load6], 416(%[output]) \n\t"

      : [load5] "=&r" (load5), [load6] "=&r" (load6)
      : [output] "r" (output),
        [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
        [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
        [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
        [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
    );

    input += 16;   /* next input row */
    output += 1;   /* next output column (transposed layout) */
  }
}
408 | |
409 static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, | |
410 int dest_stride) { | |
411 int i; | |
412 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | |
413 int step1_8, step1_9, step1_10, step1_11; | |
414 int step1_12, step1_13, step1_14, step1_15; | |
415 int step2_0, step2_1, step2_2, step2_3; | |
416 int step2_8, step2_9, step2_10, step2_11; | |
417 int step2_12, step2_13, step2_14, step2_15; | |
418 int load1, load2, load3, load4, load5, load6, load7, load8; | |
419 int result1, result2, result3, result4; | |
420 const int const_2_power_13 = 8192; | |
421 uint8_t *dest_pix; | |
422 uint8_t *cm = vpx_ff_cropTbl; | |
423 | |
424 /* prefetch vpx_ff_cropTbl */ | |
425 prefetch_load(vpx_ff_cropTbl); | |
426 prefetch_load(vpx_ff_cropTbl + 32); | |
427 prefetch_load(vpx_ff_cropTbl + 64); | |
428 prefetch_load(vpx_ff_cropTbl + 96); | |
429 prefetch_load(vpx_ff_cropTbl + 128); | |
430 prefetch_load(vpx_ff_cropTbl + 160); | |
431 prefetch_load(vpx_ff_cropTbl + 192); | |
432 prefetch_load(vpx_ff_cropTbl + 224); | |
433 | |
434 for (i = 0; i < 16; ++i) { | |
435 dest_pix = (dest + i); | |
436 __asm__ __volatile__ ( | |
437 "lh %[load1], 0(%[input]) \n\t" | |
438 "lh %[load2], 16(%[input]) \n\t" | |
439 "lh %[load3], 8(%[input]) \n\t" | |
440 "lh %[load4], 24(%[input]) \n\t" | |
441 | |
442 "mtlo %[const_2_power_13], $ac1 \n\t" | |
443 "mthi $zero, $ac1 \n\t" | |
444 "mtlo %[const_2_power_13], $ac2 \n\t" | |
445 "mthi $zero, $ac2 \n\t" | |
446 "add %[result1], %[load1], %[load2] \n\t" | |
447 "sub %[result2], %[load1], %[load2] \n\t" | |
448 "madd $ac1, %[result1], %[cospi_16_64] \n\t" | |
449 "madd $ac2, %[result2], %[cospi_16_64] \n\t" | |
450 "extp %[step2_0], $ac1, 31 \n\t" | |
451 "extp %[step2_1], $ac2, 31 \n\t" | |
452 | |
453 "mtlo %[const_2_power_13], $ac3 \n\t" | |
454 "mthi $zero, $ac3 \n\t" | |
455 "madd $ac3, %[load3], %[cospi_24_64] \n\t" | |
456 "msub $ac3, %[load4], %[cospi_8_64] \n\t" | |
457 "extp %[step2_2], $ac3, 31 \n\t" | |
458 | |
459 "mtlo %[const_2_power_13], $ac1 \n\t" | |
460 "mthi $zero, $ac1 \n\t" | |
461 "madd $ac1, %[load3], %[cospi_8_64] \n\t" | |
462 "madd $ac1, %[load4], %[cospi_24_64] \n\t" | |
463 "extp %[step2_3], $ac1, 31 \n\t" | |
464 | |
465 "add %[step1_0], %[step2_0], %[step2_3] \n\t" | |
466 "add %[step1_1], %[step2_1], %[step2_2] \n\t" | |
467 "sub %[step1_2], %[step2_1], %[step2_2] \n\t" | |
468 "sub %[step1_3], %[step2_0], %[step2_3] \n\t" | |
469 | |
470 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
471 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
472 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
473 [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), | |
474 [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), | |
475 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), | |
476 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) | |
477 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
478 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), | |
479 [cospi_16_64] "r" (cospi_16_64) | |
480 ); | |
481 | |
482 __asm__ __volatile__ ( | |
483 "lh %[load5], 2(%[input]) \n\t" | |
484 "lh %[load6], 30(%[input]) \n\t" | |
485 "lh %[load7], 18(%[input]) \n\t" | |
486 "lh %[load8], 14(%[input]) \n\t" | |
487 | |
488 "mtlo %[const_2_power_13], $ac1 \n\t" | |
489 "mthi $zero, $ac1 \n\t" | |
490 "mtlo %[const_2_power_13], $ac3 \n\t" | |
491 "mthi $zero, $ac3 \n\t" | |
492 | |
493 "madd $ac1, %[load5], %[cospi_30_64] \n\t" | |
494 "msub $ac1, %[load6], %[cospi_2_64] \n\t" | |
495 "extp %[result1], $ac1, 31 \n\t" | |
496 | |
497 "madd $ac3, %[load7], %[cospi_14_64] \n\t" | |
498 "msub $ac3, %[load8], %[cospi_18_64] \n\t" | |
499 "extp %[result2], $ac3, 31 \n\t" | |
500 | |
501 "mtlo %[const_2_power_13], $ac1 \n\t" | |
502 "mthi $zero, $ac1 \n\t" | |
503 "mtlo %[const_2_power_13], $ac2 \n\t" | |
504 "mthi $zero, $ac2 \n\t" | |
505 | |
506 "madd $ac1, %[load7], %[cospi_18_64] \n\t" | |
507 "madd $ac1, %[load8], %[cospi_14_64] \n\t" | |
508 "extp %[result3], $ac1, 31 \n\t" | |
509 | |
510 "madd $ac2, %[load5], %[cospi_2_64] \n\t" | |
511 "madd $ac2, %[load6], %[cospi_30_64] \n\t" | |
512 "extp %[result4], $ac2, 31 \n\t" | |
513 | |
514 "sub %[load5], %[result1], %[result2] \n\t" | |
515 "sub %[load6], %[result4], %[result3] \n\t" | |
516 | |
517 "mtlo %[const_2_power_13], $ac1 \n\t" | |
518 "mthi $zero, $ac1 \n\t" | |
519 "mtlo %[const_2_power_13], $ac3 \n\t" | |
520 "mthi $zero, $ac3 \n\t" | |
521 | |
522 "madd $ac1, %[load6], %[cospi_24_64] \n\t" | |
523 "msub $ac1, %[load5], %[cospi_8_64] \n\t" | |
524 "madd $ac3, %[load5], %[cospi_24_64] \n\t" | |
525 "madd $ac3, %[load6], %[cospi_8_64] \n\t" | |
526 | |
527 "extp %[step2_9], $ac1, 31 \n\t" | |
528 "extp %[step2_14], $ac3, 31 \n\t" | |
529 "add %[step2_8], %[result1], %[result2] \n\t" | |
530 "add %[step2_15], %[result4], %[result3] \n\t" | |
531 | |
532 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
533 [load7] "=&r" (load7), [load8] "=&r" (load8), | |
534 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
535 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
536 [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), | |
537 [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) | |
538 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
539 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), | |
540 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), | |
541 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
542 ); | |
543 | |
544 __asm__ __volatile__ ( | |
545 "lh %[load1], 10(%[input]) \n\t" | |
546 "lh %[load2], 22(%[input]) \n\t" | |
547 "lh %[load3], 26(%[input]) \n\t" | |
548 "lh %[load4], 6(%[input]) \n\t" | |
549 | |
550 "mtlo %[const_2_power_13], $ac1 \n\t" | |
551 "mthi $zero, $ac1 \n\t" | |
552 "mtlo %[const_2_power_13], $ac3 \n\t" | |
553 "mthi $zero, $ac3 \n\t" | |
554 | |
555 "madd $ac1, %[load1], %[cospi_22_64] \n\t" | |
556 "msub $ac1, %[load2], %[cospi_10_64] \n\t" | |
557 "extp %[result1], $ac1, 31 \n\t" | |
558 | |
559 "madd $ac3, %[load3], %[cospi_6_64] \n\t" | |
560 "msub $ac3, %[load4], %[cospi_26_64] \n\t" | |
561 "extp %[result2], $ac3, 31 \n\t" | |
562 | |
563 "mtlo %[const_2_power_13], $ac1 \n\t" | |
564 "mthi $zero, $ac1 \n\t" | |
565 "mtlo %[const_2_power_13], $ac2 \n\t" | |
566 "mthi $zero, $ac2 \n\t" | |
567 | |
568 "madd $ac1, %[load1], %[cospi_10_64] \n\t" | |
569 "madd $ac1, %[load2], %[cospi_22_64] \n\t" | |
570 "extp %[result3], $ac1, 31 \n\t" | |
571 | |
572 "madd $ac2, %[load3], %[cospi_26_64] \n\t" | |
573 "madd $ac2, %[load4], %[cospi_6_64] \n\t" | |
574 "extp %[result4], $ac2, 31 \n\t" | |
575 | |
576 "mtlo %[const_2_power_13], $ac1 \n\t" | |
577 "mthi $zero, $ac1 \n\t" | |
578 "mtlo %[const_2_power_13], $ac3 \n\t" | |
579 "mthi $zero, $ac3 \n\t" | |
580 | |
581 "sub %[load1], %[result2], %[result1] \n\t" | |
582 "sub %[load2], %[result4], %[result3] \n\t" | |
583 | |
584 "msub $ac1, %[load1], %[cospi_24_64] \n\t" | |
585 "msub $ac1, %[load2], %[cospi_8_64] \n\t" | |
586 "madd $ac3, %[load2], %[cospi_24_64] \n\t" | |
587 "msub $ac3, %[load1], %[cospi_8_64] \n\t" | |
588 | |
589 "extp %[step2_10], $ac1, 31 \n\t" | |
590 "extp %[step2_13], $ac3, 31 \n\t" | |
591 "add %[step2_11], %[result1], %[result2] \n\t" | |
592 "add %[step2_12], %[result4], %[result3] \n\t" | |
593 | |
594 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
595 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
596 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
597 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
598 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), | |
599 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) | |
600 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
601 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), | |
602 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), | |
603 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
604 ); | |
605 | |
606 __asm__ __volatile__ ( | |
607 "lh %[load5], 4(%[input]) \n\t" | |
608 "lh %[load6], 28(%[input]) \n\t" | |
609 "lh %[load7], 20(%[input]) \n\t" | |
610 "lh %[load8], 12(%[input]) \n\t" | |
611 | |
612 "mtlo %[const_2_power_13], $ac1 \n\t" | |
613 "mthi $zero, $ac1 \n\t" | |
614 "mtlo %[const_2_power_13], $ac3 \n\t" | |
615 "mthi $zero, $ac3 \n\t" | |
616 | |
617 "madd $ac1, %[load5], %[cospi_28_64] \n\t" | |
618 "msub $ac1, %[load6], %[cospi_4_64] \n\t" | |
619 "extp %[result1], $ac1, 31 \n\t" | |
620 | |
621 "madd $ac3, %[load7], %[cospi_12_64] \n\t" | |
622 "msub $ac3, %[load8], %[cospi_20_64] \n\t" | |
623 "extp %[result2], $ac3, 31 \n\t" | |
624 | |
625 "mtlo %[const_2_power_13], $ac1 \n\t" | |
626 "mthi $zero, $ac1 \n\t" | |
627 "mtlo %[const_2_power_13], $ac2 \n\t" | |
628 "mthi $zero, $ac2 \n\t" | |
629 | |
630 "madd $ac1, %[load7], %[cospi_20_64] \n\t" | |
631 "madd $ac1, %[load8], %[cospi_12_64] \n\t" | |
632 "extp %[result3], $ac1, 31 \n\t" | |
633 | |
634 "madd $ac2, %[load5], %[cospi_4_64] \n\t" | |
635 "madd $ac2, %[load6], %[cospi_28_64] \n\t" | |
636 "extp %[result4], $ac2, 31 \n\t" | |
637 | |
638 "mtlo %[const_2_power_13], $ac1 \n\t" | |
639 "mthi $zero, $ac1 \n\t" | |
640 "mtlo %[const_2_power_13], $ac3 \n\t" | |
641 "mthi $zero, $ac3 \n\t" | |
642 | |
643 "sub %[load5], %[result4], %[result3] \n\t" | |
644 "sub %[load5], %[load5], %[result1] \n\t" | |
645 "add %[load5], %[load5], %[result2] \n\t" | |
646 | |
647 "sub %[load6], %[result1], %[result2] \n\t" | |
648 "sub %[load6], %[load6], %[result3] \n\t" | |
649 "add %[load6], %[load6], %[result4] \n\t" | |
650 | |
651 "madd $ac1, %[load5], %[cospi_16_64] \n\t" | |
652 "madd $ac3, %[load6], %[cospi_16_64] \n\t" | |
653 | |
654 "extp %[step1_5], $ac1, 31 \n\t" | |
655 "extp %[step1_6], $ac3, 31 \n\t" | |
656 | |
657 "add %[step1_4], %[result1], %[result2] \n\t" | |
658 "add %[step1_7], %[result4], %[result3] \n\t" | |
659 | |
660 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
661 [load7] "=&r" (load7), [load8] "=&r" (load8), | |
662 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
663 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
664 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), | |
665 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) | |
666 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
667 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), | |
668 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), | |
669 [cospi_16_64] "r" (cospi_16_64) | |
670 ); | |
671 | |
672 __asm__ __volatile__ ( | |
673 "mtlo %[const_2_power_13], $ac0 \n\t" | |
674 "mthi $zero, $ac0 \n\t" | |
675 "mtlo %[const_2_power_13], $ac1 \n\t" | |
676 "mthi $zero, $ac1 \n\t" | |
677 | |
678 "sub %[load5], %[step2_14], %[step2_13] \n\t" | |
679 "sub %[load5], %[load5], %[step2_9] \n\t" | |
680 "add %[load5], %[load5], %[step2_10] \n\t" | |
681 | |
682 "madd $ac0, %[load5], %[cospi_16_64] \n\t" | |
683 | |
684 "sub %[load6], %[step2_14], %[step2_13] \n\t" | |
685 "sub %[load6], %[load6], %[step2_10] \n\t" | |
686 "add %[load6], %[load6], %[step2_9] \n\t" | |
687 | |
688 "madd $ac1, %[load6], %[cospi_16_64] \n\t" | |
689 | |
690 "mtlo %[const_2_power_13], $ac2 \n\t" | |
691 "mthi $zero, $ac2 \n\t" | |
692 "mtlo %[const_2_power_13], $ac3 \n\t" | |
693 "mthi $zero, $ac3 \n\t" | |
694 | |
695 "sub %[load5], %[step2_15], %[step2_12] \n\t" | |
696 "sub %[load5], %[load5], %[step2_8] \n\t" | |
697 "add %[load5], %[load5], %[step2_11] \n\t" | |
698 | |
699 "madd $ac2, %[load5], %[cospi_16_64] \n\t" | |
700 | |
701 "sub %[load6], %[step2_15], %[step2_12] \n\t" | |
702 "sub %[load6], %[load6], %[step2_11] \n\t" | |
703 "add %[load6], %[load6], %[step2_8] \n\t" | |
704 | |
705 "madd $ac3, %[load6], %[cospi_16_64] \n\t" | |
706 | |
707 "extp %[step1_10], $ac0, 31 \n\t" | |
708 "extp %[step1_13], $ac1, 31 \n\t" | |
709 "extp %[step1_11], $ac2, 31 \n\t" | |
710 "extp %[step1_12], $ac3, 31 \n\t" | |
711 | |
712 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
713 [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), | |
714 [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) | |
715 : [const_2_power_13] "r" (const_2_power_13), | |
716 [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), | |
717 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), | |
718 [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), | |
719 [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), | |
720 [cospi_16_64] "r" (cospi_16_64) | |
721 ); | |
722 | |
723 step1_8 = step2_8 + step2_11; | |
724 step1_9 = step2_9 + step2_10; | |
725 step1_14 = step2_13 + step2_14; | |
726 step1_15 = step2_12 + step2_15; | |
727 | |
728 __asm__ __volatile__ ( | |
729 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
730 "add %[load5], %[step1_0], %[step1_7] \n\t" | |
731 "add %[load5], %[load5], %[step1_15] \n\t" | |
732 "addi %[load5], %[load5], 32 \n\t" | |
733 "sra %[load5], %[load5], 6 \n\t" | |
734 "add %[load7], %[load7], %[load5] \n\t" | |
735 "lbux %[load5], %[load7](%[cm]) \n\t" | |
736 "add %[load6], %[step1_1], %[step1_6] \n\t" | |
737 "add %[load6], %[load6], %[step1_14] \n\t" | |
738 "sb %[load5], 0(%[dest_pix]) \n\t" | |
739 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
740 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
741 "addi %[load6], %[load6], 32 \n\t" | |
742 "sra %[load6], %[load6], 6 \n\t" | |
743 "add %[load8], %[load8], %[load6] \n\t" | |
744 "lbux %[load6], %[load8](%[cm]) \n\t" | |
745 "sb %[load6], 0(%[dest_pix]) \n\t" | |
746 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
747 | |
748 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
749 "add %[load5], %[step1_2], %[step1_5] \n\t" | |
750 "add %[load5], %[load5], %[step1_13] \n\t" | |
751 "addi %[load5], %[load5], 32 \n\t" | |
752 "sra %[load5], %[load5], 6 \n\t" | |
753 "add %[load7], %[load7], %[load5] \n\t" | |
754 "lbux %[load5], %[load7](%[cm]) \n\t" | |
755 "add %[load6], %[step1_3], %[step1_4] \n\t" | |
756 "add %[load6], %[load6], %[step1_12] \n\t" | |
757 "sb %[load5], 0(%[dest_pix]) \n\t" | |
758 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
759 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
760 "addi %[load6], %[load6], 32 \n\t" | |
761 "sra %[load6], %[load6], 6 \n\t" | |
762 "add %[load8], %[load8], %[load6] \n\t" | |
763 "lbux %[load6], %[load8](%[cm]) \n\t" | |
764 "sb %[load6], 0(%[dest_pix]) \n\t" | |
765 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
766 | |
767 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
768 "sub %[load5], %[step1_3], %[step1_4] \n\t" | |
769 "add %[load5], %[load5], %[step1_11] \n\t" | |
770 "addi %[load5], %[load5], 32 \n\t" | |
771 "sra %[load5], %[load5], 6 \n\t" | |
772 "add %[load7], %[load7], %[load5] \n\t" | |
773 "lbux %[load5], %[load7](%[cm]) \n\t" | |
774 "sub %[load6], %[step1_2], %[step1_5] \n\t" | |
775 "add %[load6], %[load6], %[step1_10] \n\t" | |
776 "sb %[load5], 0(%[dest_pix]) \n\t" | |
777 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
778 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
779 "addi %[load6], %[load6], 32 \n\t" | |
780 "sra %[load6], %[load6], 6 \n\t" | |
781 "add %[load8], %[load8], %[load6] \n\t" | |
782 "lbux %[load6], %[load8](%[cm]) \n\t" | |
783 "sb %[load6], 0(%[dest_pix]) \n\t" | |
784 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
785 | |
786 "sub %[load5], %[step1_1], %[step1_6] \n\t" | |
787 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
788 "add %[load5], %[load5], %[step1_9] \n\t" | |
789 "addi %[load5], %[load5], 32 \n\t" | |
790 "sra %[load5], %[load5], 6 \n\t" | |
791 "add %[load7], %[load7], %[load5] \n\t" | |
792 "lbux %[load5], %[load7](%[cm]) \n\t" | |
793 "sub %[load6], %[step1_0], %[step1_7] \n\t" | |
794 "add %[load6], %[load6], %[step1_8] \n\t" | |
795 "sb %[load5], 0(%[dest_pix]) \n\t" | |
796 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
797 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
798 "addi %[load6], %[load6], 32 \n\t" | |
799 "sra %[load6], %[load6], 6 \n\t" | |
800 "add %[load8], %[load8], %[load6] \n\t" | |
801 "lbux %[load6], %[load8](%[cm]) \n\t" | |
802 "sb %[load6], 0(%[dest_pix]) \n\t" | |
803 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
804 | |
805 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
806 "sub %[load5], %[step1_0], %[step1_7] \n\t" | |
807 "sub %[load5], %[load5], %[step1_8] \n\t" | |
808 "addi %[load5], %[load5], 32 \n\t" | |
809 "sra %[load5], %[load5], 6 \n\t" | |
810 "add %[load7], %[load7], %[load5] \n\t" | |
811 "lbux %[load5], %[load7](%[cm]) \n\t" | |
812 "sub %[load6], %[step1_1], %[step1_6] \n\t" | |
813 "sub %[load6], %[load6], %[step1_9] \n\t" | |
814 "sb %[load5], 0(%[dest_pix]) \n\t" | |
815 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
816 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
817 "addi %[load6], %[load6], 32 \n\t" | |
818 "sra %[load6], %[load6], 6 \n\t" | |
819 "add %[load8], %[load8], %[load6] \n\t" | |
820 "lbux %[load6], %[load8](%[cm]) \n\t" | |
821 "sb %[load6], 0(%[dest_pix]) \n\t" | |
822 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
823 | |
824 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
825 "sub %[load5], %[step1_2], %[step1_5] \n\t" | |
826 "sub %[load5], %[load5], %[step1_10] \n\t" | |
827 "addi %[load5], %[load5], 32 \n\t" | |
828 "sra %[load5], %[load5], 6 \n\t" | |
829 "add %[load7], %[load7], %[load5] \n\t" | |
830 "lbux %[load5], %[load7](%[cm]) \n\t" | |
831 "sub %[load6], %[step1_3], %[step1_4] \n\t" | |
832 "sub %[load6], %[load6], %[step1_11] \n\t" | |
833 "sb %[load5], 0(%[dest_pix]) \n\t" | |
834 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
835 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
836 "addi %[load6], %[load6], 32 \n\t" | |
837 "sra %[load6], %[load6], 6 \n\t" | |
838 "add %[load8], %[load8], %[load6] \n\t" | |
839 "lbux %[load6], %[load8](%[cm]) \n\t" | |
840 "sb %[load6], 0(%[dest_pix]) \n\t" | |
841 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
842 | |
843 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
844 "add %[load5], %[step1_3], %[step1_4] \n\t" | |
845 "sub %[load5], %[load5], %[step1_12] \n\t" | |
846 "addi %[load5], %[load5], 32 \n\t" | |
847 "sra %[load5], %[load5], 6 \n\t" | |
848 "add %[load7], %[load7], %[load5] \n\t" | |
849 "lbux %[load5], %[load7](%[cm]) \n\t" | |
850 "add %[load6], %[step1_2], %[step1_5] \n\t" | |
851 "sub %[load6], %[load6], %[step1_13] \n\t" | |
852 "sb %[load5], 0(%[dest_pix]) \n\t" | |
853 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
854 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
855 "addi %[load6], %[load6], 32 \n\t" | |
856 "sra %[load6], %[load6], 6 \n\t" | |
857 "add %[load8], %[load8], %[load6] \n\t" | |
858 "lbux %[load6], %[load8](%[cm]) \n\t" | |
859 "sb %[load6], 0(%[dest_pix]) \n\t" | |
860 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
861 | |
862 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
863 "add %[load5], %[step1_1], %[step1_6] \n\t" | |
864 "sub %[load5], %[load5], %[step1_14] \n\t" | |
865 "addi %[load5], %[load5], 32 \n\t" | |
866 "sra %[load5], %[load5], 6 \n\t" | |
867 "add %[load7], %[load7], %[load5] \n\t" | |
868 "lbux %[load5], %[load7](%[cm]) \n\t" | |
869 "add %[load6], %[step1_0], %[step1_7] \n\t" | |
870 "sub %[load6], %[load6], %[step1_15] \n\t" | |
871 "sb %[load5], 0(%[dest_pix]) \n\t" | |
872 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
873 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
874 "addi %[load6], %[load6], 32 \n\t" | |
875 "sra %[load6], %[load6], 6 \n\t" | |
876 "add %[load8], %[load8], %[load6] \n\t" | |
877 "lbux %[load6], %[load8](%[cm]) \n\t" | |
878 "sb %[load6], 0(%[dest_pix]) \n\t" | |
879 | |
880 : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7), | |
881 [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) | |
882 : [cm] "r" (cm), [dest_stride] "r" (dest_stride), | |
883 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), | |
884 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), | |
885 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), | |
886 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), | |
887 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), | |
888 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | |
889 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), | |
890 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) | |
891 ); | |
892 | |
893 input += 16; | |
894 } | |
895 } | |
896 | |
/* Full 16x16 inverse DCT with reconstruction for the MIPS DSPr2 path:
 * dest is updated in place with the clipped sum of its current pixels
 * and the inverse-transformed residual.
 *
 * input       - 16x16 block of dequantized transform coefficients.
 * dest        - 8-bit reconstruction buffer, updated in place.
 * dest_stride - stride (in bytes) between rows of dest.
 */
void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
                                 int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);  /* intermediate row-pass output */
  uint32_t pos = 45;

  /* Bit position for "extract from accumulator": program the DSPControl
   * pos field (wrdsp mask 1) before running the transform helpers, which
   * presumably use extp-style extraction — order matters here. */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  // First pass: 1-D idct on all 16 rows into 'out'.
  idct16_rows_dspr2(input, out, 16);

  // Second pass: 1-D idct on columns, adding the result into dest.
  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
}
915 | |
916 static void iadst16(const int16_t *input, int16_t *output) { | |
917 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | |
918 | |
919 int x0 = input[15]; | |
920 int x1 = input[0]; | |
921 int x2 = input[13]; | |
922 int x3 = input[2]; | |
923 int x4 = input[11]; | |
924 int x5 = input[4]; | |
925 int x6 = input[9]; | |
926 int x7 = input[6]; | |
927 int x8 = input[7]; | |
928 int x9 = input[8]; | |
929 int x10 = input[5]; | |
930 int x11 = input[10]; | |
931 int x12 = input[3]; | |
932 int x13 = input[12]; | |
933 int x14 = input[1]; | |
934 int x15 = input[14]; | |
935 | |
936 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | |
937 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { | |
938 output[0] = output[1] = output[2] = output[3] = output[4] | |
939 = output[5] = output[6] = output[7] = output[8] | |
940 = output[9] = output[10] = output[11] = output[12] | |
941 = output[13] = output[14] = output[15] = 0; | |
942 return; | |
943 } | |
944 | |
945 // stage 1 | |
946 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; | |
947 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; | |
948 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; | |
949 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; | |
950 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; | |
951 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; | |
952 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; | |
953 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; | |
954 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; | |
955 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; | |
956 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; | |
957 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; | |
958 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; | |
959 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; | |
960 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; | |
961 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; | |
962 | |
963 x0 = dct_const_round_shift(s0 + s8); | |
964 x1 = dct_const_round_shift(s1 + s9); | |
965 x2 = dct_const_round_shift(s2 + s10); | |
966 x3 = dct_const_round_shift(s3 + s11); | |
967 x4 = dct_const_round_shift(s4 + s12); | |
968 x5 = dct_const_round_shift(s5 + s13); | |
969 x6 = dct_const_round_shift(s6 + s14); | |
970 x7 = dct_const_round_shift(s7 + s15); | |
971 x8 = dct_const_round_shift(s0 - s8); | |
972 x9 = dct_const_round_shift(s1 - s9); | |
973 x10 = dct_const_round_shift(s2 - s10); | |
974 x11 = dct_const_round_shift(s3 - s11); | |
975 x12 = dct_const_round_shift(s4 - s12); | |
976 x13 = dct_const_round_shift(s5 - s13); | |
977 x14 = dct_const_round_shift(s6 - s14); | |
978 x15 = dct_const_round_shift(s7 - s15); | |
979 | |
980 // stage 2 | |
981 s0 = x0; | |
982 s1 = x1; | |
983 s2 = x2; | |
984 s3 = x3; | |
985 s4 = x4; | |
986 s5 = x5; | |
987 s6 = x6; | |
988 s7 = x7; | |
989 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; | |
990 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; | |
991 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; | |
992 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; | |
993 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; | |
994 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; | |
995 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; | |
996 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; | |
997 | |
998 x0 = s0 + s4; | |
999 x1 = s1 + s5; | |
1000 x2 = s2 + s6; | |
1001 x3 = s3 + s7; | |
1002 x4 = s0 - s4; | |
1003 x5 = s1 - s5; | |
1004 x6 = s2 - s6; | |
1005 x7 = s3 - s7; | |
1006 x8 = dct_const_round_shift(s8 + s12); | |
1007 x9 = dct_const_round_shift(s9 + s13); | |
1008 x10 = dct_const_round_shift(s10 + s14); | |
1009 x11 = dct_const_round_shift(s11 + s15); | |
1010 x12 = dct_const_round_shift(s8 - s12); | |
1011 x13 = dct_const_round_shift(s9 - s13); | |
1012 x14 = dct_const_round_shift(s10 - s14); | |
1013 x15 = dct_const_round_shift(s11 - s15); | |
1014 | |
1015 // stage 3 | |
1016 s0 = x0; | |
1017 s1 = x1; | |
1018 s2 = x2; | |
1019 s3 = x3; | |
1020 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; | |
1021 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; | |
1022 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; | |
1023 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; | |
1024 s8 = x8; | |
1025 s9 = x9; | |
1026 s10 = x10; | |
1027 s11 = x11; | |
1028 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; | |
1029 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; | |
1030 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; | |
1031 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; | |
1032 | |
1033 x0 = s0 + s2; | |
1034 x1 = s1 + s3; | |
1035 x2 = s0 - s2; | |
1036 x3 = s1 - s3; | |
1037 x4 = dct_const_round_shift(s4 + s6); | |
1038 x5 = dct_const_round_shift(s5 + s7); | |
1039 x6 = dct_const_round_shift(s4 - s6); | |
1040 x7 = dct_const_round_shift(s5 - s7); | |
1041 x8 = s8 + s10; | |
1042 x9 = s9 + s11; | |
1043 x10 = s8 - s10; | |
1044 x11 = s9 - s11; | |
1045 x12 = dct_const_round_shift(s12 + s14); | |
1046 x13 = dct_const_round_shift(s13 + s15); | |
1047 x14 = dct_const_round_shift(s12 - s14); | |
1048 x15 = dct_const_round_shift(s13 - s15); | |
1049 | |
1050 // stage 4 | |
1051 s2 = (- cospi_16_64) * (x2 + x3); | |
1052 s3 = cospi_16_64 * (x2 - x3); | |
1053 s6 = cospi_16_64 * (x6 + x7); | |
1054 s7 = cospi_16_64 * (- x6 + x7); | |
1055 s10 = cospi_16_64 * (x10 + x11); | |
1056 s11 = cospi_16_64 * (- x10 + x11); | |
1057 s14 = (- cospi_16_64) * (x14 + x15); | |
1058 s15 = cospi_16_64 * (x14 - x15); | |
1059 | |
1060 x2 = dct_const_round_shift(s2); | |
1061 x3 = dct_const_round_shift(s3); | |
1062 x6 = dct_const_round_shift(s6); | |
1063 x7 = dct_const_round_shift(s7); | |
1064 x10 = dct_const_round_shift(s10); | |
1065 x11 = dct_const_round_shift(s11); | |
1066 x14 = dct_const_round_shift(s14); | |
1067 x15 = dct_const_round_shift(s15); | |
1068 | |
1069 output[0] = x0; | |
1070 output[1] = -x8; | |
1071 output[2] = x12; | |
1072 output[3] = -x4; | |
1073 output[4] = x6; | |
1074 output[5] = x14; | |
1075 output[6] = x10; | |
1076 output[7] = x2; | |
1077 output[8] = x3; | |
1078 output[9] = x11; | |
1079 output[10] = x15; | |
1080 output[11] = x7; | |
1081 output[12] = x5; | |
1082 output[13] = -x13; | |
1083 output[14] = x9; | |
1084 output[15] = -x1; | |
1085 } | |
1086 | |
1087 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, | 24 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
1088 int pitch, int tx_type) { | 25 int pitch, int tx_type) { |
1089 int i, j; | 26 int i, j; |
1090 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | 27 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
1091 int16_t *outptr = out; | 28 int16_t *outptr = out; |
1092 int16_t temp_out[16]; | 29 int16_t temp_out[16]; |
1093 uint32_t pos = 45; | 30 uint32_t pos = 45; |
1094 | 31 |
1095 /* bit positon for extract from acc */ | 32 /* bit positon for extract from acc */ |
1096 __asm__ __volatile__ ( | 33 __asm__ __volatile__ ( |
1097 "wrdsp %[pos], 1 \n\t" | 34 "wrdsp %[pos], 1 \n\t" |
1098 : | 35 : |
1099 : [pos] "r" (pos) | 36 : [pos] "r" (pos) |
1100 ); | 37 ); |
1101 | 38 |
1102 switch (tx_type) { | 39 switch (tx_type) { |
1103 case DCT_DCT: // DCT in both horizontal and vertical | 40 case DCT_DCT: // DCT in both horizontal and vertical |
1104 idct16_rows_dspr2(input, outptr, 16); | 41 idct16_rows_dspr2(input, outptr, 16); |
1105 idct16_cols_add_blk_dspr2(out, dest, pitch); | 42 idct16_cols_add_blk_dspr2(out, dest, pitch); |
1106 break; | 43 break; |
1107 case ADST_DCT: // ADST in vertical, DCT in horizontal | 44 case ADST_DCT: // ADST in vertical, DCT in horizontal |
1108 idct16_rows_dspr2(input, outptr, 16); | 45 idct16_rows_dspr2(input, outptr, 16); |
1109 | 46 |
1110 outptr = out; | 47 outptr = out; |
1111 | 48 |
1112 for (i = 0; i < 16; ++i) { | 49 for (i = 0; i < 16; ++i) { |
1113 iadst16(outptr, temp_out); | 50 iadst16_dspr2(outptr, temp_out); |
1114 | 51 |
1115 for (j = 0; j < 16; ++j) | 52 for (j = 0; j < 16; ++j) |
1116 dest[j * pitch + i] = | 53 dest[j * pitch + i] = |
1117 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 54 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
1118 + dest[j * pitch + i]); | 55 + dest[j * pitch + i]); |
1119 outptr += 16; | 56 outptr += 16; |
1120 } | 57 } |
1121 break; | 58 break; |
1122 case DCT_ADST: // DCT in vertical, ADST in horizontal | 59 case DCT_ADST: // DCT in vertical, ADST in horizontal |
1123 { | 60 { |
1124 int16_t temp_in[16 * 16]; | 61 int16_t temp_in[16 * 16]; |
1125 | 62 |
1126 for (i = 0; i < 16; ++i) { | 63 for (i = 0; i < 16; ++i) { |
1127 /* prefetch row */ | 64 /* prefetch row */ |
1128 prefetch_load((const uint8_t *)(input + 16)); | 65 prefetch_load((const uint8_t *)(input + 16)); |
1129 | 66 |
1130 iadst16(input, outptr); | 67 iadst16_dspr2(input, outptr); |
1131 input += 16; | 68 input += 16; |
1132 outptr += 16; | 69 outptr += 16; |
1133 } | 70 } |
1134 | 71 |
1135 for (i = 0; i < 16; ++i) | 72 for (i = 0; i < 16; ++i) |
1136 for (j = 0; j < 16; ++j) | 73 for (j = 0; j < 16; ++j) |
1137 temp_in[j * 16 + i] = out[i * 16 + j]; | 74 temp_in[j * 16 + i] = out[i * 16 + j]; |
1138 | 75 |
1139 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); | 76 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); |
1140 } | 77 } |
1141 break; | 78 break; |
1142 case ADST_ADST: // ADST in both directions | 79 case ADST_ADST: // ADST in both directions |
1143 { | 80 { |
1144 int16_t temp_in[16]; | 81 int16_t temp_in[16]; |
1145 | 82 |
1146 for (i = 0; i < 16; ++i) { | 83 for (i = 0; i < 16; ++i) { |
1147 /* prefetch row */ | 84 /* prefetch row */ |
1148 prefetch_load((const uint8_t *)(input + 16)); | 85 prefetch_load((const uint8_t *)(input + 16)); |
1149 | 86 |
1150 iadst16(input, outptr); | 87 iadst16_dspr2(input, outptr); |
1151 input += 16; | 88 input += 16; |
1152 outptr += 16; | 89 outptr += 16; |
1153 } | 90 } |
1154 | 91 |
1155 for (i = 0; i < 16; ++i) { | 92 for (i = 0; i < 16; ++i) { |
1156 for (j = 0; j < 16; ++j) | 93 for (j = 0; j < 16; ++j) |
1157 temp_in[j] = out[j * 16 + i]; | 94 temp_in[j] = out[j * 16 + i]; |
1158 iadst16(temp_in, temp_out); | 95 iadst16_dspr2(temp_in, temp_out); |
1159 for (j = 0; j < 16; ++j) | 96 for (j = 0; j < 16; ++j) |
1160 dest[j * pitch + i] = | 97 dest[j * pitch + i] = |
1161 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 98 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
1162 + dest[j * pitch + i]); | 99 + dest[j * pitch + i]); |
1163 } | 100 } |
1164 } | 101 } |
1165 break; | 102 break; |
1166 default: | 103 default: |
1167 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); | 104 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); |
1168 break; | 105 break; |
1169 } | 106 } |
1170 } | 107 } |
1171 | |
/* 16x16 inverse DCT for sparsely-populated blocks: only the upper-left
 * 4x4 corner of the coefficient matrix is non-zero, so the row pass is
 * computed for 4 rows and the rest of the intermediate buffer is zeroed
 * before the full column pass adds the result into dest.
 *
 * input       - coefficients (non-zero only in the upper-left 4x4).
 * dest        - 8-bit reconstruction buffer, updated in place.
 * dest_stride - stride (in bytes) between rows of dest.
 */
void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
                                int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
  int16_t *outptr = out;
  uint32_t i;
  uint32_t pos = 45;  /* DSPControl pos value expected by the helpers */

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  idct16_rows_dspr2(input, outptr, 4);

  /* Zero the untouched 12 trailing elements of every 16-element row of
   * 'out'.  Each 'sw $zero' clears two int16_t values (4 bytes); the
   * offsets advance 32 bytes (one row) at a time, so a single asm block
   * clears the same element pair in all 16 rows, and the six iterations
   * (outptr += 2 each) cover element positions 4..15. */
  outptr += 4;
  for (i = 0; i < 6; ++i) {
    __asm__ __volatile__ (
      "sw     $zero,      0(%[outptr])     \n\t"
      "sw     $zero,     32(%[outptr])     \n\t"
      "sw     $zero,     64(%[outptr])     \n\t"
      "sw     $zero,     96(%[outptr])     \n\t"
      "sw     $zero,    128(%[outptr])     \n\t"
      "sw     $zero,    160(%[outptr])     \n\t"
      "sw     $zero,    192(%[outptr])     \n\t"
      "sw     $zero,    224(%[outptr])     \n\t"
      "sw     $zero,    256(%[outptr])     \n\t"
      "sw     $zero,    288(%[outptr])     \n\t"
      "sw     $zero,    320(%[outptr])     \n\t"
      "sw     $zero,    352(%[outptr])     \n\t"
      "sw     $zero,    384(%[outptr])     \n\t"
      "sw     $zero,    416(%[outptr])     \n\t"
      "sw     $zero,    448(%[outptr])     \n\t"
      "sw     $zero,    480(%[outptr])     \n\t"

      :
      : [outptr] "r" (outptr)
    );

    outptr += 2;
  }

  // Then transform columns, adding the result into dest.
  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
}
1220 | |
/* 16x16 inverse DCT for a DC-only block.  With a single non-zero (DC)
 * coefficient every residual sample is the same value a1, so the whole
 * transform collapses to one rounded constant that is added to (or, when
 * negative, subtracted from) all 256 destination pixels using saturating
 * quad-byte SIMD operations.
 *
 * input       - coefficient block; only input[0] is used.
 * dest        - 8-bit reconstruction buffer, updated in place; rows are
 *               assumed 4-byte aligned (word loads/stores below).
 * dest_stride - stride (in bytes) between rows of dest.
 */
void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
  uint32_t pos = 45;
  int32_t out;                 /* DC value after both transform passes */
  int32_t r;                   /* row counter */
  int32_t a1, absa1;           /* per-pixel residual and its magnitude */
  int32_t vector_a1;           /* |a1| replicated into all 4 byte lanes */
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  /* DC scaled by cospi_16_64 twice (row pass + column pass), with dct
   * rounding applied after each multiply. */
  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  /* Final reconstruction rounding: a1 = (out + 32) >> 6. */
  __asm__ __volatile__ (
    "addi     %[out],     %[out],    32      \n\t"
    "sra      %[a1],      %[out],    6       \n\t"

    : [out] "+r" (out), [a1] "=r" (a1)
    :
  );

  if (a1 < 0) {
    /* Negative residual: replicate |a1| into each byte lane and subtract
     * with unsigned saturation (clamps at 0).
     * use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
      "abs        %[absa1],       %[a1]       \n\t"
      "replv.qb   %[vector_a1],   %[absa1]    \n\t"

      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    /* 16 rows of 16 pixels, handled four bytes per lw/sw pair. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
        "lw             %[t1],          0(%[dest])                      \n\t"
        "lw             %[t2],          4(%[dest])                      \n\t"
        "lw             %[t3],          8(%[dest])                      \n\t"
        "lw             %[t4],          12(%[dest])                     \n\t"
        "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
        "sw             %[vector_1],    0(%[dest])                      \n\t"
        "sw             %[vector_2],    4(%[dest])                      \n\t"
        "sw             %[vector_3],    8(%[dest])                      \n\t"
        "sw             %[vector_4],    12(%[dest])                     \n\t"
        "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* Non-negative residual: replicate a1 and add with unsigned
     * saturation (clamps at 255).
     * use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
      "replv.qb   %[vector_a1],   %[a1]   \n\t"

      : [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    /* 16 rows of 16 pixels, handled four bytes per lw/sw pair. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
        "lw             %[t1],          0(%[dest])                      \n\t"
        "lw             %[t2],          4(%[dest])                      \n\t"
        "lw             %[t3],          8(%[dest])                      \n\t"
        "lw             %[t4],          12(%[dest])                     \n\t"
        "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
        "sw             %[vector_1],    0(%[dest])                      \n\t"
        "sw             %[vector_2],    4(%[dest])                      \n\t"
        "sw             %[vector_3],    8(%[dest])                      \n\t"
        "sw             %[vector_4],    12(%[dest])                     \n\t"
        "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}
1317 #endif // #if HAVE_DSPR2 | 108 #endif // #if HAVE_DSPR2 |
OLD | NEW |