OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 #include <stdio.h> | 12 #include <stdio.h> |
13 | 13 |
14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" |
16 #include "vp9/common/vp9_common.h" | 16 #include "vp9/common/vp9_common.h" |
17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
18 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | 18 #include "vpx_dsp/mips/inv_txfm_dspr2.h" |
19 #include "vpx_dsp/txfm_common.h" | 19 #include "vpx_dsp/txfm_common.h" |
20 #include "vpx_ports/mem.h" | 20 #include "vpx_ports/mem.h" |
21 | 21 |
22 #if HAVE_DSPR2 | 22 #if HAVE_DSPR2 |
23 static void idct8_rows_dspr2(const int16_t *input, int16_t *output, | |
24 uint32_t no_rows) { | |
25 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | |
26 const int const_2_power_13 = 8192; | |
27 int Temp0, Temp1, Temp2, Temp3, Temp4; | |
28 int i; | |
29 | |
30 for (i = no_rows; i--; ) { | |
31 __asm__ __volatile__ ( | |
32 /* | |
33 temp_1 = (input[0] + input[4]) * cospi_16_64; | |
34 step2_0 = dct_const_round_shift(temp_1); | |
35 | |
36 temp_2 = (input[0] - input[4]) * cospi_16_64; | |
37 step2_1 = dct_const_round_shift(temp_2); | |
38 */ | |
39 "lh %[Temp0], 0(%[input]) \n\t" | |
40 "lh %[Temp1], 8(%[input]) \n\t" | |
41 "mtlo %[const_2_power_13], $ac0 \n\t" | |
42 "mthi $zero, $ac0 \n\t" | |
43 "mtlo %[const_2_power_13], $ac1 \n\t" | |
44 "mthi $zero, $ac1 \n\t" | |
45 "add %[Temp2], %[Temp0], %[Temp1] \n\t" | |
46 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" | |
47 "extp %[Temp4], $ac0, 31 \n\t" | |
48 | |
49 "sub %[Temp3], %[Temp0], %[Temp1] \n\t" | |
50 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" | |
51 "mtlo %[const_2_power_13], $ac0 \n\t" | |
52 "mthi $zero, $ac0 \n\t" | |
53 "extp %[Temp2], $ac1, 31 \n\t" | |
54 | |
55 /* | |
56 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; | |
57 step2_2 = dct_const_round_shift(temp_1); | |
58 */ | |
59 "lh %[Temp0], 4(%[input]) \n\t" | |
60 "lh %[Temp1], 12(%[input]) \n\t" | |
61 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" | |
62 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" | |
63 "mtlo %[const_2_power_13], $ac1 \n\t" | |
64 "mthi $zero, $ac1 \n\t" | |
65 "extp %[Temp3], $ac0, 31 \n\t" | |
66 | |
67 /* | |
68 step1_1 = step2_1 + step2_2; | |
69 step1_2 = step2_1 - step2_2; | |
70 */ | |
71 "add %[step1_1], %[Temp2], %[Temp3] \n\t" | |
72 "sub %[step1_2], %[Temp2], %[Temp3] \n\t" | |
73 | |
74 /* | |
75 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; | |
76 step2_3 = dct_const_round_shift(temp_2); | |
77 */ | |
78 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" | |
79 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" | |
80 "extp %[Temp1], $ac1, 31 \n\t" | |
81 | |
82 "mtlo %[const_2_power_13], $ac0 \n\t" | |
83 "mthi $zero, $ac0 \n\t" | |
84 | |
85 /* | |
86 step1_0 = step2_0 + step2_3; | |
87 step1_3 = step2_0 - step2_3; | |
88 */ | |
89 "add %[step1_0], %[Temp4], %[Temp1] \n\t" | |
90 "sub %[step1_3], %[Temp4], %[Temp1] \n\t" | |
91 | |
92 /* | |
93 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; | |
94 step1_4 = dct_const_round_shift(temp_1); | |
95 */ | |
96 "lh %[Temp0], 2(%[input]) \n\t" | |
97 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" | |
98 "mtlo %[const_2_power_13], $ac1 \n\t" | |
99 "mthi $zero, $ac1 \n\t" | |
100 "lh %[Temp1], 14(%[input]) \n\t" | |
101 "lh %[Temp0], 2(%[input]) \n\t" | |
102 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" | |
103 "extp %[step1_4], $ac0, 31 \n\t" | |
104 | |
105 /* | |
106 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; | |
107 step1_7 = dct_const_round_shift(temp_2); | |
108 */ | |
109 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" | |
110 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" | |
111 "extp %[step1_7], $ac1, 31 \n\t" | |
112 | |
113 /* | |
114 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; | |
115 step1_5 = dct_const_round_shift(temp_1); | |
116 */ | |
117 "mtlo %[const_2_power_13], $ac0 \n\t" | |
118 "mthi $zero, $ac0 \n\t" | |
119 "lh %[Temp0], 10(%[input]) \n\t" | |
120 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" | |
121 "lh %[Temp1], 6(%[input]) \n\t" | |
122 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" | |
123 "extp %[step1_5], $ac0, 31 \n\t" | |
124 | |
125 /* | |
126 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; | |
127 step1_6 = dct_const_round_shift(temp_2); | |
128 */ | |
129 "mtlo %[const_2_power_13], $ac1 \n\t" | |
130 "mthi $zero, $ac1 \n\t" | |
131 "lh %[Temp0], 10(%[input]) \n\t" | |
132 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" | |
133 "lh %[Temp1], 6(%[input]) \n\t" | |
134 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" | |
135 "extp %[step1_6], $ac1, 31 \n\t" | |
136 | |
137 /* | |
138 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; | |
139 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; | |
140 */ | |
141 "sub %[Temp0], %[step1_7], %[step1_6] \n\t" | |
142 "sub %[Temp0], %[Temp0], %[step1_4] \n\t" | |
143 "add %[Temp0], %[Temp0], %[step1_5] \n\t" | |
144 "sub %[Temp1], %[step1_4], %[step1_5] \n\t" | |
145 "sub %[Temp1], %[Temp1], %[step1_6] \n\t" | |
146 "add %[Temp1], %[Temp1], %[step1_7] \n\t" | |
147 | |
148 "mtlo %[const_2_power_13], $ac0 \n\t" | |
149 "mthi $zero, $ac0 \n\t" | |
150 "mtlo %[const_2_power_13], $ac1 \n\t" | |
151 "mthi $zero, $ac1 \n\t" | |
152 | |
153 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" | |
154 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" | |
155 | |
156 /* | |
157 step1_4 = step1_4 + step1_5; | |
158 step1_7 = step1_6 + step1_7; | |
159 */ | |
160 "add %[step1_4], %[step1_4], %[step1_5] \n\t" | |
161 "add %[step1_7], %[step1_7], %[step1_6] \n\t" | |
162 | |
163 "extp %[step1_5], $ac0, 31 \n\t" | |
164 "extp %[step1_6], $ac1, 31 \n\t" | |
165 | |
166 "add %[Temp0], %[step1_0], %[step1_7] \n\t" | |
167 "sh %[Temp0], 0(%[output]) \n\t" | |
168 "add %[Temp1], %[step1_1], %[step1_6] \n\t" | |
169 "sh %[Temp1], 16(%[output]) \n\t" | |
170 "add %[Temp0], %[step1_2], %[step1_5] \n\t" | |
171 "sh %[Temp0], 32(%[output]) \n\t" | |
172 "add %[Temp1], %[step1_3], %[step1_4] \n\t" | |
173 "sh %[Temp1], 48(%[output]) \n\t" | |
174 | |
175 "sub %[Temp0], %[step1_3], %[step1_4] \n\t" | |
176 "sh %[Temp0], 64(%[output]) \n\t" | |
177 "sub %[Temp1], %[step1_2], %[step1_5] \n\t" | |
178 "sh %[Temp1], 80(%[output]) \n\t" | |
179 "sub %[Temp0], %[step1_1], %[step1_6] \n\t" | |
180 "sh %[Temp0], 96(%[output]) \n\t" | |
181 "sub %[Temp1], %[step1_0], %[step1_7] \n\t" | |
182 "sh %[Temp1], 112(%[output]) \n\t" | |
183 | |
184 : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), | |
185 [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), | |
186 [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), | |
187 [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), | |
188 [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), | |
189 [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
190 [Temp4] "=&r" (Temp4) | |
191 : [const_2_power_13] "r" (const_2_power_13), | |
192 [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), | |
193 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), | |
194 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), | |
195 [cospi_24_64] "r" (cospi_24_64), | |
196 [output] "r" (output), [input] "r" (input) | |
197 ); | |
198 | |
199 input += 8; | |
200 output += 1; | |
201 } | |
202 } | |
203 | |
204 static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, | |
205 int dest_stride) { | |
206 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | |
207 int Temp0, Temp1, Temp2, Temp3; | |
208 int i; | |
209 const int const_2_power_13 = 8192; | |
210 uint8_t *dest_pix; | |
211 uint8_t *cm = vpx_ff_cropTbl; | |
212 | |
213 /* prefetch vpx_ff_cropTbl */ | |
214 prefetch_load(vpx_ff_cropTbl); | |
215 prefetch_load(vpx_ff_cropTbl + 32); | |
216 prefetch_load(vpx_ff_cropTbl + 64); | |
217 prefetch_load(vpx_ff_cropTbl + 96); | |
218 prefetch_load(vpx_ff_cropTbl + 128); | |
219 prefetch_load(vpx_ff_cropTbl + 160); | |
220 prefetch_load(vpx_ff_cropTbl + 192); | |
221 prefetch_load(vpx_ff_cropTbl + 224); | |
222 | |
223 for (i = 0; i < 8; ++i) { | |
224 dest_pix = (dest + i); | |
225 | |
226 __asm__ __volatile__ ( | |
227 /* | |
228 temp_1 = (input[0] + input[4]) * cospi_16_64; | |
229 step2_0 = dct_const_round_shift(temp_1); | |
230 | |
231 temp_2 = (input[0] - input[4]) * cospi_16_64; | |
232 step2_1 = dct_const_round_shift(temp_2); | |
233 */ | |
234 "lh %[Temp0], 0(%[input]) \n\t" | |
235 "lh %[Temp1], 8(%[input]) \n\t" | |
236 "mtlo %[const_2_power_13], $ac0 \n\t" | |
237 "mthi $zero, $ac0 \n\t" | |
238 "mtlo %[const_2_power_13], $ac1 \n\t" | |
239 "mthi $zero, $ac1 \n\t" | |
240 "add %[Temp2], %[Temp0], %[Temp1] \n\t" | |
241 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" | |
242 "extp %[step1_6], $ac0, 31 \n\t" | |
243 | |
244 "sub %[Temp3], %[Temp0], %[Temp1] \n\t" | |
245 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" | |
246 "mtlo %[const_2_power_13], $ac0 \n\t" | |
247 "mthi $zero, $ac0 \n\t" | |
248 "extp %[Temp2], $ac1, 31 \n\t" | |
249 | |
250 /* | |
251 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; | |
252 step2_2 = dct_const_round_shift(temp_1); | |
253 */ | |
254 "lh %[Temp0], 4(%[input]) \n\t" | |
255 "lh %[Temp1], 12(%[input]) \n\t" | |
256 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" | |
257 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" | |
258 "mtlo %[const_2_power_13], $ac1 \n\t" | |
259 "mthi $zero, $ac1 \n\t" | |
260 "extp %[Temp3], $ac0, 31 \n\t" | |
261 | |
262 /* | |
263 step1_1 = step2_1 + step2_2; | |
264 step1_2 = step2_1 - step2_2; | |
265 */ | |
266 "add %[step1_1], %[Temp2], %[Temp3] \n\t" | |
267 "sub %[step1_2], %[Temp2], %[Temp3] \n\t" | |
268 | |
269 /* | |
270 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; | |
271 step2_3 = dct_const_round_shift(temp_2); | |
272 */ | |
273 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" | |
274 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" | |
275 "extp %[Temp1], $ac1, 31 \n\t" | |
276 | |
277 "mtlo %[const_2_power_13], $ac0 \n\t" | |
278 "mthi $zero, $ac0 \n\t" | |
279 | |
280 /* | |
281 step1_0 = step2_0 + step2_3; | |
282 step1_3 = step2_0 - step2_3; | |
283 */ | |
284 "add %[step1_0], %[step1_6], %[Temp1] \n\t" | |
285 "sub %[step1_3], %[step1_6], %[Temp1] \n\t" | |
286 | |
287 /* | |
288 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; | |
289 step1_4 = dct_const_round_shift(temp_1); | |
290 */ | |
291 "lh %[Temp0], 2(%[input]) \n\t" | |
292 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" | |
293 "mtlo %[const_2_power_13], $ac1 \n\t" | |
294 "mthi $zero, $ac1 \n\t" | |
295 "lh %[Temp1], 14(%[input]) \n\t" | |
296 "lh %[Temp0], 2(%[input]) \n\t" | |
297 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" | |
298 "extp %[step1_4], $ac0, 31 \n\t" | |
299 | |
300 /* | |
301 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; | |
302 step1_7 = dct_const_round_shift(temp_2); | |
303 */ | |
304 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" | |
305 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" | |
306 "extp %[step1_7], $ac1, 31 \n\t" | |
307 | |
308 /* | |
309 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; | |
310 step1_5 = dct_const_round_shift(temp_1); | |
311 */ | |
312 "mtlo %[const_2_power_13], $ac0 \n\t" | |
313 "mthi $zero, $ac0 \n\t" | |
314 "lh %[Temp0], 10(%[input]) \n\t" | |
315 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" | |
316 "lh %[Temp1], 6(%[input]) \n\t" | |
317 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" | |
318 "extp %[step1_5], $ac0, 31 \n\t" | |
319 | |
320 /* | |
321 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; | |
322 step1_6 = dct_const_round_shift(temp_2); | |
323 */ | |
324 "mtlo %[const_2_power_13], $ac1 \n\t" | |
325 "mthi $zero, $ac1 \n\t" | |
326 "lh %[Temp0], 10(%[input]) \n\t" | |
327 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" | |
328 "lh %[Temp1], 6(%[input]) \n\t" | |
329 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" | |
330 "extp %[step1_6], $ac1, 31 \n\t" | |
331 | |
332 /* | |
333 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; | |
334 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; | |
335 */ | |
336 "sub %[Temp0], %[step1_7], %[step1_6] \n\t" | |
337 "sub %[Temp0], %[Temp0], %[step1_4] \n\t" | |
338 "add %[Temp0], %[Temp0], %[step1_5] \n\t" | |
339 "sub %[Temp1], %[step1_4], %[step1_5] \n\t" | |
340 "sub %[Temp1], %[Temp1], %[step1_6] \n\t" | |
341 "add %[Temp1], %[Temp1], %[step1_7] \n\t" | |
342 | |
343 "mtlo %[const_2_power_13], $ac0 \n\t" | |
344 "mthi $zero, $ac0 \n\t" | |
345 "mtlo %[const_2_power_13], $ac1 \n\t" | |
346 "mthi $zero, $ac1 \n\t" | |
347 | |
348 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" | |
349 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" | |
350 | |
351 /* | |
352 step1_4 = step1_4 + step1_5; | |
353 step1_7 = step1_6 + step1_7; | |
354 */ | |
355 "add %[step1_4], %[step1_4], %[step1_5] \n\t" | |
356 "add %[step1_7], %[step1_7], %[step1_6] \n\t" | |
357 | |
358 "extp %[step1_5], $ac0, 31 \n\t" | |
359 "extp %[step1_6], $ac1, 31 \n\t" | |
360 | |
361 /* add block */ | |
362 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
363 "add %[Temp0], %[step1_0], %[step1_7] \n\t" | |
364 "addi %[Temp0], %[Temp0], 16 \n\t" | |
365 "sra %[Temp0], %[Temp0], 5 \n\t" | |
366 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
367 "add %[Temp0], %[step1_1], %[step1_6] \n\t" | |
368 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
369 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
370 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
371 | |
372 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
373 "addi %[Temp0], %[Temp0], 16 \n\t" | |
374 "sra %[Temp0], %[Temp0], 5 \n\t" | |
375 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
376 "add %[Temp0], %[step1_2], %[step1_5] \n\t" | |
377 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
378 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
379 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
380 | |
381 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
382 "addi %[Temp0], %[Temp0], 16 \n\t" | |
383 "sra %[Temp0], %[Temp0], 5 \n\t" | |
384 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
385 "add %[Temp0], %[step1_3], %[step1_4] \n\t" | |
386 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
387 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
388 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
389 | |
390 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
391 "addi %[Temp0], %[Temp0], 16 \n\t" | |
392 "sra %[Temp0], %[Temp0], 5 \n\t" | |
393 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
394 "sub %[Temp0], %[step1_3], %[step1_4] \n\t" | |
395 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
396 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
397 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
398 | |
399 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
400 "addi %[Temp0], %[Temp0], 16 \n\t" | |
401 "sra %[Temp0], %[Temp0], 5 \n\t" | |
402 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
403 "sub %[Temp0], %[step1_2], %[step1_5] \n\t" | |
404 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
405 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
406 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
407 | |
408 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
409 "addi %[Temp0], %[Temp0], 16 \n\t" | |
410 "sra %[Temp0], %[Temp0], 5 \n\t" | |
411 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
412 "sub %[Temp0], %[step1_1], %[step1_6] \n\t" | |
413 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
414 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
415 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
416 | |
417 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
418 "addi %[Temp0], %[Temp0], 16 \n\t" | |
419 "sra %[Temp0], %[Temp0], 5 \n\t" | |
420 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
421 "sub %[Temp0], %[step1_0], %[step1_7] \n\t" | |
422 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
423 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
424 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
425 | |
426 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
427 "addi %[Temp0], %[Temp0], 16 \n\t" | |
428 "sra %[Temp0], %[Temp0], 5 \n\t" | |
429 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
430 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
431 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
432 | |
433 : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), | |
434 [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), | |
435 [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), | |
436 [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), | |
437 [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), | |
438 [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
439 [dest_pix] "+r" (dest_pix) | |
440 : [const_2_power_13] "r" (const_2_power_13), | |
441 [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), | |
442 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), | |
443 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), | |
444 [cospi_24_64] "r" (cospi_24_64), | |
445 [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) | |
446 ); | |
447 | |
448 input += 8; | |
449 } | |
450 } | |
451 | |
452 void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, | |
453 int dest_stride) { | |
454 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); | |
455 int16_t *outptr = out; | |
456 uint32_t pos = 45; | |
457 | |
458 /* bit position for extract from acc */ | |
459 __asm__ __volatile__ ( | |
460 "wrdsp %[pos], 1 \n\t" | |
461 : | |
462 : [pos] "r" (pos) | |
463 ); | |
464 | |
465 // First transform rows | |
466 idct8_rows_dspr2(input, outptr, 8); | |
467 | |
468 // Then transform columns and add to dest | |
469 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); | |
470 } | |
471 | |
472 static void iadst8_dspr2(const int16_t *input, int16_t *output) { | |
473 int s0, s1, s2, s3, s4, s5, s6, s7; | |
474 int x0, x1, x2, x3, x4, x5, x6, x7; | |
475 | |
476 x0 = input[7]; | |
477 x1 = input[0]; | |
478 x2 = input[5]; | |
479 x3 = input[2]; | |
480 x4 = input[3]; | |
481 x5 = input[4]; | |
482 x6 = input[1]; | |
483 x7 = input[6]; | |
484 | |
485 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { | |
486 output[0] = output[1] = output[2] = output[3] = output[4] | |
487 = output[5] = output[6] = output[7] = 0; | |
488 return; | |
489 } | |
490 | |
491 // stage 1 | |
492 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; | |
493 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; | |
494 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; | |
495 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; | |
496 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; | |
497 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; | |
498 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; | |
499 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; | |
500 | |
501 x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); | |
502 x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); | |
503 x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); | |
504 x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); | |
505 x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); | |
506 x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); | |
507 x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); | |
508 x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); | |
509 | |
510 // stage 2 | |
511 s0 = x0; | |
512 s1 = x1; | |
513 s2 = x2; | |
514 s3 = x3; | |
515 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; | |
516 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; | |
517 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; | |
518 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; | |
519 | |
520 x0 = s0 + s2; | |
521 x1 = s1 + s3; | |
522 x2 = s0 - s2; | |
523 x3 = s1 - s3; | |
524 x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); | |
525 x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); | |
526 x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); | |
527 x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); | |
528 | |
529 // stage 3 | |
530 s2 = cospi_16_64 * (x2 + x3); | |
531 s3 = cospi_16_64 * (x2 - x3); | |
532 s6 = cospi_16_64 * (x6 + x7); | |
533 s7 = cospi_16_64 * (x6 - x7); | |
534 | |
535 x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); | |
536 x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); | |
537 x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); | |
538 x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); | |
539 | |
540 output[0] = x0; | |
541 output[1] = -x4; | |
542 output[2] = x6; | |
543 output[3] = -x2; | |
544 output[4] = x3; | |
545 output[5] = -x7; | |
546 output[6] = x5; | |
547 output[7] = -x1; | |
548 } | |
549 | |
550 void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, | 23 void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, |
551 int dest_stride, int tx_type) { | 24 int dest_stride, int tx_type) { |
552 int i, j; | 25 int i, j; |
553 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); | 26 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); |
554 int16_t *outptr = out; | 27 int16_t *outptr = out; |
555 int16_t temp_in[8 * 8], temp_out[8]; | 28 int16_t temp_in[8 * 8], temp_out[8]; |
556 uint32_t pos = 45; | 29 uint32_t pos = 45; |
557 | 30 |
558 /* bit position for extract from acc */ | 31 /* bit position for extract from acc */ |
559 __asm__ __volatile__ ( | 32 __asm__ __volatile__ ( |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
610 dest[j * dest_stride + i] = | 83 dest[j * dest_stride + i] = |
611 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) | 84 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) |
612 + dest[j * dest_stride + i]); | 85 + dest[j * dest_stride + i]); |
613 } | 86 } |
614 break; | 87 break; |
615 default: | 88 default: |
616 printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); | 89 printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); |
617 break; | 90 break; |
618 } | 91 } |
619 } | 92 } |
620 | |
621 void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, | |
622 int dest_stride) { | |
623 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); | |
624 int16_t *outptr = out; | |
625 uint32_t pos = 45; | |
626 | |
627 /* bit position for extract from acc */ | |
628 __asm__ __volatile__ ( | |
629 "wrdsp %[pos], 1 \n\t" | |
630 : | |
631 : [pos] "r" (pos) | |
632 ); | |
633 | |
634 // First transform rows | |
635 idct8_rows_dspr2(input, outptr, 4); | |
636 | |
637 outptr += 4; | |
638 | |
639 __asm__ __volatile__ ( | |
640 "sw $zero, 0(%[outptr]) \n\t" | |
641 "sw $zero, 4(%[outptr]) \n\t" | |
642 "sw $zero, 16(%[outptr]) \n\t" | |
643 "sw $zero, 20(%[outptr]) \n\t" | |
644 "sw $zero, 32(%[outptr]) \n\t" | |
645 "sw $zero, 36(%[outptr]) \n\t" | |
646 "sw $zero, 48(%[outptr]) \n\t" | |
647 "sw $zero, 52(%[outptr]) \n\t" | |
648 "sw $zero, 64(%[outptr]) \n\t" | |
649 "sw $zero, 68(%[outptr]) \n\t" | |
650 "sw $zero, 80(%[outptr]) \n\t" | |
651 "sw $zero, 84(%[outptr]) \n\t" | |
652 "sw $zero, 96(%[outptr]) \n\t" | |
653 "sw $zero, 100(%[outptr]) \n\t" | |
654 "sw $zero, 112(%[outptr]) \n\t" | |
655 "sw $zero, 116(%[outptr]) \n\t" | |
656 | |
657 : | |
658 : [outptr] "r" (outptr) | |
659 ); | |
660 | |
661 | |
662 // Then transform columns and add to dest | |
663 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); | |
664 } | |
665 | |
666 void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, | |
667 int dest_stride) { | |
668 uint32_t pos = 45; | |
669 int32_t out; | |
670 int32_t r; | |
671 int32_t a1, absa1; | |
672 int32_t t1, t2, vector_a1, vector_1, vector_2; | |
673 | |
674 /* bit position for extract from acc */ | |
675 __asm__ __volatile__ ( | |
676 "wrdsp %[pos], 1 \n\t" | |
677 | |
678 : | |
679 : [pos] "r" (pos) | |
680 ); | |
681 | |
682 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); | |
683 __asm__ __volatile__ ( | |
684 "addi %[out], %[out], 16 \n\t" | |
685 "sra %[a1], %[out], 5 \n\t" | |
686 | |
687 : [out] "+r" (out), [a1] "=r" (a1) | |
688 : | |
689 ); | |
690 | |
691 if (a1 < 0) { | |
692 /* use quad-byte | |
693 * input and output memory are four byte aligned */ | |
694 __asm__ __volatile__ ( | |
695 "abs %[absa1], %[a1] \n\t" | |
696 "replv.qb %[vector_a1], %[absa1] \n\t" | |
697 | |
698 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) | |
699 : [a1] "r" (a1) | |
700 ); | |
701 | |
702 for (r = 8; r--;) { | |
703 __asm__ __volatile__ ( | |
704 "lw %[t1], 0(%[dest]) \n\t" | |
705 "lw %[t2], 4(%[dest]) \n\t" | |
706 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
707 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
708 "sw %[vector_1], 0(%[dest]) \n\t" | |
709 "sw %[vector_2], 4(%[dest]) \n\t" | |
710 "add %[dest], %[dest], %[dest_stride] \n\t" | |
711 | |
712 : [t1] "=&r" (t1), [t2] "=&r" (t2), | |
713 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
714 [dest] "+&r" (dest) | |
715 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | |
716 ); | |
717 } | |
718 } else { | |
719 /* use quad-byte | |
720 * input and output memory are four byte aligned */ | |
721 __asm__ __volatile__ ( | |
722 "replv.qb %[vector_a1], %[a1] \n\t" | |
723 | |
724 : [vector_a1] "=r" (vector_a1) | |
725 : [a1] "r" (a1) | |
726 ); | |
727 | |
728 for (r = 8; r--;) { | |
729 __asm__ __volatile__ ( | |
730 "lw %[t1], 0(%[dest]) \n\t" | |
731 "lw %[t2], 4(%[dest]) \n\t" | |
732 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
733 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
734 "sw %[vector_1], 0(%[dest]) \n\t" | |
735 "sw %[vector_2], 4(%[dest]) \n\t" | |
736 "add %[dest], %[dest], %[dest_stride] \n\t" | |
737 | |
738 : [t1] "=&r" (t1), [t2] "=&r" (t2), | |
739 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
740 [dest] "+r" (dest) | |
741 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | |
742 ); | |
743 } | |
744 } | |
745 } | |
746 #endif // #if HAVE_DSPR2 | 93 #endif // #if HAVE_DSPR2 |
OLD | NEW |