Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(190)

Side by Side Diff: source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <assert.h> 11 #include <assert.h>
12 #include <stdio.h> 12 #include <stdio.h>
13 13
14 #include "./vpx_config.h" 14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h" 15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h" 16 #include "vp9/common/vp9_common.h"
17 #include "vp9/common/vp9_blockd.h" 17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" 18 #include "vpx_dsp/mips/inv_txfm_dspr2.h"
19 #include "vpx_dsp/txfm_common.h" 19 #include "vpx_dsp/txfm_common.h"
20 #include "vpx_ports/mem.h" 20 #include "vpx_ports/mem.h"
21 21
22 #if HAVE_DSPR2 22 #if HAVE_DSPR2
/*
 * Row pass of the 8-point inverse DCT (MIPS DSPr2 inline assembly).
 *
 * Processes `no_rows` rows of 8 int16 coefficients from `input` and writes
 * the intermediate results *transposed* into `output`: the stores go to
 * byte offsets 0, 16, 32, ... 112 (16 bytes = one row of 8 int16) while
 * `output` advances by a single element per iteration, so input row i
 * becomes output column i.  The column pass can then read its data
 * contiguously.
 *
 * Each dct_const_round_shift() from the reference C code is implemented by
 * preloading the accumulator with const_2_power_13 (the rounding term,
 * 1 << 13) via mtlo/mthi, accumulating products with madd/msub, and
 * extracting with "extp ..., 31" — the extract position having been set
 * with wrdsp by the caller (see vp9_idct8x8_64_add_dspr2 /
 * vp9_idct8x8_12_add_dspr2).
 *
 * No saturation or clipping happens here; values stay in int16 range by
 * construction of the transform.
 */
23 static void idct8_rows_dspr2(const int16_t *input, int16_t *output,
24 uint32_t no_rows) {
25 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
/* Rounding term for dct_const_round_shift: 1 << (DCT_CONST_BITS - 1). */
26 const int const_2_power_13 = 8192;
27 int Temp0, Temp1, Temp2, Temp3, Temp4;
28 int i;
29
30 for (i = no_rows; i--; ) {
31 __asm__ __volatile__ (
32 /*
33 temp_1 = (input[0] + input[4]) * cospi_16_64;
34 step2_0 = dct_const_round_shift(temp_1);
35
36 temp_2 = (input[0] - input[4]) * cospi_16_64;
37 step2_1 = dct_const_round_shift(temp_2);
38 */
39 "lh %[Temp0], 0(%[input]) \n\t"
40 "lh %[Temp1], 8(%[input]) \n\t"
41 "mtlo %[const_2_power_13], $ac0 \n\t"
42 "mthi $zero, $ac0 \n\t"
43 "mtlo %[const_2_power_13], $ac1 \n\t"
44 "mthi $zero, $ac1 \n\t"
45 "add %[Temp2], %[Temp0], %[Temp1] \n\t"
46 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
47 "extp %[Temp4], $ac0, 31 \n\t"
48
49 "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
50 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
51 "mtlo %[const_2_power_13], $ac0 \n\t"
52 "mthi $zero, $ac0 \n\t"
53 "extp %[Temp2], $ac1, 31 \n\t"
54
55 /*
56 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
57 step2_2 = dct_const_round_shift(temp_1);
58 */
59 "lh %[Temp0], 4(%[input]) \n\t"
60 "lh %[Temp1], 12(%[input]) \n\t"
61 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
62 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
63 "mtlo %[const_2_power_13], $ac1 \n\t"
64 "mthi $zero, $ac1 \n\t"
65 "extp %[Temp3], $ac0, 31 \n\t"
66
67 /*
68 step1_1 = step2_1 + step2_2;
69 step1_2 = step2_1 - step2_2;
70 */
71 "add %[step1_1], %[Temp2], %[Temp3] \n\t"
72 "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
73
74 /*
75 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
76 step2_3 = dct_const_round_shift(temp_2);
77 */
78 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
79 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
80 "extp %[Temp1], $ac1, 31 \n\t"
81
82 "mtlo %[const_2_power_13], $ac0 \n\t"
83 "mthi $zero, $ac0 \n\t"
84
85 /*
86 step1_0 = step2_0 + step2_3;
87 step1_3 = step2_0 - step2_3;
88 */
89 "add %[step1_0], %[Temp4], %[Temp1] \n\t"
90 "sub %[step1_3], %[Temp4], %[Temp1] \n\t"
91
92 /*
93 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
94 step1_4 = dct_const_round_shift(temp_1);
95 */
96 "lh %[Temp0], 2(%[input]) \n\t"
97 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
98 "mtlo %[const_2_power_13], $ac1 \n\t"
99 "mthi $zero, $ac1 \n\t"
100 "lh %[Temp1], 14(%[input]) \n\t"
/* NOTE(review): input[1] is reloaded here although Temp0 already holds
   it from the lh above; harmless but presumably a scheduling leftover. */
101 "lh %[Temp0], 2(%[input]) \n\t"
102 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
103 "extp %[step1_4], $ac0, 31 \n\t"
104
105 /*
106 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
107 step1_7 = dct_const_round_shift(temp_2);
108 */
109 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
110 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
111 "extp %[step1_7], $ac1, 31 \n\t"
112
113 /*
114 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
115 step1_5 = dct_const_round_shift(temp_1);
116 */
117 "mtlo %[const_2_power_13], $ac0 \n\t"
118 "mthi $zero, $ac0 \n\t"
119 "lh %[Temp0], 10(%[input]) \n\t"
120 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
121 "lh %[Temp1], 6(%[input]) \n\t"
122 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
123 "extp %[step1_5], $ac0, 31 \n\t"
124
125 /*
126 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
127 step1_6 = dct_const_round_shift(temp_2);
128 */
129 "mtlo %[const_2_power_13], $ac1 \n\t"
130 "mthi $zero, $ac1 \n\t"
131 "lh %[Temp0], 10(%[input]) \n\t"
132 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
133 "lh %[Temp1], 6(%[input]) \n\t"
134 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
135 "extp %[step1_6], $ac1, 31 \n\t"
136
137 /*
138 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
139 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
140 */
141 "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
142 "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
143 "add %[Temp0], %[Temp0], %[step1_5] \n\t"
144 "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
145 "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
146 "add %[Temp1], %[Temp1], %[step1_7] \n\t"
147
148 "mtlo %[const_2_power_13], $ac0 \n\t"
149 "mthi $zero, $ac0 \n\t"
150 "mtlo %[const_2_power_13], $ac1 \n\t"
151 "mthi $zero, $ac1 \n\t"
152
153 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
154 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
155
156 /*
157 step1_4 = step1_4 + step1_5;
158 step1_7 = step1_6 + step1_7;
159 */
160 "add %[step1_4], %[step1_4], %[step1_5] \n\t"
161 "add %[step1_7], %[step1_7], %[step1_6] \n\t"
162
163 "extp %[step1_5], $ac0, 31 \n\t"
164 "extp %[step1_6], $ac1, 31 \n\t"
165
/* Final butterfly; stores are 16 bytes apart, i.e. one output row per
   value, writing this input row into one output column (transpose). */
166 "add %[Temp0], %[step1_0], %[step1_7] \n\t"
167 "sh %[Temp0], 0(%[output]) \n\t"
168 "add %[Temp1], %[step1_1], %[step1_6] \n\t"
169 "sh %[Temp1], 16(%[output]) \n\t"
170 "add %[Temp0], %[step1_2], %[step1_5] \n\t"
171 "sh %[Temp0], 32(%[output]) \n\t"
172 "add %[Temp1], %[step1_3], %[step1_4] \n\t"
173 "sh %[Temp1], 48(%[output]) \n\t"
174
175 "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
176 "sh %[Temp0], 64(%[output]) \n\t"
177 "sub %[Temp1], %[step1_2], %[step1_5] \n\t"
178 "sh %[Temp1], 80(%[output]) \n\t"
179 "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
180 "sh %[Temp0], 96(%[output]) \n\t"
181 "sub %[Temp1], %[step1_0], %[step1_7] \n\t"
182 "sh %[Temp1], 112(%[output]) \n\t"
183
184 : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
185 [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
186 [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
187 [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
188 [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
189 [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
190 [Temp4] "=&r" (Temp4)
191 : [const_2_power_13] "r" (const_2_power_13),
192 [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
193 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
194 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
195 [cospi_24_64] "r" (cospi_24_64),
196 [output] "r" (output), [input] "r" (input)
197 );
198
/* Next coefficient row; output moves one int16 right (next column). */
199 input += 8;
200 output += 1;
201 }
202 }
203
/*
 * Column pass of the 8x8 inverse DCT plus reconstruction (MIPS DSPr2).
 *
 * `input` holds the row-pass output (which was written transposed), read
 * 8 coefficients at a time per iteration.  For each of the 8 iterations
 * the result is final-rounded with (v + 16) >> 5, added to the existing
 * destination pixels of one column (dest + i, stepping by dest_stride),
 * and clipped to [0, 255] by indexing the vpx_ff_cropTbl lookup table
 * with lbux.
 *
 * Uses the same mtlo/mthi + madd/msub + "extp ..., 31" pattern as the row
 * pass to realize dct_const_round_shift(); the extract position must have
 * been programmed via wrdsp by the caller.
 */
204 static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
205 int dest_stride) {
206 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
207 int Temp0, Temp1, Temp2, Temp3;
208 int i;
/* Rounding term for dct_const_round_shift: 1 << (DCT_CONST_BITS - 1). */
209 const int const_2_power_13 = 8192;
210 uint8_t *dest_pix;
211 uint8_t *cm = vpx_ff_cropTbl;
212
/* Pull the 256-byte clip table into cache before the tight loop. */
213 /* prefetch vpx_ff_cropTbl */
214 prefetch_load(vpx_ff_cropTbl);
215 prefetch_load(vpx_ff_cropTbl + 32);
216 prefetch_load(vpx_ff_cropTbl + 64);
217 prefetch_load(vpx_ff_cropTbl + 96);
218 prefetch_load(vpx_ff_cropTbl + 128);
219 prefetch_load(vpx_ff_cropTbl + 160);
220 prefetch_load(vpx_ff_cropTbl + 192);
221 prefetch_load(vpx_ff_cropTbl + 224);
222
223 for (i = 0; i < 8; ++i) {
224 dest_pix = (dest + i);
225
226 __asm__ __volatile__ (
227 /*
228 temp_1 = (input[0] + input[4]) * cospi_16_64;
229 step2_0 = dct_const_round_shift(temp_1);
230
231 temp_2 = (input[0] - input[4]) * cospi_16_64;
232 step2_1 = dct_const_round_shift(temp_2);
233 */
234 "lh %[Temp0], 0(%[input]) \n\t"
235 "lh %[Temp1], 8(%[input]) \n\t"
236 "mtlo %[const_2_power_13], $ac0 \n\t"
237 "mthi $zero, $ac0 \n\t"
238 "mtlo %[const_2_power_13], $ac1 \n\t"
239 "mthi $zero, $ac1 \n\t"
240 "add %[Temp2], %[Temp0], %[Temp1] \n\t"
241 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
/* step2_0 is parked in step1_6 here; step1_6 gets its real value later. */
242 "extp %[step1_6], $ac0, 31 \n\t"
243
244 "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
245 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
246 "mtlo %[const_2_power_13], $ac0 \n\t"
247 "mthi $zero, $ac0 \n\t"
248 "extp %[Temp2], $ac1, 31 \n\t"
249
250 /*
251 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
252 step2_2 = dct_const_round_shift(temp_1);
253 */
254 "lh %[Temp0], 4(%[input]) \n\t"
255 "lh %[Temp1], 12(%[input]) \n\t"
256 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
257 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
258 "mtlo %[const_2_power_13], $ac1 \n\t"
259 "mthi $zero, $ac1 \n\t"
260 "extp %[Temp3], $ac0, 31 \n\t"
261
262 /*
263 step1_1 = step2_1 + step2_2;
264 step1_2 = step2_1 - step2_2;
265 */
266 "add %[step1_1], %[Temp2], %[Temp3] \n\t"
267 "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
268
269 /*
270 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
271 step2_3 = dct_const_round_shift(temp_2);
272 */
273 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
274 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
275 "extp %[Temp1], $ac1, 31 \n\t"
276
277 "mtlo %[const_2_power_13], $ac0 \n\t"
278 "mthi $zero, $ac0 \n\t"
279
280 /*
281 step1_0 = step2_0 + step2_3;
282 step1_3 = step2_0 - step2_3;
283 */
284 "add %[step1_0], %[step1_6], %[Temp1] \n\t"
285 "sub %[step1_3], %[step1_6], %[Temp1] \n\t"
286
287 /*
288 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
289 step1_4 = dct_const_round_shift(temp_1);
290 */
291 "lh %[Temp0], 2(%[input]) \n\t"
292 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
293 "mtlo %[const_2_power_13], $ac1 \n\t"
294 "mthi $zero, $ac1 \n\t"
295 "lh %[Temp1], 14(%[input]) \n\t"
/* NOTE(review): redundant reload of input[1], same as in the row pass. */
296 "lh %[Temp0], 2(%[input]) \n\t"
297 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
298 "extp %[step1_4], $ac0, 31 \n\t"
299
300 /*
301 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
302 step1_7 = dct_const_round_shift(temp_2);
303 */
304 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
305 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
306 "extp %[step1_7], $ac1, 31 \n\t"
307
308 /*
309 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
310 step1_5 = dct_const_round_shift(temp_1);
311 */
312 "mtlo %[const_2_power_13], $ac0 \n\t"
313 "mthi $zero, $ac0 \n\t"
314 "lh %[Temp0], 10(%[input]) \n\t"
315 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
316 "lh %[Temp1], 6(%[input]) \n\t"
317 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
318 "extp %[step1_5], $ac0, 31 \n\t"
319
320 /*
321 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
322 step1_6 = dct_const_round_shift(temp_2);
323 */
324 "mtlo %[const_2_power_13], $ac1 \n\t"
325 "mthi $zero, $ac1 \n\t"
326 "lh %[Temp0], 10(%[input]) \n\t"
327 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
328 "lh %[Temp1], 6(%[input]) \n\t"
329 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
330 "extp %[step1_6], $ac1, 31 \n\t"
331
332 /*
333 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
334 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
335 */
336 "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
337 "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
338 "add %[Temp0], %[Temp0], %[step1_5] \n\t"
339 "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
340 "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
341 "add %[Temp1], %[Temp1], %[step1_7] \n\t"
342
343 "mtlo %[const_2_power_13], $ac0 \n\t"
344 "mthi $zero, $ac0 \n\t"
345 "mtlo %[const_2_power_13], $ac1 \n\t"
346 "mthi $zero, $ac1 \n\t"
347
348 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
349 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
350
351 /*
352 step1_4 = step1_4 + step1_5;
353 step1_7 = step1_6 + step1_7;
354 */
355 "add %[step1_4], %[step1_4], %[step1_5] \n\t"
356 "add %[step1_7], %[step1_7], %[step1_6] \n\t"
357
358 "extp %[step1_5], $ac0, 31 \n\t"
359 "extp %[step1_6], $ac1, 31 \n\t"
360
/* Reconstruction: for each of the 8 output pixels, round the butterfly
   result with (v + 16) >> 5, add the current dest byte, clip to [0,255]
   through the crop table (lbux) and store back, stepping by dest_stride. */
361 /* add block */
362 "lbu %[Temp1], 0(%[dest_pix]) \n\t"
363 "add %[Temp0], %[step1_0], %[step1_7] \n\t"
364 "addi %[Temp0], %[Temp0], 16 \n\t"
365 "sra %[Temp0], %[Temp0], 5 \n\t"
366 "add %[Temp1], %[Temp1], %[Temp0] \n\t"
367 "add %[Temp0], %[step1_1], %[step1_6] \n\t"
368 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
369 "sb %[Temp2], 0(%[dest_pix]) \n\t"
370 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
371
372 "lbu %[Temp1], 0(%[dest_pix]) \n\t"
373 "addi %[Temp0], %[Temp0], 16 \n\t"
374 "sra %[Temp0], %[Temp0], 5 \n\t"
375 "add %[Temp1], %[Temp1], %[Temp0] \n\t"
376 "add %[Temp0], %[step1_2], %[step1_5] \n\t"
377 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
378 "sb %[Temp2], 0(%[dest_pix]) \n\t"
379 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
380
381 "lbu %[Temp1], 0(%[dest_pix]) \n\t"
382 "addi %[Temp0], %[Temp0], 16 \n\t"
383 "sra %[Temp0], %[Temp0], 5 \n\t"
384 "add %[Temp1], %[Temp1], %[Temp0] \n\t"
385 "add %[Temp0], %[step1_3], %[step1_4] \n\t"
386 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
387 "sb %[Temp2], 0(%[dest_pix]) \n\t"
388 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
389
390 "lbu %[Temp1], 0(%[dest_pix]) \n\t"
391 "addi %[Temp0], %[Temp0], 16 \n\t"
392 "sra %[Temp0], %[Temp0], 5 \n\t"
393 "add %[Temp1], %[Temp1], %[Temp0] \n\t"
394 "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
395 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
396 "sb %[Temp2], 0(%[dest_pix]) \n\t"
397 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
398
399 "lbu %[Temp1], 0(%[dest_pix]) \n\t"
400 "addi %[Temp0], %[Temp0], 16 \n\t"
401 "sra %[Temp0], %[Temp0], 5 \n\t"
402 "add %[Temp1], %[Temp1], %[Temp0] \n\t"
403 "sub %[Temp0], %[step1_2], %[step1_5] \n\t"
404 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
405 "sb %[Temp2], 0(%[dest_pix]) \n\t"
406 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
407
408 "lbu %[Temp1], 0(%[dest_pix]) \n\t"
409 "addi %[Temp0], %[Temp0], 16 \n\t"
410 "sra %[Temp0], %[Temp0], 5 \n\t"
411 "add %[Temp1], %[Temp1], %[Temp0] \n\t"
412 "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
413 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
414 "sb %[Temp2], 0(%[dest_pix]) \n\t"
415 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
416
417 "lbu %[Temp1], 0(%[dest_pix]) \n\t"
418 "addi %[Temp0], %[Temp0], 16 \n\t"
419 "sra %[Temp0], %[Temp0], 5 \n\t"
420 "add %[Temp1], %[Temp1], %[Temp0] \n\t"
421 "sub %[Temp0], %[step1_0], %[step1_7] \n\t"
422 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
423 "sb %[Temp2], 0(%[dest_pix]) \n\t"
424 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
425
426 "lbu %[Temp1], 0(%[dest_pix]) \n\t"
427 "addi %[Temp0], %[Temp0], 16 \n\t"
428 "sra %[Temp0], %[Temp0], 5 \n\t"
429 "add %[Temp1], %[Temp1], %[Temp0] \n\t"
430 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
431 "sb %[Temp2], 0(%[dest_pix]) \n\t"
432
433 : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
434 [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
435 [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
436 [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
437 [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
438 [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
439 [dest_pix] "+r" (dest_pix)
440 : [const_2_power_13] "r" (const_2_power_13),
441 [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
442 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
443 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
444 [cospi_24_64] "r" (cospi_24_64),
445 [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
446 );
447
448 input += 8;
449 }
450 }
451
/*
 * Full 8x8 inverse DCT plus reconstruction, DSPr2 entry point.
 *
 * Programs the DSP control register (wrdsp) so that "extp ..., 31" in the
 * helper kernels extracts at position 45, then runs the row pass into a
 * 32-byte-aligned scratch block and the column pass which adds the result
 * into `dest` (stride `dest_stride`) with clipping.
 */
452 void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
453 int dest_stride) {
454 DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
455 int16_t *outptr = out;
456 uint32_t pos = 45;
457
458 /* bit position for extract from acc */
459 __asm__ __volatile__ (
460 "wrdsp %[pos], 1 \n\t"
461 :
462 : [pos] "r" (pos)
463 );
464
465 // First transform rows
466 idct8_rows_dspr2(input, outptr, 8);
467
468 // Then transform columns and add to dest
469 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
470 }
471
472 static void iadst8_dspr2(const int16_t *input, int16_t *output) {
473 int s0, s1, s2, s3, s4, s5, s6, s7;
474 int x0, x1, x2, x3, x4, x5, x6, x7;
475
476 x0 = input[7];
477 x1 = input[0];
478 x2 = input[5];
479 x3 = input[2];
480 x4 = input[3];
481 x5 = input[4];
482 x6 = input[1];
483 x7 = input[6];
484
485 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
486 output[0] = output[1] = output[2] = output[3] = output[4]
487 = output[5] = output[6] = output[7] = 0;
488 return;
489 }
490
491 // stage 1
492 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
493 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
494 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
495 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
496 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
497 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
498 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
499 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
500
501 x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
502 x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
503 x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
504 x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
505 x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
506 x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
507 x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
508 x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
509
510 // stage 2
511 s0 = x0;
512 s1 = x1;
513 s2 = x2;
514 s3 = x3;
515 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
516 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
517 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
518 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
519
520 x0 = s0 + s2;
521 x1 = s1 + s3;
522 x2 = s0 - s2;
523 x3 = s1 - s3;
524 x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
525 x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
526 x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
527 x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
528
529 // stage 3
530 s2 = cospi_16_64 * (x2 + x3);
531 s3 = cospi_16_64 * (x2 - x3);
532 s6 = cospi_16_64 * (x6 + x7);
533 s7 = cospi_16_64 * (x6 - x7);
534
535 x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
536 x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
537 x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
538 x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
539
540 output[0] = x0;
541 output[1] = -x4;
542 output[2] = x6;
543 output[3] = -x2;
544 output[4] = x3;
545 output[5] = -x7;
546 output[6] = x5;
547 output[7] = -x1;
548 }
549
550 void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, 23 void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
551 int dest_stride, int tx_type) { 24 int dest_stride, int tx_type) {
552 int i, j; 25 int i, j;
553 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); 26 DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
554 int16_t *outptr = out; 27 int16_t *outptr = out;
555 int16_t temp_in[8 * 8], temp_out[8]; 28 int16_t temp_in[8 * 8], temp_out[8];
556 uint32_t pos = 45; 29 uint32_t pos = 45;
557 30
558 /* bit positon for extract from acc */ 31 /* bit positon for extract from acc */
559 __asm__ __volatile__ ( 32 __asm__ __volatile__ (
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
610 dest[j * dest_stride + i] = 83 dest[j * dest_stride + i] =
611 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 84 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
612 + dest[j * dest_stride + i]); 85 + dest[j * dest_stride + i]);
613 } 86 }
614 break; 87 break;
615 default: 88 default:
616 printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); 89 printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n");
617 break; 90 break;
618 } 91 }
619 } 92 }
620
/*
 * 8x8 inverse DCT fast path for blocks whose nonzero coefficients all lie
 * in the top-left 4x4 (the "12" variant).
 *
 * Runs the row pass on only the first 4 rows.  Because the row pass
 * writes its results transposed, those 4 rows fill columns 0..3 of the
 * scratch block; the inline asm below then zeroes elements 4..7 of every
 * row (two sw stores of 4 int16 per 16-byte row) before the full column
 * pass reconstructs into `dest`.
 */
621 void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
622 int dest_stride) {
623 DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
624 int16_t *outptr = out;
625 uint32_t pos = 45;
626
627 /* bit position for extract from acc */
628 __asm__ __volatile__ (
629 "wrdsp %[pos], 1 \n\t"
630 :
631 : [pos] "r" (pos)
632 );
633
634 // First transform rows
635 idct8_rows_dspr2(input, outptr, 4);
636
/* Point at element 4 of row 0; offsets below step one 16-byte row at
   a time, clearing the right half (columns 4..7) of each row. */
637 outptr += 4;
638
639 __asm__ __volatile__ (
640 "sw $zero, 0(%[outptr]) \n\t"
641 "sw $zero, 4(%[outptr]) \n\t"
642 "sw $zero, 16(%[outptr]) \n\t"
643 "sw $zero, 20(%[outptr]) \n\t"
644 "sw $zero, 32(%[outptr]) \n\t"
645 "sw $zero, 36(%[outptr]) \n\t"
646 "sw $zero, 48(%[outptr]) \n\t"
647 "sw $zero, 52(%[outptr]) \n\t"
648 "sw $zero, 64(%[outptr]) \n\t"
649 "sw $zero, 68(%[outptr]) \n\t"
650 "sw $zero, 80(%[outptr]) \n\t"
651 "sw $zero, 84(%[outptr]) \n\t"
652 "sw $zero, 96(%[outptr]) \n\t"
653 "sw $zero, 100(%[outptr]) \n\t"
654 "sw $zero, 112(%[outptr]) \n\t"
655 "sw $zero, 116(%[outptr]) \n\t"
656
657 :
658 : [outptr] "r" (outptr)
659 );
660
661
662 // Then transform columns and add to dest
663 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
664 }
665
/*
 * 8x8 inverse DCT fast path for a DC-only block.
 *
 * The single DC value is scaled through two dct_const_round_shift /
 * cospi_16_64 passes (DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64), rounded
 * with (+16) >> 5 into `a1`, then added uniformly to all 64 destination
 * pixels.  The add is done 8 bytes per row with saturating quad-byte SIMD:
 * a negative a1 is negated (abs), replicated into all four byte lanes
 * (replv.qb) and *subtracted* with subu_s.qb; a non-negative a1 is
 * replicated and added with addu_s.qb.  Saturation in the .qb ops
 * provides the [0, 255] clipping.
 * Per the original comments, `dest` rows are assumed 4-byte aligned.
 */
666 void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
667 int dest_stride) {
668 uint32_t pos = 45;
669 int32_t out;
670 int32_t r;
671 int32_t a1, absa1;
672 int32_t t1, t2, vector_a1, vector_1, vector_2;
673
674 /* bit position for extract from acc */
675 __asm__ __volatile__ (
676 "wrdsp %[pos], 1 \n\t"
677
678 :
679 : [pos] "r" (pos)
680 );
681
682 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
/* Final rounding of the reconstruction: a1 = (out + 16) >> 5. */
683 __asm__ __volatile__ (
684 "addi %[out], %[out], 16 \n\t"
685 "sra %[a1], %[out], 5 \n\t"
686
687 : [out] "+r" (out), [a1] "=r" (a1)
688 :
689 );
690
691 if (a1 < 0) {
692 /* use quad-byte
693 * input and output memory are four byte aligned */
694 __asm__ __volatile__ (
695 "abs %[absa1], %[a1] \n\t"
696 "replv.qb %[vector_a1], %[absa1] \n\t"
697
698 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
699 : [a1] "r" (a1)
700 );
701
702 for (r = 8; r--;) {
703 __asm__ __volatile__ (
704 "lw %[t1], 0(%[dest]) \n\t"
705 "lw %[t2], 4(%[dest]) \n\t"
706 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
707 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
708 "sw %[vector_1], 0(%[dest]) \n\t"
709 "sw %[vector_2], 4(%[dest]) \n\t"
710 "add %[dest], %[dest], %[dest_stride] \n\t"
711
712 : [t1] "=&r" (t1), [t2] "=&r" (t2),
713 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
714 [dest] "+&r" (dest)
715 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
716 );
717 }
718 } else {
719 /* use quad-byte
720 * input and output memory are four byte aligned */
721 __asm__ __volatile__ (
722 "replv.qb %[vector_a1], %[a1] \n\t"
723
724 : [vector_a1] "=r" (vector_a1)
725 : [a1] "r" (a1)
726 );
727
728 for (r = 8; r--;) {
729 __asm__ __volatile__ (
730 "lw %[t1], 0(%[dest]) \n\t"
731 "lw %[t2], 4(%[dest]) \n\t"
732 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
733 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
734 "sw %[vector_1], 0(%[dest]) \n\t"
735 "sw %[vector_2], 4(%[dest]) \n\t"
736 "add %[dest], %[dest], %[dest_stride] \n\t"
737
738 : [t1] "=&r" (t1), [t2] "=&r" (t2),
739 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
740 [dest] "+r" (dest)
741 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
742 );
743 }
744 }
745 }
746 #endif // #if HAVE_DSPR2 93 #endif // #if HAVE_DSPR2
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c ('k') | source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698