Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(75)

Side by Side Diff: source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c

Issue 54923004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_idct.h"
19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
20
21 #if HAVE_DSPR2
/* Row pass of the 32x32 inverse DCT, written with MIPS DSPr2 inline assembly.
 *
 * input:  32 rows of 32 int16_t coefficients; one row is consumed per loop
 *         iteration (input advances by 32 each time).
 * output: the 32 results of a row are stored with a stride of 32 elements
 *         (output[k * 32]) and output then advances by 1, so input row i ends
 *         up in column i of the 32x32 output buffer.
 *
 * Every accumulator ($ac0..$ac3) is preloaded with const_2_power_13 (8192)
 * before the multiply-accumulates and is read back with "extp ..., 31".  The
 * extract position lives in the DSP control register, which this function
 * does not set itself: the caller vp9_idct32x32_1024_add_dspr2 writes
 * pos = 45 with "wrdsp" before calling.
 * NOTE(review): the preload + extp pair is presumably the asm counterpart of
 * the C expression (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS used further
 * down -- confirm against the definitions in vp9_idct.h.
 */
22 static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) {
23 int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
24 int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
25 int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
26 int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
27 int16_t step1_28, step1_29, step1_30, step1_31;
28 int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
29 int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
30 int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
31 int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
32 int16_t step2_28, step2_29, step2_30, step2_31;
33 int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
34 int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
35 int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
36 int16_t step3_29, step3_30, step3_31;
37 int temp0, temp1, temp2, temp3;
38 int load1, load2, load3, load4;
39 int result1, result2;
40 int temp21;
41 int i;
/* Value preloaded into the low word of an accumulator before each
 * multiply-accumulate sequence (the high word is cleared with $zero). */
42 const int const_2_power_13 = 8192;
43 const int32_t *input_int;
44
/* One iteration per input row, 32 rows in total. */
45 for (i = 32; i--; ) {
/* View the 32 int16_t coefficients of this row as 16 32-bit words. */
46 input_int = (const int32_t *)input;
47
/* OR of all 16 words is zero only if every coefficient in the row is zero. */
48 if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
49 input_int[4] | input_int[5] | input_int[6] | input_int[7] |
50 input_int[8] | input_int[9] | input_int[10] | input_int[11] |
51 input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
52 input += 32;
53
/* All-zero row: store a zero halfword into each of the 32 slots of this
 * output column (byte offset k * 64 == element index k * 32) and skip the
 * transform for this row. */
54 __asm__ __volatile__ (
55 "sh $zero, 0(%[output]) \n\t"
56 "sh $zero, 64(%[output]) \n\t"
57 "sh $zero, 128(%[output]) \n\t"
58 "sh $zero, 192(%[output]) \n\t"
59 "sh $zero, 256(%[output]) \n\t"
60 "sh $zero, 320(%[output]) \n\t"
61 "sh $zero, 384(%[output]) \n\t"
62 "sh $zero, 448(%[output]) \n\t"
63 "sh $zero, 512(%[output]) \n\t"
64 "sh $zero, 576(%[output]) \n\t"
65 "sh $zero, 640(%[output]) \n\t"
66 "sh $zero, 704(%[output]) \n\t"
67 "sh $zero, 768(%[output]) \n\t"
68 "sh $zero, 832(%[output]) \n\t"
69 "sh $zero, 896(%[output]) \n\t"
70 "sh $zero, 960(%[output]) \n\t"
71 "sh $zero, 1024(%[output]) \n\t"
72 "sh $zero, 1088(%[output]) \n\t"
73 "sh $zero, 1152(%[output]) \n\t"
74 "sh $zero, 1216(%[output]) \n\t"
75 "sh $zero, 1280(%[output]) \n\t"
76 "sh $zero, 1344(%[output]) \n\t"
77 "sh $zero, 1408(%[output]) \n\t"
78 "sh $zero, 1472(%[output]) \n\t"
79 "sh $zero, 1536(%[output]) \n\t"
80 "sh $zero, 1600(%[output]) \n\t"
81 "sh $zero, 1664(%[output]) \n\t"
82 "sh $zero, 1728(%[output]) \n\t"
83 "sh $zero, 1792(%[output]) \n\t"
84 "sh $zero, 1856(%[output]) \n\t"
85 "sh $zero, 1920(%[output]) \n\t"
86 "sh $zero, 1984(%[output]) \n\t"
87
88 :
89 : [output] "r" (output)
90 );
91
/* Move on to the next output column. */
92 output += 1;
93
94 continue;
95 }
96
/* input + 32 is the start of the following row, input + 48 its second
 * half. */
97 /* prefetch row */
98 vp9_prefetch_load((const uint8_t *)(input + 32));
99 vp9_prefetch_load((const uint8_t *)(input + 48));
100
/* Coefficients input[1], input[31], input[17], input[15] (byte offsets 2,
 * 62, 34, 30) are rotated by cospi_31/1 and cospi_15/17, then combined via
 * cospi_28/4 to give step1_16, step1_17, step1_30 and step1_31. */
101 __asm__ __volatile__ (
102 "lh %[load1], 2(%[input]) \n\t"
103 "lh %[load2], 62(%[input]) \n\t"
104 "lh %[load3], 34(%[input]) \n\t"
105 "lh %[load4], 30(%[input]) \n\t"
106
107 "mtlo %[const_2_power_13], $ac1 \n\t"
108 "mthi $zero, $ac1 \n\t"
109 "mtlo %[const_2_power_13], $ac3 \n\t"
110 "mthi $zero, $ac3 \n\t"
111
112 "madd $ac1, %[load1], %[cospi_31_64] \n\t"
113 "msub $ac1, %[load2], %[cospi_1_64] \n\t"
114 "extp %[temp0], $ac1, 31 \n\t"
115
116 "madd $ac3, %[load1], %[cospi_1_64] \n\t"
117 "madd $ac3, %[load2], %[cospi_31_64] \n\t"
118 "extp %[temp3], $ac3, 31 \n\t"
119
120 "mtlo %[const_2_power_13], $ac1 \n\t"
121 "mthi $zero, $ac1 \n\t"
122 "mtlo %[const_2_power_13], $ac2 \n\t"
123 "mthi $zero, $ac2 \n\t"
124
125 "madd $ac2, %[load3], %[cospi_15_64] \n\t"
126 "msub $ac2, %[load4], %[cospi_17_64] \n\t"
127 "extp %[temp1], $ac2, 31 \n\t"
128
129 "madd $ac1, %[load3], %[cospi_17_64] \n\t"
130 "madd $ac1, %[load4], %[cospi_15_64] \n\t"
131 "extp %[temp2], $ac1, 31 \n\t"
132
133 "mtlo %[const_2_power_13], $ac1 \n\t"
134 "mthi $zero, $ac1 \n\t"
135 "mtlo %[const_2_power_13], $ac3 \n\t"
136 "mthi $zero, $ac3 \n\t"
137
138 "sub %[load1], %[temp3], %[temp2] \n\t"
139 "sub %[load2], %[temp0], %[temp1] \n\t"
140
141 "madd $ac1, %[load1], %[cospi_28_64] \n\t"
142 "msub $ac1, %[load2], %[cospi_4_64] \n\t"
143 "madd $ac3, %[load1], %[cospi_4_64] \n\t"
144 "madd $ac3, %[load2], %[cospi_28_64] \n\t"
145
146 "extp %[step1_17], $ac1, 31 \n\t"
147 "extp %[step1_30], $ac3, 31 \n\t"
148 "add %[step1_16], %[temp0], %[temp1] \n\t"
149 "add %[step1_31], %[temp2], %[temp3] \n\t"
150
151 : [load1] "=&r" (load1), [load2] "=&r" (load2),
152 [load3] "=&r" (load3), [load4] "=&r" (load4),
153 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
154 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
155 [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
156 [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
157 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
158 [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
159 [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
160 [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
161 );
162
/* Coefficients input[9], input[23], input[25], input[7] (byte offsets 18,
 * 46, 50, 14) -> step1_18, step1_19, step1_28, step1_29. */
163 __asm__ __volatile__ (
164 "lh %[load1], 18(%[input]) \n\t"
165 "lh %[load2], 46(%[input]) \n\t"
166 "lh %[load3], 50(%[input]) \n\t"
167 "lh %[load4], 14(%[input]) \n\t"
168
169 "mtlo %[const_2_power_13], $ac1 \n\t"
170 "mthi $zero, $ac1 \n\t"
171 "mtlo %[const_2_power_13], $ac3 \n\t"
172 "mthi $zero, $ac3 \n\t"
173
174 "madd $ac1, %[load1], %[cospi_23_64] \n\t"
175 "msub $ac1, %[load2], %[cospi_9_64] \n\t"
176 "extp %[temp0], $ac1, 31 \n\t"
177
178 "madd $ac3, %[load1], %[cospi_9_64] \n\t"
179 "madd $ac3, %[load2], %[cospi_23_64] \n\t"
180 "extp %[temp3], $ac3, 31 \n\t"
181
182 "mtlo %[const_2_power_13], $ac1 \n\t"
183 "mthi $zero, $ac1 \n\t"
184 "mtlo %[const_2_power_13], $ac2 \n\t"
185 "mthi $zero, $ac2 \n\t"
186
187 "madd $ac2, %[load3], %[cospi_7_64] \n\t"
188 "msub $ac2, %[load4], %[cospi_25_64] \n\t"
189 "extp %[temp1], $ac2, 31 \n\t"
190
191 "madd $ac1, %[load3], %[cospi_25_64] \n\t"
192 "madd $ac1, %[load4], %[cospi_7_64] \n\t"
193 "extp %[temp2], $ac1, 31 \n\t"
194
195 "mtlo %[const_2_power_13], $ac1 \n\t"
196 "mthi $zero, $ac1 \n\t"
197 "mtlo %[const_2_power_13], $ac3 \n\t"
198 "mthi $zero, $ac3 \n\t"
199
200 "sub %[load1], %[temp1], %[temp0] \n\t"
201 "sub %[load2], %[temp2], %[temp3] \n\t"
202
203 "msub $ac1, %[load1], %[cospi_28_64] \n\t"
204 "msub $ac1, %[load2], %[cospi_4_64] \n\t"
205 "msub $ac3, %[load1], %[cospi_4_64] \n\t"
206 "madd $ac3, %[load2], %[cospi_28_64] \n\t"
207
208 "extp %[step1_18], $ac1, 31 \n\t"
209 "extp %[step1_29], $ac3, 31 \n\t"
210 "add %[step1_19], %[temp0], %[temp1] \n\t"
211 "add %[step1_28], %[temp2], %[temp3] \n\t"
212
213 : [load1] "=&r" (load1), [load2] "=&r" (load2),
214 [load3] "=&r" (load3), [load4] "=&r" (load4),
215 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
216 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
217 [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
218 [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
219 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
220 [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
221 [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
222 [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
223 );
224
/* Coefficients input[5], input[27], input[21], input[11] (byte offsets 10,
 * 54, 42, 22) -> step1_20, step1_21, step1_26, step1_27. */
225 __asm__ __volatile__ (
226 "lh %[load1], 10(%[input]) \n\t"
227 "lh %[load2], 54(%[input]) \n\t"
228 "lh %[load3], 42(%[input]) \n\t"
229 "lh %[load4], 22(%[input]) \n\t"
230
231 "mtlo %[const_2_power_13], $ac1 \n\t"
232 "mthi $zero, $ac1 \n\t"
233 "mtlo %[const_2_power_13], $ac3 \n\t"
234 "mthi $zero, $ac3 \n\t"
235
236 "madd $ac1, %[load1], %[cospi_27_64] \n\t"
237 "msub $ac1, %[load2], %[cospi_5_64] \n\t"
238 "extp %[temp0], $ac1, 31 \n\t"
239
240 "madd $ac3, %[load1], %[cospi_5_64] \n\t"
241 "madd $ac3, %[load2], %[cospi_27_64] \n\t"
242 "extp %[temp3], $ac3, 31 \n\t"
243
244 "mtlo %[const_2_power_13], $ac1 \n\t"
245 "mthi $zero, $ac1 \n\t"
246 "mtlo %[const_2_power_13], $ac2 \n\t"
247 "mthi $zero, $ac2 \n\t"
248
249 "madd $ac2, %[load3], %[cospi_11_64] \n\t"
250 "msub $ac2, %[load4], %[cospi_21_64] \n\t"
251 "extp %[temp1], $ac2, 31 \n\t"
252
253 "madd $ac1, %[load3], %[cospi_21_64] \n\t"
254 "madd $ac1, %[load4], %[cospi_11_64] \n\t"
255 "extp %[temp2], $ac1, 31 \n\t"
256
257 "mtlo %[const_2_power_13], $ac1 \n\t"
258 "mthi $zero, $ac1 \n\t"
259 "mtlo %[const_2_power_13], $ac3 \n\t"
260 "mthi $zero, $ac3 \n\t"
261
262 "sub %[load1], %[temp0], %[temp1] \n\t"
263 "sub %[load2], %[temp3], %[temp2] \n\t"
264
265 "madd $ac1, %[load2], %[cospi_12_64] \n\t"
266 "msub $ac1, %[load1], %[cospi_20_64] \n\t"
267 "madd $ac3, %[load1], %[cospi_12_64] \n\t"
268 "madd $ac3, %[load2], %[cospi_20_64] \n\t"
269
270 "extp %[step1_21], $ac1, 31 \n\t"
271 "extp %[step1_26], $ac3, 31 \n\t"
272 "add %[step1_20], %[temp0], %[temp1] \n\t"
273 "add %[step1_27], %[temp2], %[temp3] \n\t"
274
275 : [load1] "=&r" (load1), [load2] "=&r" (load2),
276 [load3] "=&r" (load3), [load4] "=&r" (load4),
277 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
278 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
279 [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
280 [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
281 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
282 [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
283 [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
284 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
285 );
286
/* Coefficients input[13], input[19], input[29], input[3] (byte offsets 26,
 * 38, 58, 6) -> step1_22, step1_23, step1_24, step1_25. */
287 __asm__ __volatile__ (
288 "lh %[load1], 26(%[input]) \n\t"
289 "lh %[load2], 38(%[input]) \n\t"
290 "lh %[load3], 58(%[input]) \n\t"
291 "lh %[load4], 6(%[input]) \n\t"
292
293 "mtlo %[const_2_power_13], $ac1 \n\t"
294 "mthi $zero, $ac1 \n\t"
295 "mtlo %[const_2_power_13], $ac3 \n\t"
296 "mthi $zero, $ac3 \n\t"
297
298 "madd $ac1, %[load1], %[cospi_19_64] \n\t"
299 "msub $ac1, %[load2], %[cospi_13_64] \n\t"
300 "extp %[temp0], $ac1, 31 \n\t"
301
302 "madd $ac3, %[load1], %[cospi_13_64] \n\t"
303 "madd $ac3, %[load2], %[cospi_19_64] \n\t"
304 "extp %[temp3], $ac3, 31 \n\t"
305
306 "mtlo %[const_2_power_13], $ac1 \n\t"
307 "mthi $zero, $ac1 \n\t"
308 "mtlo %[const_2_power_13], $ac2 \n\t"
309 "mthi $zero, $ac2 \n\t"
310
311 "madd $ac2, %[load3], %[cospi_3_64] \n\t"
312 "msub $ac2, %[load4], %[cospi_29_64] \n\t"
313 "extp %[temp1], $ac2, 31 \n\t"
314
315 "madd $ac1, %[load3], %[cospi_29_64] \n\t"
316 "madd $ac1, %[load4], %[cospi_3_64] \n\t"
317 "extp %[temp2], $ac1, 31 \n\t"
318
319 "mtlo %[const_2_power_13], $ac1 \n\t"
320 "mthi $zero, $ac1 \n\t"
321 "mtlo %[const_2_power_13], $ac3 \n\t"
322 "mthi $zero, $ac3 \n\t"
323
324 "sub %[load1], %[temp1], %[temp0] \n\t"
325 "sub %[load2], %[temp2], %[temp3] \n\t"
326
327 "msub $ac1, %[load1], %[cospi_12_64] \n\t"
328 "msub $ac1, %[load2], %[cospi_20_64] \n\t"
329 "msub $ac3, %[load1], %[cospi_20_64] \n\t"
330 "madd $ac3, %[load2], %[cospi_12_64] \n\t"
331
332 "extp %[step1_22], $ac1, 31 \n\t"
333 "extp %[step1_25], $ac3, 31 \n\t"
334 "add %[step1_23], %[temp0], %[temp1] \n\t"
335 "add %[step1_24], %[temp2], %[temp3] \n\t"
336
337 : [load1] "=&r" (load1), [load2] "=&r" (load2),
338 [load3] "=&r" (load3), [load4] "=&r" (load4),
339 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
340 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
341 [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
342 [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
343 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
344 [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
345 [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
346 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
347 );
348
/* Coefficients input[2], input[30], input[18], input[14] (byte offsets 4,
 * 60, 36, 28) -> step2_8, step2_9, step2_14, step2_15. */
349 __asm__ __volatile__ (
350 "lh %[load1], 4(%[input]) \n\t"
351 "lh %[load2], 60(%[input]) \n\t"
352 "lh %[load3], 36(%[input]) \n\t"
353 "lh %[load4], 28(%[input]) \n\t"
354
355 "mtlo %[const_2_power_13], $ac1 \n\t"
356 "mthi $zero, $ac1 \n\t"
357 "mtlo %[const_2_power_13], $ac3 \n\t"
358 "mthi $zero, $ac3 \n\t"
359
360 "madd $ac1, %[load1], %[cospi_30_64] \n\t"
361 "msub $ac1, %[load2], %[cospi_2_64] \n\t"
362 "extp %[temp0], $ac1, 31 \n\t"
363
364 "madd $ac3, %[load1], %[cospi_2_64] \n\t"
365 "madd $ac3, %[load2], %[cospi_30_64] \n\t"
366 "extp %[temp3], $ac3, 31 \n\t"
367
368 "mtlo %[const_2_power_13], $ac1 \n\t"
369 "mthi $zero, $ac1 \n\t"
370 "mtlo %[const_2_power_13], $ac2 \n\t"
371 "mthi $zero, $ac2 \n\t"
372
373 "madd $ac2, %[load3], %[cospi_14_64] \n\t"
374 "msub $ac2, %[load4], %[cospi_18_64] \n\t"
375 "extp %[temp1], $ac2, 31 \n\t"
376
377 "madd $ac1, %[load3], %[cospi_18_64] \n\t"
378 "madd $ac1, %[load4], %[cospi_14_64] \n\t"
379 "extp %[temp2], $ac1, 31 \n\t"
380
381 "mtlo %[const_2_power_13], $ac1 \n\t"
382 "mthi $zero, $ac1 \n\t"
383 "mtlo %[const_2_power_13], $ac3 \n\t"
384 "mthi $zero, $ac3 \n\t"
385
386 "sub %[load1], %[temp0], %[temp1] \n\t"
387 "sub %[load2], %[temp3], %[temp2] \n\t"
388
389 "msub $ac1, %[load1], %[cospi_8_64] \n\t"
390 "madd $ac1, %[load2], %[cospi_24_64] \n\t"
391 "madd $ac3, %[load1], %[cospi_24_64] \n\t"
392 "madd $ac3, %[load2], %[cospi_8_64] \n\t"
393
394 "extp %[step2_9], $ac1, 31 \n\t"
395 "extp %[step2_14], $ac3, 31 \n\t"
396 "add %[step2_8], %[temp0], %[temp1] \n\t"
397 "add %[step2_15], %[temp2], %[temp3] \n\t"
398
399 : [load1] "=&r" (load1), [load2] "=&r" (load2),
400 [load3] "=&r" (load3), [load4] "=&r" (load4),
401 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
402 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
403 [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
404 [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
405 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
406 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
407 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
408 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
409 );
410
/* Coefficients input[10], input[22], input[26], input[6] (byte offsets 20,
 * 44, 52, 12) -> step2_10, step2_11, step2_12, step2_13. */
411 __asm__ __volatile__ (
412 "lh %[load1], 20(%[input]) \n\t"
413 "lh %[load2], 44(%[input]) \n\t"
414 "lh %[load3], 52(%[input]) \n\t"
415 "lh %[load4], 12(%[input]) \n\t"
416
417 "mtlo %[const_2_power_13], $ac1 \n\t"
418 "mthi $zero, $ac1 \n\t"
419 "mtlo %[const_2_power_13], $ac3 \n\t"
420 "mthi $zero, $ac3 \n\t"
421
422 "madd $ac1, %[load1], %[cospi_22_64] \n\t"
423 "msub $ac1, %[load2], %[cospi_10_64] \n\t"
424 "extp %[temp0], $ac1, 31 \n\t"
425
426 "madd $ac3, %[load1], %[cospi_10_64] \n\t"
427 "madd $ac3, %[load2], %[cospi_22_64] \n\t"
428 "extp %[temp3], $ac3, 31 \n\t"
429
430 "mtlo %[const_2_power_13], $ac1 \n\t"
431 "mthi $zero, $ac1 \n\t"
432 "mtlo %[const_2_power_13], $ac2 \n\t"
433 "mthi $zero, $ac2 \n\t"
434
435 "madd $ac2, %[load3], %[cospi_6_64] \n\t"
436 "msub $ac2, %[load4], %[cospi_26_64] \n\t"
437 "extp %[temp1], $ac2, 31 \n\t"
438
439 "madd $ac1, %[load3], %[cospi_26_64] \n\t"
440 "madd $ac1, %[load4], %[cospi_6_64] \n\t"
441 "extp %[temp2], $ac1, 31 \n\t"
442
443 "mtlo %[const_2_power_13], $ac1 \n\t"
444 "mthi $zero, $ac1 \n\t"
445 "mtlo %[const_2_power_13], $ac3 \n\t"
446 "mthi $zero, $ac3 \n\t"
447
448 "sub %[load1], %[temp1], %[temp0] \n\t"
449 "sub %[load2], %[temp2], %[temp3] \n\t"
450
451 "msub $ac1, %[load1], %[cospi_24_64] \n\t"
452 "msub $ac1, %[load2], %[cospi_8_64] \n\t"
453 "madd $ac3, %[load2], %[cospi_24_64] \n\t"
454 "msub $ac3, %[load1], %[cospi_8_64] \n\t"
455
456 "extp %[step2_10], $ac1, 31 \n\t"
457 "extp %[step2_13], $ac3, 31 \n\t"
458 "add %[step2_11], %[temp0], %[temp1] \n\t"
459 "add %[step2_12], %[temp2], %[temp3] \n\t"
460
461 : [load1] "=&r" (load1), [load2] "=&r" (load2),
462 [load3] "=&r" (load3), [load4] "=&r" (load4),
463 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
464 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
465 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
466 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
467 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
468 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
469 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
470 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
471 );
472
/* step2_8..15 -> step3_8..15.  step3_8, 9, 14, 15 are plain sums; step3_10,
 * 11, 12, 13 are signed combinations of four step2 values multiplied by
 * cospi_16_64 in one accumulator each ($ac0..$ac3). */
473 __asm__ __volatile__ (
474 "mtlo %[const_2_power_13], $ac0 \n\t"
475 "mthi $zero, $ac0 \n\t"
476 "sub %[temp0], %[step2_14], %[step2_13] \n\t"
477 "sub %[temp0], %[temp0], %[step2_9] \n\t"
478 "add %[temp0], %[temp0], %[step2_10] \n\t"
479 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
480
481 "mtlo %[const_2_power_13], $ac1 \n\t"
482 "mthi $zero, $ac1 \n\t"
483 "sub %[temp1], %[step2_14], %[step2_13] \n\t"
484 "add %[temp1], %[temp1], %[step2_9] \n\t"
485 "sub %[temp1], %[temp1], %[step2_10] \n\t"
486 "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
487
488 "mtlo %[const_2_power_13], $ac2 \n\t"
489 "mthi $zero, $ac2 \n\t"
490 "sub %[temp0], %[step2_15], %[step2_12] \n\t"
491 "sub %[temp0], %[temp0], %[step2_8] \n\t"
492 "add %[temp0], %[temp0], %[step2_11] \n\t"
493 "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
494
495 "mtlo %[const_2_power_13], $ac3 \n\t"
496 "mthi $zero, $ac3 \n\t"
497 "sub %[temp1], %[step2_15], %[step2_12] \n\t"
498 "add %[temp1], %[temp1], %[step2_8] \n\t"
499 "sub %[temp1], %[temp1], %[step2_11] \n\t"
500 "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
501
502 "add %[step3_8], %[step2_8], %[step2_11] \n\t"
503 "add %[step3_9], %[step2_9], %[step2_10] \n\t"
504 "add %[step3_14], %[step2_13], %[step2_14] \n\t"
505 "add %[step3_15], %[step2_12], %[step2_15] \n\t"
506
507 "extp %[step3_10], $ac0, 31 \n\t"
508 "extp %[step3_13], $ac1, 31 \n\t"
509 "extp %[step3_11], $ac2, 31 \n\t"
510 "extp %[step3_12], $ac3, 31 \n\t"
511
512 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
513 [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
514 [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
515 [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
516 [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
517 : [const_2_power_13] "r" (const_2_power_13),
518 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
519 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
520 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
521 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),
522 [cospi_16_64] "r" (cospi_16_64)
523 );
524
/* step1_16..31 -> step3_16..31.  For each rotated pair one output is
 * computed in asm (accumulator + extp) and its partner in plain C with the
 * explicit DCT_CONST_ROUNDING / DCT_CONST_BITS rounding shift. */
525 step2_18 = step1_17 - step1_18;
526 step2_29 = step1_30 - step1_29;
527
528 __asm__ __volatile__ (
529 "mtlo %[const_2_power_13], $ac0 \n\t"
530 "mthi $zero, $ac0 \n\t"
531 "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
532 "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
533 "extp %[step3_18], $ac0, 31 \n\t"
534
535 : [step3_18] "=r" (step3_18)
536 : [const_2_power_13] "r" (const_2_power_13),
537 [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
538 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
539 );
540
541 temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
542 step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
543
544 step2_19 = step1_16 - step1_19;
545 step2_28 = step1_31 - step1_28;
546
547 __asm__ __volatile__ (
548 "mtlo %[const_2_power_13], $ac0 \n\t"
549 "mthi $zero, $ac0 \n\t"
550 "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
551 "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
552 "extp %[step3_19], $ac0, 31 \n\t"
553
554 : [step3_19] "=r" (step3_19)
555 : [const_2_power_13] "r" (const_2_power_13),
556 [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
557 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
558 );
559
560 temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
561 step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
562
563 step3_16 = step1_16 + step1_19;
564 step3_17 = step1_17 + step1_18;
565 step3_30 = step1_29 + step1_30;
566 step3_31 = step1_28 + step1_31;
567
568 step2_20 = step1_23 - step1_20;
569 step2_27 = step1_24 - step1_27;
570
571 __asm__ __volatile__ (
572 "mtlo %[const_2_power_13], $ac0 \n\t"
573 "mthi $zero, $ac0 \n\t"
574 "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
575 "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
576 "extp %[step3_20], $ac0, 31 \n\t"
577
578 : [step3_20] "=r" (step3_20)
579 : [const_2_power_13] "r" (const_2_power_13),
580 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
581 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
582 );
583
584 temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
585 step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
586
587 step2_21 = step1_22 - step1_21;
588 step2_26 = step1_25 - step1_26;
589
590 __asm__ __volatile__ (
591 "mtlo %[const_2_power_13], $ac1 \n\t"
592 "mthi $zero, $ac1 \n\t"
593 "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
594 "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
595 "extp %[step3_21], $ac1, 31 \n\t"
596
597 : [step3_21] "=r" (step3_21)
598 : [const_2_power_13] "r" (const_2_power_13),
599 [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
600 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
601 );
602
603 temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
604 step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
605
606 step3_22 = step1_21 + step1_22;
607 step3_23 = step1_20 + step1_23;
608 step3_24 = step1_24 + step1_27;
609 step3_25 = step1_25 + step1_26;
610
/* Add/subtract butterflies: step3_16..23 -> step2_16..23 and
 * step3_24..31 -> step2_24..31. */
611 step2_16 = step3_16 + step3_23;
612 step2_17 = step3_17 + step3_22;
613 step2_18 = step3_18 + step3_21;
614 step2_19 = step3_19 + step3_20;
615 step2_20 = step3_19 - step3_20;
616 step2_21 = step3_18 - step3_21;
617 step2_22 = step3_17 - step3_22;
618 step2_23 = step3_16 - step3_23;
619
620 step2_24 = step3_31 - step3_24;
621 step2_25 = step3_30 - step3_25;
622 step2_26 = step3_29 - step3_26;
623 step2_27 = step3_28 - step3_27;
624 step2_28 = step3_28 + step3_27;
625 step2_29 = step3_29 + step3_26;
626 step2_30 = step3_30 + step3_25;
627 step2_31 = step3_31 + step3_24;
628
/* Coefficients input[0], input[16], input[8], input[24] (byte offsets 0, 32,
 * 16, 48) -> step1_0, step1_1, step1_2, step1_3. */
629 __asm__ __volatile__ (
630 "lh %[load1], 0(%[input]) \n\t"
631 "lh %[load2], 32(%[input]) \n\t"
632 "lh %[load3], 16(%[input]) \n\t"
633 "lh %[load4], 48(%[input]) \n\t"
634
635 "mtlo %[const_2_power_13], $ac1 \n\t"
636 "mthi $zero, $ac1 \n\t"
637 "mtlo %[const_2_power_13], $ac2 \n\t"
638 "mthi $zero, $ac2 \n\t"
639 "add %[result1], %[load1], %[load2] \n\t"
640 "sub %[result2], %[load1], %[load2] \n\t"
641 "madd $ac1, %[result1], %[cospi_16_64] \n\t"
642 "madd $ac2, %[result2], %[cospi_16_64] \n\t"
643 "extp %[temp0], $ac1, 31 \n\t"
644 "extp %[temp1], $ac2, 31 \n\t"
645
646 "mtlo %[const_2_power_13], $ac3 \n\t"
647 "mthi $zero, $ac3 \n\t"
648 "madd $ac3, %[load3], %[cospi_24_64] \n\t"
649 "msub $ac3, %[load4], %[cospi_8_64] \n\t"
650 "extp %[temp2], $ac3, 31 \n\t"
651
652 "mtlo %[const_2_power_13], $ac1 \n\t"
653 "mthi $zero, $ac1 \n\t"
654 "madd $ac1, %[load3], %[cospi_8_64] \n\t"
655 "madd $ac1, %[load4], %[cospi_24_64] \n\t"
656 "extp %[temp3], $ac1, 31 \n\t"
657
658 "add %[step1_0], %[temp0], %[temp3] \n\t"
659 "add %[step1_1], %[temp1], %[temp2] \n\t"
660 "sub %[step1_2], %[temp1], %[temp2] \n\t"
661 "sub %[step1_3], %[temp0], %[temp3] \n\t"
662
663 : [load1] "=&r" (load1), [load2] "=&r" (load2),
664 [load3] "=&r" (load3), [load4] "=&r" (load4),
665 [result1] "=&r" (result1), [result2] "=&r" (result2),
666 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
667 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
668 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
669 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
670 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
671 [cospi_16_64] "r" (cospi_16_64),
672 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
673
674 );
675
/* Coefficients input[4], input[28], input[20], input[12] (byte offsets 8,
 * 56, 40, 24) -> step1_4, step1_5, step1_6, step1_7. */
676 __asm__ __volatile__ (
677 "lh %[load1], 8(%[input]) \n\t"
678 "lh %[load2], 56(%[input]) \n\t"
679 "lh %[load3], 40(%[input]) \n\t"
680 "lh %[load4], 24(%[input]) \n\t"
681
682 "mtlo %[const_2_power_13], $ac1 \n\t"
683 "mthi $zero, $ac1 \n\t"
684 "mtlo %[const_2_power_13], $ac3 \n\t"
685 "mthi $zero, $ac3 \n\t"
686
687 "madd $ac1, %[load1], %[cospi_28_64] \n\t"
688 "msub $ac1, %[load2], %[cospi_4_64] \n\t"
689 "extp %[temp0], $ac1, 31 \n\t"
690
691 "madd $ac3, %[load1], %[cospi_4_64] \n\t"
692 "madd $ac3, %[load2], %[cospi_28_64] \n\t"
693 "extp %[temp3], $ac3, 31 \n\t"
694
695 "mtlo %[const_2_power_13], $ac1 \n\t"
696 "mthi $zero, $ac1 \n\t"
697 "mtlo %[const_2_power_13], $ac2 \n\t"
698 "mthi $zero, $ac2 \n\t"
699
700 "madd $ac2, %[load3], %[cospi_12_64] \n\t"
701 "msub $ac2, %[load4], %[cospi_20_64] \n\t"
702 "extp %[temp1], $ac2, 31 \n\t"
703
704 "madd $ac1, %[load3], %[cospi_20_64] \n\t"
705 "madd $ac1, %[load4], %[cospi_12_64] \n\t"
706 "extp %[temp2], $ac1, 31 \n\t"
707
708 "mtlo %[const_2_power_13], $ac1 \n\t"
709 "mthi $zero, $ac1 \n\t"
710 "mtlo %[const_2_power_13], $ac3 \n\t"
711 "mthi $zero, $ac3 \n\t"
712
713 "sub %[load1], %[temp3], %[temp2] \n\t"
714 "sub %[load1], %[load1], %[temp0] \n\t"
715 "add %[load1], %[load1], %[temp1] \n\t"
716
717 "sub %[load2], %[temp0], %[temp1] \n\t"
718 "sub %[load2], %[load2], %[temp2] \n\t"
719 "add %[load2], %[load2], %[temp3] \n\t"
720
721 "madd $ac1, %[load1], %[cospi_16_64] \n\t"
722 "madd $ac3, %[load2], %[cospi_16_64] \n\t"
723
724 "extp %[step1_5], $ac1, 31 \n\t"
725 "extp %[step1_6], $ac3, 31 \n\t"
726 "add %[step1_4], %[temp0], %[temp1] \n\t"
727 "add %[step1_7], %[temp3], %[temp2] \n\t"
728
729 : [load1] "=&r" (load1), [load2] "=&r" (load2),
730 [load3] "=&r" (load3), [load4] "=&r" (load4),
731 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
732 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
733 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
734 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
735 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
736 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
737 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
738 [cospi_16_64] "r" (cospi_16_64)
739 );
740
/* Add/subtract butterfly: step1_0..7 -> step2_0..7. */
741 step2_0 = step1_0 + step1_7;
742 step2_1 = step1_1 + step1_6;
743 step2_2 = step1_2 + step1_5;
744 step2_3 = step1_3 + step1_4;
745 step2_4 = step1_3 - step1_4;
746 step2_5 = step1_2 - step1_5;
747 step2_6 = step1_1 - step1_6;
748 step2_7 = step1_0 - step1_7;
749
/* Combine step2_0..7 with step3_8..15; step1_0..15 are reused here to hold
 * the resulting sums and differences. */
750 step1_0 = step2_0 + step3_15;
751 step1_1 = step2_1 + step3_14;
752 step1_2 = step2_2 + step3_13;
753 step1_3 = step2_3 + step3_12;
754 step1_4 = step2_4 + step3_11;
755 step1_5 = step2_5 + step3_10;
756 step1_6 = step2_6 + step3_9;
757 step1_7 = step2_7 + step3_8;
758 step1_8 = step2_7 - step3_8;
759 step1_9 = step2_6 - step3_9;
760 step1_10 = step2_5 - step3_10;
761 step1_11 = step2_4 - step3_11;
762 step1_12 = step2_3 - step3_12;
763 step1_13 = step2_2 - step3_13;
764 step1_14 = step2_1 - step3_14;
765 step1_15 = step2_0 - step3_15;
766
/* step1_20..23 are the differences (step2_27 - step2_20, ...,
 * step2_24 - step2_23) times cospi_16_64, computed in asm; step1_27..24 are
 * the matching sums times cospi_16_64, computed in C with the explicit
 * rounding shift. */
767 __asm__ __volatile__ (
768 "sub %[temp0], %[step2_27], %[step2_20] \n\t"
769 "mtlo %[const_2_power_13], $ac0 \n\t"
770 "mthi $zero, $ac0 \n\t"
771 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
772 "extp %[step1_20], $ac0, 31 \n\t"
773
774 : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
775 : [const_2_power_13] "r" (const_2_power_13),
776 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
777 [cospi_16_64] "r" (cospi_16_64)
778 );
779
780 temp21 = (step2_20 + step2_27) * cospi_16_64;
781 step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
782
783 __asm__ __volatile__ (
784 "sub %[temp0], %[step2_26], %[step2_21] \n\t"
785 "mtlo %[const_2_power_13], $ac0 \n\t"
786 "mthi $zero, $ac0 \n\t"
787 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
788 "extp %[step1_21], $ac0, 31 \n\t"
789
790 : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
791 : [const_2_power_13] "r" (const_2_power_13),
792 [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),
793 [cospi_16_64] "r" (cospi_16_64)
794 );
795
796 temp21 = (step2_21 + step2_26) * cospi_16_64;
797 step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
798
799 __asm__ __volatile__ (
800 "sub %[temp0], %[step2_25], %[step2_22] \n\t"
801 "mtlo %[const_2_power_13], $ac0 \n\t"
802 "mthi $zero, $ac0 \n\t"
803 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
804 "extp %[step1_22], $ac0, 31 \n\t"
805
806 : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
807 : [const_2_power_13] "r" (const_2_power_13),
808 [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),
809 [cospi_16_64] "r" (cospi_16_64)
810 );
811
812 temp21 = (step2_22 + step2_25) * cospi_16_64;
813 step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
814
815 __asm__ __volatile__ (
816 "sub %[temp0], %[step2_24], %[step2_23] \n\t"
817 "mtlo %[const_2_power_13], $ac0 \n\t"
818 "mthi $zero, $ac0 \n\t"
819 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
820 "extp %[step1_23], $ac0, 31 \n\t"
821
822 : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
823 : [const_2_power_13] "r" (const_2_power_13),
824 [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),
825 [cospi_16_64] "r" (cospi_16_64)
826 );
827
828 temp21 = (step2_23 + step2_24) * cospi_16_64;
829 step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
830
/* output[k * 32] and output[(31 - k) * 32] are the sum and the difference
 * of the same pair of values; the stride of 32 writes this row's results
 * down one column of the output buffer. */
831 // final stage
832 output[0 * 32] = step1_0 + step2_31;
833 output[1 * 32] = step1_1 + step2_30;
834 output[2 * 32] = step1_2 + step2_29;
835 output[3 * 32] = step1_3 + step2_28;
836 output[4 * 32] = step1_4 + step1_27;
837 output[5 * 32] = step1_5 + step1_26;
838 output[6 * 32] = step1_6 + step1_25;
839 output[7 * 32] = step1_7 + step1_24;
840 output[8 * 32] = step1_8 + step1_23;
841 output[9 * 32] = step1_9 + step1_22;
842 output[10 * 32] = step1_10 + step1_21;
843 output[11 * 32] = step1_11 + step1_20;
844 output[12 * 32] = step1_12 + step2_19;
845 output[13 * 32] = step1_13 + step2_18;
846 output[14 * 32] = step1_14 + step2_17;
847 output[15 * 32] = step1_15 + step2_16;
848 output[16 * 32] = step1_15 - step2_16;
849 output[17 * 32] = step1_14 - step2_17;
850 output[18 * 32] = step1_13 - step2_18;
851 output[19 * 32] = step1_12 - step2_19;
852 output[20 * 32] = step1_11 - step1_20;
853 output[21 * 32] = step1_10 - step1_21;
854 output[22 * 32] = step1_9 - step1_22;
855 output[23 * 32] = step1_8 - step1_23;
856 output[24 * 32] = step1_7 - step1_24;
857 output[25 * 32] = step1_6 - step1_25;
858 output[26 * 32] = step1_5 - step1_26;
859 output[27 * 32] = step1_4 - step1_27;
860 output[28 * 32] = step1_3 - step2_28;
861 output[29 * 32] = step1_2 - step2_29;
862 output[30 * 32] = step1_1 - step2_30;
863 output[31 * 32] = step1_0 - step2_31;
864
/* Next input row, next output column. */
865 input += 32;
866 output += 1;
867 }
868 }
869
/* 32x32 inverse DCT entry point: runs the row pass into a local 32x32
 * int16_t buffer, then hands that buffer to the column pass together with
 * dest and dest_stride.
 *
 * input:       32x32 int16_t coefficients, read by idct32_1d_rows_dspr2.
 * dest:        destination passed through to the column pass.
 * dest_stride: stride passed through to the column pass.
 *
 * NOTE(review): vp9_idct32_1d_cols_add_blk_dspr2 is defined elsewhere; going
 * by its name it presumably adds the result into dest -- confirm there.
 */
870 void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
871 int dest_stride) {
/* Intermediate buffer, declared with 32-byte alignment, holding the output of
 * the row pass. */
872 DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
873 int16_t *outptr = out;
874 uint32_t pos = 45;
875
/* Must run before the row pass: its "extp" instructions take their extract
 * position from the DSP control register written here. */
876 /* bit position for extract from acc */
877 __asm__ __volatile__ (
878 "wrdsp %[pos], 1 \n\t"
879 :
880 : [pos] "r" (pos)
881 );
882
883 // Rows
884 idct32_1d_rows_dspr2(input, outptr);
885
886 // Columns
887 vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
888 }
889
/*
 * DC-only 32x32 inverse transform with add, DSPr2 flavour.
 *
 * input  - coefficient block; only input[0] is read.
 * dest   - destination pixels; 32 rows of 32 bytes are updated in place.
 *          Rows are accessed with word loads/stores, so dest and stride are
 *          expected to keep every row four-byte aligned.
 * stride - distance in bytes between consecutive rows of dest.
 *
 * The DC term a1 = (DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]) + 32) >> 6
 * is added to (or, when negative, its magnitude is subtracted from) every
 * pixel using saturating quad-byte arithmetic.
 */
void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int stride) {
  int r, out;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"

      :
      : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  /* a1 = (out + 32) >> 6, i.e. round-to-nearest division by 64. */
  __asm__ __volatile__ (
      "addi     %[out],    %[out],    32      \n\t"
      "sra      %[a1],     %[out],    6       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  /* replv.qb only replicates the low 8 bits of its source register, so a
   * magnitude above 255 would wrap (e.g. 256 -> 0) instead of saturating.
   * Because the per-pixel add/sub below saturates to [0, 255], clamping the
   * magnitude to 255 yields exactly the same pixels as the unclamped value
   * would with true clipping. */
  if (a1 > 255) {
    a1 = 255;
  } else if (a1 < -255) {
    a1 = -255;
  }

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "abs        %[absa1],     %[a1]         \n\t"
        "replv.qb   %[vector_a1], %[absa1]      \n\t"

        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    /* Negative DC: subtract |a1| from each of the 32 bytes of every row. */
    for (r = 32; r--;) {
      __asm__ __volatile__ (
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"

          "lw             %[t1],          16(%[dest])                     \n\t"
          "lw             %[t2],          20(%[dest])                     \n\t"
          "lw             %[t3],          24(%[dest])                     \n\t"
          "lw             %[t4],          28(%[dest])                     \n\t"
          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    16(%[dest])                     \n\t"
          "sw             %[vector_2],    20(%[dest])                     \n\t"
          "sw             %[vector_3],    24(%[dest])                     \n\t"
          "sw             %[vector_4],    28(%[dest])                     \n\t"

          /* advance to the next row */
          "add            %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
            [dest] "+&r" (dest)
          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
          /* the asm reads and writes *dest, which is not an operand */
          : "memory"
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb       %[vector_a1],   %[a1]     \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    /* Non-negative DC: add a1 to each of the 32 bytes of every row. */
    for (r = 32; r--;) {
      __asm__ __volatile__ (
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"

          "lw             %[t1],          16(%[dest])                     \n\t"
          "lw             %[t2],          20(%[dest])                     \n\t"
          "lw             %[t3],          24(%[dest])                     \n\t"
          "lw             %[t4],          28(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    16(%[dest])                     \n\t"
          "sw             %[vector_2],    20(%[dest])                     \n\t"
          "sw             %[vector_3],    24(%[dest])                     \n\t"
          "sw             %[vector_4],    28(%[dest])                     \n\t"

          /* advance to the next row */
          "add            %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
            [dest] "+&r" (dest)
          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
          /* the asm reads and writes *dest, which is not an operand */
          : "memory"
      );
    }
  }
}
1013 #endif // #if HAVE_DSPR2
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c ('k') | source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698