Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(185)

Side by Side Diff: source/libvpx/vpx_dsp/mips/vpx_convolve8_dspr2.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/vpx_common_dspr2.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/vpx_filter.h"
18 #include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
21 uint8_t vpx_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
22 uint8_t *vpx_ff_cropTbl;
23
24 void vpx_dsputil_static_init(void) {
25 int i;
26
27 for (i = 0; i < 256; i++) vpx_ff_cropTbl_a[i + CROP_WIDTH] = i;
28
29 for (i = 0; i < CROP_WIDTH; i++) {
30 vpx_ff_cropTbl_a[i] = 0;
31 vpx_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
32 }
33
34 vpx_ff_cropTbl = &vpx_ff_cropTbl_a[CROP_WIDTH];
35 }
36
/* 8-tap horizontal convolution of 4 output pixels per source row, written
 * transposed: the 4 outputs of one row are stored down a column (stepping
 * dst_stride bytes per pixel), and each new source row advances the output
 * pointer by one byte (dst += 1).  filter_x0 holds the 8 int16 taps; h is
 * the number of source rows to process.  Results are clamped to [0, 255]
 * through the vpx_ff_cropTbl lookup (the lbux instructions), so
 * vpx_dsputil_static_init() must have run first. */
static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int16_t *filter_x0,
                                              int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp-to-byte lookup table */
  uint8_t *dst_ptr;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;  /* rounding offset preloaded into each accumulator */
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t tn1, tn2;

  /* Each int32 load packs two adjacent int16 taps for the dpa.w.ph dual
   * multiply-accumulate.  NOTE(review): reads the int16 filter through an
   * int32 pointer — assumes filter_x0 is 4-byte aligned; confirm this
   * matches how callers allocate the filter. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    dst_ptr = dst;
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);

    /* Computes the 4 outputs as two even/odd pairs; balign shifts the
     * loaded source words by one byte to form the odd-pixel windows. */
    __asm__ __volatile__ (
        "ulw %[tp1], 0(%[src]) \n\t"
        "ulw %[tp2], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "preceu.ph.qbr %[p3], %[tp2] \n\t"
        "preceu.ph.qbl %[p4], %[tp2] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "ulw %[tn2], 8(%[src]) \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"

        /* even 2. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tn2] \n\t"
        "balign %[tn1], %[tn2], 3 \n\t"
        "balign %[tn2], %[tp2], 3 \n\t"
        "balign %[tp2], %[tp1], 3 \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* odd 1. pixel */
        "lbux %[tp1], %[Temp1](%[cm]) \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "preceu.ph.qbl %[p2], %[tp2] \n\t"
        "preceu.ph.qbr %[p3], %[tn2] \n\t"
        "preceu.ph.qbl %[p4], %[tn2] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        /* odd 2. pixel */
        "lbux %[tp2], %[Temp3](%[cm]) \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tn1] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
        "extp %[Temp4], $ac2, 31 \n\t"

        /* clamp */
        "lbux %[tn1], %[Temp2](%[cm]) \n\t"
        "lbux %[p2], %[Temp4](%[cm]) \n\t"

        /* store bytes */
        "sb %[tp1], 0(%[dst_ptr]) \n\t"
        "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"

        "sb %[tn1], 0(%[dst_ptr]) \n\t"
        "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"

        "sb %[tp2], 0(%[dst_ptr]) \n\t"
        "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"

        "sb %[p2], 0(%[dst_ptr]) \n\t"
        "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
          [dst_ptr] "+r" (dst_ptr)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
    );

    /* Next row... */
    src += src_stride;
    dst += 1;  /* next output column (transposed layout) */
  }
}
152
/* 8-tap horizontal convolution of 8 output pixels per source row, written
 * transposed.  Even-indexed outputs are stored via dst_ptr and odd-indexed
 * outputs via odd_dst (dst_ptr + dst_stride), each stepping dst_pitch_2 =
 * 2 * dst_stride per pixel, so the 8 outputs interleave down one column.
 * Each new source row advances the output pointer by one byte (dst += 1).
 * filter_x0 holds the 8 int16 taps; h is the number of source rows.
 * Results are clamped to [0, 255] through vpx_ff_cropTbl (lbux), so
 * vpx_dsputil_static_init() must have run first. */
static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int16_t *filter_x0,
                                              int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp-to-byte lookup table */
  uint8_t *dst_ptr;              /* destination for even output pixels */
  uint32_t vector4a = 64;        /* rounding offset preloaded into accumulators */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3;
  uint32_t p1, p2, p3, p4, n1;
  uint8_t *odd_dst;              /* destination for odd output pixels */
  uint32_t dst_pitch_2 = (dst_stride << 1);

  /* Each int32 load packs two adjacent int16 taps for dpa.w.ph.
   * NOTE(review): int32 access to the int16 filter assumes 4-byte
   * alignment of filter_x0 — confirm against callers. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);

    dst_ptr = dst;
    odd_dst = (dst_ptr + dst_stride);

    /* Even pixels use source windows loaded at offsets 0/4/8/12; odd
     * pixels reload the same data shifted by one byte (offsets 1/5/9/13).
     * Stores are interleaved with the MACs to hide lookup latency. */
    __asm__ __volatile__ (
        "ulw %[tp2], 0(%[src]) \n\t"
        "ulw %[tp1], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "preceu.ph.qbl %[p2], %[tp2] \n\t"
        "preceu.ph.qbr %[p3], %[tp1] \n\t"
        "preceu.ph.qbl %[p4], %[tp1] \n\t"
        "ulw %[tp3], 8(%[src]) \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr %[p1], %[tp3] \n\t"
        "preceu.ph.qbl %[n1], %[tp3] \n\t"
        "ulw %[tp2], 12(%[src]) \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* even 3. pixel */
        "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "preceu.ph.qbr %[p2], %[tp2] \n\t"
        "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
        "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
        "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
        "lbux %[tp3], %[Temp3](%[cm]) \n\t"
        "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
        "extp %[p3], $ac1, 31 \n\t"

        /* even 4. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "sb %[Temp2], 0(%[dst_ptr]) \n\t"
        "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
        "sb %[tp3], 0(%[dst_ptr]) \n\t"
        "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"

        "ulw %[tp1], 1(%[src]) \n\t"
        "ulw %[tp3], 5(%[src]) \n\t"

        "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        "lbux %[tp2], %[p3](%[cm]) \n\t"

        /* odd 1. pixel */
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "preceu.ph.qbr %[p3], %[tp3] \n\t"
        "preceu.ph.qbl %[p4], %[tp3] \n\t"
        "sb %[tp2], 0(%[dst_ptr]) \n\t"
        "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
        "ulw %[tp2], 9(%[src]) \n\t"

        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        /* odd 2. pixel */
        "lbux %[tp1], %[Temp3](%[cm]) \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "preceu.ph.qbl %[n1], %[tp2] \n\t"
        "ulw %[Temp1], 13(%[src]) \n\t"
        "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
        "sb %[tp1], 0(%[dst_ptr]) \n\t"
        "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
        "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac1, 31 \n\t"

        /* odd 3. pixel */
        "lbux %[tp3], %[Temp2](%[cm]) \n\t"
        "preceu.ph.qbr %[p2], %[Temp1] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        /* odd 4. pixel */
        "sb %[tp3], 0(%[odd_dst]) \n\t"
        "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
        "extp %[Temp1], $ac2, 31 \n\t"

        /* clamp */
        "lbux %[p4], %[Temp3](%[cm]) \n\t"
        "lbux %[p2], %[Temp2](%[cm]) \n\t"
        "lbux %[n1], %[Temp1](%[cm]) \n\t"

        /* store bytes */
        "sb %[p4], 0(%[odd_dst]) \n\t"
        "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"

        "sb %[p2], 0(%[odd_dst]) \n\t"
        "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"

        "sb %[n1], 0(%[odd_dst]) \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a), [cm] "r" (cm),
          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
    );

    /* Next row... */
    src += src_stride;
    dst += 1;  /* next output column (transposed layout) */
  }
}
328
/* 8-tap horizontal convolution producing count * 16 output pixels per
 * source row, written transposed.  Works in chunks of 16 pixels: within a
 * chunk the 8 even outputs go down the column via dst (stepping
 * dst_pitch_2 = 2 * dst_stride) and the 8 odd outputs via odd_dst
 * (dst + dst_stride), interleaving to fill 16 consecutive column entries.
 * After each chunk, src advances 16 bytes and dst jumps to the start of
 * the next 16-entry group of the column.  Each new source row moves the
 * output one byte to the right (dst_ptr += 1).  filter_x0 holds the 8
 * int16 taps; h is the number of source rows.  Results are clamped to
 * [0, 255] through vpx_ff_cropTbl (lbux), so vpx_dsputil_static_init()
 * must have run first. */
static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
                                               int32_t src_stride,
                                               uint8_t *dst_ptr,
                                               int32_t dst_stride,
                                               const int16_t *filter_x0,
                                               int32_t h,
                                               int32_t count) {
  int32_t c, y;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp-to-byte lookup table */
  uint32_t vector_64 = 64;       /* rounding offset preloaded into accumulators */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  uint32_t dst_pitch_2 = (dst_stride << 1);
  uint8_t *odd_dst;              /* destination for odd output pixels */

  /* Each int32 load packs two adjacent int16 taps for dpa.w.ph.
   * NOTE(review): int32 access to the int16 filter assumes 4-byte
   * alignment of filter_x0 — confirm against callers. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);

    src = src_ptr;
    dst = dst_ptr;

    odd_dst = (dst + dst_stride);

    for (c = 0; c < count; c++) {
      /* 8 even pixels from source offsets 0..20, then 8 odd pixels from
       * the same data shifted one byte (offsets 1..21).  The three DSP
       * accumulators are rotated so each result's clamp/store overlaps
       * the next pixel's multiply-accumulates. */
      __asm__ __volatile__ (
          "ulw %[qload1], 0(%[src]) \n \t"
          "ulw %[qload2], 4(%[src]) \n \t"

          /* even 1. pixel */
          "mtlo %[vector_64], $ac1 \n \t" /* even 1 */
          "mthi $zero, $ac1 \n \t"
          "mtlo %[vector_64], $ac2 \n \t" /* even 2 */
          "mthi $zero, $ac2 \n \t"
          "preceu.ph.qbr %[p3], %[qload2] \n \t"
          "preceu.ph.qbl %[p4], %[qload2] \n \t"
          "preceu.ph.qbr %[p1], %[qload1] \n \t"
          "preceu.ph.qbl %[p2], %[qload1] \n \t"
          "ulw %[qload2], 8(%[src]) \n \t"
          "dpa.w.ph $ac1, %[p1], %[filter12] \n \t" /* even 1 */
          "dpa.w.ph $ac1, %[p2], %[filter34] \n \t" /* even 1 */
          "dpa.w.ph $ac1, %[p3], %[filter56] \n \t" /* even 1 */
          "dpa.w.ph $ac1, %[p4], %[filter78] \n \t" /* even 1 */
          "extp %[Temp1], $ac1, 31 \n \t" /* even 1 */

          /* even 2. pixel */
          "mtlo %[vector_64], $ac3 \n \t" /* even 3 */
          "mthi $zero, $ac3 \n \t"
          "preceu.ph.qbr %[p1], %[qload2] \n \t"
          "preceu.ph.qbl %[p5], %[qload2] \n \t"
          "ulw %[qload1], 12(%[src]) \n \t"
          "dpa.w.ph $ac2, %[p2], %[filter12] \n \t" /* even 1 */
          "dpa.w.ph $ac2, %[p3], %[filter34] \n \t" /* even 1 */
          "dpa.w.ph $ac2, %[p4], %[filter56] \n \t" /* even 1 */
          "dpa.w.ph $ac2, %[p1], %[filter78] \n \t" /* even 1 */
          "lbux %[st1], %[Temp1](%[cm]) \n \t" /* even 1 */
          "extp %[Temp2], $ac2, 31 \n \t" /* even 1 */

          /* even 3. pixel */
          "mtlo %[vector_64], $ac1 \n \t" /* even 4 */
          "mthi $zero, $ac1 \n \t"
          "preceu.ph.qbr %[p2], %[qload1] \n \t"
          "sb %[st1], 0(%[dst]) \n \t" /* even 1 */
          "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
          "dpa.w.ph $ac3, %[p3], %[filter12] \n \t" /* even 3 */
          "dpa.w.ph $ac3, %[p4], %[filter34] \n \t" /* even 3 */
          "dpa.w.ph $ac3, %[p1], %[filter56] \n \t" /* even 3 */
          "dpa.w.ph $ac3, %[p5], %[filter78] \n \t" /* even 3 */
          "extp %[Temp3], $ac3, 31 \n \t" /* even 3 */
          "lbux %[st2], %[Temp2](%[cm]) \n \t" /* even 1 */

          /* even 4. pixel */
          "mtlo %[vector_64], $ac2 \n \t" /* even 5 */
          "mthi $zero, $ac2 \n \t"
          "preceu.ph.qbl %[p3], %[qload1] \n \t"
          "sb %[st2], 0(%[dst]) \n \t" /* even 2 */
          "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
          "ulw %[qload2], 16(%[src]) \n \t"
          "dpa.w.ph $ac1, %[p4], %[filter12] \n \t" /* even 4 */
          "dpa.w.ph $ac1, %[p1], %[filter34] \n \t" /* even 4 */
          "dpa.w.ph $ac1, %[p5], %[filter56] \n \t" /* even 4 */
          "dpa.w.ph $ac1, %[p2], %[filter78] \n \t" /* even 4 */
          "extp %[Temp1], $ac1, 31 \n \t" /* even 4 */
          "lbux %[st3], %[Temp3](%[cm]) \n \t" /* even 3 */

          /* even 5. pixel */
          "mtlo %[vector_64], $ac3 \n \t" /* even 6 */
          "mthi $zero, $ac3 \n \t"
          "preceu.ph.qbr %[p4], %[qload2] \n \t"
          "sb %[st3], 0(%[dst]) \n \t" /* even 3 */
          "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
          "dpa.w.ph $ac2, %[p1], %[filter12] \n \t" /* even 5 */
          "dpa.w.ph $ac2, %[p5], %[filter34] \n \t" /* even 5 */
          "dpa.w.ph $ac2, %[p2], %[filter56] \n \t" /* even 5 */
          "dpa.w.ph $ac2, %[p3], %[filter78] \n \t" /* even 5 */
          "extp %[Temp2], $ac2, 31 \n \t" /* even 5 */
          "lbux %[st1], %[Temp1](%[cm]) \n \t" /* even 4 */

          /* even 6. pixel */
          "mtlo %[vector_64], $ac1 \n \t" /* even 7 */
          "mthi $zero, $ac1 \n \t"
          "preceu.ph.qbl %[p1], %[qload2] \n \t"
          "sb %[st1], 0(%[dst]) \n \t" /* even 4 */
          "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
          "ulw %[qload1], 20(%[src]) \n \t"
          "dpa.w.ph $ac3, %[p5], %[filter12] \n \t" /* even 6 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n \t" /* even 6 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n \t" /* even 6 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n \t" /* even 6 */
          "extp %[Temp3], $ac3, 31 \n \t" /* even 6 */
          "lbux %[st2], %[Temp2](%[cm]) \n \t" /* even 5 */

          /* even 7. pixel */
          "mtlo %[vector_64], $ac2 \n \t" /* even 8 */
          "mthi $zero, $ac2 \n \t"
          "preceu.ph.qbr %[p5], %[qload1] \n \t"
          "sb %[st2], 0(%[dst]) \n \t" /* even 5 */
          "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
          "dpa.w.ph $ac1, %[p2], %[filter12] \n \t" /* even 7 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n \t" /* even 7 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n \t" /* even 7 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n \t" /* even 7 */
          "extp %[Temp1], $ac1, 31 \n \t" /* even 7 */
          "lbux %[st3], %[Temp3](%[cm]) \n \t" /* even 6 */

          /* even 8. pixel */
          "mtlo %[vector_64], $ac3 \n \t" /* odd 1 */
          "mthi $zero, $ac3 \n \t"
          "dpa.w.ph $ac2, %[p3], %[filter12] \n \t" /* even 8 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n \t" /* even 8 */
          "sb %[st3], 0(%[dst]) \n \t" /* even 6 */
          "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
          "dpa.w.ph $ac2, %[p1], %[filter56] \n \t" /* even 8 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n \t" /* even 8 */
          "extp %[Temp2], $ac2, 31 \n \t" /* even 8 */
          "lbux %[st1], %[Temp1](%[cm]) \n \t" /* even 7 */

          /* ODD pixels */
          "ulw %[qload1], 1(%[src]) \n \t"
          "ulw %[qload2], 5(%[src]) \n \t"

          /* odd 1. pixel */
          "mtlo %[vector_64], $ac1 \n \t" /* odd 2 */
          "mthi $zero, $ac1 \n \t"
          "preceu.ph.qbr %[p1], %[qload1] \n \t"
          "preceu.ph.qbl %[p2], %[qload1] \n \t"
          "preceu.ph.qbr %[p3], %[qload2] \n \t"
          "preceu.ph.qbl %[p4], %[qload2] \n \t"
          "sb %[st1], 0(%[dst]) \n \t" /* even 7 */
          "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
          "ulw %[qload2], 9(%[src]) \n \t"
          "dpa.w.ph $ac3, %[p1], %[filter12] \n \t" /* odd 1 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n \t" /* odd 1 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n \t" /* odd 1 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n \t" /* odd 1 */
          "extp %[Temp3], $ac3, 31 \n \t" /* odd 1 */
          "lbux %[st2], %[Temp2](%[cm]) \n \t" /* even 8 */

          /* odd 2. pixel */
          "mtlo %[vector_64], $ac2 \n \t" /* odd 3 */
          "mthi $zero, $ac2 \n \t"
          "preceu.ph.qbr %[p1], %[qload2] \n \t"
          "preceu.ph.qbl %[p5], %[qload2] \n \t"
          "sb %[st2], 0(%[dst]) \n \t" /* even 8 */
          "ulw %[qload1], 13(%[src]) \n \t"
          "dpa.w.ph $ac1, %[p2], %[filter12] \n \t" /* odd 2 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n \t" /* odd 2 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n \t" /* odd 2 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n \t" /* odd 2 */
          "extp %[Temp1], $ac1, 31 \n \t" /* odd 2 */
          "lbux %[st3], %[Temp3](%[cm]) \n \t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo %[vector_64], $ac3 \n \t" /* odd 4 */
          "mthi $zero, $ac3 \n \t"
          "preceu.ph.qbr %[p2], %[qload1] \n \t"
          "sb %[st3], 0(%[odd_dst]) \n \t" /* odd 1 */
          "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
          "dpa.w.ph $ac2, %[p3], %[filter12] \n \t" /* odd 3 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n \t" /* odd 3 */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n \t" /* odd 3 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n \t" /* odd 3 */
          "extp %[Temp2], $ac2, 31 \n \t" /* odd 3 */
          "lbux %[st1], %[Temp1](%[cm]) \n \t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo %[vector_64], $ac1 \n \t" /* odd 5 */
          "mthi $zero, $ac1 \n \t"
          "preceu.ph.qbl %[p3], %[qload1] \n \t"
          "sb %[st1], 0(%[odd_dst]) \n \t" /* odd 2 */
          "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
          "ulw %[qload2], 17(%[src]) \n \t"
          "dpa.w.ph $ac3, %[p4], %[filter12] \n \t" /* odd 4 */
          "dpa.w.ph $ac3, %[p1], %[filter34] \n \t" /* odd 4 */
          "dpa.w.ph $ac3, %[p5], %[filter56] \n \t" /* odd 4 */
          "dpa.w.ph $ac3, %[p2], %[filter78] \n \t" /* odd 4 */
          "extp %[Temp3], $ac3, 31 \n \t" /* odd 4 */
          "lbux %[st2], %[Temp2](%[cm]) \n \t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo %[vector_64], $ac2 \n \t" /* odd 6 */
          "mthi $zero, $ac2 \n \t"
          "preceu.ph.qbr %[p4], %[qload2] \n \t"
          "sb %[st2], 0(%[odd_dst]) \n \t" /* odd 3 */
          "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
          "dpa.w.ph $ac1, %[p1], %[filter12] \n \t" /* odd 5 */
          "dpa.w.ph $ac1, %[p5], %[filter34] \n \t" /* odd 5 */
          "dpa.w.ph $ac1, %[p2], %[filter56] \n \t" /* odd 5 */
          "dpa.w.ph $ac1, %[p3], %[filter78] \n \t" /* odd 5 */
          "extp %[Temp1], $ac1, 31 \n \t" /* odd 5 */
          "lbux %[st3], %[Temp3](%[cm]) \n \t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo %[vector_64], $ac3 \n \t" /* odd 7 */
          "mthi $zero, $ac3 \n \t"
          "preceu.ph.qbl %[p1], %[qload2] \n \t"
          "sb %[st3], 0(%[odd_dst]) \n \t" /* odd 4 */
          "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
          "ulw %[qload1], 21(%[src]) \n \t"
          "dpa.w.ph $ac2, %[p5], %[filter12] \n \t" /* odd 6 */
          "dpa.w.ph $ac2, %[p2], %[filter34] \n \t" /* odd 6 */
          "dpa.w.ph $ac2, %[p3], %[filter56] \n \t" /* odd 6 */
          "dpa.w.ph $ac2, %[p4], %[filter78] \n \t" /* odd 6 */
          "extp %[Temp2], $ac2, 31 \n \t" /* odd 6 */
          "lbux %[st1], %[Temp1](%[cm]) \n \t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo %[vector_64], $ac1 \n \t" /* odd 8 */
          "mthi $zero, $ac1 \n \t"
          "preceu.ph.qbr %[p5], %[qload1] \n \t"
          "sb %[st1], 0(%[odd_dst]) \n \t" /* odd 5 */
          "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
          "dpa.w.ph $ac3, %[p2], %[filter12] \n \t" /* odd 7 */
          "dpa.w.ph $ac3, %[p3], %[filter34] \n \t" /* odd 7 */
          "dpa.w.ph $ac3, %[p4], %[filter56] \n \t" /* odd 7 */
          "dpa.w.ph $ac3, %[p1], %[filter78] \n \t" /* odd 7 */
          "extp %[Temp3], $ac3, 31 \n \t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph $ac1, %[p3], %[filter12] \n \t" /* odd 8 */
          "dpa.w.ph $ac1, %[p4], %[filter34] \n \t" /* odd 8 */
          "dpa.w.ph $ac1, %[p1], %[filter56] \n \t" /* odd 8 */
          "dpa.w.ph $ac1, %[p5], %[filter78] \n \t" /* odd 8 */
          "extp %[Temp1], $ac1, 31 \n \t" /* odd 8 */

          "lbux %[st2], %[Temp2](%[cm]) \n \t" /* odd 6 */
          "lbux %[st3], %[Temp3](%[cm]) \n \t" /* odd 7 */
          "lbux %[st1], %[Temp1](%[cm]) \n \t" /* odd 8 */

          "sb %[st2], 0(%[odd_dst]) \n \t" /* odd 6 */
          "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"

          "sb %[st3], 0(%[odd_dst]) \n \t" /* odd 7 */
          "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"

          "sb %[st1], 0(%[odd_dst]) \n \t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64), [cm] "r" (cm),
            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
      );

      src += 16;
      /* Jump to the start of the next 16-output group of this column. */
      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
      odd_dst = (dst + dst_stride);
    }

    /* Next row... */
    src_ptr += src_stride;

    dst_ptr += 1;  /* next output column (transposed layout) */
  }
}
619
620 static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
621 int32_t src_stride,
622 uint8_t *dst_ptr,
623 int32_t dst_stride,
624 const int16_t *filter_x0,
625 int32_t h) {
626 int32_t c, y;
627 const uint8_t *src;
628 uint8_t *dst;
629 uint8_t *cm = vpx_ff_cropTbl;
630 uint32_t vector_64 = 64;
631 int32_t filter12, filter34, filter56, filter78;
632 int32_t Temp1, Temp2, Temp3;
633 uint32_t qload1, qload2;
634 uint32_t p1, p2, p3, p4, p5;
635 uint32_t st1, st2, st3;
636 uint32_t dst_pitch_2 = (dst_stride << 1);
637 uint8_t *odd_dst;
638
639 filter12 = ((const int32_t *)filter_x0)[0];
640 filter34 = ((const int32_t *)filter_x0)[1];
641 filter56 = ((const int32_t *)filter_x0)[2];
642 filter78 = ((const int32_t *)filter_x0)[3];
643
644 for (y = h; y--;) {
645 /* prefetch data to cache memory */
646 prefetch_load(src_ptr + src_stride);
647 prefetch_load(src_ptr + src_stride + 32);
648 prefetch_load(src_ptr + src_stride + 64);
649
650 src = src_ptr;
651 dst = dst_ptr;
652
653 odd_dst = (dst + dst_stride);
654
655 for (c = 0; c < 4; c++) {
656 __asm__ __volatile__ (
657 "ulw %[qload1], 0(%[src]) \n \t"
658 "ulw %[qload2], 4(%[src]) \n \t"
659
660 /* even 1. pixel */
661 "mtlo %[vector_64], $ac1 \n \t" /* even 1 */
662 "mthi $zero, $ac1 \n \t"
663 "mtlo %[vector_64], $ac2 \n \t" /* even 2 */
664 "mthi $zero, $ac2 \n \t"
665 "preceu.ph.qbr %[p3], %[qload2] \n \t"
666 "preceu.ph.qbl %[p4], %[qload2] \n \t"
667 "preceu.ph.qbr %[p1], %[qload1] \n \t"
668 "preceu.ph.qbl %[p2], %[qload1] \n \t"
669 "ulw %[qload2], 8(%[src]) \n \t"
670 "dpa.w.ph $ac1, %[p1], %[filter12] \n \t" /* even 1 */
671 "dpa.w.ph $ac1, %[p2], %[filter34] \n \t" /* even 1 */
672 "dpa.w.ph $ac1, %[p3], %[filter56] \n \t" /* even 1 */
673 "dpa.w.ph $ac1, %[p4], %[filter78] \n \t" /* even 1 */
674 "extp %[Temp1], $ac1, 31 \n \t" /* even 1 */
675
676 /* even 2. pixel */
677 "mtlo %[vector_64], $ac3 \n \t" /* even 3 */
678 "mthi $zero, $ac3 \n \t"
679 "preceu.ph.qbr %[p1], %[qload2] \n \t"
680 "preceu.ph.qbl %[p5], %[qload2] \n \t"
681 "ulw %[qload1], 12(%[src]) \n \t"
682 "dpa.w.ph $ac2, %[p2], %[filter12] \n \t" /* even 1 */
683 "dpa.w.ph $ac2, %[p3], %[filter34] \n \t" /* even 1 */
684 "dpa.w.ph $ac2, %[p4], %[filter56] \n \t" /* even 1 */
685 "dpa.w.ph $ac2, %[p1], %[filter78] \n \t" /* even 1 */
686 "lbux %[st1], %[Temp1](%[cm]) \n \t" /* even 1 */
687 "extp %[Temp2], $ac2, 31 \n \t" /* even 1 */
688
689 /* even 3. pixel */
690 "mtlo %[vector_64], $ac1 \n \t" /* even 4 */
691 "mthi $zero, $ac1 \n \t"
692 "preceu.ph.qbr %[p2], %[qload1] \n \t"
693 "sb %[st1], 0(%[dst]) \n \t" /* even 1 */
694 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
695 "dpa.w.ph $ac3, %[p3], %[filter12] \n \t" /* even 3 */
696 "dpa.w.ph $ac3, %[p4], %[filter34] \n \t" /* even 3 */
697 "dpa.w.ph $ac3, %[p1], %[filter56] \n \t" /* even 3 */
698 "dpa.w.ph $ac3, %[p5], %[filter78] \n \t" /* even 3 */
699 "extp %[Temp3], $ac3, 31 \n \t" /* even 3 */
700 "lbux %[st2], %[Temp2](%[cm]) \n \t" /* even 1 */
701
702 /* even 4. pixel */
703 "mtlo %[vector_64], $ac2 \n \t" /* even 5 */
704 "mthi $zero, $ac2 \n \t"
705 "preceu.ph.qbl %[p3], %[qload1] \n \t"
706 "sb %[st2], 0(%[dst]) \n \t" /* even 2 */
707 "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
708 "ulw %[qload2], 16(%[src]) \n \t"
709 "dpa.w.ph $ac1, %[p4], %[filter12] \n \t" /* even 4 */
710 "dpa.w.ph $ac1, %[p1], %[filter34] \n \t" /* even 4 */
711 "dpa.w.ph $ac1, %[p5], %[filter56] \n \t" /* even 4 */
712 "dpa.w.ph $ac1, %[p2], %[filter78] \n \t" /* even 4 */
713 "extp %[Temp1], $ac1, 31 \n \t" /* even 4 */
714 "lbux %[st3], %[Temp3](%[cm]) \n \t" /* even 3 */
715
716 /* even 5. pixel */
717 "mtlo %[vector_64], $ac3 \n \t" /* even 6 */
718 "mthi $zero, $ac3 \n \t"
719 "preceu.ph.qbr %[p4], %[qload2] \n \t"
720 "sb %[st3], 0(%[dst]) \n \t" /* even 3 */
721 "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
722 "dpa.w.ph $ac2, %[p1], %[filter12] \n \t" /* even 5 */
723 "dpa.w.ph $ac2, %[p5], %[filter34] \n \t" /* even 5 */
724 "dpa.w.ph $ac2, %[p2], %[filter56] \n \t" /* even 5 */
725 "dpa.w.ph $ac2, %[p3], %[filter78] \n \t" /* even 5 */
726 "extp %[Temp2], $ac2, 31 \n \t" /* even 5 */
727 "lbux %[st1], %[Temp1](%[cm]) \n \t" /* even 4 */
728
729 /* even 6. pixel */
730 "mtlo %[vector_64], $ac1 \n \t" /* even 7 */
731 "mthi $zero, $ac1 \n \t"
732 "preceu.ph.qbl %[p1], %[qload2] \n \t"
733 "sb %[st1], 0(%[dst]) \n \t" /* even 4 */
734 "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
735 "ulw %[qload1], 20(%[src]) \n \t"
736 "dpa.w.ph $ac3, %[p5], %[filter12] \n \t" /* even 6 */
737 "dpa.w.ph $ac3, %[p2], %[filter34] \n \t" /* even 6 */
738 "dpa.w.ph $ac3, %[p3], %[filter56] \n \t" /* even 6 */
739 "dpa.w.ph $ac3, %[p4], %[filter78] \n \t" /* even 6 */
740 "extp %[Temp3], $ac3, 31 \n \t" /* even 6 */
741 "lbux %[st2], %[Temp2](%[cm]) \n \t" /* even 5 */
742
743 /* even 7. pixel */
744 "mtlo %[vector_64], $ac2 \n \t" /* even 8 */
745 "mthi $zero, $ac2 \n \t"
746 "preceu.ph.qbr %[p5], %[qload1] \n \t"
747 "sb %[st2], 0(%[dst]) \n \t" /* even 5 */
748 "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
749 "dpa.w.ph $ac1, %[p2], %[filter12] \n \t" /* even 7 */
750 "dpa.w.ph $ac1, %[p3], %[filter34] \n \t" /* even 7 */
751 "dpa.w.ph $ac1, %[p4], %[filter56] \n \t" /* even 7 */
752 "dpa.w.ph $ac1, %[p1], %[filter78] \n \t" /* even 7 */
753 "extp %[Temp1], $ac1, 31 \n \t" /* even 7 */
754 "lbux %[st3], %[Temp3](%[cm]) \n \t" /* even 6 */
755
756 /* even 8. pixel */
757 "mtlo %[vector_64], $ac3 \n \t" /* odd 1 */
758 "mthi $zero, $ac3 \n \t"
759 "dpa.w.ph $ac2, %[p3], %[filter12] \n \t" /* even 8 */
760 "dpa.w.ph $ac2, %[p4], %[filter34] \n \t" /* even 8 */
761 "sb %[st3], 0(%[dst]) \n \t" /* even 6 */
762 "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
763 "dpa.w.ph $ac2, %[p1], %[filter56] \n \t" /* even 8 */
764 "dpa.w.ph $ac2, %[p5], %[filter78] \n \t" /* even 8 */
765 "extp %[Temp2], $ac2, 31 \n \t" /* even 8 */
766 "lbux %[st1], %[Temp1](%[cm]) \n \t" /* even 7 */
767
768 /* ODD pixels */
769 "ulw %[qload1], 1(%[src]) \n \t"
770 "ulw %[qload2], 5(%[src]) \n \t"
771
772 /* odd 1. pixel */
773 "mtlo %[vector_64], $ac1 \n \t" /* odd 2 */
774 "mthi $zero, $ac1 \n \t"
775 "preceu.ph.qbr %[p1], %[qload1] \n \t"
776 "preceu.ph.qbl %[p2], %[qload1] \n \t"
777 "preceu.ph.qbr %[p3], %[qload2] \n \t"
778 "preceu.ph.qbl %[p4], %[qload2] \n \t"
779 "sb %[st1], 0(%[dst]) \n \t" /* even 7 */
780 "addu %[dst], %[dst], %[dst_pitch_2] \n \t"
781 "ulw %[qload2], 9(%[src]) \n \t"
782 "dpa.w.ph $ac3, %[p1], %[filter12] \n \t" /* odd 1 */
783 "dpa.w.ph $ac3, %[p2], %[filter34] \n \t" /* odd 1 */
784 "dpa.w.ph $ac3, %[p3], %[filter56] \n \t" /* odd 1 */
785 "dpa.w.ph $ac3, %[p4], %[filter78] \n \t" /* odd 1 */
786 "extp %[Temp3], $ac3, 31 \n \t" /* odd 1 */
787 "lbux %[st2], %[Temp2](%[cm]) \n \t" /* even 8 */
788
789 /* odd 2. pixel */
790 "mtlo %[vector_64], $ac2 \n \t" /* odd 3 */
791 "mthi $zero, $ac2 \n \t"
792 "preceu.ph.qbr %[p1], %[qload2] \n \t"
793 "preceu.ph.qbl %[p5], %[qload2] \n \t"
794 "sb %[st2], 0(%[dst]) \n \t" /* even 8 */
795 "ulw %[qload1], 13(%[src]) \n \t"
796 "dpa.w.ph $ac1, %[p2], %[filter12] \n \t" /* odd 2 */
797 "dpa.w.ph $ac1, %[p3], %[filter34] \n \t" /* odd 2 */
798 "dpa.w.ph $ac1, %[p4], %[filter56] \n \t" /* odd 2 */
799 "dpa.w.ph $ac1, %[p1], %[filter78] \n \t" /* odd 2 */
800 "extp %[Temp1], $ac1, 31 \n \t" /* odd 2 */
801 "lbux %[st3], %[Temp3](%[cm]) \n \t" /* odd 1 */
802
803 /* odd 3. pixel */
804 "mtlo %[vector_64], $ac3 \n \t" /* odd 4 */
805 "mthi $zero, $ac3 \n \t"
806 "preceu.ph.qbr %[p2], %[qload1] \n \t"
807 "sb %[st3], 0(%[odd_dst]) \n \t" /* odd 1 */
808 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
809 "dpa.w.ph $ac2, %[p3], %[filter12] \n \t" /* odd 3 */
810 "dpa.w.ph $ac2, %[p4], %[filter34] \n \t" /* odd 3 */
811 "dpa.w.ph $ac2, %[p1], %[filter56] \n \t" /* odd 3 */
812 "dpa.w.ph $ac2, %[p5], %[filter78] \n \t" /* odd 3 */
813 "extp %[Temp2], $ac2, 31 \n \t" /* odd 3 */
814 "lbux %[st1], %[Temp1](%[cm]) \n \t" /* odd 2 */
815
816 /* odd 4. pixel */
817 "mtlo %[vector_64], $ac1 \n \t" /* odd 5 */
818 "mthi $zero, $ac1 \n \t"
819 "preceu.ph.qbl %[p3], %[qload1] \n \t"
820 "sb %[st1], 0(%[odd_dst]) \n \t" /* odd 2 */
821 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
822 "ulw %[qload2], 17(%[src]) \n \t"
823 "dpa.w.ph $ac3, %[p4], %[filter12] \n \t" /* odd 4 */
824 "dpa.w.ph $ac3, %[p1], %[filter34] \n \t" /* odd 4 */
825 "dpa.w.ph $ac3, %[p5], %[filter56] \n \t" /* odd 4 */
826 "dpa.w.ph $ac3, %[p2], %[filter78] \n \t" /* odd 4 */
827 "extp %[Temp3], $ac3, 31 \n \t" /* odd 4 */
828 "lbux %[st2], %[Temp2](%[cm]) \n \t" /* odd 3 */
829
830 /* odd 5. pixel */
831 "mtlo %[vector_64], $ac2 \n \t" /* odd 6 */
832 "mthi $zero, $ac2 \n \t"
833 "preceu.ph.qbr %[p4], %[qload2] \n \t"
834 "sb %[st2], 0(%[odd_dst]) \n \t" /* odd 3 */
835 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
836 "dpa.w.ph $ac1, %[p1], %[filter12] \n \t" /* odd 5 */
837 "dpa.w.ph $ac1, %[p5], %[filter34] \n \t" /* odd 5 */
838 "dpa.w.ph $ac1, %[p2], %[filter56] \n \t" /* odd 5 */
839 "dpa.w.ph $ac1, %[p3], %[filter78] \n \t" /* odd 5 */
840 "extp %[Temp1], $ac1, 31 \n \t" /* odd 5 */
841 "lbux %[st3], %[Temp3](%[cm]) \n \t" /* odd 4 */
842
843 /* odd 6. pixel */
844 "mtlo %[vector_64], $ac3 \n \t" /* odd 7 */
845 "mthi $zero, $ac3 \n \t"
846 "preceu.ph.qbl %[p1], %[qload2] \n \t"
847 "sb %[st3], 0(%[odd_dst]) \n \t" /* odd 4 */
848 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
849 "ulw %[qload1], 21(%[src]) \n \t"
850 "dpa.w.ph $ac2, %[p5], %[filter12] \n \t" /* odd 6 */
851 "dpa.w.ph $ac2, %[p2], %[filter34] \n \t" /* odd 6 */
852 "dpa.w.ph $ac2, %[p3], %[filter56] \n \t" /* odd 6 */
853 "dpa.w.ph $ac2, %[p4], %[filter78] \n \t" /* odd 6 */
854 "extp %[Temp2], $ac2, 31 \n \t" /* odd 6 */
855 "lbux %[st1], %[Temp1](%[cm]) \n \t" /* odd 5 */
856
857 /* odd 7. pixel */
858 "mtlo %[vector_64], $ac1 \n \t" /* odd 8 */
859 "mthi $zero, $ac1 \n \t"
860 "preceu.ph.qbr %[p5], %[qload1] \n \t"
861 "sb %[st1], 0(%[odd_dst]) \n \t" /* odd 5 */
862 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
863 "dpa.w.ph $ac3, %[p2], %[filter12] \n \t" /* odd 7 */
864 "dpa.w.ph $ac3, %[p3], %[filter34] \n \t" /* odd 7 */
865 "dpa.w.ph $ac3, %[p4], %[filter56] \n \t" /* odd 7 */
866 "dpa.w.ph $ac3, %[p1], %[filter78] \n \t" /* odd 7 */
867 "extp %[Temp3], $ac3, 31 \n \t" /* odd 7 */
868
869 /* odd 8. pixel */
870 "dpa.w.ph $ac1, %[p3], %[filter12] \n \t" /* odd 8 */
871 "dpa.w.ph $ac1, %[p4], %[filter34] \n \t" /* odd 8 */
872 "dpa.w.ph $ac1, %[p1], %[filter56] \n \t" /* odd 8 */
873 "dpa.w.ph $ac1, %[p5], %[filter78] \n \t" /* odd 8 */
874 "extp %[Temp1], $ac1, 31 \n \t" /* odd 8 */
875
876 "lbux %[st2], %[Temp2](%[cm]) \n \t" /* odd 6 */
877 "lbux %[st3], %[Temp3](%[cm]) \n \t" /* odd 7 */
878 "lbux %[st1], %[Temp1](%[cm]) \n \t" /* odd 8 */
879
880 "sb %[st2], 0(%[odd_dst]) \n \t" /* odd 6 */
881 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
882
883 "sb %[st3], 0(%[odd_dst]) \n \t" /* odd 7 */
884 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n \t"
885
886 "sb %[st1], 0(%[odd_dst]) \n \t" /* odd 8 */
887
888 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
889 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
890 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
891 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
892 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
893 : [filter12] "r" (filter12), [filter34] "r" (filter34),
894 [filter56] "r" (filter56), [filter78] "r" (filter78),
895 [vector_64] "r" (vector_64), [cm] "r" (cm),
896 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
897 );
898
899 src += 16;
900 dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
901 odd_dst = (dst + dst_stride);
902 }
903
904 /* Next row... */
905 src_ptr += src_stride;
906
907 dst_ptr += 1;
908 }
909 }
910
911 void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
912 uint8_t *dst, ptrdiff_t dst_stride,
913 const int16_t *filter, int w, int h) {
914 int x, y, k;
915
916 for (y = 0; y < h; ++y) {
917 for (x = 0; x < w; ++x) {
918 int sum = 0;
919
920 for (k = 0; k < 8; ++k)
921 sum += src[x + k] * filter[k];
922
923 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
924 }
925
926 src += src_stride;
927 dst += 1;
928 }
929 }
930
/* Copy a w x h block from src into dst with a transpose: row r of the
 * source becomes column r of the destination.  Used when a filter pass
 * degenerates to the identity filter. */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           int w, int h) {
  int row;

  for (row = 0; row < h; ++row) {
    const uint8_t *in = src + row * src_stride;
    uint8_t *out = dst + row;
    int col;

    for (col = 0; col < w; ++col) {
      out[col * dst_stride] = in[col];
    }
  }
}
945
/* 8-tap separable convolution (DSPR2).
 *
 * Two-pass strategy: the horizontal filter writes into `temp` TRANSPOSED
 * (stride = intermediate_height), so the second pass can reuse the same
 * horizontal kernels to apply the vertical filter while transposing the
 * result back into dst.  Only unit steps (x_step_q4 == y_step_q4 == 16)
 * are accelerated; anything else falls back to the generic C version.
 */
void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  /* Transposed intermediate buffer: up to a 64-wide block plus 7 rows of
   * vertical-filter margin (64 * 135 bytes). */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
  uint32_t pos = 38;

  /* bit position for extract from acc: wrdsp with mask 1 updates only the
   * DSPControl pos field consumed by the extp instructions in the kernels */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  if (intermediate_height < h)
    intermediate_height = h;

  /* Scaled (non-unit step) convolutions are not accelerated here. */
  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vpx_convolve8_c(src, src_stride,
                           dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);

  /* Both filters look like the identity (center tap 0x80 == 128, neighbor
   * tap 0), so the whole convolve degenerates to a copy.
   * NOTE(review): this reads the int16 taps through an int32 pointer —
   * aliasing- and endianness-dependent; 0x800000 corresponds to taps
   * {0, 0x80} on little-endian.  Confirm intended on big-endian targets. */
  if ((((const int32_t *)filter_x)[1] == 0x800000)
      && (((const int32_t *)filter_y)[1] == 0x800000))
    return vpx_convolve_copy(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);

  /* Horizontal pass: src (backed up 3 rows for the vertical-filter margin)
   * into the transposed temp buffer. */
  if (filter_x[3] == 0x80) {
    /* Identity horizontal filter: plain transposed copy. */
    copy_horiz_transposed(src - src_stride * 3, src_stride,
                          temp, intermediate_height,
                          w, intermediate_height);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    /* Taps 0 and 1 are zero — presumably a short (bilinear-style) filter
     * handled by the 2-tap kernel.  TODO(review): confirm this check is a
     * sufficient discriminator for vpx_convolve2_dspr2. */
    vpx_convolve2_dspr2(src - src_stride * 3, src_stride,
                        temp, intermediate_height,
                        filter_x,
                        w, intermediate_height);
  } else {
    /* Full 8-tap path: back src up 3 rows and 3 columns of filter margin. */
    src -= (src_stride * 3 + 3);

    /* prefetch data to cache memory */
    prefetch_load(src);
    prefetch_load(src + 32);

    /* Width-specialized transposed horizontal kernels. */
    switch (w) {
      case 4:
        convolve_horiz_4_transposed_dspr2(src, src_stride,
                                          temp, intermediate_height,
                                          filter_x, intermediate_height);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(src, src_stride,
                                          temp, intermediate_height,
                                          filter_x, intermediate_height);
        break;
      case 16:
      case 32:
        /* Same kernel handles 16 and 32 via a horizontal repeat count. */
        convolve_horiz_16_transposed_dspr2(src, src_stride,
                                           temp, intermediate_height,
                                           filter_x, intermediate_height,
                                           (w/16));
        break;
      case 64:
        prefetch_load(src + 32);
        convolve_horiz_64_transposed_dspr2(src, src_stride,
                                           temp, intermediate_height,
                                           filter_x, intermediate_height);
        break;
      default:
        convolve_horiz_transposed(src, src_stride,
                                  temp, intermediate_height,
                                  filter_x, w, intermediate_height);
        break;
    }
  }

  /* Vertical pass: reuse the horizontal kernels on the transposed temp
   * buffer, transposing the result back into dst (w and h swap roles).
   * The identity/2-tap paths index from temp + 3 (the filter center);
   * the 8-tap kernels consume the full margin starting at temp. */
  if (filter_y[3] == 0x80) {
    copy_horiz_transposed(temp + 3, intermediate_height,
                          dst, dst_stride,
                          h, w);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve2_dspr2(temp + 3, intermediate_height,
                        dst, dst_stride,
                        filter_y,
                        h, w);
  } else {
    switch (h) {
      case 4:
        convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
                                          dst, dst_stride,
                                          filter_y, w);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
                                          dst, dst_stride,
                                          filter_y, w);
        break;
      case 16:
      case 32:
        convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
                                           dst, dst_stride,
                                           filter_y, w, (h/16));
        break;
      case 64:
        convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
                                           dst, dst_stride,
                                           filter_y, w);
        break;
      default:
        convolve_horiz_transposed(temp, intermediate_height,
                                  dst, dst_stride,
                                  filter_y, h, w);
        break;
    }
  }
}
1070
/* Pure block copy for the common convolve interface (no filtering).
 *
 * The filter_x/filter_y pointers and their stride arguments are part of
 * the shared convolve prototype and are ignored here.  Each supported
 * width copies whole rows with unaligned word loads (ulw) and word
 * stores (sw), prefetching one row ahead of the copy.
 * NOTE(review): rows are stored with `sw`, so dst appears assumed to be
 * 4-byte aligned for w >= 4 — confirm against callers.
 */
void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      {
        uint32_t tp1;

        /* 1 word storage */
        for (y = h; y--; ) {
          /* Prefetch the next row while this one is copied. */
          prefetch_load(src + src_stride);
          prefetch_load(src + src_stride + 32);
          prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         (%[src])      \n\t"
              "sw               %[tp1],         (%[dst])      \n\t"  /* store */

              : [tp1] "=&r" (tp1)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 8:
      {
        uint32_t tp1, tp2;

        /* 2 word storage */
        for (y = h; y--; ) {
          prefetch_load(src + src_stride);
          prefetch_load(src + src_stride + 32);
          prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 16:
      {
        uint32_t tp1, tp2, tp3, tp4;

        /* 4 word storage */
        for (y = h; y--; ) {
          prefetch_load(src + src_stride);
          prefetch_load(src + src_stride + 32);
          prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 32:
      {
        uint32_t tp1, tp2, tp3, tp4;
        uint32_t tp5, tp6, tp7, tp8;

        /* 8 word storage */
        for (y = h; y--; ) {
          prefetch_load(src + src_stride);
          prefetch_load(src + src_stride + 32);
          prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"
              "ulw              %[tp5],         16(%[src])     \n\t"
              "ulw              %[tp6],         20(%[src])     \n\t"
              "ulw              %[tp7],         24(%[src])     \n\t"
              "ulw              %[tp8],         28(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
                [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
                [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 64:
      {
        uint32_t tp1, tp2, tp3, tp4;
        uint32_t tp5, tp6, tp7, tp8;

        /* Wider rows need an extra prefetch per cache line. */
        prefetch_load(src + 64);
        prefetch_store(dst + 32);

        /* 16 word storage */
        for (y = h; y--; ) {
          prefetch_load(src + src_stride);
          prefetch_load(src + src_stride + 32);
          prefetch_load(src + src_stride + 64);
          prefetch_store(dst + dst_stride);
          prefetch_store(dst + dst_stride + 32);

          /* Copy the 64-byte row in two 32-byte halves, reusing the same
           * eight temporaries. */
          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"
              "ulw              %[tp5],         16(%[src])     \n\t"
              "ulw              %[tp6],         20(%[src])     \n\t"
              "ulw              %[tp7],         24(%[src])     \n\t"
              "ulw              %[tp8],         28(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

              "ulw              %[tp1],         32(%[src])     \n\t"
              "ulw              %[tp2],         36(%[src])     \n\t"
              "ulw              %[tp3],         40(%[src])     \n\t"
              "ulw              %[tp4],         44(%[src])     \n\t"
              "ulw              %[tp5],         48(%[src])     \n\t"
              "ulw              %[tp6],         52(%[src])     \n\t"
              "ulw              %[tp7],         56(%[src])     \n\t"
              "ulw              %[tp8],         60(%[src])     \n\t"

              "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
              "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
              "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
              "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         60(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
                [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
                [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    default:
      /* Arbitrary widths: plain byte-by-byte C copy. */
      for (y = h; y--; ) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
1282 #endif
OLDNEW
« no previous file with comments | « source/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c ('k') | source/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_dspr2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698