Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(118)

Side by Side Diff: source/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_dspr2.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/vpx_common_dspr2.h"
16 #include "vpx_dsp/vpx_convolve.h"
17 #include "vpx_dsp/vpx_dsp_common.h"
18 #include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
21 static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
22 int32_t src_stride,
23 uint8_t *dst,
24 int32_t dst_stride,
25 const int16_t *filter_x0,
26 int32_t h) {
27 int32_t y;
28 uint8_t *cm = vpx_ff_cropTbl;
29 int32_t vector1b, vector2b, vector3b, vector4b;
30 int32_t Temp1, Temp2, Temp3, Temp4;
31 uint32_t vector4a = 64;
32 uint32_t tp1, tp2;
33 uint32_t p1, p2, p3, p4;
34 uint32_t n1, n2, n3, n4;
35 uint32_t tn1, tn2;
36
37 vector1b = ((const int32_t *)filter_x0)[0];
38 vector2b = ((const int32_t *)filter_x0)[1];
39 vector3b = ((const int32_t *)filter_x0)[2];
40 vector4b = ((const int32_t *)filter_x0)[3];
41
42 for (y = h; y--;) {
43 /* prefetch data to cache memory */
44 prefetch_load(src + src_stride);
45 prefetch_load(src + src_stride + 32);
46 prefetch_store(dst + dst_stride);
47
48 __asm__ __volatile__ (
49 "ulw %[tp1], 0(%[src]) \n\t"
50 "ulw %[tp2], 4(%[src]) \n\t"
51
52 /* even 1. pixel */
53 "mtlo %[vector4a], $ac3 \n\t"
54 "mthi $zero, $ac3 \n\t"
55 "preceu.ph.qbr %[p1], %[tp1] \n\t"
56 "preceu.ph.qbl %[p2], %[tp1] \n\t"
57 "preceu.ph.qbr %[p3], %[tp2] \n\t"
58 "preceu.ph.qbl %[p4], %[tp2] \n\t"
59 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
60 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
61 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
62 "ulw %[tn2], 8(%[src]) \n\t"
63 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
64 "extp %[Temp1], $ac3, 31 \n\t"
65
66 /* even 2. pixel */
67 "mtlo %[vector4a], $ac2 \n\t"
68 "mthi $zero, $ac2 \n\t"
69 "preceu.ph.qbr %[p1], %[tn2] \n\t"
70 "balign %[tn1], %[tn2], 3 \n\t"
71 "balign %[tn2], %[tp2], 3 \n\t"
72 "balign %[tp2], %[tp1], 3 \n\t"
73 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
74 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
75 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
76 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
77 "extp %[Temp3], $ac2, 31 \n\t"
78
79 "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
80
81 /* odd 1. pixel */
82 "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
83 "mtlo %[vector4a], $ac3 \n\t"
84 "mthi $zero, $ac3 \n\t"
85 "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
86 "preceu.ph.qbr %[n1], %[tp2] \n\t"
87 "preceu.ph.qbl %[n2], %[tp2] \n\t"
88 "preceu.ph.qbr %[n3], %[tn2] \n\t"
89 "preceu.ph.qbl %[n4], %[tn2] \n\t"
90 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
91 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
92 "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
93 "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
94 "extp %[Temp2], $ac3, 31 \n\t"
95
96 "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
97
98 /* odd 2. pixel */
99 "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
100 "mtlo %[vector4a], $ac2 \n\t"
101 "mthi $zero, $ac2 \n\t"
102 "preceu.ph.qbr %[n1], %[tn1] \n\t"
103 "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
104 "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
105 "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
106 "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
107 "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
108 "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
109 "extp %[Temp4], $ac2, 31 \n\t"
110
111 "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
112 "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
113
114 /* clamp */
115 "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
116 "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */
117 "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
118
119 "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
120 "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
121
122 "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */
123 "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
124
125 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
126 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
127 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
128 [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
129 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
130 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
131 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
132 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
133 [vector4a] "r" (vector4a),
134 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
135 );
136
137 /* Next row... */
138 src += src_stride;
139 dst += dst_stride;
140 }
141 }
142
143 static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
144 int32_t src_stride,
145 uint8_t *dst,
146 int32_t dst_stride,
147 const int16_t *filter_x0,
148 int32_t h) {
149 int32_t y;
150 uint8_t *cm = vpx_ff_cropTbl;
151 uint32_t vector4a = 64;
152 int32_t vector1b, vector2b, vector3b, vector4b;
153 int32_t Temp1, Temp2, Temp3;
154 uint32_t tp1, tp2;
155 uint32_t p1, p2, p3, p4, n1;
156 uint32_t tn1, tn2, tn3;
157 uint32_t st0, st1;
158
159 vector1b = ((const int32_t *)filter_x0)[0];
160 vector2b = ((const int32_t *)filter_x0)[1];
161 vector3b = ((const int32_t *)filter_x0)[2];
162 vector4b = ((const int32_t *)filter_x0)[3];
163
164 for (y = h; y--;) {
165 /* prefetch data to cache memory */
166 prefetch_load(src + src_stride);
167 prefetch_load(src + src_stride + 32);
168 prefetch_store(dst + dst_stride);
169
170 __asm__ __volatile__ (
171 "ulw %[tp1], 0(%[src]) \n\t"
172 "ulw %[tp2], 4(%[src]) \n\t"
173
174 /* even 1. pixel */
175 "mtlo %[vector4a], $ac3 \n\t"
176 "mthi $zero, $ac3 \n\t"
177 "mtlo %[vector4a], $ac2 \n\t"
178 "mthi $zero, $ac2 \n\t"
179 "preceu.ph.qbr %[p1], %[tp1] \n\t"
180 "preceu.ph.qbl %[p2], %[tp1] \n\t"
181 "preceu.ph.qbr %[p3], %[tp2] \n\t"
182 "preceu.ph.qbl %[p4], %[tp2] \n\t"
183 "ulw %[tn2], 8(%[src]) \n\t"
184 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
185 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
186 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
187 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
188 "extp %[Temp1], $ac3, 31 \n\t"
189 "lbu %[Temp2], 0(%[dst]) \n\t"
190 "lbu %[tn3], 2(%[dst]) \n\t"
191
192 /* even 2. pixel */
193 "preceu.ph.qbr %[p1], %[tn2] \n\t"
194 "preceu.ph.qbl %[n1], %[tn2] \n\t"
195 "ulw %[tn1], 12(%[src]) \n\t"
196 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
197 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
198 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
199 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
200 "extp %[Temp3], $ac2, 31 \n\t"
201
202 /* even 3. pixel */
203 "lbux %[st0], %[Temp1](%[cm]) \n\t"
204 "mtlo %[vector4a], $ac1 \n\t"
205 "mthi $zero, $ac1 \n\t"
206 "preceu.ph.qbr %[p2], %[tn1] \n\t"
207 "lbux %[st1], %[Temp3](%[cm]) \n\t"
208 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
209 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
210 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
211 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
212 "extp %[Temp1], $ac1, 31 \n\t"
213
214 "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
215 "addqh_r.w %[tn3], %[tn3], %[st1] \n\t"
216 "sb %[Temp2], 0(%[dst]) \n\t"
217 "sb %[tn3], 2(%[dst]) \n\t"
218
219 /* even 4. pixel */
220 "mtlo %[vector4a], $ac2 \n\t"
221 "mthi $zero, $ac2 \n\t"
222 "mtlo %[vector4a], $ac3 \n\t"
223 "mthi $zero, $ac3 \n\t"
224
225 "balign %[tn3], %[tn1], 3 \n\t"
226 "balign %[tn1], %[tn2], 3 \n\t"
227 "balign %[tn2], %[tp2], 3 \n\t"
228 "balign %[tp2], %[tp1], 3 \n\t"
229
230 "lbux %[st0], %[Temp1](%[cm]) \n\t"
231 "lbu %[Temp2], 4(%[dst]) \n\t"
232 "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
233
234 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
235 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
236 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
237 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
238 "extp %[Temp3], $ac2, 31 \n\t"
239
240 /* odd 1. pixel */
241 "mtlo %[vector4a], $ac1 \n\t"
242 "mthi $zero, $ac1 \n\t"
243 "sb %[Temp2], 4(%[dst]) \n\t"
244 "preceu.ph.qbr %[p1], %[tp2] \n\t"
245 "preceu.ph.qbl %[p2], %[tp2] \n\t"
246 "preceu.ph.qbr %[p3], %[tn2] \n\t"
247 "preceu.ph.qbl %[p4], %[tn2] \n\t"
248 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
249 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
250 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
251 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
252 "extp %[Temp2], $ac3, 31 \n\t"
253
254 "lbu %[tp1], 6(%[dst]) \n\t"
255
256 /* odd 2. pixel */
257 "mtlo %[vector4a], $ac3 \n\t"
258 "mthi $zero, $ac3 \n\t"
259 "mtlo %[vector4a], $ac2 \n\t"
260 "mthi $zero, $ac2 \n\t"
261 "preceu.ph.qbr %[p1], %[tn1] \n\t"
262 "preceu.ph.qbl %[n1], %[tn1] \n\t"
263 "lbux %[st0], %[Temp3](%[cm]) \n\t"
264 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
265 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
266 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
267 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
268 "extp %[Temp3], $ac1, 31 \n\t"
269
270 "lbu %[tp2], 1(%[dst]) \n\t"
271 "lbu %[tn2], 3(%[dst]) \n\t"
272 "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
273
274 /* odd 3. pixel */
275 "lbux %[st1], %[Temp2](%[cm]) \n\t"
276 "preceu.ph.qbr %[p2], %[tn3] \n\t"
277 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
278 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
279 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
280 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
281 "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
282 "extp %[Temp2], $ac3, 31 \n\t"
283
284 "lbu %[tn3], 5(%[dst]) \n\t"
285
286 /* odd 4. pixel */
287 "sb %[tp2], 1(%[dst]) \n\t"
288 "sb %[tp1], 6(%[dst]) \n\t"
289 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
290 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
291 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
292 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
293 "extp %[Temp1], $ac2, 31 \n\t"
294
295 "lbu %[tn1], 7(%[dst]) \n\t"
296
297 /* clamp */
298 "lbux %[p4], %[Temp3](%[cm]) \n\t"
299 "addqh_r.w %[tn2], %[tn2], %[p4] \n\t"
300
301 "lbux %[p2], %[Temp2](%[cm]) \n\t"
302 "addqh_r.w %[tn3], %[tn3], %[p2] \n\t"
303
304 "lbux %[n1], %[Temp1](%[cm]) \n\t"
305 "addqh_r.w %[tn1], %[tn1], %[n1] \n\t"
306
307 /* store bytes */
308 "sb %[tn2], 3(%[dst]) \n\t"
309 "sb %[tn3], 5(%[dst]) \n\t"
310 "sb %[tn1], 7(%[dst]) \n\t"
311
312 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
313 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
314 [st0] "=&r" (st0), [st1] "=&r" (st1),
315 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
316 [n1] "=&r" (n1),
317 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
318 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
319 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
320 [vector4a] "r" (vector4a),
321 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
322 );
323
324 /* Next row... */
325 src += src_stride;
326 dst += dst_stride;
327 }
328 }
329
330 static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
331 int32_t src_stride,
332 uint8_t *dst_ptr,
333 int32_t dst_stride,
334 const int16_t *filter_x0,
335 int32_t h,
336 int32_t count) {
337 int32_t y, c;
338 const uint8_t *src;
339 uint8_t *dst;
340 uint8_t *cm = vpx_ff_cropTbl;
341 uint32_t vector_64 = 64;
342 int32_t filter12, filter34, filter56, filter78;
343 int32_t Temp1, Temp2, Temp3;
344 uint32_t qload1, qload2, qload3;
345 uint32_t p1, p2, p3, p4, p5;
346 uint32_t st1, st2, st3;
347
348 filter12 = ((const int32_t *)filter_x0)[0];
349 filter34 = ((const int32_t *)filter_x0)[1];
350 filter56 = ((const int32_t *)filter_x0)[2];
351 filter78 = ((const int32_t *)filter_x0)[3];
352
353 for (y = h; y--;) {
354 src = src_ptr;
355 dst = dst_ptr;
356
357 /* prefetch data to cache memory */
358 prefetch_load(src_ptr + src_stride);
359 prefetch_load(src_ptr + src_stride + 32);
360 prefetch_store(dst_ptr + dst_stride);
361
362 for (c = 0; c < count; c++) {
363 __asm__ __volatile__ (
364 "ulw %[qload1], 0(%[src]) \n\t"
365 "ulw %[qload2], 4(%[src]) \n\t"
366
367 /* even 1. pixel */
368 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
369 "mthi $zero, $ac1 \n\t"
370 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
371 "mthi $zero, $ac2 \n\t"
372 "preceu.ph.qbr %[p1], %[qload1] \n\t"
373 "preceu.ph.qbl %[p2], %[qload1] \n\t"
374 "preceu.ph.qbr %[p3], %[qload2] \n\t"
375 "preceu.ph.qbl %[p4], %[qload2] \n\t"
376 "ulw %[qload3], 8(%[src]) \n\t"
377 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
378 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
379 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
380 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
381 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
382 "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
383
384 /* even 2. pixel */
385 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
386 "mthi $zero, $ac3 \n\t"
387 "preceu.ph.qbr %[p1], %[qload3] \n\t"
388 "preceu.ph.qbl %[p5], %[qload3] \n\t"
389 "ulw %[qload1], 12(%[src]) \n\t"
390 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
391 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
392 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
393 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
394 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
395 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
396
397 "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
398
399 /* even 3. pixel */
400 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
401 "mthi $zero, $ac1 \n\t"
402 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
403 "preceu.ph.qbr %[p2], %[qload1] \n\t"
404 "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
405 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
406 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
407 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
408 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
409 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
410 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
411
412 /* even 4. pixel */
413 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
414 "mthi $zero, $ac2 \n\t"
415 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
416 "preceu.ph.qbl %[p3], %[qload1] \n\t"
417 "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
418 "ulw %[qload2], 16(%[src]) \n\t"
419 "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
420 "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
421 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
422 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
423 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
424 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
425 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
426 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
427
428 /* even 5. pixel */
429 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
430 "mthi $zero, $ac3 \n\t"
431 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
432 "preceu.ph.qbr %[p4], %[qload2] \n\t"
433 "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
434 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
435 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
436 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
437 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
438 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
439 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
440
441 /* even 6. pixel */
442 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
443 "mthi $zero, $ac1 \n\t"
444 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
445 "preceu.ph.qbl %[p1], %[qload2] \n\t"
446 "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
447 "ulw %[qload3], 20(%[src]) \n\t"
448 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
449 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
450 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
451 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
452 "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
453 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
454 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
455
456 /* even 7. pixel */
457 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
458 "mthi $zero, $ac2 \n\t"
459 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
460 "preceu.ph.qbr %[p5], %[qload3] \n\t"
461 "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
462 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
463 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
464 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
465 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
466 "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
467 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
468 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
469
470 "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
471
472 /* even 8. pixel */
473 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
474 "mthi $zero, $ac3 \n\t"
475 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
476 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
477 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
478 "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
479 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
480 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
481 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
482 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
483
484 /* ODD pixels */
485 "ulw %[qload1], 1(%[src]) \n\t"
486 "ulw %[qload2], 5(%[src]) \n\t"
487
488 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
489
490 /* odd 1. pixel */
491 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
492 "mthi $zero, $ac1 \n\t"
493 "preceu.ph.qbr %[p1], %[qload1] \n\t"
494 "preceu.ph.qbl %[p2], %[qload1] \n\t"
495 "preceu.ph.qbr %[p3], %[qload2] \n\t"
496 "preceu.ph.qbl %[p4], %[qload2] \n\t"
497 "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
498 "ulw %[qload3], 9(%[src]) \n\t"
499 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
500 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
501 "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
502 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
503 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
504 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
505 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
506
507 "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
508
509 /* odd 2. pixel */
510 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
511 "mthi $zero, $ac2 \n\t"
512 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
513 "preceu.ph.qbr %[p1], %[qload3] \n\t"
514 "preceu.ph.qbl %[p5], %[qload3] \n\t"
515 "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
516 "ulw %[qload1], 13(%[src]) \n\t"
517 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
518 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
519 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
520 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
521 "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
522 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
523 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
524
525 /* odd 3. pixel */
526 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
527 "mthi $zero, $ac3 \n\t"
528 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
529 "preceu.ph.qbr %[p2], %[qload1] \n\t"
530 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
531 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
532 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
533 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
534 "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
535 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
536 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
537
538 /* odd 4. pixel */
539 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
540 "mthi $zero, $ac1 \n\t"
541 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
542 "preceu.ph.qbl %[p3], %[qload1] \n\t"
543 "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
544 "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
545 "ulw %[qload2], 17(%[src]) \n\t"
546 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
547 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
548 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
549 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
550 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
551 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
552
553 "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
554
555 /* odd 5. pixel */
556 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
557 "mthi $zero, $ac2 \n\t"
558 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
559 "preceu.ph.qbr %[p4], %[qload2] \n\t"
560 "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
561 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
562 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
563 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
564 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
565 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
566 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
567
568 "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
569
570 /* odd 6. pixel */
571 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
572 "mthi $zero, $ac3 \n\t"
573 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
574 "preceu.ph.qbl %[p1], %[qload2] \n\t"
575 "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
576 "ulw %[qload3], 21(%[src]) \n\t"
577 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
578 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
579 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
580 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
581 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
582 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
583
584 /* odd 7. pixel */
585 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
586 "mthi $zero, $ac1 \n\t"
587 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
588 "preceu.ph.qbr %[p5], %[qload3] \n\t"
589 "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
590 "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
591 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
592 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
593 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
594 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
595 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
596
597 "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
598
599 /* odd 8. pixel */
600 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
601 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
602 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
603 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
604 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
605
606 "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
607
608 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
609 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
610
611 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
612 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
613
614 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
615 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
616
617 "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
618 "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
619 "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
620
621 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
622 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
623 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
624 [qload3] "=&r" (qload3), [p5] "=&r" (p5),
625 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
626 : [filter12] "r" (filter12), [filter34] "r" (filter34),
627 [filter56] "r" (filter56), [filter78] "r" (filter78),
628 [vector_64] "r" (vector_64),
629 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
630 );
631
632 src += 16;
633 dst += 16;
634 }
635
636 /* Next row... */
637 src_ptr += src_stride;
638 dst_ptr += dst_stride;
639 }
640 }
641
642 static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
643 int32_t src_stride,
644 uint8_t *dst_ptr,
645 int32_t dst_stride,
646 const int16_t *filter_x0,
647 int32_t h) {
648 int32_t y, c;
649 const uint8_t *src;
650 uint8_t *dst;
651 uint8_t *cm = vpx_ff_cropTbl;
652 uint32_t vector_64 = 64;
653 int32_t filter12, filter34, filter56, filter78;
654 int32_t Temp1, Temp2, Temp3;
655 uint32_t qload1, qload2, qload3;
656 uint32_t p1, p2, p3, p4, p5;
657 uint32_t st1, st2, st3;
658
659 filter12 = ((const int32_t *)filter_x0)[0];
660 filter34 = ((const int32_t *)filter_x0)[1];
661 filter56 = ((const int32_t *)filter_x0)[2];
662 filter78 = ((const int32_t *)filter_x0)[3];
663
664 for (y = h; y--;) {
665 src = src_ptr;
666 dst = dst_ptr;
667
668 /* prefetch data to cache memory */
669 prefetch_load(src_ptr + src_stride);
670 prefetch_load(src_ptr + src_stride + 32);
671 prefetch_load(src_ptr + src_stride + 64);
672 prefetch_store(dst_ptr + dst_stride);
673 prefetch_store(dst_ptr + dst_stride + 32);
674
675 for (c = 0; c < 4; c++) {
676 __asm__ __volatile__ (
677 "ulw %[qload1], 0(%[src]) \n\t"
678 "ulw %[qload2], 4(%[src]) \n\t"
679
680 /* even 1. pixel */
681 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
682 "mthi $zero, $ac1 \n\t"
683 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
684 "mthi $zero, $ac2 \n\t"
685 "preceu.ph.qbr %[p1], %[qload1] \n\t"
686 "preceu.ph.qbl %[p2], %[qload1] \n\t"
687 "preceu.ph.qbr %[p3], %[qload2] \n\t"
688 "preceu.ph.qbl %[p4], %[qload2] \n\t"
689 "ulw %[qload3], 8(%[src]) \n\t"
690 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
691 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
692 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
693 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
694 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
695 "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
696
697 /* even 2. pixel */
698 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
699 "mthi $zero, $ac3 \n\t"
700 "preceu.ph.qbr %[p1], %[qload3] \n\t"
701 "preceu.ph.qbl %[p5], %[qload3] \n\t"
702 "ulw %[qload1], 12(%[src]) \n\t"
703 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
704 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
705 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
706 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
707 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
708 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
709
710 "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
711
712 /* even 3. pixel */
713 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
714 "mthi $zero, $ac1 \n\t"
715 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
716 "preceu.ph.qbr %[p2], %[qload1] \n\t"
717 "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
718 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
719 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
720 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
721 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
722 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
723 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
724
725 /* even 4. pixel */
726 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
727 "mthi $zero, $ac2 \n\t"
728 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
729 "preceu.ph.qbl %[p3], %[qload1] \n\t"
730 "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
731 "ulw %[qload2], 16(%[src]) \n\t"
732 "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
733 "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
734 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
735 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
736 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
737 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
738 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
739 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
740
741 /* even 5. pixel */
742 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
743 "mthi $zero, $ac3 \n\t"
744 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
745 "preceu.ph.qbr %[p4], %[qload2] \n\t"
746 "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
747 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
748 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
749 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
750 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
751 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
752 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
753
754 /* even 6. pixel */
755 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
756 "mthi $zero, $ac1 \n\t"
757 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
758 "preceu.ph.qbl %[p1], %[qload2] \n\t"
759 "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
760 "ulw %[qload3], 20(%[src]) \n\t"
761 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
762 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
763 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
764 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
765 "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
766 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
767 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
768
769 /* even 7. pixel */
770 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
771 "mthi $zero, $ac2 \n\t"
772 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
773 "preceu.ph.qbr %[p5], %[qload3] \n\t"
774 "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
775 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
776 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
777 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
778 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
779 "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
780 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
781 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
782
783 "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
784
785 /* even 8. pixel */
786 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
787 "mthi $zero, $ac3 \n\t"
788 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
789 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
790 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
791 "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
792 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
793 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
794 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
795 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
796
797 /* ODD pixels */
798 "ulw %[qload1], 1(%[src]) \n\t"
799 "ulw %[qload2], 5(%[src]) \n\t"
800
801 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
802
803 /* odd 1. pixel */
804 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
805 "mthi $zero, $ac1 \n\t"
806 "preceu.ph.qbr %[p1], %[qload1] \n\t"
807 "preceu.ph.qbl %[p2], %[qload1] \n\t"
808 "preceu.ph.qbr %[p3], %[qload2] \n\t"
809 "preceu.ph.qbl %[p4], %[qload2] \n\t"
810 "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
811 "ulw %[qload3], 9(%[src]) \n\t"
812 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
813 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
814 "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
815 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
816 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
817 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
818 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
819
820 "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
821
822 /* odd 2. pixel */
823 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
824 "mthi $zero, $ac2 \n\t"
825 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
826 "preceu.ph.qbr %[p1], %[qload3] \n\t"
827 "preceu.ph.qbl %[p5], %[qload3] \n\t"
828 "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
829 "ulw %[qload1], 13(%[src]) \n\t"
830 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
831 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
832 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
833 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
834 "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
835 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
836 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
837
838 /* odd 3. pixel */
839 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
840 "mthi $zero, $ac3 \n\t"
841 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
842 "preceu.ph.qbr %[p2], %[qload1] \n\t"
843 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
844 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
845 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
846 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
847 "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
848 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
849 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
850
851 /* odd 4. pixel */
852 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
853 "mthi $zero, $ac1 \n\t"
854 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
855 "preceu.ph.qbl %[p3], %[qload1] \n\t"
856 "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
857 "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
858 "ulw %[qload2], 17(%[src]) \n\t"
859 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
860 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
861 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
862 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
863 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
864 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
865
866 "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
867
868 /* odd 5. pixel */
869 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
870 "mthi $zero, $ac2 \n\t"
871 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
872 "preceu.ph.qbr %[p4], %[qload2] \n\t"
873 "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
874 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
875 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
876 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
877 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
878 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
879 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
880
881 "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
882
883 /* odd 6. pixel */
884 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
885 "mthi $zero, $ac3 \n\t"
886 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
887 "preceu.ph.qbl %[p1], %[qload2] \n\t"
888 "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
889 "ulw %[qload3], 21(%[src]) \n\t"
890 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
891 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
892 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
893 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
894 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
895 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
896
897 /* odd 7. pixel */
898 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
899 "mthi $zero, $ac1 \n\t"
900 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
901 "preceu.ph.qbr %[p5], %[qload3] \n\t"
902 "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
903 "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
904 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
905 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
906 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
907 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
908 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
909
910 "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
911
912 /* odd 8. pixel */
913 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
914 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
915 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
916 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
917 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
918
919 "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
920
921 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
922 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
923
924 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
925 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
926
927 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
928 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
929
930 "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
931 "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
932 "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
933
934 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
935 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
936 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
937 [qload3] "=&r" (qload3), [p5] "=&r" (p5),
938 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
939 : [filter12] "r" (filter12), [filter34] "r" (filter34),
940 [filter56] "r" (filter56), [filter78] "r" (filter78),
941 [vector_64] "r" (vector_64),
942 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
943 );
944
945 src += 16;
946 dst += 16;
947 }
948
949 /* Next row... */
950 src_ptr += src_stride;
951 dst_ptr += dst_stride;
952 }
953 }
954
/* Horizontal 8-tap convolution with averaging into dst, DSPr2-accelerated.
 *
 * Dispatcher: inspects the horizontal filter and block width, then routes to
 * the fastest applicable kernel, falling back to the portable C version for
 * widths/steps the assembly paths do not handle.
 *
 * src/dst        : source and destination pixel buffers with their strides.
 * filter_x/_y    : 8 x int16 filter taps (x used here; y passed through on
 *                  fallback paths).
 * x_step_q4      : horizontal step in Q4 fixed point; 16 means 1:1 (no
 *                  scaling), the only case the DSPr2 kernels support.
 * w, h           : block width and height in pixels.
 */
void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  /* The 8 int16 taps are read as four packed int32 words. 0x800000 in the
   * second word corresponds to one tap of 128 (unit gain) with its neighbor
   * zero — i.e. the identity filter, so plain averaging suffices.
   * NOTE(review): this packing assumes the target's (little-endian MIPS)
   * int16-within-int32 layout — confirm if ever reused elsewhere. */
  if (((const int32_t *)filter_x)[1] == 0x800000) {
    vpx_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    /* First two taps are zero: the shorter 2-tap (bilinear-style) kernel
     * can be used instead of the full 8-tap one. */
    vpx_convolve2_avg_horiz_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
  } else {
    if (16 == x_step_q4) {  /* unscaled: one source pixel per output pixel */
      uint32_t pos = 38;

      /* Back up 3 pixels so the 8-tap window is centered on the output
       * position (3 taps to the left, 4 to the right). */
      src -= 3;

      /* bit position for extract from acc */
      __asm__ __volatile__ (
        "wrdsp %[pos], 1 \n\t"
        :
        : [pos] "r" (pos)
      );

      /* prefetch data to cache memory */
      prefetch_load(src);
      prefetch_load(src + 32);
      prefetch_store(dst);

      /* Width-specialized assembly kernels; 32 reuses the 16-wide kernel
       * with a 2x horizontal repeat count. */
      switch (w) {
        case 4:
          convolve_avg_horiz_4_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_x, h);
          break;
        case 8:
          convolve_avg_horiz_8_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_x, h);
          break;
        case 16:
          convolve_avg_horiz_16_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h, 1);
          break;
        case 32:
          convolve_avg_horiz_16_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h, 2);
          break;
        case 64:
          prefetch_load(src + 64);
          prefetch_store(dst + 32);

          convolve_avg_horiz_64_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h);
          break;
        default:
          /* Unsupported width: undo the -3 offset and fall back to C. */
          vpx_convolve8_avg_horiz_c(src + 3, src_stride,
                                    dst, dst_stride,
                                    filter_x, x_step_q4,
                                    filter_y, y_step_q4,
                                    w, h);
          break;
      }
    } else {
      /* Scaled convolution (x_step_q4 != 16) is not accelerated. */
      vpx_convolve8_avg_horiz_c(src, src_stride,
                                dst, dst_stride,
                                filter_x, x_step_q4,
                                filter_y, y_step_q4,
                                w, h);
    }
  }
}
1036 #endif
OLDNEW
« no previous file with comments | « source/libvpx/vpx_dsp/mips/vpx_convolve8_avg_dspr2.c ('k') | source/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698