Chromium Code Reviews

Side by Side Diff: source/libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 #include <stdlib.h>
13 #include "vp8_rtcd.h"
14 #include "vp8/common/onyxc_int.h"
15
16 #if HAVE_DSPR2
17 typedef unsigned char uc;
18
19 /* prefetch data for load */
20 inline void prefetch_load_lf(unsigned char *src)
21 {
22 __asm__ __volatile__ (
23 "pref 0, 0(%[src]) \n\t"
24 :
25 : [src] "r" (src)
26 );
27 }
28
29
30 /* prefetch data for store */
31 inline void prefetch_store_lf(unsigned char *dst)
32 {
33 __asm__ __volatile__ (
34 "pref 1, 0(%[dst]) \n\t"
35 :
36 : [dst] "r" (dst)
37 );
38 }
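/* Reference only, not part of this file: on a GCC-compatible compiler the
 * same hints could be expressed with __builtin_prefetch(), which lowers to
 * the MIPS "pref" instruction when it is available (second argument 0 for a
 * load hint, 1 for a store hint).  The function names below are illustrative.
 */
#if 0
static inline void prefetch_load_ref(unsigned char *src)
{
    __builtin_prefetch(src, 0);    /* prepare for load, like "pref 0" */
}

static inline void prefetch_store_ref(unsigned char *dst)
{
    __builtin_prefetch(dst, 1);    /* prepare for store, like "pref 1" */
}
#endif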
39
40 /* processing 4 pixels at the same time
41 * compute hev and mask in the same function
42 */
43 static __inline void vp8_filter_mask_vec_mips
44 (
45 uint32_t limit,
46 uint32_t flimit,
47 uint32_t p1,
48 uint32_t p0,
49 uint32_t p3,
50 uint32_t p2,
51 uint32_t q0,
52 uint32_t q1,
53 uint32_t q2,
54 uint32_t q3,
55 uint32_t thresh,
56 uint32_t *hev,
57 uint32_t *mask
58 )
59 {
60 uint32_t c, r, r3, r_k;
61 uint32_t s1, s2, s3;
62 uint32_t ones = 0xFFFFFFFF;
63 uint32_t hev1;
64
65 __asm__ __volatile__ (
66 /* mask |= (abs(p3 - p2) > limit) */
67 "subu_s.qb %[c], %[p3], %[p2] \n\t"
68 "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
69 "or %[r_k], %[r_k], %[c] \n\t"
70 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
71 "or %[r], $0, %[c] \n\t"
72
73 /* mask |= (abs(p2 - p1) > limit) */
74 "subu_s.qb %[c], %[p2], %[p1] \n\t"
75 "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
76 "or %[r_k], %[r_k], %[c] \n\t"
77 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
78 "or %[r], %[r], %[c] \n\t"
79
80 /* mask |= (abs(p1 - p0) > limit)
81 * hev |= (abs(p1 - p0) > thresh)
82 */
83 "subu_s.qb %[c], %[p1], %[p0] \n\t"
84 "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
85 "or %[r_k], %[r_k], %[c] \n\t"
86 "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
87 "or %[r3], $0, %[c] \n\t"
88 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
89 "or %[r], %[r], %[c] \n\t"
90
91 /* mask |= (abs(q1 - q0) > limit)
92 * hev |= (abs(q1 - q0) > thresh)
93 */
94 "subu_s.qb %[c], %[q1], %[q0] \n\t"
95 "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
96 "or %[r_k], %[r_k], %[c] \n\t"
97 "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
98 "or %[r3], %[r3], %[c] \n\t"
99 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
100 "or %[r], %[r], %[c] \n\t"
101
102 /* mask |= (abs(q2 - q1) > limit) */
103 "subu_s.qb %[c], %[q2], %[q1] \n\t"
104 "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
105 "or %[r_k], %[r_k], %[c] \n\t"
106 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
107 "or %[r], %[r], %[c] \n\t"
108 "sll %[r3], %[r3], 24 \n\t"
109
110 /* mask |= (abs(q3 - q2) > limit) */
111 "subu_s.qb %[c], %[q3], %[q2] \n\t"
112 "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
113 "or %[r_k], %[r_k], %[c] \n\t"
114 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
115 "or %[r], %[r], %[c] \n\t"
116
117 : [c] "=&r" (c), [r_k] "=&r" (r_k),
118 [r] "=&r" (r), [r3] "=&r" (r3)
119 : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
120 [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
121 [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
122 );
123
124 __asm__ __volatile__ (
125 /* abs(p0 - q0) */
126 "subu_s.qb %[c], %[p0], %[q0] \n\t"
127 "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
128 "wrdsp %[r3] \n\t"
129 "or %[s1], %[r_k], %[c] \n\t"
130
131 /* abs(p1 - q1) */
132 "subu_s.qb %[c], %[p1], %[q1] \n\t"
133 "addu_s.qb %[s3], %[s1], %[s1] \n\t"
134 "pick.qb %[hev1], %[ones], $0 \n\t"
135 "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
136 "or %[s2], %[r_k], %[c] \n\t"
137
138 /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
139 "shrl.qb %[s2], %[s2], 1 \n\t"
140 "addu_s.qb %[s1], %[s2], %[s3] \n\t"
141 "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
142 "or %[r], %[r], %[c] \n\t"
143 "sll %[r], %[r], 24 \n\t"
144
145 "wrdsp %[r] \n\t"
146 "pick.qb %[s2], $0, %[ones] \n\t"
147
148 : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
149 [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
150 : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
151 [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
152 );
153
154 *hev = hev1;
155 *mask = s2;
156 }
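/* Reference only, not part of this file: a scalar, single-pixel sketch of the
 * decision the packed code above makes for four pixels per 32-bit word.  Each
 * byte of *mask comes out 0xFF when the edge should be filtered and each byte
 * of *hev comes out 0xFF when the edge has high variance.  The flimit value is
 * assumed to already hold the combined (flimit * 2 + limit) threshold, as the
 * comment in the assembly indicates.  Names are local to this sketch.
 */
#if 0
static unsigned char filter_mask_scalar(uc limit, uc flimit,
                                        uc p3, uc p2, uc p1, uc p0,
                                        uc q0, uc q1, uc q2, uc q3)
{
    int over = 0;

    over |= (abs(p3 - p2) > limit);
    over |= (abs(p2 - p1) > limit);
    over |= (abs(p1 - p0) > limit);
    over |= (abs(q1 - q0) > limit);
    over |= (abs(q2 - q1) > limit);
    over |= (abs(q3 - q2) > limit);
    over |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit);

    return over ? 0x00 : 0xFF;    /* 0xFF: filter this edge */
}

static unsigned char hev_mask_scalar(uc thresh, uc p1, uc p0, uc q0, uc q1)
{
    return (abs(p1 - p0) > thresh || abs(q1 - q0) > thresh) ? 0xFF : 0x00;
}
#endif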
157
158
159 /* inputs & outputs are quad-byte vectors */
160 static __inline void vp8_filter_mips
161 (
162 uint32_t mask,
163 uint32_t hev,
164 uint32_t *ps1,
165 uint32_t *ps0,
166 uint32_t *qs0,
167 uint32_t *qs1
168 )
169 {
170 int32_t vp8_filter_l, vp8_filter_r;
171 int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
172 int32_t subr_r, subr_l;
173 uint32_t t1, t2, HWM, t3;
174 uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
175
176 int32_t vps1, vps0, vqs0, vqs1;
177 int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
178 uint32_t N128;
179
180 N128 = 0x80808080;
181 t1 = 0x03000300;
182 t2 = 0x04000400;
183 t3 = 0x01000100;
184 HWM = 0xFF00FF00;
185
186 vps0 = (*ps0) ^ N128;
187 vps1 = (*ps1) ^ N128;
188 vqs0 = (*qs0) ^ N128;
189 vqs1 = (*qs1) ^ N128;
190
191 /* use halfword pairs instead of quad-bytes to preserve accuracy */
192 vps0_l = vps0 & HWM;
193 vps0_r = vps0 << 8;
194 vps0_r = vps0_r & HWM;
195
196 vps1_l = vps1 & HWM;
197 vps1_r = vps1 << 8;
198 vps1_r = vps1_r & HWM;
199
200 vqs0_l = vqs0 & HWM;
201 vqs0_r = vqs0 << 8;
202 vqs0_r = vqs0_r & HWM;
203
204 vqs1_l = vqs1 & HWM;
205 vqs1_r = vqs1 << 8;
206 vqs1_r = vqs1_r & HWM;
207
208 mask_l = mask & HWM;
209 mask_r = mask << 8;
210 mask_r = mask_r & HWM;
211
212 hev_l = hev & HWM;
213 hev_r = hev << 8;
214 hev_r = hev_r & HWM;
215
216 __asm__ __volatile__ (
217 /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
218 "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t"
219 "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t"
220
221 /* qs0 - ps0 */
222 "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
223 "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
224
225 /* vp8_filter &= hev; */
226 "and %[vp8_filter_l], %[vp8_filter_l], %[hev_l] \n\t"
227 "and %[vp8_filter_r], %[vp8_filter_r], %[hev_r] \n\t"
228
229 /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
230 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
231 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
232 "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
233 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
234 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
235 "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
236 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
237 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
238
239 /* vp8_filter &= mask; */
240 "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t"
241 "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t"
242
243 : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=&r" (vp8_filter_r),
244 [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
245 [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
246
247 : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
248 [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
249 [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
250 [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
251 [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
252 [HWM] "r" (HWM)
253 );
254
255 /* save bottom 3 bits so that we round one side +4 and the other +3 */
256 __asm__ __volatile__ (
257 /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >>= 3; */
258 "addq_s.ph %[Filter1_l], %[vp8_filter_l], %[t2] \n\t"
259 "addq_s.ph %[Filter1_r], %[vp8_filter_r], %[t2] \n\t"
260
261 /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >>= 3; */
262 "addq_s.ph %[Filter2_l], %[vp8_filter_l], %[t1] \n\t"
263 "addq_s.ph %[Filter2_r], %[vp8_filter_r], %[t1] \n\t"
264 "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
265 "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
266
267 "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
268 "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
269
270 "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
271 "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
272
273 /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
274 "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
275 "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
276
277 /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
278 "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
279 "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
280
281 : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
282 [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
283 [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
284 [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
285
286 : [t1] "r" (t1), [t2] "r" (t2),
287 [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
288 [HWM] "r" (HWM)
289 );
290
291 __asm__ __volatile__ (
292 /* (vp8_filter += 1) >>= 1 */
293 "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
294 "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
295
296 /* vp8_filter &= ~hev; */
297 "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
298 "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
299
300 /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
301 "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
302 "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
303
304 /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
305 "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
306 "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
307
308 : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
309 [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
310 [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
311
312 : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
313 );
314
315 /* Create quad-bytes from halfword pairs */
316 vqs0_l = vqs0_l & HWM;
317 vqs1_l = vqs1_l & HWM;
318 vps0_l = vps0_l & HWM;
319 vps1_l = vps1_l & HWM;
320
321 __asm__ __volatile__ (
322 "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
323 "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
324 "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
325 "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
326
327 : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
328 [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
329 :
330 );
331
332 vqs0 = vqs0_l | vqs0_r;
333 vqs1 = vqs1_l | vqs1_r;
334 vps0 = vps0_l | vps0_r;
335 vps1 = vps1_l | vps1_r;
336
337 *ps0 = vps0 ^ N128;
338 *ps1 = vps1 ^ N128;
339 *qs0 = vqs0 ^ N128;
340 *qs1 = vqs1 ^ N128;
341 }
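/* Reference only, not part of this file: the per-pixel arithmetic that the
 * halfword-pair assembly above carries out, written as scalar C in the shape
 * of the common VP8 filter.  clamp_s8() stands in for vp8_signed_char_clamp();
 * mask and hev are the 0xFF/0x00 byte masks produced by
 * vp8_filter_mask_vec_mips().  Names are local to this sketch.
 */
#if 0
static signed char clamp_s8(int t)
{
    return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

static void filter_scalar(uc mask, uc hev, uc *op1, uc *op0, uc *oq0, uc *oq1)
{
    signed char maskm = (signed char)mask;    /* 0xFF -> -1, 0x00 -> 0 */
    signed char hevm  = (signed char)hev;
    signed char ps1 = (signed char)(*op1 ^ 0x80);
    signed char ps0 = (signed char)(*op0 ^ 0x80);
    signed char qs0 = (signed char)(*oq0 ^ 0x80);
    signed char qs1 = (signed char)(*oq1 ^ 0x80);
    signed char filter, Filter1, Filter2;

    /* outer tap difference only where edge variance is high */
    filter = clamp_s8(ps1 - qs1) & hevm;
    filter = clamp_s8(filter + 3 * (qs0 - ps0)) & maskm;

    /* round one side +4 and the other +3 before shifting by 3 */
    Filter1 = (signed char)(clamp_s8(filter + 4) >> 3);
    Filter2 = (signed char)(clamp_s8(filter + 3) >> 3);
    *oq0 = (uc)(clamp_s8(qs0 - Filter1) ^ 0x80);
    *op0 = (uc)(clamp_s8(ps0 + Filter2) ^ 0x80);

    /* outer taps get (Filter1 + 1) >> 1, applied only where variance is low */
    filter = (signed char)(((Filter1 + 1) >> 1) & ~hevm);
    *oq1 = (uc)(clamp_s8(qs1 - filter) ^ 0x80);
    *op1 = (uc)(clamp_s8(ps1 + filter) ^ 0x80);
}
#endif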
342
343 void vp8_loop_filter_horizontal_edge_mips
344 (
345 unsigned char *s,
346 int p,
347 unsigned int flimit,
348 unsigned int limit,
349 unsigned int thresh,
350 int count
351 )
352 {
353 uint32_t mask;
354 uint32_t hev;
355 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
356 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
357
358 mask = 0;
359 hev = 0;
360 p1 = 0;
361 p2 = 0;
362 p3 = 0;
363 p4 = 0;
364
365 /* prefetch data for store */
366 prefetch_store_lf(s);
367
368 /* loop filter designed to work using chars so that we can make maximum use
369 * of 8 bit simd instructions.
370 */
371
372 sm1 = s - (p << 2);
373 s0 = s - p - p - p;
374 s1 = s - p - p ;
375 s2 = s - p;
376 s3 = s;
377 s4 = s + p;
378 s5 = s + p + p;
379 s6 = s + p + p + p;
380
381 /* load quad-byte vectors
382 * memory is 4 byte aligned
383 */
384 p1 = *((uint32_t *)(s1));
385 p2 = *((uint32_t *)(s2));
386 p3 = *((uint32_t *)(s3));
387 p4 = *((uint32_t *)(s4));
388
389 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
390 * mask will be zero and filtering is not needed
391 */
392 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
393 {
394
395 pm1 = *((uint32_t *)(sm1));
396 p0 = *((uint32_t *)(s0));
397 p5 = *((uint32_t *)(s5));
398 p6 = *((uint32_t *)(s6));
399
400 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
401 thresh, &hev, &mask);
402
403 /* if mask == 0 filtering is not needed */
404 if (mask)
405 {
406 /* filtering */
407 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
408
409 /* unpack processed 4x4 neighborhood */
410 *((uint32_t *)s1) = p1;
411 *((uint32_t *)s2) = p2;
412 *((uint32_t *)s3) = p3;
413 *((uint32_t *)s4) = p4;
414 }
415 }
416
417 sm1 += 4;
418 s0 += 4;
419 s1 += 4;
420 s2 += 4;
421 s3 += 4;
422 s4 += 4;
423 s5 += 4;
424 s6 += 4;
425
426 /* load quad-byte vectors
427 * memory is 4 byte aligned
428 */
429 p1 = *((uint32_t *)(s1));
430 p2 = *((uint32_t *)(s2));
431 p3 = *((uint32_t *)(s3));
432 p4 = *((uint32_t *)(s4));
433
434 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
435 * mask will be zero and filtering is not needed
436 */
437 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
438 {
439
440 pm1 = *((uint32_t *)(sm1));
441 p0 = *((uint32_t *)(s0));
442 p5 = *((uint32_t *)(s5));
443 p6 = *((uint32_t *)(s6));
444
445 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
446 thresh, &hev, &mask);
447
448 /* if mask == 0 filtering is not needed */
449 if (mask)
450 {
451 /* filtering */
452 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
453
454 /* unpack processed 4x4 neighborhood */
455 *((uint32_t *)s1) = p1;
456 *((uint32_t *)s2) = p2;
457 *((uint32_t *)s3) = p3;
458 *((uint32_t *)s4) = p4;
459 }
460 }
461
462 sm1 += 4;
463 s0 += 4;
464 s1 += 4;
465 s2 += 4;
466 s3 += 4;
467 s4 += 4;
468 s5 += 4;
469 s6 += 4;
470
471 /* load quad-byte vectors
472 * memory is 4 byte aligned
473 */
474 p1 = *((uint32_t *)(s1));
475 p2 = *((uint32_t *)(s2));
476 p3 = *((uint32_t *)(s3));
477 p4 = *((uint32_t *)(s4));
478
479 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
480 * mask will be zero and filtering is not needed
481 */
482 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
483 {
484
485 pm1 = *((uint32_t *)(sm1));
486 p0 = *((uint32_t *)(s0));
487 p5 = *((uint32_t *)(s5));
488 p6 = *((uint32_t *)(s6));
489
490 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
491 thresh, &hev, &mask);
492
493 /* if mask == 0 filtering is not needed */
494 if (mask)
495 {
496 /* filtering */
497 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
498
499 /* unpack processed 4x4 neighborhood */
500 *((uint32_t *)s1) = p1;
501 *((uint32_t *)s2) = p2;
502 *((uint32_t *)s3) = p3;
503 *((uint32_t *)s4) = p4;
504 }
505 }
506
507 sm1 += 4;
508 s0 += 4;
509 s1 += 4;
510 s2 += 4;
511 s3 += 4;
512 s4 += 4;
513 s5 += 4;
514 s6 += 4;
515
516 /* load quad-byte vectors
517 * memory is 4 byte aligned
518 */
519 p1 = *((uint32_t *)(s1));
520 p2 = *((uint32_t *)(s2));
521 p3 = *((uint32_t *)(s3));
522 p4 = *((uint32_t *)(s4));
523
524 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
525 * mask will be zero and filtering is not needed
526 */
527 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
528 {
529
530 pm1 = *((uint32_t *)(sm1));
531 p0 = *((uint32_t *)(s0));
532 p5 = *((uint32_t *)(s5));
533 p6 = *((uint32_t *)(s6));
534
535 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
536 thresh, &hev, &mask);
537
538 /* if mask == 0 filtering is not needed */
539 if (mask)
540 {
541 /* filtering */
542 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
543
544 /* unpack processed 4x4 neighborhood */
545 *((uint32_t *)s1) = p1;
546 *((uint32_t *)s2) = p2;
547 *((uint32_t *)s3) = p3;
548 *((uint32_t *)s4) = p4;
549 }
550 }
551 }
552
553 void vp8_loop_filter_uvhorizontal_edge_mips
554 (
555 unsigned char *s,
556 int p,
557 unsigned int flimit,
558 unsigned int limit,
559 unsigned int thresh,
560 int count
561 )
562 {
563 uint32_t mask;
564 uint32_t hev;
565 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
566 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
567
568 mask = 0;
569 hev = 0;
570 p1 = 0;
571 p2 = 0;
572 p3 = 0;
573 p4 = 0;
574
575 /* loop filter designed to work using chars so that we can make maximum use
576 * of 8 bit simd instructions.
577 */
578
579 sm1 = s - (p << 2);
580 s0 = s - p - p - p;
581 s1 = s - p - p ;
582 s2 = s - p;
583 s3 = s;
584 s4 = s + p;
585 s5 = s + p + p;
586 s6 = s + p + p + p;
587
588 /* load quad-byte vectors
589 * memory is 4 byte aligned
590 */
591 p1 = *((uint32_t *)(s1));
592 p2 = *((uint32_t *)(s2));
593 p3 = *((uint32_t *)(s3));
594 p4 = *((uint32_t *)(s4));
595
596 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
597 * mask will be zero and filtering is not needed
598 */
599 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
600 {
601
602 pm1 = *((uint32_t *)(sm1));
603 p0 = *((uint32_t *)(s0));
604 p5 = *((uint32_t *)(s5));
605 p6 = *((uint32_t *)(s6));
606
607 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
608 thresh, &hev, &mask);
609
610 /* if mask == 0 filtering is not needed */
611 if (mask)
612 {
613 /* filtering */
614 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
615
616 /* unpack processed 4x4 neighborhood */
617 *((uint32_t *)s1) = p1;
618 *((uint32_t *)s2) = p2;
619 *((uint32_t *)s3) = p3;
620 *((uint32_t *)s4) = p4;
621 }
622 }
623
624 sm1 += 4;
625 s0 += 4;
626 s1 += 4;
627 s2 += 4;
628 s3 += 4;
629 s4 += 4;
630 s5 += 4;
631 s6 += 4;
632
633 /* load quad-byte vectors
634 * memory is 4 byte aligned
635 */
636 p1 = *((uint32_t *)(s1));
637 p2 = *((uint32_t *)(s2));
638 p3 = *((uint32_t *)(s3));
639 p4 = *((uint32_t *)(s4));
640
641 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
642 * mask will be zero and filtering is not needed
643 */
644 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
645 {
646
647 pm1 = *((uint32_t *)(sm1));
648 p0 = *((uint32_t *)(s0));
649 p5 = *((uint32_t *)(s5));
650 p6 = *((uint32_t *)(s6));
651
652 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
653 thresh, &hev, &mask);
654
655 /* if mask == 0 filtering is not needed */
656 if (mask)
657 {
658 /* filtering */
659 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
660
661 /* unpack processed 4x4 neighborhood */
662 *((uint32_t *)s1) = p1;
663 *((uint32_t *)s2) = p2;
664 *((uint32_t *)s3) = p3;
665 *((uint32_t *)s4) = p4;
666 }
667 }
668 }
669
670 void vp8_loop_filter_vertical_edge_mips
671 (
672 unsigned char *s,
673 int p,
674 const unsigned int flimit,
675 const unsigned int limit,
676 const unsigned int thresh,
677 int count
678 )
679 {
680 int i;
681 uint32_t mask, hev;
682 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
683 unsigned char *s1, *s2, *s3, *s4;
684 uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
685
686 hev = 0;
687 mask = 0;
688 i = 0;
689 pm1 = 0;
690 p0 = 0;
691 p1 = 0;
692 p2 = 0;
693 p3 = 0;
694 p4 = 0;
695 p5 = 0;
696 p6 = 0;
697
698 /* loop filter designed to work using chars so that we can make maximum use
699 * of 8 bit simd instructions.
700 */
701
702 /* apply filter on 4 pixels at the same time */
703 do
704 {
705
706 /* prefetch data for store */
707 prefetch_store_lf(s + p);
708
709 s1 = s;
710 s2 = s + p;
711 s3 = s2 + p;
712 s4 = s3 + p;
713 s = s4 + p;
714
715 /* load quad-byte vectors
716 * memory is 4 byte aligned
717 */
718 p2 = *((uint32_t *)(s1 - 4));
719 p6 = *((uint32_t *)(s1));
720 p1 = *((uint32_t *)(s2 - 4));
721 p5 = *((uint32_t *)(s2));
722 p0 = *((uint32_t *)(s3 - 4));
723 p4 = *((uint32_t *)(s3));
724 pm1 = *((uint32_t *)(s4 - 4));
725 p3 = *((uint32_t *)(s4));
726
727 /* transpose pm1, p0, p1, p2 */
728 __asm__ __volatile__ (
729 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
730 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
731 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
732 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
733
734 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
735 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
736 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
737 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
738
739 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
740 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
741 "append %[p1], %[sec3], 16 \n\t"
742 "append %[pm1], %[sec4], 16 \n\t"
743
744 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
745 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
746 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
747 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
748 :
749 );
750
751 /* transpose p3, p4, p5, p6 */
752 __asm__ __volatile__ (
753 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
754 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
755 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
756 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
757
758 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
759 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
760 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
761 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
762
763 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
764 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
765 "append %[p5], %[sec3], 16 \n\t"
766 "append %[p3], %[sec4], 16 \n\t"
767
768 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
769 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
770 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
771 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
772 :
773 );
774
775 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
776 * mask will be zero and filtering is not needed
777 */
778 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
779 {
780
781 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
782 thresh, &hev, &mask);
783
784 /* if mask == 0 filtering is not needed */
785 if (mask)
786 {
787 /* filtering */
788 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
789
790 /* unpack processed 4x4 neighborhood
791 * don't use transpose on output data
792 * because memory isn't aligned
793 */
794 __asm__ __volatile__ (
795 "sb %[p4], 1(%[s4]) \n\t"
796 "sb %[p3], 0(%[s4]) \n\t"
797 "sb %[p2], -1(%[s4]) \n\t"
798 "sb %[p1], -2(%[s4]) \n\t"
799 :
800 : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
801 [p2] "r" (p2), [p1] "r" (p1)
802 );
803
804 __asm__ __volatile__ (
805 "srl %[p4], %[p4], 8 \n\t"
806 "srl %[p3], %[p3], 8 \n\t"
807 "srl %[p2], %[p2], 8 \n\t"
808 "srl %[p1], %[p1], 8 \n\t"
809 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
810 :
811 );
812
813 __asm__ __volatile__ (
814 "sb %[p4], 1(%[s3]) \n\t"
815 "sb %[p3], 0(%[s3]) \n\t"
816 "sb %[p2], -1(%[s3]) \n\t"
817 "sb %[p1], -2(%[s3]) \n\t"
818 : [p1] "+r" (p1)
819 : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
820 );
821
822 __asm__ __volatile__ (
823 "srl %[p4], %[p4], 8 \n\t"
824 "srl %[p3], %[p3], 8 \n\t"
825 "srl %[p2], %[p2], 8 \n\t"
826 "srl %[p1], %[p1], 8 \n\t"
827 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
828 :
829 );
830
831 __asm__ __volatile__ (
832 "sb %[p4], 1(%[s2]) \n\t"
833 "sb %[p3], 0(%[s2]) \n\t"
834 "sb %[p2], -1(%[s2]) \n\t"
835 "sb %[p1], -2(%[s2]) \n\t"
836 :
837 : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
838 [p2] "r" (p2), [p1] "r" (p1)
839 );
840
841 __asm__ __volatile__ (
842 "srl %[p4], %[p4], 8 \n\t"
843 "srl %[p3], %[p3], 8 \n\t"
844 "srl %[p2], %[p2], 8 \n\t"
845 "srl %[p1], %[p1], 8 \n\t"
846 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
847 :
848 );
849
850 __asm__ __volatile__ (
851 "sb %[p4], 1(%[s1]) \n\t"
852 "sb %[p3], 0(%[s1]) \n\t"
853 "sb %[p2], -1(%[s1]) \n\t"
854 "sb %[p1], -2(%[s1]) \n\t"
855 :
856 : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
857 [p2] "r" (p2), [p1] "r" (p1)
858 );
859 }
860 }
861
862 s1 = s;
863 s2 = s + p;
864 s3 = s2 + p;
865 s4 = s3 + p;
866 s = s4 + p;
867
868 /* load quad-byte vectors
869 * memory is 4 byte aligned
870 */
871 p2 = *((uint32_t *)(s1 - 4));
872 p6 = *((uint32_t *)(s1));
873 p1 = *((uint32_t *)(s2 - 4));
874 p5 = *((uint32_t *)(s2));
875 p0 = *((uint32_t *)(s3 - 4));
876 p4 = *((uint32_t *)(s3));
877 pm1 = *((uint32_t *)(s4 - 4));
878 p3 = *((uint32_t *)(s4));
879
880 /* transpose pm1, p0, p1, p2 */
881 __asm__ __volatile__ (
882 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
883 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
884 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
885 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
886
887 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
888 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
889 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
890 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
891
892 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
893 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
894 "append %[p1], %[sec3], 16 \n\t"
895 "append %[pm1], %[sec4], 16 \n\t"
896
897 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
898 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
899 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
900 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
901 :
902 );
903
904 /* transpose p3, p4, p5, p6 */
905 __asm__ __volatile__ (
906 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
907 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
908 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
909 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
910
911 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
912 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
913 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
914 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
915
916 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
917 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
918 "append %[p5], %[sec3], 16 \n\t"
919 "append %[p3], %[sec4], 16 \n\t"
920
921 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
922 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
923 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
924 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
925 :
926 );
927
928 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
929 * mask will be zero and filtering is not needed
930 */
931 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
932 {
933
934 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
935 thresh, &hev, &mask);
936
937 /* if mask == 0 filtering is not needed */
938 if (mask)
939 {
940 /* filtering */
941 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
942
943 /* unpack processed 4x4 neighborhood
944 * don't use transpose on output data
945 * because memory isn't aligned
946 */
947 __asm__ __volatile__ (
948 "sb %[p4], 1(%[s4]) \n\t"
949 "sb %[p3], 0(%[s4]) \n\t"
950 "sb %[p2], -1(%[s4]) \n\t"
951 "sb %[p1], -2(%[s4]) \n\t"
952 :
953 : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
954 [p2] "r" (p2), [p1] "r" (p1)
955 );
956
957 __asm__ __volatile__ (
958 "srl %[p4], %[p4], 8 \n\t"
959 "srl %[p3], %[p3], 8 \n\t"
960 "srl %[p2], %[p2], 8 \n\t"
961 "srl %[p1], %[p1], 8 \n\t"
962 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
963 :
964 );
965
966 __asm__ __volatile__ (
967 "sb %[p4], 1(%[s3]) \n\t"
968 "sb %[p3], 0(%[s3]) \n\t"
969 "sb %[p2], -1(%[s3]) \n\t"
970 "sb %[p1], -2(%[s3]) \n\t"
971 : [p1] "+r" (p1)
972 : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
973 );
974
975 __asm__ __volatile__ (
976 "srl %[p4], %[p4], 8 \n\t"
977 "srl %[p3], %[p3], 8 \n\t"
978 "srl %[p2], %[p2], 8 \n\t"
979 "srl %[p1], %[p1], 8 \n\t"
980 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
981 :
982 );
983
984 __asm__ __volatile__ (
985 "sb %[p4], 1(%[s2]) \n\t"
986 "sb %[p3], 0(%[s2]) \n\t"
987 "sb %[p2], -1(%[s2]) \n\t"
988 "sb %[p1], -2(%[s2]) \n\t"
989 :
990 : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
991 [p2] "r" (p2), [p1] "r" (p1)
992 );
993
994 __asm__ __volatile__ (
995 "srl %[p4], %[p4], 8 \n\t"
996 "srl %[p3], %[p3], 8 \n\t"
997 "srl %[p2], %[p2], 8 \n\t"
998 "srl %[p1], %[p1], 8 \n\t"
999 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1000 :
1001 );
1002
1003 __asm__ __volatile__ (
1004 "sb %[p4], 1(%[s1]) \n\t"
1005 "sb %[p3], 0(%[s1]) \n\t"
1006 "sb %[p2], -1(%[s1]) \n\t"
1007 "sb %[p1], -2(%[s1]) \n\t"
1008 :
1009 : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
1010 [p2] "r" (p2), [p1] "r" (p1)
1011 );
1012 }
1013 }
1014
1015 i += 8;
1016 }
1017
1018 while (i < count);
1019 }
1020
1021 void vp8_loop_filter_uvvertical_edge_mips
1022 (
1023 unsigned char *s,
1024 int p,
1025 unsigned int flimit,
1026 unsigned int limit,
1027 unsigned int thresh,
1028 int count
1029 )
1030 {
1031 uint32_t mask, hev;
1032 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1033 unsigned char *s1, *s2, *s3, *s4;
1034 uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
1035
1036 /* loop filter designed to work using chars so that we can make maximum use
1037 * of 8 bit simd instructions.
1038 */
1039
1040 /* apply filter on 4 pixels at the same time */
1041
1042 s1 = s;
1043 s2 = s + p;
1044 s3 = s2 + p;
1045 s4 = s3 + p;
1046
1047 /* load quad-byte vectors
1048 * memory is 4 byte aligned
1049 */
1050 p2 = *((uint32_t *)(s1 - 4));
1051 p6 = *((uint32_t *)(s1));
1052 p1 = *((uint32_t *)(s2 - 4));
1053 p5 = *((uint32_t *)(s2));
1054 p0 = *((uint32_t *)(s3 - 4));
1055 p4 = *((uint32_t *)(s3));
1056 pm1 = *((uint32_t *)(s4 - 4));
1057 p3 = *((uint32_t *)(s4));
1058
1059 /* transpose pm1, p0, p1, p2 */
1060 __asm__ __volatile__ (
1061 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
1062 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
1063 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
1064 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
1065
1066 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
1067 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
1068 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1069 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1070
1071 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
1072 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
1073 "append %[p1], %[sec3], 16 \n\t"
1074 "append %[pm1], %[sec4], 16 \n\t"
1075
1076 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1077 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1078 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
1079 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
1080 :
1081 );
1082
1083 /* transpose p3, p4, p5, p6 */
1084 __asm__ __volatile__ (
1085 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
1086 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
1087 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
1088 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
1089
1090 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
1091 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
1092 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1093 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1094
1095 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
1096 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
1097 "append %[p5], %[sec3], 16 \n\t"
1098 "append %[p3], %[sec4], 16 \n\t"
1099
1100 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1101 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1102 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
1103 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
1104 :
1105 );
1106
1107 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1108 * mask will be zero and filtering is not needed
1109 */
1110 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1111 {
1112
1113 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1114 thresh, &hev, &mask);
1115
1116 /* if mask == 0 filtering is not needed */
1117 if (mask)
1118 {
1119 /* filtering */
1120 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
1121
1122 /* unpack processed 4x4 neighborhood
1123 * don't use transpose on output data
1124 * because memory isn't aligned
1125 */
1126 __asm__ __volatile__ (
1127 "sb %[p4], 1(%[s4]) \n\t"
1128 "sb %[p3], 0(%[s4]) \n\t"
1129 "sb %[p2], -1(%[s4]) \n\t"
1130 "sb %[p1], -2(%[s4]) \n\t"
1131 :
1132 : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
1133 [p2] "r" (p2), [p1] "r" (p1)
1134 );
1135
1136 __asm__ __volatile__ (
1137 "srl %[p4], %[p4], 8 \n\t"
1138 "srl %[p3], %[p3], 8 \n\t"
1139 "srl %[p2], %[p2], 8 \n\t"
1140 "srl %[p1], %[p1], 8 \n\t"
1141 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1142 :
1143 );
1144
1145 __asm__ __volatile__ (
1146 "sb %[p4], 1(%[s3]) \n\t"
1147 "sb %[p3], 0(%[s3]) \n\t"
1148 "sb %[p2], -1(%[s3]) \n\t"
1149 "sb %[p1], -2(%[s3]) \n\t"
1150 : [p1] "+r" (p1)
1151 : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
1152 );
1153
1154 __asm__ __volatile__ (
1155 "srl %[p4], %[p4], 8 \n\t"
1156 "srl %[p3], %[p3], 8 \n\t"
1157 "srl %[p2], %[p2], 8 \n\t"
1158 "srl %[p1], %[p1], 8 \n\t"
1159 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1160 :
1161 );
1162
1163 __asm__ __volatile__ (
1164 "sb %[p4], 1(%[s2]) \n\t"
1165 "sb %[p3], 0(%[s2]) \n\t"
1166 "sb %[p2], -1(%[s2]) \n\t"
1167 "sb %[p1], -2(%[s2]) \n\t"
1168 :
1169 : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
1170 [p2] "r" (p2), [p1] "r" (p1)
1171 );
1172
1173 __asm__ __volatile__ (
1174 "srl %[p4], %[p4], 8 \n\t"
1175 "srl %[p3], %[p3], 8 \n\t"
1176 "srl %[p2], %[p2], 8 \n\t"
1177 "srl %[p1], %[p1], 8 \n\t"
1178 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1179 :
1180 );
1181
1182 __asm__ __volatile__ (
1183 "sb %[p4], 1(%[s1]) \n\t"
1184 "sb %[p3], 0(%[s1]) \n\t"
1185 "sb %[p2], -1(%[s1]) \n\t"
1186 "sb %[p1], -2(%[s1]) \n\t"
1187 :
1188 : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1), [p2] "r" (p2), [p1] "r" (p1)
1189 );
1190 }
1191 }
1192
1193 s1 = s4 + p;
1194 s2 = s1 + p;
1195 s3 = s2 + p;
1196 s4 = s3 + p;
1197
1198 /* load quad-byte vectors
1199 * memory is 4 byte aligned
1200 */
1201 p2 = *((uint32_t *)(s1 - 4));
1202 p6 = *((uint32_t *)(s1));
1203 p1 = *((uint32_t *)(s2 - 4));
1204 p5 = *((uint32_t *)(s2));
1205 p0 = *((uint32_t *)(s3 - 4));
1206 p4 = *((uint32_t *)(s3));
1207 pm1 = *((uint32_t *)(s4 - 4));
1208 p3 = *((uint32_t *)(s4));
1209
1210 /* transpose pm1, p0, p1, p2 */
1211 __asm__ __volatile__ (
1212 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
1213 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
1214 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
1215 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
1216
1217 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
1218 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
1219 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1220 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1221
1222 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
1223 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
1224 "append %[p1], %[sec3], 16 \n\t"
1225 "append %[pm1], %[sec4], 16 \n\t"
1226
1227 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1228 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1229 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
1230 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
1231 :
1232 );
1233
1234 /* transpose p3, p4, p5, p6 */
1235 __asm__ __volatile__ (
1236 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
1237 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
1238 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
1239 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
1240
1241 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
1242 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
1243 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1244 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1245
1246 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
1247 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
1248 "append %[p5], %[sec3], 16 \n\t"
1249 "append %[p3], %[sec4], 16 \n\t"
1250
1251 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1252 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1253 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
1254 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
1255 :
1256 );
1257
1258 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1259 * mask will be zero and filtering is not needed
1260 */
1261 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1262 {
1263
1264 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1265 thresh, &hev, &mask);
1266
1267 /* if mask == 0 filtering is not needed */
1268 if (mask)
1269 {
1270 /* filtering */
1271 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
1272
1273 /* unpack processed 4x4 neighborhood
1274 * don't use transpose on output data
1275 * because memory isn't aligned
1276 */
1277 __asm__ __volatile__ (
1278 "sb %[p4], 1(%[s4]) \n\t"
1279 "sb %[p3], 0(%[s4]) \n\t"
1280 "sb %[p2], -1(%[s4]) \n\t"
1281 "sb %[p1], -2(%[s4]) \n\t"
1282 :
1283 : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
1284 [p2] "r" (p2), [p1] "r" (p1)
1285 );
1286
1287 __asm__ __volatile__ (
1288 "srl %[p4], %[p4], 8 \n\t"
1289 "srl %[p3], %[p3], 8 \n\t"
1290 "srl %[p2], %[p2], 8 \n\t"
1291 "srl %[p1], %[p1], 8 \n\t"
1292 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1293 :
1294 );
1295
1296 __asm__ __volatile__ (
1297 "sb %[p4], 1(%[s3]) \n\t"
1298 "sb %[p3], 0(%[s3]) \n\t"
1299 "sb %[p2], -1(%[s3]) \n\t"
1300 "sb %[p1], -2(%[s3]) \n\t"
1301 : [p1] "+r" (p1)
1302 : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
1303 );
1304
1305 __asm__ __volatile__ (
1306 "srl %[p4], %[p4], 8 \n\t"
1307 "srl %[p3], %[p3], 8 \n\t"
1308 "srl %[p2], %[p2], 8 \n\t"
1309 "srl %[p1], %[p1], 8 \n\t"
1310 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1311 :
1312 );
1313
1314 __asm__ __volatile__ (
1315 "sb %[p4], 1(%[s2]) \n\t"
1316 "sb %[p3], 0(%[s2]) \n\t"
1317 "sb %[p2], -1(%[s2]) \n\t"
1318 "sb %[p1], -2(%[s2]) \n\t"
1319 :
1320 : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
1321 [p2] "r" (p2), [p1] "r" (p1)
1322 );
1323
1324 __asm__ __volatile__ (
1325 "srl %[p4], %[p4], 8 \n\t"
1326 "srl %[p3], %[p3], 8 \n\t"
1327 "srl %[p2], %[p2], 8 \n\t"
1328 "srl %[p1], %[p1], 8 \n\t"
1329 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1330 :
1331 );
1332
1333 __asm__ __volatile__ (
1334 "sb %[p4], 1(%[s1]) \n\t"
1335 "sb %[p3], 0(%[s1]) \n\t"
1336 "sb %[p2], -1(%[s1]) \n\t"
1337 "sb %[p1], -2(%[s1]) \n\t"
1338 :
1339 : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
1340 [p2] "r" (p2), [p1] "r" (p1)
1341 );
1342 }
1343 }
1344 }
1345
1346 /* inputs & outputs are quad-byte vectors */
1347 static __inline void vp8_mbfilter_mips
1348 (
1349 uint32_t mask,
1350 uint32_t hev,
1351 uint32_t *ps2,
1352 uint32_t *ps1,
1353 uint32_t *ps0,
1354 uint32_t *qs0,
1355 uint32_t *qs1,
1356 uint32_t *qs2
1357 )
1358 {
1359 int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
1360 int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
1361 int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
1362 uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r, subr_r, subr_l;
1363 uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l, invhev_r;
1364 uint32_t N128, R63;
1365 uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
1366
1367 R63 = 0x003F003F;
1368 HWM = 0xFF00FF00;
1369 N128 = 0x80808080;
1370 t1 = 0x03000300;
1371 t2 = 0x04000400;
1372
1373 vps0 = (*ps0) ^ N128;
1374 vps1 = (*ps1) ^ N128;
1375 vps2 = (*ps2) ^ N128;
1376 vqs0 = (*qs0) ^ N128;
1377 vqs1 = (*qs1) ^ N128;
1378 vqs2 = (*qs2) ^ N128;
1379
1380 /* use halfword pairs instead of quad-bytes to preserve accuracy */
1381 vps0_l = vps0 & HWM;
1382 vps0_r = vps0 << 8;
1383 vps0_r = vps0_r & HWM;
1384
1385 vqs0_l = vqs0 & HWM;
1386 vqs0_r = vqs0 << 8;
1387 vqs0_r = vqs0_r & HWM;
1388
1389 vps1_l = vps1 & HWM;
1390 vps1_r = vps1 << 8;
1391 vps1_r = vps1_r & HWM;
1392
1393 vqs1_l = vqs1 & HWM;
1394 vqs1_r = vqs1 << 8;
1395 vqs1_r = vqs1_r & HWM;
1396
1397 vqs2_l = vqs2 & HWM;
1398 vqs2_r = vqs2 << 8;
1399 vqs2_r = vqs2_r & HWM;
1400
1401 __asm__ __volatile__ (
1402 /* qs0 - ps0 */
1403 "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
1404 "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
1405
1406 /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
1407 "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t"
1408 "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t"
1409
1410 : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=r" (vp8_filter_r),
1411 [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r)
1412 : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
1413 [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
1414 [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r)
1415 );
1416
1417 vps2_l = vps2 & HWM;
1418 vps2_r = vps2 << 8;
1419 vps2_r = vps2_r & HWM;
1420
1421 /* add outer taps if we have high edge variance */
1422 __asm__ __volatile__ (
1423 /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
1424 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
1425 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
1426 "and %[mask_l], %[HWM], %[mask] \n\t"
1427 "sll %[mask_r], %[mask], 8 \n\t"
1428 "and %[mask_r], %[HWM], %[mask_r] \n\t"
1429 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
1430 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
1431 "and %[hev_l], %[HWM], %[hev] \n\t"
1432 "sll %[hev_r], %[hev], 8 \n\t"
1433 "and %[hev_r], %[HWM], %[hev_r] \n\t"
1434 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
1435 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
1436
1437 /* vp8_filter &= mask; */
1438 "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t"
1439 "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t"
1440
1441 /* Filter2 = vp8_filter & hev; */
1442 "and %[Filter2_l], %[vp8_filter_l], %[hev_l] \n\t"
1443 "and %[Filter2_r], %[vp8_filter_r], %[hev_r] \n\t"
1444
1445 : [vp8_filter_l] "+r" (vp8_filter_l), [vp8_filter_r] "+r" (vp8_filter_r),
1446 [hev_l] "=&r" (hev_l), [hev_r] "=&r" (hev_r),
1447 [mask_l] "=&r" (mask_l), [mask_r] "=&r" (mask_r),
1448 [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
1449 : [subr_l] "r" (subr_l), [subr_r] "r" (subr_r),
1450 [HWM] "r" (HWM), [hev] "r" (hev), [mask] "r" (mask)
1451 );
1452
1453 /* save bottom 3 bits so that we round one side +4 and the other +3 */
1454 __asm__ __volatile__ (
1455 /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3; */
1456 "addq_s.ph %[Filter1_l], %[Filter2_l], %[t2] \n\t"
1457 "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
1458 "addq_s.ph %[Filter1_r], %[Filter2_r], %[t2] \n\t"
1459
1460 /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; */
1461 "addq_s.ph %[Filter2_l], %[Filter2_l], %[t1] \n\t"
1462 "addq_s.ph %[Filter2_r], %[Filter2_r], %[t1] \n\t"
1463
1464 "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
1465 "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
1466
1467 "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
1468 "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
1469 "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
1470 "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
1471 "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
1472
1473 /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
1474 "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
1475 "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
1476
1477 /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
1478 "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
1479 "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
1480
1481 : [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r),
1482 [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
1483 [Filter2_l] "+r" (Filter2_l), [Filter2_r] "+r" (Filter2_r),
1484 [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
1485 [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
1486 : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
1487 [hev_l] "r" (hev_l), [hev_r] "r" (hev_r)
1488 );
1489
1490 /* only apply wider filter if not high edge variance */
1491 __asm__ __volatile__ (
1492 /* vp8_filter &= ~hev; */
1493 "and %[Filter2_l], %[vp8_filter_l], %[invhev_l] \n\t"
1494 "and %[Filter2_r], %[vp8_filter_r], %[invhev_r] \n\t"
1495
1496 "shra.ph %[Filter2_l], %[Filter2_l], 8 \n\t"
1497 "shra.ph %[Filter2_r], %[Filter2_r], 8 \n\t"
1498
1499 : [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
1500 : [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
1501 [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
1502 );
1503
1504 /* roughly 3/7th difference across boundary */
1505 __asm__ __volatile__ (
1506 "shll.ph %[u3_l], %[Filter2_l], 3 \n\t"
1507 "shll.ph %[u3_r], %[Filter2_r], 3 \n\t"
1508
1509 "addq.ph %[u3_l], %[u3_l], %[Filter2_l] \n\t"
1510 "addq.ph %[u3_r], %[u3_r], %[Filter2_r] \n\t"
1511
1512 "shll.ph %[u2_l], %[u3_l], 1 \n\t"
1513 "shll.ph %[u2_r], %[u3_r], 1 \n\t"
1514
1515 "addq.ph %[u1_l], %[u3_l], %[u2_l] \n\t"
1516 "addq.ph %[u1_r], %[u3_r], %[u2_r] \n\t"
1517
1518 "addq.ph %[u2_l], %[u2_l], %[R63] \n\t"
1519 "addq.ph %[u2_r], %[u2_r], %[R63] \n\t"
1520
1521 "addq.ph %[u3_l], %[u3_l], %[R63] \n\t"
1522 "addq.ph %[u3_r], %[u3_r], %[R63] \n\t"
1523
1524 /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
1525 * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
1526 */
1527 "addq.ph %[u1_l], %[u1_l], %[R63] \n\t"
1528 "addq.ph %[u1_r], %[u1_r], %[R63] \n\t"
1529 "shra.ph %[u1_l], %[u1_l], 7 \n\t"
1530 "shra.ph %[u1_r], %[u1_r], 7 \n\t"
1531 "shra.ph %[u2_l], %[u2_l], 7 \n\t"
1532 "shra.ph %[u2_r], %[u2_r], 7 \n\t"
1533 "shll.ph %[u1_l], %[u1_l], 8 \n\t"
1534 "shll.ph %[u1_r], %[u1_r], 8 \n\t"
1535 "shll.ph %[u2_l], %[u2_l], 8 \n\t"
1536 "shll.ph %[u2_r], %[u2_r], 8 \n\t"
1537
1538 /* vqs0 = vp8_signed_char_clamp(qs0 - u); */
1539 "subq_s.ph %[vqs0_l], %[vqs0_l], %[u1_l] \n\t"
1540 "subq_s.ph %[vqs0_r], %[vqs0_r], %[u1_r] \n\t"
1541
1542 /* vps0 = vp8_signed_char_clamp(ps0 + u); */
1543 "addq_s.ph %[vps0_l], %[vps0_l], %[u1_l] \n\t"
1544 "addq_s.ph %[vps0_r], %[vps0_r], %[u1_r] \n\t"
1545
1546 : [u1_l] "=&r" (u1_l), [u1_r] "=&r" (u1_r), [u2_l] "=&r" (u2_l),
1547 [u2_r] "=&r" (u2_r), [u3_l] "=&r" (u3_l), [u3_r] "=&r" (u3_r),
1548 [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
1549 [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
1550 : [R63] "r" (R63),
1551 [Filter2_l] "r" (Filter2_l), [Filter2_r] "r" (Filter2_r)
1552 );
1553
1554 __asm__ __volatile__ (
1555 /* vqs1 = vp8_signed_char_clamp(qs1 - u); */
1556 "subq_s.ph %[vqs1_l], %[vqs1_l], %[u2_l] \n\t"
1557 "addq_s.ph %[vps1_l], %[vps1_l], %[u2_l] \n\t"
1558
1559 /* vps1 = vp8_signed_char_clamp(ps1 + u); */
1560 "addq_s.ph %[vps1_r], %[vps1_r], %[u2_r] \n\t"
1561 "subq_s.ph %[vqs1_r], %[vqs1_r], %[u2_r] \n\t"
1562
1563 : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
1564 [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
1565 : [u2_l] "r" (u2_l), [u2_r] "r" (u2_r)
1566 );
1567
1568 /* roughly 1/7th difference across boundary */
1569 __asm__ __volatile__ (
1570 /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */
1571 "shra.ph %[u3_l], %[u3_l], 7 \n\t"
1572 "shra.ph %[u3_r], %[u3_r], 7 \n\t"
1573 "shll.ph %[u3_l], %[u3_l], 8 \n\t"
1574 "shll.ph %[u3_r], %[u3_r], 8 \n\t"
1575
1576 /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
1577 "subq_s.ph %[vqs2_l], %[vqs2_l], %[u3_l] \n\t"
1578 "subq_s.ph %[vqs2_r], %[vqs2_r], %[u3_r] \n\t"
1579
1580 /* vps2 = vp8_signed_char_clamp(ps2 + u); */
1581 "addq_s.ph %[vps2_l], %[vps2_l], %[u3_l] \n\t"
1582 "addq_s.ph %[vps2_r], %[vps2_r], %[u3_r] \n\t"
1583
1584 : [u3_l] "+r" (u3_l), [u3_r] "+r" (u3_r), [vps2_l] "+r" (vps2_l),
1585 [vps2_r] "+r" (vps2_r), [vqs2_l] "+r" (vqs2_l), [vqs2_r] "+r" (vqs2_r)
1586 :
1587 );
1588
1589 /* Create quad-bytes from halfword pairs */
1590 __asm__ __volatile__ (
1591 "and %[vqs0_l], %[vqs0_l], %[HWM] \n\t"
1592 "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
1593
1594 "and %[vps0_l], %[vps0_l], %[HWM] \n\t"
1595 "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
1596
1597 "and %[vqs1_l], %[vqs1_l], %[HWM] \n\t"
1598 "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
1599
1600 "and %[vps1_l], %[vps1_l], %[HWM] \n\t"
1601 "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
1602
1603 "and %[vqs2_l], %[vqs2_l], %[HWM] \n\t"
1604 "shrl.ph %[vqs2_r], %[vqs2_r], 8 \n\t"
1605
1606 "and %[vps2_l], %[vps2_l], %[HWM] \n\t"
1607 "shrl.ph %[vps2_r], %[vps2_r], 8 \n\t"
1608
1609 "or %[vqs0_r], %[vqs0_l], %[vqs0_r] \n\t"
1610 "or %[vps0_r], %[vps0_l], %[vps0_r] \n\t"
1611 "or %[vqs1_r], %[vqs1_l], %[vqs1_r] \n\t"
1612 "or %[vps1_r], %[vps1_l], %[vps1_r] \n\t"
1613 "or %[vqs2_r], %[vqs2_l], %[vqs2_r] \n\t"
1614 "or %[vps2_r], %[vps2_l], %[vps2_r] \n\t"
1615
1616 : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), [vqs1_l] "+r" (vqs1_l),
1617 [vqs1_r] "+r" (vqs1_r), [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
1618 [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r), [vqs2_l] "+r" (vqs2_l),
1619 [vqs2_r] "+r" (vqs2_r), [vps2_r] "+r" (vps2_r), [vps2_l] "+r" (vps2_l)
1620 : [HWM] "r" (HWM)
1621 );
1622
1623 *ps0 = vps0_r ^ N128;
1624 *ps1 = vps1_r ^ N128;
1625 *ps2 = vps2_r ^ N128;
1626 *qs0 = vqs0_r ^ N128;
1627 *qs1 = vqs1_r ^ N128;
1628 *qs2 = vqs2_r ^ N128;
1629 }
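/* Reference only, not part of this file: scalar form of the wide-filter step
 * above.  "filter" is the mask-gated, hev-cleared value (vp8_filter & ~hev);
 * the preceding hev-gated +4/+3 rounding of ps0/qs0 mirrors vp8_filter_mips(),
 * so only the low-variance 27/18/9 taps are shown.  clamp_s8() saturates to
 * [-128, 127] as in the earlier sketch; names are local to this sketch.
 */
#if 0
static void mbfilter_wide_taps_scalar(signed char filter,
                                      signed char *ps2, signed char *ps1,
                                      signed char *ps0, signed char *qs0,
                                      signed char *qs1, signed char *qs2)
{
    signed char u;

    /* roughly 3/7th of the difference across the boundary */
    u = clamp_s8((63 + filter * 27) >> 7);
    *qs0 = clamp_s8(*qs0 - u);
    *ps0 = clamp_s8(*ps0 + u);

    /* roughly 2/7th */
    u = clamp_s8((63 + filter * 18) >> 7);
    *qs1 = clamp_s8(*qs1 - u);
    *ps1 = clamp_s8(*ps1 + u);

    /* roughly 1/7th */
    u = clamp_s8((63 + filter * 9) >> 7);
    *qs2 = clamp_s8(*qs2 - u);
    *ps2 = clamp_s8(*ps2 + u);
}
#endif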
1630
1631 void vp8_mbloop_filter_horizontal_edge_mips
1632 (
1633 unsigned char *s,
1634 int p,
1635 unsigned int flimit,
1636 unsigned int limit,
1637 unsigned int thresh,
1638 int count
1639 )
1640 {
1641 int i;
1642 uint32_t mask, hev;
1643 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1644 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
1645
1646 mask = 0;
1647 hev = 0;
1648 i = 0;
1649 p1 = 0;
1650 p2 = 0;
1651 p3 = 0;
1652 p4 = 0;
1653
1654 /* loop filter designed to work using chars so that we can make maximum use
1655 * of 8 bit simd instructions.
1656 */
1657
1658 sm1 = s - (p << 2);
1659 s0 = s - p - p - p;
1660 s1 = s - p - p;
1661 s2 = s - p;
1662 s3 = s;
1663 s4 = s + p;
1664 s5 = s + p + p;
1665 s6 = s + p + p + p;
1666
1667 /* prefetch data for load */
1668 prefetch_load_lf(s + p);
1669
1670 /* apply filter on 4 pixels at the same time */
1671 do
1672 {
1673 /* load quad-byte vectors
1674 * memory is 4 byte aligned
1675 */
1676 p1 = *((uint32_t *)(s1));
1677 p2 = *((uint32_t *)(s2));
1678 p3 = *((uint32_t *)(s3));
1679 p4 = *((uint32_t *)(s4));
1680
1681 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1682 * mask will be zero and filtering is not needed
1683 */
1684 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1685 {
1686
1687 pm1 = *((uint32_t *)(sm1));
1688 p0 = *((uint32_t *)(s0));
1689 p5 = *((uint32_t *)(s5));
1690 p6 = *((uint32_t *)(s6));
1691
1692 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1693 thresh, &hev, &mask);
1694
1695 /* if mask == 0 filtering is not needed */
1696 if (mask)
1697 {
1698 /* filtering */
1699 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1700
1701 /* unpack processed 4x4 neighborhood
1702 * memory is 4 byte aligned
1703 */
1704 *((uint32_t *)s0) = p0;
1705 *((uint32_t *)s1) = p1;
1706 *((uint32_t *)s2) = p2;
1707 *((uint32_t *)s3) = p3;
1708 *((uint32_t *)s4) = p4;
1709 *((uint32_t *)s5) = p5;
1710 }
1711 }
1712
1713 sm1 += 4;
1714 s0 += 4;
1715 s1 += 4;
1716 s2 += 4;
1717 s3 += 4;
1718 s4 += 4;
1719 s5 += 4;
1720 s6 += 4;
1721
1722 /* load quad-byte vectors
1723 * memory is 4 byte aligned
1724 */
1725 p1 = *((uint32_t *)(s1));
1726 p2 = *((uint32_t *)(s2));
1727 p3 = *((uint32_t *)(s3));
1728 p4 = *((uint32_t *)(s4));
1729
1730 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1731 * mask will be zero and filtering is not needed
1732 */
1733 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1734 {
1735
1736 pm1 = *((uint32_t *)(sm1));
1737 p0 = *((uint32_t *)(s0));
1738 p5 = *((uint32_t *)(s5));
1739 p6 = *((uint32_t *)(s6));
1740
1741 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1742 thresh, &hev, &mask);
1743
1744 /* if mask == 0 filtering is not needed */
1745 if (mask)
1746 {
1747 /* filtering */
1748 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1749
1750 /* unpack processed 4x4 neighborhood
1751 * memory is 4 byte aligned
1752 */
1753 *((uint32_t *)s0) = p0;
1754 *((uint32_t *)s1) = p1;
1755 *((uint32_t *)s2) = p2;
1756 *((uint32_t *)s3) = p3;
1757 *((uint32_t *)s4) = p4;
1758 *((uint32_t *)s5) = p5;
1759 }
1760 }
1761
1762 sm1 += 4;
1763 s0 += 4;
1764 s1 += 4;
1765 s2 += 4;
1766 s3 += 4;
1767 s4 += 4;
1768 s5 += 4;
1769 s6 += 4;
1770
1771 i += 8;
1772 }
1773
1774 while (i < count);
1775 }
1776
1777 void vp8_mbloop_filter_uvhorizontal_edge_mips
1778 (
1779 unsigned char *s,
1780 int p,
1781 unsigned int flimit,
1782 unsigned int limit,
1783 unsigned int thresh,
1784 int count
1785 )
1786 {
1787 uint32_t mask, hev;
1788 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1789 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
1790
1791 mask = 0;
1792 hev = 0;
1793 p1 = 0;
1794 p2 = 0;
1795 p3 = 0;
1796 p4 = 0;
1797
1798 /* loop filter designed to work using chars so that we can make maximum use
1799 * of 8 bit simd instructions.
1800 */
1801
1802 sm1 = s - (p << 2);
1803 s0 = s - p - p - p;
1804 s1 = s - p - p;
1805 s2 = s - p;
1806 s3 = s;
1807 s4 = s + p;
1808 s5 = s + p + p;
1809 s6 = s + p + p + p;
1810
1811 /* load quad-byte vectors
1812 * memory is 4 byte aligned
1813 */
1814 p1 = *((uint32_t *)(s1));
1815 p2 = *((uint32_t *)(s2));
1816 p3 = *((uint32_t *)(s3));
1817 p4 = *((uint32_t *)(s4));
1818
1819 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1820 * mask will be zero and filtering is not needed
1821 */
1822 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1823 {
1824
1825 pm1 = *((uint32_t *)(sm1));
1826 p0 = *((uint32_t *)(s0));
1827 p5 = *((uint32_t *)(s5));
1828 p6 = *((uint32_t *)(s6));
1829
1830 /* if mask == 0 filtering is not needed */
1831 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1832 thresh, &hev, &mask);
1833
1834 if (mask)
1835 {
1836 /* filtering */
1837 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1838
1839 /* unpack processed 4x4 neighborhood
1840 * memory is 4 byte aligned
1841 */
1842 *((uint32_t *)s0) = p0;
1843 *((uint32_t *)s1) = p1;
1844 *((uint32_t *)s2) = p2;
1845 *((uint32_t *)s3) = p3;
1846 *((uint32_t *)s4) = p4;
1847 *((uint32_t *)s5) = p5;
1848 }
1849 }
1850
1851 sm1 += 4;
1852 s0 += 4;
1853 s1 += 4;
1854 s2 += 4;
1855 s3 += 4;
1856 s4 += 4;
1857 s5 += 4;
1858 s6 += 4;
1859
1860 /* load quad-byte vectors
1861 * memory is 4 byte aligned
1862 */
1863 p1 = *((uint32_t *)(s1));
1864 p2 = *((uint32_t *)(s2));
1865 p3 = *((uint32_t *)(s3));
1866 p4 = *((uint32_t *)(s4));
1867
1868 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1869 * mask will be zero and filtering is not needed
1870 */
1871 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
1872 {
1873
1874 pm1 = *((uint32_t *)(sm1));
1875 p0 = *((uint32_t *)(s0));
1876 p5 = *((uint32_t *)(s5));
1877 p6 = *((uint32_t *)(s6));
1878
1879 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1880 thresh, &hev, &mask);
1881
1882 /* if mask == 0 filtering is not needed */
1883 if (mask)
1884 {
1885 /* filtering */
1886 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1887
1888 /* unpack processed 4x4 neighborhood
1889 * memory is 4 byte aligned
1890 */
1891 *((uint32_t *)s0) = p0;
1892 *((uint32_t *)s1) = p1;
1893 *((uint32_t *)s2) = p2;
1894 *((uint32_t *)s3) = p3;
1895 *((uint32_t *)s4) = p4;
1896 *((uint32_t *)s5) = p5;
1897 }
1898 }
1899 }
1900
1901
1902 void vp8_mbloop_filter_vertical_edge_mips
1903 (
1904 unsigned char *s,
1905 int p,
1906 unsigned int flimit,
1907 unsigned int limit,
1908 unsigned int thresh,
1909 int count
1910 )
1911 {
1912
1913 int i;
1914 uint32_t mask, hev;
1915 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1916 unsigned char *s1, *s2, *s3, *s4;
1917 uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
1918
1919 mask = 0;
1920 hev = 0;
1921 i = 0;
1922 pm1 = 0;
1923 p0 = 0;
1924 p1 = 0;
1925 p2 = 0;
1926 p3 = 0;
1927 p4 = 0;
1928 p5 = 0;
1929 p6 = 0;
1930
1931 /* loop filter designed to work using chars so that we can make maximum use
1932 * of 8 bit simd instructions.
1933 */
1934
1935 /* apply filter on 4 pixels at the same time */
1936 do
1937 {
1938 s1 = s;
1939 s2 = s + p;
1940 s3 = s2 + p;
1941 s4 = s3 + p;
1942 s = s4 + p;
1943
1944 /* load quad-byte vectors
1945 * memory is 4 byte aligned
1946 */
1947 p2 = *((uint32_t *)(s1 - 4));
1948 p6 = *((uint32_t *)(s1));
1949 p1 = *((uint32_t *)(s2 - 4));
1950 p5 = *((uint32_t *)(s2));
1951 p0 = *((uint32_t *)(s3 - 4));
1952 p4 = *((uint32_t *)(s3));
1953 pm1 = *((uint32_t *)(s4 - 4));
1954 p3 = *((uint32_t *)(s4));
1955
1956 /* transpose pm1, p0, p1, p2 */
1957 __asm__ __volatile__ (
1958 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
1959 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
1960 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
1961 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
1962
1963 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
1964 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
1965 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1966 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1967
1968 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
1969 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
1970 "append %[p1], %[sec3], 16 \n\t"
1971 "append %[pm1], %[sec4], 16 \n\t"
1972
1973 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1974 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1975 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
1976 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
1977 :
1978 );
1979
1980 /* transpose p3, p4, p5, p6 */
1981 __asm__ __volatile__ (
1982 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
1983 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
1984 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
1985 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
1986
1987 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
1988 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
1989 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1990 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1991
1992 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
1993 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
1994 "append %[p5], %[sec3], 16 \n\t"
1995 "append %[p3], %[sec4], 16 \n\t"
1996
1997 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
1998 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
1999 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2000 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
2001 :
2002 );
2003
2004 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
2005 * mask will be zero and filtering is not needed
2006 */
2007 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
2008 {
2009
2010 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
2011 thresh, &hev, &mask);
2012
2013             /* if mask == 0, filtering is not needed */
2014 if (mask)
2015 {
2016 /* filtering */
2017 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
2018
2019 /* don't use transpose on output data
2020 * because memory isn't aligned
2021 */
2022 __asm__ __volatile__ (
2023 "sb %[p5], 2(%[s4]) \n\t"
2024 "sb %[p4], 1(%[s4]) \n\t"
2025 "sb %[p3], 0(%[s4]) \n\t"
2026 "sb %[p2], -1(%[s4]) \n\t"
2027 "sb %[p1], -2(%[s4]) \n\t"
2028 "sb %[p0], -3(%[s4]) \n\t"
2029 :
2030 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4) ,
2031 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2032 );
2033
2034 __asm__ __volatile__ (
2035 "srl %[p5], %[p5], 8 \n\t"
2036 "srl %[p4], %[p4], 8 \n\t"
2037 "srl %[p3], %[p3], 8 \n\t"
2038 "srl %[p2], %[p2], 8 \n\t"
2039 "srl %[p1], %[p1], 8 \n\t"
2040 "srl %[p0], %[p0], 8 \n\t"
2041 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2042 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2043 :
2044 );
2045
2046 __asm__ __volatile__ (
2047 "sb %[p5], 2(%[s3]) \n\t"
2048 "sb %[p4], 1(%[s3]) \n\t"
2049 "sb %[p3], 0(%[s3]) \n\t"
2050 "sb %[p2], -1(%[s3]) \n\t"
2051 "sb %[p1], -2(%[s3]) \n\t"
2052 "sb %[p0], -3(%[s3]) \n\t"
2053 :
2054 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3) ,
2055 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2056 );
2057
2058 __asm__ __volatile__ (
2059 "srl %[p5], %[p5], 8 \n\t"
2060 "srl %[p4], %[p4], 8 \n\t"
2061 "srl %[p3], %[p3], 8 \n\t"
2062 "srl %[p2], %[p2], 8 \n\t"
2063 "srl %[p1], %[p1], 8 \n\t"
2064 "srl %[p0], %[p0], 8 \n\t"
2065 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2066 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2067 :
2068 );
2069
2070 __asm__ __volatile__ (
2071 "sb %[p5], 2(%[s2]) \n\t"
2072 "sb %[p4], 1(%[s2]) \n\t"
2073 "sb %[p3], 0(%[s2]) \n\t"
2074 "sb %[p2], -1(%[s2]) \n\t"
2075 "sb %[p1], -2(%[s2]) \n\t"
2076 "sb %[p0], -3(%[s2]) \n\t"
2077 :
2078 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2) ,
2079 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2080 );
2081
2082 __asm__ __volatile__ (
2083 "srl %[p5], %[p5], 8 \n\t"
2084 "srl %[p4], %[p4], 8 \n\t"
2085 "srl %[p3], %[p3], 8 \n\t"
2086 "srl %[p2], %[p2], 8 \n\t"
2087 "srl %[p1], %[p1], 8 \n\t"
2088 "srl %[p0], %[p0], 8 \n\t"
2089 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2090 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2091 :
2092 );
2093
2094 __asm__ __volatile__ (
2095 "sb %[p5], 2(%[s1]) \n\t"
2096 "sb %[p4], 1(%[s1]) \n\t"
2097 "sb %[p3], 0(%[s1]) \n\t"
2098 "sb %[p2], -1(%[s1]) \n\t"
2099 "sb %[p1], -2(%[s1]) \n\t"
2100 "sb %[p0], -3(%[s1]) \n\t"
2101 :
2102 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1) ,
2103 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2104 );
2105 }
2106 }
2107
2108 i += 4;
2109 }
2110
2111 while (i < count);
2112 }
2113
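/* For reference only (not part of the upstream file): a plain-C model of the
 * 4x4 byte transpose performed above by the precrq.qb.ph / precr.qb.ph /
 * precrq.ph.w / append sequences.  Four words, each holding the 4 bytes of one
 * row, are rearranged so that each output word holds one column.  Byte
 * numbering assumes a little-endian layout; the exact register/column pairing
 * used by the asm may differ.
 */
static void transpose_4x4_bytes_model(const uint32_t in[4], uint32_t out[4])
{
    int row, col;

    for (col = 0; col < 4; col++)
    {
        uint32_t word = 0;

        for (row = 0; row < 4; row++)
        {
            word |= ((in[row] >> (8 * col)) & 0xff) << (8 * row);
        }

        out[col] = word;
    }
}
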
2114 void vp8_mbloop_filter_uvvertical_edge_mips
2115 (
2116 unsigned char *s,
2117 int p,
2118 unsigned int flimit,
2119 unsigned int limit,
2120 unsigned int thresh,
2121 int count
2122 )
2123 {
2124 uint32_t mask, hev;
2125 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
2126 unsigned char *s1, *s2, *s3, *s4;
2127 uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
2128
2129 mask = 0;
2130 hev = 0;
2131 pm1 = 0;
2132 p0 = 0;
2133 p1 = 0;
2134 p2 = 0;
2135 p3 = 0;
2136 p4 = 0;
2137 p5 = 0;
2138 p6 = 0;
2139
2140 /* loop filter designed to work using chars so that we can make maximum use
2141      * of 8-bit SIMD instructions.
2142 */
2143
2144     /* apply filter on 4 pixels at a time */
2145
2146 s1 = s;
2147 s2 = s + p;
2148 s3 = s2 + p;
2149 s4 = s3 + p;
2150
2151 /* prefetch data for load */
2152 prefetch_load_lf(s + 2 * p);
2153
2154 /* load quad-byte vectors
2155 * memory is 4 byte aligned
2156 */
2157 p2 = *((uint32_t *)(s1 - 4));
2158 p6 = *((uint32_t *)(s1));
2159 p1 = *((uint32_t *)(s2 - 4));
2160 p5 = *((uint32_t *)(s2));
2161 p0 = *((uint32_t *)(s3 - 4));
2162 p4 = *((uint32_t *)(s3));
2163 pm1 = *((uint32_t *)(s4 - 4));
2164 p3 = *((uint32_t *)(s4));
2165
2166 /* transpose pm1, p0, p1, p2 */
2167 __asm__ __volatile__ (
2168 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
2169 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
2170 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
2171 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
2172
2173 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
2174 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
2175 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
2176 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
2177
2178 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
2179 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
2180 "append %[p1], %[sec3], 16 \n\t"
2181 "append %[pm1], %[sec4], 16 \n\t"
2182
2183 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
2184 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
2185 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
2186 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
2187 :
2188 );
2189
2190 /* transpose p3, p4, p5, p6 */
2191 __asm__ __volatile__ (
2192 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
2193 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
2194 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
2195 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
2196
2197 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
2198 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
2199 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
2200 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
2201
2202 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
2203 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
2204 "append %[p5], %[sec3], 16 \n\t"
2205 "append %[p3], %[sec4], 16 \n\t"
2206
2207 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
2208 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
2209 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2210 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
2211 :
2212 );
2213
2214 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
2215 * mask will be zero and filtering is not needed
2216 */
2217 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
2218 {
2219
2220 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
2221 thresh, &hev, &mask);
2222
2223         /* if mask == 0, filtering is not needed */
2224 if (mask)
2225 {
2226 /* filtering */
2227 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
2228
2229 /* don't use transpose on output data
2230 * because memory isn't aligned
2231 */
2232 __asm__ __volatile__ (
2233 "sb %[p5], 2(%[s4]) \n\t"
2234 "sb %[p4], 1(%[s4]) \n\t"
2235 "sb %[p3], 0(%[s4]) \n\t"
2236 "sb %[p2], -1(%[s4]) \n\t"
2237 "sb %[p1], -2(%[s4]) \n\t"
2238 "sb %[p0], -3(%[s4]) \n\t"
2239 :
2240 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
2241 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2242 );
2243
2244 __asm__ __volatile__ (
2245 "srl %[p5], %[p5], 8 \n\t"
2246 "srl %[p4], %[p4], 8 \n\t"
2247 "srl %[p3], %[p3], 8 \n\t"
2248 "srl %[p2], %[p2], 8 \n\t"
2249 "srl %[p1], %[p1], 8 \n\t"
2250 "srl %[p0], %[p0], 8 \n\t"
2251 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2252 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2253 :
2254 );
2255
2256 __asm__ __volatile__ (
2257 "sb %[p5], 2(%[s3]) \n\t"
2258 "sb %[p4], 1(%[s3]) \n\t"
2259 "sb %[p3], 0(%[s3]) \n\t"
2260 "sb %[p2], -1(%[s3]) \n\t"
2261 "sb %[p1], -2(%[s3]) \n\t"
2262 "sb %[p0], -3(%[s3]) \n\t"
2263 :
2264 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
2265 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2266 );
2267
2268 __asm__ __volatile__ (
2269 "srl %[p5], %[p5], 8 \n\t"
2270 "srl %[p4], %[p4], 8 \n\t"
2271 "srl %[p3], %[p3], 8 \n\t"
2272 "srl %[p2], %[p2], 8 \n\t"
2273 "srl %[p1], %[p1], 8 \n\t"
2274 "srl %[p0], %[p0], 8 \n\t"
2275 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2276 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2277 :
2278 );
2279
2280 __asm__ __volatile__ (
2281 "sb %[p5], 2(%[s2]) \n\t"
2282 "sb %[p4], 1(%[s2]) \n\t"
2283 "sb %[p3], 0(%[s2]) \n\t"
2284 "sb %[p2], -1(%[s2]) \n\t"
2285 "sb %[p1], -2(%[s2]) \n\t"
2286 "sb %[p0], -3(%[s2]) \n\t"
2287 :
2288 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
2289 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2290 );
2291
2292 __asm__ __volatile__ (
2293 "srl %[p5], %[p5], 8 \n\t"
2294 "srl %[p4], %[p4], 8 \n\t"
2295 "srl %[p3], %[p3], 8 \n\t"
2296 "srl %[p2], %[p2], 8 \n\t"
2297 "srl %[p1], %[p1], 8 \n\t"
2298 "srl %[p0], %[p0], 8 \n\t"
2299 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2300 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2301 :
2302 );
2303
2304 __asm__ __volatile__ (
2305 "sb %[p5], 2(%[s1]) \n\t"
2306 "sb %[p4], 1(%[s1]) \n\t"
2307 "sb %[p3], 0(%[s1]) \n\t"
2308 "sb %[p2], -1(%[s1]) \n\t"
2309 "sb %[p1], -2(%[s1]) \n\t"
2310 "sb %[p0], -3(%[s1]) \n\t"
2311 :
2312 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
2313 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2314 );
2315 }
2316 }
2317
2318 s1 = s4 + p;
2319 s2 = s1 + p;
2320 s3 = s2 + p;
2321 s4 = s3 + p;
2322
2323 /* load quad-byte vectors
2324 * memory is 4 byte aligned
2325 */
2326 p2 = *((uint32_t *)(s1 - 4));
2327 p6 = *((uint32_t *)(s1));
2328 p1 = *((uint32_t *)(s2 - 4));
2329 p5 = *((uint32_t *)(s2));
2330 p0 = *((uint32_t *)(s3 - 4));
2331 p4 = *((uint32_t *)(s3));
2332 pm1 = *((uint32_t *)(s4 - 4));
2333 p3 = *((uint32_t *)(s4));
2334
2335 /* transpose pm1, p0, p1, p2 */
2336 __asm__ __volatile__ (
2337 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
2338 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
2339 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
2340 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
2341
2342 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
2343 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
2344 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
2345 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
2346
2347 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
2348 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
2349 "append %[p1], %[sec3], 16 \n\t"
2350 "append %[pm1], %[sec4], 16 \n\t"
2351
2352 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
2353 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
2354 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
2355 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
2356 :
2357 );
2358
2359 /* transpose p3, p4, p5, p6 */
2360 __asm__ __volatile__ (
2361 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
2362 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
2363 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
2364 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
2365
2366 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
2367 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
2368 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
2369 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
2370
2371 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
2372 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
2373 "append %[p5], %[sec3], 16 \n\t"
2374 "append %[p3], %[sec4], 16 \n\t"
2375
2376 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
2377 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
2378 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2379 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
2380 :
2381 );
2382
2383 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
2384 * mask will be zero and filtering is not needed
2385 */
2386 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
2387 {
2388
2389 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, thresh, &hev, &mask);
2390
2391         /* if mask == 0, filtering is not needed */
2392 if (mask)
2393 {
2394 /* filtering */
2395 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
2396
2397 /* don't use transpose on output data
2398 * because memory isn't aligned
2399 */
2400 __asm__ __volatile__ (
2401 "sb %[p5], 2(%[s4]) \n\t"
2402 "sb %[p4], 1(%[s4]) \n\t"
2403 "sb %[p3], 0(%[s4]) \n\t"
2404 "sb %[p2], -1(%[s4]) \n\t"
2405 "sb %[p1], -2(%[s4]) \n\t"
2406 "sb %[p0], -3(%[s4]) \n\t"
2407 :
2408 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
2409 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2410 );
2411
2412 __asm__ __volatile__ (
2413 "srl %[p5], %[p5], 8 \n\t"
2414 "srl %[p4], %[p4], 8 \n\t"
2415 "srl %[p3], %[p3], 8 \n\t"
2416 "srl %[p2], %[p2], 8 \n\t"
2417 "srl %[p1], %[p1], 8 \n\t"
2418 "srl %[p0], %[p0], 8 \n\t"
2419 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2420 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2421 :
2422 );
2423
2424 __asm__ __volatile__ (
2425 "sb %[p5], 2(%[s3]) \n\t"
2426 "sb %[p4], 1(%[s3]) \n\t"
2427 "sb %[p3], 0(%[s3]) \n\t"
2428 "sb %[p2], -1(%[s3]) \n\t"
2429 "sb %[p1], -2(%[s3]) \n\t"
2430 "sb %[p0], -3(%[s3]) \n\t"
2431 :
2432 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
2433 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2434 );
2435
2436 __asm__ __volatile__ (
2437 "srl %[p5], %[p5], 8 \n\t"
2438 "srl %[p4], %[p4], 8 \n\t"
2439 "srl %[p3], %[p3], 8 \n\t"
2440 "srl %[p2], %[p2], 8 \n\t"
2441 "srl %[p1], %[p1], 8 \n\t"
2442 "srl %[p0], %[p0], 8 \n\t"
2443 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2444 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2445 :
2446 );
2447
2448 __asm__ __volatile__ (
2449 "sb %[p5], 2(%[s2]) \n\t"
2450 "sb %[p4], 1(%[s2]) \n\t"
2451 "sb %[p3], 0(%[s2]) \n\t"
2452 "sb %[p2], -1(%[s2]) \n\t"
2453 "sb %[p1], -2(%[s2]) \n\t"
2454 "sb %[p0], -3(%[s2]) \n\t"
2455 :
2456 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
2457 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2458 );
2459
2460 __asm__ __volatile__ (
2461 "srl %[p5], %[p5], 8 \n\t"
2462 "srl %[p4], %[p4], 8 \n\t"
2463 "srl %[p3], %[p3], 8 \n\t"
2464 "srl %[p2], %[p2], 8 \n\t"
2465 "srl %[p1], %[p1], 8 \n\t"
2466 "srl %[p0], %[p0], 8 \n\t"
2467 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
2468 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
2469 :
2470 );
2471
2472 __asm__ __volatile__ (
2473 "sb %[p5], 2(%[s1]) \n\t"
2474 "sb %[p4], 1(%[s1]) \n\t"
2475 "sb %[p3], 0(%[s1]) \n\t"
2476 "sb %[p2], -1(%[s1]) \n\t"
2477 "sb %[p1], -2(%[s1]) \n\t"
2478 "sb %[p0], -3(%[s1]) \n\t"
2479 :
2480 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
2481 [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
2482 );
2483 }
2484 }
2485 }
2486
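/* For reference only: a sketch of the byte-wise write-back used by the
 * vertical filters above.  Each filtered 32-bit value packs one byte per
 * image row, so the code stores the low byte with sb and shifts the value
 * right by 8 before moving on to the next row, avoiding any word-sized store
 * across the unaligned vertical edge.  Helper name and row order are
 * illustrative.
 */
static void store_packed_column_model(unsigned char *row_ptr[4], int offset,
                                      uint32_t value)
{
    int r;

    for (r = 0; r < 4; r++)
    {
        row_ptr[r][offset] = (unsigned char)(value & 0xff);
        value >>= 8;
    }
}
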
2487 /* Horizontal MB filtering */
2488 void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
2489                                int y_stride, int uv_stride, loop_filter_info *lfi)
2490 {
2491 unsigned int thresh_vec, flimit_vec, limit_vec;
2492 unsigned char thresh, flimit, limit, flimit_temp;
2493
2494     /* use direct values instead of pointers */
2495 limit = *(lfi->lim);
2496 flimit_temp = *(lfi->mblim);
2497 thresh = *(lfi->hev_thr);
2498 flimit = flimit_temp;
2499
2500     /* create quad-byte vectors */
2501 __asm__ __volatile__ (
2502 "replv.qb %[thresh_vec], %[thresh] \n\t"
2503 "replv.qb %[flimit_vec], %[flimit] \n\t"
2504 "replv.qb %[limit_vec], %[limit] \n\t"
2505         : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
2506 : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
2507 );
2508
2509     vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2510
2511 if (u_ptr)
2512 {
2513         vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2514 }
2515
2516 if (v_ptr)
2517 {
2518         vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2519 }
2520 }
2521
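/* For reference only: the replv.qb instructions above broadcast a scalar byte
 * into all four byte lanes of a word, which lets the packed per-byte compares
 * in the filter operate on four pixels at once.  A plain-C equivalent:
 */
static uint32_t replicate_byte_model(unsigned char v)
{
    uint32_t b = v;

    return b | (b << 8) | (b << 16) | (b << 24);
}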
2522
2523 /* Vertical MB Filtering */
2524 void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
2525                                int y_stride, int uv_stride, loop_filter_info *lfi)
2526 {
2527 unsigned int thresh_vec, flimit_vec, limit_vec;
2528 unsigned char thresh, flimit, limit, flimit_temp;
2529
2530     /* use direct values instead of pointers */
2531 limit = *(lfi->lim);
2532 flimit_temp = *(lfi->mblim);
2533 thresh = *(lfi->hev_thr);
2534 flimit = flimit_temp;
2535
2536     /* create quad-byte vectors */
2537 __asm__ __volatile__ (
2538 "replv.qb %[thresh_vec], %[thresh] \n\t"
2539 "replv.qb %[flimit_vec], %[flimit] \n\t"
2540 "replv.qb %[limit_vec], %[limit] \n\t"
2541         : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
2542 : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
2543 );
2544
2545 vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2546
2547 if (u_ptr)
2548         vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2549
2550 if (v_ptr)
2551         vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2552 }
2553
2554
2555 /* Horizontal B Filtering */
2556 void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
2557                               int y_stride, int uv_stride, loop_filter_info *lfi)
2558 {
2559 unsigned int thresh_vec, flimit_vec, limit_vec;
2560 unsigned char thresh, flimit, limit, flimit_temp;
2561
2562     /* use direct values instead of pointers */
2563 limit = *(lfi->lim);
2564 flimit_temp = *(lfi->blim);
2565 thresh = *(lfi->hev_thr);
2566 flimit = flimit_temp;
2567
2568     /* create quad-byte vectors */
2569 __asm__ __volatile__ (
2570 "replv.qb %[thresh_vec], %[thresh] \n\t"
2571 "replv.qb %[flimit_vec], %[flimit] \n\t"
2572 "replv.qb %[limit_vec], %[limit] \n\t"
2573         : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
2574 : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
2575 );
2576
2577     vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2578     vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2579     vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2580
2581 if (u_ptr)
2582 vp8_loop_filter_uvhorizontal_edge_mips(u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2583
2584 if (v_ptr)
2585 vp8_loop_filter_uvhorizontal_edge_mips(v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2586 }
2587
2588
2589 /* Vertical B Filtering */
2590 void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
2591                               int y_stride, int uv_stride, loop_filter_info *lfi)
2592 {
2593 unsigned int thresh_vec, flimit_vec, limit_vec;
2594 unsigned char thresh, flimit, limit, flimit_temp;
2595
2596     /* use direct values instead of pointers */
2597 limit = *(lfi->lim);
2598 flimit_temp = *(lfi->blim);
2599 thresh = *(lfi->hev_thr);
2600 flimit = flimit_temp;
2601
2602     /* create quad-byte vectors */
2603 __asm__ __volatile__ (
2604 "replv.qb %[thresh_vec], %[thresh] \n\t"
2605 "replv.qb %[flimit_vec], %[flimit] \n\t"
2606 "replv.qb %[limit_vec], %[limit] \n\t"
2607         : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
2608 : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
2609 );
2610
2611     vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2612     vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2613     vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
2614
2615 if (u_ptr)
2616         vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2617
2618 if (v_ptr)
2619         vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2620 }
2621
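/* For reference only: the "B" variants above address the three inner edges of
 * the 16x16 luma macroblock, stepping 4 rows (horizontal) or 4 pixels
 * (vertical) at a time, while the 8x8 chroma planes get a single inner edge at
 * offset 4.  Sketch with illustrative names:
 */
static void inner_luma_edge_offsets_model(unsigned char *y_ptr, int y_stride,
                                          unsigned char *h_edges[3],
                                          unsigned char *v_edges[3])
{
    int i;

    for (i = 0; i < 3; i++)
    {
        h_edges[i] = y_ptr + (i + 1) * 4 * y_stride;  /* rows 4, 8, 12 */
        v_edges[i] = y_ptr + (i + 1) * 4;             /* cols 4, 8, 12 */
    }
}
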
2622 #endif