1 /* | |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 | |
12 #include <stdlib.h> | |
13 #include "vp8_rtcd.h" | |
14 #include "vp8/common/onyxc_int.h" | |
15 | |
16 #if HAVE_DSPR2 | |
17 typedef unsigned char uc; | |
18 | |
19 /* prefetch data for load */ | |
20 inline void prefetch_load_lf(unsigned char *src) | |
21 { | |
22 __asm__ __volatile__ ( | |
23 "pref 0, 0(%[src]) \n\t" | |
24 : | |
25 : [src] "r" (src) | |
26 ); | |
27 } | |
28 | |
29 | |
30 /* prefetch data for store */ | |
31 inline void prefetch_store_lf(unsigned char *dst) | |
32 { | |
33 __asm__ __volatile__ ( | |
34 "pref 1, 0(%[dst]) \n\t" | |
35 : | |
36 : [dst] "r" (dst) | |
37 ); | |
38 } | |
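/* 'pref 0, ...' issues the MIPS32 load hint and 'pref 1, ...' the store
 * hint; both only warm the cache for the addressed line and never modify
 * data, so the filter loops below can safely issue them a row or block
 * ahead of use.
 */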
39 | |
/* process 4 pixels at a time and
 * compute hev and mask in the same function
 */
43 static __inline void vp8_filter_mask_vec_mips | |
44 ( | |
45 uint32_t limit, | |
46 uint32_t flimit, | |
47 uint32_t p1, | |
48 uint32_t p0, | |
49 uint32_t p3, | |
50 uint32_t p2, | |
51 uint32_t q0, | |
52 uint32_t q1, | |
53 uint32_t q2, | |
54 uint32_t q3, | |
55 uint32_t thresh, | |
56 uint32_t *hev, | |
57 uint32_t *mask | |
58 ) | |
59 { | |
60 uint32_t c, r, r3, r_k; | |
61 uint32_t s1, s2, s3; | |
62 uint32_t ones = 0xFFFFFFFF; | |
63 uint32_t hev1; | |
64 | |
65 __asm__ __volatile__ ( | |
66 /* mask |= (abs(p3 - p2) > limit) */ | |
67 "subu_s.qb %[c], %[p3], %[p2] \n\t" | |
68 "subu_s.qb %[r_k], %[p2], %[p3] \n\t" | |
69 "or %[r_k], %[r_k], %[c] \n\t" | |
70 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" | |
71 "or %[r], $0, %[c] \n\t" | |
72 | |
73 /* mask |= (abs(p2 - p1) > limit) */ | |
74 "subu_s.qb %[c], %[p2], %[p1] \n\t" | |
75 "subu_s.qb %[r_k], %[p1], %[p2] \n\t" | |
76 "or %[r_k], %[r_k], %[c] \n\t" | |
77 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" | |
78 "or %[r], %[r], %[c] \n\t" | |
79 | |
80 /* mask |= (abs(p1 - p0) > limit) | |
81 * hev |= (abs(p1 - p0) > thresh) | |
82 */ | |
83 "subu_s.qb %[c], %[p1], %[p0] \n\t" | |
84 "subu_s.qb %[r_k], %[p0], %[p1] \n\t" | |
85 "or %[r_k], %[r_k], %[c] \n\t" | |
86 "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" | |
87 "or %[r3], $0, %[c] \n\t" | |
88 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" | |
89 "or %[r], %[r], %[c] \n\t" | |
90 | |
91 /* mask |= (abs(q1 - q0) > limit) | |
92 * hev |= (abs(q1 - q0) > thresh) | |
93 */ | |
94 "subu_s.qb %[c], %[q1], %[q0] \n\t" | |
95 "subu_s.qb %[r_k], %[q0], %[q1] \n\t" | |
96 "or %[r_k], %[r_k], %[c] \n\t" | |
97 "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" | |
98 "or %[r3], %[r3], %[c] \n\t" | |
99 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" | |
100 "or %[r], %[r], %[c] \n\t" | |
101 | |
102 /* mask |= (abs(q2 - q1) > limit) */ | |
103 "subu_s.qb %[c], %[q2], %[q1] \n\t" | |
104 "subu_s.qb %[r_k], %[q1], %[q2] \n\t" | |
105 "or %[r_k], %[r_k], %[c] \n\t" | |
106 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" | |
107 "or %[r], %[r], %[c] \n\t" | |
108 "sll %[r3], %[r3], 24 \n\t" | |
109 | |
110 /* mask |= (abs(q3 - q2) > limit) */ | |
111 "subu_s.qb %[c], %[q3], %[q2] \n\t" | |
112 "subu_s.qb %[r_k], %[q2], %[q3] \n\t" | |
113 "or %[r_k], %[r_k], %[c] \n\t" | |
114 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" | |
115 "or %[r], %[r], %[c] \n\t" | |
116 | |
117 : [c] "=&r" (c), [r_k] "=&r" (r_k), | |
118 [r] "=&r" (r), [r3] "=&r" (r3) | |
119 : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), | |
120 [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), | |
121 [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh) | |
122 ); | |
123 | |
124 __asm__ __volatile__ ( | |
125 /* abs(p0 - q0) */ | |
126 "subu_s.qb %[c], %[p0], %[q0] \n\t" | |
127 "subu_s.qb %[r_k], %[q0], %[p0] \n\t" | |
128 "wrdsp %[r3] \n\t" | |
129 "or %[s1], %[r_k], %[c] \n\t" | |
130 | |
131 /* abs(p1 - q1) */ | |
132 "subu_s.qb %[c], %[p1], %[q1] \n\t" | |
133 "addu_s.qb %[s3], %[s1], %[s1] \n\t" | |
134 "pick.qb %[hev1], %[ones], $0 \n\t" | |
135 "subu_s.qb %[r_k], %[q1], %[p1] \n\t" | |
136 "or %[s2], %[r_k], %[c] \n\t" | |
137 | |
138 /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ | |
139 "shrl.qb %[s2], %[s2], 1 \n\t" | |
140 "addu_s.qb %[s1], %[s2], %[s3] \n\t" | |
141 "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" | |
142 "or %[r], %[r], %[c] \n\t" | |
143 "sll %[r], %[r], 24 \n\t" | |
144 | |
145 "wrdsp %[r] \n\t" | |
146 "pick.qb %[s2], $0, %[ones] \n\t" | |
147 | |
        : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
149 [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) | |
150 : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), | |
151 [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) | |
152 ); | |
153 | |
154 *hev = hev1; | |
155 *mask = s2; | |
156 } | |
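/* In scalar terms, the two blocks above compute, per byte (abs(a - b) is
 * built as (a -sat b) | (b -sat a) from the saturating byte subtracts):
 *
 *   hev  = 0xFF  if  abs(p1 - p0) > thresh  or  abs(q1 - q0) > thresh
 *   mask = 0xFF  unless any of
 *            abs(p3 - p2), abs(p2 - p1), abs(p1 - p0),
 *            abs(q1 - q0), abs(q2 - q1), abs(q3 - q2)  exceeds limit, or
 *            abs(p0 - q0) * 2 + abs(p1 - q1) / 2  exceeds  flimit * 2 + limit
 *
 * wrdsp/pick.qb turn the cmpgu.lt.qb condition bits into the 0x00/0xFF
 * per-byte masks consumed by vp8_filter_mips().
 */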
157 | |
158 | |
159 /* inputs & outputs are quad-byte vectors */ | |
160 static __inline void vp8_filter_mips | |
161 ( | |
162 uint32_t mask, | |
163 uint32_t hev, | |
164 uint32_t *ps1, | |
165 uint32_t *ps0, | |
166 uint32_t *qs0, | |
167 uint32_t *qs1 | |
168 ) | |
169 { | |
170 int32_t vp8_filter_l, vp8_filter_r; | |
171 int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; | |
172 int32_t subr_r, subr_l; | |
173 uint32_t t1, t2, HWM, t3; | |
174 uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; | |
175 | |
176 int32_t vps1, vps0, vqs0, vqs1; | |
177 int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; | |
178 uint32_t N128; | |
179 | |
180 N128 = 0x80808080; | |
181 t1 = 0x03000300; | |
182 t2 = 0x04000400; | |
183 t3 = 0x01000100; | |
184 HWM = 0xFF00FF00; | |
185 | |
186 vps0 = (*ps0) ^ N128; | |
187 vps1 = (*ps1) ^ N128; | |
188 vqs0 = (*qs0) ^ N128; | |
189 vqs1 = (*qs1) ^ N128; | |
190 | |
    /* use halfword pairs instead of quad-bytes for accuracy */
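    /* Splitting: x_l keeps bytes 3 and 1 in the high byte of each halfword,
     * while x_r shifts bytes 2 and 0 up into the same position. With each
     * pixel in the top 8 bits of a 16-bit lane (and zeros below), the
     * saturating .ph operations reproduce the scalar 8-bit signed clamp on
     * the high byte.
     */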
192 vps0_l = vps0 & HWM; | |
193 vps0_r = vps0 << 8; | |
194 vps0_r = vps0_r & HWM; | |
195 | |
196 vps1_l = vps1 & HWM; | |
197 vps1_r = vps1 << 8; | |
198 vps1_r = vps1_r & HWM; | |
199 | |
200 vqs0_l = vqs0 & HWM; | |
201 vqs0_r = vqs0 << 8; | |
202 vqs0_r = vqs0_r & HWM; | |
203 | |
204 vqs1_l = vqs1 & HWM; | |
205 vqs1_r = vqs1 << 8; | |
206 vqs1_r = vqs1_r & HWM; | |
207 | |
208 mask_l = mask & HWM; | |
209 mask_r = mask << 8; | |
210 mask_r = mask_r & HWM; | |
211 | |
212 hev_l = hev & HWM; | |
213 hev_r = hev << 8; | |
214 hev_r = hev_r & HWM; | |
215 | |
216 __asm__ __volatile__ ( | |
217 /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */ | |
218 "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t" | |
219 "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t" | |
220 | |
221 /* qs0 - ps0 */ | |
222 "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" | |
223 "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" | |
224 | |
225 /* vp8_filter &= hev; */ | |
226 "and %[vp8_filter_l], %[vp8_filter_l], %[hev_l] \n\t" | |
227 "and %[vp8_filter_r], %[vp8_filter_r], %[hev_r] \n\t" | |
228 | |
229 /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */ | |
230 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" | |
231 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" | |
232 "xor %[invhev_l], %[hev_l], %[HWM] \n\t" | |
233 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" | |
234 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" | |
235 "xor %[invhev_r], %[hev_r], %[HWM] \n\t" | |
236 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" | |
237 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" | |
238 | |
239 /* vp8_filter &= mask; */ | |
240 "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t" | |
241 "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t" | |
242 | |
        : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=&r" (vp8_filter_r),
244 [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), | |
245 [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) | |
246 | |
247 : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), | |
248 [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), | |
249 [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), | |
250 [mask_l] "r" (mask_l), [mask_r] "r" (mask_r), | |
251 [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), | |
252 [HWM] "r" (HWM) | |
253 ); | |
254 | |
255 /* save bottom 3 bits so that we round one side +4 and the other +3 */ | |
256 __asm__ __volatile__ ( | |
        /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >> 3; */
258 "addq_s.ph %[Filter1_l], %[vp8_filter_l], %[t2] \n\t" | |
259 "addq_s.ph %[Filter1_r], %[vp8_filter_r], %[t2] \n\t" | |
260 | |
        /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >> 3; */
262 "addq_s.ph %[Filter2_l], %[vp8_filter_l], %[t1] \n\t" | |
263 "addq_s.ph %[Filter2_r], %[vp8_filter_r], %[t1] \n\t" | |
264 "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" | |
265 "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" | |
266 | |
267 "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" | |
268 "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" | |
269 | |
270 "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" | |
271 "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" | |
272 | |
273 /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */ | |
274 "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" | |
275 "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" | |
276 | |
277 /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */ | |
278 "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" | |
279 "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" | |
280 | |
281 : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), | |
282 [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), | |
283 [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), | |
284 [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) | |
285 | |
286 : [t1] "r" (t1), [t2] "r" (t2), | |
287 [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r), | |
288 [HWM] "r" (HWM) | |
289 ); | |
290 | |
291 __asm__ __volatile__ ( | |
        /* vp8_filter = (Filter1 + 1) >> 1 */
293 "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" | |
294 "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" | |
295 | |
296 /* vp8_filter &= ~hev; */ | |
297 "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" | |
298 "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" | |
299 | |
300 /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */ | |
301 "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" | |
302 "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" | |
303 | |
304 /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */ | |
305 "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" | |
306 "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" | |
307 | |
308 : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), | |
309 [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), | |
310 [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) | |
311 | |
312 : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) | |
313 ); | |
314 | |
315 /* Create quad-bytes from halfword pairs */ | |
316 vqs0_l = vqs0_l & HWM; | |
317 vqs1_l = vqs1_l & HWM; | |
318 vps0_l = vps0_l & HWM; | |
319 vps1_l = vps1_l & HWM; | |
320 | |
321 __asm__ __volatile__ ( | |
322 "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" | |
323 "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" | |
324 "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" | |
325 "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" | |
326 | |
327 : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), | |
328 [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) | |
329 : | |
330 ); | |
331 | |
332 vqs0 = vqs0_l | vqs0_r; | |
333 vqs1 = vqs1_l | vqs1_r; | |
334 vps0 = vps0_l | vps0_r; | |
335 vps1 = vps1_l | vps1_r; | |
336 | |
337 *ps0 = vps0 ^ N128; | |
338 *ps1 = vps1 ^ N128; | |
339 *qs0 = vqs0 ^ N128; | |
340 *qs1 = vqs1 ^ N128; | |
341 } | |
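/* Scalar equivalent of the filter applied above (matching the inline
 * comments; vp8_signed_char_clamp saturates to [-128, 127]):
 *
 *   vp8_filter = vp8_signed_char_clamp(ps1 - qs1) & hev;
 *   vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)) & mask;
 *   Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >> 3;
 *   Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >> 3;
 *   qs0 = vp8_signed_char_clamp(qs0 - Filter1);
 *   ps0 = vp8_signed_char_clamp(ps0 + Filter2);
 *   vp8_filter = ((Filter1 + 1) >> 1) & ~hev;
 *   qs1 = vp8_signed_char_clamp(qs1 - vp8_filter);
 *   ps1 = vp8_signed_char_clamp(ps1 + vp8_filter);
 */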
342 | |
343 void vp8_loop_filter_horizontal_edge_mips | |
344 ( | |
345 unsigned char *s, | |
346 int p, | |
347 unsigned int flimit, | |
348 unsigned int limit, | |
349 unsigned int thresh, | |
350 int count | |
351 ) | |
352 { | |
353 uint32_t mask; | |
354 uint32_t hev; | |
355 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; | |
356 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; | |
357 | |
358 mask = 0; | |
359 hev = 0; | |
360 p1 = 0; | |
361 p2 = 0; | |
362 p3 = 0; | |
363 p4 = 0; | |
364 | |
365 /* prefetch data for store */ | |
366 prefetch_store_lf(s); | |
367 | |
    /* the loop filter is designed to work on chars so that we can make maximum
     * use of 8-bit SIMD instructions.
     */
371 | |
372 sm1 = s - (p << 2); | |
373 s0 = s - p - p - p; | |
374 s1 = s - p - p ; | |
375 s2 = s - p; | |
376 s3 = s; | |
377 s4 = s + p; | |
378 s5 = s + p + p; | |
379 s6 = s + p + p + p; | |
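    /* sm1..s6 address rows -4..+3 relative to the edge row s (p is the pitch),
     * i.e. pixels p3 p2 p1 p0 | q0 q1 q2 q3 across the horizontal edge.
     */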
380 | |
381 /* load quad-byte vectors | |
382 * memory is 4 byte aligned | |
383 */ | |
384 p1 = *((uint32_t *)(s1)); | |
385 p2 = *((uint32_t *)(s2)); | |
386 p3 = *((uint32_t *)(s3)); | |
387 p4 = *((uint32_t *)(s4)); | |
388 | |
389 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
390 * mask will be zero and filtering is not needed | |
391 */ | |
392 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
393 { | |
394 | |
395 pm1 = *((uint32_t *)(sm1)); | |
396 p0 = *((uint32_t *)(s0)); | |
397 p5 = *((uint32_t *)(s5)); | |
398 p6 = *((uint32_t *)(s6)); | |
399 | |
400 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, | |
401 thresh, &hev, &mask); | |
402 | |
        /* if mask == 0, filtering is not needed */
404 if (mask) | |
405 { | |
406 /* filtering */ | |
407 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); | |
408 | |
409 /* unpack processed 4x4 neighborhood */ | |
410 *((uint32_t *)s1) = p1; | |
411 *((uint32_t *)s2) = p2; | |
412 *((uint32_t *)s3) = p3; | |
413 *((uint32_t *)s4) = p4; | |
414 } | |
415 } | |
416 | |
417 sm1 += 4; | |
418 s0 += 4; | |
419 s1 += 4; | |
420 s2 += 4; | |
421 s3 += 4; | |
422 s4 += 4; | |
423 s5 += 4; | |
424 s6 += 4; | |
425 | |
426 /* load quad-byte vectors | |
427 * memory is 4 byte aligned | |
428 */ | |
429 p1 = *((uint32_t *)(s1)); | |
430 p2 = *((uint32_t *)(s2)); | |
431 p3 = *((uint32_t *)(s3)); | |
432 p4 = *((uint32_t *)(s4)); | |
433 | |
434 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
435 * mask will be zero and filtering is not needed | |
436 */ | |
437 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
438 { | |
439 | |
440 pm1 = *((uint32_t *)(sm1)); | |
441 p0 = *((uint32_t *)(s0)); | |
442 p5 = *((uint32_t *)(s5)); | |
443 p6 = *((uint32_t *)(s6)); | |
444 | |
445 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, | |
446 thresh, &hev, &mask); | |
447 | |
        /* if mask == 0, filtering is not needed */
449 if (mask) | |
450 { | |
451 /* filtering */ | |
452 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); | |
453 | |
454 /* unpack processed 4x4 neighborhood */ | |
455 *((uint32_t *)s1) = p1; | |
456 *((uint32_t *)s2) = p2; | |
457 *((uint32_t *)s3) = p3; | |
458 *((uint32_t *)s4) = p4; | |
459 } | |
460 } | |
461 | |
462 sm1 += 4; | |
463 s0 += 4; | |
464 s1 += 4; | |
465 s2 += 4; | |
466 s3 += 4; | |
467 s4 += 4; | |
468 s5 += 4; | |
469 s6 += 4; | |
470 | |
471 /* load quad-byte vectors | |
472 * memory is 4 byte aligned | |
473 */ | |
474 p1 = *((uint32_t *)(s1)); | |
475 p2 = *((uint32_t *)(s2)); | |
476 p3 = *((uint32_t *)(s3)); | |
477 p4 = *((uint32_t *)(s4)); | |
478 | |
479 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
480 * mask will be zero and filtering is not needed | |
481 */ | |
482 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
483 { | |
484 | |
485 pm1 = *((uint32_t *)(sm1)); | |
486 p0 = *((uint32_t *)(s0)); | |
487 p5 = *((uint32_t *)(s5)); | |
488 p6 = *((uint32_t *)(s6)); | |
489 | |
490 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, | |
491 thresh, &hev, &mask); | |
492 | |
        /* if mask == 0, filtering is not needed */
494 if (mask) | |
495 { | |
496 /* filtering */ | |
497 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); | |
498 | |
499 /* unpack processed 4x4 neighborhood */ | |
500 *((uint32_t *)s1) = p1; | |
501 *((uint32_t *)s2) = p2; | |
502 *((uint32_t *)s3) = p3; | |
503 *((uint32_t *)s4) = p4; | |
504 } | |
505 } | |
506 | |
507 sm1 += 4; | |
508 s0 += 4; | |
509 s1 += 4; | |
510 s2 += 4; | |
511 s3 += 4; | |
512 s4 += 4; | |
513 s5 += 4; | |
514 s6 += 4; | |
515 | |
516 /* load quad-byte vectors | |
517 * memory is 4 byte aligned | |
518 */ | |
519 p1 = *((uint32_t *)(s1)); | |
520 p2 = *((uint32_t *)(s2)); | |
521 p3 = *((uint32_t *)(s3)); | |
522 p4 = *((uint32_t *)(s4)); | |
523 | |
524 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
525 * mask will be zero and filtering is not needed | |
526 */ | |
527 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
528 { | |
529 | |
530 pm1 = *((uint32_t *)(sm1)); | |
531 p0 = *((uint32_t *)(s0)); | |
532 p5 = *((uint32_t *)(s5)); | |
533 p6 = *((uint32_t *)(s6)); | |
534 | |
535 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, | |
536 thresh, &hev, &mask); | |
537 | |
        /* if mask == 0, filtering is not needed */
539 if (mask) | |
540 { | |
541 /* filtering */ | |
542 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); | |
543 | |
544 /* unpack processed 4x4 neighborhood */ | |
545 *((uint32_t *)s1) = p1; | |
546 *((uint32_t *)s2) = p2; | |
547 *((uint32_t *)s3) = p3; | |
548 *((uint32_t *)s4) = p4; | |
549 } | |
550 } | |
551 } | |
552 | |
553 void vp8_loop_filter_uvhorizontal_edge_mips | |
554 ( | |
555 unsigned char *s, | |
556 int p, | |
557 unsigned int flimit, | |
558 unsigned int limit, | |
559 unsigned int thresh, | |
560 int count | |
561 ) | |
562 { | |
563 uint32_t mask; | |
564 uint32_t hev; | |
565 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; | |
566 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; | |
567 | |
568 mask = 0; | |
569 hev = 0; | |
570 p1 = 0; | |
571 p2 = 0; | |
572 p3 = 0; | |
573 p4 = 0; | |
574 | |
    /* the loop filter is designed to work on chars so that we can make maximum
     * use of 8-bit SIMD instructions.
     */
578 | |
579 sm1 = s - (p << 2); | |
580 s0 = s - p - p - p; | |
581 s1 = s - p - p ; | |
582 s2 = s - p; | |
583 s3 = s; | |
584 s4 = s + p; | |
585 s5 = s + p + p; | |
586 s6 = s + p + p + p; | |
587 | |
588 /* load quad-byte vectors | |
589 * memory is 4 byte aligned | |
590 */ | |
591 p1 = *((uint32_t *)(s1)); | |
592 p2 = *((uint32_t *)(s2)); | |
593 p3 = *((uint32_t *)(s3)); | |
594 p4 = *((uint32_t *)(s4)); | |
595 | |
596 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
597 * mask will be zero and filtering is not needed | |
598 */ | |
599 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
600 { | |
601 | |
602 pm1 = *((uint32_t *)(sm1)); | |
603 p0 = *((uint32_t *)(s0)); | |
604 p5 = *((uint32_t *)(s5)); | |
605 p6 = *((uint32_t *)(s6)); | |
606 | |
607 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, | |
608 thresh, &hev, &mask); | |
609 | |
        /* if mask == 0, filtering is not needed */
611 if (mask) | |
612 { | |
613 /* filtering */ | |
614 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); | |
615 | |
616 /* unpack processed 4x4 neighborhood */ | |
617 *((uint32_t *)s1) = p1; | |
618 *((uint32_t *)s2) = p2; | |
619 *((uint32_t *)s3) = p3; | |
620 *((uint32_t *)s4) = p4; | |
621 } | |
622 } | |
623 | |
624 sm1 += 4; | |
625 s0 += 4; | |
626 s1 += 4; | |
627 s2 += 4; | |
628 s3 += 4; | |
629 s4 += 4; | |
630 s5 += 4; | |
631 s6 += 4; | |
632 | |
633 /* load quad-byte vectors | |
634 * memory is 4 byte aligned | |
635 */ | |
636 p1 = *((uint32_t *)(s1)); | |
637 p2 = *((uint32_t *)(s2)); | |
638 p3 = *((uint32_t *)(s3)); | |
639 p4 = *((uint32_t *)(s4)); | |
640 | |
641 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
642 * mask will be zero and filtering is not needed | |
643 */ | |
644 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
645 { | |
646 | |
647 pm1 = *((uint32_t *)(sm1)); | |
648 p0 = *((uint32_t *)(s0)); | |
649 p5 = *((uint32_t *)(s5)); | |
650 p6 = *((uint32_t *)(s6)); | |
651 | |
652 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, | |
653 thresh, &hev, &mask); | |
654 | |
        /* if mask == 0, filtering is not needed */
656 if (mask) | |
657 { | |
658 /* filtering */ | |
659 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); | |
660 | |
661 /* unpack processed 4x4 neighborhood */ | |
662 *((uint32_t *)s1) = p1; | |
663 *((uint32_t *)s2) = p2; | |
664 *((uint32_t *)s3) = p3; | |
665 *((uint32_t *)s4) = p4; | |
666 } | |
667 } | |
668 } | |
669 | |
670 void vp8_loop_filter_vertical_edge_mips | |
671 ( | |
672 unsigned char *s, | |
673 int p, | |
674 const unsigned int flimit, | |
675 const unsigned int limit, | |
676 const unsigned int thresh, | |
677 int count | |
678 ) | |
679 { | |
680 int i; | |
681 uint32_t mask, hev; | |
682 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; | |
683 unsigned char *s1, *s2, *s3, *s4; | |
684 uint32_t prim1, prim2, sec3, sec4, prim3, prim4; | |
685 | |
686 hev = 0; | |
687 mask = 0; | |
688 i = 0; | |
689 pm1 = 0; | |
690 p0 = 0; | |
691 p1 = 0; | |
692 p2 = 0; | |
693 p3 = 0; | |
694 p4 = 0; | |
695 p5 = 0; | |
696 p6 = 0; | |
697 | |
    /* the loop filter is designed to work on chars so that we can make maximum
     * use of 8-bit SIMD instructions.
     */
701 | |
    /* apply the filter to 4 pixels at a time */
703 do | |
704 { | |
705 | |
706 /* prefetch data for store */ | |
707 prefetch_store_lf(s + p); | |
708 | |
709 s1 = s; | |
710 s2 = s + p; | |
711 s3 = s2 + p; | |
712 s4 = s3 + p; | |
713 s = s4 + p; | |
714 | |
715 /* load quad-byte vectors | |
716 * memory is 4 byte aligned | |
717 */ | |
718 p2 = *((uint32_t *)(s1 - 4)); | |
719 p6 = *((uint32_t *)(s1)); | |
720 p1 = *((uint32_t *)(s2 - 4)); | |
721 p5 = *((uint32_t *)(s2)); | |
722 p0 = *((uint32_t *)(s3 - 4)); | |
723 p4 = *((uint32_t *)(s3)); | |
724 pm1 = *((uint32_t *)(s4 - 4)); | |
725 p3 = *((uint32_t *)(s4)); | |
726 | |
727 /* transpose pm1, p0, p1, p2 */ | |
728 __asm__ __volatile__ ( | |
729 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" | |
730 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" | |
731 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" | |
732 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" | |
733 | |
734 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" | |
735 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" | |
736 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" | |
737 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" | |
738 | |
739 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" | |
740 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" | |
741 "append %[p1], %[sec3], 16 \n\t" | |
742 "append %[pm1], %[sec4], 16 \n\t" | |
743 | |
744 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), | |
745 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), | |
746 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1), | |
747 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) | |
748 : | |
749 ); | |
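        /* The precrq/precr ladder above is a register transpose: two rounds
         * of odd/even byte picks turn the four row words into column data,
         * and the precrq.ph.w/append pair redistributes the halfwords so
         * pm1..p2 each end up holding one 4-pixel column.
         */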
750 | |
751 /* transpose p3, p4, p5, p6 */ | |
752 __asm__ __volatile__ ( | |
753 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" | |
754 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" | |
755 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" | |
756 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" | |
757 | |
758 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" | |
759 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" | |
760 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" | |
761 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" | |
762 | |
763 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" | |
764 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" | |
765 "append %[p5], %[sec3], 16 \n\t" | |
766 "append %[p3], %[sec4], 16 \n\t" | |
767 | |
768 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), | |
769 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), | |
770 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3), | |
771 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) | |
772 : | |
773 ); | |
774 | |
775 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
776 * mask will be zero and filtering is not needed | |
777 */ | |
778 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
779 { | |
780 | |
            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
782 thresh, &hev, &mask); | |
783 | |
            /* if mask == 0, filtering is not needed */
785 if (mask) | |
786 { | |
787 /* filtering */ | |
788 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); | |
789 | |
790 /* unpack processed 4x4 neighborhood | |
791 * don't use transpose on output data | |
792 * because memory isn't aligned | |
793 */ | |
794 __asm__ __volatile__ ( | |
795 "sb %[p4], 1(%[s4]) \n\t" | |
796 "sb %[p3], 0(%[s4]) \n\t" | |
797 "sb %[p2], -1(%[s4]) \n\t" | |
798 "sb %[p1], -2(%[s4]) \n\t" | |
799 : | |
800 : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4), | |
801 [p2] "r" (p2), [p1] "r" (p1) | |
802 ); | |
803 | |
804 __asm__ __volatile__ ( | |
805 "srl %[p4], %[p4], 8 \n\t" | |
806 "srl %[p3], %[p3], 8 \n\t" | |
807 "srl %[p2], %[p2], 8 \n\t" | |
808 "srl %[p1], %[p1], 8 \n\t" | |
                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
810 : | |
811 ); | |
812 | |
813 __asm__ __volatile__ ( | |
814 "sb %[p4], 1(%[s3]) \n\t" | |
815 "sb %[p3], 0(%[s3]) \n\t" | |
816 "sb %[p2], -1(%[s3]) \n\t" | |
817 "sb %[p1], -2(%[s3]) \n\t" | |
818 : [p1] "+r" (p1) | |
819 : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2) | |
820 ); | |
821 | |
822 __asm__ __volatile__ ( | |
823 "srl %[p4], %[p4], 8 \n\t" | |
824 "srl %[p3], %[p3], 8 \n\t" | |
825 "srl %[p2], %[p2], 8 \n\t" | |
826 "srl %[p1], %[p1], 8 \n\t" | |
                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
828 : | |
829 ); | |
830 | |
831 __asm__ __volatile__ ( | |
832 "sb %[p4], 1(%[s2]) \n\t" | |
833 "sb %[p3], 0(%[s2]) \n\t" | |
834 "sb %[p2], -1(%[s2]) \n\t" | |
835 "sb %[p1], -2(%[s2]) \n\t" | |
836 : | |
837 : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2), | |
838 [p2] "r" (p2), [p1] "r" (p1) | |
839 ); | |
840 | |
841 __asm__ __volatile__ ( | |
842 "srl %[p4], %[p4], 8 \n\t" | |
843 "srl %[p3], %[p3], 8 \n\t" | |
844 "srl %[p2], %[p2], 8 \n\t" | |
845 "srl %[p1], %[p1], 8 \n\t" | |
                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
847 : | |
848 ); | |
849 | |
850 __asm__ __volatile__ ( | |
851 "sb %[p4], 1(%[s1]) \n\t" | |
852 "sb %[p3], 0(%[s1]) \n\t" | |
853 "sb %[p2], -1(%[s1]) \n\t" | |
854 "sb %[p1], -2(%[s1]) \n\t" | |
855 : | |
856 : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1), | |
857 [p2] "r" (p2), [p1] "r" (p1) | |
858 ); | |
859 } | |
860 } | |
861 | |
862 s1 = s; | |
863 s2 = s + p; | |
864 s3 = s2 + p; | |
865 s4 = s3 + p; | |
866 s = s4 + p; | |
867 | |
868 /* load quad-byte vectors | |
869 * memory is 4 byte aligned | |
870 */ | |
871 p2 = *((uint32_t *)(s1 - 4)); | |
872 p6 = *((uint32_t *)(s1)); | |
873 p1 = *((uint32_t *)(s2 - 4)); | |
874 p5 = *((uint32_t *)(s2)); | |
875 p0 = *((uint32_t *)(s3 - 4)); | |
876 p4 = *((uint32_t *)(s3)); | |
877 pm1 = *((uint32_t *)(s4 - 4)); | |
878 p3 = *((uint32_t *)(s4)); | |
879 | |
880 /* transpose pm1, p0, p1, p2 */ | |
881 __asm__ __volatile__ ( | |
882 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" | |
883 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" | |
884 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" | |
885 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" | |
886 | |
887 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" | |
888 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" | |
889 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" | |
890 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" | |
891 | |
892 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" | |
893 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" | |
894 "append %[p1], %[sec3], 16 \n\t" | |
895 "append %[pm1], %[sec4], 16 \n\t" | |
896 | |
897 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), | |
898 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), | |
899 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1), | |
900 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) | |
901 : | |
902 ); | |
903 | |
904 /* transpose p3, p4, p5, p6 */ | |
905 __asm__ __volatile__ ( | |
906 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" | |
907 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" | |
908 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" | |
909 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" | |
910 | |
911 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" | |
912 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" | |
913 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" | |
914 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" | |
915 | |
916 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" | |
917 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" | |
918 "append %[p5], %[sec3], 16 \n\t" | |
919 "append %[p3], %[sec4], 16 \n\t" | |
920 | |
921 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), | |
922 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), | |
923 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3), | |
924 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) | |
925 : | |
926 ); | |
927 | |
928 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
929 * mask will be zero and filtering is not needed | |
930 */ | |
931 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
932 { | |
933 | |
            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
935 thresh, &hev, &mask); | |
936 | |
            /* if mask == 0, filtering is not needed */
938 if (mask) | |
939 { | |
940 /* filtering */ | |
941 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); | |
942 | |
943 /* unpack processed 4x4 neighborhood | |
944 * don't use transpose on output data | |
945 * because memory isn't aligned | |
946 */ | |
947 __asm__ __volatile__ ( | |
948 "sb %[p4], 1(%[s4]) \n\t" | |
949 "sb %[p3], 0(%[s4]) \n\t" | |
950 "sb %[p2], -1(%[s4]) \n\t" | |
951 "sb %[p1], -2(%[s4]) \n\t" | |
952 : | |
953 : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4), | |
954 [p2] "r" (p2), [p1] "r" (p1) | |
955 ); | |
956 | |
957 __asm__ __volatile__ ( | |
958 "srl %[p4], %[p4], 8 \n\t" | |
959 "srl %[p3], %[p3], 8 \n\t" | |
960 "srl %[p2], %[p2], 8 \n\t" | |
961 "srl %[p1], %[p1], 8 \n\t" | |
                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
963 : | |
964 ); | |
965 | |
966 __asm__ __volatile__ ( | |
967 "sb %[p4], 1(%[s3]) \n\t" | |
968 "sb %[p3], 0(%[s3]) \n\t" | |
969 "sb %[p2], -1(%[s3]) \n\t" | |
970 "sb %[p1], -2(%[s3]) \n\t" | |
971 : [p1] "+r" (p1) | |
972 : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2) | |
973 ); | |
974 | |
975 __asm__ __volatile__ ( | |
976 "srl %[p4], %[p4], 8 \n\t" | |
977 "srl %[p3], %[p3], 8 \n\t" | |
978 "srl %[p2], %[p2], 8 \n\t" | |
979 "srl %[p1], %[p1], 8 \n\t" | |
                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
981 : | |
982 ); | |
983 | |
984 __asm__ __volatile__ ( | |
985 "sb %[p4], 1(%[s2]) \n\t" | |
986 "sb %[p3], 0(%[s2]) \n\t" | |
987 "sb %[p2], -1(%[s2]) \n\t" | |
988 "sb %[p1], -2(%[s2]) \n\t" | |
989 : | |
990 : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2), | |
991 [p2] "r" (p2), [p1] "r" (p1) | |
992 ); | |
993 | |
994 __asm__ __volatile__ ( | |
995 "srl %[p4], %[p4], 8 \n\t" | |
996 "srl %[p3], %[p3], 8 \n\t" | |
997 "srl %[p2], %[p2], 8 \n\t" | |
998 "srl %[p1], %[p1], 8 \n\t" | |
                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
1000 : | |
1001 ); | |
1002 | |
1003 __asm__ __volatile__ ( | |
1004 "sb %[p4], 1(%[s1]) \n\t" | |
1005 "sb %[p3], 0(%[s1]) \n\t" | |
1006 "sb %[p2], -1(%[s1]) \n\t" | |
1007 "sb %[p1], -2(%[s1]) \n\t" | |
1008 : | |
1009 : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1), | |
1010 [p2] "r" (p2), [p1] "r" (p1) | |
1011 ); | |
1012 } | |
1013 } | |
1014 | |
1015 i += 8; | |
1016 } | |
1017 | |
1018 while (i < count); | |
1019 } | |
1020 | |
1021 void vp8_loop_filter_uvvertical_edge_mips | |
1022 ( | |
1023 unsigned char *s, | |
1024 int p, | |
1025 unsigned int flimit, | |
1026 unsigned int limit, | |
1027 unsigned int thresh, | |
1028 int count | |
1029 ) | |
1030 { | |
1031 uint32_t mask, hev; | |
1032 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; | |
1033 unsigned char *s1, *s2, *s3, *s4; | |
1034 uint32_t prim1, prim2, sec3, sec4, prim3, prim4; | |
1035 | |
    /* the loop filter is designed to work on chars so that we can make maximum
     * use of 8-bit SIMD instructions.
     */
1039 | |
    /* apply the filter to 4 pixels at a time */
1041 | |
1042 s1 = s; | |
1043 s2 = s + p; | |
1044 s3 = s2 + p; | |
1045 s4 = s3 + p; | |
1046 | |
1047 /* load quad-byte vectors | |
1048 * memory is 4 byte aligned | |
1049 */ | |
1050 p2 = *((uint32_t *)(s1 - 4)); | |
1051 p6 = *((uint32_t *)(s1)); | |
1052 p1 = *((uint32_t *)(s2 - 4)); | |
1053 p5 = *((uint32_t *)(s2)); | |
1054 p0 = *((uint32_t *)(s3 - 4)); | |
1055 p4 = *((uint32_t *)(s3)); | |
1056 pm1 = *((uint32_t *)(s4 - 4)); | |
1057 p3 = *((uint32_t *)(s4)); | |
1058 | |
1059 /* transpose pm1, p0, p1, p2 */ | |
1060 __asm__ __volatile__ ( | |
1061 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" | |
1062 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" | |
1063 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" | |
1064 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" | |
1065 | |
1066 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" | |
1067 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" | |
1068 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" | |
1069 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" | |
1070 | |
1071 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" | |
1072 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" | |
1073 "append %[p1], %[sec3], 16 \n\t" | |
1074 "append %[pm1], %[sec4], 16 \n\t" | |
1075 | |
1076 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), | |
1077 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), | |
1078 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1), | |
1079 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) | |
1080 : | |
1081 ); | |
1082 | |
1083 /* transpose p3, p4, p5, p6 */ | |
1084 __asm__ __volatile__ ( | |
1085 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" | |
1086 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" | |
1087 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" | |
1088 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" | |
1089 | |
1090 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" | |
1091 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" | |
1092 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" | |
1093 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" | |
1094 | |
1095 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" | |
1096 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" | |
1097 "append %[p5], %[sec3], 16 \n\t" | |
1098 "append %[p3], %[sec4], 16 \n\t" | |
1099 | |
1100 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), | |
1101 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), | |
1102 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3), | |
1103 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) | |
1104 : | |
1105 ); | |
1106 | |
1107 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
1108 * mask will be zero and filtering is not needed | |
1109 */ | |
1110 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
1111 { | |
1112 | |
1113 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, | |
1114 thresh, &hev, &mask); | |
1115 | |
        /* if mask == 0, filtering is not needed */
1117 if (mask) | |
1118 { | |
1119 /* filtering */ | |
1120 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); | |
1121 | |
1122 /* unpack processed 4x4 neighborhood | |
1123 * don't use transpose on output data | |
1124 * because memory isn't aligned | |
1125 */ | |
1126 __asm__ __volatile__ ( | |
1127 "sb %[p4], 1(%[s4]) \n\t" | |
1128 "sb %[p3], 0(%[s4]) \n\t" | |
1129 "sb %[p2], -1(%[s4]) \n\t" | |
1130 "sb %[p1], -2(%[s4]) \n\t" | |
1131 : | |
1132 : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4), | |
1133 [p2] "r" (p2), [p1] "r" (p1) | |
1134 ); | |
1135 | |
1136 __asm__ __volatile__ ( | |
1137 "srl %[p4], %[p4], 8 \n\t" | |
1138 "srl %[p3], %[p3], 8 \n\t" | |
1139 "srl %[p2], %[p2], 8 \n\t" | |
1140 "srl %[p1], %[p1], 8 \n\t" | |
1141 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) | |
1142 : | |
1143 ); | |
1144 | |
1145 __asm__ __volatile__ ( | |
1146 "sb %[p4], 1(%[s3]) \n\t" | |
1147 "sb %[p3], 0(%[s3]) \n\t" | |
1148 "sb %[p2], -1(%[s3]) \n\t" | |
1149 "sb %[p1], -2(%[s3]) \n\t" | |
1150 : [p1] "+r" (p1) | |
1151 : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2) | |
1152 ); | |
1153 | |
1154 __asm__ __volatile__ ( | |
1155 "srl %[p4], %[p4], 8 \n\t" | |
1156 "srl %[p3], %[p3], 8 \n\t" | |
1157 "srl %[p2], %[p2], 8 \n\t" | |
1158 "srl %[p1], %[p1], 8 \n\t" | |
1159 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) | |
1160 : | |
1161 ); | |
1162 | |
1163 __asm__ __volatile__ ( | |
1164 "sb %[p4], 1(%[s2]) \n\t" | |
1165 "sb %[p3], 0(%[s2]) \n\t" | |
1166 "sb %[p2], -1(%[s2]) \n\t" | |
1167 "sb %[p1], -2(%[s2]) \n\t" | |
1168 : | |
1169 : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2), | |
1170 [p2] "r" (p2), [p1] "r" (p1) | |
1171 ); | |
1172 | |
1173 __asm__ __volatile__ ( | |
1174 "srl %[p4], %[p4], 8 \n\t" | |
1175 "srl %[p3], %[p3], 8 \n\t" | |
1176 "srl %[p2], %[p2], 8 \n\t" | |
1177 "srl %[p1], %[p1], 8 \n\t" | |
1178 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) | |
1179 : | |
1180 ); | |
1181 | |
1182 __asm__ __volatile__ ( | |
1183 "sb %[p4], 1(%[s1]) \n\t" | |
1184 "sb %[p3], 0(%[s1]) \n\t" | |
1185 "sb %[p2], -1(%[s1]) \n\t" | |
1186 "sb %[p1], -2(%[s1]) \n\t" | |
1187 : | |
                : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1), [p2] "r" (p2), [p1] "r" (p1)
1189 ); | |
1190 } | |
1191 } | |
1192 | |
1193 s1 = s4 + p; | |
1194 s2 = s1 + p; | |
1195 s3 = s2 + p; | |
1196 s4 = s3 + p; | |
1197 | |
1198 /* load quad-byte vectors | |
1199 * memory is 4 byte aligned | |
1200 */ | |
1201 p2 = *((uint32_t *)(s1 - 4)); | |
1202 p6 = *((uint32_t *)(s1)); | |
1203 p1 = *((uint32_t *)(s2 - 4)); | |
1204 p5 = *((uint32_t *)(s2)); | |
1205 p0 = *((uint32_t *)(s3 - 4)); | |
1206 p4 = *((uint32_t *)(s3)); | |
1207 pm1 = *((uint32_t *)(s4 - 4)); | |
1208 p3 = *((uint32_t *)(s4)); | |
1209 | |
1210 /* transpose pm1, p0, p1, p2 */ | |
1211 __asm__ __volatile__ ( | |
1212 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" | |
1213 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" | |
1214 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" | |
1215 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" | |
1216 | |
1217 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" | |
1218 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" | |
1219 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" | |
1220 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" | |
1221 | |
1222 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" | |
1223 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" | |
1224 "append %[p1], %[sec3], 16 \n\t" | |
1225 "append %[pm1], %[sec4], 16 \n\t" | |
1226 | |
1227 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), | |
1228 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), | |
1229 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1), | |
1230 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) | |
1231 : | |
1232 ); | |
1233 | |
1234 /* transpose p3, p4, p5, p6 */ | |
1235 __asm__ __volatile__ ( | |
1236 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" | |
1237 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" | |
1238 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" | |
1239 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" | |
1240 | |
1241 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" | |
1242 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" | |
1243 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" | |
1244 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" | |
1245 | |
1246 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" | |
1247 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" | |
1248 "append %[p5], %[sec3], 16 \n\t" | |
1249 "append %[p3], %[sec4], 16 \n\t" | |
1250 | |
1251 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), | |
1252 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), | |
1253 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3), | |
1254 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) | |
1255 : | |
1256 ); | |
1257 | |
1258 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
1259 * mask will be zero and filtering is not needed | |
1260 */ | |
1261 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
1262 { | |
1263 | |
1264 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, | |
1265 thresh, &hev, &mask); | |
1266 | |
        /* if mask == 0, filtering is not needed */
1268 if (mask) | |
1269 { | |
1270 /* filtering */ | |
1271 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); | |
1272 | |
1273 /* unpack processed 4x4 neighborhood | |
1274 * don't use transpose on output data | |
1275 * because memory isn't aligned | |
1276 */ | |
1277 __asm__ __volatile__ ( | |
1278 "sb %[p4], 1(%[s4]) \n\t" | |
1279 "sb %[p3], 0(%[s4]) \n\t" | |
1280 "sb %[p2], -1(%[s4]) \n\t" | |
1281 "sb %[p1], -2(%[s4]) \n\t" | |
1282 : | |
1283 : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4), | |
1284 [p2] "r" (p2), [p1] "r" (p1) | |
1285 ); | |
1286 | |
1287 __asm__ __volatile__ ( | |
1288 "srl %[p4], %[p4], 8 \n\t" | |
1289 "srl %[p3], %[p3], 8 \n\t" | |
1290 "srl %[p2], %[p2], 8 \n\t" | |
1291 "srl %[p1], %[p1], 8 \n\t" | |
1292 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) | |
1293 : | |
1294 ); | |
1295 | |
1296 __asm__ __volatile__ ( | |
1297 "sb %[p4], 1(%[s3]) \n\t" | |
1298 "sb %[p3], 0(%[s3]) \n\t" | |
1299 "sb %[p2], -1(%[s3]) \n\t" | |
1300 "sb %[p1], -2(%[s3]) \n\t" | |
1301 : [p1] "+r" (p1) | |
1302 : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2) | |
1303 ); | |
1304 | |
1305 __asm__ __volatile__ ( | |
1306 "srl %[p4], %[p4], 8 \n\t" | |
1307 "srl %[p3], %[p3], 8 \n\t" | |
1308 "srl %[p2], %[p2], 8 \n\t" | |
1309 "srl %[p1], %[p1], 8 \n\t" | |
1310 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) | |
1311 : | |
1312 ); | |
1313 | |
1314 __asm__ __volatile__ ( | |
1315 "sb %[p4], 1(%[s2]) \n\t" | |
1316 "sb %[p3], 0(%[s2]) \n\t" | |
1317 "sb %[p2], -1(%[s2]) \n\t" | |
1318 "sb %[p1], -2(%[s2]) \n\t" | |
1319 : | |
1320 : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2), | |
1321 [p2] "r" (p2), [p1] "r" (p1) | |
1322 ); | |
1323 | |
1324 __asm__ __volatile__ ( | |
1325 "srl %[p4], %[p4], 8 \n\t" | |
1326 "srl %[p3], %[p3], 8 \n\t" | |
1327 "srl %[p2], %[p2], 8 \n\t" | |
1328 "srl %[p1], %[p1], 8 \n\t" | |
1329 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) | |
1330 : | |
1331 ); | |
1332 | |
1333 __asm__ __volatile__ ( | |
1334 "sb %[p4], 1(%[s1]) \n\t" | |
1335 "sb %[p3], 0(%[s1]) \n\t" | |
1336 "sb %[p2], -1(%[s1]) \n\t" | |
1337 "sb %[p1], -2(%[s1]) \n\t" | |
1338 : | |
1339 : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1), | |
1340 [p2] "r" (p2), [p1] "r" (p1) | |
1341 ); | |
1342 } | |
1343 } | |
1344 } | |
1345 | |
1346 /* inputs & outputs are quad-byte vectors */ | |
1347 static __inline void vp8_mbfilter_mips | |
1348 ( | |
1349 uint32_t mask, | |
1350 uint32_t hev, | |
1351 uint32_t *ps2, | |
1352 uint32_t *ps1, | |
1353 uint32_t *ps0, | |
1354 uint32_t *qs0, | |
1355 uint32_t *qs1, | |
1356 uint32_t *qs2 | |
1357 ) | |
1358 { | |
1359 int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2; | |
1360 int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l; | |
1361 int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r; | |
    uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r, subr_r, subr_l;
    uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l, invhev_r;
1364 uint32_t N128, R63; | |
1365 uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r; | |
1366 | |
1367 R63 = 0x003F003F; | |
1368 HWM = 0xFF00FF00; | |
1369 N128 = 0x80808080; | |
1370 t1 = 0x03000300; | |
1371 t2 = 0x04000400; | |
1372 | |
1373 vps0 = (*ps0) ^ N128; | |
1374 vps1 = (*ps1) ^ N128; | |
1375 vps2 = (*ps2) ^ N128; | |
1376 vqs0 = (*qs0) ^ N128; | |
1377 vqs1 = (*qs1) ^ N128; | |
1378 vqs2 = (*qs2) ^ N128; | |
1379 | |
    /* use halfword pairs instead of quad-bytes for accuracy */
1381 vps0_l = vps0 & HWM; | |
1382 vps0_r = vps0 << 8; | |
1383 vps0_r = vps0_r & HWM; | |
1384 | |
1385 vqs0_l = vqs0 & HWM; | |
1386 vqs0_r = vqs0 << 8; | |
1387 vqs0_r = vqs0_r & HWM; | |
1388 | |
1389 vps1_l = vps1 & HWM; | |
1390 vps1_r = vps1 << 8; | |
1391 vps1_r = vps1_r & HWM; | |
1392 | |
1393 vqs1_l = vqs1 & HWM; | |
1394 vqs1_r = vqs1 << 8; | |
1395 vqs1_r = vqs1_r & HWM; | |
1396 | |
1397 vqs2_l = vqs2 & HWM; | |
1398 vqs2_r = vqs2 << 8; | |
1399 vqs2_r = vqs2_r & HWM; | |
1400 | |
1401 __asm__ __volatile__ ( | |
1402 /* qs0 - ps0 */ | |
1403 "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" | |
1404 "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" | |
1405 | |
1406 /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */ | |
1407 "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t" | |
1408 "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t" | |
1409 | |
        : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=r" (vp8_filter_r),
1411 [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r) | |
1412 : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), | |
1413 [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), | |
1414 [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r) | |
1415 ); | |
1416 | |
1417 vps2_l = vps2 & HWM; | |
1418 vps2_r = vps2 << 8; | |
1419 vps2_r = vps2_r & HWM; | |
1420 | |
1421 /* add outer taps if we have high edge variance */ | |
1422 __asm__ __volatile__ ( | |
1423 /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */ | |
1424 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" | |
1425 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" | |
1426 "and %[mask_l], %[HWM], %[mask] \n\t" | |
1427 "sll %[mask_r], %[mask], 8 \n\t" | |
1428 "and %[mask_r], %[HWM], %[mask_r] \n\t" | |
1429 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" | |
1430 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" | |
1431 "and %[hev_l], %[HWM], %[hev] \n\t" | |
1432 "sll %[hev_r], %[hev], 8 \n\t" | |
1433 "and %[hev_r], %[HWM], %[hev_r] \n\t" | |
1434 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" | |
1435 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" | |
1436 | |
1437 /* vp8_filter &= mask; */ | |
1438 "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t" | |
1439 "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t" | |
1440 | |
1441 /* Filter2 = vp8_filter & hev; */ | |
1442 "and %[Filter2_l], %[vp8_filter_l], %[hev_l] \n\t" | |
1443 "and %[Filter2_r], %[vp8_filter_r], %[hev_r] \n\t" | |
1444 | |
        : [vp8_filter_l] "+r" (vp8_filter_l), [vp8_filter_r] "+r" (vp8_filter_r),
1446 [hev_l] "=&r" (hev_l), [hev_r] "=&r" (hev_r), | |
1447 [mask_l] "=&r" (mask_l), [mask_r] "=&r" (mask_r), | |
1448 [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r) | |
1449 : [subr_l] "r" (subr_l), [subr_r] "r" (subr_r), | |
1450 [HWM] "r" (HWM), [hev] "r" (hev), [mask] "r" (mask) | |
1451 ); | |
1452 | |
1453 /* save bottom 3 bits so that we round one side +4 and the other +3 */ | |
1454 __asm__ __volatile__ ( | |
        /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >> 3; */
1456 "addq_s.ph %[Filter1_l], %[Filter2_l], %[t2] \n\t" | |
1457 "xor %[invhev_l], %[hev_l], %[HWM] \n\t" | |
1458 "addq_s.ph %[Filter1_r], %[Filter2_r], %[t2] \n\t" | |
1459 | |
        /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >> 3; */
1461 "addq_s.ph %[Filter2_l], %[Filter2_l], %[t1] \n\t" | |
1462 "addq_s.ph %[Filter2_r], %[Filter2_r], %[t1] \n\t" | |
1463 | |
1464 "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" | |
1465 "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" | |
1466 | |
1467 "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" | |
1468 "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" | |
1469 "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" | |
1470 "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" | |
1471 "xor %[invhev_r], %[hev_r], %[HWM] \n\t" | |
1472 | |
1473 /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */ | |
1474 "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" | |
1475 "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" | |
1476 | |
1477 /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */ | |
1478 "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" | |
1479 "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" | |
1480 | |
1481 : [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r), | |
1482 [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), | |
1483 [Filter2_l] "+r" (Filter2_l), [Filter2_r] "+r" (Filter2_r), | |
1484 [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), | |
1485 [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) | |
1486 : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), | |
1487 [hev_l] "r" (hev_l), [hev_r] "r" (hev_r) | |
1488 ); | |
1489 | |
1490 /* only apply wider filter if not high edge variance */ | |
1491 __asm__ __volatile__ ( | |
1492 /* vp8_filter &= ~hev; */ | |
1493 "and %[Filter2_l], %[vp8_filter_l], %[invhev_l] \n\t" | |
1494 "and %[Filter2_r], %[vp8_filter_r], %[invhev_r] \n\t" | |
1495 | |
1496 "shra.ph %[Filter2_l], %[Filter2_l], 8 \n\t" | |
1497 "shra.ph %[Filter2_r], %[Filter2_r], 8 \n\t" | |
1498 | |
1499 : [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r) | |
1500 : [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r), | |
1501 [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) | |
1502 ); | |
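    /* shra.ph by 8 moves each value from the high byte of its halfword back
     * to a sign-extended 16-bit integer, so the shift-and-add multiplies
     * below (9x = (x << 3) + x, 18x = 9x << 1, 27x = 9x + 18x) fit in the
     * lanes without overflow.
     */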
1503 | |
1504 /* roughly 3/7th difference across boundary */ | |
1505 __asm__ __volatile__ ( | |
1506 "shll.ph %[u3_l], %[Filter2_l], 3 \n\t" | |
1507 "shll.ph %[u3_r], %[Filter2_r], 3 \n\t" | |
1508 | |
1509 "addq.ph %[u3_l], %[u3_l], %[Filter2_l] \n\t" | |
1510 "addq.ph %[u3_r], %[u3_r], %[Filter2_r] \n\t" | |
1511 | |
1512 "shll.ph %[u2_l], %[u3_l], 1 \n\t" | |
1513 "shll.ph %[u2_r], %[u3_r], 1 \n\t" | |
1514 | |
1515 "addq.ph %[u1_l], %[u3_l], %[u2_l] \n\t" | |
1516 "addq.ph %[u1_r], %[u3_r], %[u2_r] \n\t" | |
1517 | |
1518 "addq.ph %[u2_l], %[u2_l], %[R63] \n\t" | |
1519 "addq.ph %[u2_r], %[u2_r], %[R63] \n\t" | |
1520 | |
1521 "addq.ph %[u3_l], %[u3_l], %[R63] \n\t" | |
1522 "addq.ph %[u3_r], %[u3_r], %[R63] \n\t" | |
1523 | |
1524 /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7) | |
1525 * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7) | |
1526 */ | |
1527 "addq.ph %[u1_l], %[u1_l], %[R63] \n\t" | |
1528 "addq.ph %[u1_r], %[u1_r], %[R63] \n\t" | |
1529 "shra.ph %[u1_l], %[u1_l], 7 \n\t" | |
1530 "shra.ph %[u1_r], %[u1_r], 7 \n\t" | |
1531 "shra.ph %[u2_l], %[u2_l], 7 \n\t" | |
1532 "shra.ph %[u2_r], %[u2_r], 7 \n\t" | |
1533 "shll.ph %[u1_l], %[u1_l], 8 \n\t" | |
1534 "shll.ph %[u1_r], %[u1_r], 8 \n\t" | |
1535 "shll.ph %[u2_l], %[u2_l], 8 \n\t" | |
1536 "shll.ph %[u2_r], %[u2_r], 8 \n\t" | |
1537 | |
1538 /* vqs0 = vp8_signed_char_clamp(qs0 - u); */ | |
1539 "subq_s.ph %[vqs0_l], %[vqs0_l], %[u1_l] \n\t" | |
1540 "subq_s.ph %[vqs0_r], %[vqs0_r], %[u1_r] \n\t" | |
1541 | |
1542 /* vps0 = vp8_signed_char_clamp(ps0 + u); */ | |
1543 "addq_s.ph %[vps0_l], %[vps0_l], %[u1_l] \n\t" | |
1544 "addq_s.ph %[vps0_r], %[vps0_r], %[u1_r] \n\t" | |
1545 | |
1546 : [u1_l] "=&r" (u1_l), [u1_r] "=&r" (u1_r), [u2_l] "=&r" (u2_l), | |
1547 [u2_r] "=&r" (u2_r), [u3_l] "=&r" (u3_l), [u3_r] "=&r" (u3_r), | |
1548 [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), | |
1549 [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) | |
1550 : [R63] "r" (R63), | |
1551 [Filter2_l] "r" (Filter2_l), [Filter2_r] "r" (Filter2_r) | |
1552 ); | |
1553 | |
1554 __asm__ __volatile__ ( | |
1555 /* vqs1 = vp8_signed_char_clamp(qs1 - u); */ | |
1556 "subq_s.ph %[vqs1_l], %[vqs1_l], %[u2_l] \n\t" | |
1557 "addq_s.ph %[vps1_l], %[vps1_l], %[u2_l] \n\t" | |
1558 | |
1559 /* vps1 = vp8_signed_char_clamp(ps1 + u); */ | |
1560 "addq_s.ph %[vps1_r], %[vps1_r], %[u2_r] \n\t" | |
1561 "subq_s.ph %[vqs1_r], %[vqs1_r], %[u2_r] \n\t" | |
1562 | |
1563 : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), | |
1564 [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) | |
1565 : [u2_l] "r" (u2_l), [u2_r] "r" (u2_r) | |
1566 ); | |
1567 | |
1568 /* roughly 1/7th difference across boundary */ | |
1569 __asm__ __volatile__ ( | |
1570 /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */ | |
1571 "shra.ph %[u3_l], %[u3_l], 7 \n\t" | |
1572 "shra.ph %[u3_r], %[u3_r], 7 \n\t" | |
1573 "shll.ph %[u3_l], %[u3_l], 8 \n\t" | |
1574 "shll.ph %[u3_r], %[u3_r], 8 \n\t" | |
1575 | |
1576 /* vqs2 = vp8_signed_char_clamp(qs2 - u); */ | |
1577 "subq_s.ph %[vqs2_l], %[vqs2_l], %[u3_l] \n\t" | |
1578 "subq_s.ph %[vqs2_r], %[vqs2_r], %[u3_r] \n\t" | |
1579 | |
1580 /* vps2 = vp8_signed_char_clamp(ps2 + u); */ | |
1581 "addq_s.ph %[vps2_l], %[vps2_l], %[u3_l] \n\t" | |
1582 "addq_s.ph %[vps2_r], %[vps2_r], %[u3_r] \n\t" | |
1583 | |
1584 : [u3_l] "+r" (u3_l), [u3_r] "+r" (u3_r), [vps2_l] "+r" (vps2_l), | |
1585 [vps2_r] "+r" (vps2_r), [vqs2_l] "+r" (vqs2_l), [vqs2_r] "+r" (vqs2_r) | |
1586 : | |
1587 ); | |
1588 | |
1589 /* Create quad-bytes from halfword pairs */ | |
1590 __asm__ __volatile__ ( | |
1591 "and %[vqs0_l], %[vqs0_l], %[HWM] \n\t" | |
1592 "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" | |
1593 | |
1594 "and %[vps0_l], %[vps0_l], %[HWM] \n\t" | |
1595 "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" | |
1596 | |
1597 "and %[vqs1_l], %[vqs1_l], %[HWM] \n\t" | |
1598 "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" | |
1599 | |
1600 "and %[vps1_l], %[vps1_l], %[HWM] \n\t" | |
1601 "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" | |
1602 | |
1603 "and %[vqs2_l], %[vqs2_l], %[HWM] \n\t" | |
1604 "shrl.ph %[vqs2_r], %[vqs2_r], 8 \n\t" | |
1605 | |
1606 "and %[vps2_l], %[vps2_l], %[HWM] \n\t" | |
1607 "shrl.ph %[vps2_r], %[vps2_r], 8 \n\t" | |
1608 | |
1609 "or %[vqs0_r], %[vqs0_l], %[vqs0_r] \n\t" | |
1610 "or %[vps0_r], %[vps0_l], %[vps0_r] \n\t" | |
1611 "or %[vqs1_r], %[vqs1_l], %[vqs1_r] \n\t" | |
1612 "or %[vps1_r], %[vps1_l], %[vps1_r] \n\t" | |
1613 "or %[vqs2_r], %[vqs2_l], %[vqs2_r] \n\t" | |
1614 "or %[vps2_r], %[vps2_l], %[vps2_r] \n\t" | |
1615 | |
        : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), [vqs1_l] "+r" (vqs1_l),
          [vqs1_r] "+r" (vqs1_r), [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r), [vqs2_l] "+r" (vqs2_l),
          [vqs2_r] "+r" (vqs2_r), [vps2_r] "+r" (vps2_r), [vps2_l] "+r" (vps2_l)
1620 : [HWM] "r" (HWM) | |
1621 ); | |
1622 | |
1623 *ps0 = vps0_r ^ N128; | |
1624 *ps1 = vps1_r ^ N128; | |
1625 *ps2 = vps2_r ^ N128; | |
1626 *qs0 = vqs0_r ^ N128; | |
1627 *qs1 = vqs1_r ^ N128; | |
1628 *qs2 = vqs2_r ^ N128; | |
1629 } | |
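/* Scalar equivalent of the wider-filter tail above (matching the inline
 * comments), with Filter2 = vp8_filter & ~hev:
 *
 *   u   = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
 *   qs0 = vp8_signed_char_clamp(qs0 - u);  ps0 = vp8_signed_char_clamp(ps0 + u);
 *   u   = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
 *   qs1 = vp8_signed_char_clamp(qs1 - u);  ps1 = vp8_signed_char_clamp(ps1 + u);
 *   u   = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
 *   qs2 = vp8_signed_char_clamp(qs2 - u);  ps2 = vp8_signed_char_clamp(ps2 + u);
 */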
1630 | |
1631 void vp8_mbloop_filter_horizontal_edge_mips | |
1632 ( | |
1633 unsigned char *s, | |
1634 int p, | |
1635 unsigned int flimit, | |
1636 unsigned int limit, | |
1637 unsigned int thresh, | |
1638 int count | |
1639 ) | |
1640 { | |
1641 int i; | |
1642 uint32_t mask, hev; | |
1643 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; | |
1644 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; | |
1645 | |
1646 mask = 0; | |
1647 hev = 0; | |
1648 i = 0; | |
1649 p1 = 0; | |
1650 p2 = 0; | |
1651 p3 = 0; | |
1652 p4 = 0; | |
1653 | |
    /* the loop filter is designed to work on chars so that we can make maximum
     * use of 8-bit SIMD instructions.
     */
1657 | |
1658 sm1 = s - (p << 2); | |
1659 s0 = s - p - p - p; | |
1660 s1 = s - p - p; | |
1661 s2 = s - p; | |
1662 s3 = s; | |
1663 s4 = s + p; | |
1664 s5 = s + p + p; | |
1665 s6 = s + p + p + p; | |
1666 | |
1667 /* prefetch data for load */ | |
1668 prefetch_load_lf(s + p); | |
1669 | |
    /* apply the filter to 4 pixels at a time */
1671 do | |
1672 { | |
1673 /* load quad-byte vectors | |
1674 * memory is 4 byte aligned | |
1675 */ | |
1676 p1 = *((uint32_t *)(s1)); | |
1677 p2 = *((uint32_t *)(s2)); | |
1678 p3 = *((uint32_t *)(s3)); | |
1679 p4 = *((uint32_t *)(s4)); | |
1680 | |
1681 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
1682 * mask will be zero and filtering is not needed | |
1683 */ | |
1684 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
1685 { | |
1686 | |
1687 pm1 = *((uint32_t *)(sm1)); | |
1688 p0 = *((uint32_t *)(s0)); | |
1689 p5 = *((uint32_t *)(s5)); | |
1690 p6 = *((uint32_t *)(s6)); | |
1691 | |
            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1693 thresh, &hev, &mask); | |
1694 | |
            /* if mask == 0, filtering is not needed */
1696 if (mask) | |
1697 { | |
1698 /* filtering */ | |
1699 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); | |
1700 | |
1701 /* unpack processed 4x4 neighborhood | |
1702 * memory is 4 byte aligned | |
1703 */ | |
1704 *((uint32_t *)s0) = p0; | |
1705 *((uint32_t *)s1) = p1; | |
1706 *((uint32_t *)s2) = p2; | |
1707 *((uint32_t *)s3) = p3; | |
1708 *((uint32_t *)s4) = p4; | |
1709 *((uint32_t *)s5) = p5; | |
1710 } | |
1711 } | |
1712 | |
1713 sm1 += 4; | |
1714 s0 += 4; | |
1715 s1 += 4; | |
1716 s2 += 4; | |
1717 s3 += 4; | |
1718 s4 += 4; | |
1719 s5 += 4; | |
1720 s6 += 4; | |
1721 | |
1722 /* load quad-byte vectors | |
1723 * memory is 4 byte aligned | |
1724 */ | |
1725 p1 = *((uint32_t *)(s1)); | |
1726 p2 = *((uint32_t *)(s2)); | |
1727 p3 = *((uint32_t *)(s3)); | |
1728 p4 = *((uint32_t *)(s4)); | |
1729 | |
1730 /* if (p1 - p4 == 0) and (p2 - p3 == 0) | |
1731 * mask will be zero and filtering is not needed | |
1732 */ | |
1733 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) | |
1734 { | |
1735 | |
1736 pm1 = *((uint32_t *)(sm1)); | |
1737 p0 = *((uint32_t *)(s0)); | |
1738 p5 = *((uint32_t *)(s5)); | |
1739 p6 = *((uint32_t *)(s6)); | |
1740 | |
1741 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5,
p6, | |
1742 thresh, &hev, &mask); | |
1743 | |
1744 /* if mask == 0 do filtering is not needed */ | |
1745 if (mask) | |
1746 { | |
1747 /* filtering */ | |
1748 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); | |
1749 | |
1750 /* unpack processed 4x4 neighborhood | |
1751 * memory is 4 byte aligned | |
1752 */ | |
1753 *((uint32_t *)s0) = p0; | |
1754 *((uint32_t *)s1) = p1; | |
1755 *((uint32_t *)s2) = p2; | |
1756 *((uint32_t *)s3) = p3; | |
1757 *((uint32_t *)s4) = p4; | |
1758 *((uint32_t *)s5) = p5; | |
1759 } | |
1760 } | |
1761 | |
1762 sm1 += 4; | |
1763 s0 += 4; | |
1764 s1 += 4; | |
1765 s2 += 4; | |
1766 s3 += 4; | |
1767 s4 += 4; | |
1768 s5 += 4; | |
1769 s6 += 4; | |
1770 | |
1771 i += 8; | |
1772 } | |
1773 | |
1774 while (i < count); | |
1775 } | |

void vp8_mbloop_filter_uvhorizontal_edge_mips
(
    unsigned char *s,
    int p,
    unsigned int flimit,
    unsigned int limit,
    unsigned int thresh,
    int count
)
{
    uint32_t mask, hev;
    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;

    mask = 0;
    hev = 0;
    p1 = 0;
    p2 = 0;
    p3 = 0;
    p4 = 0;

    /* loop filter designed to work using chars so that we can make maximum use
     * of 8 bit simd instructions.
     */
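    /* chroma edges are only 8 pixels wide, so this variant is unrolled
     * into two 4-pixel passes below and the count argument is unused.
     */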

    sm1 = s - (p << 2);
    s0 = s - p - p - p;
    s1 = s - p - p;
    s2 = s - p;
    s3 = s;
    s4 = s + p;
    s5 = s + p + p;
    s6 = s + p + p + p;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p1 = *((uint32_t *)(s1));
    p2 = *((uint32_t *)(s2));
    p3 = *((uint32_t *)(s3));
    p4 = *((uint32_t *)(s4));

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    {

        pm1 = *((uint32_t *)(sm1));
        p0 = *((uint32_t *)(s0));
        p5 = *((uint32_t *)(s5));
        p6 = *((uint32_t *)(s6));

        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                                 thresh, &hev, &mask);

        /* if mask == 0 filtering is not needed */
        if (mask)
        {
            /* filtering */
            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

            /* unpack processed 4x4 neighborhood
             * memory is 4 byte aligned
             */
            *((uint32_t *)s0) = p0;
            *((uint32_t *)s1) = p1;
            *((uint32_t *)s2) = p2;
            *((uint32_t *)s3) = p3;
            *((uint32_t *)s4) = p4;
            *((uint32_t *)s5) = p5;
        }
    }

    sm1 += 4;
    s0 += 4;
    s1 += 4;
    s2 += 4;
    s3 += 4;
    s4 += 4;
    s5 += 4;
    s6 += 4;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p1 = *((uint32_t *)(s1));
    p2 = *((uint32_t *)(s2));
    p3 = *((uint32_t *)(s3));
    p4 = *((uint32_t *)(s4));

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    {

        pm1 = *((uint32_t *)(sm1));
        p0 = *((uint32_t *)(s0));
        p5 = *((uint32_t *)(s5));
        p6 = *((uint32_t *)(s6));

        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                                 thresh, &hev, &mask);

        /* if mask == 0 filtering is not needed */
        if (mask)
        {
            /* filtering */
            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

            /* unpack processed 4x4 neighborhood
             * memory is 4 byte aligned
             */
            *((uint32_t *)s0) = p0;
            *((uint32_t *)s1) = p1;
            *((uint32_t *)s2) = p2;
            *((uint32_t *)s3) = p3;
            *((uint32_t *)s4) = p4;
            *((uint32_t *)s5) = p5;
        }
    }
}


void vp8_mbloop_filter_vertical_edge_mips
(
    unsigned char *s,
    int p,
    unsigned int flimit,
    unsigned int limit,
    unsigned int thresh,
    int count
)
{

    int i;
    uint32_t mask, hev;
    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    unsigned char *s1, *s2, *s3, *s4;
    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;

    mask = 0;
    hev = 0;
    i = 0;
    pm1 = 0;
    p0 = 0;
    p1 = 0;
    p2 = 0;
    p3 = 0;
    p4 = 0;
    p5 = 0;
    p6 = 0;

    /* loop filter designed to work using chars so that we can make maximum use
     * of 8 bit simd instructions.
     */

    /* apply filter on 4 pixels at the same time */
    do
    {
        s1 = s;
        s2 = s + p;
        s3 = s2 + p;
        s4 = s3 + p;
        s = s4 + p;

        /* load quad-byte vectors
         * memory is 4 byte aligned
         */
        p2 = *((uint32_t *)(s1 - 4));
        p6 = *((uint32_t *)(s1));
        p1 = *((uint32_t *)(s2 - 4));
        p5 = *((uint32_t *)(s2));
        p0 = *((uint32_t *)(s3 - 4));
        p4 = *((uint32_t *)(s3));
        pm1 = *((uint32_t *)(s4 - 4));
        p3 = *((uint32_t *)(s4));

        /* transpose pm1, p0, p1, p2 */
        __asm__ __volatile__ (
            "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
            "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
            "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
            "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

            "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
            "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

            "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
            "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
            "append         %[p1],      %[sec3],    16          \n\t"
            "append         %[pm1],     %[sec4],    16          \n\t"

            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
              [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
            :
        );
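        /* the precrq/precr/append sequence above is a 4x4 byte transpose:
         * four row words (loaded from s1-4 .. s4-4) become four column
         * words, so pm1/p0/p1/p2 now hold the pixel columns left of the
         * vertical edge and the horizontal-edge filter code can be reused.
         */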

        /* transpose p3, p4, p5, p6 */
        __asm__ __volatile__ (
            "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
            "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
            "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
            "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

            "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
            "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

            "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
            "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
            "append         %[p5],      %[sec3],    16          \n\t"
            "append         %[p3],      %[sec4],    16          \n\t"

            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
              [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
            :
        );

        /* if (p1 - p4 == 0) and (p2 - p3 == 0)
         * mask will be zero and filtering is not needed
         */
        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
        {

            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                                     thresh, &hev, &mask);

            /* if mask == 0 filtering is not needed */
            if (mask)
            {
                /* filtering */
                vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

                /* don't use transpose on output data
                 * because memory isn't aligned
                 */
                __asm__ __volatile__ (
                    "sb     %[p5],  2(%[s4])    \n\t"
                    "sb     %[p4],  1(%[s4])    \n\t"
                    "sb     %[p3],  0(%[s4])    \n\t"
                    "sb     %[p2],  -1(%[s4])   \n\t"
                    "sb     %[p1],  -2(%[s4])   \n\t"
                    "sb     %[p0],  -3(%[s4])   \n\t"
                    :
                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
                );

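                /* each register still holds a whole transposed column; the
                 * low byte just stored belongs to row s4.  Shifting right
                 * by 8 moves the next row's byte into the low lane for the
                 * following group of sb stores (s3, then s2, then s1).
                 */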
                __asm__ __volatile__ (
                    "srl    %[p5],  %[p5],  8   \n\t"
                    "srl    %[p4],  %[p4],  8   \n\t"
                    "srl    %[p3],  %[p3],  8   \n\t"
                    "srl    %[p2],  %[p2],  8   \n\t"
                    "srl    %[p1],  %[p1],  8   \n\t"
                    "srl    %[p0],  %[p0],  8   \n\t"
                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                    :
                );

                __asm__ __volatile__ (
                    "sb     %[p5],  2(%[s3])    \n\t"
                    "sb     %[p4],  1(%[s3])    \n\t"
                    "sb     %[p3],  0(%[s3])    \n\t"
                    "sb     %[p2],  -1(%[s3])   \n\t"
                    "sb     %[p1],  -2(%[s3])   \n\t"
                    "sb     %[p0],  -3(%[s3])   \n\t"
                    :
                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
                );

                __asm__ __volatile__ (
                    "srl    %[p5],  %[p5],  8   \n\t"
                    "srl    %[p4],  %[p4],  8   \n\t"
                    "srl    %[p3],  %[p3],  8   \n\t"
                    "srl    %[p2],  %[p2],  8   \n\t"
                    "srl    %[p1],  %[p1],  8   \n\t"
                    "srl    %[p0],  %[p0],  8   \n\t"
                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                    :
                );

                __asm__ __volatile__ (
                    "sb     %[p5],  2(%[s2])    \n\t"
                    "sb     %[p4],  1(%[s2])    \n\t"
                    "sb     %[p3],  0(%[s2])    \n\t"
                    "sb     %[p2],  -1(%[s2])   \n\t"
                    "sb     %[p1],  -2(%[s2])   \n\t"
                    "sb     %[p0],  -3(%[s2])   \n\t"
                    :
                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
                );

                __asm__ __volatile__ (
                    "srl    %[p5],  %[p5],  8   \n\t"
                    "srl    %[p4],  %[p4],  8   \n\t"
                    "srl    %[p3],  %[p3],  8   \n\t"
                    "srl    %[p2],  %[p2],  8   \n\t"
                    "srl    %[p1],  %[p1],  8   \n\t"
                    "srl    %[p0],  %[p0],  8   \n\t"
                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                    :
                );

                __asm__ __volatile__ (
                    "sb     %[p5],  2(%[s1])    \n\t"
                    "sb     %[p4],  1(%[s1])    \n\t"
                    "sb     %[p3],  0(%[s1])    \n\t"
                    "sb     %[p2],  -1(%[s1])   \n\t"
                    "sb     %[p1],  -2(%[s1])   \n\t"
                    "sb     %[p0],  -3(%[s1])   \n\t"
                    :
                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
                );
            }
        }

        i += 4;
    }
    while (i < count);
}

void vp8_mbloop_filter_uvvertical_edge_mips
(
    unsigned char *s,
    int p,
    unsigned int flimit,
    unsigned int limit,
    unsigned int thresh,
    int count
)
{
    uint32_t mask, hev;
    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    unsigned char *s1, *s2, *s3, *s4;
    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;

    mask = 0;
    hev = 0;
    pm1 = 0;
    p0 = 0;
    p1 = 0;
    p2 = 0;
    p3 = 0;
    p4 = 0;
    p5 = 0;
    p6 = 0;

    /* loop filter designed to work using chars so that we can make maximum use
     * of 8 bit simd instructions.
     */

    /* apply filter on 4 pixels at the same time */

    s1 = s;
    s2 = s + p;
    s3 = s2 + p;
    s4 = s3 + p;

    /* prefetch data for load */
    prefetch_load_lf(s + 2 * p);

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p2 = *((uint32_t *)(s1 - 4));
    p6 = *((uint32_t *)(s1));
    p1 = *((uint32_t *)(s2 - 4));
    p5 = *((uint32_t *)(s2));
    p0 = *((uint32_t *)(s3 - 4));
    p4 = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3 = *((uint32_t *)(s4));

    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    {

        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                                 thresh, &hev, &mask);

        /* if mask == 0 filtering is not needed */
        if (mask)
        {
            /* filtering */
            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

            /* don't use transpose on output data
             * because memory isn't aligned
             */
            __asm__ __volatile__ (
                "sb     %[p5],  2(%[s4])    \n\t"
                "sb     %[p4],  1(%[s4])    \n\t"
                "sb     %[p3],  0(%[s4])    \n\t"
                "sb     %[p2],  -1(%[s4])   \n\t"
                "sb     %[p1],  -2(%[s4])   \n\t"
                "sb     %[p0],  -3(%[s4])   \n\t"
                :
                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
            );

            __asm__ __volatile__ (
                "srl    %[p5],  %[p5],  8   \n\t"
                "srl    %[p4],  %[p4],  8   \n\t"
                "srl    %[p3],  %[p3],  8   \n\t"
                "srl    %[p2],  %[p2],  8   \n\t"
                "srl    %[p1],  %[p1],  8   \n\t"
                "srl    %[p0],  %[p0],  8   \n\t"
                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                :
            );

            __asm__ __volatile__ (
                "sb     %[p5],  2(%[s3])    \n\t"
                "sb     %[p4],  1(%[s3])    \n\t"
                "sb     %[p3],  0(%[s3])    \n\t"
                "sb     %[p2],  -1(%[s3])   \n\t"
                "sb     %[p1],  -2(%[s3])   \n\t"
                "sb     %[p0],  -3(%[s3])   \n\t"
                :
                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
            );

            __asm__ __volatile__ (
                "srl    %[p5],  %[p5],  8   \n\t"
                "srl    %[p4],  %[p4],  8   \n\t"
                "srl    %[p3],  %[p3],  8   \n\t"
                "srl    %[p2],  %[p2],  8   \n\t"
                "srl    %[p1],  %[p1],  8   \n\t"
                "srl    %[p0],  %[p0],  8   \n\t"
                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                :
            );

            __asm__ __volatile__ (
                "sb     %[p5],  2(%[s2])    \n\t"
                "sb     %[p4],  1(%[s2])    \n\t"
                "sb     %[p3],  0(%[s2])    \n\t"
                "sb     %[p2],  -1(%[s2])   \n\t"
                "sb     %[p1],  -2(%[s2])   \n\t"
                "sb     %[p0],  -3(%[s2])   \n\t"
                :
                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
            );

            __asm__ __volatile__ (
                "srl    %[p5],  %[p5],  8   \n\t"
                "srl    %[p4],  %[p4],  8   \n\t"
                "srl    %[p3],  %[p3],  8   \n\t"
                "srl    %[p2],  %[p2],  8   \n\t"
                "srl    %[p1],  %[p1],  8   \n\t"
                "srl    %[p0],  %[p0],  8   \n\t"
                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                :
            );

            __asm__ __volatile__ (
                "sb     %[p5],  2(%[s1])    \n\t"
                "sb     %[p4],  1(%[s1])    \n\t"
                "sb     %[p3],  0(%[s1])    \n\t"
                "sb     %[p2],  -1(%[s1])   \n\t"
                "sb     %[p1],  -2(%[s1])   \n\t"
                "sb     %[p0],  -3(%[s1])   \n\t"
                :
                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
            );
        }
    }

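    /* second group of four rows: advance the row pointers past the rows
     * just processed and repeat the load/transpose/filter/store sequence.
     */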
    s1 = s4 + p;
    s2 = s1 + p;
    s3 = s2 + p;
    s4 = s3 + p;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p2 = *((uint32_t *)(s1 - 4));
    p6 = *((uint32_t *)(s1));
    p1 = *((uint32_t *)(s2 - 4));
    p5 = *((uint32_t *)(s2));
    p0 = *((uint32_t *)(s3 - 4));
    p4 = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3 = *((uint32_t *)(s4));

    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    {

        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                                 thresh, &hev, &mask);

        /* if mask == 0 filtering is not needed */
        if (mask)
        {
            /* filtering */
            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

            /* don't use transpose on output data
             * because memory isn't aligned
             */
            __asm__ __volatile__ (
                "sb     %[p5],  2(%[s4])    \n\t"
                "sb     %[p4],  1(%[s4])    \n\t"
                "sb     %[p3],  0(%[s4])    \n\t"
                "sb     %[p2],  -1(%[s4])   \n\t"
                "sb     %[p1],  -2(%[s4])   \n\t"
                "sb     %[p0],  -3(%[s4])   \n\t"
                :
                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
            );

            __asm__ __volatile__ (
                "srl    %[p5],  %[p5],  8   \n\t"
                "srl    %[p4],  %[p4],  8   \n\t"
                "srl    %[p3],  %[p3],  8   \n\t"
                "srl    %[p2],  %[p2],  8   \n\t"
                "srl    %[p1],  %[p1],  8   \n\t"
                "srl    %[p0],  %[p0],  8   \n\t"
                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                :
            );

            __asm__ __volatile__ (
                "sb     %[p5],  2(%[s3])    \n\t"
                "sb     %[p4],  1(%[s3])    \n\t"
                "sb     %[p3],  0(%[s3])    \n\t"
                "sb     %[p2],  -1(%[s3])   \n\t"
                "sb     %[p1],  -2(%[s3])   \n\t"
                "sb     %[p0],  -3(%[s3])   \n\t"
                :
                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
            );

            __asm__ __volatile__ (
                "srl    %[p5],  %[p5],  8   \n\t"
                "srl    %[p4],  %[p4],  8   \n\t"
                "srl    %[p3],  %[p3],  8   \n\t"
                "srl    %[p2],  %[p2],  8   \n\t"
                "srl    %[p1],  %[p1],  8   \n\t"
                "srl    %[p0],  %[p0],  8   \n\t"
                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                :
            );

            __asm__ __volatile__ (
                "sb     %[p5],  2(%[s2])    \n\t"
                "sb     %[p4],  1(%[s2])    \n\t"
                "sb     %[p3],  0(%[s2])    \n\t"
                "sb     %[p2],  -1(%[s2])   \n\t"
                "sb     %[p1],  -2(%[s2])   \n\t"
                "sb     %[p0],  -3(%[s2])   \n\t"
                :
                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
            );

            __asm__ __volatile__ (
                "srl    %[p5],  %[p5],  8   \n\t"
                "srl    %[p4],  %[p4],  8   \n\t"
                "srl    %[p3],  %[p3],  8   \n\t"
                "srl    %[p2],  %[p2],  8   \n\t"
                "srl    %[p1],  %[p1],  8   \n\t"
                "srl    %[p0],  %[p0],  8   \n\t"
                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                :
            );

            __asm__ __volatile__ (
                "sb     %[p5],  2(%[s1])    \n\t"
                "sb     %[p4],  1(%[s1])    \n\t"
                "sb     %[p3],  0(%[s1])    \n\t"
                "sb     %[p2],  -1(%[s1])   \n\t"
                "sb     %[p1],  -2(%[s1])   \n\t"
                "sb     %[p0],  -3(%[s1])   \n\t"
                :
                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
            );
        }
    }
}

/* Horizontal MB filtering */
void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
{
    unsigned int thresh_vec, flimit_vec, limit_vec;
    unsigned char thresh, flimit, limit, flimit_temp;

    /* use direct values instead of pointers */
    limit = *(lfi->lim);
    flimit_temp = *(lfi->mblim);
    thresh = *(lfi->hev_thr);
    flimit = flimit_temp;

    /* create quad-byte */
    __asm__ __volatile__ (
        "replv.qb   %[thresh_vec],  %[thresh]   \n\t"
        "replv.qb   %[flimit_vec],  %[flimit]   \n\t"
        "replv.qb   %[limit_vec],   %[limit]    \n\t"
        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
    );
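    /* replv.qb broadcasts one byte into all four lanes of a word, e.g.
     * thresh = 0x28 gives thresh_vec = 0x28282828, so the scalar
     * thresholds can be compared against four pixels at once.
     */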

    vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);

    if (u_ptr)
    {
        vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
    }

    if (v_ptr)
    {
        vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
    }
}


/* Vertical MB Filtering */
void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
{
    unsigned int thresh_vec, flimit_vec, limit_vec;
    unsigned char thresh, flimit, limit, flimit_temp;

    /* use direct values instead of pointers */
    limit = *(lfi->lim);
    flimit_temp = *(lfi->mblim);
    thresh = *(lfi->hev_thr);
    flimit = flimit_temp;

    /* create quad-byte */
    __asm__ __volatile__ (
        "replv.qb   %[thresh_vec],  %[thresh]   \n\t"
        "replv.qb   %[flimit_vec],  %[flimit]   \n\t"
        "replv.qb   %[limit_vec],   %[limit]    \n\t"
        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
    );

    vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);

    if (u_ptr)
        vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);

    if (v_ptr)
        vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
}


/* Horizontal B Filtering */
void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
{
    unsigned int thresh_vec, flimit_vec, limit_vec;
    unsigned char thresh, flimit, limit, flimit_temp;

    /* use direct values instead of pointers */
    limit = *(lfi->lim);
    flimit_temp = *(lfi->blim);
    thresh = *(lfi->hev_thr);
    flimit = flimit_temp;

    /* create quad-byte */
    __asm__ __volatile__ (
        "replv.qb   %[thresh_vec],  %[thresh]   \n\t"
        "replv.qb   %[flimit_vec],  %[flimit]   \n\t"
        "replv.qb   %[limit_vec],   %[limit]    \n\t"
        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
    );

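    /* B (sub-block) filtering processes the interior edges of the
     * macroblock: rows 4, 8 and 12 of the 16x16 luma plane and row 4 of
     * the 8x8 chroma planes.
     */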
    vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
    vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
    vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);

    if (u_ptr)
        vp8_loop_filter_uvhorizontal_edge_mips(u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);

    if (v_ptr)
        vp8_loop_filter_uvhorizontal_edge_mips(v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
}


/* Vertical B Filtering */
void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
{
    unsigned int thresh_vec, flimit_vec, limit_vec;
    unsigned char thresh, flimit, limit, flimit_temp;

    /* use direct values instead of pointers */
    limit = *(lfi->lim);
    flimit_temp = *(lfi->blim);
    thresh = *(lfi->hev_thr);
    flimit = flimit_temp;

    /* create quad-byte */
    __asm__ __volatile__ (
        "replv.qb   %[thresh_vec],  %[thresh]   \n\t"
        "replv.qb   %[flimit_vec],  %[flimit]   \n\t"
        "replv.qb   %[limit_vec],   %[limit]    \n\t"
        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
    );

    vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
    vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
    vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec, limit_vec, thresh_vec, 16);

    if (u_ptr)
        vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);

    if (v_ptr)
        vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
}

#endif