/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 | |
11 #include <assert.h> | |
12 #include <stdio.h> | |
13 | |
14 #include "./vpx_dsp_rtcd.h" | |
15 #include "vpx_dsp/mips/vpx_common_dspr2.h" | |
16 #include "vpx_dsp/vpx_convolve.h" | |
17 #include "vpx_dsp/vpx_dsp_common.h" | |
18 #include "vpx_ports/mem.h" | |
19 | |
20 #if HAVE_DSPR2 | |
21 static void convolve_avg_horiz_4_dspr2(const uint8_t *src, | |
22 int32_t src_stride, | |
23 uint8_t *dst, | |
24 int32_t dst_stride, | |
25 const int16_t *filter_x0, | |
26 int32_t h) { | |
27 int32_t y; | |
28 uint8_t *cm = vpx_ff_cropTbl; | |
29 int32_t vector1b, vector2b, vector3b, vector4b; | |
30 int32_t Temp1, Temp2, Temp3, Temp4; | |
31 uint32_t vector4a = 64; | |
32 uint32_t tp1, tp2; | |
33 uint32_t p1, p2, p3, p4; | |
34 uint32_t n1, n2, n3, n4; | |
35 uint32_t tn1, tn2; | |
36 | |
37 vector1b = ((const int32_t *)filter_x0)[0]; | |
38 vector2b = ((const int32_t *)filter_x0)[1]; | |
39 vector3b = ((const int32_t *)filter_x0)[2]; | |
40 vector4b = ((const int32_t *)filter_x0)[3]; | |
41 | |
42 for (y = h; y--;) { | |
43 /* prefetch data to cache memory */ | |
44 prefetch_load(src + src_stride); | |
45 prefetch_load(src + src_stride + 32); | |
46 prefetch_store(dst + dst_stride); | |
47 | |
48 __asm__ __volatile__ ( | |
49 "ulw %[tp1], 0(%[src]) \n\t" | |
50 "ulw %[tp2], 4(%[src]) \n\t" | |
51 | |
52 /* even 1. pixel */ | |
53 "mtlo %[vector4a], $ac3 \n\t" | |
54 "mthi $zero, $ac3 \n\t" | |
55 "preceu.ph.qbr %[p1], %[tp1] \n\t" | |
56 "preceu.ph.qbl %[p2], %[tp1] \n\t" | |
57 "preceu.ph.qbr %[p3], %[tp2] \n\t" | |
58 "preceu.ph.qbl %[p4], %[tp2] \n\t" | |
59 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" | |
60 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" | |
61 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" | |
62 "ulw %[tn2], 8(%[src]) \n\t" | |
63 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" | |
64 "extp %[Temp1], $ac3, 31 \n\t" | |
65 | |
66 /* even 2. pixel */ | |
67 "mtlo %[vector4a], $ac2 \n\t" | |
68 "mthi $zero, $ac2 \n\t" | |
69 "preceu.ph.qbr %[p1], %[tn2] \n\t" | |
70 "balign %[tn1], %[tn2], 3 \n\t" | |
71 "balign %[tn2], %[tp2], 3 \n\t" | |
72 "balign %[tp2], %[tp1], 3 \n\t" | |
73 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" | |
74 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" | |
75 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" | |
76 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" | |
77 "extp %[Temp3], $ac2, 31 \n\t" | |
78 | |
79 "lbu %[p2], 3(%[dst]) \n\t"
/* load odd 2 */ | |
80 | |
81 /* odd 1. pixel */ | |
82 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
/* even 1 */ | |
83 "mtlo %[vector4a], $ac3 \n\t" | |
84 "mthi $zero, $ac3 \n\t" | |
85 "lbu %[Temp1], 1(%[dst]) \n\t"
/* load odd 1 */ | |
86 "preceu.ph.qbr %[n1], %[tp2] \n\t" | |
87 "preceu.ph.qbl %[n2], %[tp2] \n\t" | |
88 "preceu.ph.qbr %[n3], %[tn2] \n\t" | |
89 "preceu.ph.qbl %[n4], %[tn2] \n\t" | |
90 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" | |
91 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" | |
92 "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" | |
93 "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" | |
94 "extp %[Temp2], $ac3, 31 \n\t" | |
95 | |
96 "lbu %[tn2], 0(%[dst]) \n\t"
/* load even 1 */ | |
97 | |
98 /* odd 2. pixel */ | |
99 "lbux %[tp2], %[Temp3](%[cm]) \n\t"
/* even 2 */ | |
100 "mtlo %[vector4a], $ac2 \n\t" | |
101 "mthi $zero, $ac2 \n\t" | |
102 "preceu.ph.qbr %[n1], %[tn1] \n\t" | |
103 "lbux %[tn1], %[Temp2](%[cm]) \n\t"
/* odd 1 */ | |
104 "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t"
/* average even 1 */ | |
105 "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" | |
106 "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" | |
107 "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" | |
108 "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" | |
109 "extp %[Temp4], $ac2, 31 \n\t" | |
110 | |
111 "lbu %[tp1], 2(%[dst]) \n\t"
/* load even 2 */ | |
112 "sb %[tn2], 0(%[dst]) \n\t"
/* store even 1 */ | |
113 | |
114 /* clamp */ | |
115 "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t"
/* average odd 1 */ | |
116 "lbux %[n2], %[Temp4](%[cm]) \n\t"
/* odd 2 */ | |
117 "sb %[Temp1], 1(%[dst]) \n\t"
/* store odd 1 */ | |
118 | |
119 "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t"
/* average even 2 */ | |
120 "sb %[tp1], 2(%[dst]) \n\t"
/* store even 2 */ | |
121 | |
122 "addqh_r.w %[p2], %[p2], %[n2] \n\t"
/* average odd 2 */ | |
123 "sb %[p2], 3(%[dst]) \n\t"
/* store odd 2 */ | |
124 | |
125 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), | |
126 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), | |
127 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
128 [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), | |
129 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), | |
130 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) | |
131 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), | |
132 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), | |
133 [vector4a] "r" (vector4a), | |
134 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) | |
135 ); | |
136 | |
137 /* Next row... */ | |
138 src += src_stride; | |
139 dst += dst_stride; | |
140 } | |
141 } | |
142 | |
/* Horizontal 8-tap convolve-and-average of an 8-pixel-wide strip
 * (MIPS DSPr2 inline assembly).
 *
 * Same contract as convolve_avg_horiz_4_dspr2 but produces eight output
 * pixels per row.  The four even-phase pixels are computed from the
 * aligned word loads; the four odd-phase pixels reuse those words after
 * balign shifts them by one byte.  Each result is clamped via the
 * vpx_ff_cropTbl lookup (lbux), rounding-averaged (addqh_r.w) with the
 * existing dst byte, and stored.  Three accumulators ($ac1..$ac3) are
 * interleaved so dot products, table lookups, and dst loads/stores
 * overlap.
 *
 * filter_x0 holds eight int16 taps, read as four packed pairs; the
 * constant 64 is seeded into each accumulator before the dot products.
 */
static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0,
                                       int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;  /* 0..255 clamp table, indexed via lbux */
  uint32_t vector4a = 64;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t tn1, tn2, tn3;
  uint32_t st0, st1;

  /* Load the eight 16-bit taps as four packed tap pairs. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw %[tp1], 0(%[src]) \n\t"
        "ulw %[tp2], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "preceu.ph.qbr %[p3], %[tp2] \n\t"
        "preceu.ph.qbl %[p4], %[tp2] \n\t"
        "ulw %[tn2], 8(%[src]) \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"
        "lbu %[Temp2], 0(%[dst]) \n\t"
        "lbu %[tn3], 2(%[dst]) \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr %[p1], %[tn2] \n\t"
        "preceu.ph.qbl %[n1], %[tn2] \n\t"
        "ulw %[tn1], 12(%[src]) \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* even 3. pixel */
        "lbux %[st0], %[Temp1](%[cm]) \n\t"
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "preceu.ph.qbr %[p2], %[tn1] \n\t"
        "lbux %[st1], %[Temp3](%[cm]) \n\t"
        "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
        "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
        "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
        "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
        "extp %[Temp1], $ac1, 31 \n\t"

        /* average and store even 1 / even 2 */
        "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
        "addqh_r.w %[tn3], %[tn3], %[st1] \n\t"
        "sb %[Temp2], 0(%[dst]) \n\t"
        "sb %[tn3], 2(%[dst]) \n\t"

        /* even 4. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"

        /* shift the loaded words one byte for the odd-pixel phase */
        "balign %[tn3], %[tn1], 3 \n\t"
        "balign %[tn1], %[tn2], 3 \n\t"
        "balign %[tn2], %[tp2], 3 \n\t"
        "balign %[tp2], %[tp1], 3 \n\t"

        "lbux %[st0], %[Temp1](%[cm]) \n\t"
        "lbu %[Temp2], 4(%[dst]) \n\t"
        "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"

        "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* odd 1. pixel */
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "sb %[Temp2], 4(%[dst]) \n\t"
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "preceu.ph.qbl %[p2], %[tp2] \n\t"
        "preceu.ph.qbr %[p3], %[tn2] \n\t"
        "preceu.ph.qbl %[p4], %[tn2] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        "lbu %[tp1], 6(%[dst]) \n\t"

        /* odd 2. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tn1] \n\t"
        "preceu.ph.qbl %[n1], %[tn1] \n\t"
        "lbux %[st0], %[Temp3](%[cm]) \n\t"
        "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac1, 31 \n\t"

        "lbu %[tp2], 1(%[dst]) \n\t"
        "lbu %[tn2], 3(%[dst]) \n\t"
        "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"

        /* odd 3. pixel */
        "lbux %[st1], %[Temp2](%[cm]) \n\t"
        "preceu.ph.qbr %[p2], %[tn3] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
        "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        "lbu %[tn3], 5(%[dst]) \n\t"

        /* odd 4. pixel */
        "sb %[tp2], 1(%[dst]) \n\t"
        "sb %[tp1], 6(%[dst]) \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
        "extp %[Temp1], $ac2, 31 \n\t"

        "lbu %[tn1], 7(%[dst]) \n\t"

        /* clamp */
        "lbux %[p4], %[Temp3](%[cm]) \n\t"
        "addqh_r.w %[tn2], %[tn2], %[p4] \n\t"

        "lbux %[p2], %[Temp2](%[cm]) \n\t"
        "addqh_r.w %[tn3], %[tn3], %[p2] \n\t"

        "lbux %[n1], %[Temp1](%[cm]) \n\t"
        "addqh_r.w %[tn1], %[tn1], %[n1] \n\t"

        /* store bytes */
        "sb %[tn2], 3(%[dst]) \n\t"
        "sb %[tn3], 5(%[dst]) \n\t"
        "sb %[tn1], 7(%[dst]) \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
329 | |
/* Horizontal 8-tap convolve-and-average of a (16 * count)-pixel-wide
 * strip (MIPS DSPr2 inline assembly).
 *
 * Each inner-loop iteration produces 16 output pixels.  The eight
 * even-phase pixels are computed from word loads at src offsets
 * 0/4/8/12/16/20; the eight odd-phase pixels repeat the pass from
 * offsets 1/5/9/13/17/21 (src + 1).  Each convolution result is clamped
 * through the vpx_ff_cropTbl lookup (lbux), rounding-averaged
 * (addqh_r.w) with the byte already in dst, and stored.  The three
 * accumulators $ac1..$ac3 rotate so that while one result is being
 * extracted/clamped/averaged the next two dot products are in flight.
 *
 * count is the number of 16-pixel chunks per row (e.g. 1 for width 16,
 * 2 for width 32).  filter_x0 holds eight int16 taps read as four
 * packed pairs; 64 (vector_64) is seeded into each accumulator before
 * its dot products.
 */
static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        uint8_t *dst_ptr,
                                        int32_t dst_stride,
                                        const int16_t *filter_x0,
                                        int32_t h,
                                        int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;  /* 0..255 clamp table, indexed via lbux */
  uint32_t vector_64 = 64;
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  /* Load the eight 16-bit taps as four packed tap pairs. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__ (
          "ulw %[qload1], 0(%[src]) \n\t"
          "ulw %[qload2], 4(%[src]) \n\t"

          /* even 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t"  /* even 1 */
          "mthi $zero, $ac1 \n\t"
          "mtlo %[vector_64], $ac2 \n\t"  /* even 2 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "ulw %[qload3], 8(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p1], %[filter12] \n\t"  /* even 1 */
          "dpa.w.ph $ac1, %[p2], %[filter34] \n\t"  /* even 1 */
          "dpa.w.ph $ac1, %[p3], %[filter56] \n\t"  /* even 1 */
          "dpa.w.ph $ac1, %[p4], %[filter78] \n\t"  /* even 1 */
          "extp %[Temp1], $ac1, 31 \n\t"  /* even 1 */
          "lbu %[st2], 0(%[dst]) \n\t"  /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo %[vector_64], $ac3 \n\t"  /* even 3 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "ulw %[qload1], 12(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p2], %[filter12] \n\t"  /* even 1 */
          "dpa.w.ph $ac2, %[p3], %[filter34] \n\t"  /* even 1 */
          "dpa.w.ph $ac2, %[p4], %[filter56] \n\t"  /* even 1 */
          "dpa.w.ph $ac2, %[p1], %[filter78] \n\t"  /* even 1 */
          "extp %[Temp2], $ac2, 31 \n\t"  /* even 1 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t"  /* even 1 */

          "lbu %[qload3], 2(%[dst]) \n\t"  /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo %[vector_64], $ac1 \n\t"  /* even 4 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[st2], %[st2], %[st1] \n\t"  /* average even 1 */
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st2], 0(%[dst]) \n\t"  /* store even 1 to dst */
          "dpa.w.ph $ac3, %[p3], %[filter12] \n\t"  /* even 3 */
          "dpa.w.ph $ac3, %[p4], %[filter34] \n\t"  /* even 3 */
          "dpa.w.ph $ac3, %[p1], %[filter56] \n\t"  /* even 3 */
          "dpa.w.ph $ac3, %[p5], %[filter78] \n\t"  /* even 3 */
          "extp %[Temp3], $ac3, 31 \n\t"  /* even 3 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t"  /* even 1 */

          /* even 4. pixel */
          "mtlo %[vector_64], $ac2 \n\t"  /* even 5 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st2] \n\t"  /* average even 2 */
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[qload3], 2(%[dst]) \n\t"  /* store even 2 to dst */
          "ulw %[qload2], 16(%[src]) \n\t"
          "lbu %[qload3], 4(%[dst]) \n\t"  /* load even 3 from dst */
          "lbu %[qload1], 6(%[dst]) \n\t"  /* load even 4 from dst */
          "dpa.w.ph $ac1, %[p4], %[filter12] \n\t"  /* even 4 */
          "dpa.w.ph $ac1, %[p1], %[filter34] \n\t"  /* even 4 */
          "dpa.w.ph $ac1, %[p5], %[filter56] \n\t"  /* even 4 */
          "dpa.w.ph $ac1, %[p2], %[filter78] \n\t"  /* even 4 */
          "extp %[Temp1], $ac1, 31 \n\t"  /* even 4 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t"  /* even 3 */

          /* even 5. pixel */
          "mtlo %[vector_64], $ac3 \n\t"  /* even 6 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t"  /* average even 3 */
          "preceu.ph.qbr %[p4], %[qload2] \n\t"
          "sb %[qload3], 4(%[dst]) \n\t"  /* store even 3 to dst */
          "dpa.w.ph $ac2, %[p1], %[filter12] \n\t"  /* even 5 */
          "dpa.w.ph $ac2, %[p5], %[filter34] \n\t"  /* even 5 */
          "dpa.w.ph $ac2, %[p2], %[filter56] \n\t"  /* even 5 */
          "dpa.w.ph $ac2, %[p3], %[filter78] \n\t"  /* even 5 */
          "extp %[Temp2], $ac2, 31 \n\t"  /* even 5 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t"  /* even 4 */

          /* even 6. pixel */
          "mtlo %[vector_64], $ac1 \n\t"  /* even 7 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t"  /* average even 4 */
          "preceu.ph.qbl %[p1], %[qload2] \n\t"
          "sb %[qload1], 6(%[dst]) \n\t"  /* store even 4 to dst */
          "ulw %[qload3], 20(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p5], %[filter12] \n\t"  /* even 6 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n\t"  /* even 6 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n\t"  /* even 6 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n\t"  /* even 6 */
          "lbu %[qload2], 8(%[dst]) \n\t"  /* load even 5 from dst */
          "extp %[Temp3], $ac3, 31 \n\t"  /* even 6 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t"  /* even 5 */

          /* even 7. pixel */
          "mtlo %[vector_64], $ac2 \n\t"  /* even 8 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t"  /* average even 5 */
          "preceu.ph.qbr %[p5], %[qload3] \n\t"
          "sb %[qload2], 8(%[dst]) \n\t"  /* store even 5 to dst */
          "dpa.w.ph $ac1, %[p2], %[filter12] \n\t"  /* even 7 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n\t"  /* even 7 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n\t"  /* even 7 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n\t"  /* even 7 */
          "lbu %[qload3], 10(%[dst]) \n\t"  /* load even 6 from dst */
          "extp %[Temp1], $ac1, 31 \n\t"  /* even 7 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t"  /* even 6 */

          "lbu %[st2], 12(%[dst]) \n\t"  /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo %[vector_64], $ac3 \n\t"  /* odd 1 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t"  /* average even 6 */
          "dpa.w.ph $ac2, %[p3], %[filter12] \n\t"  /* even 8 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n\t"  /* even 8 */
          "sb %[qload3], 10(%[dst]) \n\t"  /* store even 6 to dst */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n\t"  /* even 8 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n\t"  /* even 8 */
          "extp %[Temp2], $ac2, 31 \n\t"  /* even 8 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t"  /* even 7 */

          /* ODD pixels: repeat the pass from src + 1 */
          "ulw %[qload1], 1(%[src]) \n\t"
          "ulw %[qload2], 5(%[src]) \n\t"

          "addqh_r.w %[st2], %[st2], %[st1] \n\t"  /* average even 7 */

          /* odd 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t"  /* odd 2 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "sb %[st2], 12(%[dst]) \n\t"  /* store even 7 to dst */
          "ulw %[qload3], 9(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p1], %[filter12] \n\t"  /* odd 1 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n\t"  /* odd 1 */
          "lbu %[qload2], 14(%[dst]) \n\t"  /* load even 8 from dst */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n\t"  /* odd 1 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n\t"  /* odd 1 */
          "extp %[Temp3], $ac3, 31 \n\t"  /* odd 1 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t"  /* even 8 */

          "lbu %[st1], 1(%[dst]) \n\t"  /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo %[vector_64], $ac2 \n\t"  /* odd 3 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t"  /* average even 8 */
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "sb %[qload2], 14(%[dst]) \n\t"  /* store even 8 to dst */
          "ulw %[qload1], 13(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p2], %[filter12] \n\t"  /* odd 2 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n\t"  /* odd 2 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n\t"  /* odd 2 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n\t"  /* odd 2 */
          "lbu %[qload3], 3(%[dst]) \n\t"  /* load odd 2 from dst */
          "extp %[Temp1], $ac1, 31 \n\t"  /* odd 2 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t"  /* odd 1 */

          /* odd 3. pixel */
          "mtlo %[vector_64], $ac3 \n\t"  /* odd 4 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[st3], %[st3], %[st1] \n\t"  /* average odd 1 */
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "dpa.w.ph $ac2, %[p3], %[filter12] \n\t"  /* odd 3 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n\t"  /* odd 3 */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n\t"  /* odd 3 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n\t"  /* odd 3 */
          "sb %[st3], 1(%[dst]) \n\t"  /* store odd 1 to dst */
          "extp %[Temp2], $ac2, 31 \n\t"  /* odd 3 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t"  /* odd 2 */

          /* odd 4. pixel */
          "mtlo %[vector_64], $ac1 \n\t"  /* odd 5 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st1] \n\t"  /* average odd 2 */
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[qload3], 3(%[dst]) \n\t"  /* store odd 2 to dst */
          "lbu %[qload1], 5(%[dst]) \n\t"  /* load odd 3 from dst */
          "ulw %[qload2], 17(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p4], %[filter12] \n\t"  /* odd 4 */
          "dpa.w.ph $ac3, %[p1], %[filter34] \n\t"  /* odd 4 */
          "dpa.w.ph $ac3, %[p5], %[filter56] \n\t"  /* odd 4 */
          "dpa.w.ph $ac3, %[p2], %[filter78] \n\t"  /* odd 4 */
          "extp %[Temp3], $ac3, 31 \n\t"  /* odd 4 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t"  /* odd 3 */

          "lbu %[st1], 7(%[dst]) \n\t"  /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo %[vector_64], $ac2 \n\t"  /* odd 6 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st2] \n\t"  /* average odd 3 */
          "preceu.ph.qbr %[p4], %[qload2] \n\t"
          "sb %[qload1], 5(%[dst]) \n\t"  /* store odd 3 to dst */
          "dpa.w.ph $ac1, %[p1], %[filter12] \n\t"  /* odd 5 */
          "dpa.w.ph $ac1, %[p5], %[filter34] \n\t"  /* odd 5 */
          "dpa.w.ph $ac1, %[p2], %[filter56] \n\t"  /* odd 5 */
          "dpa.w.ph $ac1, %[p3], %[filter78] \n\t"  /* odd 5 */
          "extp %[Temp1], $ac1, 31 \n\t"  /* odd 5 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t"  /* odd 4 */

          "lbu %[qload1], 9(%[dst]) \n\t"  /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo %[vector_64], $ac3 \n\t"  /* odd 7 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[st1], %[st1], %[st3] \n\t"  /* average odd 4 */
          "preceu.ph.qbl %[p1], %[qload2] \n\t"
          "sb %[st1], 7(%[dst]) \n\t"  /* store odd 4 to dst */
          "ulw %[qload3], 21(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p5], %[filter12] \n\t"  /* odd 6 */
          "dpa.w.ph $ac2, %[p2], %[filter34] \n\t"  /* odd 6 */
          "dpa.w.ph $ac2, %[p3], %[filter56] \n\t"  /* odd 6 */
          "dpa.w.ph $ac2, %[p4], %[filter78] \n\t"  /* odd 6 */
          "extp %[Temp2], $ac2, 31 \n\t"  /* odd 6 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t"  /* odd 5 */

          /* odd 7. pixel */
          "mtlo %[vector_64], $ac1 \n\t"  /* odd 8 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t"  /* average odd 5 */
          "preceu.ph.qbr %[p5], %[qload3] \n\t"
          "sb %[qload1], 9(%[dst]) \n\t"  /* store odd 5 to dst */
          "lbu %[qload2], 11(%[dst]) \n\t"  /* load odd 6 from dst */
          "dpa.w.ph $ac3, %[p2], %[filter12] \n\t"  /* odd 7 */
          "dpa.w.ph $ac3, %[p3], %[filter34] \n\t"  /* odd 7 */
          "dpa.w.ph $ac3, %[p4], %[filter56] \n\t"  /* odd 7 */
          "dpa.w.ph $ac3, %[p1], %[filter78] \n\t"  /* odd 7 */
          "extp %[Temp3], $ac3, 31 \n\t"  /* odd 7 */

          "lbu %[qload3], 13(%[dst]) \n\t"  /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph $ac1, %[p3], %[filter12] \n\t"  /* odd 8 */
          "dpa.w.ph $ac1, %[p4], %[filter34] \n\t"  /* odd 8 */
          "dpa.w.ph $ac1, %[p1], %[filter56] \n\t"  /* odd 8 */
          "dpa.w.ph $ac1, %[p5], %[filter78] \n\t"  /* odd 8 */
          "extp %[Temp1], $ac1, 31 \n\t"  /* odd 8 */

          "lbu %[qload1], 15(%[dst]) \n\t"  /* load odd 8 from dst */

          "lbux %[st2], %[Temp2](%[cm]) \n\t"  /* odd 6 */
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t"  /* average odd 6 */

          "lbux %[st3], %[Temp3](%[cm]) \n\t"  /* odd 7 */
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t"  /* average odd 7 */

          "lbux %[st1], %[Temp1](%[cm]) \n\t"  /* odd 8 */
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t"  /* average odd 8 */

          /* store bytes */
          "sb %[qload2], 11(%[dst]) \n\t"  /* store odd 6 to dst */
          "sb %[qload3], 13(%[dst]) \n\t"  /* store odd 7 to dst */
          "sb %[qload1], 15(%[dst]) \n\t"  /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      /* Advance to the next 16-pixel chunk of this row. */
      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
641 | |
642 static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, | |
643 int32_t src_stride, | |
644 uint8_t *dst_ptr, | |
645 int32_t dst_stride, | |
646 const int16_t *filter_x0, | |
647 int32_t h) { | |
648 int32_t y, c; | |
649 const uint8_t *src; | |
650 uint8_t *dst; | |
651 uint8_t *cm = vpx_ff_cropTbl; | |
652 uint32_t vector_64 = 64; | |
653 int32_t filter12, filter34, filter56, filter78; | |
654 int32_t Temp1, Temp2, Temp3; | |
655 uint32_t qload1, qload2, qload3; | |
656 uint32_t p1, p2, p3, p4, p5; | |
657 uint32_t st1, st2, st3; | |
658 | |
659 filter12 = ((const int32_t *)filter_x0)[0]; | |
660 filter34 = ((const int32_t *)filter_x0)[1]; | |
661 filter56 = ((const int32_t *)filter_x0)[2]; | |
662 filter78 = ((const int32_t *)filter_x0)[3]; | |
663 | |
664 for (y = h; y--;) { | |
665 src = src_ptr; | |
666 dst = dst_ptr; | |
667 | |
668 /* prefetch data to cache memory */ | |
669 prefetch_load(src_ptr + src_stride); | |
670 prefetch_load(src_ptr + src_stride + 32); | |
671 prefetch_load(src_ptr + src_stride + 64); | |
672 prefetch_store(dst_ptr + dst_stride); | |
673 prefetch_store(dst_ptr + dst_stride + 32); | |
674 | |
675 for (c = 0; c < 4; c++) { | |
676 __asm__ __volatile__ ( | |
677 "ulw %[qload1], 0(%[src]) \n\t" | |
678 "ulw %[qload2], 4(%[src]) \n\t" | |
679 | |
680 /* even 1. pixel */ | |
681 "mtlo %[vector_64], $ac1 \n\t" /*
even 1 */ | |
682 "mthi $zero, $ac1 \n\t" | |
683 "mtlo %[vector_64], $ac2 \n\t" /*
even 2 */ | |
684 "mthi $zero, $ac2 \n\t" | |
685 "preceu.ph.qbr %[p1], %[qload1] \n\t" | |
686 "preceu.ph.qbl %[p2], %[qload1] \n\t" | |
687 "preceu.ph.qbr %[p3], %[qload2] \n\t" | |
688 "preceu.ph.qbl %[p4], %[qload2] \n\t" | |
689 "ulw %[qload3], 8(%[src]) \n\t" | |
690 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /*
even 1 */ | |
691 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /*
even 1 */ | |
692 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /*
even 1 */ | |
693 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /*
even 1 */ | |
694 "extp %[Temp1], $ac1, 31 \n\t" /*
even 1 */ | |
695 "lbu %[st2], 0(%[dst]) \n\t" /*
load even 1 from dst */ | |
696 | |
697 /* even 2. pixel */ | |
698 "mtlo %[vector_64], $ac3 \n\t" /*
even 3 */ | |
699 "mthi $zero, $ac3 \n\t" | |
700 "preceu.ph.qbr %[p1], %[qload3] \n\t" | |
701 "preceu.ph.qbl %[p5], %[qload3] \n\t" | |
702 "ulw %[qload1], 12(%[src]) \n\t" | |
703 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /*
even 1 */ | |
704 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /*
even 1 */ | |
705 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /*
even 1 */ | |
706 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /*
even 1 */ | |
707 "extp %[Temp2], $ac2, 31 \n\t" /*
even 1 */ | |
708 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 1 */ | |
709 | |
710 "lbu %[qload3], 2(%[dst]) \n\t" /*
load even 2 from dst */ | |
711 | |
712 /* even 3. pixel */ | |
713 "mtlo %[vector_64], $ac1 \n\t" /*
even 4 */ | |
714 "mthi $zero, $ac1 \n\t" | |
715 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /*
average even 1 */ | |
716 "preceu.ph.qbr %[p2], %[qload1] \n\t" | |
717 "sb %[st2], 0(%[dst]) \n\t" /*
store even 1 to dst */ | |
718 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /*
even 3 */ | |
719 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /*
even 3 */ | |
720 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /*
even 3 */ | |
721 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /*
even 3 */ | |
722 "extp %[Temp3], $ac3, 31 \n\t" /*
even 3 */ | |
723 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 1 */ | |
724 | |
725 /* even 4. pixel */ | |
726 "mtlo %[vector_64], $ac2 \n\t" /*
even 5 */ | |
727 "mthi $zero, $ac2 \n\t" | |
728 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /*
average even 2 */ | |
729 "preceu.ph.qbl %[p3], %[qload1] \n\t" | |
730 "sb %[qload3], 2(%[dst]) \n\t" /*
store even 2 to dst */ | |
731 "ulw %[qload2], 16(%[src]) \n\t" | |
732 "lbu %[qload3], 4(%[dst]) \n\t" /*
load even 3 from dst */ | |
733 "lbu %[qload1], 6(%[dst]) \n\t" /*
load even 4 from dst */ | |
734 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /*
even 4 */ | |
735 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /*
even 4 */ | |
736 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /*
even 4 */ | |
737 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /*
even 4 */ | |
738 "extp %[Temp1], $ac1, 31 \n\t" /*
even 4 */ | |
739 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
even 3 */ | |
740 | |
741 /* even 5. pixel */ | |
742 "mtlo %[vector_64], $ac3 \n\t" /*
even 6 */ | |
743 "mthi $zero, $ac3 \n\t" | |
744 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /*
average even 3 */ | |
745 "preceu.ph.qbr %[p4], %[qload2] \n\t" | |
746 "sb %[qload3], 4(%[dst]) \n\t" /*
store even 3 to dst */ | |
747 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /*
even 5 */ | |
748 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /*
even 5 */ | |
749 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /*
even 5 */ | |
750 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /*
even 5 */ | |
751 "extp %[Temp2], $ac2, 31 \n\t" /*
even 5 */ | |
752 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 4 */ | |
753 | |
754 /* even 6. pixel */ | |
755 "mtlo %[vector_64], $ac1 \n\t" /*
even 7 */ | |
756 "mthi $zero, $ac1 \n\t" | |
757 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /*
average even 4 */ | |
758 "preceu.ph.qbl %[p1], %[qload2] \n\t" | |
759 "sb %[qload1], 6(%[dst]) \n\t" /*
store even 4 to dst */ | |
760 "ulw %[qload3], 20(%[src]) \n\t" | |
761 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /*
even 6 */ | |
762 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /*
even 6 */ | |
763 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /*
even 6 */ | |
764 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /*
even 6 */ | |
765 "lbu %[qload2], 8(%[dst]) \n\t" /*
load even 5 from dst */ | |
766 "extp %[Temp3], $ac3, 31 \n\t" /*
even 6 */ | |
767 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 5 */ | |
768 | |
769 /* even 7. pixel */ | |
770 "mtlo %[vector_64], $ac2 \n\t" /*
even 8 */ | |
771 "mthi $zero, $ac2 \n\t" | |
772 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /*
average even 5 */ | |
773 "preceu.ph.qbr %[p5], %[qload3] \n\t" | |
774 "sb %[qload2], 8(%[dst]) \n\t" /*
store even 5 to dst */ | |
775 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /*
even 7 */ | |
776 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /*
even 7 */ | |
777 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /*
even 7 */ | |
778 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /*
even 7 */ | |
779 "lbu %[qload3], 10(%[dst]) \n\t" /*
load even 6 from dst */ | |
780 "extp %[Temp1], $ac1, 31 \n\t" /*
even 7 */ | |
781 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
even 6 */ | |
782 | |
783 "lbu %[st2], 12(%[dst]) \n\t" /*
load even 7 from dst */ | |
784 | |
785 /* even 8. pixel */ | |
786 "mtlo %[vector_64], $ac3 \n\t" /*
odd 1 */ | |
787 "mthi $zero, $ac3 \n\t" | |
788 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /*
average even 6 */ | |
789 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /*
even 8 */ | |
790 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /*
even 8 */ | |
791 "sb %[qload3], 10(%[dst]) \n\t" /*
store even 6 to dst */ | |
792 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /*
even 8 */ | |
793 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /*
even 8 */ | |
794 "extp %[Temp2], $ac2, 31 \n\t" /*
even 8 */ | |
795 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 7 */ | |
796 | |
797 /* ODD pixels */ | |
798 "ulw %[qload1], 1(%[src]) \n\t" | |
799 "ulw %[qload2], 5(%[src]) \n\t" | |
800 | |
801 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /*
average even 7 */ | |
802 | |
803 /* odd 1. pixel */ | |
804 "mtlo %[vector_64], $ac1 \n\t" /*
odd 2 */ | |
805 "mthi $zero, $ac1 \n\t" | |
806 "preceu.ph.qbr %[p1], %[qload1] \n\t" | |
807 "preceu.ph.qbl %[p2], %[qload1] \n\t" | |
808 "preceu.ph.qbr %[p3], %[qload2] \n\t" | |
809 "preceu.ph.qbl %[p4], %[qload2] \n\t" | |
810 "sb %[st2], 12(%[dst]) \n\t" /*
store even 7 to dst */ | |
811 "ulw %[qload3], 9(%[src]) \n\t" | |
812 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /*
odd 1 */ | |
813 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /*
odd 1 */ | |
814 "lbu %[qload2], 14(%[dst]) \n\t" /*
load even 8 from dst */ | |
815 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /*
odd 1 */ | |
816 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /*
odd 1 */ | |
817 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 1 */ | |
818 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 8 */ | |
819 | |
820 "lbu %[st1], 1(%[dst]) \n\t" /*
load odd 1 from dst */ | |
821 | |
822 /* odd 2. pixel */ | |
823 "mtlo %[vector_64], $ac2 \n\t" /*
odd 3 */ | |
824 "mthi $zero, $ac2 \n\t" | |
825 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /*
average even 8 */ | |
826 "preceu.ph.qbr %[p1], %[qload3] \n\t" | |
827 "preceu.ph.qbl %[p5], %[qload3] \n\t" | |
828 "sb %[qload2], 14(%[dst]) \n\t" /*
store even 8 to dst */ | |
829 "ulw %[qload1], 13(%[src]) \n\t" | |
830 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /*
odd 2 */ | |
831 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /*
odd 2 */ | |
832 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /*
odd 2 */ | |
833 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /*
odd 2 */ | |
834 "lbu %[qload3], 3(%[dst]) \n\t" /*
load odd 2 from dst */ | |
835 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 2 */ | |
836 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 1 */ | |
837 | |
838 /* odd 3. pixel */ | |
839 "mtlo %[vector_64], $ac3 \n\t" /*
odd 4 */ | |
840 "mthi $zero, $ac3 \n\t" | |
841 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /*
average odd 1 */ | |
842 "preceu.ph.qbr %[p2], %[qload1] \n\t" | |
843 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /*
odd 3 */ | |
844 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /*
odd 3 */ | |
845 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /*
odd 3 */ | |
846 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /*
odd 3 */ | |
847 "sb %[st3], 1(%[dst]) \n\t" /*
store odd 1 to dst */ | |
848 "extp %[Temp2], $ac2, 31 \n\t" /*
odd 3 */ | |
849 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 2 */ | |
850 | |
851 /* odd 4. pixel */ | |
852 "mtlo %[vector_64], $ac1 \n\t" /*
odd 5 */ | |
853 "mthi $zero, $ac1 \n\t" | |
854 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /*
average odd 2 */ | |
855 "preceu.ph.qbl %[p3], %[qload1] \n\t" | |
856 "sb %[qload3], 3(%[dst]) \n\t" /*
store odd 2 to dst */ | |
857 "lbu %[qload1], 5(%[dst]) \n\t" /*
load odd 3 from dst */ | |
858 "ulw %[qload2], 17(%[src]) \n\t" | |
859 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /*
odd 4 */ | |
860 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /*
odd 4 */ | |
861 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /*
odd 4 */ | |
862 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /*
odd 4 */ | |
863 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 4 */ | |
864 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
odd 3 */ | |
865 | |
866 "lbu %[st1], 7(%[dst]) \n\t" /*
load odd 4 from dst */ | |
867 | |
868 /* odd 5. pixel */ | |
869 "mtlo %[vector_64], $ac2 \n\t" /*
odd 6 */ | |
870 "mthi $zero, $ac2 \n\t" | |
871 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /*
average odd 3 */ | |
872 "preceu.ph.qbr %[p4], %[qload2] \n\t" | |
873 "sb %[qload1], 5(%[dst]) \n\t" /*
store odd 3 to dst */ | |
874 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /*
odd 5 */ | |
875 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /*
odd 5 */ | |
876 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /*
odd 5 */ | |
877 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /*
odd 5 */ | |
878 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 5 */ | |
879 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 4 */ | |
880 | |
881 "lbu %[qload1], 9(%[dst]) \n\t" /*
load odd 5 from dst */ | |
882 | |
883 /* odd 6. pixel */ | |
884 "mtlo %[vector_64], $ac3 \n\t" /*
odd 7 */ | |
885 "mthi $zero, $ac3 \n\t" | |
886 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /*
average odd 4 */ | |
887 "preceu.ph.qbl %[p1], %[qload2] \n\t" | |
888 "sb %[st1], 7(%[dst]) \n\t" /*
store odd 4 to dst */ | |
889 "ulw %[qload3], 21(%[src]) \n\t" | |
890 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /*
odd 6 */ | |
891 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /*
odd 6 */ | |
892 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /*
odd 6 */ | |
893 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /*
odd 6 */ | |
894 "extp %[Temp2], $ac2, 31 \n\t" /*
odd 6 */ | |
895 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 5 */ | |
896 | |
897 /* odd 7. pixel */ | |
898 "mtlo %[vector_64], $ac1 \n\t" /*
odd 8 */ | |
899 "mthi $zero, $ac1 \n\t" | |
900 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /*
average odd 5 */ | |
901 "preceu.ph.qbr %[p5], %[qload3] \n\t" | |
902 "sb %[qload1], 9(%[dst]) \n\t" /*
store odd 5 to dst */ | |
903 "lbu %[qload2], 11(%[dst]) \n\t" /*
load odd 6 from dst */ | |
904 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /*
odd 7 */ | |
905 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /*
odd 7 */ | |
906 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /*
odd 7 */ | |
907 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /*
odd 7 */ | |
908 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 7 */ | |
909 | |
910 "lbu %[qload3], 13(%[dst]) \n\t" /*
load odd 7 from dst */ | |
911 | |
912 /* odd 8. pixel */ | |
913 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /*
odd 8 */ | |
914 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /*
odd 8 */ | |
915 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /*
odd 8 */ | |
916 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /*
odd 8 */ | |
917 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 8 */ | |
918 | |
919 "lbu %[qload1], 15(%[dst]) \n\t" /*
load odd 8 from dst */ | |
920 | |
921 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
odd 6 */ | |
922 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /*
average odd 6 */ | |
923 | |
924 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 7 */ | |
925 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /*
average odd 7 */ | |
926 | |
927 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 8 */ | |
928 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /*
average odd 8 */ | |
929 | |
930 "sb %[qload2], 11(%[dst]) \n\t" /*
store odd 6 to dst */ | |
931 "sb %[qload3], 13(%[dst]) \n\t" /*
store odd 7 to dst */ | |
932 "sb %[qload1], 15(%[dst]) \n\t" /*
store odd 8 to dst */ | |
933 | |
934 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), | |
935 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), | |
936 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
937 [qload3] "=&r" (qload3), [p5] "=&r" (p5), | |
938 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) | |
939 : [filter12] "r" (filter12), [filter34] "r" (filter34), | |
940 [filter56] "r" (filter56), [filter78] "r" (filter78), | |
941 [vector_64] "r" (vector_64), | |
942 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) | |
943 ); | |
944 | |
945 src += 16; | |
946 dst += 16; | |
947 } | |
948 | |
949 /* Next row... */ | |
950 src_ptr += src_stride; | |
951 dst_ptr += dst_stride; | |
952 } | |
953 } | |
954 | |
/* Horizontal 8-tap convolve-and-average entry point for MIPS DSPR2.
 *
 * Dispatches on the filter contents and step size:
 *  - an identity filter degenerates to a plain src/dst average,
 *  - a filter whose first taps are zero takes the short (2-tap) path,
 *  - otherwise the width-specialized DSPR2 kernels run when the filter
 *    step is exactly one pixel (x_step_q4 == 16); anything else falls
 *    back to the generic C implementation.
 *
 * Parameters follow the standard vpx convolve signature; dst receives
 * the rounded average of the filtered source and the existing dst pixels.
 */
void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  /* NOTE(review): reading the int16_t tap array through an int32_t
     pointer relies on alignment and little-endian layout and violates
     strict aliasing; it matches the rest of this file's convention but
     is worth confirming against the project's build flags. */
  if (((const int32_t *)filter_x)[1] == 0x800000) {
    /* Word 1 holds taps 2 and 3; 0x00800000 means tap 3 == 128 (the
       center tap of a Q7 identity filter) and tap 2 == 0, so filtering
       is a no-op and only the averaging step remains. */
    vpx_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    /* Taps 0 and 1 are zero: treat as a short (bilinear-style) filter. */
    vpx_convolve2_avg_horiz_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
  } else {
    if (16 == x_step_q4) {
      uint32_t pos = 38;

      /* Back up 3 pixels so the 8-tap window is centered on the output
         pixel; the DSPR2 kernels expect this pre-adjusted pointer. */
      src -= 3;

      /* bit position for extract from acc */
      __asm__ __volatile__ (
        "wrdsp %[pos], 1 \n\t"
        :
        : [pos] "r" (pos)
      );

      /* prefetch data to cache memory */
      prefetch_load(src);
      prefetch_load(src + 32);
      prefetch_store(dst);

      switch (w) {
        case 4:
          convolve_avg_horiz_4_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_x, h);
          break;
        case 8:
          convolve_avg_horiz_8_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_x, h);
          break;
        case 16:
          convolve_avg_horiz_16_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h, 1);
          break;
        case 32:
          /* The 16-wide kernel processes the row in two passes. */
          convolve_avg_horiz_16_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h, 2);
          break;
        case 64:
          prefetch_load(src + 64);
          prefetch_store(dst + 32);

          convolve_avg_horiz_64_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h);
          break;
        default:
          /* Undo the 3-pixel adjustment: the C fallback applies its own
             filter offset internally. */
          vpx_convolve8_avg_horiz_c(src + 3, src_stride,
                                    dst, dst_stride,
                                    filter_x, x_step_q4,
                                    filter_y, y_step_q4,
                                    w, h);
          break;
      }
    } else {
      vpx_convolve8_avg_horiz_c(src, src_stride,
                                dst, dst_stride,
                                filter_x, x_step_q4,
                                filter_y, y_step_q4,
                                w, h);
    }
  }
}
1036 #endif | |
OLD | NEW |