OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 #include <assert.h> | |
12 #include <stdio.h> | |
13 | |
14 #include "./vpx_dsp_rtcd.h" | |
15 #include "vpx_dsp/mips/vpx_common_dspr2.h" | |
16 #include "vpx_dsp/vpx_dsp_common.h" | |
17 #include "vpx_dsp/vpx_filter.h" | |
18 #include "vpx_ports/mem.h" | |
19 | |
20 #if HAVE_DSPR2 | |
21 static void convolve_horiz_4_dspr2(const uint8_t *src, | |
22 int32_t src_stride, | |
23 uint8_t *dst, | |
24 int32_t dst_stride, | |
25 const int16_t *filter_x0, | |
26 int32_t h) { | |
27 int32_t y; | |
28 uint8_t *cm = vpx_ff_cropTbl; | |
29 int32_t vector1b, vector2b, vector3b, vector4b; | |
30 int32_t Temp1, Temp2, Temp3, Temp4; | |
31 uint32_t vector4a = 64; | |
32 uint32_t tp1, tp2; | |
33 uint32_t p1, p2, p3, p4; | |
34 uint32_t n1, n2, n3, n4; | |
35 uint32_t tn1, tn2; | |
36 | |
37 vector1b = ((const int32_t *)filter_x0)[0]; | |
38 vector2b = ((const int32_t *)filter_x0)[1]; | |
39 vector3b = ((const int32_t *)filter_x0)[2]; | |
40 vector4b = ((const int32_t *)filter_x0)[3]; | |
41 | |
42 for (y = h; y--;) { | |
43 /* prefetch data to cache memory */ | |
44 prefetch_load(src + src_stride); | |
45 prefetch_load(src + src_stride + 32); | |
46 prefetch_store(dst + dst_stride); | |
47 | |
48 __asm__ __volatile__ ( | |
49 "ulw %[tp1], 0(%[src]) \n\t" | |
50 "ulw %[tp2], 4(%[src]) \n\t" | |
51 | |
52 /* even 1. pixel */ | |
53 "mtlo %[vector4a], $ac3 \n\t" | |
54 "mthi $zero, $ac3 \n\t" | |
55 "preceu.ph.qbr %[p1], %[tp1] \n\t" | |
56 "preceu.ph.qbl %[p2], %[tp1] \n\t" | |
57 "preceu.ph.qbr %[p3], %[tp2] \n\t" | |
58 "preceu.ph.qbl %[p4], %[tp2] \n\t" | |
59 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" | |
60 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" | |
61 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" | |
62 "ulw %[tn2], 8(%[src]) \n\t" | |
63 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" | |
64 "extp %[Temp1], $ac3, 31 \n\t" | |
65 | |
66 /* even 2. pixel */ | |
67 "mtlo %[vector4a], $ac2 \n\t" | |
68 "mthi $zero, $ac2 \n\t" | |
69 "preceu.ph.qbr %[p1], %[tn2] \n\t" | |
70 "balign %[tn1], %[tn2], 3 \n\t" | |
71 "balign %[tn2], %[tp2], 3 \n\t" | |
72 "balign %[tp2], %[tp1], 3 \n\t" | |
73 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" | |
74 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" | |
75 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" | |
76 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" | |
77 "extp %[Temp3], $ac2, 31 \n\t" | |
78 | |
79 /* odd 1. pixel */ | |
80 "lbux %[tp1], %[Temp1](%[cm]) \n\t" | |
81 "mtlo %[vector4a], $ac3 \n\t" | |
82 "mthi $zero, $ac3 \n\t" | |
83 "preceu.ph.qbr %[n1], %[tp2] \n\t" | |
84 "preceu.ph.qbl %[n2], %[tp2] \n\t" | |
85 "preceu.ph.qbr %[n3], %[tn2] \n\t" | |
86 "preceu.ph.qbl %[n4], %[tn2] \n\t" | |
87 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" | |
88 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" | |
89 "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" | |
90 "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" | |
91 "extp %[Temp2], $ac3, 31 \n\t" | |
92 | |
93 /* odd 2. pixel */ | |
94 "lbux %[tp2], %[Temp3](%[cm]) \n\t" | |
95 "mtlo %[vector4a], $ac2 \n\t" | |
96 "mthi $zero, $ac2 \n\t" | |
97 "preceu.ph.qbr %[n1], %[tn1] \n\t" | |
98 "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" | |
99 "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" | |
100 "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" | |
101 "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" | |
102 "extp %[Temp4], $ac2, 31 \n\t" | |
103 | |
104 /* clamp */ | |
105 "lbux %[tn1], %[Temp2](%[cm]) \n\t" | |
106 "lbux %[n2], %[Temp4](%[cm]) \n\t" | |
107 | |
108 /* store bytes */ | |
109 "sb %[tp1], 0(%[dst]) \n\t" | |
110 "sb %[tn1], 1(%[dst]) \n\t" | |
111 "sb %[tp2], 2(%[dst]) \n\t" | |
112 "sb %[n2], 3(%[dst]) \n\t" | |
113 | |
114 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), | |
115 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), | |
116 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
117 [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), | |
118 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), | |
119 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) | |
120 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), | |
121 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), | |
122 [vector4a] "r" (vector4a), | |
123 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) | |
124 ); | |
125 | |
126 /* Next row... */ | |
127 src += src_stride; | |
128 dst += dst_stride; | |
129 } | |
130 } | |
131 | |
/*
 * convolve_horiz_8_dspr2: horizontal 8-tap FIR filter of an 8-pixel-wide
 * strip, h rows (MIPS DSPr2 inline assembly).  Per row it produces 8
 * output pixels: 4 "even" phases from the aligned byte stream and 4 "odd"
 * phases from the balign-rotated (src+1) stream.  Results are clamped via
 * the vpx_ff_cropTbl lookup (lbux) and stored one byte at a time.  The asm
 * is software-pipelined across accumulators $ac1..$ac3 — stores and table
 * lookups for pixel k are issued while pixel k+1 accumulates.  Do not
 * reorder instructions.
 * NOTE(review): the leading numbers and trailing "| |" on every line are
 * artifacts of a diff-table extraction of this file, not original source.
 */
132 static void convolve_horiz_8_dspr2(const uint8_t *src, | |
133 int32_t src_stride, | |
134 uint8_t *dst, | |
135 int32_t dst_stride, | |
136 const int16_t *filter_x0, | |
137 int32_t h) { | |
138 int32_t y; | |
/* Clamp table, indexed with lbux after each extp. */
139 uint8_t *cm = vpx_ff_cropTbl; | |
/* Rounding seed placed in each accumulator's LO via mtlo (see
 * convolve_horiz_4_dspr2 — same presumed rounding role; TODO confirm). */
140 uint32_t vector4a = 64; | |
141 int32_t vector1b, vector2b, vector3b, vector4b; | |
142 int32_t Temp1, Temp2, Temp3; | |
143 uint32_t tp1, tp2; | |
144 uint32_t p1, p2, p3, p4, n1; | |
145 uint32_t tn1, tn2, tn3; | |
146 uint32_t st0, st1; | |
147 | |
/* Eight int16 taps read as four packed 32-bit tap pairs for dpa.w.ph.
 * NOTE(review): type-puns int16_t* as int32_t*; assumes 4-byte alignment
 * of filter_x0 — pre-existing idiom in this file. */
148 vector1b = ((const int32_t *)filter_x0)[0]; | |
149 vector2b = ((const int32_t *)filter_x0)[1]; | |
150 vector3b = ((const int32_t *)filter_x0)[2]; | |
151 vector4b = ((const int32_t *)filter_x0)[3]; | |
152 | |
153 for (y = h; y--;) { | |
154 /* prefetch data to cache memory */ | |
155 prefetch_load(src + src_stride); | |
156 prefetch_load(src + src_stride + 32); | |
157 prefetch_store(dst + dst_stride); | |
158 | |
159 __asm__ __volatile__ ( | |
160 "ulw %[tp1], 0(%[src]) \n\t" | |
161 "ulw %[tp2], 4(%[src]) \n\t" | |
162 | |
163 /* even 1. pixel */ | |
164 "mtlo %[vector4a], $ac3 \n\t" | |
165 "mthi $zero, $ac3 \n\t" | |
166 "mtlo %[vector4a], $ac2 \n\t" | |
167 "mthi $zero, $ac2 \n\t" | |
168 "preceu.ph.qbr %[p1], %[tp1] \n\t" | |
169 "preceu.ph.qbl %[p2], %[tp1] \n\t" | |
170 "preceu.ph.qbr %[p3], %[tp2] \n\t" | |
171 "preceu.ph.qbl %[p4], %[tp2] \n\t" | |
172 "ulw %[tn2], 8(%[src]) \n\t" | |
173 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" | |
174 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" | |
175 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" | |
176 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" | |
177 "extp %[Temp1], $ac3, 31 \n\t" | |
178 | |
179 /* even 2. pixel */ | |
180 "preceu.ph.qbr %[p1], %[tn2] \n\t" | |
181 "preceu.ph.qbl %[n1], %[tn2] \n\t" | |
182 "ulw %[tn1], 12(%[src]) \n\t" | |
183 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" | |
184 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" | |
185 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" | |
186 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" | |
187 "extp %[Temp3], $ac2, 31 \n\t" | |
188 | |
189 /* even 3. pixel */ | |
190 "lbux %[st0], %[Temp1](%[cm]) \n\t" | |
191 "mtlo %[vector4a], $ac1 \n\t" | |
192 "mthi $zero, $ac1 \n\t" | |
193 "preceu.ph.qbr %[p2], %[tn1] \n\t" | |
194 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" | |
195 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" | |
196 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" | |
197 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" | |
198 "extp %[Temp1], $ac1, 31 \n\t" | |
199 | |
200 /* even 4. pixel */ | |
201 "mtlo %[vector4a], $ac2 \n\t" | |
202 "mthi $zero, $ac2 \n\t" | |
203 "mtlo %[vector4a], $ac3 \n\t" | |
204 "mthi $zero, $ac3 \n\t" | |
205 "sb %[st0], 0(%[dst]) \n\t" | |
206 "lbux %[st1], %[Temp3](%[cm]) \n\t" | |
207 | |
/* Rotate all loaded words by 3 bytes: builds the (src+1) byte stream
 * for the odd-phase pixels without issuing new unaligned loads. */
208 "balign %[tn3], %[tn1], 3 \n\t" | |
209 "balign %[tn1], %[tn2], 3 \n\t" | |
210 "balign %[tn2], %[tp2], 3 \n\t" | |
211 "balign %[tp2], %[tp1], 3 \n\t" | |
212 | |
213 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" | |
214 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" | |
215 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" | |
216 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" | |
217 "extp %[Temp3], $ac2, 31 \n\t" | |
218 | |
219 "lbux %[st0], %[Temp1](%[cm]) \n\t" | |
220 | |
221 /* odd 1. pixel */ | |
222 "mtlo %[vector4a], $ac1 \n\t" | |
223 "mthi $zero, $ac1 \n\t" | |
224 "sb %[st1], 2(%[dst]) \n\t" | |
225 "preceu.ph.qbr %[p1], %[tp2] \n\t" | |
226 "preceu.ph.qbl %[p2], %[tp2] \n\t" | |
227 "preceu.ph.qbr %[p3], %[tn2] \n\t" | |
228 "preceu.ph.qbl %[p4], %[tn2] \n\t" | |
229 "sb %[st0], 4(%[dst]) \n\t" | |
230 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" | |
231 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" | |
232 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" | |
233 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" | |
234 "extp %[Temp2], $ac3, 31 \n\t" | |
235 | |
236 /* odd 2. pixel */ | |
237 "mtlo %[vector4a], $ac3 \n\t" | |
238 "mthi $zero, $ac3 \n\t" | |
239 "mtlo %[vector4a], $ac2 \n\t" | |
240 "mthi $zero, $ac2 \n\t" | |
241 "preceu.ph.qbr %[p1], %[tn1] \n\t" | |
242 "preceu.ph.qbl %[n1], %[tn1] \n\t" | |
243 "lbux %[st0], %[Temp3](%[cm]) \n\t" | |
244 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" | |
245 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" | |
246 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" | |
247 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" | |
248 "extp %[Temp3], $ac1, 31 \n\t" | |
249 | |
250 /* odd 3. pixel */ | |
251 "lbux %[st1], %[Temp2](%[cm]) \n\t" | |
252 "preceu.ph.qbr %[p2], %[tn3] \n\t" | |
253 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" | |
254 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" | |
255 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" | |
256 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" | |
257 "extp %[Temp2], $ac3, 31 \n\t" | |
258 | |
259 /* odd 4. pixel */ | |
260 "sb %[st1], 1(%[dst]) \n\t" | |
261 "sb %[st0], 6(%[dst]) \n\t" | |
262 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" | |
263 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" | |
264 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" | |
265 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" | |
266 "extp %[Temp1], $ac2, 31 \n\t" | |
267 | |
268 /* clamp */ | |
269 "lbux %[p4], %[Temp3](%[cm]) \n\t" | |
270 "lbux %[p2], %[Temp2](%[cm]) \n\t" | |
271 "lbux %[n1], %[Temp1](%[cm]) \n\t" | |
272 | |
273 /* store bytes */ | |
274 "sb %[p4], 3(%[dst]) \n\t" | |
275 "sb %[p2], 5(%[dst]) \n\t" | |
276 "sb %[n1], 7(%[dst]) \n\t" | |
277 | |
278 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), | |
279 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), | |
280 [st0] "=&r" (st0), [st1] "=&r" (st1), | |
281 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
282 [n1] "=&r" (n1), | |
283 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) | |
284 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), | |
285 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), | |
286 [vector4a] "r" (vector4a), | |
287 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) | |
288 ); | |
289 | |
290 /* Next row... */ | |
291 src += src_stride; | |
292 dst += dst_stride; | |
293 } | |
294 } | |
295 | |
/*
 * convolve_horiz_16_dspr2: horizontal 8-tap FIR filter of a strip that is
 * 16*count pixels wide, h rows (MIPS DSPr2 inline assembly).  Each inner
 * iteration filters 16 pixels: 8 "even" outputs from the aligned stream
 * (ulw at 0/4/8/...), then 8 "odd" outputs from the stream re-loaded at
 * src+1 (ulw at 1/5/9/...).  Results are clamped through vpx_ff_cropTbl
 * (lbux) and stored byte-wise at interleaved offsets.  The kernel is
 * software-pipelined across $ac1..$ac3: the trailing /* even N / odd N
 * comments on each wrapped asm line record which pipeline stage that
 * instruction belongs to.  Do not reorder instructions.
 * NOTE(review): the leading numbers, trailing "| |", and the hard-wrapped
 * asm lines (comment split onto the next physical line) are artifacts of a
 * diff-table extraction of this file, not original source.
 */
296 static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, | |
297 int32_t src_stride, | |
298 uint8_t *dst_ptr, | |
299 int32_t dst_stride, | |
300 const int16_t *filter_x0, | |
301 int32_t h, | |
302 int32_t count) { | |
303 int32_t y, c; | |
304 const uint8_t *src; | |
305 uint8_t *dst; | |
/* Clamp table, indexed with lbux after each extp. */
306 uint8_t *cm = vpx_ff_cropTbl; | |
/* Rounding seed for each accumulator (same presumed role as in the
 * 4-/8-wide kernels; TODO confirm). */
307 uint32_t vector_64 = 64; | |
308 int32_t filter12, filter34, filter56, filter78; | |
309 int32_t Temp1, Temp2, Temp3; | |
310 uint32_t qload1, qload2, qload3; | |
311 uint32_t p1, p2, p3, p4, p5; | |
312 uint32_t st1, st2, st3; | |
313 | |
/* Eight int16 taps read as four packed 32-bit tap pairs for dpa.w.ph.
 * NOTE(review): type-puns int16_t* as int32_t*; assumes 4-byte alignment
 * of filter_x0 — pre-existing idiom in this file. */
314 filter12 = ((const int32_t *)filter_x0)[0]; | |
315 filter34 = ((const int32_t *)filter_x0)[1]; | |
316 filter56 = ((const int32_t *)filter_x0)[2]; | |
317 filter78 = ((const int32_t *)filter_x0)[3]; | |
318 | |
319 for (y = h; y--;) { | |
320 src = src_ptr; | |
321 dst = dst_ptr; | |
322 | |
323 /* prefetch data to cache memory */ | |
324 prefetch_load(src_ptr + src_stride); | |
325 prefetch_load(src_ptr + src_stride + 32); | |
326 prefetch_store(dst_ptr + dst_stride); | |
327 | |
328 for (c = 0; c < count; c++) { | |
329 __asm__ __volatile__ ( | |
330 "ulw %[qload1], 0(%[src]) \n\t" | |
331 "ulw %[qload2], 4(%[src]) \n\t" | |
332 | |
333 /* even 1. pixel */ | |
334 "mtlo %[vector_64], $ac1 \n\t" /*
even 1 */ | |
335 "mthi $zero, $ac1 \n\t" | |
336 "mtlo %[vector_64], $ac2 \n\t" /*
even 2 */ | |
337 "mthi $zero, $ac2 \n\t" | |
338 "preceu.ph.qbr %[p1], %[qload1] \n\t" | |
339 "preceu.ph.qbl %[p2], %[qload1] \n\t" | |
340 "preceu.ph.qbr %[p3], %[qload2] \n\t" | |
341 "preceu.ph.qbl %[p4], %[qload2] \n\t" | |
342 "ulw %[qload3], 8(%[src]) \n\t" | |
343 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /*
even 1 */ | |
344 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /*
even 1 */ | |
345 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /*
even 1 */ | |
346 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /*
even 1 */ | |
347 "extp %[Temp1], $ac1, 31 \n\t" /*
even 1 */ | |
348 | |
349 /* even 2. pixel */ | |
350 "mtlo %[vector_64], $ac3 \n\t" /*
even 3 */ | |
351 "mthi $zero, $ac3 \n\t" | |
352 "preceu.ph.qbr %[p1], %[qload3] \n\t" | |
353 "preceu.ph.qbl %[p5], %[qload3] \n\t" | |
354 "ulw %[qload1], 12(%[src]) \n\t" | |
355 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /*
even 1 */ | |
356 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /*
even 1 */ | |
357 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /*
even 1 */ | |
358 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /*
even 1 */ | |
359 "extp %[Temp2], $ac2, 31 \n\t" /*
even 1 */ | |
360 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 1 */ | |
361 | |
362 /* even 3. pixel */ | |
363 "mtlo %[vector_64], $ac1 \n\t" /*
even 4 */ | |
364 "mthi $zero, $ac1 \n\t" | |
365 "preceu.ph.qbr %[p2], %[qload1] \n\t" | |
366 "sb %[st1], 0(%[dst]) \n\t" /*
even 1 */ | |
367 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /*
even 3 */ | |
368 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /*
even 3 */ | |
369 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /*
even 3 */ | |
370 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /*
even 3 */ | |
371 "extp %[Temp3], $ac3, 31 \n\t" /*
even 3 */ | |
372 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 1 */ | |
373 | |
374 /* even 4. pixel */ | |
375 "mtlo %[vector_64], $ac2 \n\t" /*
even 5 */ | |
376 "mthi $zero, $ac2 \n\t" | |
377 "preceu.ph.qbl %[p3], %[qload1] \n\t" | |
378 "sb %[st2], 2(%[dst]) \n\t" /*
even 1 */ | |
379 "ulw %[qload2], 16(%[src]) \n\t" | |
380 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /*
even 4 */ | |
381 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /*
even 4 */ | |
382 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /*
even 4 */ | |
383 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /*
even 4 */ | |
384 "extp %[Temp1], $ac1, 31 \n\t" /*
even 4 */ | |
385 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
even 3 */ | |
386 | |
387 /* even 5. pixel */ | |
388 "mtlo %[vector_64], $ac3 \n\t" /*
even 6 */ | |
389 "mthi $zero, $ac3 \n\t" | |
390 "preceu.ph.qbr %[p4], %[qload2] \n\t" | |
391 "sb %[st3], 4(%[dst]) \n\t" /*
even 3 */ | |
392 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /*
even 5 */ | |
393 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /*
even 5 */ | |
394 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /*
even 5 */ | |
395 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /*
even 5 */ | |
396 "extp %[Temp2], $ac2, 31 \n\t" /*
even 5 */ | |
397 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 4 */ | |
398 | |
399 /* even 6. pixel */ | |
400 "mtlo %[vector_64], $ac1 \n\t" /*
even 7 */ | |
401 "mthi $zero, $ac1 \n\t" | |
402 "preceu.ph.qbl %[p1], %[qload2] \n\t" | |
403 "sb %[st1], 6(%[dst]) \n\t" /*
even 4 */ | |
404 "ulw %[qload3], 20(%[src]) \n\t" | |
405 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /*
even 6 */ | |
406 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /*
even 6 */ | |
407 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /*
even 6 */ | |
408 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /*
even 6 */ | |
409 "extp %[Temp3], $ac3, 31 \n\t" /*
even 6 */ | |
410 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 5 */ | |
411 | |
412 /* even 7. pixel */ | |
413 "mtlo %[vector_64], $ac2 \n\t" /*
even 8 */ | |
414 "mthi $zero, $ac2 \n\t" | |
415 "preceu.ph.qbr %[p5], %[qload3] \n\t" | |
416 "sb %[st2], 8(%[dst]) \n\t" /*
even 5 */ | |
417 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /*
even 7 */ | |
418 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /*
even 7 */ | |
419 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /*
even 7 */ | |
420 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /*
even 7 */ | |
421 "extp %[Temp1], $ac1, 31 \n\t" /*
even 7 */ | |
422 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
even 6 */ | |
423 | |
424 /* even 8. pixel */ | |
425 "mtlo %[vector_64], $ac3 \n\t" /*
odd 1 */ | |
426 "mthi $zero, $ac3 \n\t" | |
427 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /*
even 8 */ | |
428 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /*
even 8 */ | |
429 "sb %[st3], 10(%[dst]) \n\t" /*
even 6 */ | |
430 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /*
even 8 */ | |
431 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /*
even 8 */ | |
432 "extp %[Temp2], $ac2, 31 \n\t" /*
even 8 */ | |
433 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 7 */ | |
434 | |
/* Odd phase: reload the same window shifted by one byte (src+1)
 * instead of rotating with balign as the 4-/8-wide kernels do. */
435 /* ODD pixels */ | |
436 "ulw %[qload1], 1(%[src]) \n\t" | |
437 "ulw %[qload2], 5(%[src]) \n\t" | |
438 | |
439 /* odd 1. pixel */ | |
440 "mtlo %[vector_64], $ac1 \n\t" /*
odd 2 */ | |
441 "mthi $zero, $ac1 \n\t" | |
442 "preceu.ph.qbr %[p1], %[qload1] \n\t" | |
443 "preceu.ph.qbl %[p2], %[qload1] \n\t" | |
444 "preceu.ph.qbr %[p3], %[qload2] \n\t" | |
445 "preceu.ph.qbl %[p4], %[qload2] \n\t" | |
446 "sb %[st1], 12(%[dst]) \n\t" /*
even 7 */ | |
447 "ulw %[qload3], 9(%[src]) \n\t" | |
448 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /*
odd 1 */ | |
449 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /*
odd 1 */ | |
450 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /*
odd 1 */ | |
451 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /*
odd 1 */ | |
452 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 1 */ | |
453 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 8 */ | |
454 | |
455 /* odd 2. pixel */ | |
456 "mtlo %[vector_64], $ac2 \n\t" /*
odd 3 */ | |
457 "mthi $zero, $ac2 \n\t" | |
458 "preceu.ph.qbr %[p1], %[qload3] \n\t" | |
459 "preceu.ph.qbl %[p5], %[qload3] \n\t" | |
460 "sb %[st2], 14(%[dst]) \n\t" /*
even 8 */ | |
461 "ulw %[qload1], 13(%[src]) \n\t" | |
462 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /*
odd 2 */ | |
463 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /*
odd 2 */ | |
464 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /*
odd 2 */ | |
465 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /*
odd 2 */ | |
466 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 2 */ | |
467 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 1 */ | |
468 | |
469 /* odd 3. pixel */ | |
470 "mtlo %[vector_64], $ac3 \n\t" /*
odd 4 */ | |
471 "mthi $zero, $ac3 \n\t" | |
472 "preceu.ph.qbr %[p2], %[qload1] \n\t" | |
473 "sb %[st3], 1(%[dst]) \n\t" /*
odd 1 */ | |
474 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /*
odd 3 */ | |
475 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /*
odd 3 */ | |
476 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /*
odd 3 */ | |
477 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /*
odd 3 */ | |
478 "extp %[Temp2], $ac2, 31 \n\t" /*
odd 3 */ | |
479 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 2 */ | |
480 | |
481 /* odd 4. pixel */ | |
482 "mtlo %[vector_64], $ac1 \n\t" /*
odd 5 */ | |
483 "mthi $zero, $ac1 \n\t" | |
484 "preceu.ph.qbl %[p3], %[qload1] \n\t" | |
485 "sb %[st1], 3(%[dst]) \n\t" /*
odd 2 */ | |
486 "ulw %[qload2], 17(%[src]) \n\t" | |
487 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /*
odd 4 */ | |
488 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /*
odd 4 */ | |
489 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /*
odd 4 */ | |
490 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /*
odd 4 */ | |
491 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 4 */ | |
492 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
odd 3 */ | |
493 | |
494 /* odd 5. pixel */ | |
495 "mtlo %[vector_64], $ac2 \n\t" /*
odd 6 */ | |
496 "mthi $zero, $ac2 \n\t" | |
497 "preceu.ph.qbr %[p4], %[qload2] \n\t" | |
498 "sb %[st2], 5(%[dst]) \n\t" /*
odd 3 */ | |
499 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /*
odd 5 */ | |
500 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /*
odd 5 */ | |
501 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /*
odd 5 */ | |
502 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /*
odd 5 */ | |
503 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 5 */ | |
504 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 4 */ | |
505 | |
506 /* odd 6. pixel */ | |
507 "mtlo %[vector_64], $ac3 \n\t" /*
odd 7 */ | |
508 "mthi $zero, $ac3 \n\t" | |
509 "preceu.ph.qbl %[p1], %[qload2] \n\t" | |
510 "sb %[st3], 7(%[dst]) \n\t" /*
odd 4 */ | |
511 "ulw %[qload3], 21(%[src]) \n\t" | |
512 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /*
odd 6 */ | |
513 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /*
odd 6 */ | |
514 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /*
odd 6 */ | |
515 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /*
odd 6 */ | |
516 "extp %[Temp2], $ac2, 31 \n\t" /*
odd 6 */ | |
517 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 5 */ | |
518 | |
519 /* odd 7. pixel */ | |
520 "mtlo %[vector_64], $ac1 \n\t" /*
odd 8 */ | |
521 "mthi $zero, $ac1 \n\t" | |
522 "preceu.ph.qbr %[p5], %[qload3] \n\t" | |
523 "sb %[st1], 9(%[dst]) \n\t" /*
odd 5 */ | |
524 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /*
odd 7 */ | |
525 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /*
odd 7 */ | |
526 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /*
odd 7 */ | |
527 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /*
odd 7 */ | |
528 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 7 */ | |
529 | |
530 /* odd 8. pixel */ | |
531 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /*
odd 8 */ | |
532 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /*
odd 8 */ | |
533 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /*
odd 8 */ | |
534 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /*
odd 8 */ | |
535 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 8 */ | |
536 | |
537 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
odd 6 */ | |
538 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 7 */ | |
539 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 8 */ | |
540 | |
541 "sb %[st2], 11(%[dst]) \n\t" /*
odd 6 */ | |
542 "sb %[st3], 13(%[dst]) \n\t" /*
odd 7 */ | |
543 "sb %[st1], 15(%[dst]) \n\t" /*
odd 8 */ | |
544 | |
545 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (ql
oad3), | |
546 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), | |
547 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
548 [p5] "=&r" (p5), | |
549 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) | |
550 : [filter12] "r" (filter12), [filter34] "r" (filter34), | |
551 [filter56] "r" (filter56), [filter78] "r" (filter78), | |
552 [vector_64] "r" (vector_64), | |
553 [cm] "r" (cm), [dst] "r" (dst), | |
554 [src] "r" (src) | |
555 ); | |
556 | |
557 src += 16; | |
558 dst += 16; | |
559 } | |
560 | |
561 /* Next row... */ | |
562 src_ptr += src_stride; | |
563 dst_ptr += dst_stride; | |
564 } | |
565 } | |
566 | |
567 static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, | |
568 int32_t src_stride, | |
569 uint8_t *dst_ptr, | |
570 int32_t dst_stride, | |
571 const int16_t *filter_x0, | |
572 int32_t h) { | |
573 int32_t y, c; | |
574 const uint8_t *src; | |
575 uint8_t *dst; | |
576 uint8_t *cm = vpx_ff_cropTbl; | |
577 uint32_t vector_64 = 64; | |
578 int32_t filter12, filter34, filter56, filter78; | |
579 int32_t Temp1, Temp2, Temp3; | |
580 uint32_t qload1, qload2, qload3; | |
581 uint32_t p1, p2, p3, p4, p5; | |
582 uint32_t st1, st2, st3; | |
583 | |
584 filter12 = ((const int32_t *)filter_x0)[0]; | |
585 filter34 = ((const int32_t *)filter_x0)[1]; | |
586 filter56 = ((const int32_t *)filter_x0)[2]; | |
587 filter78 = ((const int32_t *)filter_x0)[3]; | |
588 | |
589 for (y = h; y--;) { | |
590 src = src_ptr; | |
591 dst = dst_ptr; | |
592 | |
593 /* prefetch data to cache memory */ | |
594 prefetch_load(src_ptr + src_stride); | |
595 prefetch_load(src_ptr + src_stride + 32); | |
596 prefetch_load(src_ptr + src_stride + 64); | |
597 prefetch_store(dst_ptr + dst_stride); | |
598 prefetch_store(dst_ptr + dst_stride + 32); | |
599 | |
600 for (c = 0; c < 4; c++) { | |
601 __asm__ __volatile__ ( | |
602 "ulw %[qload1], 0(%[src]) \n\t" | |
603 "ulw %[qload2], 4(%[src]) \n\t" | |
604 | |
605 /* even 1. pixel */ | |
606 "mtlo %[vector_64], $ac1 \n\t" /*
even 1 */ | |
607 "mthi $zero, $ac1 \n\t" | |
608 "mtlo %[vector_64], $ac2 \n\t" /*
even 2 */ | |
609 "mthi $zero, $ac2 \n\t" | |
610 "preceu.ph.qbr %[p1], %[qload1] \n\t" | |
611 "preceu.ph.qbl %[p2], %[qload1] \n\t" | |
612 "preceu.ph.qbr %[p3], %[qload2] \n\t" | |
613 "preceu.ph.qbl %[p4], %[qload2] \n\t" | |
614 "ulw %[qload3], 8(%[src]) \n\t" | |
615 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /*
even 1 */ | |
616 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /*
even 1 */ | |
617 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /*
even 1 */ | |
618 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /*
even 1 */ | |
619 "extp %[Temp1], $ac1, 31 \n\t" /*
even 1 */ | |
620 | |
621 /* even 2. pixel */ | |
622 "mtlo %[vector_64], $ac3 \n\t" /*
even 3 */ | |
623 "mthi $zero, $ac3 \n\t" | |
624 "preceu.ph.qbr %[p1], %[qload3] \n\t" | |
625 "preceu.ph.qbl %[p5], %[qload3] \n\t" | |
626 "ulw %[qload1], 12(%[src]) \n\t" | |
627 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /*
even 1 */ | |
628 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /*
even 1 */ | |
629 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /*
even 1 */ | |
630 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /*
even 1 */ | |
631 "extp %[Temp2], $ac2, 31 \n\t" /*
even 1 */ | |
632 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 1 */ | |
633 | |
634 /* even 3. pixel */ | |
635 "mtlo %[vector_64], $ac1 \n\t" /*
even 4 */ | |
636 "mthi $zero, $ac1 \n\t" | |
637 "preceu.ph.qbr %[p2], %[qload1] \n\t" | |
638 "sb %[st1], 0(%[dst]) \n\t" /*
even 1 */ | |
639 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /*
even 3 */ | |
640 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /*
even 3 */ | |
641 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /*
even 3 */ | |
642 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /*
even 3 */ | |
643 "extp %[Temp3], $ac3, 31 \n\t" /*
even 3 */ | |
644 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 1 */ | |
645 | |
646 /* even 4. pixel */ | |
647 "mtlo %[vector_64], $ac2 \n\t" /*
even 5 */ | |
648 "mthi $zero, $ac2 \n\t" | |
649 "preceu.ph.qbl %[p3], %[qload1] \n\t" | |
650 "sb %[st2], 2(%[dst]) \n\t" /*
even 1 */ | |
651 "ulw %[qload2], 16(%[src]) \n\t" | |
652 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /*
even 4 */ | |
653 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /*
even 4 */ | |
654 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /*
even 4 */ | |
655 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /*
even 4 */ | |
656 "extp %[Temp1], $ac1, 31 \n\t" /*
even 4 */ | |
657 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
even 3 */ | |
658 | |
659 /* even 5. pixel */ | |
660 "mtlo %[vector_64], $ac3 \n\t" /*
even 6 */ | |
661 "mthi $zero, $ac3 \n\t" | |
662 "preceu.ph.qbr %[p4], %[qload2] \n\t" | |
663 "sb %[st3], 4(%[dst]) \n\t" /*
even 3 */ | |
664 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /*
even 5 */ | |
665 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /*
even 5 */ | |
666 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /*
even 5 */ | |
667 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /*
even 5 */ | |
668 "extp %[Temp2], $ac2, 31 \n\t" /*
even 5 */ | |
669 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 4 */ | |
670 | |
671 /* even 6. pixel */ | |
672 "mtlo %[vector_64], $ac1 \n\t" /*
even 7 */ | |
673 "mthi $zero, $ac1 \n\t" | |
674 "preceu.ph.qbl %[p1], %[qload2] \n\t" | |
675 "sb %[st1], 6(%[dst]) \n\t" /*
even 4 */ | |
676 "ulw %[qload3], 20(%[src]) \n\t" | |
677 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /*
even 6 */ | |
678 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /*
even 6 */ | |
679 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /*
even 6 */ | |
680 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /*
even 6 */ | |
681 "extp %[Temp3], $ac3, 31 \n\t" /*
even 6 */ | |
682 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 5 */ | |
683 | |
684 /* even 7. pixel */ | |
685 "mtlo %[vector_64], $ac2 \n\t" /*
even 8 */ | |
686 "mthi $zero, $ac2 \n\t" | |
687 "preceu.ph.qbr %[p5], %[qload3] \n\t" | |
688 "sb %[st2], 8(%[dst]) \n\t" /*
even 5 */ | |
689 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /*
even 7 */ | |
690 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /*
even 7 */ | |
691 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /*
even 7 */ | |
692 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /*
even 7 */ | |
693 "extp %[Temp1], $ac1, 31 \n\t" /*
even 7 */ | |
694 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
even 6 */ | |
695 | |
696 /* even 8. pixel */ | |
697 "mtlo %[vector_64], $ac3 \n\t" /*
odd 1 */ | |
698 "mthi $zero, $ac3 \n\t" | |
699 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /*
even 8 */ | |
700 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /*
even 8 */ | |
701 "sb %[st3], 10(%[dst]) \n\t" /*
even 6 */ | |
702 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /*
even 8 */ | |
703 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /*
even 8 */ | |
704 "extp %[Temp2], $ac2, 31 \n\t" /*
even 8 */ | |
705 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 7 */ | |
706 | |
707 /* ODD pixels */ | |
708 "ulw %[qload1], 1(%[src]) \n\t" | |
709 "ulw %[qload2], 5(%[src]) \n\t" | |
710 | |
711 /* odd 1. pixel */ | |
712 "mtlo %[vector_64], $ac1 \n\t" /*
odd 2 */ | |
713 "mthi $zero, $ac1 \n\t" | |
714 "preceu.ph.qbr %[p1], %[qload1] \n\t" | |
715 "preceu.ph.qbl %[p2], %[qload1] \n\t" | |
716 "preceu.ph.qbr %[p3], %[qload2] \n\t" | |
717 "preceu.ph.qbl %[p4], %[qload2] \n\t" | |
718 "sb %[st1], 12(%[dst]) \n\t" /*
even 7 */ | |
719 "ulw %[qload3], 9(%[src]) \n\t" | |
720 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /*
odd 1 */ | |
721 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /*
odd 1 */ | |
722 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /*
odd 1 */ | |
723 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /*
odd 1 */ | |
724 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 1 */ | |
725 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 8 */ | |
726 | |
727 /* odd 2. pixel */ | |
728 "mtlo %[vector_64], $ac2 \n\t" /*
odd 3 */ | |
729 "mthi $zero, $ac2 \n\t" | |
730 "preceu.ph.qbr %[p1], %[qload3] \n\t" | |
731 "preceu.ph.qbl %[p5], %[qload3] \n\t" | |
732 "sb %[st2], 14(%[dst]) \n\t" /*
even 8 */ | |
733 "ulw %[qload1], 13(%[src]) \n\t" | |
734 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /*
odd 2 */ | |
735 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /*
odd 2 */ | |
736 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /*
odd 2 */ | |
737 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /*
odd 2 */ | |
738 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 2 */ | |
739 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 1 */ | |
740 | |
741 /* odd 3. pixel */ | |
742 "mtlo %[vector_64], $ac3 \n\t" /*
odd 4 */ | |
743 "mthi $zero, $ac3 \n\t" | |
744 "preceu.ph.qbr %[p2], %[qload1] \n\t" | |
745 "sb %[st3], 1(%[dst]) \n\t" /*
odd 1 */ | |
746 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /*
odd 3 */ | |
747 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /*
odd 3 */ | |
748 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /*
odd 3 */ | |
749 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /*
odd 3 */ | |
750 "extp %[Temp2], $ac2, 31 \n\t" /*
odd 3 */ | |
751 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 2 */ | |
752 | |
753 /* odd 4. pixel */ | |
754 "mtlo %[vector_64], $ac1 \n\t" /*
odd 5 */ | |
755 "mthi $zero, $ac1 \n\t" | |
756 "preceu.ph.qbl %[p3], %[qload1] \n\t" | |
757 "sb %[st1], 3(%[dst]) \n\t" /*
odd 2 */ | |
758 "ulw %[qload2], 17(%[src]) \n\t" | |
759 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /*
odd 4 */ | |
760 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /*
odd 4 */ | |
761 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /*
odd 4 */ | |
762 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /*
odd 4 */ | |
763 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 4 */ | |
764 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
odd 3 */ | |
765 | |
766 /* odd 5. pixel */ | |
767 "mtlo %[vector_64], $ac2 \n\t" /*
odd 6 */ | |
768 "mthi $zero, $ac2 \n\t" | |
769 "preceu.ph.qbr %[p4], %[qload2] \n\t" | |
770 "sb %[st2], 5(%[dst]) \n\t" /*
odd 3 */ | |
771 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /*
odd 5 */ | |
772 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /*
odd 5 */ | |
773 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /*
odd 5 */ | |
774 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /*
odd 5 */ | |
775 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 5 */ | |
776 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 4 */ | |
777 | |
778 /* odd 6. pixel */ | |
779 "mtlo %[vector_64], $ac3 \n\t" /*
odd 7 */ | |
780 "mthi $zero, $ac3 \n\t" | |
781 "preceu.ph.qbl %[p1], %[qload2] \n\t" | |
782 "sb %[st3], 7(%[dst]) \n\t" /*
odd 4 */ | |
783 "ulw %[qload3], 21(%[src]) \n\t" | |
784 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /*
odd 6 */ | |
785 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /*
odd 6 */ | |
786 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /*
odd 6 */ | |
787 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /*
odd 6 */ | |
788 "extp %[Temp2], $ac2, 31 \n\t" /*
odd 6 */ | |
789 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 5 */ | |
790 | |
791 /* odd 7. pixel */ | |
792 "mtlo %[vector_64], $ac1 \n\t" /*
odd 8 */ | |
793 "mthi $zero, $ac1 \n\t" | |
794 "preceu.ph.qbr %[p5], %[qload3] \n\t" | |
795 "sb %[st1], 9(%[dst]) \n\t" /*
odd 5 */ | |
796 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /*
odd 7 */ | |
797 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /*
odd 7 */ | |
798 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /*
odd 7 */ | |
799 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /*
odd 7 */ | |
800 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 7 */ | |
801 | |
802 /* odd 8. pixel */ | |
803 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /*
odd 8 */ | |
804 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /*
odd 8 */ | |
805 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /*
odd 8 */ | |
806 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /*
odd 8 */ | |
807 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 8 */ | |
808 | |
809 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
odd 6 */ | |
810 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 7 */ | |
811 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 8 */ | |
812 | |
813 "sb %[st2], 11(%[dst]) \n\t" /*
odd 6 */ | |
814 "sb %[st3], 13(%[dst]) \n\t" /*
odd 7 */ | |
815 "sb %[st1], 15(%[dst]) \n\t" /*
odd 8 */ | |
816 | |
817 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (ql
oad3), | |
818 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), | |
819 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
820 [p5] "=&r" (p5), | |
821 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) | |
822 : [filter12] "r" (filter12), [filter34] "r" (filter34), | |
823 [filter56] "r" (filter56), [filter78] "r" (filter78), | |
824 [vector_64] "r" (vector_64), | |
825 [cm] "r" (cm), [dst] "r" (dst), | |
826 [src] "r" (src) | |
827 ); | |
828 | |
829 src += 16; | |
830 dst += 16; | |
831 } | |
832 | |
833 /* Next row... */ | |
834 src_ptr += src_stride; | |
835 dst_ptr += dst_stride; | |
836 } | |
837 } | |
838 | |
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  /* Second 32-bit word equal to 0x800000 marks the unit (copy) filter:
     no filtering needed, just copy the block. */
  if (((const int32_t *)filter_x)[1] == 0x800000) {
    vpx_convolve_copy(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                      filter_y, y_step_q4, w, h);
    return;
  }

  /* First 32-bit word equal to zero means only the two middle taps are
     non-zero, so the cheaper 2-tap kernel can be used. */
  if (((const int32_t *)filter_x)[0] == 0) {
    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  /* The DSPr2 kernels only handle the integer-step case; anything else
     falls back to the generic C implementation. */
  if (x_step_q4 != 16) {
    vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                          x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  {
    const uint32_t extract_pos = 38;

    prefetch_load((const uint8_t *)filter_x);
    /* Back up so the 8-tap window is centered on each output pixel. */
    src -= 3;

    /* Set the accumulator extract bit position used by the extp
       instructions inside the assembly kernels. */
    __asm__ __volatile__ (
        "wrdsp %[pos], 1 \n\t"
        :
        : [pos] "r" (extract_pos)
    );

    /* Warm the caches with the first row of input and output. */
    prefetch_load(src);
    prefetch_load(src + 32);
    prefetch_store(dst);

    switch (w) {
      case 4:
        convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
                               (int32_t)dst_stride, filter_x, (int32_t)h);
        break;
      case 8:
        convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
                               (int32_t)dst_stride, filter_x, (int32_t)h);
        break;
      case 16:
        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h, 1);
        break;
      case 32:
        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h, 2);
        break;
      case 64:
        prefetch_load(src + 64);
        prefetch_store(dst + 32);

        convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h);
        break;
      default:
        /* Undo the earlier src -= 3: the C fallback applies its own
           filter offset. */
        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}
921 #endif | |
OLD | NEW |