OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
19 | |
20 #if HAVE_DSPR2 | |
21 static void convolve_bi_horiz_4_dspr2(const uint8_t *src, | |
22 int32_t src_stride, | |
23 uint8_t *dst, | |
24 int32_t dst_stride, | |
25 const int16_t *filter_x0, | |
26 int32_t h) { | |
27 int32_t y; | |
28 uint8_t *cm = vpx_ff_cropTbl; | |
29 int32_t Temp1, Temp2, Temp3, Temp4; | |
30 uint32_t vector4a = 64; | |
31 uint32_t tp1, tp2; | |
32 uint32_t p1, p2; | |
33 const int16_t *filter = &filter_x0[3]; | |
34 uint32_t filter45;; | |
35 | |
36 filter45 = ((const int32_t *)filter)[0]; | |
37 | |
38 for (y = h; y--;) { | |
39 /* prefetch data to cache memory */ | |
40 prefetch_load(src + src_stride); | |
41 prefetch_load(src + src_stride + 32); | |
42 prefetch_store(dst + dst_stride); | |
43 | |
44 __asm__ __volatile__ ( | |
45 "ulw %[tp1], 0(%[src]) \n\t" | |
46 "ulw %[tp2], 4(%[src]) \n\t" | |
47 | |
48 /* even 1. pixel */ | |
49 "mtlo %[vector4a], $ac3 \n\t" | |
50 "mthi $zero, $ac3 \n\t" | |
51 "preceu.ph.qbr %[p1], %[tp1] \n\t" | |
52 "preceu.ph.qbl %[p2], %[tp1] \n\t" | |
53 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" | |
54 "extp %[Temp1], $ac3, 31 \n\t" | |
55 | |
56 /* even 2. pixel */ | |
57 "mtlo %[vector4a], $ac2 \n\t" | |
58 "mthi $zero, $ac2 \n\t" | |
59 "balign %[tp2], %[tp1], 3 \n\t" | |
60 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" | |
61 "extp %[Temp3], $ac2, 31 \n\t" | |
62 | |
63 /* odd 1. pixel */ | |
64 "lbux %[tp1], %[Temp1](%[cm]) \n\t" | |
65 "mtlo %[vector4a], $ac3 \n\t" | |
66 "mthi $zero, $ac3 \n\t" | |
67 "preceu.ph.qbr %[p1], %[tp2] \n\t" | |
68 "preceu.ph.qbl %[p2], %[tp2] \n\t" | |
69 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" | |
70 "extp %[Temp2], $ac3, 31 \n\t" | |
71 | |
72 /* odd 2. pixel */ | |
73 "lbux %[tp2], %[Temp3](%[cm]) \n\t" | |
74 "mtlo %[vector4a], $ac2 \n\t" | |
75 "mthi $zero, $ac2 \n\t" | |
76 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" | |
77 "extp %[Temp4], $ac2, 31 \n\t" | |
78 | |
79 /* clamp */ | |
80 "lbux %[p1], %[Temp2](%[cm]) \n\t" | |
81 "lbux %[p2], %[Temp4](%[cm]) \n\t" | |
82 | |
83 /* store bytes */ | |
84 "sb %[tp1], 0(%[dst]) \n\t" | |
85 "sb %[p1], 1(%[dst]) \n\t" | |
86 "sb %[tp2], 2(%[dst]) \n\t" | |
87 "sb %[p2], 3(%[dst]) \n\t" | |
88 | |
89 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), | |
90 [p1] "=&r" (p1), [p2] "=&r" (p2), | |
91 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), | |
92 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) | |
93 : [filter45] "r" (filter45), [vector4a] "r" (vector4a), | |
94 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) | |
95 ); | |
96 | |
97 /* Next row... */ | |
98 src += src_stride; | |
99 dst += dst_stride; | |
100 } | |
101 } | |
102 | |
103 static void convolve_bi_horiz_8_dspr2(const uint8_t *src, | |
104 int32_t src_stride, | |
105 uint8_t *dst, | |
106 int32_t dst_stride, | |
107 const int16_t *filter_x0, | |
108 int32_t h) { | |
109 int32_t y; | |
110 uint8_t *cm = vpx_ff_cropTbl; | |
111 uint32_t vector4a = 64; | |
112 int32_t Temp1, Temp2, Temp3; | |
113 uint32_t tp1, tp2, tp3; | |
114 uint32_t p1, p2, p3, p4; | |
115 uint32_t st0, st1; | |
116 const int16_t *filter = &filter_x0[3]; | |
117 uint32_t filter45;; | |
118 | |
119 filter45 = ((const int32_t *)filter)[0]; | |
120 | |
121 for (y = h; y--;) { | |
122 /* prefetch data to cache memory */ | |
123 prefetch_load(src + src_stride); | |
124 prefetch_load(src + src_stride + 32); | |
125 prefetch_store(dst + dst_stride); | |
126 | |
127 __asm__ __volatile__ ( | |
128 "ulw %[tp1], 0(%[src]) \n\t" | |
129 "ulw %[tp2], 4(%[src]) \n\t" | |
130 | |
131 /* even 1. pixel */ | |
132 "mtlo %[vector4a], $ac3 \n\t" | |
133 "mthi $zero, $ac3 \n\t" | |
134 "mtlo %[vector4a], $ac2 \n\t" | |
135 "mthi $zero, $ac2 \n\t" | |
136 "preceu.ph.qbr %[p1], %[tp1] \n\t" | |
137 "preceu.ph.qbl %[p2], %[tp1] \n\t" | |
138 "preceu.ph.qbr %[p3], %[tp2] \n\t" | |
139 "preceu.ph.qbl %[p4], %[tp2] \n\t" | |
140 "ulw %[tp3], 8(%[src]) \n\t" | |
141 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" | |
142 "extp %[Temp1], $ac3, 31 \n\t" | |
143 | |
144 /* even 2. pixel */ | |
145 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" | |
146 "extp %[Temp3], $ac2, 31 \n\t" | |
147 | |
148 /* even 3. pixel */ | |
149 "lbux %[st0], %[Temp1](%[cm]) \n\t" | |
150 "mtlo %[vector4a], $ac1 \n\t" | |
151 "mthi $zero, $ac1 \n\t" | |
152 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" | |
153 "extp %[Temp1], $ac1, 31 \n\t" | |
154 | |
155 /* even 4. pixel */ | |
156 "mtlo %[vector4a], $ac2 \n\t" | |
157 "mthi $zero, $ac2 \n\t" | |
158 "mtlo %[vector4a], $ac3 \n\t" | |
159 "mthi $zero, $ac3 \n\t" | |
160 "sb %[st0], 0(%[dst]) \n\t" | |
161 "lbux %[st1], %[Temp3](%[cm]) \n\t" | |
162 | |
163 "balign %[tp3], %[tp2], 3 \n\t" | |
164 "balign %[tp2], %[tp1], 3 \n\t" | |
165 | |
166 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" | |
167 "extp %[Temp3], $ac2, 31 \n\t" | |
168 | |
169 "lbux %[st0], %[Temp1](%[cm]) \n\t" | |
170 | |
171 /* odd 1. pixel */ | |
172 "mtlo %[vector4a], $ac1 \n\t" | |
173 "mthi $zero, $ac1 \n\t" | |
174 "sb %[st1], 2(%[dst]) \n\t" | |
175 "preceu.ph.qbr %[p1], %[tp2] \n\t" | |
176 "preceu.ph.qbl %[p2], %[tp2] \n\t" | |
177 "preceu.ph.qbr %[p3], %[tp3] \n\t" | |
178 "preceu.ph.qbl %[p4], %[tp3] \n\t" | |
179 "sb %[st0], 4(%[dst]) \n\t" | |
180 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" | |
181 "extp %[Temp2], $ac3, 31 \n\t" | |
182 | |
183 /* odd 2. pixel */ | |
184 "mtlo %[vector4a], $ac3 \n\t" | |
185 "mthi $zero, $ac3 \n\t" | |
186 "mtlo %[vector4a], $ac2 \n\t" | |
187 "mthi $zero, $ac2 \n\t" | |
188 "lbux %[st0], %[Temp3](%[cm]) \n\t" | |
189 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" | |
190 "extp %[Temp3], $ac1, 31 \n\t" | |
191 | |
192 /* odd 3. pixel */ | |
193 "lbux %[st1], %[Temp2](%[cm]) \n\t" | |
194 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" | |
195 "extp %[Temp2], $ac3, 31 \n\t" | |
196 | |
197 /* odd 4. pixel */ | |
198 "sb %[st1], 1(%[dst]) \n\t" | |
199 "sb %[st0], 6(%[dst]) \n\t" | |
200 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" | |
201 "extp %[Temp1], $ac2, 31 \n\t" | |
202 | |
203 /* clamp */ | |
204 "lbux %[p4], %[Temp3](%[cm]) \n\t" | |
205 "lbux %[p2], %[Temp2](%[cm]) \n\t" | |
206 "lbux %[p1], %[Temp1](%[cm]) \n\t" | |
207 | |
208 /* store bytes */ | |
209 "sb %[p4], 3(%[dst]) \n\t" | |
210 "sb %[p2], 5(%[dst]) \n\t" | |
211 "sb %[p1], 7(%[dst]) \n\t" | |
212 | |
213 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), | |
214 [st0] "=&r" (st0), [st1] "=&r" (st1), | |
215 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
216 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) | |
217 : [filter45] "r" (filter45), [vector4a] "r" (vector4a), | |
218 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) | |
219 ); | |
220 | |
221 /* Next row... */ | |
222 src += src_stride; | |
223 dst += dst_stride; | |
224 } | |
225 } | |
226 | |
227 static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, | |
228 int32_t src_stride, | |
229 uint8_t *dst_ptr, | |
230 int32_t dst_stride, | |
231 const int16_t *filter_x0, | |
232 int32_t h, | |
233 int32_t count) { | |
234 int32_t y, c; | |
235 const uint8_t *src; | |
236 uint8_t *dst; | |
237 uint8_t *cm = vpx_ff_cropTbl; | |
238 uint32_t vector_64 = 64; | |
239 int32_t Temp1, Temp2, Temp3; | |
240 uint32_t qload1, qload2, qload3; | |
241 uint32_t p1, p2, p3, p4, p5; | |
242 uint32_t st1, st2, st3; | |
243 const int16_t *filter = &filter_x0[3]; | |
244 uint32_t filter45;; | |
245 | |
246 filter45 = ((const int32_t *)filter)[0]; | |
247 | |
248 for (y = h; y--;) { | |
249 src = src_ptr; | |
250 dst = dst_ptr; | |
251 | |
252 /* prefetch data to cache memory */ | |
253 prefetch_load(src_ptr + src_stride); | |
254 prefetch_load(src_ptr + src_stride + 32); | |
255 prefetch_store(dst_ptr + dst_stride); | |
256 | |
257 for (c = 0; c < count; c++) { | |
258 __asm__ __volatile__ ( | |
259 "ulw %[qload1], 0(%[src]) \n\t" | |
260 "ulw %[qload2], 4(%[src]) \n\t" | |
261 | |
262 /* even 1. pixel */ | |
263 "mtlo %[vector_64], $ac1 \n\t" /*
even 1 */ | |
264 "mthi $zero, $ac1 \n\t" | |
265 "mtlo %[vector_64], $ac2 \n\t" /*
even 2 */ | |
266 "mthi $zero, $ac2 \n\t" | |
267 "preceu.ph.qbr %[p1], %[qload1] \n\t" | |
268 "preceu.ph.qbl %[p2], %[qload1] \n\t" | |
269 "preceu.ph.qbr %[p3], %[qload2] \n\t" | |
270 "preceu.ph.qbl %[p4], %[qload2] \n\t" | |
271 "ulw %[qload3], 8(%[src]) \n\t" | |
272 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /*
even 1 */ | |
273 "extp %[Temp1], $ac1, 31 \n\t" /*
even 1 */ | |
274 | |
275 /* even 2. pixel */ | |
276 "mtlo %[vector_64], $ac3 \n\t" /*
even 3 */ | |
277 "mthi $zero, $ac3 \n\t" | |
278 "preceu.ph.qbr %[p1], %[qload3] \n\t" | |
279 "preceu.ph.qbl %[p5], %[qload3] \n\t" | |
280 "ulw %[qload1], 12(%[src]) \n\t" | |
281 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /*
even 1 */ | |
282 "extp %[Temp2], $ac2, 31 \n\t" /*
even 1 */ | |
283 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 1 */ | |
284 | |
285 /* even 3. pixel */ | |
286 "mtlo %[vector_64], $ac1 \n\t" /*
even 4 */ | |
287 "mthi $zero, $ac1 \n\t" | |
288 "preceu.ph.qbr %[p2], %[qload1] \n\t" | |
289 "sb %[st1], 0(%[dst]) \n\t" /*
even 1 */ | |
290 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /*
even 3 */ | |
291 "extp %[Temp3], $ac3, 31 \n\t" /*
even 3 */ | |
292 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 1 */ | |
293 | |
294 /* even 4. pixel */ | |
295 "mtlo %[vector_64], $ac2 \n\t" /*
even 5 */ | |
296 "mthi $zero, $ac2 \n\t" | |
297 "preceu.ph.qbl %[p3], %[qload1] \n\t" | |
298 "sb %[st2], 2(%[dst]) \n\t" /*
even 1 */ | |
299 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /*
even 4 */ | |
300 "extp %[Temp1], $ac1, 31 \n\t" /*
even 4 */ | |
301 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
even 3 */ | |
302 | |
303 /* even 5. pixel */ | |
304 "mtlo %[vector_64], $ac3 \n\t" /*
even 6 */ | |
305 "mthi $zero, $ac3 \n\t" | |
306 "sb %[st3], 4(%[dst]) \n\t" /*
even 3 */ | |
307 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /*
even 5 */ | |
308 "extp %[Temp2], $ac2, 31 \n\t" /*
even 5 */ | |
309 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 4 */ | |
310 | |
311 /* even 6. pixel */ | |
312 "mtlo %[vector_64], $ac1 \n\t" /*
even 7 */ | |
313 "mthi $zero, $ac1 \n\t" | |
314 "sb %[st1], 6(%[dst]) \n\t" /*
even 4 */ | |
315 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /*
even 6 */ | |
316 "extp %[Temp3], $ac3, 31 \n\t" /*
even 6 */ | |
317 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 5 */ | |
318 | |
319 /* even 7. pixel */ | |
320 "mtlo %[vector_64], $ac2 \n\t" /*
even 8 */ | |
321 "mthi $zero, $ac2 \n\t" | |
322 "sb %[st2], 8(%[dst]) \n\t" /*
even 5 */ | |
323 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /*
even 7 */ | |
324 "extp %[Temp1], $ac1, 31 \n\t" /*
even 7 */ | |
325 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
even 6 */ | |
326 | |
327 /* even 8. pixel */ | |
328 "mtlo %[vector_64], $ac3 \n\t" /*
odd 1 */ | |
329 "mthi $zero, $ac3 \n\t" | |
330 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /*
even 8 */ | |
331 "sb %[st3], 10(%[dst]) \n\t" /*
even 6 */ | |
332 "extp %[Temp2], $ac2, 31 \n\t" /*
even 8 */ | |
333 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
even 7 */ | |
334 | |
335 /* ODD pixels */ | |
336 "ulw %[qload1], 1(%[src]) \n\t" | |
337 "ulw %[qload2], 5(%[src]) \n\t" | |
338 | |
339 /* odd 1. pixel */ | |
340 "mtlo %[vector_64], $ac1 \n\t" /*
odd 2 */ | |
341 "mthi $zero, $ac1 \n\t" | |
342 "preceu.ph.qbr %[p1], %[qload1] \n\t" | |
343 "preceu.ph.qbl %[p2], %[qload1] \n\t" | |
344 "preceu.ph.qbr %[p3], %[qload2] \n\t" | |
345 "preceu.ph.qbl %[p4], %[qload2] \n\t" | |
346 "sb %[st1], 12(%[dst]) \n\t" /*
even 7 */ | |
347 "ulw %[qload3], 9(%[src]) \n\t" | |
348 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /*
odd 1 */ | |
349 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 1 */ | |
350 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
even 8 */ | |
351 | |
352 /* odd 2. pixel */ | |
353 "mtlo %[vector_64], $ac2 \n\t" /*
odd 3 */ | |
354 "mthi $zero, $ac2 \n\t" | |
355 "preceu.ph.qbr %[p1], %[qload3] \n\t" | |
356 "preceu.ph.qbl %[p5], %[qload3] \n\t" | |
357 "sb %[st2], 14(%[dst]) \n\t" /*
even 8 */ | |
358 "ulw %[qload1], 13(%[src]) \n\t" | |
359 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /*
odd 2 */ | |
360 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 2 */ | |
361 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 1 */ | |
362 | |
363 /* odd 3. pixel */ | |
364 "mtlo %[vector_64], $ac3 \n\t" /*
odd 4 */ | |
365 "mthi $zero, $ac3 \n\t" | |
366 "preceu.ph.qbr %[p2], %[qload1] \n\t" | |
367 "sb %[st3], 1(%[dst]) \n\t" /*
odd 1 */ | |
368 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /*
odd 3 */ | |
369 "extp %[Temp2], $ac2, 31 \n\t" /*
odd 3 */ | |
370 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 2 */ | |
371 | |
372 /* odd 4. pixel */ | |
373 "mtlo %[vector_64], $ac1 \n\t" /*
odd 5 */ | |
374 "mthi $zero, $ac1 \n\t" | |
375 "preceu.ph.qbl %[p3], %[qload1] \n\t" | |
376 "sb %[st1], 3(%[dst]) \n\t" /*
odd 2 */ | |
377 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /*
odd 4 */ | |
378 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 4 */ | |
379 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
odd 3 */ | |
380 | |
381 /* odd 5. pixel */ | |
382 "mtlo %[vector_64], $ac2 \n\t" /*
odd 6 */ | |
383 "mthi $zero, $ac2 \n\t" | |
384 "sb %[st2], 5(%[dst]) \n\t" /*
odd 3 */ | |
385 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /*
odd 5 */ | |
386 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 5 */ | |
387 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 4 */ | |
388 | |
389 /* odd 6. pixel */ | |
390 "mtlo %[vector_64], $ac3 \n\t" /*
odd 7 */ | |
391 "mthi $zero, $ac3 \n\t" | |
392 "sb %[st3], 7(%[dst]) \n\t" /*
odd 4 */ | |
393 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /*
odd 6 */ | |
394 "extp %[Temp2], $ac2, 31 \n\t" /*
odd 6 */ | |
395 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 5 */ | |
396 | |
397 /* odd 7. pixel */ | |
398 "mtlo %[vector_64], $ac1 \n\t" /*
odd 8 */ | |
399 "mthi $zero, $ac1 \n\t" | |
400 "sb %[st1], 9(%[dst]) \n\t" /*
odd 5 */ | |
401 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /*
odd 7 */ | |
402 "extp %[Temp3], $ac3, 31 \n\t" /*
odd 7 */ | |
403 | |
404 /* odd 8. pixel */ | |
405 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /*
odd 8 */ | |
406 "extp %[Temp1], $ac1, 31 \n\t" /*
odd 8 */ | |
407 | |
408 "lbux %[st2], %[Temp2](%[cm]) \n\t" /*
odd 6 */ | |
409 "lbux %[st3], %[Temp3](%[cm]) \n\t" /*
odd 7 */ | |
410 "lbux %[st1], %[Temp1](%[cm]) \n\t" /*
odd 8 */ | |
411 | |
412 "sb %[st2], 11(%[dst]) \n\t" /*
odd 6 */ | |
413 "sb %[st3], 13(%[dst]) \n\t" /*
odd 7 */ | |
414 "sb %[st1], 15(%[dst]) \n\t" /*
odd 8 */ | |
415 | |
416 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (ql
oad3), | |
417 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), | |
418 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
419 [p5] "=&r" (p5), | |
420 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) | |
421 : [filter45] "r" (filter45), [vector_64] "r" (vector_64), | |
422 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) | |
423 ); | |
424 | |
425 src += 16; | |
426 dst += 16; | |
427 } | |
428 | |
429 /* Next row... */ | |
430 src_ptr += src_stride; | |
431 dst_ptr += dst_stride; | |
432 } | |
433 } | |
434 | |
/*
 * 2-tap horizontal convolution of a 64-pixel-wide block, `h` rows.
 *
 * The per-16-pixel arithmetic is exactly that of
 * convolve_bi_horiz_16_dspr2(), so each row is delegated to it with
 * h = 1 and count = 4 rather than carrying a second copy of the same
 * assembly.  The only things specific to the 64-wide case are the two extra
 * prefetch hints issued here; the remaining three per-row hints
 * (src + stride, src + stride + 32, dst + stride) are issued by the 16-wide
 * kernel itself.
 *
 * NOTE(review): delegation is per row so that every prefetch hint of the
 * original is still issued once per row; this relies on the compiler
 * inlining the static 16-wide kernel to avoid a call per row -- confirm with
 * a benchmark if this path is hot.
 *
 * src_ptr/src_stride  source pixels and distance in bytes between rows.
 * dst_ptr/dst_stride  destination pixels and distance in bytes between rows.
 * filter_x0           filter kernel, forwarded unchanged.
 * h                   number of rows to process.
 */
static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       uint8_t *dst_ptr,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0,
                                       int32_t h) {
  int32_t y;

  for (y = h; y--;) {
    /* prefetch the parts of the next row the 16-wide kernel does not cover */
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride + 32);

    /* one row, four 16-pixel groups */
    convolve_bi_horiz_16_dspr2(src_ptr, src_stride,
                               dst_ptr, dst_stride,
                               filter_x0, 1, 4);

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
643 | |
/*
 * Horizontal 2-tap convolution entry point for MIPS DSPr2.
 *
 * When x_step_q4 is 16 and w is one of 4, 8, 16, 32 or 64, the block is
 * handled by the width-specific DSPr2 kernels in this file.  Every other
 * combination is forwarded unchanged to vpx_convolve8_horiz_c().
 *
 * src/src_stride      source pixels and row pitch.
 * dst/dst_stride      destination pixels and row pitch.
 * filter_x/x_step_q4  horizontal kernel and step.
 * filter_y/y_step_q4  not used by the DSPr2 kernels; only passed through to
 *                     the C fallback.
 * w, h                block width and height in pixels.
 */
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  const int32_t src_pitch = (int32_t)src_stride;
  const int32_t dst_pitch = (int32_t)dst_stride;
  const int32_t rows = (int32_t)h;
  uint32_t extract_pos = 38;

  /* Only the step-16 case has a DSPr2 kernel. */
  if (x_step_q4 != 16) {
    vpx_convolve8_horiz_c(src, src_stride,
                          dst, dst_stride,
                          filter_x, x_step_q4,
                          filter_y, y_step_q4,
                          w, h);
    return;
  }

  prefetch_load((const uint8_t *)filter_x);

  /* bit position for extract from acc; the kernels' extp instructions
   * depend on this DSPControl field */
  __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (extract_pos)
  );

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_dspr2(src, src_pitch, dst, dst_pitch,
                                filter_x, rows);
      break;
    case 8:
      convolve_bi_horiz_8_dspr2(src, src_pitch, dst, dst_pitch,
                                filter_x, rows);
      break;
    case 16:
    case 32:
      /* w / 16 groups of 16 pixels per row: 1 for w == 16, 2 for w == 32 */
      convolve_bi_horiz_16_dspr2(src, src_pitch, dst, dst_pitch,
                                 filter_x, rows, w >> 4);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_horiz_64_dspr2(src, src_pitch, dst, dst_pitch,
                                 filter_x, rows);
      break;
    default:
      /* unsupported width: use the generic C implementation */
      vpx_convolve8_horiz_c(src, src_stride,
                            dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
      break;
  }
}
711 #endif | |
OLD | NEW |