OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 #include <assert.h> | |
12 #include <stdio.h> | |
13 | |
14 #include "./vpx_dsp_rtcd.h" | |
15 #include "vpx_dsp/mips/vpx_common_dspr2.h" | |
16 #include "vpx_dsp/vpx_dsp_common.h" | |
17 #include "vpx_dsp/vpx_filter.h" | |
18 #include "vpx_ports/mem.h" | |
19 | |
20 #if HAVE_DSPR2 | |
21 static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src, | |
22 int32_t src_stride, | |
23 uint8_t *dst, | |
24 int32_t dst_stride, | |
25 const int16_t *filter_x0, | |
26 int32_t h) { | |
27 int32_t y; | |
28 uint8_t *cm = vpx_ff_cropTbl; | |
29 uint8_t *dst_ptr; | |
30 int32_t Temp1, Temp2; | |
31 uint32_t vector4a = 64; | |
32 uint32_t tp1, tp2; | |
33 uint32_t p1, p2; | |
34 const int16_t *filter = &filter_x0[3]; | |
35 uint32_t filter45; | |
36 | |
37 filter45 = ((const int32_t *)filter)[0]; | |
38 | |
39 for (y = h; y--;) { | |
40 dst_ptr = dst; | |
41 /* prefetch data to cache memory */ | |
42 prefetch_load(src + src_stride); | |
43 prefetch_load(src + src_stride + 32); | |
44 | |
45 __asm__ __volatile__ ( | |
46 "ulw %[tp1], 0(%[src]) \n\t" | |
47 "ulw %[tp2], 4(%[src]) \n\t" | |
48 | |
49 /* even 1. pixel */ | |
50 "mtlo %[vector4a], $ac3 \n\t" | |
51 "mthi $zero, $ac3 \n\t" | |
52 "preceu.ph.qbr %[p1], %[tp1] \n\t" | |
53 "preceu.ph.qbl %[p2], %[tp1] \n\t" | |
54 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" | |
55 "extp %[Temp1], $ac3, 31 \n\t" | |
56 | |
57 /* even 2. pixel */ | |
58 "mtlo %[vector4a], $ac2 \n\t" | |
59 "mthi $zero, $ac2 \n\t" | |
60 "balign %[tp2], %[tp1], 3 \n\t" | |
61 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" | |
62 "extp %[Temp2], $ac2, 31 \n\t" | |
63 | |
64 /* odd 1. pixel */ | |
65 "lbux %[tp1], %[Temp1](%[cm]) \n\t" | |
66 "mtlo %[vector4a], $ac3 \n\t" | |
67 "mthi $zero, $ac3 \n\t" | |
68 "preceu.ph.qbr %[p1], %[tp2] \n\t" | |
69 "preceu.ph.qbl %[p2], %[tp2] \n\t" | |
70 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" | |
71 "extp %[Temp1], $ac3, 31 \n\t" | |
72 | |
73 /* odd 2. pixel */ | |
74 "lbux %[tp2], %[Temp2](%[cm]) \n\t" | |
75 "mtlo %[vector4a], $ac2 \n\t" | |
76 "mthi $zero, $ac2 \n\t" | |
77 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" | |
78 "extp %[Temp2], $ac2, 31 \n\t" | |
79 | |
80 /* clamp */ | |
81 "lbux %[p1], %[Temp1](%[cm]) \n\t" | |
82 "lbux %[p2], %[Temp2](%[cm]) \n\t" | |
83 | |
84 /* store bytes */ | |
85 "sb %[tp1], 0(%[dst_ptr]) \n\t" | |
86 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" | |
87 | |
88 "sb %[p1], 0(%[dst_ptr]) \n\t" | |
89 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" | |
90 | |
91 "sb %[tp2], 0(%[dst_ptr]) \n\t" | |
92 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" | |
93 | |
94 "sb %[p2], 0(%[dst_ptr]) \n\t" | |
95 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" | |
96 | |
97 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), | |
98 [p1] "=&r" (p1), [p2] "=&r" (p2), | |
99 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), | |
100 [dst_ptr] "+r" (dst_ptr) | |
101 : [filter45] "r" (filter45),[vector4a] "r" (vector4a), | |
102 [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride) | |
103 ); | |
104 | |
105 /* Next row... */ | |
106 src += src_stride; | |
107 dst += 1; | |
108 } | |
109 } | |
110 | |
111 static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, | |
112 int32_t src_stride, | |
113 uint8_t *dst, | |
114 int32_t dst_stride, | |
115 const int16_t *filter_x0, | |
116 int32_t h) { | |
117 int32_t y; | |
118 uint8_t *cm = vpx_ff_cropTbl; | |
119 uint8_t *dst_ptr; | |
120 uint32_t vector4a = 64; | |
121 int32_t Temp1, Temp2, Temp3; | |
122 uint32_t tp1, tp2, tp3; | |
123 uint32_t p1, p2, p3, p4; | |
124 uint8_t *odd_dst; | |
125 uint32_t dst_pitch_2 = (dst_stride << 1); | |
126 const int16_t *filter = &filter_x0[3]; | |
127 uint32_t filter45; | |
128 | |
129 filter45 = ((const int32_t *)filter)[0]; | |
130 | |
131 for (y = h; y--;) { | |
132 /* prefetch data to cache memory */ | |
133 prefetch_load(src + src_stride); | |
134 prefetch_load(src + src_stride + 32); | |
135 | |
136 dst_ptr = dst; | |
137 odd_dst = (dst_ptr + dst_stride); | |
138 | |
139 __asm__ __volatile__ ( | |
140 "ulw %[tp1], 0(%[src]) \n\t" | |
141 "ulw %[tp2], 4(%[src]) \n\t" | |
142 | |
143 /* even 1. pixel */ | |
144 "mtlo %[vector4a], $ac3 \n\t" | |
145 "mthi $zero, $ac3 \n\t" | |
146 "mtlo %[vector4a], $ac2 \n\t" | |
147 "mthi $zero, $ac2 \n\t" | |
148 "preceu.ph.qbr %[p1], %[tp1] \n\t" | |
149 "preceu.ph.qbl %[p2], %[tp1] \n\t" | |
150 "preceu.ph.qbr %[p3], %[tp2] \n\t" | |
151 "preceu.ph.qbl %[p4], %[tp2] \n\t" | |
152 "ulw %[tp3], 8(%[src]) \n\t" | |
153 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" | |
154 "extp %[Temp1], $ac3, 31 \n\t" | |
155 | |
156 /* even 2. pixel */ | |
157 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" | |
158 "extp %[Temp3], $ac2, 31 \n\t" | |
159 | |
160 /* even 3. pixel */ | |
161 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
162 "mtlo %[vector4a], $ac1 \n\t" | |
163 "mthi $zero, $ac1 \n\t" | |
164 "balign %[tp3], %[tp2], 3 \n\t" | |
165 "balign %[tp2], %[tp1], 3 \n\t" | |
166 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" | |
167 "lbux %[tp1], %[Temp3](%[cm]) \n\t" | |
168 "extp %[p3], $ac1, 31 \n\t" | |
169 | |
170 /* even 4. pixel */ | |
171 "mtlo %[vector4a], $ac2 \n\t" | |
172 "mthi $zero, $ac2 \n\t" | |
173 "mtlo %[vector4a], $ac3 \n\t" | |
174 "mthi $zero, $ac3 \n\t" | |
175 "sb %[Temp2], 0(%[dst_ptr]) \n\t" | |
176 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" | |
177 "sb %[tp1], 0(%[dst_ptr]) \n\t" | |
178 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" | |
179 | |
180 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" | |
181 "extp %[Temp3], $ac2, 31 \n\t" | |
182 | |
183 "lbux %[Temp1], %[p3](%[cm]) \n\t
" | |
184 | |
185 /* odd 1. pixel */ | |
186 "mtlo %[vector4a], $ac1 \n\t" | |
187 "mthi $zero, $ac1 \n\t" | |
188 "preceu.ph.qbr %[p1], %[tp2] \n\t" | |
189 "preceu.ph.qbl %[p2], %[tp2] \n\t" | |
190 "preceu.ph.qbr %[p3], %[tp3] \n\t" | |
191 "preceu.ph.qbl %[p4], %[tp3] \n\t" | |
192 "sb %[Temp1], 0(%[dst_ptr]) \n\t" | |
193 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" | |
194 | |
195 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" | |
196 "extp %[Temp2], $ac3, 31 \n\t" | |
197 | |
198 /* odd 2. pixel */ | |
199 "lbux %[tp1], %[Temp3](%[cm]) \n\t" | |
200 "mtlo %[vector4a], $ac3 \n\t" | |
201 "mthi $zero, $ac3 \n\t" | |
202 "mtlo %[vector4a], $ac2 \n\t" | |
203 "mthi $zero, $ac2 \n\t" | |
204 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" | |
205 "sb %[tp1], 0(%[dst_ptr]) \n\t" | |
206 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" | |
207 "extp %[Temp3], $ac1, 31 \n\t" | |
208 | |
209 /* odd 3. pixel */ | |
210 "lbux %[tp3], %[Temp2](%[cm]) \n\t" | |
211 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" | |
212 "extp %[Temp2], $ac3, 31 \n\t" | |
213 | |
214 /* odd 4. pixel */ | |
215 "sb %[tp3], 0(%[odd_dst]) \n\t" | |
216 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" | |
217 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" | |
218 "extp %[Temp1], $ac2, 31 \n\t" | |
219 | |
220 /* clamp */ | |
221 "lbux %[p4], %[Temp3](%[cm]) \n\t" | |
222 "lbux %[p2], %[Temp2](%[cm]) \n\t" | |
223 "lbux %[p1], %[Temp1](%[cm]) \n\t" | |
224 | |
225 /* store bytes */ | |
226 "sb %[p4], 0(%[odd_dst]) \n\t" | |
227 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" | |
228 | |
229 "sb %[p2], 0(%[odd_dst]) \n\t" | |
230 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" | |
231 | |
232 "sb %[p1], 0(%[odd_dst]) \n\t" | |
233 | |
234 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), | |
235 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
236 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
237 [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) | |
238 : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm), | |
239 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) | |
240 ); | |
241 | |
242 /* Next row... */ | |
243 src += src_stride; | |
244 dst += 1; | |
245 } | |
246 } | |
247 | |
248 static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr, | |
249 int32_t src_stride, | |
250 uint8_t *dst_ptr, | |
251 int32_t dst_stride, | |
252 const int16_t *filter_x0, | |
253 int32_t h, | |
254 int32_t count) { | |
255 int32_t c, y; | |
256 const uint8_t *src; | |
257 uint8_t *dst; | |
258 uint8_t *cm = vpx_ff_cropTbl; | |
259 uint32_t vector_64 = 64; | |
260 int32_t Temp1, Temp2, Temp3; | |
261 uint32_t qload1, qload2; | |
262 uint32_t p1, p2, p3, p4, p5; | |
263 uint32_t st1, st2, st3; | |
264 uint32_t dst_pitch_2 = (dst_stride << 1); | |
265 uint8_t *odd_dst; | |
266 const int16_t *filter = &filter_x0[3]; | |
267 uint32_t filter45; | |
268 | |
269 filter45 = ((const int32_t *)filter)[0]; | |
270 | |
271 for (y = h; y--;) { | |
272 /* prefetch data to cache memory */ | |
273 prefetch_load(src_ptr + src_stride); | |
274 prefetch_load(src_ptr + src_stride + 32); | |
275 | |
276 src = src_ptr; | |
277 dst = dst_ptr; | |
278 | |
279 odd_dst = (dst + dst_stride); | |
280 | |
281 for (c = 0; c < count; c++) { | |
282 __asm__ __volatile__ ( | |
283 "ulw %[qload1], 0(%[src]) \n
\t" | |
284 "ulw %[qload2], 4(%[src]) \n
\t" | |
285 | |
286 /* even 1. pixel */ | |
287 "mtlo %[vector_64], $ac1 \n
\t" /* even 1 */ | |
288 "mthi $zero, $ac1 \n
\t" | |
289 "mtlo %[vector_64], $ac2 \n
\t" /* even 2 */ | |
290 "mthi $zero, $ac2 \n
\t" | |
291 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
292 "preceu.ph.qbl %[p2], %[qload1] \n
\t" | |
293 "preceu.ph.qbr %[p3], %[qload2] \n
\t" | |
294 "preceu.ph.qbl %[p4], %[qload2] \n
\t" | |
295 "ulw %[qload1], 8(%[src]) \n
\t" | |
296 "dpa.w.ph $ac1, %[p1], %[filter45] \n
\t" /* even 1 */ | |
297 "extp %[Temp1], $ac1, 31 \n
\t" /* even 1 */ | |
298 | |
299 /* even 2. pixel */ | |
300 "mtlo %[vector_64], $ac3 \n
\t" /* even 3 */ | |
301 "mthi $zero, $ac3 \n
\t" | |
302 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
303 "preceu.ph.qbl %[p5], %[qload1] \n
\t" | |
304 "ulw %[qload2], 12(%[src]) \n
\t" | |
305 "dpa.w.ph $ac2, %[p2], %[filter45] \n
\t" /* even 1 */ | |
306 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 1 */ | |
307 "extp %[Temp2], $ac2, 31 \n
\t" /* even 1 */ | |
308 | |
309 /* even 3. pixel */ | |
310 "mtlo %[vector_64], $ac1 \n
\t" /* even 4 */ | |
311 "mthi $zero, $ac1 \n
\t" | |
312 "preceu.ph.qbr %[p2], %[qload2] \n
\t" | |
313 "sb %[st1], 0(%[dst]) \n
\t" /* even 1 */ | |
314 "addu %[dst], %[dst], %[dst_pitch_2]
\n\t" | |
315 "dpa.w.ph $ac3, %[p3], %[filter45] \n
\t" /* even 3 */ | |
316 "extp %[Temp3], $ac3, 31 \n
\t" /* even 3 */ | |
317 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 1 */ | |
318 | |
319 /* even 4. pixel */ | |
320 "mtlo %[vector_64], $ac2 \n
\t" /* even 5 */ | |
321 "mthi $zero, $ac2 \n
\t" | |
322 "preceu.ph.qbl %[p3], %[qload2] \n
\t" | |
323 "sb %[st2], 0(%[dst]) \n
\t" /* even 2 */ | |
324 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
325 "dpa.w.ph $ac1, %[p4], %[filter45] \n
\t" /* even 4 */ | |
326 "extp %[Temp1], $ac1, 31 \n
\t" /* even 4 */ | |
327 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 3 */ | |
328 | |
329 /* even 5. pixel */ | |
330 "mtlo %[vector_64], $ac3 \n
\t" /* even 6 */ | |
331 "mthi $zero, $ac3 \n
\t" | |
332 "sb %[st3], 0(%[dst]) \n
\t" /* even 3 */ | |
333 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
334 "dpa.w.ph $ac2, %[p1], %[filter45] \n
\t" /* even 5 */ | |
335 "extp %[Temp2], $ac2, 31 \n
\t" /* even 5 */ | |
336 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 4 */ | |
337 | |
338 /* even 6. pixel */ | |
339 "mtlo %[vector_64], $ac1 \n
\t" /* even 7 */ | |
340 "mthi $zero, $ac1 \n
\t" | |
341 "sb %[st1], 0(%[dst]) \n
\t" /* even 4 */ | |
342 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
343 "ulw %[qload1], 20(%[src]) \n
\t" | |
344 "dpa.w.ph $ac3, %[p5], %[filter45] \n
\t" /* even 6 */ | |
345 "extp %[Temp3], $ac3, 31 \n
\t" /* even 6 */ | |
346 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 5 */ | |
347 | |
348 /* even 7. pixel */ | |
349 "mtlo %[vector_64], $ac2 \n
\t" /* even 8 */ | |
350 "mthi $zero, $ac2 \n
\t" | |
351 "preceu.ph.qbr %[p5], %[qload1] \n
\t" | |
352 "sb %[st2], 0(%[dst]) \n
\t" /* even 5 */ | |
353 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
354 "dpa.w.ph $ac1, %[p2], %[filter45] \n
\t" /* even 7 */ | |
355 "extp %[Temp1], $ac1, 31 \n
\t" /* even 7 */ | |
356 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 6 */ | |
357 | |
358 /* even 8. pixel */ | |
359 "mtlo %[vector_64], $ac3 \n
\t" /* odd 1 */ | |
360 "mthi $zero, $ac3 \n
\t" | |
361 "dpa.w.ph $ac2, %[p3], %[filter45] \n
\t" /* even 8 */ | |
362 "sb %[st3], 0(%[dst]) \n
\t" /* even 6 */ | |
363 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
364 "extp %[Temp2], $ac2, 31 \n
\t" /* even 8 */ | |
365 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 7 */ | |
366 | |
367 /* ODD pixels */ | |
368 "ulw %[qload1], 1(%[src]) \n
\t" | |
369 "ulw %[qload2], 5(%[src]) \n
\t" | |
370 | |
371 /* odd 1. pixel */ | |
372 "mtlo %[vector_64], $ac1 \n
\t" /* odd 2 */ | |
373 "mthi $zero, $ac1 \n
\t" | |
374 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
375 "preceu.ph.qbl %[p2], %[qload1] \n
\t" | |
376 "preceu.ph.qbr %[p3], %[qload2] \n
\t" | |
377 "preceu.ph.qbl %[p4], %[qload2] \n
\t" | |
378 "sb %[st1], 0(%[dst]) \n
\t" /* even 7 */ | |
379 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
380 "ulw %[qload2], 9(%[src]) \n
\t" | |
381 "dpa.w.ph $ac3, %[p1], %[filter45] \n
\t" /* odd 1 */ | |
382 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 1 */ | |
383 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 8 */ | |
384 | |
385 /* odd 2. pixel */ | |
386 "mtlo %[vector_64], $ac2 \n
\t" /* odd 3 */ | |
387 "mthi $zero, $ac2 \n
\t" | |
388 "preceu.ph.qbr %[p1], %[qload2] \n
\t" | |
389 "preceu.ph.qbl %[p5], %[qload2] \n
\t" | |
390 "sb %[st2], 0(%[dst]) \n
\t" /* even 8 */ | |
391 "ulw %[qload1], 13(%[src]) \n
\t" | |
392 "dpa.w.ph $ac1, %[p2], %[filter45] \n
\t" /* odd 2 */ | |
393 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 2 */ | |
394 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 1 */ | |
395 | |
396 /* odd 3. pixel */ | |
397 "mtlo %[vector_64], $ac3 \n
\t" /* odd 4 */ | |
398 "mthi $zero, $ac3 \n
\t" | |
399 "preceu.ph.qbr %[p2], %[qload1] \n
\t" | |
400 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 1 */ | |
401 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
402 "dpa.w.ph $ac2, %[p3], %[filter45] \n
\t" /* odd 3 */ | |
403 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 3 */ | |
404 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 2 */ | |
405 | |
406 /* odd 4. pixel */ | |
407 "mtlo %[vector_64], $ac1 \n
\t" /* odd 5 */ | |
408 "mthi $zero, $ac1 \n
\t" | |
409 "preceu.ph.qbl %[p3], %[qload1] \n
\t" | |
410 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 2 */ | |
411 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
412 "dpa.w.ph $ac3, %[p4], %[filter45] \n
\t" /* odd 4 */ | |
413 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 4 */ | |
414 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 3 */ | |
415 | |
416 /* odd 5. pixel */ | |
417 "mtlo %[vector_64], $ac2 \n
\t" /* odd 6 */ | |
418 "mthi $zero, $ac2 \n
\t" | |
419 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 3 */ | |
420 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
421 "dpa.w.ph $ac1, %[p1], %[filter45] \n
\t" /* odd 5 */ | |
422 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 5 */ | |
423 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 4 */ | |
424 | |
425 /* odd 6. pixel */ | |
426 "mtlo %[vector_64], $ac3 \n
\t" /* odd 7 */ | |
427 "mthi $zero, $ac3 \n
\t" | |
428 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 4 */ | |
429 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
430 "ulw %[qload1], 21(%[src]) \n
\t" | |
431 "dpa.w.ph $ac2, %[p5], %[filter45] \n
\t" /* odd 6 */ | |
432 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 6 */ | |
433 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 5 */ | |
434 | |
435 /* odd 7. pixel */ | |
436 "mtlo %[vector_64], $ac1 \n
\t" /* odd 8 */ | |
437 "mthi $zero, $ac1 \n
\t" | |
438 "preceu.ph.qbr %[p5], %[qload1] \n
\t" | |
439 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 5 */ | |
440 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
441 "dpa.w.ph $ac3, %[p2], %[filter45] \n
\t" /* odd 7 */ | |
442 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 7 */ | |
443 | |
444 /* odd 8. pixel */ | |
445 "dpa.w.ph $ac1, %[p3], %[filter45] \n
\t" /* odd 8 */ | |
446 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 8 */ | |
447 | |
448 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 6 */ | |
449 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 7 */ | |
450 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 8 */ | |
451 | |
452 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 6 */ | |
453 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
454 | |
455 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 7 */ | |
456 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
457 | |
458 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 8 */ | |
459 | |
460 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), | |
461 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), | |
462 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
463 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
464 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) | |
465 : [filter45] "r" (filter45), [vector_64] "r" (vector_64), | |
466 [cm] "r" (cm), | |
467 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) | |
468 ); | |
469 | |
470 src += 16; | |
471 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); | |
472 odd_dst = (dst + dst_stride); | |
473 } | |
474 | |
475 /* Next row... */ | |
476 src_ptr += src_stride; | |
477 dst_ptr += 1; | |
478 } | |
479 } | |
480 | |
481 static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr, | |
482 int32_t src_stride, | |
483 uint8_t *dst_ptr, | |
484 int32_t dst_stride, | |
485 const int16_t *filter_x0, | |
486 int32_t h) { | |
487 int32_t c, y; | |
488 const uint8_t *src; | |
489 uint8_t *dst; | |
490 uint8_t *cm = vpx_ff_cropTbl; | |
491 uint32_t vector_64 = 64; | |
492 int32_t Temp1, Temp2, Temp3; | |
493 uint32_t qload1, qload2; | |
494 uint32_t p1, p2, p3, p4, p5; | |
495 uint32_t st1, st2, st3; | |
496 uint32_t dst_pitch_2 = (dst_stride << 1); | |
497 uint8_t *odd_dst; | |
498 const int16_t *filter = &filter_x0[3]; | |
499 uint32_t filter45; | |
500 | |
501 filter45 = ((const int32_t *)filter)[0]; | |
502 | |
503 for (y = h; y--;) { | |
504 /* prefetch data to cache memory */ | |
505 prefetch_load(src_ptr + src_stride); | |
506 prefetch_load(src_ptr + src_stride + 32); | |
507 prefetch_load(src_ptr + src_stride + 64); | |
508 | |
509 src = src_ptr; | |
510 dst = dst_ptr; | |
511 | |
512 odd_dst = (dst + dst_stride); | |
513 | |
514 for (c = 0; c < 4; c++) { | |
515 __asm__ __volatile__ ( | |
516 "ulw %[qload1], 0(%[src]) \n
\t" | |
517 "ulw %[qload2], 4(%[src]) \n
\t" | |
518 | |
519 /* even 1. pixel */ | |
520 "mtlo %[vector_64], $ac1 \n
\t" /* even 1 */ | |
521 "mthi $zero, $ac1 \n
\t" | |
522 "mtlo %[vector_64], $ac2 \n
\t" /* even 2 */ | |
523 "mthi $zero, $ac2 \n
\t" | |
524 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
525 "preceu.ph.qbl %[p2], %[qload1] \n
\t" | |
526 "preceu.ph.qbr %[p3], %[qload2] \n
\t" | |
527 "preceu.ph.qbl %[p4], %[qload2] \n
\t" | |
528 "ulw %[qload1], 8(%[src]) \n
\t" | |
529 "dpa.w.ph $ac1, %[p1], %[filter45] \n
\t" /* even 1 */ | |
530 "extp %[Temp1], $ac1, 31 \n
\t" /* even 1 */ | |
531 | |
532 /* even 2. pixel */ | |
533 "mtlo %[vector_64], $ac3 \n
\t" /* even 3 */ | |
534 "mthi $zero, $ac3 \n
\t" | |
535 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
536 "preceu.ph.qbl %[p5], %[qload1] \n
\t" | |
537 "ulw %[qload2], 12(%[src]) \n
\t" | |
538 "dpa.w.ph $ac2, %[p2], %[filter45] \n
\t" /* even 1 */ | |
539 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 1 */ | |
540 "extp %[Temp2], $ac2, 31 \n
\t" /* even 1 */ | |
541 | |
542 /* even 3. pixel */ | |
543 "mtlo %[vector_64], $ac1 \n
\t" /* even 4 */ | |
544 "mthi $zero, $ac1 \n
\t" | |
545 "preceu.ph.qbr %[p2], %[qload2] \n
\t" | |
546 "sb %[st1], 0(%[dst]) \n
\t" /* even 1 */ | |
547 "addu %[dst], %[dst], %[dst_pitch_2]
\n\t" | |
548 "dpa.w.ph $ac3, %[p3], %[filter45] \n
\t" /* even 3 */ | |
549 "extp %[Temp3], $ac3, 31 \n
\t" /* even 3 */ | |
550 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 1 */ | |
551 | |
552 /* even 4. pixel */ | |
553 "mtlo %[vector_64], $ac2 \n
\t" /* even 5 */ | |
554 "mthi $zero, $ac2 \n
\t" | |
555 "preceu.ph.qbl %[p3], %[qload2] \n
\t" | |
556 "sb %[st2], 0(%[dst]) \n
\t" /* even 2 */ | |
557 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
558 "dpa.w.ph $ac1, %[p4], %[filter45] \n
\t" /* even 4 */ | |
559 "extp %[Temp1], $ac1, 31 \n
\t" /* even 4 */ | |
560 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 3 */ | |
561 | |
562 /* even 5. pixel */ | |
563 "mtlo %[vector_64], $ac3 \n
\t" /* even 6 */ | |
564 "mthi $zero, $ac3 \n
\t" | |
565 "sb %[st3], 0(%[dst]) \n
\t" /* even 3 */ | |
566 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
567 "dpa.w.ph $ac2, %[p1], %[filter45] \n
\t" /* even 5 */ | |
568 "extp %[Temp2], $ac2, 31 \n
\t" /* even 5 */ | |
569 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 4 */ | |
570 | |
571 /* even 6. pixel */ | |
572 "mtlo %[vector_64], $ac1 \n
\t" /* even 7 */ | |
573 "mthi $zero, $ac1 \n
\t" | |
574 "sb %[st1], 0(%[dst]) \n
\t" /* even 4 */ | |
575 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
576 "ulw %[qload1], 20(%[src]) \n
\t" | |
577 "dpa.w.ph $ac3, %[p5], %[filter45] \n
\t" /* even 6 */ | |
578 "extp %[Temp3], $ac3, 31 \n
\t" /* even 6 */ | |
579 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 5 */ | |
580 | |
581 /* even 7. pixel */ | |
582 "mtlo %[vector_64], $ac2 \n
\t" /* even 8 */ | |
583 "mthi $zero, $ac2 \n
\t" | |
584 "preceu.ph.qbr %[p5], %[qload1] \n
\t" | |
585 "sb %[st2], 0(%[dst]) \n
\t" /* even 5 */ | |
586 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
587 "dpa.w.ph $ac1, %[p2], %[filter45] \n
\t" /* even 7 */ | |
588 "extp %[Temp1], $ac1, 31 \n
\t" /* even 7 */ | |
589 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 6 */ | |
590 | |
591 /* even 8. pixel */ | |
592 "mtlo %[vector_64], $ac3 \n
\t" /* odd 1 */ | |
593 "mthi $zero, $ac3 \n
\t" | |
594 "dpa.w.ph $ac2, %[p3], %[filter45] \n
\t" /* even 8 */ | |
595 "sb %[st3], 0(%[dst]) \n
\t" /* even 6 */ | |
596 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
597 "extp %[Temp2], $ac2, 31 \n
\t" /* even 8 */ | |
598 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 7 */ | |
599 | |
600 /* ODD pixels */ | |
601 "ulw %[qload1], 1(%[src]) \n
\t" | |
602 "ulw %[qload2], 5(%[src]) \n
\t" | |
603 | |
604 /* odd 1. pixel */ | |
605 "mtlo %[vector_64], $ac1 \n
\t" /* odd 2 */ | |
606 "mthi $zero, $ac1 \n
\t" | |
607 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
608 "preceu.ph.qbl %[p2], %[qload1] \n
\t" | |
609 "preceu.ph.qbr %[p3], %[qload2] \n
\t" | |
610 "preceu.ph.qbl %[p4], %[qload2] \n
\t" | |
611 "sb %[st1], 0(%[dst]) \n
\t" /* even 7 */ | |
612 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
613 "ulw %[qload2], 9(%[src]) \n
\t" | |
614 "dpa.w.ph $ac3, %[p1], %[filter45] \n
\t" /* odd 1 */ | |
615 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 1 */ | |
616 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 8 */ | |
617 | |
618 /* odd 2. pixel */ | |
619 "mtlo %[vector_64], $ac2 \n
\t" /* odd 3 */ | |
620 "mthi $zero, $ac2 \n
\t" | |
621 "preceu.ph.qbr %[p1], %[qload2] \n
\t" | |
622 "preceu.ph.qbl %[p5], %[qload2] \n
\t" | |
623 "sb %[st2], 0(%[dst]) \n
\t" /* even 8 */ | |
624 "ulw %[qload1], 13(%[src]) \n
\t" | |
625 "dpa.w.ph $ac1, %[p2], %[filter45] \n
\t" /* odd 2 */ | |
626 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 2 */ | |
627 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 1 */ | |
628 | |
629 /* odd 3. pixel */ | |
630 "mtlo %[vector_64], $ac3 \n
\t" /* odd 4 */ | |
631 "mthi $zero, $ac3 \n
\t" | |
632 "preceu.ph.qbr %[p2], %[qload1] \n
\t" | |
633 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 1 */ | |
634 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
635 "dpa.w.ph $ac2, %[p3], %[filter45] \n
\t" /* odd 3 */ | |
636 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 3 */ | |
637 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 2 */ | |
638 | |
639 /* odd 4. pixel */ | |
640 "mtlo %[vector_64], $ac1 \n
\t" /* odd 5 */ | |
641 "mthi $zero, $ac1 \n
\t" | |
642 "preceu.ph.qbl %[p3], %[qload1] \n
\t" | |
643 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 2 */ | |
644 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
645 "dpa.w.ph $ac3, %[p4], %[filter45] \n
\t" /* odd 4 */ | |
646 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 4 */ | |
647 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 3 */ | |
648 | |
649 /* odd 5. pixel */ | |
650 "mtlo %[vector_64], $ac2 \n
\t" /* odd 6 */ | |
651 "mthi $zero, $ac2 \n
\t" | |
652 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 3 */ | |
653 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
654 "dpa.w.ph $ac1, %[p1], %[filter45] \n
\t" /* odd 5 */ | |
655 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 5 */ | |
656 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 4 */ | |
657 | |
658 /* odd 6. pixel */ | |
659 "mtlo %[vector_64], $ac3 \n
\t" /* odd 7 */ | |
660 "mthi $zero, $ac3 \n
\t" | |
661 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 4 */ | |
662 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
663 "ulw %[qload1], 21(%[src]) \n
\t" | |
664 "dpa.w.ph $ac2, %[p5], %[filter45] \n
\t" /* odd 6 */ | |
665 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 6 */ | |
666 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 5 */ | |
667 | |
668 /* odd 7. pixel */ | |
669 "mtlo %[vector_64], $ac1 \n
\t" /* odd 8 */ | |
670 "mthi $zero, $ac1 \n
\t" | |
671 "preceu.ph.qbr %[p5], %[qload1] \n
\t" | |
672 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 5 */ | |
673 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
674 "dpa.w.ph $ac3, %[p2], %[filter45] \n
\t" /* odd 7 */ | |
675 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 7 */ | |
676 | |
677 /* odd 8. pixel */ | |
678 "dpa.w.ph $ac1, %[p3], %[filter45] \n
\t" /* odd 8 */ | |
679 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 8 */ | |
680 | |
681 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 6 */ | |
682 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 7 */ | |
683 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 8 */ | |
684 | |
685 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 6 */ | |
686 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
687 | |
688 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 7 */ | |
689 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
690 | |
691 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 8 */ | |
692 | |
693 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), | |
694 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), | |
695 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
696 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
697 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) | |
698 : [filter45] "r" (filter45), [vector_64] "r" (vector_64), | |
699 [cm] "r" (cm), | |
700 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) | |
701 ); | |
702 | |
703 src += 16; | |
704 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); | |
705 odd_dst = (dst + dst_stride); | |
706 } | |
707 | |
708 /* Next row... */ | |
709 src_ptr += src_stride; | |
710 dst_ptr += 1; | |
711 } | |
712 } | |
713 | |
714 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, | |
715 uint8_t *dst, ptrdiff_t dst_stride, | |
716 const int16_t *filter, int w, int h) { | |
717 int x, y; | |
718 | |
719 for (y = 0; y < h; ++y) { | |
720 for (x = 0; x < w; ++x) { | |
721 int sum = 0; | |
722 | |
723 sum += src[x] * filter[3]; | |
724 sum += src[x + 1] * filter[4]; | |
725 | |
726 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); | |
727 } | |
728 | |
729 src += src_stride; | |
730 dst += 1; | |
731 } | |
732 } | |
733 | |
/*
 * 2-tap horizontal convolution with transposed output, dispatched on block
 * width.
 *
 * Output pixel x of source row y is written to dst[x * dst_stride + y].
 * Widths 4, 8, 16, 32 and 64 use the DSPr2 assembly kernels; any other
 * width falls back to convolve_bi_horiz_transposed().
 *
 * src        first source pixel.
 * src_stride distance in bytes between source rows.
 * dst        first destination byte.
 * dst_stride distance in bytes between consecutive output pixels of one
 *            source row.
 * filter     filter taps; the kernels read only elements 3 and 4.
 * w, h       block width and height in pixels.
 */
void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter,
                         int w, int h) {
  uint32_t pos = 38;

  /* bit position for extract from acc: written to DSPControl here and used
   * by the "extp" instructions in the assembly kernels */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
                                           dst, dst_stride,
                                           filter, h);
      break;
    case 8:
      convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
                                           dst, dst_stride,
                                           filter, h);
      break;
    case 16:
    case 32:
      /* 16 and 32 share one kernel, run once or twice per row */
      convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
                                            dst, dst_stride,
                                            filter, h,
                                            (w / 16));
      break;
    case 64:
      /* src and src + 32 were prefetched above; the 64-wide kernel also
       * reads beyond src + 64, so prefetch that part of the first row
       * instead of repeating src + 32 */
      prefetch_load(src + 64);
      convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                            dst, dst_stride,
                                            filter, h);
      break;
    default:
      convolve_bi_horiz_transposed(src, src_stride,
                                   dst, dst_stride,
                                   filter, w, h);
      break;
  }
}
782 #endif | |
OLD | NEW |