OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 #include <assert.h> | |
12 #include <stdio.h> | |
13 | |
14 #include "./vpx_dsp_rtcd.h" | |
15 #include "vpx_dsp/mips/vpx_common_dspr2.h" | |
16 #include "vpx_dsp/vpx_dsp_common.h" | |
17 #include "vpx_dsp/vpx_filter.h" | |
18 #include "vpx_ports/mem.h" | |
19 | |
20 #if HAVE_DSPR2 | |
/* Clamp ("crop") lookup table backing store: 256 identity entries padded with
 * CROP_WIDTH bytes of 0 below and CROP_WIDTH bytes of 255 above, so a filter
 * sum that over/undershoots [0,255] can be clamped with a single lbux lookup.
 * Filled by vpx_dsputil_static_init(). */
uint8_t vpx_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
/* Points at vpx_ff_cropTbl_a[CROP_WIDTH], i.e. the entry for value 0;
 * negative offsets land in the zero padding, offsets > 255 in the 255 padding. */
uint8_t *vpx_ff_cropTbl;
23 | |
24 void vpx_dsputil_static_init(void) { | |
25 int i; | |
26 | |
27 for (i = 0; i < 256; i++) vpx_ff_cropTbl_a[i + CROP_WIDTH] = i; | |
28 | |
29 for (i = 0; i < CROP_WIDTH; i++) { | |
30 vpx_ff_cropTbl_a[i] = 0; | |
31 vpx_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; | |
32 } | |
33 | |
34 vpx_ff_cropTbl = &vpx_ff_cropTbl_a[CROP_WIDTH]; | |
35 } | |
36 | |
/* Horizontal 8-tap convolution of a 4-pixel-wide strip, stored transposed:
 * the 4 filtered pixels of each input row are written down a dst column
 * (consecutive outputs dst_stride bytes apart), and dst itself advances by
 * one byte per input row, which effects the transpose.
 *
 * filter_x0 holds the 8 int16 taps; they are read here as four packed int32
 * words (two taps each) for dpa.w.ph.
 *   NOTE(review): the int32 reads assume filter_x0 is suitably aligned for
 *   32-bit access -- confirm with callers.
 * h is the number of input rows to process.
 */
static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int16_t *filter_x0,
                                              int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp lookup table for lbux */
  uint8_t *dst_ptr;
  int32_t vector1b, vector2b, vector3b, vector4b;  /* packed tap pairs */
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;  /* rounding constant preloaded into each accumulator */
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t tn1, tn2;

  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    dst_ptr = dst;
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);

    /* Pixels 0 and 2 ("even") are filtered from the word-aligned loads;
     * the balign byte shifts below re-align the same data by one byte for
     * pixels 1 and 3 ("odd"). Results are clamped via the cm table and
     * stored one byte per dst_stride. */
    __asm__ __volatile__ (
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "ulw              %[tn2],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
        "balign           %[tn1],         %[tn2],         3              \n\t"
        "balign           %[tn2],         %[tp2],         3              \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* odd 1. pixel */
        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp4],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"

        /* store bytes */
        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

        "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
          [dst_ptr] "+r" (dst_ptr)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
    );

    /* Next row... */
    src += src_stride;
    dst += 1;  /* next input row lands in the next dst column byte */
  }
}
152 | |
/* Horizontal 8-tap convolution of an 8-pixel-wide strip, stored transposed.
 * Even-indexed outputs (0,2,4,6) go through dst_ptr and odd-indexed outputs
 * (1,3,5,7) through odd_dst = dst_ptr + dst_stride; both advance by
 * dst_pitch_2 = 2 * dst_stride so the interleaved writes reconstruct the
 * transposed column. dst advances by one byte per input row.
 *
 * filter_x0: 8 int16 taps read as four packed int32 tap pairs.
 *   NOTE(review): assumes filter_x0 is aligned for 32-bit reads -- confirm
 *   with callers.
 * h: number of input rows.
 */
static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int16_t *filter_x0,
                                              int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp lookup table for lbux */
  uint8_t *dst_ptr;
  uint32_t vector4a = 64;  /* rounding constant preloaded into each accumulator */
  int32_t vector1b, vector2b, vector3b, vector4b;  /* packed tap pairs */
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3;
  uint32_t p1, p2, p3, p4, n1;
  uint8_t *odd_dst;
  uint32_t dst_pitch_2 = (dst_stride << 1);  /* stride between same-parity outputs */

  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);

    dst_ptr = dst;
    odd_dst = (dst_ptr + dst_stride);

    /* Even pixels use word-aligned loads at offsets 0/4/8/12; odd pixels
     * reuse the same data re-loaded one byte later (offsets 1/5/9/13). */
    __asm__ __volatile__ (
        "ulw              %[tp2],         0(%[src])                      \n\t"
        "ulw              %[tp1],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp1]                         \n\t"
        "ulw              %[tp3],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr    %[p1],          %[tp3]                         \n\t"
        "preceu.ph.qbl    %[n1],          %[tp3]                         \n\t"
        "ulw              %[tp2],         12(%[src])                     \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* even 3. pixel */
        "lbux             %[Temp2],       %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "preceu.ph.qbr    %[p2],          %[tp2]                         \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
        "lbux             %[tp3],         %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
        "extp             %[p3],          $ac1,           31             \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "sb               %[Temp2],       0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2] \n\t"
        "sb               %[tp3],         0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2] \n\t"

        "ulw              %[tp1],         1(%[src])                      \n\t"
        "ulw              %[tp3],         5(%[src])                      \n\t"

        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        "lbux             %[tp2],         %[p3](%[cm])                   \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2] \n\t"
        "ulw              %[tp2],         9(%[src])                      \n\t"

        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "lbux             %[tp1],         %[Temp3](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[n1],          %[tp2]                         \n\t"
        "ulw              %[Temp1],       13(%[src])                     \n\t"
        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2] \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac1,           31             \n\t"

        /* odd 3. pixel */
        "lbux             %[tp3],         %[Temp2](%[cm])                \n\t"
        "preceu.ph.qbr    %[p2],          %[Temp1]                       \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 4. pixel */
        "sb               %[tp3],         0(%[odd_dst])                  \n\t"
        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2] \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
        "extp             %[Temp1],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
        "lbux             %[n1],          %[Temp1](%[cm])                \n\t"

        /* store bytes */
        "sb               %[p4],          0(%[odd_dst])                  \n\t"
        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2] \n\t"

        "sb               %[p2],          0(%[odd_dst])                  \n\t"
        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2] \n\t"

        "sb               %[n1],          0(%[odd_dst])                  \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a), [cm] "r" (cm),
          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
    );

    /* Next row... */
    src += src_stride;
    dst += 1;  /* next input row lands in the next dst column byte */
  }
}
328 | |
/* Horizontal 8-tap convolution of a (count * 16)-pixel-wide strip, stored
 * transposed. Each inner-loop iteration filters 16 pixels: the 8 even-indexed
 * outputs are written through dst and the 8 odd-indexed outputs through
 * odd_dst = dst + dst_stride, both stepping by dst_pitch_2 = 2 * dst_stride
 * so the interleaved writes form one transposed column. After each 16-pixel
 * chunk, dst is re-based to dst_ptr + (c + 1) * 16 * dst_stride; dst_ptr
 * advances by one byte per input row.
 *
 * filter_x0: 8 int16 taps read as four packed int32 tap pairs.
 *   NOTE(review): assumes filter_x0 is aligned for 32-bit reads -- confirm
 *   with callers.
 * h: number of input rows; count: number of 16-pixel chunks per row.
 */
static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
                                               int32_t src_stride,
                                               uint8_t *dst_ptr,
                                               int32_t dst_stride,
                                               const int16_t *filter_x0,
                                               int32_t h,
                                               int32_t count) {
  int32_t c, y;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp lookup table for lbux */
  uint32_t vector_64 = 64;  /* rounding constant preloaded into each accumulator */
  int32_t filter12, filter34, filter56, filter78;  /* packed tap pairs */
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  uint32_t dst_pitch_2 = (dst_stride << 1);  /* stride between same-parity outputs */
  uint8_t *odd_dst;

  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);

    src = src_ptr;
    dst = dst_ptr;

    odd_dst = (dst + dst_stride);

    for (c = 0; c < count; c++) {
      /* Software-pipelined over the three accumulators $ac1..$ac3: while one
       * pixel's sum is extracted/clamped/stored, the next pixel's multiply-
       * accumulates are already in flight (the trailing comments name the
       * pixel each instruction belongs to). Even pixels read word-aligned
       * offsets 0..20; odd pixels re-read the bytes shifted by one (1..21). */
      __asm__ __volatile__ (
          "ulw              %[qload1],      0(%[src])                      \n\t"
          "ulw              %[qload2],      4(%[src])                      \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64],   $ac1                           \n\t" /* even 1 */
          "mthi             $zero,          $ac1                           \n\t"
          "mtlo             %[vector_64],   $ac2                           \n\t" /* even 2 */
          "mthi             $zero,          $ac2                           \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                      \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                      \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                      \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                      \n\t"
          "ulw              %[qload2],      8(%[src])                      \n\t"
          "dpa.w.ph         $ac1,           %[p1],          %[filter12]    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter34]    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter56]    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter78]    \n\t" /* even 1 */
          "extp             %[Temp1],       $ac1,           31             \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64],   $ac3                           \n\t" /* even 3 */
          "mthi             $zero,          $ac3                           \n\t"
          "preceu.ph.qbr    %[p1],          %[qload2]                      \n\t"
          "preceu.ph.qbl    %[p5],          %[qload2]                      \n\t"
          "ulw              %[qload1],      12(%[src])                     \n\t"
          "dpa.w.ph         $ac2,           %[p2],          %[filter12]    \n\t" /* even 1 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter34]    \n\t" /* even 1 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter56]    \n\t" /* even 1 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter78]    \n\t" /* even 1 */
          "lbux             %[st1],         %[Temp1](%[cm])                \n\t" /* even 1 */
          "extp             %[Temp2],       $ac2,           31             \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64],   $ac1                           \n\t" /* even 4 */
          "mthi             $zero,          $ac1                           \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                      \n\t"
          "sb               %[st1],         0(%[dst])                      \n\t" /* even 1 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2] \n\t"
          "dpa.w.ph         $ac3,           %[p3],          %[filter12]    \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter34]    \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter56]    \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p5],          %[filter78]    \n\t" /* even 3 */
          "extp             %[Temp3],       $ac3,           31             \n\t" /* even 3 */
          "lbux             %[st2],         %[Temp2](%[cm])                \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64],   $ac2                           \n\t" /* even 5 */
          "mthi             $zero,          $ac2                           \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                      \n\t"
          "sb               %[st2],         0(%[dst])                      \n\t" /* even 2 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2] \n\t"
          "ulw              %[qload2],      16(%[src])                     \n\t"
          "dpa.w.ph         $ac1,           %[p4],          %[filter12]    \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter34]    \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter56]    \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter78]    \n\t" /* even 4 */
          "extp             %[Temp1],       $ac1,           31             \n\t" /* even 4 */
          "lbux             %[st3],         %[Temp3](%[cm])                \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64],   $ac3                           \n\t" /* even 6 */
          "mthi             $zero,          $ac3                           \n\t"
          "preceu.ph.qbr    %[p4],          %[qload2]                      \n\t"
          "sb               %[st3],         0(%[dst])                      \n\t" /* even 3 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2] \n\t"
          "dpa.w.ph         $ac2,           %[p1],          %[filter12]    \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter34]    \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p2],          %[filter56]    \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter78]    \n\t" /* even 5 */
          "extp             %[Temp2],       $ac2,           31             \n\t" /* even 5 */
          "lbux             %[st1],         %[Temp1](%[cm])                \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64],   $ac1                           \n\t" /* even 7 */
          "mthi             $zero,          $ac1                           \n\t"
          "preceu.ph.qbl    %[p1],          %[qload2]                      \n\t"
          "sb               %[st1],         0(%[dst])                      \n\t" /* even 4 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2] \n\t"
          "ulw              %[qload1],      20(%[src])                     \n\t"
          "dpa.w.ph         $ac3,           %[p5],          %[filter12]    \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter34]    \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter56]    \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter78]    \n\t" /* even 6 */
          "extp             %[Temp3],       $ac3,           31             \n\t" /* even 6 */
          "lbux             %[st2],         %[Temp2](%[cm])                \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64],   $ac2                           \n\t" /* even 8 */
          "mthi             $zero,          $ac2                           \n\t"
          "preceu.ph.qbr    %[p5],          %[qload1]                      \n\t"
          "sb               %[st2],         0(%[dst])                      \n\t" /* even 5 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2] \n\t"
          "dpa.w.ph         $ac1,           %[p2],          %[filter12]    \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter34]    \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter56]    \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter78]    \n\t" /* even 7 */
          "extp             %[Temp1],       $ac1,           31             \n\t" /* even 7 */
          "lbux             %[st3],         %[Temp3](%[cm])                \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64],   $ac3                           \n\t" /* odd 1 */
          "mthi             $zero,          $ac3                           \n\t"
          "dpa.w.ph         $ac2,           %[p3],          %[filter12]    \n\t" /* even 8 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter34]    \n\t" /* even 8 */
          "sb               %[st3],         0(%[dst])                      \n\t" /* even 6 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2] \n\t"
          "dpa.w.ph         $ac2,           %[p1],          %[filter56]    \n\t" /* even 8 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter78]    \n\t" /* even 8 */
          "extp             %[Temp2],       $ac2,           31             \n\t" /* even 8 */
          "lbux             %[st1],         %[Temp1](%[cm])                \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],      1(%[src])                      \n\t"
          "ulw              %[qload2],      5(%[src])                      \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64],   $ac1                           \n\t" /* odd 2 */
          "mthi             $zero,          $ac1                           \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                      \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                      \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                      \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                      \n\t"
          "sb               %[st1],         0(%[dst])                      \n\t" /* even 7 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2] \n\t"
          "ulw              %[qload2],      9(%[src])                      \n\t"
          "dpa.w.ph         $ac3,           %[p1],          %[filter12]    \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter34]    \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter56]    \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter78]    \n\t" /* odd 1 */
          "extp             %[Temp3],       $ac3,           31             \n\t" /* odd 1 */
          "lbux             %[st2],         %[Temp2](%[cm])                \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64],   $ac2                           \n\t" /* odd 3 */
          "mthi             $zero,          $ac2                           \n\t"
          "preceu.ph.qbr    %[p1],          %[qload2]                      \n\t"
          "preceu.ph.qbl    %[p5],          %[qload2]                      \n\t"
          "sb               %[st2],         0(%[dst])                      \n\t" /* even 8 */
          "ulw              %[qload1],      13(%[src])                     \n\t"
          "dpa.w.ph         $ac1,           %[p2],          %[filter12]    \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter34]    \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter56]    \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter78]    \n\t" /* odd 2 */
          "extp             %[Temp1],       $ac1,           31             \n\t" /* odd 2 */
          "lbux             %[st3],         %[Temp3](%[cm])                \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64],   $ac3                           \n\t" /* odd 4 */
          "mthi             $zero,          $ac3                           \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                      \n\t"
          "sb               %[st3],         0(%[odd_dst])                  \n\t" /* odd 1 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2] \n\t"
          "dpa.w.ph         $ac2,           %[p3],          %[filter12]    \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter34]    \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter56]    \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter78]    \n\t" /* odd 3 */
          "extp             %[Temp2],       $ac2,           31             \n\t" /* odd 3 */
          "lbux             %[st1],         %[Temp1](%[cm])                \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64],   $ac1                           \n\t" /* odd 5 */
          "mthi             $zero,          $ac1                           \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                      \n\t"
          "sb               %[st1],         0(%[odd_dst])                  \n\t" /* odd 2 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2] \n\t"
          "ulw              %[qload2],      17(%[src])                     \n\t"
          "dpa.w.ph         $ac3,           %[p4],          %[filter12]    \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter34]    \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p5],          %[filter56]    \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter78]    \n\t" /* odd 4 */
          "extp             %[Temp3],       $ac3,           31             \n\t" /* odd 4 */
          "lbux             %[st2],         %[Temp2](%[cm])                \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64],   $ac2                           \n\t" /* odd 6 */
          "mthi             $zero,          $ac2                           \n\t"
          "preceu.ph.qbr    %[p4],          %[qload2]                      \n\t"
          "sb               %[st2],         0(%[odd_dst])                  \n\t" /* odd 3 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2] \n\t"
          "dpa.w.ph         $ac1,           %[p1],          %[filter12]    \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter34]    \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter56]    \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter78]    \n\t" /* odd 5 */
          "extp             %[Temp1],       $ac1,           31             \n\t" /* odd 5 */
          "lbux             %[st3],         %[Temp3](%[cm])                \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64],   $ac3                           \n\t" /* odd 7 */
          "mthi             $zero,          $ac3                           \n\t"
          "preceu.ph.qbl    %[p1],          %[qload2]                      \n\t"
          "sb               %[st3],         0(%[odd_dst])                  \n\t" /* odd 4 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2] \n\t"
          "ulw              %[qload1],      21(%[src])                     \n\t"
          "dpa.w.ph         $ac2,           %[p5],          %[filter12]    \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p2],          %[filter34]    \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter56]    \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter78]    \n\t" /* odd 6 */
          "extp             %[Temp2],       $ac2,           31             \n\t" /* odd 6 */
          "lbux             %[st1],         %[Temp1](%[cm])                \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64],   $ac1                           \n\t" /* odd 8 */
          "mthi             $zero,          $ac1                           \n\t"
          "preceu.ph.qbr    %[p5],          %[qload1]                      \n\t"
          "sb               %[st1],         0(%[odd_dst])                  \n\t" /* odd 5 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2] \n\t"
          "dpa.w.ph         $ac3,           %[p2],          %[filter12]    \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter34]    \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter56]    \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter78]    \n\t" /* odd 7 */
          "extp             %[Temp3],       $ac3,           31             \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,           %[p3],          %[filter12]    \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter34]    \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter56]    \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter78]    \n\t" /* odd 8 */
          "extp             %[Temp1],       $ac1,           31             \n\t" /* odd 8 */

          "lbux             %[st2],         %[Temp2](%[cm])                \n\t" /* odd 6 */
          "lbux             %[st3],         %[Temp3](%[cm])                \n\t" /* odd 7 */
          "lbux             %[st1],         %[Temp1](%[cm])                \n\t" /* odd 8 */

          "sb               %[st2],         0(%[odd_dst])                  \n\t" /* odd 6 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2] \n\t"

          "sb               %[st3],         0(%[odd_dst])                  \n\t" /* odd 7 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2] \n\t"

          "sb               %[st1],         0(%[odd_dst])                  \n\t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64), [cm] "r" (cm),
            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
      );

      src += 16;  /* next 16-pixel chunk of this row */
      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));  /* re-base for next chunk */
      odd_dst = (dst + dst_stride);
    }

    /* Next row... */
    src_ptr += src_stride;

    dst_ptr += 1;  /* next input row lands in the next dst column byte */
  }
}
619 | |
620 static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr, | |
621 int32_t src_stride, | |
622 uint8_t *dst_ptr, | |
623 int32_t dst_stride, | |
624 const int16_t *filter_x0, | |
625 int32_t h) { | |
626 int32_t c, y; | |
627 const uint8_t *src; | |
628 uint8_t *dst; | |
629 uint8_t *cm = vpx_ff_cropTbl; | |
630 uint32_t vector_64 = 64; | |
631 int32_t filter12, filter34, filter56, filter78; | |
632 int32_t Temp1, Temp2, Temp3; | |
633 uint32_t qload1, qload2; | |
634 uint32_t p1, p2, p3, p4, p5; | |
635 uint32_t st1, st2, st3; | |
636 uint32_t dst_pitch_2 = (dst_stride << 1); | |
637 uint8_t *odd_dst; | |
638 | |
639 filter12 = ((const int32_t *)filter_x0)[0]; | |
640 filter34 = ((const int32_t *)filter_x0)[1]; | |
641 filter56 = ((const int32_t *)filter_x0)[2]; | |
642 filter78 = ((const int32_t *)filter_x0)[3]; | |
643 | |
644 for (y = h; y--;) { | |
645 /* prefetch data to cache memory */ | |
646 prefetch_load(src_ptr + src_stride); | |
647 prefetch_load(src_ptr + src_stride + 32); | |
648 prefetch_load(src_ptr + src_stride + 64); | |
649 | |
650 src = src_ptr; | |
651 dst = dst_ptr; | |
652 | |
653 odd_dst = (dst + dst_stride); | |
654 | |
655 for (c = 0; c < 4; c++) { | |
656 __asm__ __volatile__ ( | |
657 "ulw %[qload1], 0(%[src]) \n
\t" | |
658 "ulw %[qload2], 4(%[src]) \n
\t" | |
659 | |
660 /* even 1. pixel */ | |
661 "mtlo %[vector_64], $ac1 \n
\t" /* even 1 */ | |
662 "mthi $zero, $ac1 \n
\t" | |
663 "mtlo %[vector_64], $ac2 \n
\t" /* even 2 */ | |
664 "mthi $zero, $ac2 \n
\t" | |
665 "preceu.ph.qbr %[p3], %[qload2] \n
\t" | |
666 "preceu.ph.qbl %[p4], %[qload2] \n
\t" | |
667 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
668 "preceu.ph.qbl %[p2], %[qload1] \n
\t" | |
669 "ulw %[qload2], 8(%[src]) \n
\t" | |
670 "dpa.w.ph $ac1, %[p1], %[filter12] \n
\t" /* even 1 */ | |
671 "dpa.w.ph $ac1, %[p2], %[filter34] \n
\t" /* even 1 */ | |
672 "dpa.w.ph $ac1, %[p3], %[filter56] \n
\t" /* even 1 */ | |
673 "dpa.w.ph $ac1, %[p4], %[filter78] \n
\t" /* even 1 */ | |
674 "extp %[Temp1], $ac1, 31 \n
\t" /* even 1 */ | |
675 | |
676 /* even 2. pixel */ | |
677 "mtlo %[vector_64], $ac3 \n
\t" /* even 3 */ | |
678 "mthi $zero, $ac3 \n
\t" | |
679 "preceu.ph.qbr %[p1], %[qload2] \n
\t" | |
680 "preceu.ph.qbl %[p5], %[qload2] \n
\t" | |
681 "ulw %[qload1], 12(%[src]) \n
\t" | |
682 "dpa.w.ph $ac2, %[p2], %[filter12] \n
\t" /* even 1 */ | |
683 "dpa.w.ph $ac2, %[p3], %[filter34] \n
\t" /* even 1 */ | |
684 "dpa.w.ph $ac2, %[p4], %[filter56] \n
\t" /* even 1 */ | |
685 "dpa.w.ph $ac2, %[p1], %[filter78] \n
\t" /* even 1 */ | |
686 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 1 */ | |
687 "extp %[Temp2], $ac2, 31 \n
\t" /* even 1 */ | |
688 | |
689 /* even 3. pixel */ | |
690 "mtlo %[vector_64], $ac1 \n
\t" /* even 4 */ | |
691 "mthi $zero, $ac1 \n
\t" | |
692 "preceu.ph.qbr %[p2], %[qload1] \n
\t" | |
693 "sb %[st1], 0(%[dst]) \n
\t" /* even 1 */ | |
694 "addu %[dst], %[dst], %[dst_pitch_2]
\n\t" | |
695 "dpa.w.ph $ac3, %[p3], %[filter12] \n
\t" /* even 3 */ | |
696 "dpa.w.ph $ac3, %[p4], %[filter34] \n
\t" /* even 3 */ | |
697 "dpa.w.ph $ac3, %[p1], %[filter56] \n
\t" /* even 3 */ | |
698 "dpa.w.ph $ac3, %[p5], %[filter78] \n
\t" /* even 3 */ | |
699 "extp %[Temp3], $ac3, 31 \n
\t" /* even 3 */ | |
700 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 1 */ | |
701 | |
702 /* even 4. pixel */ | |
703 "mtlo %[vector_64], $ac2 \n
\t" /* even 5 */ | |
704 "mthi $zero, $ac2 \n
\t" | |
705 "preceu.ph.qbl %[p3], %[qload1] \n
\t" | |
706 "sb %[st2], 0(%[dst]) \n
\t" /* even 2 */ | |
707 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
708 "ulw %[qload2], 16(%[src]) \n
\t" | |
709 "dpa.w.ph $ac1, %[p4], %[filter12] \n
\t" /* even 4 */ | |
710 "dpa.w.ph $ac1, %[p1], %[filter34] \n
\t" /* even 4 */ | |
711 "dpa.w.ph $ac1, %[p5], %[filter56] \n
\t" /* even 4 */ | |
712 "dpa.w.ph $ac1, %[p2], %[filter78] \n
\t" /* even 4 */ | |
713 "extp %[Temp1], $ac1, 31 \n
\t" /* even 4 */ | |
714 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 3 */ | |
715 | |
716 /* even 5. pixel */ | |
717 "mtlo %[vector_64], $ac3 \n
\t" /* even 6 */ | |
718 "mthi $zero, $ac3 \n
\t" | |
719 "preceu.ph.qbr %[p4], %[qload2] \n
\t" | |
720 "sb %[st3], 0(%[dst]) \n
\t" /* even 3 */ | |
721 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
722 "dpa.w.ph $ac2, %[p1], %[filter12] \n
\t" /* even 5 */ | |
723 "dpa.w.ph $ac2, %[p5], %[filter34] \n
\t" /* even 5 */ | |
724 "dpa.w.ph $ac2, %[p2], %[filter56] \n
\t" /* even 5 */ | |
725 "dpa.w.ph $ac2, %[p3], %[filter78] \n
\t" /* even 5 */ | |
726 "extp %[Temp2], $ac2, 31 \n
\t" /* even 5 */ | |
727 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 4 */ | |
728 | |
729 /* even 6. pixel */ | |
730 "mtlo %[vector_64], $ac1 \n
\t" /* even 7 */ | |
731 "mthi $zero, $ac1 \n
\t" | |
732 "preceu.ph.qbl %[p1], %[qload2] \n
\t" | |
733 "sb %[st1], 0(%[dst]) \n
\t" /* even 4 */ | |
734 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
735 "ulw %[qload1], 20(%[src]) \n
\t" | |
736 "dpa.w.ph $ac3, %[p5], %[filter12] \n
\t" /* even 6 */ | |
737 "dpa.w.ph $ac3, %[p2], %[filter34] \n
\t" /* even 6 */ | |
738 "dpa.w.ph $ac3, %[p3], %[filter56] \n
\t" /* even 6 */ | |
739 "dpa.w.ph $ac3, %[p4], %[filter78] \n
\t" /* even 6 */ | |
740 "extp %[Temp3], $ac3, 31 \n
\t" /* even 6 */ | |
741 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 5 */ | |
742 | |
743 /* even 7. pixel */ | |
744 "mtlo %[vector_64], $ac2 \n
\t" /* even 8 */ | |
745 "mthi $zero, $ac2 \n
\t" | |
746 "preceu.ph.qbr %[p5], %[qload1] \n
\t" | |
747 "sb %[st2], 0(%[dst]) \n
\t" /* even 5 */ | |
748 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
749 "dpa.w.ph $ac1, %[p2], %[filter12] \n
\t" /* even 7 */ | |
750 "dpa.w.ph $ac1, %[p3], %[filter34] \n
\t" /* even 7 */ | |
751 "dpa.w.ph $ac1, %[p4], %[filter56] \n
\t" /* even 7 */ | |
752 "dpa.w.ph $ac1, %[p1], %[filter78] \n
\t" /* even 7 */ | |
753 "extp %[Temp1], $ac1, 31 \n
\t" /* even 7 */ | |
754 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 6 */ | |
755 | |
756 /* even 8. pixel */ | |
757 "mtlo %[vector_64], $ac3 \n
\t" /* odd 1 */ | |
758 "mthi $zero, $ac3 \n
\t" | |
759 "dpa.w.ph $ac2, %[p3], %[filter12] \n
\t" /* even 8 */ | |
760 "dpa.w.ph $ac2, %[p4], %[filter34] \n
\t" /* even 8 */ | |
761 "sb %[st3], 0(%[dst]) \n
\t" /* even 6 */ | |
762 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
763 "dpa.w.ph $ac2, %[p1], %[filter56] \n
\t" /* even 8 */ | |
764 "dpa.w.ph $ac2, %[p5], %[filter78] \n
\t" /* even 8 */ | |
765 "extp %[Temp2], $ac2, 31 \n
\t" /* even 8 */ | |
766 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 7 */ | |
767 | |
768 /* ODD pixels */ | |
769 "ulw %[qload1], 1(%[src]) \n
\t" | |
770 "ulw %[qload2], 5(%[src]) \n
\t" | |
771 | |
772 /* odd 1. pixel */ | |
773 "mtlo %[vector_64], $ac1 \n
\t" /* odd 2 */ | |
774 "mthi $zero, $ac1 \n
\t" | |
775 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
776 "preceu.ph.qbl %[p2], %[qload1] \n
\t" | |
777 "preceu.ph.qbr %[p3], %[qload2] \n
\t" | |
778 "preceu.ph.qbl %[p4], %[qload2] \n
\t" | |
779 "sb %[st1], 0(%[dst]) \n
\t" /* even 7 */ | |
780 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
781 "ulw %[qload2], 9(%[src]) \n
\t" | |
782 "dpa.w.ph $ac3, %[p1], %[filter12] \n
\t" /* odd 1 */ | |
783 "dpa.w.ph $ac3, %[p2], %[filter34] \n
\t" /* odd 1 */ | |
784 "dpa.w.ph $ac3, %[p3], %[filter56] \n
\t" /* odd 1 */ | |
785 "dpa.w.ph $ac3, %[p4], %[filter78] \n
\t" /* odd 1 */ | |
786 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 1 */ | |
787 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 8 */ | |
788 | |
789 /* odd 2. pixel */ | |
790 "mtlo %[vector_64], $ac2 \n
\t" /* odd 3 */ | |
791 "mthi $zero, $ac2 \n
\t" | |
792 "preceu.ph.qbr %[p1], %[qload2] \n
\t" | |
793 "preceu.ph.qbl %[p5], %[qload2] \n
\t" | |
794 "sb %[st2], 0(%[dst]) \n
\t" /* even 8 */ | |
795 "ulw %[qload1], 13(%[src]) \n
\t" | |
796 "dpa.w.ph $ac1, %[p2], %[filter12] \n
\t" /* odd 2 */ | |
797 "dpa.w.ph $ac1, %[p3], %[filter34] \n
\t" /* odd 2 */ | |
798 "dpa.w.ph $ac1, %[p4], %[filter56] \n
\t" /* odd 2 */ | |
799 "dpa.w.ph $ac1, %[p1], %[filter78] \n
\t" /* odd 2 */ | |
800 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 2 */ | |
801 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 1 */ | |
802 | |
803 /* odd 3. pixel */ | |
804 "mtlo %[vector_64], $ac3 \n
\t" /* odd 4 */ | |
805 "mthi $zero, $ac3 \n
\t" | |
806 "preceu.ph.qbr %[p2], %[qload1] \n
\t" | |
807 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 1 */ | |
808 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
809 "dpa.w.ph $ac2, %[p3], %[filter12] \n
\t" /* odd 3 */ | |
810 "dpa.w.ph $ac2, %[p4], %[filter34] \n
\t" /* odd 3 */ | |
811 "dpa.w.ph $ac2, %[p1], %[filter56] \n
\t" /* odd 3 */ | |
812 "dpa.w.ph $ac2, %[p5], %[filter78] \n
\t" /* odd 3 */ | |
813 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 3 */ | |
814 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 2 */ | |
815 | |
816 /* odd 4. pixel */ | |
817 "mtlo %[vector_64], $ac1 \n
\t" /* odd 5 */ | |
818 "mthi $zero, $ac1 \n
\t" | |
819 "preceu.ph.qbl %[p3], %[qload1] \n
\t" | |
820 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 2 */ | |
821 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
822 "ulw %[qload2], 17(%[src]) \n
\t" | |
823 "dpa.w.ph $ac3, %[p4], %[filter12] \n
\t" /* odd 4 */ | |
824 "dpa.w.ph $ac3, %[p1], %[filter34] \n
\t" /* odd 4 */ | |
825 "dpa.w.ph $ac3, %[p5], %[filter56] \n
\t" /* odd 4 */ | |
826 "dpa.w.ph $ac3, %[p2], %[filter78] \n
\t" /* odd 4 */ | |
827 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 4 */ | |
828 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 3 */ | |
829 | |
830 /* odd 5. pixel */ | |
831 "mtlo %[vector_64], $ac2 \n
\t" /* odd 6 */ | |
832 "mthi $zero, $ac2 \n
\t" | |
833 "preceu.ph.qbr %[p4], %[qload2] \n
\t" | |
834 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 3 */ | |
835 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
836 "dpa.w.ph $ac1, %[p1], %[filter12] \n
\t" /* odd 5 */ | |
837 "dpa.w.ph $ac1, %[p5], %[filter34] \n
\t" /* odd 5 */ | |
838 "dpa.w.ph $ac1, %[p2], %[filter56] \n
\t" /* odd 5 */ | |
839 "dpa.w.ph $ac1, %[p3], %[filter78] \n
\t" /* odd 5 */ | |
840 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 5 */ | |
841 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 4 */ | |
842 | |
843 /* odd 6. pixel */ | |
844 "mtlo %[vector_64], $ac3 \n
\t" /* odd 7 */ | |
845 "mthi $zero, $ac3 \n
\t" | |
846 "preceu.ph.qbl %[p1], %[qload2] \n
\t" | |
847 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 4 */ | |
848 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
849 "ulw %[qload1], 21(%[src]) \n
\t" | |
850 "dpa.w.ph $ac2, %[p5], %[filter12] \n
\t" /* odd 6 */ | |
851 "dpa.w.ph $ac2, %[p2], %[filter34] \n
\t" /* odd 6 */ | |
852 "dpa.w.ph $ac2, %[p3], %[filter56] \n
\t" /* odd 6 */ | |
853 "dpa.w.ph $ac2, %[p4], %[filter78] \n
\t" /* odd 6 */ | |
854 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 6 */ | |
855 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 5 */ | |
856 | |
857 /* odd 7. pixel */ | |
858 "mtlo %[vector_64], $ac1 \n
\t" /* odd 8 */ | |
859 "mthi $zero, $ac1 \n
\t" | |
860 "preceu.ph.qbr %[p5], %[qload1] \n
\t" | |
861 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 5 */ | |
862 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
863 "dpa.w.ph $ac3, %[p2], %[filter12] \n
\t" /* odd 7 */ | |
864 "dpa.w.ph $ac3, %[p3], %[filter34] \n
\t" /* odd 7 */ | |
865 "dpa.w.ph $ac3, %[p4], %[filter56] \n
\t" /* odd 7 */ | |
866 "dpa.w.ph $ac3, %[p1], %[filter78] \n
\t" /* odd 7 */ | |
867 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 7 */ | |
868 | |
869 /* odd 8. pixel */ | |
870 "dpa.w.ph $ac1, %[p3], %[filter12] \n
\t" /* odd 8 */ | |
871 "dpa.w.ph $ac1, %[p4], %[filter34] \n
\t" /* odd 8 */ | |
872 "dpa.w.ph $ac1, %[p1], %[filter56] \n
\t" /* odd 8 */ | |
873 "dpa.w.ph $ac1, %[p5], %[filter78] \n
\t" /* odd 8 */ | |
874 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 8 */ | |
875 | |
876 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 6 */ | |
877 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 7 */ | |
878 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 8 */ | |
879 | |
880 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 6 */ | |
881 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
882 | |
883 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 7 */ | |
884 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
885 | |
886 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 8 */ | |
887 | |
888 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), | |
889 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), | |
890 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
891 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
892 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) | |
893 : [filter12] "r" (filter12), [filter34] "r" (filter34), | |
894 [filter56] "r" (filter56), [filter78] "r" (filter78), | |
895 [vector_64] "r" (vector_64), [cm] "r" (cm), | |
896 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) | |
897 ); | |
898 | |
899 src += 16; | |
900 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); | |
901 odd_dst = (dst + dst_stride); | |
902 } | |
903 | |
904 /* Next row... */ | |
905 src_ptr += src_stride; | |
906 | |
907 dst_ptr += 1; | |
908 } | |
909 } | |
910 | |
911 void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, | |
912 uint8_t *dst, ptrdiff_t dst_stride, | |
913 const int16_t *filter, int w, int h) { | |
914 int x, y, k; | |
915 | |
916 for (y = 0; y < h; ++y) { | |
917 for (x = 0; x < w; ++x) { | |
918 int sum = 0; | |
919 | |
920 for (k = 0; k < 8; ++k) | |
921 sum += src[x + k] * filter[k]; | |
922 | |
923 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); | |
924 } | |
925 | |
926 src += src_stride; | |
927 dst += 1; | |
928 } | |
929 } | |
930 | |
/* Copy a w x h block while transposing it: input pixel (row, col) is
 * written to output location col * dst_stride + row.  No filtering. */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           int w, int h) {
  int row, col;

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; ++col) {
      dst[col * dst_stride + row] = src[row * src_stride + col];
    }
  }
}
945 | |
/* 8-tap separable convolution, DSPr2-accelerated.
 *
 * Strategy: run the horizontal pass first, writing its output TRANSPOSED
 * into `temp` (row stride = intermediate_height).  The vertical pass can
 * then reuse the same horizontal kernels on `temp`, producing the final
 * dst.  Only the common x_step_q4 == y_step_q4 == 16 (no scaling) case is
 * handled here; anything else falls back to the C implementation.
 */
void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  /* Transposed intermediate: up to 64 columns by 135 rows
     (presumably sized for the worst-case intermediate height — verify). */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  /* Rows the vertical pass will read: h scaled by y_step_q4/16, plus 7
     extra rows for the 8-tap filter support. */
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
  uint32_t pos = 38;

  /* Program the DSPControl "pos" field used by the `extp` instructions in
     the assembly kernels (bit position for extracting from the
     accumulator). */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"
    :
    : [pos] "r" (pos)
  );

  if (intermediate_height < h)
    intermediate_height = h;

  /* Scaled convolution is not implemented in the dspr2 path. */
  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vpx_convolve8_c(src, src_stride,
                           dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);

  /* Read taps [2],[3] of each filter as one 32-bit word; 0x800000 means
     tap2 == 0 and tap3 == 128, i.e. both filters are the identity, so the
     whole operation is a plain copy.  (Assumes little-endian int16 packing
     — NOTE(review): confirm this matches the target's endianness.) */
  if ((((const int32_t *)filter_x)[1] == 0x800000)
      && (((const int32_t *)filter_y)[1] == 0x800000))
    return vpx_convolve_copy(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);

  /* Horizontal pass into the transposed intermediate buffer. */
  if (filter_x[3] == 0x80) {
    /* Identity horizontal filter (center tap 128): transpose-copy only.
       src is rewound 3 rows so the vertical pass has its top taps. */
    copy_horiz_transposed(src - src_stride * 3, src_stride,
                          temp, intermediate_height,
                          w, intermediate_height);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    /* Taps [0],[1] are zero: short (bilinear-class) filter kernel. */
    vpx_convolve2_dspr2(src - src_stride * 3, src_stride,
                        temp, intermediate_height,
                        filter_x,
                        w, intermediate_height);
  } else {
    /* Full 8-tap: rewind 3 rows (vertical support) and 3 columns
       (horizontal support). */
    src -= (src_stride * 3 + 3);

    /* prefetch data to cache memory */
    prefetch_load(src);
    prefetch_load(src + 32);

    /* Dispatch on block width to the hand-written dspr2 kernels. */
    switch (w) {
      case 4:
        convolve_horiz_4_transposed_dspr2(src, src_stride,
                                          temp, intermediate_height,
                                          filter_x, intermediate_height);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(src, src_stride,
                                          temp, intermediate_height,
                                          filter_x, intermediate_height);
        break;
      case 16:
      case 32:
        /* The 16-wide kernel iterates (w/16) 16-pixel strips. */
        convolve_horiz_16_transposed_dspr2(src, src_stride,
                                           temp, intermediate_height,
                                           filter_x, intermediate_height,
                                           (w/16));
        break;
      case 64:
        prefetch_load(src + 32);
        convolve_horiz_64_transposed_dspr2(src, src_stride,
                                           temp, intermediate_height,
                                           filter_x, intermediate_height);
        break;
      default:
        convolve_horiz_transposed(src, src_stride,
                                  temp, intermediate_height,
                                  filter_x, w, intermediate_height);
        break;
    }
  }

  /* Vertical pass: because `temp` is transposed, the vertical filter is
     again a horizontal filter over temp, writing the final (re-transposed)
     result into dst.  `temp + 3` skips the 3 leading support rows when no
     real filtering is needed. */
  if (filter_y[3] == 0x80) {
    copy_horiz_transposed(temp + 3, intermediate_height,
                          dst, dst_stride,
                          h, w);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve2_dspr2(temp + 3, intermediate_height,
                        dst, dst_stride,
                        filter_y,
                        h, w);
  } else {
    /* Note: roles of w and h are swapped here because the data is
       transposed. */
    switch (h) {
      case 4:
        convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
                                          dst, dst_stride,
                                          filter_y, w);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
                                          dst, dst_stride,
                                          filter_y, w);
        break;
      case 16:
      case 32:
        convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
                                           dst, dst_stride,
                                           filter_y, w, (h/16));
        break;
      case 64:
        convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
                                           dst, dst_stride,
                                           filter_y, w);
        break;
      default:
        convolve_horiz_transposed(temp, intermediate_height,
                                  dst, dst_stride,
                                  filter_y, h, w);
        break;
    }
  }
}
1070 | |
/* Block copy (no filtering), DSPr2-accelerated.
 *
 * The filter_x/filter_y arguments and their strides are part of the
 * standard convolve signature but are unused here.  For the common widths
 * each row is copied with unaligned word loads (`ulw`) and word stores
 * (`sw`); NOTE(review): `sw` requires a word-aligned address, so dst and
 * dst_stride are presumably 4-byte aligned for these block sizes —
 * confirm against callers.  Unsupported widths fall back to a byte loop.
 */
void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
    {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--; ) {
        /* Prefetch the next row while copying the current one. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
    }
    break;
    case 8:
    {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
    }
    break;
    case 16:
    {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
    }
    break;
    case 32:
    {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
    }
    break;
    case 64:
    {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* Wider rows: prefetch the second half of the first row/dst too. */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage — done as two 8-word halves per row, reusing the
         same eight temporaries. */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
            "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
            "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
            "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
            "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
            "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
            "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
            "sw               %[tp8],         60(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
    }
    break;
    default:
      /* Fallback for other widths: plain byte-by-byte copy. */
      for (y = h; y--; ) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
1282 #endif | |
OLD | NEW |