Chromium Code Reviews

Side by Side Diff: source/libvpx/vpx_dsp/mips/vpx_convolve8_avg_dspr2.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/vpx_common_dspr2.h"
16 #include "vpx_dsp/vpx_convolve.h"
17 #include "vpx_dsp/vpx_dsp_common.h"
18 #include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
21 static void convolve_avg_vert_4_dspr2(const uint8_t *src,
22 int32_t src_stride,
23 uint8_t *dst,
24 int32_t dst_stride,
25 const int16_t *filter_y,
26 int32_t w,
27 int32_t h) {
28 int32_t x, y;
29 const uint8_t *src_ptr;
30 uint8_t *dst_ptr;
31 uint8_t *cm = vpx_ff_cropTbl;
32 uint32_t vector4a = 64;
33 uint32_t load1, load2, load3, load4;
34 uint32_t p1, p2;
35 uint32_t n1, n2;
36 uint32_t scratch1, scratch2;
37 uint32_t store1, store2;
38 int32_t vector1b, vector2b, vector3b, vector4b;
39 int32_t Temp1, Temp2;
40
41 vector1b = ((const int32_t *)filter_y)[0];
42 vector2b = ((const int32_t *)filter_y)[1];
43 vector3b = ((const int32_t *)filter_y)[2];
44 vector4b = ((const int32_t *)filter_y)[3];
45
46 src -= 3 * src_stride;
47
48 for (y = h; y--;) {
49 /* prefetch data to cache memory */
50 prefetch_store(dst + dst_stride);
51
52 for (x = 0; x < w; x += 4) {
53 src_ptr = src + x;
54 dst_ptr = dst + x;
55
56 __asm__ __volatile__ (
57 "ulw %[load1], 0(%[src_ptr]) \n\t"
58 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
59 "ulw %[load2], 0(%[src_ptr]) \n\t"
60 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
61 "ulw %[load3], 0(%[src_ptr]) \n\t"
62 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
63 "ulw %[load4], 0(%[src_ptr]) \n\t"
64
65 "mtlo %[vector4a], $ac0 \n\t"
66 "mtlo %[vector4a], $ac1 \n\t"
67 "mtlo %[vector4a], $ac2 \n\t"
68 "mtlo %[vector4a], $ac3 \n\t"
69 "mthi $zero, $ac0 \n\t"
70 "mthi $zero, $ac1 \n\t"
71 "mthi $zero, $ac2 \n\t"
72 "mthi $zero, $ac3 \n\t"
73
74 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
75 "preceu.ph.qbr %[p1], %[load2] \n\t"
76 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
77 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
78 "preceu.ph.qbr %[scratch2], %[load3] \n\t"
79 "preceu.ph.qbr %[p2], %[load4] \n\t"
80 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
81 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
82
83 "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
84 "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
85 "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
86 "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
87
88 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
89 "preceu.ph.qbl %[p1], %[load2] \n\t"
90 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
91 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
92 "preceu.ph.qbl %[scratch2], %[load3] \n\t"
93 "preceu.ph.qbl %[p2], %[load4] \n\t"
94 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
95 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
96
97 "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
98 "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
99 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
100 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
101
102 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
103 "ulw %[load1], 0(%[src_ptr]) \n\t"
104 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
105 "ulw %[load2], 0(%[src_ptr]) \n\t"
106 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
107 "ulw %[load3], 0(%[src_ptr]) \n\t"
108 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
109 "ulw %[load4], 0(%[src_ptr]) \n\t"
110
111 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
112 "preceu.ph.qbr %[p1], %[load2] \n\t"
113 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
114 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
115 "preceu.ph.qbr %[scratch2], %[load3] \n\t"
116 "preceu.ph.qbr %[p2], %[load4] \n\t"
117 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
118 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
119
120 "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
121 "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
122 "extp %[Temp1], $ac0, 31 \n\t"
123 "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
124 "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
125 "extp %[Temp2], $ac1, 31 \n\t"
126
127 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
128 "preceu.ph.qbl %[p1], %[load2] \n\t"
129 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
130 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
131 "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
132 "preceu.ph.qbl %[scratch2], %[load3] \n\t"
133 "preceu.ph.qbl %[p2], %[load4] \n\t"
134 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
135 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
136 "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
137
138 "lbux %[store1], %[Temp1](%[cm]) \n\t"
139 "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
140 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
141 "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
142 "extp %[Temp1], $ac2, 31 \n\t"
143
144 "lbux %[store2], %[Temp2](%[cm]) \n\t"
145 "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
146 "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
147 "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
148 "extp %[Temp2], $ac3, 31 \n\t"
149 "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
150
151 "sb %[store1], 0(%[dst_ptr]) \n\t"
152 "sb %[store2], 1(%[dst_ptr]) \n\t"
153 "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
154
155 "lbux %[store1], %[Temp1](%[cm]) \n\t"
156 "lbux %[store2], %[Temp2](%[cm]) \n\t"
157 "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
158 "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
159
160 "sb %[store1], 2(%[dst_ptr]) \n\t"
161 "sb %[store2], 3(%[dst_ptr]) \n\t"
162
163 : [load1] "=&r" (load1), [load2] "=&r" (load2),
164 [load3] "=&r" (load3), [load4] "=&r" (load4),
165 [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
166 [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
167 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
168 [store1] "=&r" (store1), [store2] "=&r" (store2),
169 [src_ptr] "+r" (src_ptr)
170 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
171 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
172 [vector4a] "r" (vector4a),
173 [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
174 );
175 }
176
177 /* Next row... */
178 src += src_stride;
179 dst += dst_stride;
180 }
181 }
182
183 static void convolve_avg_vert_64_dspr2(const uint8_t *src,
184 int32_t src_stride,
185 uint8_t *dst,
186 int32_t dst_stride,
187 const int16_t *filter_y,
188 int32_t h) {
189 int32_t x, y;
190 const uint8_t *src_ptr;
191 uint8_t *dst_ptr;
192 uint8_t *cm = vpx_ff_cropTbl;
193 uint32_t vector4a = 64;
194 uint32_t load1, load2, load3, load4;
195 uint32_t p1, p2;
196 uint32_t n1, n2;
197 uint32_t scratch1, scratch2;
198 uint32_t store1, store2;
199 int32_t vector1b, vector2b, vector3b, vector4b;
200 int32_t Temp1, Temp2;
201
202 vector1b = ((const int32_t *)filter_y)[0];
203 vector2b = ((const int32_t *)filter_y)[1];
204 vector3b = ((const int32_t *)filter_y)[2];
205 vector4b = ((const int32_t *)filter_y)[3];
206
207 src -= 3 * src_stride;
208
209 for (y = h; y--;) {
210 /* prefetch data to cache memory */
211 prefetch_store(dst + dst_stride);
212 prefetch_store(dst + dst_stride + 32);
213
214 for (x = 0; x < 64; x += 4) {
215 src_ptr = src + x;
216 dst_ptr = dst + x;
217
218 __asm__ __volatile__ (
219 "ulw %[load1], 0(%[src_ptr]) \n\t"
220 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
221 "ulw %[load2], 0(%[src_ptr]) \n\t"
222 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
223 "ulw %[load3], 0(%[src_ptr]) \n\t"
224 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
225 "ulw %[load4], 0(%[src_ptr]) \n\t"
226
227 "mtlo %[vector4a], $ac0 \n\t"
228 "mtlo %[vector4a], $ac1 \n\t"
229 "mtlo %[vector4a], $ac2 \n\t"
230 "mtlo %[vector4a], $ac3 \n\t"
231 "mthi $zero, $ac0 \n\t"
232 "mthi $zero, $ac1 \n\t"
233 "mthi $zero, $ac2 \n\t"
234 "mthi $zero, $ac3 \n\t"
235
236 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
237 "preceu.ph.qbr %[p1], %[load2] \n\t"
238 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
239 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
240 "preceu.ph.qbr %[scratch2], %[load3] \n\t"
241 "preceu.ph.qbr %[p2], %[load4] \n\t"
242 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
243 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
244
245 "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
246 "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
247 "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
248 "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
249
250 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
251 "preceu.ph.qbl %[p1], %[load2] \n\t"
252 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
253 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
254 "preceu.ph.qbl %[scratch2], %[load3] \n\t"
255 "preceu.ph.qbl %[p2], %[load4] \n\t"
256 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
257 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
258
259 "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
260 "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
261 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
262 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
263
264 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
265 "ulw %[load1], 0(%[src_ptr]) \n\t"
266 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
267 "ulw %[load2], 0(%[src_ptr]) \n\t"
268 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
269 "ulw %[load3], 0(%[src_ptr]) \n\t"
270 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
271 "ulw %[load4], 0(%[src_ptr]) \n\t"
272
273 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
274 "preceu.ph.qbr %[p1], %[load2] \n\t"
275 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
276 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
277 "preceu.ph.qbr %[scratch2], %[load3] \n\t"
278 "preceu.ph.qbr %[p2], %[load4] \n\t"
279 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
280 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
281
282 "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
283 "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
284 "extp %[Temp1], $ac0, 31 \n\t"
285 "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
286 "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
287 "extp %[Temp2], $ac1, 31 \n\t"
288
289 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
290 "preceu.ph.qbl %[p1], %[load2] \n\t"
291 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
292 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
293 "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
294 "preceu.ph.qbl %[scratch2], %[load3] \n\t"
295 "preceu.ph.qbl %[p2], %[load4] \n\t"
296 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
297 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
298 "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
299
300 "lbux %[store1], %[Temp1](%[cm]) \n\t"
301 "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
302 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
303 "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
304 "extp %[Temp1], $ac2, 31 \n\t"
305
306 "lbux %[store2], %[Temp2](%[cm]) \n\t"
307 "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
308 "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
309 "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
310 "extp %[Temp2], $ac3, 31 \n\t"
311 "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
312
313 "sb %[store1], 0(%[dst_ptr]) \n\t"
314 "sb %[store2], 1(%[dst_ptr]) \n\t"
315 "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
316
317 "lbux %[store1], %[Temp1](%[cm]) \n\t"
318 "lbux %[store2], %[Temp2](%[cm]) \n\t"
319 "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
320 "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
321
322 "sb %[store1], 2(%[dst_ptr]) \n\t"
323 "sb %[store2], 3(%[dst_ptr]) \n\t"
324
325 : [load1] "=&r" (load1), [load2] "=&r" (load2),
326 [load3] "=&r" (load3), [load4] "=&r" (load4),
327 [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
328 [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
329 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
330 [store1] "=&r" (store1), [store2] "=&r" (store2),
331 [src_ptr] "+r" (src_ptr)
332 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
333 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
334 [vector4a] "r" (vector4a),
335 [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
336 );
337 }
338
339 /* Next row... */
340 src += src_stride;
341 dst += dst_stride;
342 }
343 }
344
345 void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
346 uint8_t *dst, ptrdiff_t dst_stride,
347 const int16_t *filter_x, int x_step_q4,
348 const int16_t *filter_y, int y_step_q4,
349 int w, int h) {
350 if (((const int32_t *)filter_y)[1] == 0x800000) {
351 vpx_convolve_avg(src, src_stride,
352 dst, dst_stride,
353 filter_x, x_step_q4,
354 filter_y, y_step_q4,
355 w, h);
356 } else if (((const int32_t *)filter_y)[0] == 0) {
357 vpx_convolve2_avg_vert_dspr2(src, src_stride,
358 dst, dst_stride,
359 filter_x, x_step_q4,
360 filter_y, y_step_q4,
361 w, h);
362 } else {
363 if (16 == y_step_q4) {
364 uint32_t pos = 38;
365
366 /* bit position for extract from acc */
367 __asm__ __volatile__ (
368 "wrdsp %[pos], 1 \n\t"
369 :
370 : [pos] "r" (pos)
371 );
372
373 prefetch_store(dst);
374
375 switch (w) {
376 case 4:
377 case 8:
378 case 16:
379 case 32:
380 convolve_avg_vert_4_dspr2(src, src_stride,
381 dst, dst_stride,
382 filter_y, w, h);
383 break;
384 case 64:
385 prefetch_store(dst + 32);
386 convolve_avg_vert_64_dspr2(src, src_stride,
387 dst, dst_stride,
388 filter_y, h);
389 break;
390 default:
391 vpx_convolve8_avg_vert_c(src, src_stride,
392 dst, dst_stride,
393 filter_x, x_step_q4,
394 filter_y, y_step_q4,
395 w, h);
396 break;
397 }
398 } else {
399 vpx_convolve8_avg_vert_c(src, src_stride,
400 dst, dst_stride,
401 filter_x, x_step_q4,
402 filter_y, y_step_q4,
403 w, h);
404 }
405 }
406 }
407
408 void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
409 uint8_t *dst, ptrdiff_t dst_stride,
410 const int16_t *filter_x, int x_step_q4,
411 const int16_t *filter_y, int y_step_q4,
412 int w, int h) {
413 /* Fixed size intermediate buffer places limits on parameters. */
414 DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
415 int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
416
417 assert(w <= 64);
418 assert(h <= 64);
419
420 if (intermediate_height < h)
421 intermediate_height = h;
422
423 if (x_step_q4 != 16 || y_step_q4 != 16)
424 return vpx_convolve8_avg_c(src, src_stride,
425 dst, dst_stride,
426 filter_x, x_step_q4,
427 filter_y, y_step_q4,
428 w, h);
429
430 vpx_convolve8_horiz(src - (src_stride * 3), src_stride,
431 temp, 64,
432 filter_x, x_step_q4,
433 filter_y, y_step_q4,
434 w, intermediate_height);
435
436 vpx_convolve8_avg_vert(temp + 64 * 3, 64,
437 dst, dst_stride,
438 filter_x, x_step_q4,
439 filter_y, y_step_q4,
440 w, h);
441 }
442
443 void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
444 uint8_t *dst, ptrdiff_t dst_stride,
445 const int16_t *filter_x, int filter_x_stride,
446 const int16_t *filter_y, int filter_y_stride,
447 int w, int h) {
448 int x, y;
449 uint32_t tp1, tp2, tn1;
450 uint32_t tp3, tp4, tn2;
451
452 /* prefetch data to cache memory */
453 prefetch_load(src);
454 prefetch_load(src + 32);
455 prefetch_store(dst);
456
457 switch (w) {
458 case 4:
459 /* 1 word storage */
460 for (y = h; y--; ) {
461 prefetch_load(src + src_stride);
462 prefetch_load(src + src_stride + 32);
463 prefetch_store(dst + dst_stride);
464
465 __asm__ __volatile__ (
466 "ulw %[tp1], 0(%[src]) \n\t"
467 "ulw %[tp2], 0(%[dst]) \n\t"
468 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
469 "sw %[tn1], 0(%[dst]) \n\t" /* store */
470
471 : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
472 [tp2] "=&r" (tp2)
473 : [src] "r" (src), [dst] "r" (dst)
474 );
475
476 src += src_stride;
477 dst += dst_stride;
478 }
479 break;
480 case 8:
481 /* 2 word storage */
482 for (y = h; y--; ) {
483 prefetch_load(src + src_stride);
484 prefetch_load(src + src_stride + 32);
485 prefetch_store(dst + dst_stride);
486
487 __asm__ __volatile__ (
488 "ulw %[tp1], 0(%[src]) \n\t"
489 "ulw %[tp2], 0(%[dst]) \n\t"
490 "ulw %[tp3], 4(%[src]) \n\t"
491 "ulw %[tp4], 4(%[dst]) \n\t"
492 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
493 "sw %[tn1], 0(%[dst]) \n\t" /* store */
494 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
495 "sw %[tn2], 4(%[dst]) \n\t" /* store */
496
497 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
498 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
499 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
500 : [src] "r" (src), [dst] "r" (dst)
501 );
502
503 src += src_stride;
504 dst += dst_stride;
505 }
506 break;
507 case 16:
508 /* 4 word storage */
509 for (y = h; y--; ) {
510 prefetch_load(src + src_stride);
511 prefetch_load(src + src_stride + 32);
512 prefetch_store(dst + dst_stride);
513
514 __asm__ __volatile__ (
515 "ulw %[tp1], 0(%[src]) \n\t"
516 "ulw %[tp2], 0(%[dst]) \n\t"
517 "ulw %[tp3], 4(%[src]) \n\t"
518 "ulw %[tp4], 4(%[dst]) \n\t"
519 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
520 "ulw %[tp1], 8(%[src]) \n\t"
521 "ulw %[tp2], 8(%[dst]) \n\t"
522 "sw %[tn1], 0(%[dst]) \n\t" /* store */
523 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
524 "sw %[tn2], 4(%[dst]) \n\t" /* store */
525 "ulw %[tp3], 12(%[src]) \n\t"
526 "ulw %[tp4], 12(%[dst]) \n\t"
527 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
528 "sw %[tn1], 8(%[dst]) \n\t" /* store */
529 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
530 "sw %[tn2], 12(%[dst]) \n\t" /* store */
531
532 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
533 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
534 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
535 : [src] "r" (src), [dst] "r" (dst)
536 );
537
538 src += src_stride;
539 dst += dst_stride;
540 }
541 break;
542 case 32:
543 /* 8 word storage */
544 for (y = h; y--; ) {
545 prefetch_load(src + src_stride);
546 prefetch_load(src + src_stride + 32);
547 prefetch_store(dst + dst_stride);
548
549 __asm__ __volatile__ (
550 "ulw %[tp1], 0(%[src]) \n\t"
551 "ulw %[tp2], 0(%[dst]) \n\t"
552 "ulw %[tp3], 4(%[src]) \n\t"
553 "ulw %[tp4], 4(%[dst]) \n\t"
554 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
555 "ulw %[tp1], 8(%[src]) \n\t"
556 "ulw %[tp2], 8(%[dst]) \n\t"
557 "sw %[tn1], 0(%[dst]) \n\t" /* store */
558 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
559 "sw %[tn2], 4(%[dst]) \n\t" /* store */
560 "ulw %[tp3], 12(%[src]) \n\t"
561 "ulw %[tp4], 12(%[dst]) \n\t"
562 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
563 "ulw %[tp1], 16(%[src]) \n\t"
564 "ulw %[tp2], 16(%[dst]) \n\t"
565 "sw %[tn1], 8(%[dst]) \n\t" /* store */
566 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
567 "sw %[tn2], 12(%[dst]) \n\t" /* store */
568 "ulw %[tp3], 20(%[src]) \n\t"
569 "ulw %[tp4], 20(%[dst]) \n\t"
570 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
571 "ulw %[tp1], 24(%[src]) \n\t"
572 "ulw %[tp2], 24(%[dst]) \n\t"
573 "sw %[tn1], 16(%[dst]) \n\t" /* store */
574 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
575 "sw %[tn2], 20(%[dst]) \n\t" /* store */
576 "ulw %[tp3], 28(%[src]) \n\t"
577 "ulw %[tp4], 28(%[dst]) \n\t"
578 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
579 "sw %[tn1], 24(%[dst]) \n\t" /* store */
580 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
581 "sw %[tn2], 28(%[dst]) \n\t" /* store */
582
583 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
584 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
585 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
586 : [src] "r" (src), [dst] "r" (dst)
587 );
588
589 src += src_stride;
590 dst += dst_stride;
591 }
592 break;
593 case 64:
594 prefetch_load(src + 64);
595 prefetch_store(dst + 32);
596
597 /* 16 word storage */
598 for (y = h; y--; ) {
599 prefetch_load(src + src_stride);
600 prefetch_load(src + src_stride + 32);
601 prefetch_load(src + src_stride + 64);
602 prefetch_store(dst + dst_stride);
603 prefetch_store(dst + dst_stride + 32);
604
605 __asm__ __volatile__ (
606 "ulw %[tp1], 0(%[src]) \n\t"
607 "ulw %[tp2], 0(%[dst]) \n\t"
608 "ulw %[tp3], 4(%[src]) \n\t"
609 "ulw %[tp4], 4(%[dst]) \n\t"
610 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
611 "ulw %[tp1], 8(%[src]) \n\t"
612 "ulw %[tp2], 8(%[dst]) \n\t"
613 "sw %[tn1], 0(%[dst]) \n\t" /* store */
614 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
615 "sw %[tn2], 4(%[dst]) \n\t" /* store */
616 "ulw %[tp3], 12(%[src]) \n\t"
617 "ulw %[tp4], 12(%[dst]) \n\t"
618 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
619 "ulw %[tp1], 16(%[src]) \n\t"
620 "ulw %[tp2], 16(%[dst]) \n\t"
621 "sw %[tn1], 8(%[dst]) \n\t" /* store */
622 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
623 "sw %[tn2], 12(%[dst]) \n\t" /* store */
624 "ulw %[tp3], 20(%[src]) \n\t"
625 "ulw %[tp4], 20(%[dst]) \n\t"
626 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
627 "ulw %[tp1], 24(%[src]) \n\t"
628 "ulw %[tp2], 24(%[dst]) \n\t"
629 "sw %[tn1], 16(%[dst]) \n\t" /* store */
630 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
631 "sw %[tn2], 20(%[dst]) \n\t" /* store */
632 "ulw %[tp3], 28(%[src]) \n\t"
633 "ulw %[tp4], 28(%[dst]) \n\t"
634 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
635 "ulw %[tp1], 32(%[src]) \n\t"
636 "ulw %[tp2], 32(%[dst]) \n\t"
637 "sw %[tn1], 24(%[dst]) \n\t" /* store */
638 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
639 "sw %[tn2], 28(%[dst]) \n\t" /* store */
640 "ulw %[tp3], 36(%[src]) \n\t"
641 "ulw %[tp4], 36(%[dst]) \n\t"
642 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
643 "ulw %[tp1], 40(%[src]) \n\t"
644 "ulw %[tp2], 40(%[dst]) \n\t"
645 "sw %[tn1], 32(%[dst]) \n\t" /* store */
646 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
647 "sw %[tn2], 36(%[dst]) \n\t" /* store */
648 "ulw %[tp3], 44(%[src]) \n\t"
649 "ulw %[tp4], 44(%[dst]) \n\t"
650 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
651 "ulw %[tp1], 48(%[src]) \n\t"
652 "ulw %[tp2], 48(%[dst]) \n\t"
653 "sw %[tn1], 40(%[dst]) \n\t" /* store */
654 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
655 "sw %[tn2], 44(%[dst]) \n\t" /* store */
656 "ulw %[tp3], 52(%[src]) \n\t"
657 "ulw %[tp4], 52(%[dst]) \n\t"
658 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
659 "ulw %[tp1], 56(%[src]) \n\t"
660 "ulw %[tp2], 56(%[dst]) \n\t"
661 "sw %[tn1], 48(%[dst]) \n\t" /* store */
662 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
663 "sw %[tn2], 52(%[dst]) \n\t" /* store */
664 "ulw %[tp3], 60(%[src]) \n\t"
665 "ulw %[tp4], 60(%[dst]) \n\t"
666 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average * /
667 "sw %[tn1], 56(%[dst]) \n\t" /* store */
668 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average * /
669 "sw %[tn2], 60(%[dst]) \n\t" /* store */
670
671 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
672 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
673 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
674 : [src] "r" (src), [dst] "r" (dst)
675 );
676
677 src += src_stride;
678 dst += dst_stride;
679 }
680 break;
681 default:
682 for (y = h; y > 0; --y) {
683 for (x = 0; x < w; ++x) {
684 dst[x] = (dst[x] + src[x] + 1) >> 1;
685 }
686
687 src += src_stride;
688 dst += dst_stride;
689 }
690 break;
691 }
692 }
693 #endif
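
For reference, the DSPr2 vertical "avg" path above is intended to match the generic vpx_convolve8_avg_vert_c fallback it dispatches to for unsupported sizes: an 8-tap vertical filter with rounding (the vector4a = 64 accumulator seed is the 1 << (FILTER_BITS - 1) rounding term for FILTER_BITS == 7), a clamp to [0, 255] via the vpx_ff_cropTbl lookup, and a rounded average with the byte already in dst (addqh_r.w). A minimal scalar sketch of that behaviour for the y_step_q4 == 16 case is below; the helper names (avg_vert_scalar, clip_byte) are illustrative only and not part of this file.

/* Scalar sketch (illustrative, not part of the patch): per-pixel work of the
 * vertical filter-and-average path. */
static unsigned char clip_byte(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void avg_vert_scalar(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride,
                            const short *filter_y, int w, int h) {
  const int filter_bits = 7, taps = 8;
  int x, y, k;
  src -= 3 * src_stride;  /* same start offset applied by the DSPr2 functions */
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      int sum = 1 << (filter_bits - 1);  /* rounding term, i.e. vector4a = 64 */
      for (k = 0; k < taps; ++k)
        sum += src[k * src_stride + x] * filter_y[k];
      /* filter + round + clamp (crop table), then rounded average with dst */
      dst[x] = (unsigned char)((dst[x] + clip_byte(sum >> filter_bits) + 1) >> 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}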