/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
static void convolve_avg_vert_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t w,
                                      int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

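  /*
   * Scalar sketch of what the assembly below computes for each output
   * pixel (cf. the generic vp9_convolve8_avg_vert_c; clip_pixel is the
   * byte clamp that the vp9_ff_cropTbl lookup implements):
   *
   *   sum = 64;                                      // rounding term
   *   for (k = 0; k < 8; k++)                        // 8-tap filter
   *     sum += src[k * src_stride + x] * filter_y[k];
   *   v = clip_pixel(sum >> 7);                      // FILTER_BITS == 7
   *   dst[x] = (dst[x] + v + 1) >> 1;                // rounded average
   *
   * The eight 16-bit taps were read above as four packed int32 words so
   * that each dpa.w.ph can apply two adjacent taps per accumulator.
   */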
  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load2],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load3],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load4],     0(%[src_ptr])                  \n\t"

          "mtlo             %[vector4a],  $ac0                           \n\t"
          "mtlo             %[vector4a],  $ac1                           \n\t"
          "mtlo             %[vector4a],  $ac2                           \n\t"
          "mtlo             %[vector4a],  $ac3                           \n\t"
          "mthi             $zero,        $ac0                           \n\t"
          "mthi             $zero,        $ac1                           \n\t"
          "mthi             $zero,        $ac2                           \n\t"
          "mthi             $zero,        $ac3                           \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                       \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                       \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16             \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                       \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                       \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]    \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]    \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]    \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]    \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                       \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                       \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16             \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                       \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                       \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]    \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]    \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]    \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]    \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load1],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load2],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load3],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load4],     0(%[src_ptr])                  \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                       \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                       \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16             \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                       \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                       \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]    \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]    \n\t"
          "extp             %[Temp1],     $ac0,           31             \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]    \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]    \n\t"
          "extp             %[Temp2],     $ac1,           31             \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                       \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                       \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16             \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                  \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                       \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                       \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16             \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                  \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]    \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]    \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]    \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31             \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]    \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]    \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]    \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31             \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                  \n\t"

          "sb               %[store1],    0(%[dst_ptr])                  \n\t"
          "sb               %[store2],    1(%[dst_ptr])                  \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                  \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]    \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]    \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                  \n\t"
          "sb               %[store2],    3(%[dst_ptr])                  \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

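/* Same 8-tap vertical filter as above, specialized for w == 64: the column
 * loop bound is a constant and a second cache line of each dst row is
 * prefetched. */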
static void convolve_avg_vert_64_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_y,
                                       int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);
    vp9_prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load2],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load3],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load4],     0(%[src_ptr])                  \n\t"

          "mtlo             %[vector4a],  $ac0                           \n\t"
          "mtlo             %[vector4a],  $ac1                           \n\t"
          "mtlo             %[vector4a],  $ac2                           \n\t"
          "mtlo             %[vector4a],  $ac3                           \n\t"
          "mthi             $zero,        $ac0                           \n\t"
          "mthi             $zero,        $ac1                           \n\t"
          "mthi             $zero,        $ac2                           \n\t"
          "mthi             $zero,        $ac3                           \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                       \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                       \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16             \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                       \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                       \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]    \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]    \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]    \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]    \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                       \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                       \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16             \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                       \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                       \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]    \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]    \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]    \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]    \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load1],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load2],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load3],     0(%[src_ptr])                  \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]  \n\t"
          "ulw              %[load4],     0(%[src_ptr])                  \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                       \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                       \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16             \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                       \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                       \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]    \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]    \n\t"
          "extp             %[Temp1],     $ac0,           31             \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]    \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]    \n\t"
          "extp             %[Temp2],     $ac1,           31             \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                       \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                       \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16             \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                  \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                       \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                       \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16             \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                  \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]    \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]    \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]    \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31             \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]    \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]    \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]    \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31             \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                  \n\t"

          "sb               %[store1],    0(%[dst_ptr])                  \n\t"
          "sb               %[store2],    1(%[dst_ptr])                  \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                  \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]    \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]    \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                  \n\t"
          "sb               %[store2],    3(%[dst_ptr])                  \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

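/* Dispatch on the filter actually requested: a pure copy filter (tap 3 ==
 * 128, the rest zero, as encoded in the VP9 sub-pel filter banks)
 * degenerates to vp9_convolve_avg; a filter whose first tap pair is zero
 * can use the 2-tap kernel; only unscaled (y_step_q4 == 16) 8-tap cases
 * with a supported width run the DSPr2 kernels above, and everything else
 * falls back to the C implementation. */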
void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vp9_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vp9_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
    if (16 == y_step_q4) {
      uint32_t pos = 38;

      /* bit position for extract from acc */
      __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
      );

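      /* With pos == 38 written to DSPControl, each "extp %[TempN], $acN, 31"
       * in the kernels above extracts bits 38..7 of the 64-bit accumulator,
       * i.e. it applies the (sum + 64) >> 7 rounding shift in a single
       * instruction (per the MIPS DSP ASE wrdsp/extp semantics). */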
      vp9_prefetch_store(dst);

      switch (w) {
        case 4:
        case 8:
        case 16:
        case 32:
          convolve_avg_vert_4_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_y, w, h);
          break;
        case 64:
          vp9_prefetch_store(dst + 32);
          convolve_avg_vert_64_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_y, h);
          break;
        default:
          vp9_convolve8_avg_vert_c(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, x_step_q4,
                                   filter_y, y_step_q4,
                                   w, h);
          break;
      }
    } else {
      vp9_convolve8_avg_vert_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
    }
  }
}

void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

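  /* Sizing sketch: with h <= 64 and y_step_q4 bounded by 32 (the usual
   * libvpx scaling limit, an assumption here), intermediate_height is at
   * most (64 * 32 >> 4) + 7 = 135 rows of at most 64 pixels, which the
   * temp buffer above accommodates. */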
  assert(w <= 64);
  assert(h <= 64);

  if (intermediate_height < h)
    intermediate_height = h;

  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vp9_convolve8_avg_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);

  vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, intermediate_height);

  vp9_convolve8_avg_vert(temp + 64 * 3, 64,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
}

void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride,
                            int w, int h) {
  int x, y;
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;

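  /* Each adduh_r.qb below is a rounded halving add on four packed bytes --
   * per byte: (dst + src + 1) >> 1 -- i.e. the same math as the scalar
   * fallback in the default case, four pixels at a time. */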
  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      vp9_prefetch_load(src + 64);
      vp9_prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_load(src + src_stride + 64);
        vp9_prefetch_store(dst + dst_stride);
        vp9_prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         32(%[dst])     \n\t"
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         36(%[src])     \n\t"
            "ulw              %[tp4],         36(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         40(%[src])     \n\t"
            "ulw              %[tp2],         40(%[dst])     \n\t"
            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         44(%[src])     \n\t"
            "ulw              %[tp4],         44(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         48(%[src])     \n\t"
            "ulw              %[tp2],         48(%[dst])     \n\t"
            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         52(%[src])     \n\t"
            "ulw              %[tp4],         52(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         56(%[src])     \n\t"
            "ulw              %[tp2],         56(%[dst])     \n\t"
            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         60(%[src])     \n\t"
            "ulw              %[tp4],         60(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    default:
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif