source/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c - Issue 54923004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c

Issue 54923004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « source/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c ('k') | source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11 #include <assert.h>

	12 #include <stdio.h>

	13

	14 #include "./vpx_config.h"

	15 #include "./vp9_rtcd.h"

	16 #include "vp9/common/vp9_common.h"

	17 #include "vpx/vpx_integer.h"

	18 #include "vpx_ports/mem.h"

	19 #include "vp9/common/vp9_convolve.h"

	20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

	21

	22 #if HAVE_DSPR2

	23 static void convolve_bi_vert_4_dspr2(const uint8_t *src,

	24 int32_t src_stride,

	25 uint8_t *dst,

	26 int32_t dst_stride,

	27 const int16_t *filter_y,

	28 int32_t w,

	29 int32_t h) {

	30 int32_t x, y;

	31 const uint8_t *src_ptr;

	32 uint8_t *dst_ptr;

	33 uint8_t *cm = vp9_ff_cropTbl;

	34 uint32_t vector4a = 64;

	35 uint32_t load1, load2;

	36 uint32_t p1, p2;

	37 uint32_t scratch1;

	38 uint32_t store1, store2;

	39 int32_t Temp1, Temp2;

	40 const int16_t *filter = &filter_y[3];

	41 uint32_t filter45;

	42

	43 filter45 = ((const int32_t *)filter)[0];

	44

	45 for (y = h; y--;) {

	46 /* prefetch data to cache memory */

	47 vp9_prefetch_store(dst + dst_stride);

	48

	49 for (x = 0; x < w; x += 4) {

	50 src_ptr = src + x;

	51 dst_ptr = dst + x;

	52

	53 __asm__ __volatile__ (

	54 "ulw %[load1], 0(%[src_ptr]) \n\t"

	55 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"

	56 "ulw %[load2], 0(%[src_ptr]) \n\t"

	57

	58 "mtlo %[vector4a], $ac0 \n\t"

	59 "mtlo %[vector4a], $ac1 \n\t"

	60 "mtlo %[vector4a], $ac2 \n\t"

	61 "mtlo %[vector4a], $ac3 \n\t"

	62 "mthi $zero, $ac0 \n\t"

	63 "mthi $zero, $ac1 \n\t"

	64 "mthi $zero, $ac2 \n\t"

	65 "mthi $zero, $ac3 \n\t"

	66

	67 "preceu.ph.qbr %[scratch1], %[load1] \n\t"

	68 "preceu.ph.qbr %[p1], %[load2] \n\t"

	69

	70 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */

	71 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */

	72

	73 "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"

	74 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"

	75

	76 "preceu.ph.qbl %[scratch1], %[load1] \n\t"

	77 "preceu.ph.qbl %[p1], %[load2] \n\t"

	78

	79 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */

	80 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */

	81

	82 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"

	83 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"

	84

	85 "extp %[Temp1], $ac0, 31 \n\t"

	86 "extp %[Temp2], $ac1, 31 \n\t"

	87

	88 "lbux %[store1], %[Temp1](%[cm]) \n\t"

	89 "extp %[Temp1], $ac2, 31 \n\t"

	90

	91 "lbux %[store2], %[Temp2](%[cm]) \n\t"

	92 "extp %[Temp2], $ac3, 31 \n\t"

	93

	94 "sb %[store1], 0(%[dst_ptr]) \n\t"

	95 "sb %[store2], 1(%[dst_ptr]) \n\t"

	96

	97 "lbux %[store1], %[Temp1](%[cm]) \n\t"

	98 "lbux %[store2], %[Temp2](%[cm]) \n\t"

	99

	100 "sb %[store1], 2(%[dst_ptr]) \n\t"

	101 "sb %[store2], 3(%[dst_ptr]) \n\t"

	102

	103 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	104 [p1] "=&r" (p1), [p2] "=&r" (p2),

	105 [scratch1] "=&r" (scratch1),

	106 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),

	107 [store1] "=&r" (store1), [store2] "=&r" (store2),

	108 [src_ptr] "+r" (src_ptr)

	109 : [filter45] "r" (filter45),[vector4a] "r" (vector4a),

	110 [src_stride] "r" (src_stride),

	111 [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)

	112 );

	113 }

	114

	115 /* Next row... */

	116 src += src_stride;

	117 dst += dst_stride;

	118 }

	119 }

	120

	121 static void convolve_bi_vert_64_dspr2(const uint8_t *src,

	122 int32_t src_stride,

	123 uint8_t *dst,

	124 int32_t dst_stride,

	125 const int16_t *filter_y,

	126 int32_t h) {

	127 int32_t x, y;

	128 const uint8_t *src_ptr;

	129 uint8_t *dst_ptr;

	130 uint8_t *cm = vp9_ff_cropTbl;

	131 uint32_t vector4a = 64;

	132 uint32_t load1, load2;

	133 uint32_t p1, p2;

	134 uint32_t scratch1;

	135 uint32_t store1, store2;

	136 int32_t Temp1, Temp2;

	137 const int16_t *filter = &filter_y[3];

	138 uint32_t filter45;

	139

	140 filter45 = ((const int32_t *)filter)[0];

	141

	142 for (y = h; y--;) {

	143 /* prefetch data to cache memory */

	144 vp9_prefetch_store(dst + dst_stride);

	145

	146 for (x = 0; x < 64; x += 4) {

	147 src_ptr = src + x;

	148 dst_ptr = dst + x;

	149

	150 __asm__ __volatile__ (

	151 "ulw %[load1], 0(%[src_ptr]) \n\t"

	152 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"

	153 "ulw %[load2], 0(%[src_ptr]) \n\t"

	154

	155 "mtlo %[vector4a], $ac0 \n\t"

	156 "mtlo %[vector4a], $ac1 \n\t"

	157 "mtlo %[vector4a], $ac2 \n\t"

	158 "mtlo %[vector4a], $ac3 \n\t"

	159 "mthi $zero, $ac0 \n\t"

	160 "mthi $zero, $ac1 \n\t"

	161 "mthi $zero, $ac2 \n\t"

	162 "mthi $zero, $ac3 \n\t"

	163

	164 "preceu.ph.qbr %[scratch1], %[load1] \n\t"

	165 "preceu.ph.qbr %[p1], %[load2] \n\t"

	166

	167 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */

	168 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */

	169

	170 "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"

	171 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"

	172

	173 "preceu.ph.qbl %[scratch1], %[load1] \n\t"

	174 "preceu.ph.qbl %[p1], %[load2] \n\t"

	175

	176 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */

	177 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */

	178

	179 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"

	180 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"

	181

	182 "extp %[Temp1], $ac0, 31 \n\t"

	183 "extp %[Temp2], $ac1, 31 \n\t"

	184

	185 "lbux %[store1], %[Temp1](%[cm]) \n\t"

	186 "extp %[Temp1], $ac2, 31 \n\t"

	187

	188 "lbux %[store2], %[Temp2](%[cm]) \n\t"

	189 "extp %[Temp2], $ac3, 31 \n\t"

	190

	191 "sb %[store1], 0(%[dst_ptr]) \n\t"

	192 "sb %[store2], 1(%[dst_ptr]) \n\t"

	193

	194 "lbux %[store1], %[Temp1](%[cm]) \n\t"

	195 "lbux %[store2], %[Temp2](%[cm]) \n\t"

	196

	197 "sb %[store1], 2(%[dst_ptr]) \n\t"

	198 "sb %[store2], 3(%[dst_ptr]) \n\t"

	199

	200 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	201 [p1] "=&r" (p1), [p2] "=&r" (p2),

	202 [scratch1] "=&r" (scratch1),

	203 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),

	204 [store1] "=&r" (store1), [store2] "=&r" (store2),

	205 [src_ptr] "+r" (src_ptr)

	206 : [filter45] "r" (filter45),[vector4a] "r" (vector4a),

	207 [src_stride] "r" (src_stride),

	208 [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)

	209 );

	210 }

	211

	212 /* Next row... */

	213 src += src_stride;

	214 dst += dst_stride;

	215 }

	216 }

	217

	/*
	 * Entry point for the DSPr2 2-tap vertical convolution.
	 *
	 * When y_step_q4 is 16 and w is one of 4, 8, 16, 32 or 64, the work is done
	 * by the DSPr2 helpers in this file. Any other step or width is forwarded
	 * unchanged to vp9_convolve8_vert_c().
	 *
	 * filter_x and x_step_q4 are only used when forwarding to the C fallback.
	 */
	void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
	                              uint8_t *dst, ptrdiff_t dst_stride,
	                              const int16_t *filter_x, int x_step_q4,
	                              const int16_t *filter_y, int y_step_q4,
	                              int w, int h) {
	  uint32_t dsp_pos;

	  /* Scaled (non-unit step) convolution is not handled here. */
	  if (y_step_q4 != 16) {
	    vp9_convolve8_vert_c(src, src_stride,
	                         dst, dst_stride,
	                         filter_x, x_step_q4,
	                         filter_y, y_step_q4,
	                         w, h);
	    return;
	  }

	  /* bit position for extract from acc */
	  dsp_pos = 38;
	  __asm__ __volatile__ (
	    "wrdsp %[pos], 1 \n\t"
	    :
	    : [pos] "r" (dsp_pos)
	  );

	  vp9_prefetch_store(dst);

	  if (w == 64) {
	    /* The 64-wide case also prefetches the second half of the first row. */
	    vp9_prefetch_store(dst + 32);
	    convolve_bi_vert_64_dspr2(src, src_stride,
	                              dst, dst_stride,
	                              filter_y, h);
	  } else if (w == 4 || w == 8 || w == 16 || w == 32) {
	    convolve_bi_vert_4_dspr2(src, src_stride,
	                             dst, dst_stride,
	                             filter_y, w, h);
	  } else {
	    /* Unsupported width: use the generic C implementation. */
	    vp9_convolve8_vert_c(src, src_stride,
	                         dst, dst_stride,
	                         filter_x, x_step_q4,
	                         filter_y, y_step_q4,
	                         w, h);
	  }
	}

	266 #endif

OLD	NEW