source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c - Issue 54923004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c

Issue 54923004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11 #include <assert.h>

	12 #include <stdio.h>

	13

	14 #include "./vpx_config.h"

	15 #include "./vp9_rtcd.h"

	16 #include "vp9/common/vp9_common.h"

	17 #include "vp9/common/vp9_blockd.h"

	18 #include "vp9/common/vp9_idct.h"

	19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

	20

	21 #if HAVE_DSPR2

	22 static void idct32_1d_rows_dspr2(const int16_t input, int16_t output) {

	23 int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;

	24 int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;

	25 int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;

	26 int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;

	27 int16_t step1_28, step1_29, step1_30, step1_31;

	28 int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;

	29 int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;

	30 int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;

	31 int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;

	32 int16_t step2_28, step2_29, step2_30, step2_31;

	33 int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;

	34 int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;

	35 int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;

	36 int16_t step3_29, step3_30, step3_31;

	37 int temp0, temp1, temp2, temp3;

	38 int load1, load2, load3, load4;

	39 int result1, result2;

	40 int temp21;

	41 int i;

	42 const int const_2_power_13 = 8192;

	43 const int32_t *input_int;

	44

	45 for (i = 32; i--; ) {

	46 input_int = (const int32_t *)input;

	47

	48 if (!(input_int[0] \| input_int[1] \| input_int[2] \| input_int[3] \|

	49 input_int[4] \| input_int[5] \| input_int[6] \| input_int[7] \|

	50 input_int[8] \| input_int[9] \| input_int[10] \| input_int[11] \|

	51 input_int[12] \| input_int[13] \| input_int[14] \| input_int[15])) {

	52 input += 32;

	53

	54 __asm__ __volatile__ (

	55 "sh $zero, 0(%[output]) \n\t"

	56 "sh $zero, 64(%[output]) \n\t"

	57 "sh $zero, 128(%[output]) \n\t"

	58 "sh $zero, 192(%[output]) \n\t"

	59 "sh $zero, 256(%[output]) \n\t"

	60 "sh $zero, 320(%[output]) \n\t"

	61 "sh $zero, 384(%[output]) \n\t"

	62 "sh $zero, 448(%[output]) \n\t"

	63 "sh $zero, 512(%[output]) \n\t"

	64 "sh $zero, 576(%[output]) \n\t"

	65 "sh $zero, 640(%[output]) \n\t"

	66 "sh $zero, 704(%[output]) \n\t"

	67 "sh $zero, 768(%[output]) \n\t"

	68 "sh $zero, 832(%[output]) \n\t"

	69 "sh $zero, 896(%[output]) \n\t"

	70 "sh $zero, 960(%[output]) \n\t"

	71 "sh $zero, 1024(%[output]) \n\t"

	72 "sh $zero, 1088(%[output]) \n\t"

	73 "sh $zero, 1152(%[output]) \n\t"

	74 "sh $zero, 1216(%[output]) \n\t"

	75 "sh $zero, 1280(%[output]) \n\t"

	76 "sh $zero, 1344(%[output]) \n\t"

	77 "sh $zero, 1408(%[output]) \n\t"

	78 "sh $zero, 1472(%[output]) \n\t"

	79 "sh $zero, 1536(%[output]) \n\t"

	80 "sh $zero, 1600(%[output]) \n\t"

	81 "sh $zero, 1664(%[output]) \n\t"

	82 "sh $zero, 1728(%[output]) \n\t"

	83 "sh $zero, 1792(%[output]) \n\t"

	84 "sh $zero, 1856(%[output]) \n\t"

	85 "sh $zero, 1920(%[output]) \n\t"

	86 "sh $zero, 1984(%[output]) \n\t"

	87

	88 :

	89 : [output] "r" (output)

	90 );

	91

	92 output += 1;

	93

	94 continue;

	95 }

	96

	97 /* prefetch row */

	98 vp9_prefetch_load((const uint8_t *)(input + 32));

	99 vp9_prefetch_load((const uint8_t *)(input + 48));

	100

	101 __asm__ __volatile__ (

	102 "lh %[load1], 2(%[input]) \n\t"

	103 "lh %[load2], 62(%[input]) \n\t"

	104 "lh %[load3], 34(%[input]) \n\t"

	105 "lh %[load4], 30(%[input]) \n\t"

	106

	107 "mtlo %[const_2_power_13], $ac1 \n\t"

	108 "mthi $zero, $ac1 \n\t"

	109 "mtlo %[const_2_power_13], $ac3 \n\t"

	110 "mthi $zero, $ac3 \n\t"

	111

	112 "madd $ac1, %[load1], %[cospi_31_64] \n\t"

	113 "msub $ac1, %[load2], %[cospi_1_64] \n\t"

	114 "extp %[temp0], $ac1, 31 \n\t"

	115

	116 "madd $ac3, %[load1], %[cospi_1_64] \n\t"

	117 "madd $ac3, %[load2], %[cospi_31_64] \n\t"

	118 "extp %[temp3], $ac3, 31 \n\t"

	119

	120 "mtlo %[const_2_power_13], $ac1 \n\t"

	121 "mthi $zero, $ac1 \n\t"

	122 "mtlo %[const_2_power_13], $ac2 \n\t"

	123 "mthi $zero, $ac2 \n\t"

	124

	125 "madd $ac2, %[load3], %[cospi_15_64] \n\t"

	126 "msub $ac2, %[load4], %[cospi_17_64] \n\t"

	127 "extp %[temp1], $ac2, 31 \n\t"

	128

	129 "madd $ac1, %[load3], %[cospi_17_64] \n\t"

	130 "madd $ac1, %[load4], %[cospi_15_64] \n\t"

	131 "extp %[temp2], $ac1, 31 \n\t"

	132

	133 "mtlo %[const_2_power_13], $ac1 \n\t"

	134 "mthi $zero, $ac1 \n\t"

	135 "mtlo %[const_2_power_13], $ac3 \n\t"

	136 "mthi $zero, $ac3 \n\t"

	137

	138 "sub %[load1], %[temp3], %[temp2] \n\t"

	139 "sub %[load2], %[temp0], %[temp1] \n\t"

	140

	141 "madd $ac1, %[load1], %[cospi_28_64] \n\t"

	142 "msub $ac1, %[load2], %[cospi_4_64] \n\t"

	143 "madd $ac3, %[load1], %[cospi_4_64] \n\t"

	144 "madd $ac3, %[load2], %[cospi_28_64] \n\t"

	145

	146 "extp %[step1_17], $ac1, 31 \n\t"

	147 "extp %[step1_30], $ac3, 31 \n\t"

	148 "add %[step1_16], %[temp0], %[temp1] \n\t"

	149 "add %[step1_31], %[temp2], %[temp3] \n\t"

	150

	151 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	152 [load3] "=&r" (load3), [load4] "=&r" (load4),

	153 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	154 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	155 [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),

	156 [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)

	157 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	158 [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),

	159 [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),

	160 [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)

	161 );

	162

	163 __asm__ __volatile__ (

	164 "lh %[load1], 18(%[input]) \n\t"

	165 "lh %[load2], 46(%[input]) \n\t"

	166 "lh %[load3], 50(%[input]) \n\t"

	167 "lh %[load4], 14(%[input]) \n\t"

	168

	169 "mtlo %[const_2_power_13], $ac1 \n\t"

	170 "mthi $zero, $ac1 \n\t"

	171 "mtlo %[const_2_power_13], $ac3 \n\t"

	172 "mthi $zero, $ac3 \n\t"

	173

	174 "madd $ac1, %[load1], %[cospi_23_64] \n\t"

	175 "msub $ac1, %[load2], %[cospi_9_64] \n\t"

	176 "extp %[temp0], $ac1, 31 \n\t"

	177

	178 "madd $ac3, %[load1], %[cospi_9_64] \n\t"

	179 "madd $ac3, %[load2], %[cospi_23_64] \n\t"

	180 "extp %[temp3], $ac3, 31 \n\t"

	181

	182 "mtlo %[const_2_power_13], $ac1 \n\t"

	183 "mthi $zero, $ac1 \n\t"

	184 "mtlo %[const_2_power_13], $ac2 \n\t"

	185 "mthi $zero, $ac2 \n\t"

	186

	187 "madd $ac2, %[load3], %[cospi_7_64] \n\t"

	188 "msub $ac2, %[load4], %[cospi_25_64] \n\t"

	189 "extp %[temp1], $ac2, 31 \n\t"

	190

	191 "madd $ac1, %[load3], %[cospi_25_64] \n\t"

	192 "madd $ac1, %[load4], %[cospi_7_64] \n\t"

	193 "extp %[temp2], $ac1, 31 \n\t"

	194

	195 "mtlo %[const_2_power_13], $ac1 \n\t"

	196 "mthi $zero, $ac1 \n\t"

	197 "mtlo %[const_2_power_13], $ac3 \n\t"

	198 "mthi $zero, $ac3 \n\t"

	199

	200 "sub %[load1], %[temp1], %[temp0] \n\t"

	201 "sub %[load2], %[temp2], %[temp3] \n\t"

	202

	203 "msub $ac1, %[load1], %[cospi_28_64] \n\t"

	204 "msub $ac1, %[load2], %[cospi_4_64] \n\t"

	205 "msub $ac3, %[load1], %[cospi_4_64] \n\t"

	206 "madd $ac3, %[load2], %[cospi_28_64] \n\t"

	207

	208 "extp %[step1_18], $ac1, 31 \n\t"

	209 "extp %[step1_29], $ac3, 31 \n\t"

	210 "add %[step1_19], %[temp0], %[temp1] \n\t"

	211 "add %[step1_28], %[temp2], %[temp3] \n\t"

	212

	213 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	214 [load3] "=&r" (load3), [load4] "=&r" (load4),

	215 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	216 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	217 [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),

	218 [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)

	219 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	220 [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),

	221 [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),

	222 [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)

	223 );

	224

	225 __asm__ __volatile__ (

	226 "lh %[load1], 10(%[input]) \n\t"

	227 "lh %[load2], 54(%[input]) \n\t"

	228 "lh %[load3], 42(%[input]) \n\t"

	229 "lh %[load4], 22(%[input]) \n\t"

	230

	231 "mtlo %[const_2_power_13], $ac1 \n\t"

	232 "mthi $zero, $ac1 \n\t"

	233 "mtlo %[const_2_power_13], $ac3 \n\t"

	234 "mthi $zero, $ac3 \n\t"

	235

	236 "madd $ac1, %[load1], %[cospi_27_64] \n\t"

	237 "msub $ac1, %[load2], %[cospi_5_64] \n\t"

	238 "extp %[temp0], $ac1, 31 \n\t"

	239

	240 "madd $ac3, %[load1], %[cospi_5_64] \n\t"

	241 "madd $ac3, %[load2], %[cospi_27_64] \n\t"

	242 "extp %[temp3], $ac3, 31 \n\t"

	243

	244 "mtlo %[const_2_power_13], $ac1 \n\t"

	245 "mthi $zero, $ac1 \n\t"

	246 "mtlo %[const_2_power_13], $ac2 \n\t"

	247 "mthi $zero, $ac2 \n\t"

	248

	249 "madd $ac2, %[load3], %[cospi_11_64] \n\t"

	250 "msub $ac2, %[load4], %[cospi_21_64] \n\t"

	251 "extp %[temp1], $ac2, 31 \n\t"

	252

	253 "madd $ac1, %[load3], %[cospi_21_64] \n\t"

	254 "madd $ac1, %[load4], %[cospi_11_64] \n\t"

	255 "extp %[temp2], $ac1, 31 \n\t"

	256

	257 "mtlo %[const_2_power_13], $ac1 \n\t"

	258 "mthi $zero, $ac1 \n\t"

	259 "mtlo %[const_2_power_13], $ac3 \n\t"

	260 "mthi $zero, $ac3 \n\t"

	261

	262 "sub %[load1], %[temp0], %[temp1] \n\t"

	263 "sub %[load2], %[temp3], %[temp2] \n\t"

	264

	265 "madd $ac1, %[load2], %[cospi_12_64] \n\t"

	266 "msub $ac1, %[load1], %[cospi_20_64] \n\t"

	267 "madd $ac3, %[load1], %[cospi_12_64] \n\t"

	268 "madd $ac3, %[load2], %[cospi_20_64] \n\t"

	269

	270 "extp %[step1_21], $ac1, 31 \n\t"

	271 "extp %[step1_26], $ac3, 31 \n\t"

	272 "add %[step1_20], %[temp0], %[temp1] \n\t"

	273 "add %[step1_27], %[temp2], %[temp3] \n\t"

	274

	275 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	276 [load3] "=&r" (load3), [load4] "=&r" (load4),

	277 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	278 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	279 [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),

	280 [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)

	281 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	282 [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),

	283 [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),

	284 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

	285 );

	286

	287 __asm__ __volatile__ (

	288 "lh %[load1], 26(%[input]) \n\t"

	289 "lh %[load2], 38(%[input]) \n\t"

	290 "lh %[load3], 58(%[input]) \n\t"

	291 "lh %[load4], 6(%[input]) \n\t"

	292

	293 "mtlo %[const_2_power_13], $ac1 \n\t"

	294 "mthi $zero, $ac1 \n\t"

	295 "mtlo %[const_2_power_13], $ac3 \n\t"

	296 "mthi $zero, $ac3 \n\t"

	297

	298 "madd $ac1, %[load1], %[cospi_19_64] \n\t"

	299 "msub $ac1, %[load2], %[cospi_13_64] \n\t"

	300 "extp %[temp0], $ac1, 31 \n\t"

	301

	302 "madd $ac3, %[load1], %[cospi_13_64] \n\t"

	303 "madd $ac3, %[load2], %[cospi_19_64] \n\t"

	304 "extp %[temp3], $ac3, 31 \n\t"

	305

	306 "mtlo %[const_2_power_13], $ac1 \n\t"

	307 "mthi $zero, $ac1 \n\t"

	308 "mtlo %[const_2_power_13], $ac2 \n\t"

	309 "mthi $zero, $ac2 \n\t"

	310

	311 "madd $ac2, %[load3], %[cospi_3_64] \n\t"

	312 "msub $ac2, %[load4], %[cospi_29_64] \n\t"

	313 "extp %[temp1], $ac2, 31 \n\t"

	314

	315 "madd $ac1, %[load3], %[cospi_29_64] \n\t"

	316 "madd $ac1, %[load4], %[cospi_3_64] \n\t"

	317 "extp %[temp2], $ac1, 31 \n\t"

	318

	319 "mtlo %[const_2_power_13], $ac1 \n\t"

	320 "mthi $zero, $ac1 \n\t"

	321 "mtlo %[const_2_power_13], $ac3 \n\t"

	322 "mthi $zero, $ac3 \n\t"

	323

	324 "sub %[load1], %[temp1], %[temp0] \n\t"

	325 "sub %[load2], %[temp2], %[temp3] \n\t"

	326

	327 "msub $ac1, %[load1], %[cospi_12_64] \n\t"

	328 "msub $ac1, %[load2], %[cospi_20_64] \n\t"

	329 "msub $ac3, %[load1], %[cospi_20_64] \n\t"

	330 "madd $ac3, %[load2], %[cospi_12_64] \n\t"

	331

	332 "extp %[step1_22], $ac1, 31 \n\t"

	333 "extp %[step1_25], $ac3, 31 \n\t"

	334 "add %[step1_23], %[temp0], %[temp1] \n\t"

	335 "add %[step1_24], %[temp2], %[temp3] \n\t"

	336

	337 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	338 [load3] "=&r" (load3), [load4] "=&r" (load4),

	339 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	340 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	341 [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),

	342 [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)

	343 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	344 [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),

	345 [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),

	346 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

	347 );

	348

	349 __asm__ __volatile__ (

	350 "lh %[load1], 4(%[input]) \n\t"

	351 "lh %[load2], 60(%[input]) \n\t"

	352 "lh %[load3], 36(%[input]) \n\t"

	353 "lh %[load4], 28(%[input]) \n\t"

	354

	355 "mtlo %[const_2_power_13], $ac1 \n\t"

	356 "mthi $zero, $ac1 \n\t"

	357 "mtlo %[const_2_power_13], $ac3 \n\t"

	358 "mthi $zero, $ac3 \n\t"

	359

	360 "madd $ac1, %[load1], %[cospi_30_64] \n\t"

	361 "msub $ac1, %[load2], %[cospi_2_64] \n\t"

	362 "extp %[temp0], $ac1, 31 \n\t"

	363

	364 "madd $ac3, %[load1], %[cospi_2_64] \n\t"

	365 "madd $ac3, %[load2], %[cospi_30_64] \n\t"

	366 "extp %[temp3], $ac3, 31 \n\t"

	367

	368 "mtlo %[const_2_power_13], $ac1 \n\t"

	369 "mthi $zero, $ac1 \n\t"

	370 "mtlo %[const_2_power_13], $ac2 \n\t"

	371 "mthi $zero, $ac2 \n\t"

	372

	373 "madd $ac2, %[load3], %[cospi_14_64] \n\t"

	374 "msub $ac2, %[load4], %[cospi_18_64] \n\t"

	375 "extp %[temp1], $ac2, 31 \n\t"

	376

	377 "madd $ac1, %[load3], %[cospi_18_64] \n\t"

	378 "madd $ac1, %[load4], %[cospi_14_64] \n\t"

	379 "extp %[temp2], $ac1, 31 \n\t"

	380

	381 "mtlo %[const_2_power_13], $ac1 \n\t"

	382 "mthi $zero, $ac1 \n\t"

	383 "mtlo %[const_2_power_13], $ac3 \n\t"

	384 "mthi $zero, $ac3 \n\t"

	385

	386 "sub %[load1], %[temp0], %[temp1] \n\t"

	387 "sub %[load2], %[temp3], %[temp2] \n\t"

	388

	389 "msub $ac1, %[load1], %[cospi_8_64] \n\t"

	390 "madd $ac1, %[load2], %[cospi_24_64] \n\t"

	391 "madd $ac3, %[load1], %[cospi_24_64] \n\t"

	392 "madd $ac3, %[load2], %[cospi_8_64] \n\t"

	393

	394 "extp %[step2_9], $ac1, 31 \n\t"

	395 "extp %[step2_14], $ac3, 31 \n\t"

	396 "add %[step2_8], %[temp0], %[temp1] \n\t"

	397 "add %[step2_15], %[temp2], %[temp3] \n\t"

	398

	399 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	400 [load3] "=&r" (load3), [load4] "=&r" (load4),

	401 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	402 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	403 [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),

	404 [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)

	405 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	406 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

	407 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

	408 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

	409 );

	410

	411 __asm__ __volatile__ (

	412 "lh %[load1], 20(%[input]) \n\t"

	413 "lh %[load2], 44(%[input]) \n\t"

	414 "lh %[load3], 52(%[input]) \n\t"

	415 "lh %[load4], 12(%[input]) \n\t"

	416

	417 "mtlo %[const_2_power_13], $ac1 \n\t"

	418 "mthi $zero, $ac1 \n\t"

	419 "mtlo %[const_2_power_13], $ac3 \n\t"

	420 "mthi $zero, $ac3 \n\t"

	421

	422 "madd $ac1, %[load1], %[cospi_22_64] \n\t"

	423 "msub $ac1, %[load2], %[cospi_10_64] \n\t"

	424 "extp %[temp0], $ac1, 31 \n\t"

	425

	426 "madd $ac3, %[load1], %[cospi_10_64] \n\t"

	427 "madd $ac3, %[load2], %[cospi_22_64] \n\t"

	428 "extp %[temp3], $ac3, 31 \n\t"

	429

	430 "mtlo %[const_2_power_13], $ac1 \n\t"

	431 "mthi $zero, $ac1 \n\t"

	432 "mtlo %[const_2_power_13], $ac2 \n\t"

	433 "mthi $zero, $ac2 \n\t"

	434

	435 "madd $ac2, %[load3], %[cospi_6_64] \n\t"

	436 "msub $ac2, %[load4], %[cospi_26_64] \n\t"

	437 "extp %[temp1], $ac2, 31 \n\t"

	438

	439 "madd $ac1, %[load3], %[cospi_26_64] \n\t"

	440 "madd $ac1, %[load4], %[cospi_6_64] \n\t"

	441 "extp %[temp2], $ac1, 31 \n\t"

	442

	443 "mtlo %[const_2_power_13], $ac1 \n\t"

	444 "mthi $zero, $ac1 \n\t"

	445 "mtlo %[const_2_power_13], $ac3 \n\t"

	446 "mthi $zero, $ac3 \n\t"

	447

	448 "sub %[load1], %[temp1], %[temp0] \n\t"

	449 "sub %[load2], %[temp2], %[temp3] \n\t"

	450

	451 "msub $ac1, %[load1], %[cospi_24_64] \n\t"

	452 "msub $ac1, %[load2], %[cospi_8_64] \n\t"

	453 "madd $ac3, %[load2], %[cospi_24_64] \n\t"

	454 "msub $ac3, %[load1], %[cospi_8_64] \n\t"

	455

	456 "extp %[step2_10], $ac1, 31 \n\t"

	457 "extp %[step2_13], $ac3, 31 \n\t"

	458 "add %[step2_11], %[temp0], %[temp1] \n\t"

	459 "add %[step2_12], %[temp2], %[temp3] \n\t"

	460

	461 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	462 [load3] "=&r" (load3), [load4] "=&r" (load4),

	463 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	464 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	465 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

	466 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

	467 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	468 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

	469 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

	470 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

	471 );

	472

	473 __asm__ __volatile__ (

	474 "mtlo %[const_2_power_13], $ac0 \n\t"

	475 "mthi $zero, $ac0 \n\t"

	476 "sub %[temp0], %[step2_14], %[step2_13] \n\t"

	477 "sub %[temp0], %[temp0], %[step2_9] \n\t"

	478 "add %[temp0], %[temp0], %[step2_10] \n\t"

	479 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

	480

	481 "mtlo %[const_2_power_13], $ac1 \n\t"

	482 "mthi $zero, $ac1 \n\t"

	483 "sub %[temp1], %[step2_14], %[step2_13] \n\t"

	484 "add %[temp1], %[temp1], %[step2_9] \n\t"

	485 "sub %[temp1], %[temp1], %[step2_10] \n\t"

	486 "madd $ac1, %[temp1], %[cospi_16_64] \n\t"

	487

	488 "mtlo %[const_2_power_13], $ac2 \n\t"

	489 "mthi $zero, $ac2 \n\t"

	490 "sub %[temp0], %[step2_15], %[step2_12] \n\t"

	491 "sub %[temp0], %[temp0], %[step2_8] \n\t"

	492 "add %[temp0], %[temp0], %[step2_11] \n\t"

	493 "madd $ac2, %[temp0], %[cospi_16_64] \n\t"

	494

	495 "mtlo %[const_2_power_13], $ac3 \n\t"

	496 "mthi $zero, $ac3 \n\t"

	497 "sub %[temp1], %[step2_15], %[step2_12] \n\t"

	498 "add %[temp1], %[temp1], %[step2_8] \n\t"

	499 "sub %[temp1], %[temp1], %[step2_11] \n\t"

	500 "madd $ac3, %[temp1], %[cospi_16_64] \n\t"

	501

	502 "add %[step3_8], %[step2_8], %[step2_11] \n\t"

	503 "add %[step3_9], %[step2_9], %[step2_10] \n\t"

	504 "add %[step3_14], %[step2_13], %[step2_14] \n\t"

	505 "add %[step3_15], %[step2_12], %[step2_15] \n\t"

	506

	507 "extp %[step3_10], $ac0, 31 \n\t"

	508 "extp %[step3_13], $ac1, 31 \n\t"

	509 "extp %[step3_11], $ac2, 31 \n\t"

	510 "extp %[step3_12], $ac3, 31 \n\t"

	511

	512 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	513 [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),

	514 [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),

	515 [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),

	516 [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)

	517 : [const_2_power_13] "r" (const_2_power_13),

	518 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),

	519 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),

	520 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),

	521 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),

	522 [cospi_16_64] "r" (cospi_16_64)

	523 );

	524

	525 step2_18 = step1_17 - step1_18;

	526 step2_29 = step1_30 - step1_29;

	527

	528 __asm__ __volatile__ (

	529 "mtlo %[const_2_power_13], $ac0 \n\t"

	530 "mthi $zero, $ac0 \n\t"

	531 "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"

	532 "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"

	533 "extp %[step3_18], $ac0, 31 \n\t"

	534

	535 : [step3_18] "=r" (step3_18)

	536 : [const_2_power_13] "r" (const_2_power_13),

	537 [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),

	538 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

	539 );

	540

	541 temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;

	542 step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	543

	544 step2_19 = step1_16 - step1_19;

	545 step2_28 = step1_31 - step1_28;

	546

	547 __asm__ __volatile__ (

	548 "mtlo %[const_2_power_13], $ac0 \n\t"

	549 "mthi $zero, $ac0 \n\t"

	550 "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"

	551 "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"

	552 "extp %[step3_19], $ac0, 31 \n\t"

	553

	554 : [step3_19] "=r" (step3_19)

	555 : [const_2_power_13] "r" (const_2_power_13),

	556 [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),

	557 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

	558 );

	559

	560 temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;

	561 step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	562

	563 step3_16 = step1_16 + step1_19;

	564 step3_17 = step1_17 + step1_18;

	565 step3_30 = step1_29 + step1_30;

	566 step3_31 = step1_28 + step1_31;

	567

	568 step2_20 = step1_23 - step1_20;

	569 step2_27 = step1_24 - step1_27;

	570

	571 __asm__ __volatile__ (

	572 "mtlo %[const_2_power_13], $ac0 \n\t"

	573 "mthi $zero, $ac0 \n\t"

	574 "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"

	575 "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"

	576 "extp %[step3_20], $ac0, 31 \n\t"

	577

	578 : [step3_20] "=r" (step3_20)

	579 : [const_2_power_13] "r" (const_2_power_13),

	580 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

	581 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

	582 );

	583

	584 temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;

	585 step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	586

	587 step2_21 = step1_22 - step1_21;

	588 step2_26 = step1_25 - step1_26;

	589

	590 __asm__ __volatile__ (

	591 "mtlo %[const_2_power_13], $ac1 \n\t"

	592 "mthi $zero, $ac1 \n\t"

	593 "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"

	594 "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"

	595 "extp %[step3_21], $ac1, 31 \n\t"

	596

	597 : [step3_21] "=r" (step3_21)

	598 : [const_2_power_13] "r" (const_2_power_13),

	599 [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),

	600 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

	601 );

	602

	603 temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;

	604 step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	605

	606 step3_22 = step1_21 + step1_22;

	607 step3_23 = step1_20 + step1_23;

	608 step3_24 = step1_24 + step1_27;

	609 step3_25 = step1_25 + step1_26;

	610

	611 step2_16 = step3_16 + step3_23;

	612 step2_17 = step3_17 + step3_22;

	613 step2_18 = step3_18 + step3_21;

	614 step2_19 = step3_19 + step3_20;

	615 step2_20 = step3_19 - step3_20;

	616 step2_21 = step3_18 - step3_21;

	617 step2_22 = step3_17 - step3_22;

	618 step2_23 = step3_16 - step3_23;

	619

	620 step2_24 = step3_31 - step3_24;

	621 step2_25 = step3_30 - step3_25;

	622 step2_26 = step3_29 - step3_26;

	623 step2_27 = step3_28 - step3_27;

	624 step2_28 = step3_28 + step3_27;

	625 step2_29 = step3_29 + step3_26;

	626 step2_30 = step3_30 + step3_25;

	627 step2_31 = step3_31 + step3_24;

	628

	629 __asm__ __volatile__ (

	630 "lh %[load1], 0(%[input]) \n\t"

	631 "lh %[load2], 32(%[input]) \n\t"

	632 "lh %[load3], 16(%[input]) \n\t"

	633 "lh %[load4], 48(%[input]) \n\t"

	634

	635 "mtlo %[const_2_power_13], $ac1 \n\t"

	636 "mthi $zero, $ac1 \n\t"

	637 "mtlo %[const_2_power_13], $ac2 \n\t"

	638 "mthi $zero, $ac2 \n\t"

	639 "add %[result1], %[load1], %[load2] \n\t"

	640 "sub %[result2], %[load1], %[load2] \n\t"

	641 "madd $ac1, %[result1], %[cospi_16_64] \n\t"

	642 "madd $ac2, %[result2], %[cospi_16_64] \n\t"

	643 "extp %[temp0], $ac1, 31 \n\t"

	644 "extp %[temp1], $ac2, 31 \n\t"

	645

	646 "mtlo %[const_2_power_13], $ac3 \n\t"

	647 "mthi $zero, $ac3 \n\t"

	648 "madd $ac3, %[load3], %[cospi_24_64] \n\t"

	649 "msub $ac3, %[load4], %[cospi_8_64] \n\t"

	650 "extp %[temp2], $ac3, 31 \n\t"

	651

	652 "mtlo %[const_2_power_13], $ac1 \n\t"

	653 "mthi $zero, $ac1 \n\t"

	654 "madd $ac1, %[load3], %[cospi_8_64] \n\t"

	655 "madd $ac1, %[load4], %[cospi_24_64] \n\t"

	656 "extp %[temp3], $ac1, 31 \n\t"

	657

	658 "add %[step1_0], %[temp0], %[temp3] \n\t"

	659 "add %[step1_1], %[temp1], %[temp2] \n\t"

	660 "sub %[step1_2], %[temp1], %[temp2] \n\t"

	661 "sub %[step1_3], %[temp0], %[temp3] \n\t"

	662

	663 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	664 [load3] "=&r" (load3), [load4] "=&r" (load4),

	665 [result1] "=&r" (result1), [result2] "=&r" (result2),

	666 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	667 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	668 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

	669 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

	670 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	671 [cospi_16_64] "r" (cospi_16_64),

	672 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

	673

	674 );

	675

	676 __asm__ __volatile__ (

	677 "lh %[load1], 8(%[input]) \n\t"

	678 "lh %[load2], 56(%[input]) \n\t"

	679 "lh %[load3], 40(%[input]) \n\t"

	680 "lh %[load4], 24(%[input]) \n\t"

	681

	682 "mtlo %[const_2_power_13], $ac1 \n\t"

	683 "mthi $zero, $ac1 \n\t"

	684 "mtlo %[const_2_power_13], $ac3 \n\t"

	685 "mthi $zero, $ac3 \n\t"

	686

	687 "madd $ac1, %[load1], %[cospi_28_64] \n\t"

	688 "msub $ac1, %[load2], %[cospi_4_64] \n\t"

	689 "extp %[temp0], $ac1, 31 \n\t"

	690

	691 "madd $ac3, %[load1], %[cospi_4_64] \n\t"

	692 "madd $ac3, %[load2], %[cospi_28_64] \n\t"

	693 "extp %[temp3], $ac3, 31 \n\t"

	694

	695 "mtlo %[const_2_power_13], $ac1 \n\t"

	696 "mthi $zero, $ac1 \n\t"

	697 "mtlo %[const_2_power_13], $ac2 \n\t"

	698 "mthi $zero, $ac2 \n\t"

	699

	700 "madd $ac2, %[load3], %[cospi_12_64] \n\t"

	701 "msub $ac2, %[load4], %[cospi_20_64] \n\t"

	702 "extp %[temp1], $ac2, 31 \n\t"

	703

	704 "madd $ac1, %[load3], %[cospi_20_64] \n\t"

	705 "madd $ac1, %[load4], %[cospi_12_64] \n\t"

	706 "extp %[temp2], $ac1, 31 \n\t"

	707

	708 "mtlo %[const_2_power_13], $ac1 \n\t"

	709 "mthi $zero, $ac1 \n\t"

	710 "mtlo %[const_2_power_13], $ac3 \n\t"

	711 "mthi $zero, $ac3 \n\t"

	712

	713 "sub %[load1], %[temp3], %[temp2] \n\t"

	714 "sub %[load1], %[load1], %[temp0] \n\t"

	715 "add %[load1], %[load1], %[temp1] \n\t"

	716

	717 "sub %[load2], %[temp0], %[temp1] \n\t"

	718 "sub %[load2], %[load2], %[temp2] \n\t"

	719 "add %[load2], %[load2], %[temp3] \n\t"

	720

	721 "madd $ac1, %[load1], %[cospi_16_64] \n\t"

	722 "madd $ac3, %[load2], %[cospi_16_64] \n\t"

	723

	724 "extp %[step1_5], $ac1, 31 \n\t"

	725 "extp %[step1_6], $ac3, 31 \n\t"

	726 "add %[step1_4], %[temp0], %[temp1] \n\t"

	727 "add %[step1_7], %[temp3], %[temp2] \n\t"

	728

	729 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	730 [load3] "=&r" (load3), [load4] "=&r" (load4),

	731 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	732 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	733 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

	734 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

	735 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	736 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

	737 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

	738 [cospi_16_64] "r" (cospi_16_64)

	739 );

	740

	741 step2_0 = step1_0 + step1_7;

	742 step2_1 = step1_1 + step1_6;

	743 step2_2 = step1_2 + step1_5;

	744 step2_3 = step1_3 + step1_4;

	745 step2_4 = step1_3 - step1_4;

	746 step2_5 = step1_2 - step1_5;

	747 step2_6 = step1_1 - step1_6;

	748 step2_7 = step1_0 - step1_7;

	749

	750 step1_0 = step2_0 + step3_15;

	751 step1_1 = step2_1 + step3_14;

	752 step1_2 = step2_2 + step3_13;

	753 step1_3 = step2_3 + step3_12;

	754 step1_4 = step2_4 + step3_11;

	755 step1_5 = step2_5 + step3_10;

	756 step1_6 = step2_6 + step3_9;

	757 step1_7 = step2_7 + step3_8;

	758 step1_8 = step2_7 - step3_8;

	759 step1_9 = step2_6 - step3_9;

	760 step1_10 = step2_5 - step3_10;

	761 step1_11 = step2_4 - step3_11;

	762 step1_12 = step2_3 - step3_12;

	763 step1_13 = step2_2 - step3_13;

	764 step1_14 = step2_1 - step3_14;

	765 step1_15 = step2_0 - step3_15;

	766

	767 __asm__ __volatile__ (

	768 "sub %[temp0], %[step2_27], %[step2_20] \n\t"

	769 "mtlo %[const_2_power_13], $ac0 \n\t"

	770 "mthi $zero, $ac0 \n\t"

	771 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

	772 "extp %[step1_20], $ac0, 31 \n\t"

	773

	774 : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)

	775 : [const_2_power_13] "r" (const_2_power_13),

	776 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

	777 [cospi_16_64] "r" (cospi_16_64)

	778 );

	779

	780 temp21 = (step2_20 + step2_27) * cospi_16_64;

	781 step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	782

	783 __asm__ __volatile__ (

	784 "sub %[temp0], %[step2_26], %[step2_21] \n\t"

	785 "mtlo %[const_2_power_13], $ac0 \n\t"

	786 "mthi $zero, $ac0 \n\t"

	787 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

	788 "extp %[step1_21], $ac0, 31 \n\t"

	789

	790 : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)

	791 : [const_2_power_13] "r" (const_2_power_13),

	792 [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),

	793 [cospi_16_64] "r" (cospi_16_64)

	794 );

	795

	796 temp21 = (step2_21 + step2_26) * cospi_16_64;

	797 step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	798

	799 __asm__ __volatile__ (

	800 "sub %[temp0], %[step2_25], %[step2_22] \n\t"

	801 "mtlo %[const_2_power_13], $ac0 \n\t"

	802 "mthi $zero, $ac0 \n\t"

	803 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

	804 "extp %[step1_22], $ac0, 31 \n\t"

	805

	806 : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)

	807 : [const_2_power_13] "r" (const_2_power_13),

	808 [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),

	809 [cospi_16_64] "r" (cospi_16_64)

	810 );

	811

	812 temp21 = (step2_22 + step2_25) * cospi_16_64;

	813 step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	814

	815 __asm__ __volatile__ (

	816 "sub %[temp0], %[step2_24], %[step2_23] \n\t"

	817 "mtlo %[const_2_power_13], $ac0 \n\t"

	818 "mthi $zero, $ac0 \n\t"

	819 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

	820 "extp %[step1_23], $ac0, 31 \n\t"

	821

	822 : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)

	823 : [const_2_power_13] "r" (const_2_power_13),

	824 [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),

	825 [cospi_16_64] "r" (cospi_16_64)

	826 );

	827

	828 temp21 = (step2_23 + step2_24) * cospi_16_64;

	829 step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	830

	831 // final stage

	832 output[0 * 32] = step1_0 + step2_31;

	833 output[1 * 32] = step1_1 + step2_30;

	834 output[2 * 32] = step1_2 + step2_29;

	835 output[3 * 32] = step1_3 + step2_28;

	836 output[4 * 32] = step1_4 + step1_27;

	837 output[5 * 32] = step1_5 + step1_26;

	838 output[6 * 32] = step1_6 + step1_25;

	839 output[7 * 32] = step1_7 + step1_24;

	840 output[8 * 32] = step1_8 + step1_23;

	841 output[9 * 32] = step1_9 + step1_22;

	842 output[10 * 32] = step1_10 + step1_21;

	843 output[11 * 32] = step1_11 + step1_20;

	844 output[12 * 32] = step1_12 + step2_19;

	845 output[13 * 32] = step1_13 + step2_18;

	846 output[14 * 32] = step1_14 + step2_17;

	847 output[15 * 32] = step1_15 + step2_16;

	848 output[16 * 32] = step1_15 - step2_16;

	849 output[17 * 32] = step1_14 - step2_17;

	850 output[18 * 32] = step1_13 - step2_18;

	851 output[19 * 32] = step1_12 - step2_19;

	852 output[20 * 32] = step1_11 - step1_20;

	853 output[21 * 32] = step1_10 - step1_21;

	854 output[22 * 32] = step1_9 - step1_22;

	855 output[23 * 32] = step1_8 - step1_23;

	856 output[24 * 32] = step1_7 - step1_24;

	857 output[25 * 32] = step1_6 - step1_25;

	858 output[26 * 32] = step1_5 - step1_26;

	859 output[27 * 32] = step1_4 - step1_27;

	860 output[28 * 32] = step1_3 - step2_28;

	861 output[29 * 32] = step1_2 - step2_29;

	862 output[30 * 32] = step1_1 - step2_30;

	863 output[31 * 32] = step1_0 - step2_31;

	864

	865 input += 32;

	866 output += 1;

	867 }

	868 }

	869

	870 void vp9_idct32x32_1024_add_dspr2(const int16_t input, uint8_t dest,

	871 int dest_stride) {

	872 DECLARE_ALIGNED(32, int16_t, out[32 * 32]);

	873 int16_t *outptr = out;

	874 uint32_t pos = 45;

	875

	876 /* bit positon for extract from acc */

	877 __asm__ __volatile__ (

	878 "wrdsp %[pos], 1 \n\t"

	879 :

	880 : [pos] "r" (pos)

	881 );

	882

	883 // Rows

	884 idct32_1d_rows_dspr2(input, outptr);

	885

	886 // Columns

	887 vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);

	888 }

	889

	890 void vp9_idct32x32_1_add_dspr2(const int16_t input, uint8_t dest,

	891 int stride) {

	892 int r, out;

	893 int32_t a1, absa1;

	894 int32_t vector_a1;

	895 int32_t t1, t2, t3, t4;

	896 int32_t vector_1, vector_2, vector_3, vector_4;

	897 uint32_t pos = 45;

	898

	899 /* bit positon for extract from acc */

	900 __asm__ __volatile__ (

	901 "wrdsp %[pos], 1 \n\t"

	902

	903 :

	904 : [pos] "r" (pos)

	905 );

	906

	907 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

	908 __asm__ __volatile__ (

	909 "addi %[out], %[out], 32 \n\t"

	910 "sra %[a1], %[out], 6 \n\t"

	911

	912 : [out] "+r" (out), [a1] "=r" (a1)

	913 :

	914 );

	915

	916 if (a1 < 0) {

	917 /* use quad-byte

	918 * input and output memory are four byte aligned */

	919 __asm__ __volatile__ (

	920 "abs %[absa1], %[a1] \n\t"

	921 "replv.qb %[vector_a1], %[absa1] \n\t"

	922

	923 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)

	924 : [a1] "r" (a1)

	925 );

	926

	927 for (r = 32; r--;) {

	928 __asm__ __volatile__ (

	929 "lw %[t1], 0(%[dest]) \n\t"

	930 "lw %[t2], 4(%[dest]) \n\t"

	931 "lw %[t3], 8(%[dest]) \n\t"

	932 "lw %[t4], 12(%[dest]) \n\t"

	933 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"

	934 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"

	935 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"

	936 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"

	937 "sw %[vector_1], 0(%[dest]) \n\t"

	938 "sw %[vector_2], 4(%[dest]) \n\t"

	939 "sw %[vector_3], 8(%[dest]) \n\t"

	940 "sw %[vector_4], 12(%[dest]) \n\t"

	941

	942 "lw %[t1], 16(%[dest]) \n\t"

	943 "lw %[t2], 20(%[dest]) \n\t"

	944 "lw %[t3], 24(%[dest]) \n\t"

	945 "lw %[t4], 28(%[dest]) \n\t"

	946 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"

	947 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"

	948 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"

	949 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"

	950 "sw %[vector_1], 16(%[dest]) \n\t"

	951 "sw %[vector_2], 20(%[dest]) \n\t"

	952 "sw %[vector_3], 24(%[dest]) \n\t"

	953 "sw %[vector_4], 28(%[dest]) \n\t"

	954

	955 "add %[dest], %[dest], %[stride] \n\t"

	956

	957 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

	958 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

	959 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

	960 [dest] "+&r" (dest)

	961 : [stride] "r" (stride), [vector_a1] "r" (vector_a1)

	962 );

	963 }

	964 } else {

	965 /* use quad-byte

	966 * input and output memory are four byte aligned */

	967 __asm__ __volatile__ (

	968 "replv.qb %[vector_a1], %[a1] \n\t"

	969

	970 : [vector_a1] "=r" (vector_a1)

	971 : [a1] "r" (a1)

	972 );

	973

	974 for (r = 32; r--;) {

	975 __asm__ __volatile__ (

	976 "lw %[t1], 0(%[dest]) \n\t"

	977 "lw %[t2], 4(%[dest]) \n\t"

	978 "lw %[t3], 8(%[dest]) \n\t"

	979 "lw %[t4], 12(%[dest]) \n\t"

	980 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"

	981 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"

	982 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"

	983 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"

	984 "sw %[vector_1], 0(%[dest]) \n\t"

	985 "sw %[vector_2], 4(%[dest]) \n\t"

	986 "sw %[vector_3], 8(%[dest]) \n\t"

	987 "sw %[vector_4], 12(%[dest]) \n\t"

	988

	989 "lw %[t1], 16(%[dest]) \n\t"

	990 "lw %[t2], 20(%[dest]) \n\t"

	991 "lw %[t3], 24(%[dest]) \n\t"

	992 "lw %[t4], 28(%[dest]) \n\t"

	993 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"

	994 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"

	995 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"

	996 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"

	997 "sw %[vector_1], 16(%[dest]) \n\t"

	998 "sw %[vector_2], 20(%[dest]) \n\t"

	999 "sw %[vector_3], 24(%[dest]) \n\t"

	1000 "sw %[vector_4], 28(%[dest]) \n\t"

	1001

	1002 "add %[dest], %[dest], %[stride] \n\t"

	1003

	1004 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

	1005 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

	1006 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

	1007 [dest] "+&r" (dest)

	1008 : [stride] "r" (stride), [vector_a1] "r" (vector_a1)

	1009 );

	1010 }

	1011 }

	1012 }

	1013 #endif // #if HAVE_DSPR2

OLD	NEW