source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c - Issue 54923004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c

Issue 54923004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11 #include <assert.h>

	12

	13 #include "./vpx_config.h"

	14 #include "./vp9_rtcd.h"

	15 #include "vp9/common/vp9_common.h"

	16 #include "vp9/common/vp9_blockd.h"

	17 #include "vp9/common/vp9_idct.h"

	18 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

	19

	20 #if HAVE_DSPR2

	21 void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

	22 int dest_stride) {

	23 int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;

	24 int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;

	25 int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;

	26 int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;

	27 int16_t step1_27, step1_28, step1_29, step1_30, step1_31;

	28 int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;

	29 int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;

	30 int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;

	31 int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;

	32 int16_t step2_28, step2_29, step2_30, step2_31;

	33 int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;

	34 int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;

	35 int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;

	36 int16_t step3_28, step3_29, step3_30, step3_31;

	37 int temp0, temp1, temp2, temp3;

	38 int load1, load2, load3, load4;

	39 int result1, result2;

	40 int i, temp21;

	41 uint8_t *dest_pix, *dest_pix1;

	42 const int const_2_power_13 = 8192;

	43 uint8_t *cm = vp9_ff_cropTbl;

	44

	45 /* prefetch vp9_ff_cropTbl */

	46 vp9_prefetch_load(vp9_ff_cropTbl);

	47 vp9_prefetch_load(vp9_ff_cropTbl + 32);

	48 vp9_prefetch_load(vp9_ff_cropTbl + 64);

	49 vp9_prefetch_load(vp9_ff_cropTbl + 96);

	50 vp9_prefetch_load(vp9_ff_cropTbl + 128);

	51 vp9_prefetch_load(vp9_ff_cropTbl + 160);

	52 vp9_prefetch_load(vp9_ff_cropTbl + 192);

	53 vp9_prefetch_load(vp9_ff_cropTbl + 224);

	54

	55 for (i = 0; i < 32; ++i) {

	56 dest_pix = dest + i;

	57 dest_pix1 = dest + i + 31 * dest_stride;

	58

	59 __asm__ __volatile__ (

	60 "lh %[load1], 2(%[input]) \n\t"

	61 "lh %[load2], 62(%[input]) \n\t"

	62 "lh %[load3], 34(%[input]) \n\t"

	63 "lh %[load4], 30(%[input]) \n\t"

	64

	65 "mtlo %[const_2_power_13], $ac1 \n\t"

	66 "mthi $zero, $ac1 \n\t"

	67 "mtlo %[const_2_power_13], $ac3 \n\t"

	68 "mthi $zero, $ac3 \n\t"

	69

	70 "madd $ac1, %[load1], %[cospi_31_64] \n\t"

	71 "msub $ac1, %[load2], %[cospi_1_64] \n\t"

	72 "extp %[temp0], $ac1, 31 \n\t"

	73

	74 "madd $ac3, %[load1], %[cospi_1_64] \n\t"

	75 "madd $ac3, %[load2], %[cospi_31_64] \n\t"

	76 "extp %[temp3], $ac3, 31 \n\t"

	77

	78 "mtlo %[const_2_power_13], $ac1 \n\t"

	79 "mthi $zero, $ac1 \n\t"

	80 "mtlo %[const_2_power_13], $ac2 \n\t"

	81 "mthi $zero, $ac2 \n\t"

	82

	83 "madd $ac2, %[load3], %[cospi_15_64] \n\t"

	84 "msub $ac2, %[load4], %[cospi_17_64] \n\t"

	85 "extp %[temp1], $ac2, 31 \n\t"

	86

	87 "madd $ac1, %[load3], %[cospi_17_64] \n\t"

	88 "madd $ac1, %[load4], %[cospi_15_64] \n\t"

	89 "extp %[temp2], $ac1, 31 \n\t"

	90

	91 "mtlo %[const_2_power_13], $ac1 \n\t"

	92 "mthi $zero, $ac1 \n\t"

	93 "mtlo %[const_2_power_13], $ac3 \n\t"

	94 "mthi $zero, $ac3 \n\t"

	95

	96 "sub %[load1], %[temp3], %[temp2] \n\t"

	97 "sub %[load2], %[temp0], %[temp1] \n\t"

	98

	99 "madd $ac1, %[load1], %[cospi_28_64] \n\t"

	100 "msub $ac1, %[load2], %[cospi_4_64] \n\t"

	101 "madd $ac3, %[load1], %[cospi_4_64] \n\t"

	102 "madd $ac3, %[load2], %[cospi_28_64] \n\t"

	103

	104 "extp %[step1_17], $ac1, 31 \n\t"

	105 "extp %[step1_30], $ac3, 31 \n\t"

	106 "add %[step1_16], %[temp0], %[temp1] \n\t"

	107 "add %[step1_31], %[temp2], %[temp3] \n\t"

	108

	109 : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

	110 [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	111 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	112 [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),

	113 [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)

	114 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	115 [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),

	116 [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),

	117 [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)

	118 );

	119

	120 __asm__ __volatile__ (

	121 "lh %[load1], 18(%[input]) \n\t"

	122 "lh %[load2], 46(%[input]) \n\t"

	123 "lh %[load3], 50(%[input]) \n\t"

	124 "lh %[load4], 14(%[input]) \n\t"

	125

	126 "mtlo %[const_2_power_13], $ac1 \n\t"

	127 "mthi $zero, $ac1 \n\t"

	128 "mtlo %[const_2_power_13], $ac3 \n\t"

	129 "mthi $zero, $ac3 \n\t"

	130

	131 "madd $ac1, %[load1], %[cospi_23_64] \n\t"

	132 "msub $ac1, %[load2], %[cospi_9_64] \n\t"

	133 "extp %[temp0], $ac1, 31 \n\t"

	134

	135 "madd $ac3, %[load1], %[cospi_9_64] \n\t"

	136 "madd $ac3, %[load2], %[cospi_23_64] \n\t"

	137 "extp %[temp3], $ac3, 31 \n\t"

	138

	139 "mtlo %[const_2_power_13], $ac1 \n\t"

	140 "mthi $zero, $ac1 \n\t"

	141 "mtlo %[const_2_power_13], $ac2 \n\t"

	142 "mthi $zero, $ac2 \n\t"

	143

	144 "madd $ac2, %[load3], %[cospi_7_64] \n\t"

	145 "msub $ac2, %[load4], %[cospi_25_64] \n\t"

	146 "extp %[temp1], $ac2, 31 \n\t"

	147

	148 "madd $ac1, %[load3], %[cospi_25_64] \n\t"

	149 "madd $ac1, %[load4], %[cospi_7_64] \n\t"

	150 "extp %[temp2], $ac1, 31 \n\t"

	151

	152 "mtlo %[const_2_power_13], $ac1 \n\t"

	153 "mthi $zero, $ac1 \n\t"

	154 "mtlo %[const_2_power_13], $ac3 \n\t"

	155 "mthi $zero, $ac3 \n\t"

	156

	157 "sub %[load1], %[temp1], %[temp0] \n\t"

	158 "sub %[load2], %[temp2], %[temp3] \n\t"

	159

	160 "msub $ac1, %[load1], %[cospi_28_64] \n\t"

	161 "msub $ac1, %[load2], %[cospi_4_64] \n\t"

	162 "msub $ac3, %[load1], %[cospi_4_64] \n\t"

	163 "madd $ac3, %[load2], %[cospi_28_64] \n\t"

	164

	165 "extp %[step1_18], $ac1, 31 \n\t"

	166 "extp %[step1_29], $ac3, 31 \n\t"

	167 "add %[step1_19], %[temp0], %[temp1] \n\t"

	168 "add %[step1_28], %[temp2], %[temp3] \n\t"

	169

	170 : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

	171 [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	172 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	173 [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),

	174 [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)

	175 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	176 [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),

	177 [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),

	178 [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)

	179 );

	180

	181 __asm__ __volatile__ (

	182 "lh %[load1], 10(%[input]) \n\t"

	183 "lh %[load2], 54(%[input]) \n\t"

	184 "lh %[load3], 42(%[input]) \n\t"

	185 "lh %[load4], 22(%[input]) \n\t"

	186

	187 "mtlo %[const_2_power_13], $ac1 \n\t"

	188 "mthi $zero, $ac1 \n\t"

	189 "mtlo %[const_2_power_13], $ac3 \n\t"

	190 "mthi $zero, $ac3 \n\t"

	191

	192 "madd $ac1, %[load1], %[cospi_27_64] \n\t"

	193 "msub $ac1, %[load2], %[cospi_5_64] \n\t"

	194 "extp %[temp0], $ac1, 31 \n\t"

	195

	196 "madd $ac3, %[load1], %[cospi_5_64] \n\t"

	197 "madd $ac3, %[load2], %[cospi_27_64] \n\t"

	198 "extp %[temp3], $ac3, 31 \n\t"

	199

	200 "mtlo %[const_2_power_13], $ac1 \n\t"

	201 "mthi $zero, $ac1 \n\t"

	202 "mtlo %[const_2_power_13], $ac2 \n\t"

	203 "mthi $zero, $ac2 \n\t"

	204

	205 "madd $ac2, %[load3], %[cospi_11_64] \n\t"

	206 "msub $ac2, %[load4], %[cospi_21_64] \n\t"

	207 "extp %[temp1], $ac2, 31 \n\t"

	208

	209 "madd $ac1, %[load3], %[cospi_21_64] \n\t"

	210 "madd $ac1, %[load4], %[cospi_11_64] \n\t"

	211 "extp %[temp2], $ac1, 31 \n\t"

	212

	213 "mtlo %[const_2_power_13], $ac1 \n\t"

	214 "mthi $zero, $ac1 \n\t"

	215 "mtlo %[const_2_power_13], $ac3 \n\t"

	216 "mthi $zero, $ac3 \n\t"

	217

	218 "sub %[load1], %[temp0], %[temp1] \n\t"

	219 "sub %[load2], %[temp3], %[temp2] \n\t"

	220

	221 "madd $ac1, %[load2], %[cospi_12_64] \n\t"

	222 "msub $ac1, %[load1], %[cospi_20_64] \n\t"

	223 "madd $ac3, %[load1], %[cospi_12_64] \n\t"

	224 "madd $ac3, %[load2], %[cospi_20_64] \n\t"

	225

	226 "extp %[step1_21], $ac1, 31 \n\t"

	227 "extp %[step1_26], $ac3, 31 \n\t"

	228 "add %[step1_20], %[temp0], %[temp1] \n\t"

	229 "add %[step1_27], %[temp2], %[temp3] \n\t"

	230

	231 : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

	232 [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	233 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	234 [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),

	235 [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)

	236 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	237 [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),

	238 [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),

	239 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

	240 );

	241

	242 __asm__ __volatile__ (

	243 "lh %[load1], 26(%[input]) \n\t"

	244 "lh %[load2], 38(%[input]) \n\t"

	245 "lh %[load3], 58(%[input]) \n\t"

	246 "lh %[load4], 6(%[input]) \n\t"

	247

	248 "mtlo %[const_2_power_13], $ac1 \n\t"

	249 "mthi $zero, $ac1 \n\t"

	250 "mtlo %[const_2_power_13], $ac3 \n\t"

	251 "mthi $zero, $ac3 \n\t"

	252

	253 "madd $ac1, %[load1], %[cospi_19_64] \n\t"

	254 "msub $ac1, %[load2], %[cospi_13_64] \n\t"

	255 "extp %[temp0], $ac1, 31 \n\t"

	256 "madd $ac3, %[load1], %[cospi_13_64] \n\t"

	257 "madd $ac3, %[load2], %[cospi_19_64] \n\t"

	258 "extp %[temp3], $ac3, 31 \n\t"

	259

	260 "mtlo %[const_2_power_13], $ac1 \n\t"

	261 "mthi $zero, $ac1 \n\t"

	262 "mtlo %[const_2_power_13], $ac2 \n\t"

	263 "mthi $zero, $ac2 \n\t"

	264

	265 "madd $ac2, %[load3], %[cospi_3_64] \n\t"

	266 "msub $ac2, %[load4], %[cospi_29_64] \n\t"

	267 "extp %[temp1], $ac2, 31 \n\t"

	268 "madd $ac1, %[load3], %[cospi_29_64] \n\t"

	269 "madd $ac1, %[load4], %[cospi_3_64] \n\t"

	270 "extp %[temp2], $ac1, 31 \n\t"

	271

	272 "mtlo %[const_2_power_13], $ac1 \n\t"

	273 "mthi $zero, $ac1 \n\t"

	274 "mtlo %[const_2_power_13], $ac3 \n\t"

	275 "mthi $zero, $ac3 \n\t"

	276

	277 "sub %[load1], %[temp1], %[temp0] \n\t"

	278 "sub %[load2], %[temp2], %[temp3] \n\t"

	279 "msub $ac1, %[load1], %[cospi_12_64] \n\t"

	280 "msub $ac1, %[load2], %[cospi_20_64] \n\t"

	281 "msub $ac3, %[load1], %[cospi_20_64] \n\t"

	282 "madd $ac3, %[load2], %[cospi_12_64] \n\t"

	283 "extp %[step1_22], $ac1, 31 \n\t"

	284 "extp %[step1_25], $ac3, 31 \n\t"

	285 "add %[step1_23], %[temp0], %[temp1] \n\t"

	286 "add %[step1_24], %[temp2], %[temp3] \n\t"

	287

	288 : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

	289 [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	290 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	291 [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),

	292 [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)

	293 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	294 [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),

	295 [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),

	296 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)

	297 );

	298

	299 __asm__ __volatile__ (

	300 "lh %[load1], 4(%[input]) \n\t"

	301 "lh %[load2], 60(%[input]) \n\t"

	302 "lh %[load3], 36(%[input]) \n\t"

	303 "lh %[load4], 28(%[input]) \n\t"

	304

	305 "mtlo %[const_2_power_13], $ac1 \n\t"

	306 "mthi $zero, $ac1 \n\t"

	307 "mtlo %[const_2_power_13], $ac3 \n\t"

	308 "mthi $zero, $ac3 \n\t"

	309

	310 "madd $ac1, %[load1], %[cospi_30_64] \n\t"

	311 "msub $ac1, %[load2], %[cospi_2_64] \n\t"

	312 "extp %[temp0], $ac1, 31 \n\t"

	313 "madd $ac3, %[load1], %[cospi_2_64] \n\t"

	314 "madd $ac3, %[load2], %[cospi_30_64] \n\t"

	315 "extp %[temp3], $ac3, 31 \n\t"

	316

	317 "mtlo %[const_2_power_13], $ac1 \n\t"

	318 "mthi $zero, $ac1 \n\t"

	319 "mtlo %[const_2_power_13], $ac2 \n\t"

	320 "mthi $zero, $ac2 \n\t"

	321

	322 "madd $ac2, %[load3], %[cospi_14_64] \n\t"

	323 "msub $ac2, %[load4], %[cospi_18_64] \n\t"

	324 "extp %[temp1], $ac2, 31 \n\t"

	325 "madd $ac1, %[load3], %[cospi_18_64] \n\t"

	326 "madd $ac1, %[load4], %[cospi_14_64] \n\t"

	327 "extp %[temp2], $ac1, 31 \n\t"

	328

	329 "mtlo %[const_2_power_13], $ac1 \n\t"

	330 "mthi $zero, $ac1 \n\t"

	331 "mtlo %[const_2_power_13], $ac3 \n\t"

	332 "mthi $zero, $ac3 \n\t"

	333

	334 "sub %[load1], %[temp0], %[temp1] \n\t"

	335 "sub %[load2], %[temp3], %[temp2] \n\t"

	336 "msub $ac1, %[load1], %[cospi_8_64] \n\t"

	337 "madd $ac1, %[load2], %[cospi_24_64] \n\t"

	338 "madd $ac3, %[load1], %[cospi_24_64] \n\t"

	339 "madd $ac3, %[load2], %[cospi_8_64] \n\t"

	340 "extp %[step2_9], $ac1, 31 \n\t"

	341 "extp %[step2_14], $ac3, 31 \n\t"

	342 "add %[step2_8], %[temp0], %[temp1] \n\t"

	343 "add %[step2_15], %[temp2], %[temp3] \n\t"

	344

	345 : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

	346 [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	347 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	348 [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),

	349 [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)

	350 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	351 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),

	352 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),

	353 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

	354 );

	355

	356 __asm__ __volatile__ (

	357 "lh %[load1], 20(%[input]) \n\t"

	358 "lh %[load2], 44(%[input]) \n\t"

	359 "lh %[load3], 52(%[input]) \n\t"

	360 "lh %[load4], 12(%[input]) \n\t"

	361

	362 "mtlo %[const_2_power_13], $ac1 \n\t"

	363 "mthi $zero, $ac1 \n\t"

	364 "mtlo %[const_2_power_13], $ac3 \n\t"

	365 "mthi $zero, $ac3 \n\t"

	366

	367 "madd $ac1, %[load1], %[cospi_22_64] \n\t"

	368 "msub $ac1, %[load2], %[cospi_10_64] \n\t"

	369 "extp %[temp0], $ac1, 31 \n\t"

	370 "madd $ac3, %[load1], %[cospi_10_64] \n\t"

	371 "madd $ac3, %[load2], %[cospi_22_64] \n\t"

	372 "extp %[temp3], $ac3, 31 \n\t"

	373

	374 "mtlo %[const_2_power_13], $ac1 \n\t"

	375 "mthi $zero, $ac1 \n\t"

	376 "mtlo %[const_2_power_13], $ac2 \n\t"

	377 "mthi $zero, $ac2 \n\t"

	378

	379 "madd $ac2, %[load3], %[cospi_6_64] \n\t"

	380 "msub $ac2, %[load4], %[cospi_26_64] \n\t"

	381 "extp %[temp1], $ac2, 31 \n\t"

	382 "madd $ac1, %[load3], %[cospi_26_64] \n\t"

	383 "madd $ac1, %[load4], %[cospi_6_64] \n\t"

	384 "extp %[temp2], $ac1, 31 \n\t"

	385

	386 "mtlo %[const_2_power_13], $ac1 \n\t"

	387 "mthi $zero, $ac1 \n\t"

	388 "mtlo %[const_2_power_13], $ac3 \n\t"

	389 "mthi $zero, $ac3 \n\t"

	390

	391 "sub %[load1], %[temp1], %[temp0] \n\t"

	392 "sub %[load2], %[temp2], %[temp3] \n\t"

	393 "msub $ac1, %[load1], %[cospi_24_64] \n\t"

	394 "msub $ac1, %[load2], %[cospi_8_64] \n\t"

	395 "madd $ac3, %[load2], %[cospi_24_64] \n\t"

	396 "msub $ac3, %[load1], %[cospi_8_64] \n\t"

	397 "extp %[step2_10], $ac1, 31 \n\t"

	398 "extp %[step2_13], $ac3, 31 \n\t"

	399 "add %[step2_11], %[temp0], %[temp1] \n\t"

	400 "add %[step2_12], %[temp2], %[temp3] \n\t"

	401

	402 : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),

	403 [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	404 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	405 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),

	406 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)

	407 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	408 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),

	409 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),

	410 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)

	411 );

	412

	413 __asm__ __volatile__ (

	414 "mtlo %[const_2_power_13], $ac0 \n\t"

	415 "mthi $zero, $ac0 \n\t"

	416 "sub %[temp0], %[step2_14], %[step2_13] \n\t"

	417 "sub %[temp0], %[temp0], %[step2_9] \n\t"

	418 "add %[temp0], %[temp0], %[step2_10] \n\t"

	419 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

	420 "mtlo %[const_2_power_13], $ac1 \n\t"

	421 "mthi $zero, $ac1 \n\t"

	422 "sub %[temp1], %[step2_14], %[step2_13] \n\t"

	423 "add %[temp1], %[temp1], %[step2_9] \n\t"

	424 "sub %[temp1], %[temp1], %[step2_10] \n\t"

	425 "madd $ac1, %[temp1], %[cospi_16_64] \n\t"

	426 "mtlo %[const_2_power_13], $ac2 \n\t"

	427 "mthi $zero, $ac2 \n\t"

	428 "sub %[temp0], %[step2_15], %[step2_12] \n\t"

	429 "sub %[temp0], %[temp0], %[step2_8] \n\t"

	430 "add %[temp0], %[temp0], %[step2_11] \n\t"

	431 "madd $ac2, %[temp0], %[cospi_16_64] \n\t"

	432 "mtlo %[const_2_power_13], $ac3 \n\t"

	433 "mthi $zero, $ac3 \n\t"

	434 "sub %[temp1], %[step2_15], %[step2_12] \n\t"

	435 "add %[temp1], %[temp1], %[step2_8] \n\t"

	436 "sub %[temp1], %[temp1], %[step2_11] \n\t"

	437 "madd $ac3, %[temp1], %[cospi_16_64] \n\t"

	438

	439 "add %[step3_8], %[step2_8], %[step2_11] \n\t"

	440 "add %[step3_9], %[step2_9], %[step2_10] \n\t"

	441 "add %[step3_14], %[step2_13], %[step2_14] \n\t"

	442 "add %[step3_15], %[step2_12], %[step2_15] \n\t"

	443 "extp %[step3_10], $ac0, 31 \n\t"

	444 "extp %[step3_13], $ac1, 31 \n\t"

	445 "extp %[step3_11], $ac2, 31 \n\t"

	446 "extp %[step3_12], $ac3, 31 \n\t"

	447

	448 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	449 [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),

	450 [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),

	451 [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),

	452 [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)

	453 : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8),

	454 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),

	455 [step2_11] "r" (step2_11), [step2_12] "r" (step2_12),

	456 [step2_13] "r" (step2_13), [step2_14] "r" (step2_14),

	457 [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)

	458 );

	459

	460 step2_18 = step1_17 - step1_18;

	461 step2_29 = step1_30 - step1_29;

	462

	463 __asm__ __volatile__ (

	464 "mtlo %[const_2_power_13], $ac0 \n\t"

	465 "mthi $zero, $ac0 \n\t"

	466 "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"

	467 "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"

	468 "extp %[step3_18], $ac0, 31 \n\t"

	469

	470 : [step3_18] "=r" (step3_18)

	471 : [const_2_power_13] "r" (const_2_power_13),

	472 [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),

	473 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

	474 );

	475

	476 temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;

	477 step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	478

	479 step2_19 = step1_16 - step1_19;

	480 step2_28 = step1_31 - step1_28;

	481

	482 __asm__ __volatile__ (

	483 "mtlo %[const_2_power_13], $ac0 \n\t"

	484 "mthi $zero, $ac0 \n\t"

	485 "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"

	486 "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"

	487 "extp %[step3_19], $ac0, 31 \n\t"

	488

	489 : [step3_19] "=r" (step3_19)

	490 : [const_2_power_13] "r" (const_2_power_13),

	491 [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),

	492 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

	493 );

	494

	495 temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;

	496 step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	497

	498 step3_16 = step1_16 + step1_19;

	499 step3_17 = step1_17 + step1_18;

	500 step3_30 = step1_29 + step1_30;

	501 step3_31 = step1_28 + step1_31;

	502

	503 step2_20 = step1_23 - step1_20;

	504 step2_27 = step1_24 - step1_27;

	505

	506 __asm__ __volatile__ (

	507 "mtlo %[const_2_power_13], $ac0 \n\t"

	508 "mthi $zero, $ac0 \n\t"

	509 "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"

	510 "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"

	511 "extp %[step3_20], $ac0, 31 \n\t"

	512

	513 : [step3_20] "=r" (step3_20)

	514 : [const_2_power_13] "r" (const_2_power_13),

	515 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),

	516 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

	517 );

	518

	519 temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;

	520 step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	521

	522 step2_21 = step1_22 - step1_21;

	523 step2_26 = step1_25 - step1_26;

	524

	525 __asm__ __volatile__ (

	526 "mtlo %[const_2_power_13], $ac1 \n\t"

	527 "mthi $zero, $ac1 \n\t"

	528 "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"

	529 "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"

	530 "extp %[step3_21], $ac1, 31 \n\t"

	531

	532 : [step3_21] "=r" (step3_21)

	533 : [const_2_power_13] "r" (const_2_power_13),

	534 [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),

	535 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)

	536 );

	537

	538 temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;

	539 step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	540

	541 step3_22 = step1_21 + step1_22;

	542 step3_23 = step1_20 + step1_23;

	543 step3_24 = step1_24 + step1_27;

	544 step3_25 = step1_25 + step1_26;

	545

	546 step2_16 = step3_16 + step3_23;

	547 step2_17 = step3_17 + step3_22;

	548 step2_18 = step3_18 + step3_21;

	549 step2_19 = step3_19 + step3_20;

	550 step2_20 = step3_19 - step3_20;

	551 step2_21 = step3_18 - step3_21;

	552 step2_22 = step3_17 - step3_22;

	553 step2_23 = step3_16 - step3_23;

	554

	555 step2_24 = step3_31 - step3_24;

	556 step2_25 = step3_30 - step3_25;

	557 step2_26 = step3_29 - step3_26;

	558 step2_27 = step3_28 - step3_27;

	559 step2_28 = step3_28 + step3_27;

	560 step2_29 = step3_29 + step3_26;

	561 step2_30 = step3_30 + step3_25;

	562 step2_31 = step3_31 + step3_24;

	563

	564 __asm__ __volatile__ (

	565 "lh %[load1], 0(%[input]) \n\t"

	566 "lh %[load2], 32(%[input]) \n\t"

	567 "lh %[load3], 16(%[input]) \n\t"

	568 "lh %[load4], 48(%[input]) \n\t"

	569

	570 "mtlo %[const_2_power_13], $ac1 \n\t"

	571 "mthi $zero, $ac1 \n\t"

	572 "mtlo %[const_2_power_13], $ac2 \n\t"

	573 "mthi $zero, $ac2 \n\t"

	574 "add %[result1], %[load1], %[load2] \n\t"

	575 "sub %[result2], %[load1], %[load2] \n\t"

	576 "madd $ac1, %[result1], %[cospi_16_64] \n\t"

	577 "madd $ac2, %[result2], %[cospi_16_64] \n\t"

	578 "extp %[temp0], $ac1, 31 \n\t"

	579 "extp %[temp1], $ac2, 31 \n\t"

	580

	581 "mtlo %[const_2_power_13], $ac3 \n\t"

	582 "mthi $zero, $ac3 \n\t"

	583 "madd $ac3, %[load3], %[cospi_24_64] \n\t"

	584 "msub $ac3, %[load4], %[cospi_8_64] \n\t"

	585 "extp %[temp2], $ac3, 31 \n\t"

	586 "mtlo %[const_2_power_13], $ac1 \n\t"

	587 "mthi $zero, $ac1 \n\t"

	588 "madd $ac1, %[load3], %[cospi_8_64] \n\t"

	589 "madd $ac1, %[load4], %[cospi_24_64] \n\t"

	590 "extp %[temp3], $ac1, 31 \n\t"

	591 "add %[step1_0], %[temp0], %[temp3] \n\t"

	592 "add %[step1_1], %[temp1], %[temp2] \n\t"

	593 "sub %[step1_2], %[temp1], %[temp2] \n\t"

	594 "sub %[step1_3], %[temp0], %[temp3] \n\t"

	595

	596 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	597 [load3] "=&r" (load3), [load4] "=&r" (load4),

	598 [result1] "=&r" (result1), [result2] "=&r" (result2),

	599 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	600 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	601 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),

	602 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)

	603 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	604 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),

	605 [cospi_16_64] "r" (cospi_16_64)

	606 );

	607

	608 __asm__ __volatile__ (

	609 "lh %[load1], 8(%[input]) \n\t"

	610 "lh %[load2], 56(%[input]) \n\t"

	611 "lh %[load3], 40(%[input]) \n\t"

	612 "lh %[load4], 24(%[input]) \n\t"

	613

	614 "mtlo %[const_2_power_13], $ac1 \n\t"

	615 "mthi $zero, $ac1 \n\t"

	616 "mtlo %[const_2_power_13], $ac3 \n\t"

	617 "mthi $zero, $ac3 \n\t"

	618

	619 "madd $ac1, %[load1], %[cospi_28_64] \n\t"

	620 "msub $ac1, %[load2], %[cospi_4_64] \n\t"

	621 "extp %[temp0], $ac1, 31 \n\t"

	622 "madd $ac3, %[load1], %[cospi_4_64] \n\t"

	623 "madd $ac3, %[load2], %[cospi_28_64] \n\t"

	624 "extp %[temp3], $ac3, 31 \n\t"

	625

	626 "mtlo %[const_2_power_13], $ac1 \n\t"

	627 "mthi $zero, $ac1 \n\t"

	628 "mtlo %[const_2_power_13], $ac2 \n\t"

	629 "mthi $zero, $ac2 \n\t"

	630

	631 "madd $ac2, %[load3], %[cospi_12_64] \n\t"

	632 "msub $ac2, %[load4], %[cospi_20_64] \n\t"

	633 "extp %[temp1], $ac2, 31 \n\t"

	634 "madd $ac1, %[load3], %[cospi_20_64] \n\t"

	635 "madd $ac1, %[load4], %[cospi_12_64] \n\t"

	636 "extp %[temp2], $ac1, 31 \n\t"

	637

	638 "mtlo %[const_2_power_13], $ac1 \n\t"

	639 "mthi $zero, $ac1 \n\t"

	640 "mtlo %[const_2_power_13], $ac3 \n\t"

	641 "mthi $zero, $ac3 \n\t"

	642

	643 "sub %[load1], %[temp3], %[temp2] \n\t"

	644 "sub %[load1], %[load1], %[temp0] \n\t"

	645 "add %[load1], %[load1], %[temp1] \n\t"

	646 "sub %[load2], %[temp0], %[temp1] \n\t"

	647 "sub %[load2], %[load2], %[temp2] \n\t"

	648 "add %[load2], %[load2], %[temp3] \n\t"

	649 "madd $ac1, %[load1], %[cospi_16_64] \n\t"

	650 "madd $ac3, %[load2], %[cospi_16_64] \n\t"

	651

	652 "extp %[step1_5], $ac1, 31 \n\t"

	653 "extp %[step1_6], $ac3, 31 \n\t"

	654 "add %[step1_4], %[temp0], %[temp1] \n\t"

	655 "add %[step1_7], %[temp3], %[temp2] \n\t"

	656

	657 : [load1] "=&r" (load1), [load2] "=&r" (load2),

	658 [load3] "=&r" (load3), [load4] "=&r" (load4),

	659 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),

	660 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),

	661 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),

	662 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)

	663 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),

	664 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),

	665 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),

	666 [cospi_16_64] "r" (cospi_16_64)

	667 );

	668

	669 step2_0 = step1_0 + step1_7;

	670 step2_1 = step1_1 + step1_6;

	671 step2_2 = step1_2 + step1_5;

	672 step2_3 = step1_3 + step1_4;

	673 step2_4 = step1_3 - step1_4;

	674 step2_5 = step1_2 - step1_5;

	675 step2_6 = step1_1 - step1_6;

	676 step2_7 = step1_0 - step1_7;

	677

	678 // stage 7

	679 step1_0 = step2_0 + step3_15;

	680 step1_1 = step2_1 + step3_14;

	681 step1_2 = step2_2 + step3_13;

	682 step1_3 = step2_3 + step3_12;

	683 step1_4 = step2_4 + step3_11;

	684 step1_5 = step2_5 + step3_10;

	685 step1_6 = step2_6 + step3_9;

	686 step1_7 = step2_7 + step3_8;

	687 step1_8 = step2_7 - step3_8;

	688 step1_9 = step2_6 - step3_9;

	689 step1_10 = step2_5 - step3_10;

	690 step1_11 = step2_4 - step3_11;

	691 step1_12 = step2_3 - step3_12;

	692 step1_13 = step2_2 - step3_13;

	693 step1_14 = step2_1 - step3_14;

	694 step1_15 = step2_0 - step3_15;

	695

	696 __asm__ __volatile__ (

	697 "sub %[temp0], %[step2_27], %[step2_20] \n\t"

	698 "mtlo %[const_2_power_13], $ac0 \n\t"

	699 "mthi $zero, $ac0 \n\t"

	700 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

	701 "extp %[step1_20], $ac0, 31 \n\t"

	702

	703 : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)

	704 : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20),

	705 [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)

	706 );

	707

	708 temp21 = (step2_20 + step2_27) * cospi_16_64;

	709 step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	710

	711 __asm__ __volatile__ (

	712 "sub %[temp0], %[step2_26], %[step2_21] \n\t"

	713 "mtlo %[const_2_power_13], $ac0 \n\t"

	714 "mthi $zero, $ac0 \n\t"

	715 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

	716 "extp %[step1_21], $ac0, 31 \n\t"

	717

	718 : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)

	719 : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26),

	720 [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)

	721 );

	722

	723 temp21 = (step2_21 + step2_26) * cospi_16_64;

	724 step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	725

	726 __asm__ __volatile__ (

	727 "sub %[temp0], %[step2_25], %[step2_22] \n\t"

	728 "mtlo %[const_2_power_13], $ac0 \n\t"

	729 "mthi $zero, $ac0 \n\t"

	730 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

	731 "extp %[step1_22], $ac0, 31 \n\t"

	732

	733 : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)

	734 : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25),

	735 [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)

	736 );

	737

	738 temp21 = (step2_22 + step2_25) * cospi_16_64;

	739 step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	740

	741 __asm__ __volatile__ (

	742 "sub %[temp0], %[step2_24], %[step2_23] \n\t"

	743 "mtlo %[const_2_power_13], $ac0 \n\t"

	744 "mthi $zero, $ac0 \n\t"

	745 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"

	746 "extp %[step1_23], $ac0, 31 \n\t"

	747

	748 : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)

	749 : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24),

	750 [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)

	751 );

	752

	753 temp21 = (step2_23 + step2_24) * cospi_16_64;

	754 step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

	755

	756 __asm__ __volatile__ (

	757 "lbu %[temp2], 0(%[dest_pix]) \n\t"

	758 "add %[temp0], %[step1_0], %[step2_31] \n\t"

	759 "addi %[temp0], %[temp0], 32 \n\t"

	760 "sra %[temp0], %[temp0], 6 \n\t"

	761 "add %[temp2], %[temp2], %[temp0] \n\t"

	762 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	763 "add %[temp1], %[step1_1], %[step2_30] \n\t"

	764 "sb %[temp0], 0(%[dest_pix]) \n\t"

	765 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	766 "lbu %[temp3], 0(%[dest_pix]) \n\t"

	767 "addi %[temp1], %[temp1], 32 \n\t"

	768 "sra %[temp1], %[temp1], 6 \n\t"

	769 "add %[temp3], %[temp3], %[temp1] \n\t"

	770 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	771 "sb %[temp1], 0(%[dest_pix]) \n\t"

	772 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	773

	774 "lbu %[temp2], 0(%[dest_pix]) \n\t"

	775 "add %[temp0], %[step1_2], %[step2_29] \n\t"

	776 "addi %[temp0], %[temp0], 32 \n\t"

	777 "sra %[temp0], %[temp0], 6 \n\t"

	778 "add %[temp2], %[temp2], %[temp0] \n\t"

	779 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	780 "add %[temp1], %[step1_3], %[step2_28] \n\t"

	781 "sb %[temp0], 0(%[dest_pix]) \n\t"

	782 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	783 "lbu %[temp3], 0(%[dest_pix]) \n\t"

	784 "addi %[temp1], %[temp1], 32 \n\t"

	785 "sra %[temp1], %[temp1], 6 \n\t"

	786 "add %[temp3], %[temp3], %[temp1] \n\t"

	787 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	788 "sb %[temp1], 0(%[dest_pix]) \n\t"

	789 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	790

	791 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

	792 [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

	793 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

	794 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),

	795 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),

	796 [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),

	797 [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)

	798 );

	799

	800 step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);

	801 step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);

	802 step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);

	803 step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);

	804

	805 __asm__ __volatile__ (

	806 "lbu %[temp2], 0(%[dest_pix1]) \n\t"

	807 "add %[temp2], %[temp2], %[step3_15] \n\t"

	808 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	809 "sb %[temp0], 0(%[dest_pix1]) \n\t"

	810 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	811 "lbu %[temp3], 0(%[dest_pix1]) \n\t"

	812 "add %[temp3], %[temp3], %[step3_14] \n\t"

	813 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	814 "sb %[temp1], 0(%[dest_pix1]) \n\t"

	815 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	816

	817 "lbu %[temp2], 0(%[dest_pix1]) \n\t"

	818 "add %[temp2], %[temp2], %[step3_13] \n\t"

	819 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	820 "sb %[temp0], 0(%[dest_pix1]) \n\t"

	821 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	822 "lbu %[temp3], 0(%[dest_pix1]) \n\t"

	823 "add %[temp3], %[temp3], %[step3_12] \n\t"

	824 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	825 "sb %[temp1], 0(%[dest_pix1]) \n\t"

	826 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	827

	828 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

	829 [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

	830 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

	831 [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

	832 [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

	833 );

	834

	835 __asm__ __volatile__ (

	836 "lbu %[temp2], 0(%[dest_pix]) \n\t"

	837 "add %[temp0], %[step1_4], %[step1_27] \n\t"

	838 "addi %[temp0], %[temp0], 32 \n\t"

	839 "sra %[temp0], %[temp0], 6 \n\t"

	840 "add %[temp2], %[temp2], %[temp0] \n\t"

	841 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	842 "add %[temp1], %[step1_5], %[step1_26] \n\t"

	843 "sb %[temp0], 0(%[dest_pix]) \n\t"

	844 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	845 "lbu %[temp3], 0(%[dest_pix]) \n\t"

	846 "addi %[temp1], %[temp1], 32 \n\t"

	847 "sra %[temp1], %[temp1], 6 \n\t"

	848 "add %[temp3], %[temp3], %[temp1] \n\t"

	849 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	850 "sb %[temp1], 0(%[dest_pix]) \n\t"

	851 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	852

	853 "lbu %[temp2], 0(%[dest_pix]) \n\t"

	854 "add %[temp0], %[step1_6], %[step1_25] \n\t"

	855 "addi %[temp0], %[temp0], 32 \n\t"

	856 "sra %[temp0], %[temp0], 6 \n\t"

	857 "add %[temp2], %[temp2], %[temp0] \n\t"

	858 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	859 "add %[temp1], %[step1_7], %[step1_24] \n\t"

	860 "sb %[temp0], 0(%[dest_pix]) \n\t"

	861 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	862 "lbu %[temp3], 0(%[dest_pix]) \n\t"

	863 "addi %[temp1], %[temp1], 32 \n\t"

	864 "sra %[temp1], %[temp1], 6 \n\t"

	865 "add %[temp3], %[temp3], %[temp1] \n\t"

	866 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	867 "sb %[temp1], 0(%[dest_pix]) \n\t"

	868 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	869

	870 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

	871 [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

	872 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

	873 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),

	874 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),

	875 [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),

	876 [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)

	877 );

	878

	879 step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);

	880 step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);

	881 step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);

	882 step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);

	883

	884 __asm__ __volatile__ (

	885 "lbu %[temp2], 0(%[dest_pix1]) \n\t"

	886 "add %[temp2], %[temp2], %[step3_15] \n\t"

	887 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	888 "sb %[temp0], 0(%[dest_pix1]) \n\t"

	889 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	890 "lbu %[temp3], 0(%[dest_pix1]) \n\t"

	891 "add %[temp3], %[temp3], %[step3_14] \n\t"

	892 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	893 "sb %[temp1], 0(%[dest_pix1]) \n\t"

	894 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	895

	896 "lbu %[temp2], 0(%[dest_pix1]) \n\t"

	897 "add %[temp2], %[temp2], %[step3_13] \n\t"

	898 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	899 "sb %[temp0], 0(%[dest_pix1]) \n\t"

	900 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	901 "lbu %[temp3], 0(%[dest_pix1]) \n\t"

	902 "add %[temp3], %[temp3], %[step3_12] \n\t"

	903 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	904 "sb %[temp1], 0(%[dest_pix1]) \n\t"

	905 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	906

	907 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

	908 [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

	909 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

	910 [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

	911 [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

	912 );

	913

	914 __asm__ __volatile__ (

	915 "lbu %[temp2], 0(%[dest_pix]) \n\t"

	916 "add %[temp0], %[step1_8], %[step1_23] \n\t"

	917 "addi %[temp0], %[temp0], 32 \n\t"

	918 "sra %[temp0], %[temp0], 6 \n\t"

	919 "add %[temp2], %[temp2], %[temp0] \n\t"

	920 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	921 "add %[temp1], %[step1_9], %[step1_22] \n\t"

	922 "sb %[temp0], 0(%[dest_pix]) \n\t"

	923 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	924 "lbu %[temp3], 0(%[dest_pix]) \n\t"

	925 "addi %[temp1], %[temp1], 32 \n\t"

	926 "sra %[temp1], %[temp1], 6 \n\t"

	927 "add %[temp3], %[temp3], %[temp1] \n\t"

	928 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	929 "sb %[temp1], 0(%[dest_pix]) \n\t"

	930 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	931

	932 "lbu %[temp2], 0(%[dest_pix]) \n\t"

	933 "add %[temp0], %[step1_10], %[step1_21] \n\t"

	934 "addi %[temp0], %[temp0], 32 \n\t"

	935 "sra %[temp0], %[temp0], 6 \n\t"

	936 "add %[temp2], %[temp2], %[temp0] \n\t"

	937 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	938 "add %[temp1], %[step1_11], %[step1_20] \n\t"

	939 "sb %[temp0], 0(%[dest_pix]) \n\t"

	940 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	941 "lbu %[temp3], 0(%[dest_pix]) \n\t"

	942 "addi %[temp1], %[temp1], 32 \n\t"

	943 "sra %[temp1], %[temp1], 6 \n\t"

	944 "add %[temp3], %[temp3], %[temp1] \n\t"

	945 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	946 "sb %[temp1], 0(%[dest_pix]) \n\t"

	947 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	948

	949 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

	950 [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

	951 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

	952 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),

	953 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),

	954 [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),

	955 [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)

	956 );

	957

	958 step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);

	959 step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);

	960 step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);

	961 step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);

	962

	963 __asm__ __volatile__ (

	964 "lbu %[temp2], 0(%[dest_pix1]) \n\t"

	965 "add %[temp2], %[temp2], %[step3_15] \n\t"

	966 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	967 "sb %[temp0], 0(%[dest_pix1]) \n\t"

	968 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	969 "lbu %[temp3], 0(%[dest_pix1]) \n\t"

	970 "add %[temp3], %[temp3], %[step3_14] \n\t"

	971 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	972 "sb %[temp1], 0(%[dest_pix1]) \n\t"

	973 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	974

	975 "lbu %[temp2], 0(%[dest_pix1]) \n\t"

	976 "add %[temp2], %[temp2], %[step3_13] \n\t"

	977 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	978 "sb %[temp0], 0(%[dest_pix1]) \n\t"

	979 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	980 "lbu %[temp3], 0(%[dest_pix1]) \n\t"

	981 "add %[temp3], %[temp3], %[step3_12] \n\t"

	982 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	983 "sb %[temp1], 0(%[dest_pix1]) \n\t"

	984 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	985

	986 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

	987 [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

	988 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

	989 [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

	990 [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

	991 );

	992

	993 __asm__ __volatile__ (

	994 "lbu %[temp2], 0(%[dest_pix]) \n\t"

	995 "add %[temp0], %[step1_12], %[step2_19] \n\t"

	996 "addi %[temp0], %[temp0], 32 \n\t"

	997 "sra %[temp0], %[temp0], 6 \n\t"

	998 "add %[temp2], %[temp2], %[temp0] \n\t"

	999 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	1000 "add %[temp1], %[step1_13], %[step2_18] \n\t"

	1001 "sb %[temp0], 0(%[dest_pix]) \n\t"

	1002 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	1003 "lbu %[temp3], 0(%[dest_pix]) \n\t"

	1004 "addi %[temp1], %[temp1], 32 \n\t"

	1005 "sra %[temp1], %[temp1], 6 \n\t"

	1006 "add %[temp3], %[temp3], %[temp1] \n\t"

	1007 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	1008 "sb %[temp1], 0(%[dest_pix]) \n\t"

	1009 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	1010

	1011 "lbu %[temp2], 0(%[dest_pix]) \n\t"

	1012 "add %[temp0], %[step1_14], %[step2_17] \n\t"

	1013 "addi %[temp0], %[temp0], 32 \n\t"

	1014 "sra %[temp0], %[temp0], 6 \n\t"

	1015 "add %[temp2], %[temp2], %[temp0] \n\t"

	1016 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	1017 "add %[temp1], %[step1_15], %[step2_16] \n\t"

	1018 "sb %[temp0], 0(%[dest_pix]) \n\t"

	1019 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

	1020 "lbu %[temp3], 0(%[dest_pix]) \n\t"

	1021 "addi %[temp1], %[temp1], 32 \n\t"

	1022 "sra %[temp1], %[temp1], 6 \n\t"

	1023 "add %[temp3], %[temp3], %[temp1] \n\t"

	1024 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	1025 "sb %[temp1], 0(%[dest_pix]) \n\t"

	1026

	1027 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

	1028 [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)

	1029 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

	1030 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),

	1031 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15),

	1032 [step2_16] "r" (step2_16), [step2_17] "r" (step2_17),

	1033 [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)

	1034 );

	1035

	1036 step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);

	1037 step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);

	1038 step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);

	1039 step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);

	1040

	1041 __asm__ __volatile__ (

	1042 "lbu %[temp2], 0(%[dest_pix1]) \n\t"

	1043 "add %[temp2], %[temp2], %[step3_15] \n\t"

	1044 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	1045 "sb %[temp0], 0(%[dest_pix1]) \n\t"

	1046 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	1047 "lbu %[temp3], 0(%[dest_pix1]) \n\t"

	1048 "add %[temp3], %[temp3], %[step3_14] \n\t"

	1049 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	1050 "sb %[temp1], 0(%[dest_pix1]) \n\t"

	1051 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	1052

	1053 "lbu %[temp2], 0(%[dest_pix1]) \n\t"

	1054 "add %[temp2], %[temp2], %[step3_13] \n\t"

	1055 "lbux %[temp0], %[temp2](%[cm]) \n\t"

	1056 "sb %[temp0], 0(%[dest_pix1]) \n\t"

	1057 "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"

	1058 "lbu %[temp3], 0(%[dest_pix1]) \n\t"

	1059 "add %[temp3], %[temp3], %[step3_12] \n\t"

	1060 "lbux %[temp1], %[temp3](%[cm]) \n\t"

	1061 "sb %[temp1], 0(%[dest_pix1]) \n\t"

	1062

	1063 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),

	1064 [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)

	1065 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),

	1066 [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),

	1067 [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)

	1068 );

	1069

	1070 input += 32;

	1071 }

	1072 }

	1073 #endif // #if HAVE_DSPR2

OLD	NEW