source/libvpx/third_party/libyuv/source/row_mips.cc - Issue 341293003: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/third_party/libyuv/source/row_mips.cc

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11 #include "third_party/libyuv/include/libyuv/row.h"

	12

	13 #ifdef __cplusplus

	14 namespace libyuv {

	15 extern "C" {

	16 #endif

	17

	18 // The following are available on Mips platforms:

	19 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)

	20

	21 #ifdef HAS_COPYROW_MIPS

	// Copies |count| bytes from |src| to |dst| with hand-scheduled MIPS assembly.
	// If src and dst have the same alignment modulo 4, the copy runs in 64-byte,
	// 32-byte and 4-byte chunks using lw/sw. Otherwise dst is word-aligned first
	// and the source is read with lwr/lwl pairs. Leftover bytes are copied one
	// at a time. Counts below 8 go straight to the byte loop.
	// Review fixes in this version:
	//  - |count| is an in/out operand, because the asm overwrites it.
	//  - a "memory" clobber is declared, because the asm reads and writes memory
	//    that is not described by its operands.
	//  - the asm leaves through the "leave" label at its end instead of
	//    executing "j $ra", so the compiler-generated epilogue still runs.
	// NOTE(review): the lwr/swr head copy looks little-endian specific - confirm.

	22 void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {

	23 __asm__ __volatile__ (

	24 ".set noreorder \n"

	25 ".set noat \n"

	26 "slti $at, %[count], 8 \n" // at = (count < 8)

	27 "bne $at ,$zero, $last8 \n"

	28 "xor $t8, %[src], %[dst] \n" // branch delay slot

	29 "andi $t8, $t8, 0x3 \n" // t8 != 0: src/dst differ in alignment

	30

	31 "bne $t8, $zero, unaligned \n"

	32 "negu $a3, %[dst] \n" // branch delay slot: a3 = -dst

	33 // make dst/src aligned

	34 "andi $a3, $a3, 0x3 \n" // a3 = bytes up to the next word boundary

	35 "beq $a3, $zero, $chk16w \n"

	36 // word-aligned now; count is the remaining byte count

	37 "subu %[count], %[count], $a3 \n" // branch delay slot

	38

	39 "lwr $t8, 0(%[src]) \n"

	40 "addu %[src], %[src], $a3 \n"

	41 "swr $t8, 0(%[dst]) \n"

	42 "addu %[dst], %[dst], $a3 \n"

	43

	44 // Now the dst/src are mutually word-aligned with word-aligned addresses

	45 "$chk16w: \n"

	46 "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?

	47 // t8 is the byte count after 64-byte chunks

	48 "beq %[count], $t8, chk8w \n"

	49 // There will be at most 1 32-byte chunk after it

	50 "subu $a3, %[count], $t8 \n" // branch delay slot: a3 = count - t8

	51 // Here a3 counts bytes in 16w chunks

	52 "addu $a3, %[dst], $a3 \n"

	53 // Now a3 is the final dst after 64-byte chunks

	54 "addu $t0, %[dst], %[count] \n"

	55 // t0 is the "past the end" address

	56

	57 // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past

	58 // the "t0-32" address

	59 // This means: for x=128 the last "safe" a1 address is "t0-160"

	60 // Alternatively, for x=64 the last "safe" a1 address is "t0-96"

	61 // we will use "pref 30,128(a1)", so "t0-160" is the limit

	62 "subu $t9, $t0, 160 \n"

	63 // t9 is the "last safe pref 30,128(a1)" address

	64 "pref 0, 0(%[src]) \n" // first line of src

	65 "pref 0, 32(%[src]) \n" // second line of src

	66 "pref 0, 64(%[src]) \n"

	67 "pref 30, 32(%[dst]) \n"

	68 // In case the a1 > t9 don't use "pref 30" at all

	69 "sgtu $v1, %[dst], $t9 \n"

	70 "bgtz $v1, $loop16w \n"

	71 "nop \n"

	72 // otherwise, start with using pref30

	73 "pref 30, 64(%[dst]) \n"

	74 "$loop16w: \n"

	75 "pref 0, 96(%[src]) \n"

	76 "lw $t0, 0(%[src]) \n"

	77 "bgtz $v1, $skip_pref30_96 \n" // skip

	78 "lw $t1, 4(%[src]) \n" // branch delay slot

	79 "pref 30, 96(%[dst]) \n" // continue

	80 "$skip_pref30_96: \n"

	81 "lw $t2, 8(%[src]) \n"

	82 "lw $t3, 12(%[src]) \n"

	83 "lw $t4, 16(%[src]) \n"

	84 "lw $t5, 20(%[src]) \n"

	85 "lw $t6, 24(%[src]) \n"

	86 "lw $t7, 28(%[src]) \n"

	87 "pref 0, 128(%[src]) \n"

	88 // bring the next lines of src, addr 128

	89 "sw $t0, 0(%[dst]) \n"

	90 "sw $t1, 4(%[dst]) \n"

	91 "sw $t2, 8(%[dst]) \n"

	92 "sw $t3, 12(%[dst]) \n"

	93 "sw $t4, 16(%[dst]) \n"

	94 "sw $t5, 20(%[dst]) \n"

	95 "sw $t6, 24(%[dst]) \n"

	96 "sw $t7, 28(%[dst]) \n"

	97 "lw $t0, 32(%[src]) \n"

	98 "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)

	99 "lw $t1, 36(%[src]) \n" // branch delay slot

	100 "pref 30, 128(%[dst]) \n" // set dest, addr 128

	101 "$skip_pref30_128: \n"

	102 "lw $t2, 40(%[src]) \n"

	103 "lw $t3, 44(%[src]) \n"

	104 "lw $t4, 48(%[src]) \n"

	105 "lw $t5, 52(%[src]) \n"

	106 "lw $t6, 56(%[src]) \n"

	107 "lw $t7, 60(%[src]) \n"

	108 "pref 0, 160(%[src]) \n"

	109 // bring the next lines of src, addr 160

	110 "sw $t0, 32(%[dst]) \n"

	111 "sw $t1, 36(%[dst]) \n"

	112 "sw $t2, 40(%[dst]) \n"

	113 "sw $t3, 44(%[dst]) \n"

	114 "sw $t4, 48(%[dst]) \n"

	115 "sw $t5, 52(%[dst]) \n"

	116 "sw $t6, 56(%[dst]) \n"

	117 "sw $t7, 60(%[dst]) \n"

	118

	119 "addiu %[dst], %[dst], 64 \n" // adding 64 to dest

	120 "sgtu $v1, %[dst], $t9 \n"

	121 "bne %[dst], $a3, $loop16w \n"

	122 " addiu %[src], %[src], 64 \n" // adding 64 to src

	123 "move %[count], $t8 \n"

	124

	125 // Here we have src and dest word-aligned but less than 64-bytes to go

	126

	127 "chk8w: \n"

	128 "pref 0, 0x0(%[src]) \n"

	129 "andi $t8, %[count], 0x1f \n" // 32-byte chunk?

	130 // t8 is the remainder count past 32 bytes

	131 "beq %[count], $t8, chk1w \n"

	132 // count=t8,no 32-byte chunk

	133 " nop \n"

	134

	135 "lw $t0, 0(%[src]) \n"

	136 "lw $t1, 4(%[src]) \n"

	137 "lw $t2, 8(%[src]) \n"

	138 "lw $t3, 12(%[src]) \n"

	139 "lw $t4, 16(%[src]) \n"

	140 "lw $t5, 20(%[src]) \n"

	141 "lw $t6, 24(%[src]) \n"

	142 "lw $t7, 28(%[src]) \n"

	143 "addiu %[src], %[src], 32 \n"

	144

	145 "sw $t0, 0(%[dst]) \n"

	146 "sw $t1, 4(%[dst]) \n"

	147 "sw $t2, 8(%[dst]) \n"

	148 "sw $t3, 12(%[dst]) \n"

	149 "sw $t4, 16(%[dst]) \n"

	150 "sw $t5, 20(%[dst]) \n"

	151 "sw $t6, 24(%[dst]) \n"

	152 "sw $t7, 28(%[dst]) \n"

	153 "addiu %[dst], %[dst], 32 \n"

	154

	155 "chk1w: \n"

	156 "andi %[count], $t8, 0x3 \n"

	157 // now count is the remainder past 1w chunks

	158 "beq %[count], $t8, $last8 \n"

	159 " subu $a3, $t8, %[count] \n"

	160 // a3 is count of bytes in 1w chunks

	161 "addu $a3, %[dst], $a3 \n"

	162 // now a3 is the dst address past the 1w chunks

	163 // copying in words (4-byte chunks)

	164 "$wordCopy_loop: \n"

	165 "lw $t3, 0(%[src]) \n"

	166 // the first t3 may be equal t0 ... optimize?

	167 "addiu %[src], %[src],4 \n"

	168 "addiu %[dst], %[dst],4 \n"

	169 "bne %[dst], $a3,$wordCopy_loop \n"

	170 " sw $t3, -4(%[dst]) \n"

	171

	172 // For the last (<8) bytes

	173 "$last8: \n"

	174 "blez %[count], leave \n"

	175 " addu $a3, %[dst], %[count] \n" // a3 -last dst address

	176 "$last8loop: \n"

	177 "lb $v1, 0(%[src]) \n"

	178 "addiu %[src], %[src], 1 \n"

	179 "addiu %[dst], %[dst], 1 \n"

	180 "bne %[dst], $a3, $last8loop \n"

	181 " sb $v1, -1(%[dst]) \n"

	182

	183 // Aligned path done: jump over the unaligned code to the common exit.

	184 "b leave \n"

	185 " nop \n"

	186

	187 //

	188 // UNALIGNED case

	189 //

	190

	191 "unaligned: \n"

	192 // got here with a3="negu a1"

	193 "andi $a3, $a3, 0x3 \n" // a1 is word aligned?

	194 "beqz $a3, $ua_chk16w \n"

	195 " subu %[count], %[count], $a3 \n"

	196 // bytes left after initial a3 bytes

	197 "lwr $v1, 0(%[src]) \n"

	198 "lwl $v1, 3(%[src]) \n"

	199 "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3

	200 "swr $v1, 0(%[dst]) \n"

	201 "addu %[dst], %[dst], $a3 \n"

	202 // below the dst will be word aligned (NOTE1)

	203 "$ua_chk16w: \n"

	204 "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?

	205 // t8 is the byte count after 64-byte chunks

	206 "beq %[count], $t8, ua_chk8w \n"

	207 // if a2==t8, no 64-byte chunks

	208 // There will be at most 1 32-byte chunk after it

	209 "subu $a3, %[count], $t8 \n" // branch delay slot: a3 = count - t8

	210 // Here a3 counts bytes in 16w chunks

	211 "addu $a3, %[dst], $a3 \n"

	212 // Now a3 is the final dst after 64-byte chunks

	213 "addu $t0, %[dst], %[count] \n" // t0 "past the end"

	214 "subu $t9, $t0, 160 \n"

	215 // t9 is the "last safe pref 30,128(a1)" address

	216 "pref 0, 0(%[src]) \n" // first line of src

	217 "pref 0, 32(%[src]) \n" // second line addr 32

	218 "pref 0, 64(%[src]) \n"

	219 "pref 30, 32(%[dst]) \n"

	220 // safe, as we have at least 64 bytes ahead

	221 // In case the a1 > t9 don't use "pref 30" at all

	222 "sgtu $v1, %[dst], $t9 \n"

	223 "bgtz $v1, $ua_loop16w \n"

	224 // skip "pref 30,64(a1)" for too short arrays

	225 " nop \n"

	226 // otherwise, start with using pref30

	227 "pref 30, 64(%[dst]) \n"

	228 "$ua_loop16w: \n"

	229 "pref 0, 96(%[src]) \n"

	230 "lwr $t0, 0(%[src]) \n"

	231 "lwl $t0, 3(%[src]) \n"

	232 "lwr $t1, 4(%[src]) \n"

	233 "bgtz $v1, $ua_skip_pref30_96 \n"

	234 " lwl $t1, 7(%[src]) \n"

	235 "pref 30, 96(%[dst]) \n"

	236 // continue setting up the dest, addr 96

	237 "$ua_skip_pref30_96: \n"

	238 "lwr $t2, 8(%[src]) \n"

	239 "lwl $t2, 11(%[src]) \n"

	240 "lwr $t3, 12(%[src]) \n"

	241 "lwl $t3, 15(%[src]) \n"

	242 "lwr $t4, 16(%[src]) \n"

	243 "lwl $t4, 19(%[src]) \n"

	244 "lwr $t5, 20(%[src]) \n"

	245 "lwl $t5, 23(%[src]) \n"

	246 "lwr $t6, 24(%[src]) \n"

	247 "lwl $t6, 27(%[src]) \n"

	248 "lwr $t7, 28(%[src]) \n"

	249 "lwl $t7, 31(%[src]) \n"

	250 "pref 0, 128(%[src]) \n"

	251 // bring the next lines of src, addr 128

	252 "sw $t0, 0(%[dst]) \n"

	253 "sw $t1, 4(%[dst]) \n"

	254 "sw $t2, 8(%[dst]) \n"

	255 "sw $t3, 12(%[dst]) \n"

	256 "sw $t4, 16(%[dst]) \n"

	257 "sw $t5, 20(%[dst]) \n"

	258 "sw $t6, 24(%[dst]) \n"

	259 "sw $t7, 28(%[dst]) \n"

	260 "lwr $t0, 32(%[src]) \n"

	261 "lwl $t0, 35(%[src]) \n"

	262 "lwr $t1, 36(%[src]) \n"

	263 "bgtz $v1, ua_skip_pref30_128 \n"

	264 " lwl $t1, 39(%[src]) \n"

	265 "pref 30, 128(%[dst]) \n"

	266 // continue setting up the dest, addr 128

	267 "ua_skip_pref30_128: \n"

	268

	269 "lwr $t2, 40(%[src]) \n"

	270 "lwl $t2, 43(%[src]) \n"

	271 "lwr $t3, 44(%[src]) \n"

	272 "lwl $t3, 47(%[src]) \n"

	273 "lwr $t4, 48(%[src]) \n"

	274 "lwl $t4, 51(%[src]) \n"

	275 "lwr $t5, 52(%[src]) \n"

	276 "lwl $t5, 55(%[src]) \n"

	277 "lwr $t6, 56(%[src]) \n"

	278 "lwl $t6, 59(%[src]) \n"

	279 "lwr $t7, 60(%[src]) \n"

	280 "lwl $t7, 63(%[src]) \n"

	281 "pref 0, 160(%[src]) \n"

	282 // bring the next lines of src, addr 160

	283 "sw $t0, 32(%[dst]) \n"

	284 "sw $t1, 36(%[dst]) \n"

	285 "sw $t2, 40(%[dst]) \n"

	286 "sw $t3, 44(%[dst]) \n"

	287 "sw $t4, 48(%[dst]) \n"

	288 "sw $t5, 52(%[dst]) \n"

	289 "sw $t6, 56(%[dst]) \n"

	290 "sw $t7, 60(%[dst]) \n"

	291

	292 "addiu %[dst],%[dst],64 \n" // adding 64 to dest

	293 "sgtu $v1,%[dst],$t9 \n"

	294 "bne %[dst],$a3,$ua_loop16w \n"

	295 " addiu %[src],%[src],64 \n" // adding 64 to src

	296 "move %[count],$t8 \n"

	297

	298 // Here dst is word-aligned (src is not) and less than 64 bytes remain

	299

	300 "ua_chk8w: \n"

	301 "pref 0, 0x0(%[src]) \n"

	302 "andi $t8, %[count], 0x1f \n" // 32-byte chunk?

	303 // t8 is the remainder count

	304 "beq %[count], $t8, $ua_chk1w \n"

	305 // when count==t8, no 32-byte chunk

	306

	307 "lwr $t0, 0(%[src]) \n" // also the delay slot of the beq above

	308 "lwl $t0, 3(%[src]) \n"

	309 "lwr $t1, 4(%[src]) \n"

	310 "lwl $t1, 7(%[src]) \n"

	311 "lwr $t2, 8(%[src]) \n"

	312 "lwl $t2, 11(%[src]) \n"

	313 "lwr $t3, 12(%[src]) \n"

	314 "lwl $t3, 15(%[src]) \n"

	315 "lwr $t4, 16(%[src]) \n"

	316 "lwl $t4, 19(%[src]) \n"

	317 "lwr $t5, 20(%[src]) \n"

	318 "lwl $t5, 23(%[src]) \n"

	319 "lwr $t6, 24(%[src]) \n"

	320 "lwl $t6, 27(%[src]) \n"

	321 "lwr $t7, 28(%[src]) \n"

	322 "lwl $t7, 31(%[src]) \n"

	323 "addiu %[src], %[src], 32 \n"

	324

	325 "sw $t0, 0(%[dst]) \n"

	326 "sw $t1, 4(%[dst]) \n"

	327 "sw $t2, 8(%[dst]) \n"

	328 "sw $t3, 12(%[dst]) \n"

	329 "sw $t4, 16(%[dst]) \n"

	330 "sw $t5, 20(%[dst]) \n"

	331 "sw $t6, 24(%[dst]) \n"

	332 "sw $t7, 28(%[dst]) \n"

	333 "addiu %[dst], %[dst], 32 \n"

	334

	335 "$ua_chk1w: \n"

	336 "andi %[count], $t8, 0x3 \n"

	337 // now count is the remainder past 1w chunks

	338 "beq %[count], $t8, ua_smallCopy \n"

	339 "subu $a3, $t8, %[count] \n" // branch delay slot

	340 // a3 is count of bytes in 1w chunks

	341 "addu $a3, %[dst], $a3 \n"

	342 // now a3 is the dst address past the 1w chunks

	343

	344 // copying in words (4-byte chunks)

	345 "$ua_wordCopy_loop: \n"

	346 "lwr $v1, 0(%[src]) \n"

	347 "lwl $v1, 3(%[src]) \n"

	348 "addiu %[src], %[src], 4 \n"

	349 "addiu %[dst], %[dst], 4 \n"

	350 // note: dst=a1 is word aligned here, see NOTE1

	351 "bne %[dst], $a3, $ua_wordCopy_loop \n"

	352 " sw $v1,-4(%[dst]) \n"

	353

	354 // Now less than 4 bytes (value in count) left to copy

	355 "ua_smallCopy: \n"

	356 "beqz %[count], leave \n"

	357 " addu $a3, %[dst], %[count] \n" // a3 = last dst address

	358 "$ua_smallCopy_loop: \n"

	359 "lb $v1, 0(%[src]) \n"

	360 "addiu %[src], %[src], 1 \n"

	361 "addiu %[dst], %[dst], 1 \n"

	362 "bne %[dst],$a3,$ua_smallCopy_loop \n"

	363 " sb $v1, -1(%[dst]) \n"

	364

	365 "leave: \n" // common exit: fall out of the asm into the C epilogue

	366

	367 ".set at \n"

	368 ".set reorder \n"

	369 : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)

	370 :

	371 : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",

	372 "t8", "t9", "a3", "v1", "at", "memory"

	373 );

	374 }

	375 #endif // HAS_COPYROW_MIPS

	376

	377 // MIPS DSPR2 functions

	378 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \

	379 (__mips_dsp_rev >= 2)

	// De-interleaves |width| U/V byte pairs from |src_uv| into |dst_u| and
	// |dst_v|. The main loop handles 16 pairs per iteration with word loads and
	// stores (lw/sw), so it expects word-aligned pointers. The remaining
	// width & 15 pairs are copied one pair at a time.
	// Review fixes: when there is no full group of 16 the code now checks the
	// residual before entering the byte loop (previously width == 0 still copied
	// one pair), and a "memory" clobber is declared for the loads and stores.

	380 void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

	381 int width) {

	382 __asm__ __volatile__ (

	383 ".set push \n"

	384 ".set noreorder \n"

	385 "srl $t4, %[width], 4 \n" // multiples of 16

	386 "blez $t4, 4f \n" // no full group: go to the residual check

	387 " andi %[width], %[width], 0xf \n" // residual

	388

	389 ".p2align 2 \n"

	390 "1: \n"

	391 "addiu $t4, $t4, -1 \n"

	392 "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0

	393 "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2

	394 "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4

	395 "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6

	396 "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8

	397 "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10

	398 "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12

	399 "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14

	400 "addiu %[src_uv], %[src_uv], 32 \n"

	401 "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0

	402 "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0

	403 "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4

	404 "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4

	405 "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8

	406 "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8

	407 "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12

	408 "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12

	409 "sw $t9, 0(%[dst_v]) \n"

	410 "sw $t0, 0(%[dst_u]) \n"

	411 "sw $t1, 4(%[dst_v]) \n"

	412 "sw $t2, 4(%[dst_u]) \n"

	413 "sw $t3, 8(%[dst_v]) \n"

	414 "sw $t5, 8(%[dst_u]) \n"

	415 "sw $t6, 12(%[dst_v]) \n"

	416 "sw $t7, 12(%[dst_u]) \n"

	417 "addiu %[dst_v], %[dst_v], 16 \n"

	418 "bgtz $t4, 1b \n"

	419 " addiu %[dst_u], %[dst_u], 16 \n"

	420

	421 "4: beqz %[width], 3f \n" // nothing left over: done

	422 " nop \n"

	423

	424 "2: \n"

	425 "lbu $t0, 0(%[src_uv]) \n"

	426 "lbu $t1, 1(%[src_uv]) \n"

	427 "addiu %[src_uv], %[src_uv], 2 \n"

	428 "addiu %[width], %[width], -1 \n"

	429 "sb $t0, 0(%[dst_u]) \n"

	430 "sb $t1, 0(%[dst_v]) \n"

	431 "addiu %[dst_u], %[dst_u], 1 \n"

	432 "bgtz %[width], 2b \n"

	433 " addiu %[dst_v], %[dst_v], 1 \n"

	434

	435 "3: \n"

	436 ".set pop \n"

	437 : [src_uv] "+r" (src_uv),

	438 [width] "+r" (width),

	439 [dst_u] "+r" (dst_u),

	440 [dst_v] "+r" (dst_v)

	441 :

	442 : "t0", "t1", "t2", "t3",

	443 "t4", "t5", "t6", "t7", "t8", "t9", "memory"

	444 );

	445 }

	446

	// Same operation as SplitUVRow_MIPS_DSPR2, but the 16-pair loop uses
	// lwr/lwl loads and swr/swl stores, so the pointers need not be
	// word-aligned. The remaining width & 15 pairs are copied one pair at a time.
	// Review fixes: when there is no full group of 16 the code now checks the
	// residual before entering the byte loop (previously width == 0 still copied
	// one pair), and a "memory" clobber is declared for the loads and stores.

	447 void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,

	448 uint8* dst_v, int width) {

	449 __asm__ __volatile__ (

	450 ".set push \n"

	451 ".set noreorder \n"

	452 "srl $t4, %[width], 4 \n" // multiples of 16

	453 "blez $t4, 4f \n" // no full group: go to the residual check

	454 " andi %[width], %[width], 0xf \n" // residual

	455

	456 ".p2align 2 \n"

	457 "1: \n"

	458 "addiu $t4, $t4, -1 \n"

	459 "lwr $t0, 0(%[src_uv]) \n"

	460 "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0

	461 "lwr $t1, 4(%[src_uv]) \n"

	462 "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2

	463 "lwr $t2, 8(%[src_uv]) \n"

	464 "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4

	465 "lwr $t3, 12(%[src_uv]) \n"

	466 "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6

	467 "lwr $t5, 16(%[src_uv]) \n"

	468 "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8

	469 "lwr $t6, 20(%[src_uv]) \n"

	470 "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10

	471 "lwr $t7, 24(%[src_uv]) \n"

	472 "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12

	473 "lwr $t8, 28(%[src_uv]) \n"

	474 "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14

	475 "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0

	476 "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0

	477 "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4

	478 "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4

	479 "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8

	480 "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8

	481 "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12

	482 "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12

	483 "addiu %[src_uv], %[src_uv], 32 \n"

	484 "swr $t9, 0(%[dst_v]) \n"

	485 "swl $t9, 3(%[dst_v]) \n"

	486 "swr $t0, 0(%[dst_u]) \n"

	487 "swl $t0, 3(%[dst_u]) \n"

	488 "swr $t1, 4(%[dst_v]) \n"

	489 "swl $t1, 7(%[dst_v]) \n"

	490 "swr $t2, 4(%[dst_u]) \n"

	491 "swl $t2, 7(%[dst_u]) \n"

	492 "swr $t3, 8(%[dst_v]) \n"

	493 "swl $t3, 11(%[dst_v]) \n"

	494 "swr $t5, 8(%[dst_u]) \n"

	495 "swl $t5, 11(%[dst_u]) \n"

	496 "swr $t6, 12(%[dst_v]) \n"

	497 "swl $t6, 15(%[dst_v]) \n"

	498 "swr $t7, 12(%[dst_u]) \n"

	499 "swl $t7, 15(%[dst_u]) \n"

	500 "addiu %[dst_u], %[dst_u], 16 \n"

	501 "bgtz $t4, 1b \n"

	502 " addiu %[dst_v], %[dst_v], 16 \n"

	503

	504 "4: beqz %[width], 3f \n" // nothing left over: done

	505 " nop \n"

	506

	507 "2: \n"

	508 "lbu $t0, 0(%[src_uv]) \n"

	509 "lbu $t1, 1(%[src_uv]) \n"

	510 "addiu %[src_uv], %[src_uv], 2 \n"

	511 "addiu %[width], %[width], -1 \n"

	512 "sb $t0, 0(%[dst_u]) \n"

	513 "sb $t1, 0(%[dst_v]) \n"

	514 "addiu %[dst_u], %[dst_u], 1 \n"

	515 "bgtz %[width], 2b \n"

	516 " addiu %[dst_v], %[dst_v], 1 \n"

	517

	518 "3: \n"

	519 ".set pop \n"

	520 : [src_uv] "+r" (src_uv),

	521 [width] "+r" (width),

	522 [dst_u] "+r" (dst_u),

	523 [dst_v] "+r" (dst_v)

	524 :

	525 : "t0", "t1", "t2", "t3",

	526 "t4", "t5", "t6", "t7", "t8", "t9", "memory"

	527 );

	528 }

	529

	// Writes the |width| bytes of |src| to |dst| in reverse order. src is first
	// advanced to its end; the main loop then reverses 16 bytes per iteration
	// (wsbh + rotr 16 reverse the bytes of each word) using lw/sw, and the
	// remaining width & 15 bytes are copied one at a time.
	// Review fixes:
	//  - the byte loop ended with "bgez" after decrementing the counter, so it
	//    ran one time too many (one extra byte read and written); it now uses
	//    "bgtz", like the other residual loops in this file.
	//  - when there is no full group of 16 the residual is checked before the
	//    byte loop is entered, so width == 0 copies nothing.
	//  - a "memory" clobber is declared for the loads and stores.

	530 void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {

	531 __asm__ __volatile__ (

	532 ".set push \n"

	533 ".set noreorder \n"

	534

	535 "srl $t4, %[width], 4 \n" // multiples of 16

	536 "andi $t5, %[width], 0xf \n" // residual

	537 "blez $t4, 4f \n" // no full group: go to the residual check

	538 " addu %[src], %[src], %[width] \n" // src += width

	539

	540 ".p2align 2 \n"

	541 "1: \n"

	542 "lw $t0, -16(%[src]) \n" // |3|2|1|0|

	543 "lw $t1, -12(%[src]) \n" // |7|6|5|4|

	544 "lw $t2, -8(%[src]) \n" // |11|10|9|8|

	545 "lw $t3, -4(%[src]) \n" // |15|14|13|12|

	546 "wsbh $t0, $t0 \n" // |2|3|0|1|

	547 "wsbh $t1, $t1 \n" // |6|7|4|5|

	548 "wsbh $t2, $t2 \n" // |10|11|8|9|

	549 "wsbh $t3, $t3 \n" // |14|15|12|13|

	550 "rotr $t0, $t0, 16 \n" // |0|1|2|3|

	551 "rotr $t1, $t1, 16 \n" // |4|5|6|7|

	552 "rotr $t2, $t2, 16 \n" // |8|9|10|11|

	553 "rotr $t3, $t3, 16 \n" // |12|13|14|15|

	554 "addiu %[src], %[src], -16 \n"

	555 "addiu $t4, $t4, -1 \n"

	556 "sw $t3, 0(%[dst]) \n" // |15|14|13|12|

	557 "sw $t2, 4(%[dst]) \n" // |11|10|9|8|

	558 "sw $t1, 8(%[dst]) \n" // |7|6|5|4|

	559 "sw $t0, 12(%[dst]) \n" // |3|2|1|0|

	560 "bgtz $t4, 1b \n"

	561 " addiu %[dst], %[dst], 16 \n"

	562 "4: beqz $t5, 3f \n" // nothing left over: done

	563 " nop \n"

	564

	565 "2: \n"

	566 "lbu $t0, -1(%[src]) \n"

	567 "addiu $t5, $t5, -1 \n"

	568 "addiu %[src], %[src], -1 \n"

	569 "sb $t0, 0(%[dst]) \n"

	570 "bgtz $t5, 2b \n" // loop while residual bytes remain

	571 " addiu %[dst], %[dst], 1 \n"

	572

	573 "3: \n"

	574 ".set pop \n"

	575 : [src] "+r" (src), [dst] "+r" (dst)

	576 : [width] "r" (width)

	577 : "t0", "t1", "t2", "t3", "t4", "t5", "memory"

	578 );

	579 }

	580

	// Reads |width| interleaved byte pairs from |src_uv| back to front and
	// writes the first byte of each pair to |dst_u| and the second to |dst_v|.
	// src_uv is first advanced by 2 * width bytes. The main loop consumes 32
	// source bytes (16 pairs) per iteration with lw loads and swr/swl stores;
	// the remaining width & 15 pairs are handled one pair at a time.
	// Review fixes:
	//  - $t6 is written by the asm and is now listed as clobbered.
	//  - when there is no full group of 16 the residual is checked before the
	//    pair loop is entered, so width == 0 copies nothing.
	//  - a "memory" clobber is declared for the loads and stores.

	581 void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

	582 int width) {

	583 int x = 0;

	584 int y = 0;

	585 __asm__ __volatile__ (

	586 ".set push \n"

	587 ".set noreorder \n"

	588

	589 "addu $t4, %[width], %[width] \n" // t4 = 2 * width (bytes in src_uv)

	590 "srl %[x], %[width], 4 \n" // x = groups of 16 pairs

	591 "andi %[y], %[width], 0xf \n" // y = residual pairs

	592 "blez %[x], 4f \n" // no full group: go to the residual check

	593 " addu %[src_uv], %[src_uv], $t4 \n"

	594

	595 ".p2align 2 \n"

	596 "1: \n"

	597 "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|

	598 "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|

	599 "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|

	600 "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|

	601 "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|

	602 "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|

	603 "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|

	604 "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|

	605

	606 "rotr $t0, $t0, 16 \n" // |1|0|3|2|

	607 "rotr $t1, $t1, 16 \n" // |5|4|7|6|

	608 "rotr $t2, $t2, 16 \n" // |9|8|11|10|

	609 "rotr $t3, $t3, 16 \n" // |13|12|15|14|

	610 "rotr $t4, $t4, 16 \n" // |17|16|19|18|

	611 "rotr $t6, $t6, 16 \n" // |21|20|23|22|

	612 "rotr $t7, $t7, 16 \n" // |25|24|27|26|

	613 "rotr $t8, $t8, 16 \n" // |29|28|31|30|

	614 "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|

	615 "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|

	616 "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|

	617 "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|

	618 "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|

	619 "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|

	620 "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|

	621 "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|

	622 "addiu %[src_uv], %[src_uv], -32 \n"

	623 "addiu %[x], %[x], -1 \n"

	624 "swr $t4, 0(%[dst_u]) \n"

	625 "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|

	626 "swr $t6, 0(%[dst_v]) \n"

	627 "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|

	628 "swr $t2, 4(%[dst_u]) \n"

	629 "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|

	630 "swr $t3, 4(%[dst_v]) \n"

	631 "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|

	632 "swr $t0, 8(%[dst_u]) \n"

	633 "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|

	634 "swr $t1, 8(%[dst_v]) \n"

	635 "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|

	636 "swr $t9, 12(%[dst_u]) \n"

	637 "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|

	638 "swr $t5, 12(%[dst_v]) \n"

	639 "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|

	640 "addiu %[dst_v], %[dst_v], 16 \n"

	641 "bgtz %[x], 1b \n"

	642 " addiu %[dst_u], %[dst_u], 16 \n"

	643 "4: beqz %[y], 3f \n" // nothing left over: done

	644 " nop \n"

	645 "b 2f \n"

	646 " nop \n"

	647

	648 "2: \n"

	649 "lbu $t0, -2(%[src_uv]) \n"

	650 "lbu $t1, -1(%[src_uv]) \n"

	651 "addiu %[src_uv], %[src_uv], -2 \n"

	652 "addiu %[y], %[y], -1 \n"

	653 "sb $t0, 0(%[dst_u]) \n"

	654 "sb $t1, 0(%[dst_v]) \n"

	655 "addiu %[dst_u], %[dst_u], 1 \n"

	656 "bgtz %[y], 2b \n"

	657 " addiu %[dst_v], %[dst_v], 1 \n"

	658

	659 "3: \n"

	660 ".set pop \n"

	661 : [src_uv] "+r" (src_uv),

	662 [dst_u] "+r" (dst_u),

	663 [dst_v] "+r" (dst_v),

	664 [x] "=&r" (x),

	665 [y] "+r" (y)

	666 : [width] "r" (width)

	667 : "t0", "t1", "t2", "t3", "t4",

	668 "t5", "t6", "t7", "t8", "t9", "memory"

	669 );

	670 }

	671

	672 // Assembly fragment shared by the I422To*Row_MIPS_DSPR2 functions below.

	// It loads 4 Y bytes from y_buf and 2 bytes each from u_buf and v_buf,
	// advances u_buf and v_buf by 2 (y_buf is advanced by the caller), and
	// leaves the converted values as two halfwords per register:
	673 // t5 = | 0 | B0 | 0 | b0 |

	674 // t4 = | 0 | B1 | 0 | b1 |

	675 // t9 = | 0 | G0 | 0 | g0 |

	676 // t8 = | 0 | G1 | 0 | g1 |

	677 // t2 = | 0 | R0 | 0 | r0 |

	678 // t1 = | 0 | R1 | 0 | r1 |

	// The fragment reads $s0..$s5, which the caller must have loaded beforehand
	// (s0..s3 are the multipliers, s4 is subtracted from Y, s5 from U and V and
	// is also used in the final clamp sequence). It overwrites t0-t6, t8 and t9.
	679 #define I422ToTransientMipsRGB \

	680 "lw $t0, 0(%[y_buf]) \n" \

	681 "lhu $t1, 0(%[u_buf]) \n" \

	682 "lhu $t2, 0(%[v_buf]) \n" \

	683 "preceu.ph.qbr $t1, $t1 \n" \

	684 "preceu.ph.qbr $t2, $t2 \n" \

	685 "preceu.ph.qbra $t3, $t0 \n" \

	686 "preceu.ph.qbla $t0, $t0 \n" \

	687 "subu.ph $t1, $t1, $s5 \n" \

	688 "subu.ph $t2, $t2, $s5 \n" \

	689 "subu.ph $t3, $t3, $s4 \n" \

	690 "subu.ph $t0, $t0, $s4 \n" \

	691 "mul.ph $t3, $t3, $s0 \n" \

	692 "mul.ph $t0, $t0, $s0 \n" \

	693 "shll.ph $t4, $t1, 0x7 \n" \

	694 "subu.ph $t4, $t4, $t1 \n" \

	695 "mul.ph $t6, $t1, $s1 \n" \

	696 "mul.ph $t1, $t2, $s2 \n" \

	697 "addq_s.ph $t5, $t4, $t3 \n" \

	698 "addq_s.ph $t4, $t4, $t0 \n" \

	699 "shra.ph $t5, $t5, 6 \n" \

	700 "shra.ph $t4, $t4, 6 \n" \

	701 "addiu %[u_buf], 2 \n" \

	702 "addiu %[v_buf], 2 \n" \

	703 "addu.ph $t6, $t6, $t1 \n" \

	704 "mul.ph $t1, $t2, $s3 \n" \

	705 "addu.ph $t9, $t6, $t3 \n" \

	706 "addu.ph $t8, $t6, $t0 \n" \

	707 "shra.ph $t9, $t9, 6 \n" \

	708 "shra.ph $t8, $t8, 6 \n" \

	709 "addu.ph $t2, $t1, $t3 \n" \

	710 "addu.ph $t1, $t1, $t0 \n" \

	711 "shra.ph $t2, $t2, 6 \n" \

	712 "shra.ph $t1, $t1, 6 \n" \

	713 "subu.ph $t5, $t5, $s5 \n" \

	714 "subu.ph $t4, $t4, $s5 \n" \

	715 "subu.ph $t9, $t9, $s5 \n" \

	716 "subu.ph $t8, $t8, $s5 \n" \

	717 "subu.ph $t2, $t2, $s5 \n" \

	718 "subu.ph $t1, $t1, $s5 \n" \

	719 "shll_s.ph $t5, $t5, 8 \n" \

	720 "shll_s.ph $t4, $t4, 8 \n" \

	721 "shll_s.ph $t9, $t9, 8 \n" \

	722 "shll_s.ph $t8, $t8, 8 \n" \

	723 "shll_s.ph $t2, $t2, 8 \n" \

	724 "shll_s.ph $t1, $t1, 8 \n" \

	725 "shra.ph $t5, $t5, 8 \n" \

	726 "shra.ph $t4, $t4, 8 \n" \

	727 "shra.ph $t9, $t9, 8 \n" \

	728 "shra.ph $t8, $t8, 8 \n" \

	729 "shra.ph $t2, $t2, 8 \n" \

	730 "shra.ph $t1, $t1, 8 \n" \

	731 "addu.ph $t5, $t5, $s5 \n" \

	732 "addu.ph $t4, $t4, $s5 \n" \

	733 "addu.ph $t9, $t9, $s5 \n" \

	734 "addu.ph $t8, $t8, $s5 \n" \

	735 "addu.ph $t2, $t2, $s5 \n" \

	736 "addu.ph $t1, $t1, $s5 \n"

	737

	// Converts one row of I422 (y_buf, u_buf, v_buf) to 32-bit pixels in
	// rgb_buf, 4 pixels (16 output bytes) per loop iteration. Each stored word
	// is laid out as |ff|R|G|B| from most to least significant byte.
	// width == 0 returns immediately. Otherwise width is decremented by 4 per
	// iteration and the loop stops only when it reaches exactly 0, so width
	// must be a multiple of 4. The loads and stores use lw/lhu/sw.
	// Review fix: a "memory" clobber is declared, because the asm reads and
	// writes memory through its pointer operands.

	738 void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,

	739 const uint8* u_buf,

	740 const uint8* v_buf,

	741 uint8* rgb_buf,

	742 int width) {

	743 __asm__ __volatile__ (

	744 ".set push \n"

	745 ".set noreorder \n"

	746 "beqz %[width], 2f \n"

	747 " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|

	748 "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|

	749 "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|

	750 "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|

	751 "repl.ph $s4, 16 \n" // |0|16|0|16|

	752 "repl.ph $s5, 128 \n" // |128|128| // clipping

	753 "lui $s6, 0xff00 \n"

	754 "ori $s6, 0xff00 \n" // |ff|00|ff|00|

	755

	756 ".p2align 2 \n"

	757 "1: \n"

	758 I422ToTransientMipsRGB

	759 // Arranging into argb format

	760 "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|

	761 "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|

	762 "addiu %[width], -4 \n"

	763 "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|

	764 "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|

	765 "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|

	766

	767 "addiu %[y_buf], 4 \n"

	768 "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|

	769 "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|

	770 "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|

	771 "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|

	772 "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|

	773 "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|

	774 "sll $t9, $t9, 16 \n"

	775 "sll $t8, $t8, 16 \n"

	776 "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|

	777 "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|

	778 // Store results.

	779 "sw $t2, 0(%[rgb_buf]) \n"

	780 "sw $t0, 4(%[rgb_buf]) \n"

	781 "sw $t1, 8(%[rgb_buf]) \n"

	782 "sw $t3, 12(%[rgb_buf]) \n"

	783 "bnez %[width], 1b \n"

	784 " addiu %[rgb_buf], 16 \n"

	785 "2: \n"

	786 ".set pop \n"

	787 :[y_buf] "+r" (y_buf),

	788 [u_buf] "+r" (u_buf),

	789 [v_buf] "+r" (v_buf),

	790 [width] "+r" (width),

	791 [rgb_buf] "+r" (rgb_buf)

	792 :

	793 : "t0", "t1", "t2", "t3", "t4", "t5",

	794 "t6", "t7", "t8", "t9",

	795 "s0", "s1", "s2", "s3",

	796 "s4", "s5", "s6", "memory"

	797 );

	798 }

	799

	// Converts one row of I422 (y_buf, u_buf, v_buf) to 32-bit pixels in
	// rgb_buf, 4 pixels (16 output bytes) per loop iteration. Each stored word
	// is laid out as |ff|B|G|R| from most to least significant byte.
	// width == 0 returns immediately. Otherwise width is decremented by 4 per
	// iteration and the loop stops only when it reaches exactly 0, so width
	// must be a multiple of 4. The loads and stores use lw/lhu/sw.
	// Review fix: a "memory" clobber is declared, because the asm reads and
	// writes memory through its pointer operands.

	800 void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,

	801 const uint8* u_buf,

	802 const uint8* v_buf,

	803 uint8* rgb_buf,

	804 int width) {

	805 __asm__ __volatile__ (

	806 ".set push \n"

	807 ".set noreorder \n"

	808 "beqz %[width], 2f \n"

	809 " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|

	810 "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|

	811 "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|

	812 "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|

	813 "repl.ph $s4, 16 \n" // |0|16|0|16|

	814 "repl.ph $s5, 128 \n" // |128|128|

	815 "lui $s6, 0xff00 \n"

	816 "ori $s6, 0xff00 \n" // |ff|00|ff|00|

	817

	818 ".p2align 2 \n"

	819 "1: \n"

	820 I422ToTransientMipsRGB

	821 // Arranging into abgr format

	822 "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|

	823 "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|

	824 "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|

	825 "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|

	826

	827 "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|

	828 "addiu %[width], -4 \n"

	829 "addiu %[y_buf], 4 \n"

	830 "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|

	831 "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|

	832 "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|

	833 "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|

	834 "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|

	835 "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|

	836 "sll $t9, $t9, 16 \n"

	837 "sll $t8, $t8, 16 \n"

	838 "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|

	839 "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|

	840 // Store results.

	841 "sw $t2, 0(%[rgb_buf]) \n"

	842 "sw $t0, 4(%[rgb_buf]) \n"

	843 "sw $t1, 8(%[rgb_buf]) \n"

	844 "sw $t3, 12(%[rgb_buf]) \n"

	845 "bnez %[width], 1b \n"

	846 " addiu %[rgb_buf], 16 \n"

	847 "2: \n"

	848 ".set pop \n"

	849 :[y_buf] "+r" (y_buf),

	850 [u_buf] "+r" (u_buf),

	851 [v_buf] "+r" (v_buf),

	852 [width] "+r" (width),

	853 [rgb_buf] "+r" (rgb_buf)

	854 :

	855 : "t0", "t1", "t2", "t3", "t4", "t5",

	856 "t6", "t7", "t8", "t9",

	857 "s0", "s1", "s2", "s3",

	858 "s4", "s5", "s6", "memory"

	859 );

	860 }

	861

	// Converts one row of I422 (y_buf, u_buf, v_buf) to 32-bit pixels in
	// rgb_buf, 4 pixels (16 output bytes) per loop iteration. Each stored word
	// is laid out as |B|G|R|ff| from most to least significant byte.
	// width == 0 returns immediately. Otherwise width is decremented by 4 per
	// iteration and the loop stops only when it reaches exactly 0, so width
	// must be a multiple of 4. The loads and stores use lw/lhu/sw.
	// Review fix: a "memory" clobber is declared, because the asm reads and
	// writes memory through its pointer operands.

	862 void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,

	863 const uint8* u_buf,

	864 const uint8* v_buf,

	865 uint8* rgb_buf,

	866 int width) {

	867 __asm__ __volatile__ (

	868 ".set push \n"

	869 ".set noreorder \n"

	870 "beqz %[width], 2f \n"

	871 " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |

	872 "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|

	873 "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|

	874 "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|

	875 "repl.ph $s4, 16 \n" // |0|16|0|16|

	876 "repl.ph $s5, 128 \n" // |128|128|

	877 "lui $s6, 0xff \n"

	878 "ori $s6, 0xff \n" // |00|ff|00|ff|

	879

	880 ".p2align 2 \n"

	881 "1: \n"

	882 I422ToTransientMipsRGB

	883 // Arranging into bgra format

	884 "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|

	885 "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|

	886 "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|

	887 "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|

	888

	889 "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|

	890 "addiu %[width], -4 \n"

	891 "addiu %[y_buf], 4 \n"

	892 "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|

	893 "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|

	894 "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |

	895 "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |

	896 "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|

	897 "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|

	898 "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|

	899 "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|

	900 "sll $t1, $t1, 16 \n"

	901 "sll $t2, $t2, 16 \n"

	902 "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|

	903 "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|

	904 // Store results.

	905 "sw $t2, 0(%[rgb_buf]) \n"

	906 "sw $t0, 4(%[rgb_buf]) \n"

	907 "sw $t1, 8(%[rgb_buf]) \n"

	908 "sw $t3, 12(%[rgb_buf]) \n"

	909 "bnez %[width], 1b \n"

	910 " addiu %[rgb_buf], 16 \n"

	911 "2: \n"

	912 ".set pop \n"

	913 :[y_buf] "+r" (y_buf),

	914 [u_buf] "+r" (u_buf),

	915 [v_buf] "+r" (v_buf),

	916 [width] "+r" (width),

	917 [rgb_buf] "+r" (rgb_buf)

	918 :

	919 : "t0", "t1", "t2", "t3", "t4", "t5",

	920 "t6", "t7", "t8", "t9",

	921 "s0", "s1", "s2", "s3",

	922 "s4", "s5", "s6", "memory"

	923 );

	924 }

	925

	926 // Bilinear filter 8x2 -> 8x1

	// Blends the row at src_ptr with the row at src_ptr + src_stride into
	// dst_ptr. Each output byte is
	//   (row0 * (256 - source_y_fraction) + row1 * source_y_fraction) >> 8,
	// computed 8 bytes per iteration with word loads and stores (lw/sw).
	// The loop body always runs at least once and dst_width drops by 8 per
	// iteration until it is <= 0.
	// NOTE(review): this implies dst_width should be a positive multiple of 8
	// and the pointers word-aligned - confirm against the callers.
	// Review fixes: a "memory" clobber is declared for the loads and stores,
	// and the src_stride asm input, which the template never referenced, is
	// dropped (src_stride is still used to compute src_ptr1).

	927 void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,

	928 ptrdiff_t src_stride, int dst_width,

	929 int source_y_fraction) {

	930 int y0_fraction = 256 - source_y_fraction;

	931 const uint8* src_ptr1 = src_ptr + src_stride;

	932

	933 __asm__ __volatile__ (

	934 ".set push \n"

	935 ".set noreorder \n"

	936

	937 "replv.ph $t0, %[y0_fraction] \n" // t0 = weight of the first row

	938 "replv.ph $t1, %[source_y_fraction] \n" // t1 = weight of the second row

	939

	940 ".p2align 2 \n"

	941 "1: \n"

	942 "lw $t2, 0(%[src_ptr]) \n"

	943 "lw $t3, 0(%[src_ptr1]) \n"

	944 "lw $t4, 4(%[src_ptr]) \n"

	945 "lw $t5, 4(%[src_ptr1]) \n"

	946 "muleu_s.ph.qbl $t6, $t2, $t0 \n"

	947 "muleu_s.ph.qbr $t7, $t2, $t0 \n"

	948 "muleu_s.ph.qbl $t8, $t3, $t1 \n"

	949 "muleu_s.ph.qbr $t9, $t3, $t1 \n"

	950 "muleu_s.ph.qbl $t2, $t4, $t0 \n"

	951 "muleu_s.ph.qbr $t3, $t4, $t0 \n"

	952 "muleu_s.ph.qbl $t4, $t5, $t1 \n"

	953 "muleu_s.ph.qbr $t5, $t5, $t1 \n"

	954 "addq.ph $t6, $t6, $t8 \n"

	955 "addq.ph $t7, $t7, $t9 \n"

	956 "addq.ph $t2, $t2, $t4 \n"

	957 "addq.ph $t3, $t3, $t5 \n"

	958 "shra.ph $t6, $t6, 8 \n"

	959 "shra.ph $t7, $t7, 8 \n"

	960 "shra.ph $t2, $t2, 8 \n"

	961 "shra.ph $t3, $t3, 8 \n"

	962 "precr.qb.ph $t6, $t6, $t7 \n"

	963 "precr.qb.ph $t2, $t2, $t3 \n"

	964 "addiu %[src_ptr], %[src_ptr], 8 \n"

	965 "addiu %[src_ptr1], %[src_ptr1], 8 \n"

	966 "addiu %[dst_width], %[dst_width], -8 \n"

	967 "sw $t6, 0(%[dst_ptr]) \n"

	968 "sw $t2, 4(%[dst_ptr]) \n"

	969 "bgtz %[dst_width], 1b \n"

	970 " addiu %[dst_ptr], %[dst_ptr], 8 \n"

	971

	972 ".set pop \n"

	973 : [dst_ptr] "+r" (dst_ptr),

	974 [src_ptr1] "+r" (src_ptr1),

	975 [src_ptr] "+r" (src_ptr),

	976 [dst_width] "+r" (dst_width)

	977 : [source_y_fraction] "r" (source_y_fraction),

	978 [y0_fraction] "r" (y0_fraction)

	979

	980 : "t0", "t1", "t2", "t3", "t4", "t5",

	981 "t6", "t7", "t8", "t9", "memory"

	982 );

	983 }

	984 #endif // __mips_dsp_rev >= 2

	985

	986 #endif // defined(__mips__)

	987

	988 #ifdef __cplusplus

	989 } // extern "C"

	990 } // namespace libyuv

	991 #endif

OLD	NEW

« no previous file with comments | « source/libvpx/third_party/libyuv/source/row_common.cc ('k') | source/libvpx/third_party/libyuv/source/row_neon.cc » ('j') | no next file with comments »