OLD | NEW |
(Empty) | |
| 1 // Copyright 2014 Google Inc. All Rights Reserved. |
| 2 // |
| 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- |
| 9 // |
| 10 // MIPS version of dsp functions |
| 11 // |
| 12 // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) |
| 13 // Jovan Zelincevic (jovan.zelincevic@imgtec.com) |
| 14 |
| 15 #include "./dsp.h" |
| 16 |
| 17 #if defined(WEBP_USE_MIPS_DSP_R2) |
| 18 |
| 19 #include "./mips_macro.h" |
| 20 |
// Fixed-point constants of the VP8 inverse transform, used through MUL().
// NOTE(review): presumably sqrt(2)*cos(pi/8) and sqrt(2)*sin(pi/8) in 16-bit
// fixed point, as in the reference transform — confirm against the C version.
// kC1 has (1 << 16) folded in so that MUL(x, kC1) yields x + ((x*20091) >> 16).
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;

// 16.16 fixed-point multiply: keep the high 16 bits of the 32-bit product.
#define MUL(a, b) (((a) * (b)) >> 16)
| 25 |
// Inverse transform of a DC-only coefficient block: adds the rounded DC
// value ((in[0] + 4) >> 3) to every pixel of the 4x4 block at 'dst', with
// saturation. Destination rows are BPS bytes apart.
static void TransformDC(const int16_t* in, uint8_t* dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;

  __asm__ volatile (
    // Load the four 4-byte destination rows at dst + {0,1,2,3} * BPS.
    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    "lh               %[temp5],  0(%[in])               \n\t"
    "addiu            %[temp5],  %[temp5],  4           \n\t"  // rounding bias
    "ins              %[temp5],  %[temp5],  16, 16      \n\t"  // duplicate DC in both halves
    "shra.ph          %[temp5],  %[temp5],  3           \n\t"  // (DC + 4) >> 3 per half
    // Widen the destination bytes to halfwords, add the DC pair to each,
    // saturate back to bytes and store the four rows.
    CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
                            temp3, temp1, temp2, temp3, temp4)
    STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_10()
    : [in]"r"(in), [dst]"r"(dst)
    : "memory"
  );
}
| 49 |
// Inverse transform for a sparse coefficient block where only in[0] (DC),
// in[1] (first AC of the top row) and in[4] (first AC of the left column)
// are non-zero. The row/column contributions c1/d1 and c4/d4 are computed
// in C with MUL() and combined per-pixel in DSP-r2 paired-halfword asm,
// then added to the 4x4 block at 'dst' with saturation.
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const int a = in[0] + 4;            // biased DC (rounding for the >>3)
  int c4 = MUL(in[4], kC2);
  const int d4 = MUL(in[4], kC1);
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

  __asm__ volatile (
    "ins              %[c4],      %[d4],     16,       16    \n\t"  // pack (d4, c4)
    "replv.ph         %[temp1],   %[a]                       \n\t"
    "replv.ph         %[temp4],   %[d1]                      \n\t"
    ADD_SUB_HALVES(temp2, temp3, temp1, c4)                         // a +/- {c4,d4}
    "replv.ph         %[temp5],   %[c1]                      \n\t"
    SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
                   temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
    // Load destination rows, widen to halfwords, add the reconstructed
    // residuals, saturate and store back.
    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
                            temp11, temp17, temp3, temp5, temp11, temp12)
    PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
                          temp4, temp7, temp6, temp10, temp9)
    STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
                     temp17, temp12, temp18, temp1, temp8, temp2, temp4,
                     temp7, temp6, dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_18(),
      [c4]"+&r"(c4)
    : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
    : "memory"
  );
}
| 85 |
// Full 4x4 inverse transform of the 16 coefficients at 'in': vertical pass,
// then horizontal pass (with the +4 rounding bias), then the result is added
// to the 4x4 pixel block at 'dst' with saturation. The MUL_SHIFT_SUM macro
// consumes the kC1/kC2 fixed-point constants (hence the "hi"/"lo" clobbers).
static void TransformOne(const int16_t* in, uint8_t* dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

  __asm__ volatile (
    // vertical pass: rows of 'in' are 8 bytes (4 coefficients) apart;
    // butterflies via ADD_SUB_HALVES, rotations via MUL_SHIFT_SUM.
    "ulw              %[temp1],   0(%[in])                 \n\t"
    "ulw              %[temp2],   16(%[in])                \n\t"
    LOAD_IN_X2(temp5, temp6, 24, 26)
    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
    LOAD_IN_X2(temp1, temp2, 8, 10)
    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
                  temp13, temp11, temp14, temp12)
    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
    "ulw              %[temp17],  4(%[in])                 \n\t"
    "ulw              %[temp18],  20(%[in])                \n\t"
    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
    LOAD_IN_X2(temp17, temp18, 12, 14)
    LOAD_IN_X2(temp9, temp10, 28, 30)
    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
                  temp15, temp4, temp16, temp17)
    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)

    // horizontal pass: add the rounding bias of 4, redo the butterflies,
    // then shift right and add the result to the destination pixels.
    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
    "repl.ph          %[temp2],   0x4                      \n\t"  // rounding bias
    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
                  temp6, temp17, temp8, temp18)
    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
                  temp18, temp12, temp17, temp16)
    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
                   temp6)
    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
                          temp16, temp11, temp10, temp15, temp14)
    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
                            temp11, temp10, temp11, temp14, temp15)
    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_18()
    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
    : "memory", "hi", "lo"
  );
}
| 152 |
| 153 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { |
| 154 TransformOne(in, dst); |
| 155 if (do_two) { |
| 156 TransformOne(in + 16, dst + 4); |
| 157 } |
| 158 } |
| 159 |
// Complex in-loop filter for macroblock edges. Walks 'size' pixel lines
// along the edge ('vstride' bytes apart). For each line it loads the eight
// pixels p3..q3 straddling the edge ('hstride' bytes apart, with 'p'
// pointing at q0), applies the thresh2/ithresh smoothness tests, then:
//   - high edge variance (label after beqz): 2-point filter, only the two
//     pixels p0/q0 next to the edge are rewritten;
//   - otherwise (label 4): strong 6-point filter, p2..q2 are rewritten.
// Label 3 skips to the next pixel line. VP8kclip1 is the clipping LUT.
// NOTE(review): register roles mirror the generic C DoFilter2/DoFilter6 —
// verify against the reference implementation before modifying.
static WEBP_INLINE void FilterLoop26(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
  const int thresh2 = 2 * thresh + 1;   // filter-limit form of 'thresh'
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15;

  __asm__ volatile (
    ".set      push                                      \n\t"
    ".set      noreorder                                 \n\t"
    // per-pixel-line loop ----------------------------------------------
  "1:                                                    \n\t"
    "negu      %[temp1],  %[hstride]                     \n\t"
    "addiu     %[size],   %[size],        -1             \n\t"
    "sll       %[temp2],  %[hstride],     1              \n\t"
    "sll       %[temp3],  %[temp1],       1              \n\t"
    "addu      %[temp4],  %[temp2],       %[hstride]     \n\t"
    "addu      %[temp5],  %[temp3],       %[temp1]       \n\t"
    "lbu       %[temp7],  0(%[p])                        \n\t"  // q0
    "sll       %[temp6],  %[temp3],       1              \n\t"
    "lbux      %[temp8],  %[temp5](%[p])                 \n\t"  // p2
    "lbux      %[temp9],  %[temp3](%[p])                 \n\t"  // p1
    "lbux      %[temp10], %[temp1](%[p])                 \n\t"  // p0
    "lbux      %[temp11], %[temp6](%[p])                 \n\t"  // p3
    "lbux      %[temp12], %[hstride](%[p])               \n\t"  // q1
    "lbux      %[temp13], %[temp2](%[p])                 \n\t"  // q2
    "lbux      %[temp14], %[temp4](%[p])                 \n\t"  // q3
    "subu      %[temp1],  %[temp10],      %[temp7]       \n\t"  // p0 - q0
    "subu      %[temp2],  %[temp9],       %[temp12]      \n\t"  // p1 - q1
    "absq_s.w  %[temp3],  %[temp1]                       \n\t"
    "absq_s.w  %[temp4],  %[temp2]                       \n\t"
    "negu      %[temp1],  %[temp1]                       \n\t"
    "sll       %[temp3],  %[temp3],       2              \n\t"
    "addu      %[temp15], %[temp3],       %[temp4]       \n\t"  // 4*|p0-q0| + |p1-q1|
    "subu      %[temp3],  %[temp15],      %[thresh2]     \n\t"
    "sll       %[temp6],  %[temp1],       1              \n\t"
    "bgtz      %[temp3],  3f                             \n\t"  // above limit: skip
    " subu     %[temp4],  %[temp11],      %[temp8]       \n\t"
    // inner-threshold tests on all neighboring deltas vs. ithresh -------
    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
    "shll_s.w  %[temp2],  %[temp2],       24             \n\t"
    "subu      %[temp4],  %[temp4],       %[ithresh]     \n\t"
    "bgtz      %[temp4],  3f                             \n\t"
    " subu     %[temp3],  %[temp8],       %[temp9]       \n\t"
    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp5],  %[temp9],       %[temp10]      \n\t"
    "absq_s.w  %[temp3],  %[temp5]                       \n\t"
    "absq_s.w  %[temp5],  %[temp5]                       \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp3],  %[temp14],      %[temp13]      \n\t"
    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    "slt       %[temp5],  %[hev_thresh],  %[temp5]       \n\t"  // hev(p1)
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp3],  %[temp13],      %[temp12]      \n\t"
    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    "sra       %[temp4],  %[temp2],       24             \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp15], %[temp12],      %[temp7]       \n\t"
    "absq_s.w  %[temp3],  %[temp15]                      \n\t"
    "absq_s.w  %[temp15], %[temp15]                      \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " slt      %[temp15], %[hev_thresh],  %[temp15]      \n\t"  // hev(q1)
    "addu      %[temp3],  %[temp6],       %[temp1]       \n\t"  // 3*(q0-p0)
    "or        %[temp2],  %[temp5],       %[temp15]      \n\t"  // any high edge variance?
    "addu      %[temp5],  %[temp4],       %[temp3]       \n\t"
    "beqz      %[temp2],  4f                             \n\t"  // no hev: strong filter
    " shra_r.w %[temp1],  %[temp5],       3              \n\t"
    // high edge variance: 2-point filter on p0/q0 only -----------------
    "addiu     %[temp2],  %[temp5],       3              \n\t"
    "sra       %[temp2],  %[temp2],       3              \n\t"
    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"  // saturate to 5 bits
    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"
    "subu      %[temp3],  %[p],           %[hstride]     \n\t"
    "sra       %[temp1],  %[temp1],       27             \n\t"
    "sra       %[temp2],  %[temp2],       27             \n\t"
    "subu      %[temp1],  %[temp7],       %[temp1]       \n\t"  // q0 adjusted
    "addu      %[temp2],  %[temp10],      %[temp2]       \n\t"  // p0 adjusted
    "lbux      %[temp2],  %[temp2](%[VP8kclip1])         \n\t"
    "lbux      %[temp1],  %[temp1](%[VP8kclip1])         \n\t"
    "sb        %[temp2],  0(%[temp3])                    \n\t"
    "j         3f                                        \n\t"
    " sb       %[temp1],  0(%[p])                        \n\t"
    // low variance: strong 6-point filter, p2..q2 all rewritten --------
  "4:                                                    \n\t"
    "shll_s.w  %[temp5],  %[temp5],       24             \n\t"
    "subu      %[temp14], %[p],           %[hstride]     \n\t"
    "subu      %[temp11], %[temp14],      %[hstride]     \n\t"
    "sra       %[temp6],  %[temp5],       24             \n\t"
    "sll       %[temp1],  %[temp6],       3              \n\t"
    "subu      %[temp15], %[temp11],      %[hstride]     \n\t"
    "addu      %[temp2],  %[temp6],       %[temp1]       \n\t"  // 9*w
    "sll       %[temp3],  %[temp2],       1              \n\t"  // 18*w
    "addu      %[temp4],  %[temp3],       %[temp2]       \n\t"  // 27*w
    "addiu     %[temp2],  %[temp2],       63             \n\t"
    "addiu     %[temp3],  %[temp3],       63             \n\t"
    "addiu     %[temp4],  %[temp4],       63             \n\t"
    "sra       %[temp2],  %[temp2],       7              \n\t"  // (9*w  + 63) >> 7
    "sra       %[temp3],  %[temp3],       7              \n\t"  // (18*w + 63) >> 7
    "sra       %[temp4],  %[temp4],       7              \n\t"  // (27*w + 63) >> 7
    "addu      %[temp1],  %[temp8],       %[temp2]       \n\t"
    "addu      %[temp5],  %[temp9],       %[temp3]       \n\t"
    "addu      %[temp6],  %[temp10],      %[temp4]       \n\t"
    "subu      %[temp8],  %[temp7],       %[temp4]       \n\t"
    "subu      %[temp7],  %[temp12],      %[temp3]       \n\t"
    "addu      %[temp10], %[p],           %[hstride]     \n\t"
    "subu      %[temp9],  %[temp13],      %[temp2]       \n\t"
    "addu      %[temp12], %[temp10],      %[hstride]     \n\t"
    "lbux      %[temp2],  %[temp1](%[VP8kclip1])         \n\t"
    "lbux      %[temp3],  %[temp5](%[VP8kclip1])         \n\t"
    "lbux      %[temp4],  %[temp6](%[VP8kclip1])         \n\t"
    "lbux      %[temp5],  %[temp8](%[VP8kclip1])         \n\t"
    "lbux      %[temp6],  %[temp7](%[VP8kclip1])         \n\t"
    "lbux      %[temp8],  %[temp9](%[VP8kclip1])         \n\t"
    "sb        %[temp2],  0(%[temp15])                   \n\t"  // p2
    "sb        %[temp3],  0(%[temp11])                   \n\t"  // p1
    "sb        %[temp4],  0(%[temp14])                   \n\t"  // p0
    "sb        %[temp5],  0(%[p])                        \n\t"  // q0
    "sb        %[temp6],  0(%[temp10])                   \n\t"  // q1
    "sb        %[temp8],  0(%[temp12])                   \n\t"  // q2
    // advance along the edge -------------------------------------------
  "3:                                                    \n\t"
    "bgtz      %[size],   1b                             \n\t"
    " addu     %[p],      %[p],           %[vstride]     \n\t"
    ".set      pop                                       \n\t"
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
      [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
      [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
      [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
      [size]"+&r"(size), [p]"+&r"(p)
    : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
      [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
      [VP8kclip1]"r"(VP8kclip1)
    : "memory"
  );
}
| 297 |
// Complex in-loop filter for inner edges. Same walk as FilterLoop26 ('p'
// points at q0, pixels across the edge are 'hstride' apart, lines along the
// edge 'vstride' apart), but after the threshold tests it applies either:
//   - 4-point filter (no high edge variance): p1, p0, q0, q1 are rewritten;
//   - 2-point filter (label 1, high edge variance): only p0/q0.
// Label 0 advances to the next pixel line; label 3 is the early exit for
// non-positive 'size'. VP8kclip1 is the clipping LUT.
// NOTE(review): mirrors the generic C DoFilter2/DoFilter4 — verify against
// the reference implementation before modifying.
static WEBP_INLINE void FilterLoop24(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
  int p0, q0, p1, q1, p2, q2, p3, q3;
  int step1, step2, temp1, temp2, temp3, temp4;
  uint8_t* pTemp0;
  uint8_t* pTemp1;
  const int thresh2 = 2 * thresh + 1;   // filter-limit form of 'thresh'

  __asm__ volatile (
    ".set      push                                      \n\t"
    ".set      noreorder                                 \n\t"
    "bltz      %[size],   3f                             \n\t"  // nothing to do
    " nop                                                \n\t"
    // per-pixel-line loop ----------------------------------------------
  "2:                                                    \n\t"
    "negu      %[step1],  %[hstride]                     \n\t"
    "lbu       %[q0],     0(%[p])                        \n\t"
    "lbux      %[p0],     %[step1](%[p])                 \n\t"
    "subu      %[step1],  %[step1],       %[hstride]     \n\t"
    "lbux      %[q1],     %[hstride](%[p])               \n\t"
    "subu      %[temp1],  %[p0],          %[q0]          \n\t"
    "lbux      %[p1],     %[step1](%[p])                 \n\t"
    "addu      %[step2],  %[hstride],     %[hstride]     \n\t"
    "absq_s.w  %[temp2],  %[temp1]                       \n\t"
    "subu      %[temp3],  %[p1],          %[q1]          \n\t"
    "absq_s.w  %[temp4],  %[temp3]                       \n\t"
    "sll       %[temp2],  %[temp2],       2              \n\t"
    "addu      %[temp2],  %[temp2],       %[temp4]       \n\t"  // 4*|p0-q0| + |p1-q1|
    "subu      %[temp4],  %[temp2],       %[thresh2]     \n\t"
    "subu      %[step1],  %[step1],       %[hstride]     \n\t"
    "bgtz      %[temp4],  0f                             \n\t"  // above limit: skip
    " lbux     %[p2],     %[step1](%[p])                 \n\t"
    "subu      %[step1],  %[step1],       %[hstride]     \n\t"
    "lbux      %[q2],     %[step2](%[p])                 \n\t"
    "lbux      %[p3],     %[step1](%[p])                 \n\t"
    // inner-threshold tests on all neighboring deltas vs. ithresh -------
    "subu      %[temp4],  %[p2],          %[p1]          \n\t"
    "addu      %[step2],  %[step2],       %[hstride]     \n\t"
    "subu      %[temp2],  %[p3],          %[p2]          \n\t"
    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
    "absq_s.w  %[temp2],  %[temp2]                       \n\t"
    "lbux      %[q3],     %[step2](%[p])                 \n\t"
    "subu      %[temp4],  %[temp4],       %[ithresh]     \n\t"
    "negu      %[temp1],  %[temp1]                       \n\t"  // q0 - p0
    "bgtz      %[temp4],  0f                             \n\t"
    " subu     %[temp2],  %[temp2],       %[ithresh]     \n\t"
    "subu      %[p3],     %[p1],          %[p0]          \n\t"
    "bgtz      %[temp2],  0f                             \n\t"
    " absq_s.w %[p3],     %[p3]                          \n\t"
    "subu      %[temp4],  %[q3],          %[q2]          \n\t"
    "subu      %[pTemp0], %[p],           %[hstride]     \n\t"  // &p0
    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
    "subu      %[temp2],  %[p3],          %[ithresh]     \n\t"
    "sll       %[step1],  %[temp1],       1              \n\t"
    "bgtz      %[temp2],  0f                             \n\t"
    " subu     %[temp4],  %[temp4],       %[ithresh]     \n\t"
    "subu      %[temp2],  %[q2],          %[q1]          \n\t"
    "bgtz      %[temp4],  0f                             \n\t"
    " absq_s.w %[temp2],  %[temp2]                       \n\t"
    "subu      %[q3],     %[q1],          %[q0]          \n\t"
    "absq_s.w  %[q3],     %[q3]                          \n\t"
    "subu      %[temp2],  %[temp2],       %[ithresh]     \n\t"
    "addu      %[temp1],  %[temp1],       %[step1]       \n\t"  // 3*(q0-p0)
    "bgtz      %[temp2],  0f                             \n\t"
    " subu     %[temp4],  %[q3],          %[ithresh]     \n\t"
    "slt       %[p3],     %[hev_thresh],  %[p3]          \n\t"  // hev(p1)
    "bgtz      %[temp4],  0f                             \n\t"
    " slt      %[q3],     %[hev_thresh],  %[q3]          \n\t"  // hev(q1)
    "or        %[q3],     %[q3],          %[p3]          \n\t"
    "bgtz      %[q3],     1f                             \n\t"  // hev: 2-point filter
    " shra_r.w %[temp2],  %[temp1],       3              \n\t"
    // no high edge variance: 4-point filter, p1/p0/q0/q1 updated -------
    "addiu     %[temp1],  %[temp1],       3              \n\t"
    "sra       %[temp1],  %[temp1],       3              \n\t"
    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"  // saturate to 5 bits
    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"
    "addu      %[pTemp1], %[p],           %[hstride]     \n\t"  // &q1
    "sra       %[temp2],  %[temp2],       27             \n\t"
    "sra       %[temp1],  %[temp1],       27             \n\t"
    "addiu     %[step1],  %[temp2],       1              \n\t"
    "sra       %[step1],  %[step1],       1              \n\t"  // (a + 1) >> 1 for p1/q1
    "addu      %[p0],     %[p0],          %[temp1]       \n\t"
    "addu      %[p1],     %[p1],          %[step1]       \n\t"
    "subu      %[q0],     %[q0],          %[temp2]       \n\t"
    "subu      %[q1],     %[q1],          %[step1]       \n\t"
    "lbux      %[temp2],  %[p0](%[VP8kclip1])            \n\t"
    "lbux      %[temp3],  %[q0](%[VP8kclip1])            \n\t"
    "lbux      %[temp4],  %[q1](%[VP8kclip1])            \n\t"
    "sb        %[temp2],  0(%[pTemp0])                   \n\t"
    "lbux      %[temp1],  %[p1](%[VP8kclip1])            \n\t"
    "subu      %[pTemp0], %[pTemp0],      %[hstride]     \n\t"  // &p1
    "sb        %[temp3],  0(%[p])                        \n\t"
    "sb        %[temp4],  0(%[pTemp1])                   \n\t"
    "j         0f                                        \n\t"
    " sb       %[temp1],  0(%[pTemp0])                   \n\t"
    // high edge variance: 2-point filter on p0/q0 only -----------------
  "1:                                                    \n\t"
    "shll_s.w  %[temp3],  %[temp3],       24             \n\t"  // clamp(p1 - q1)
    "sra       %[temp3],  %[temp3],       24             \n\t"
    "addu      %[temp1],  %[temp1],       %[temp3]       \n\t"  // a = 3*(q0-p0) + clamp(p1-q1)
    "shra_r.w  %[temp2],  %[temp1],       3              \n\t"
    "addiu     %[temp1],  %[temp1],       3              \n\t"
    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"
    "sra       %[temp1],  %[temp1],       3              \n\t"
    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"
    "sra       %[temp2],  %[temp2],       27             \n\t"
    "sra       %[temp1],  %[temp1],       27             \n\t"
    "addu      %[p0],     %[p0],          %[temp1]       \n\t"
    "subu      %[q0],     %[q0],          %[temp2]       \n\t"
    "lbux      %[temp1],  %[p0](%[VP8kclip1])            \n\t"
    "lbux      %[temp2],  %[q0](%[VP8kclip1])            \n\t"
    "sb        %[temp2],  0(%[p])                        \n\t"
    "sb        %[temp1],  0(%[pTemp0])                   \n\t"
    // advance along the edge -------------------------------------------
  "0:                                                    \n\t"
    "subu      %[size],   %[size],        1              \n\t"
    "bgtz      %[size],   2b                             \n\t"
    " addu     %[p],      %[p],           %[vstride]     \n\t"
  "3:                                                    \n\t"
    ".set      pop                                       \n\t"
    : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
      [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
      [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
      [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
      [size]"+&r"(size)
    : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
      [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
      [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    : "memory"
  );
}
| 426 |
// on macroblock edges
// Vertical filter on the top edge of a 16-wide macroblock: 16 columns,
// samples within a column are 'stride' bytes apart.
static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
| 432 |
// Horizontal filter on the left edge of a 16-high macroblock: 16 rows
// ('stride' bytes apart), samples within a row are adjacent.
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
| 437 |
// 8-pixels wide variant, for chroma filtering
// Vertical filter on the top edge of both 8x8 chroma planes 'u' and 'v'.
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
| 444 |
// Horizontal filter on the left edge of both 8x8 chroma planes 'u' and 'v'.
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
| 450 |
| 451 // on three inner edges |
| 452 static void VFilter16i(uint8_t* p, int stride, |
| 453 int thresh, int ithresh, int hev_thresh) { |
| 454 int k; |
| 455 for (k = 3; k > 0; --k) { |
| 456 p += 4 * stride; |
| 457 FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh); |
| 458 } |
| 459 } |
| 460 |
| 461 static void HFilter16i(uint8_t* p, int stride, |
| 462 int thresh, int ithresh, int hev_thresh) { |
| 463 int k; |
| 464 for (k = 3; k > 0; --k) { |
| 465 p += 4; |
| 466 FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh); |
| 467 } |
| 468 } |
| 469 |
// Filters the single inner horizontal edge (below row 4) of both 8x8
// chroma planes.
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
| 475 |
// Filters the single inner vertical edge (at column 4) of both 8x8
// chroma planes.
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
| 481 |
| 482 #undef MUL |
| 483 |
| 484 //------------------------------------------------------------------------------ |
| 485 // Simple In-loop filtering (Paragraph 15.2) |
| 486 |
// Simple filter on a horizontal (macroblock-top) edge of a 16-wide block:
// for each of the 16 columns, the two pixels straddling the edge (p0 at
// p - stride, q0 at p) are adjusted when 4*|p0-q0| + |p1-q1| <= thresh2.
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  uint8_t* p1 = p - stride;   // points at p0, the row above the edge
  __asm__ volatile (
    ".set      push                                      \n\t"
    ".set      noreorder                                 \n\t"
    "li        %[i], 16                                  \n\t"  // 16 columns
  "0:                                                    \n\t"
    "negu      %[temp4],  %[stride]                      \n\t"
    "sll       %[temp5],  %[temp4],       1              \n\t"
    "lbu       %[temp2],  0(%[p])                        \n\t"  // q0
    "lbux      %[temp3],  %[stride](%[p])                \n\t"  // q1
    "lbux      %[temp1],  %[temp4](%[p])                 \n\t"  // p0
    "lbux      %[temp0],  %[temp5](%[p])                 \n\t"  // p1
    "subu      %[temp7],  %[temp1],       %[temp2]       \n\t"  // p0 - q0
    "subu      %[temp6],  %[temp0],       %[temp3]       \n\t"  // p1 - q1
    "absq_s.w  %[temp4],  %[temp7]                       \n\t"
    "absq_s.w  %[temp5],  %[temp6]                       \n\t"
    "sll       %[temp4],  %[temp4],       2              \n\t"
    "subu      %[temp5],  %[temp5],       %[thresh2]     \n\t"
    "addu      %[temp5],  %[temp4],       %[temp5]       \n\t"
    "negu      %[temp8],  %[temp7]                       \n\t"  // q0 - p0
    "bgtz      %[temp5],  1f                             \n\t"  // over limit: skip column
    " addiu    %[i],      %[i],           -1             \n\t"
    "sll       %[temp4],  %[temp8],       1              \n\t"
    "shll_s.w  %[temp5],  %[temp6],       24             \n\t"  // clamp(p1 - q1)
    "addu      %[temp3],  %[temp4],       %[temp8]       \n\t"  // 3*(q0 - p0)
    "sra       %[temp5],  %[temp5],       24             \n\t"
    "addu      %[temp3],  %[temp3],       %[temp5]       \n\t"  // a = 3*(q0-p0) + clamp(p1-q1)
    "addiu     %[temp7],  %[temp3],       3              \n\t"
    "sra       %[temp7],  %[temp7],       3              \n\t"  // (a + 3) >> 3
    "shra_r.w  %[temp8],  %[temp3],       3              \n\t"  // (a + 4) >> 3
    "shll_s.w  %[temp0],  %[temp7],       27             \n\t"  // saturate to 5 bits
    "shll_s.w  %[temp4],  %[temp8],       27             \n\t"
    "sra       %[temp0],  %[temp0],       27             \n\t"
    "sra       %[temp4],  %[temp4],       27             \n\t"
    "addu      %[temp7],  %[temp1],       %[temp0]       \n\t"  // new p0
    "subu      %[temp2],  %[temp2],       %[temp4]       \n\t"  // new q0
    "lbux      %[temp3],  %[temp7](%[VP8kclip1])         \n\t"  // clip to [0,255]
    "lbux      %[temp4],  %[temp2](%[VP8kclip1])         \n\t"
    "sb        %[temp3],  0(%[p1])                       \n\t"
    "sb        %[temp4],  0(%[p])                        \n\t"
  "1:                                                    \n\t"
    "addiu     %[p1],     %[p1],          1              \n\t"  // next column
    "bgtz      %[i],      0b                             \n\t"
    " addiu    %[p],      %[p],           1              \n\t"
    " .set     pop                                       \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    : "memory"
  );
}
| 544 |
// Loads one byte each from four independent (offset, row) positions of SRC:
// TEMP0 = SRC[A + A1 * BPS]
// TEMP1 = SRC[B + B1 * BPS]
// TEMP2 = SRC[C + C1 * BPS]
// TEMP3 = SRC[D + D1 * BPS]
// The offsets must be textual constants: they are stringized straight into
// the 'lbu' addressing mode (BPS via XSTR).
#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3,                               \
                     A, A1, B, B1, C, C1, D, D1, SRC)                          \
  "lbu      %[" #TEMP0 "],   " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "])  \n\t"  \
  "lbu      %[" #TEMP1 "],   " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "])  \n\t"  \
  "lbu      %[" #TEMP2 "],   " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "])  \n\t"  \
  "lbu      %[" #TEMP3 "],   " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "])  \n\t"  \

// Simple filter on a vertical (macroblock-left) edge of a 16-high block:
// for each of the 16 rows ('stride' apart), the two pixels straddling the
// edge (p0 at p[-1], q0 at p[0]) are adjusted when
// 4*|p0-q0| + |p1-q1| <= thresh2.
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  __asm__ volatile (
    ".set      push                                      \n\t"
    ".set      noreorder                                 \n\t"
    "li        %[i], 16                                  \n\t"  // 16 rows
  "0:                                                    \n\t"
    // p1 = p[-2], p0 = p[-1], q0 = p[0], q1 = p[1]
    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
    "subu      %[temp7],  %[temp1],       %[temp2]       \n\t"  // p0 - q0
    "subu      %[temp6],  %[temp0],       %[temp3]       \n\t"  // p1 - q1
    "absq_s.w  %[temp4],  %[temp7]                       \n\t"
    "absq_s.w  %[temp5],  %[temp6]                       \n\t"
    "sll       %[temp4],  %[temp4],       2              \n\t"
    "addu      %[temp5],  %[temp4],       %[temp5]       \n\t"
    "subu      %[temp5],  %[temp5],       %[thresh2]     \n\t"
    "negu      %[temp8],  %[temp7]                       \n\t"  // q0 - p0
    "bgtz      %[temp5],  1f                             \n\t"  // over limit: skip row
    " addiu    %[i],      %[i],           -1             \n\t"
    "sll       %[temp4],  %[temp8],       1              \n\t"
    "shll_s.w  %[temp5],  %[temp6],       24             \n\t"  // clamp(p1 - q1)
    "addu      %[temp3],  %[temp4],       %[temp8]       \n\t"  // 3*(q0 - p0)
    "sra       %[temp5],  %[temp5],       24             \n\t"
    "addu      %[temp3],  %[temp3],       %[temp5]       \n\t"  // a = 3*(q0-p0) + clamp(p1-q1)
    "addiu     %[temp7],  %[temp3],       3              \n\t"
    "sra       %[temp7],  %[temp7],       3              \n\t"  // (a + 3) >> 3
    "shra_r.w  %[temp8],  %[temp3],       3              \n\t"  // (a + 4) >> 3
    "shll_s.w  %[temp0],  %[temp7],       27             \n\t"  // saturate to 5 bits
    "shll_s.w  %[temp4],  %[temp8],       27             \n\t"
    "sra       %[temp0],  %[temp0],       27             \n\t"
    "sra       %[temp4],  %[temp4],       27             \n\t"
    "addu      %[temp7],  %[temp1],       %[temp0]       \n\t"  // new p0
    "subu      %[temp2],  %[temp2],       %[temp4]       \n\t"  // new q0
    "lbux      %[temp3],  %[temp7](%[VP8kclip1])         \n\t"  // clip to [0,255]
    "lbux      %[temp4],  %[temp2](%[VP8kclip1])         \n\t"
    "sb        %[temp3],  -1(%[p])                       \n\t"
    "sb        %[temp4],  0(%[p])                        \n\t"
  "1:                                                    \n\t"
    "bgtz      %[i],      0b                             \n\t"
    " addu     %[p],      %[p],           %[stride]      \n\t"  // next row
    ".set      pop                                       \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [p]"+&r"(p), [i]"=&r"(i)
    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    : "memory"
  );
}
| 606 |
| 607 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { |
| 608 int k; |
| 609 for (k = 3; k > 0; --k) { |
| 610 p += 4 * stride; |
| 611 SimpleVFilter16(p, stride, thresh); |
| 612 } |
| 613 } |
| 614 |
| 615 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { |
| 616 int k; |
| 617 for (k = 3; k > 0; --k) { |
| 618 p += 4; |
| 619 SimpleHFilter16(p, stride, thresh); |
| 620 } |
| 621 } |
| 622 |
// Stores two unaligned 4-byte words (8 bytes total):
// DST[A * BPS]     = TEMP0 (4 bytes)
// DST[B + C * BPS] = TEMP1 (4 bytes)
// A, B, C must be textual constants (stringized into the address).
#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST)                      \
  "usw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #DST "])   \n\t"   \
  "usw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
| 628 |
// VE4: vertical 4x4 intra prediction. The row above 'dst' (plus its two
// right neighbors) is smoothed with the (a + 2b + c + 2) >> 2 kernel and
// the resulting 4 bytes are replicated into all four destination rows.
static void VE4(uint8_t* dst) {    // vertical
  const uint8_t* top = dst - BPS;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile (
    "ulw             %[temp0],   -1(%[top])              \n\t"  // top[-1..2]
    "ulh             %[temp1],   3(%[top])               \n\t"  // top[3..4]
    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"  // widen bytes to halves
    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
    "packrl.ph       %[temp5],   %[temp3],   %[temp2]    \n\t"  // middle samples
    "packrl.ph       %[temp6],   %[temp4],   %[temp3]    \n\t"
    "shll.ph         %[temp5],   %[temp5],   1           \n\t"  // 2 * middle
    "shll.ph         %[temp6],   %[temp6],   1           \n\t"
    "addq.ph         %[temp2],   %[temp5],   %[temp2]    \n\t"
    "addq.ph         %[temp6],   %[temp6],   %[temp4]    \n\t"
    "addq.ph         %[temp2],   %[temp2],   %[temp3]    \n\t"
    "addq.ph         %[temp6],   %[temp6],   %[temp3]    \n\t"
    "shra_r.ph       %[temp2],   %[temp2],   2           \n\t"  // (sum + 2) >> 2
    "shra_r.ph       %[temp6],   %[temp6],   2           \n\t"
    "precr.qb.ph     %[temp4],   %[temp6],   %[temp2]    \n\t"  // repack to 4 bytes
    // replicate the smoothed row into all four destination rows.
    STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
    STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
| 658 |
// DC4: 4x4 DC intra prediction. Fills the block with the rounded average
// of the 4 pixels above and the 4 pixels to the left: (sum + 4) >> 3.
static void DC4(uint8_t* dst) {   // DC
  int temp0, temp1, temp2, temp3, temp4;
  __asm__ volatile (
    "ulw          %[temp0],   -1*" XSTR(BPS) "(%[dst]) \n\t"  // 4 top pixels
    // 4 left pixels, one per row.
    LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    "ins          %[temp1],   %[temp2],  8,  8         \n\t"  // pack left pixels
    "ins          %[temp1],   %[temp3],  16, 8         \n\t"
    "ins          %[temp1],   %[temp4],  24, 8         \n\t"
    "raddu.w.qb   %[temp0],   %[temp0]                 \n\t"  // sum of top bytes
    "raddu.w.qb   %[temp1],   %[temp1]                 \n\t"  // sum of left bytes
    "addu         %[temp0],   %[temp0],   %[temp1]     \n\t"
    "shra_r.w     %[temp0],   %[temp0],   3            \n\t"  // (sum + 4) >> 3
    "replv.qb     %[temp0],   %[temp0]                 \n\t"  // broadcast to 4 bytes
    STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 680 |
// RD4: down-right 4x4 intra prediction. Each down-right diagonal of the
// block gets one value, derived from the left column, the above-left
// corner and the top row with the (a + 2b + c + 2) >> 2 smoothing kernel.
static void RD4(uint8_t* dst) {   // Down-right
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8;
  __asm__ volatile (
    // Left column (one pixel per row) and the 4 pixels above-left.
    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    "ulw              %[temp7],   -1-" XSTR(BPS) "(%[dst])       \n\t"
    // Build pairs of vertically adjacent left pixels.
    "ins              %[temp1],   %[temp0], 16, 16               \n\t"
    "preceu.ph.qbr    %[temp5],   %[temp7]                       \n\t"
    "ins              %[temp2],   %[temp1], 16, 16               \n\t"
    "preceu.ph.qbl    %[temp4],   %[temp7]                       \n\t"
    "ins              %[temp3],   %[temp2], 16, 16               \n\t"
    "shll.ph          %[temp2],   %[temp2], 1                    \n\t"
    "addq.ph          %[temp3],   %[temp3], %[temp1]             \n\t"
    "packrl.ph        %[temp6],   %[temp5], %[temp1]             \n\t"
    "addq.ph          %[temp3],   %[temp3], %[temp2]             \n\t"
    "addq.ph          %[temp1],   %[temp1], %[temp5]             \n\t"
    "shll.ph          %[temp6],   %[temp6], 1                    \n\t"
    "addq.ph          %[temp1],   %[temp1], %[temp6]             \n\t"
    "packrl.ph        %[temp0],   %[temp4], %[temp5]             \n\t"
    "addq.ph          %[temp8],   %[temp5], %[temp4]             \n\t"
    "shra_r.ph        %[temp3],   %[temp3], 2                    \n\t"  // (sum + 2) >> 2
    "shll.ph          %[temp0],   %[temp0], 1                    \n\t"
    "shra_r.ph        %[temp1],   %[temp1], 2                    \n\t"
    "addq.ph          %[temp8],   %[temp0], %[temp8]             \n\t"
    "lbu              %[temp5],   3-" XSTR(BPS) "(%[dst])        \n\t"
    "precrq.ph.w      %[temp7],   %[temp7], %[temp7]             \n\t"
    "shra_r.ph        %[temp8],   %[temp8], 2                    \n\t"
    "ins              %[temp7],   %[temp5], 0, 8                 \n\t"
    "precr.qb.ph      %[temp2],   %[temp1], %[temp3]             \n\t"
    "raddu.w.qb       %[temp4],   %[temp7]                       \n\t"
    "precr.qb.ph      %[temp6],   %[temp8], %[temp1]             \n\t"
    "shra_r.w         %[temp4],   %[temp4], 2                    \n\t"
    // Bottom two rows, then shift bytes in ('prepend') for the top rows.
    STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
    "prepend          %[temp2],   %[temp8], 8                    \n\t"
    "prepend          %[temp6],   %[temp4], 8                    \n\t"
    STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 724 |
// Loads two unaligned 4-byte words (8 bytes total):
// TEMP0 = SRC[A * BPS]     (4 bytes)
// TEMP1 = SRC[B + C * BPS] (4 bytes)
// A, B, C must be textual constants (stringized into the address).
#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC)                       \
  "ulw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #SRC "])   \n\t"   \
  "ulw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
| 730 |
// LD4: down-left 4x4 intra prediction. Each down-left diagonal of the
// block gets one value, derived from the 8 pixels above (top[-1..4] and
// top[4..7], loaded relative to dst) with the (a + 2b + c + 2) >> 2
// smoothing kernel.
static void LD4(uint8_t* dst) {   // Down-Left
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    // temp0 = 4 bytes at dst - BPS - 1, temp1 = 4 bytes at dst + 4 - BPS.
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    "preceu.ph.qbl   %[temp2],    %[temp0]               \n\t"  // widen bytes to halves
    "preceu.ph.qbr   %[temp3],    %[temp0]               \n\t"
    "preceu.ph.qbr   %[temp4],    %[temp1]               \n\t"
    "preceu.ph.qbl   %[temp5],    %[temp1]               \n\t"
    "packrl.ph       %[temp6],    %[temp2],   %[temp3]   \n\t"  // middle samples
    "packrl.ph       %[temp7],    %[temp4],   %[temp2]   \n\t"
    "packrl.ph       %[temp8],    %[temp5],   %[temp4]   \n\t"
    "shll.ph         %[temp6],    %[temp6],   1          \n\t"  // 2 * middle
    "addq.ph         %[temp9],    %[temp2],   %[temp6]   \n\t"
    "shll.ph         %[temp7],    %[temp7],   1          \n\t"
    "addq.ph         %[temp9],    %[temp9],   %[temp3]   \n\t"
    "shll.ph         %[temp8],    %[temp8],   1          \n\t"
    "shra_r.ph       %[temp9],    %[temp9],   2          \n\t"  // (sum + 2) >> 2
    "addq.ph         %[temp3],    %[temp4],   %[temp7]   \n\t"
    "addq.ph         %[temp0],    %[temp5],   %[temp8]   \n\t"
    "addq.ph         %[temp3],    %[temp3],   %[temp2]   \n\t"
    "addq.ph         %[temp0],    %[temp0],   %[temp4]   \n\t"
    "shra_r.ph       %[temp3],    %[temp3],   2          \n\t"
    "shra_r.ph       %[temp0],    %[temp0],   2          \n\t"
    // Last diagonal value from the rightmost top pixel: (2*t + sum + 2) >> 2.
    "srl             %[temp1],    %[temp1],   24         \n\t"
    "sll             %[temp1],    %[temp1],   1          \n\t"
    "raddu.w.qb      %[temp5],    %[temp5]               \n\t"
    "precr.qb.ph     %[temp9],    %[temp3],   %[temp9]   \n\t"  // repack to bytes
    "precr.qb.ph     %[temp3],    %[temp0],   %[temp3]   \n\t"
    "addu            %[temp1],    %[temp1],   %[temp5]   \n\t"
    "shra_r.w        %[temp1],    %[temp1],   2          \n\t"
    // Rows 0 and 2, then shift bytes in ('prepend') for rows 1 and 3.
    STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
    "prepend         %[temp9],    %[temp0],   8          \n\t"
    "prepend         %[temp3],    %[temp1],   8          \n\t"
    STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 774 |
| 775 //------------------------------------------------------------------------------ |
| 776 // Chroma |
| 777 |
// DC8uv -- DC intra predictor for an 8x8 chroma block.
// Averages the 8 pixels above the block and the 8 pixels to its left
// ((sum + 8) >> 4 via the rounding shift shra_r.w 4), replicates the
// average into every byte lane, and stores 8 identical rows.
static void DC8uv(uint8_t* dst) {     // DC
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    // temp0/temp1 = the 8 top pixels; temp2..temp9 = the 8 left pixels.
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
    // raddu.w.qb sums the four bytes of each word; the addu tree then
    // reduces all 16 samples into temp0.
    "raddu.w.qb      %[temp0],   %[temp0]              \n\t"
    "raddu.w.qb      %[temp1],   %[temp1]              \n\t"
    "addu            %[temp2],   %[temp2],   %[temp3]  \n\t"
    "addu            %[temp4],   %[temp4],   %[temp5]  \n\t"
    "addu            %[temp6],   %[temp6],   %[temp7]  \n\t"
    "addu            %[temp8],   %[temp8],   %[temp9]  \n\t"
    "addu            %[temp0],   %[temp0],   %[temp1]  \n\t"
    "addu            %[temp2],   %[temp2],   %[temp4]  \n\t"
    "addu            %[temp6],   %[temp6],   %[temp8]  \n\t"
    "addu            %[temp0],   %[temp0],   %[temp2]  \n\t"
    "addu            %[temp0],   %[temp0],   %[temp6]  \n\t"
    // Rounded average of 16 samples, broadcast to all four byte lanes.
    "shra_r.w        %[temp0],   %[temp0],   4         \n\t"
    "replv.qb        %[temp0],   %[temp0]              \n\t"
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 814 |
// DC8uvNoLeft -- 8x8 chroma DC predictor when no left samples exist.
// Averages only the 8 pixels above the block ((sum + 4) >> 3 via
// shra_r.w 3), replicates the value into all byte lanes and stores 8 rows.
static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
  int temp0, temp1;
  __asm__ volatile (
    // temp0/temp1 = the 8 top pixels (two unaligned word loads).
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    "raddu.w.qb      %[temp0],   %[temp0]              \n\t"
    "raddu.w.qb      %[temp1],   %[temp1]              \n\t"
    "addu            %[temp0],   %[temp0],   %[temp1]  \n\t"
    // Rounded average of 8 samples, broadcast to every byte.
    "shra_r.w        %[temp0],   %[temp0],   3         \n\t"
    "replv.qb        %[temp0],   %[temp0]              \n\t"
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 837 |
// DC8uvNoTop -- 8x8 chroma DC predictor when no top samples exist.
// Averages only the 8 pixels to the left of the block ((sum + 4) >> 3 via
// shra_r.w 3), replicates the value into all byte lanes and stores 8 rows.
static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8;
  __asm__ volatile (
    // temp2..temp8, temp1 = the 8 left-column pixels (offset -1 per row).
    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
    // Reduction tree summing the eight samples into temp0.
    "addu            %[temp2],   %[temp2],   %[temp3]  \n\t"
    "addu            %[temp4],   %[temp4],   %[temp5]  \n\t"
    "addu            %[temp6],   %[temp6],   %[temp7]  \n\t"
    "addu            %[temp8],   %[temp8],   %[temp1]  \n\t"
    "addu            %[temp2],   %[temp2],   %[temp4]  \n\t"
    "addu            %[temp6],   %[temp6],   %[temp8]  \n\t"
    "addu            %[temp0],   %[temp6],   %[temp2]  \n\t"
    // Rounded average of 8 samples, broadcast to every byte.
    "shra_r.w        %[temp0],   %[temp0],   3         \n\t"
    "replv.qb        %[temp0],   %[temp0]              \n\t"
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 868 |
| 869 #undef LOAD_8_BYTES |
| 870 #undef STORE_8_BYTES |
| 871 #undef LOAD_4_BYTES |
| 872 |
// CLIPPING(SIZE) -- add-and-clamp kernel used by the TrueMotion predictors.
// Zero-extends SIZE bytes held in temp0 (and temp1 when SIZE == 8) to
// halfwords, adds the per-row offset in dst_1 (same value in both halfword
// lanes), then clamps every result to [0, 255]: shll_s.ph saturates to the
// signed 16-bit range and precrqu_s.qb.ph packs the high bytes with unsigned
// saturation. SIZE is resolved at assembly time by the '.if' directives, so
// only the 4- or 8-byte instruction sequence is actually emitted.
#define CLIPPING(SIZE)                                                         \
  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
".endif                                                  \n\t"                 \
  "addu.ph         %[temp2],   %[temp2],   %[dst_1]      \n\t"                 \
  "addu.ph         %[temp0],   %[temp0],   %[dst_1]      \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "addu.ph         %[temp3],   %[temp3],   %[dst_1]      \n\t"                 \
  "addu.ph         %[temp1],   %[temp1],   %[dst_1]      \n\t"                 \
".endif                                                  \n\t"                 \
  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
".endif                                                  \n\t"                 \
  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"                 \
".endif                                                  \n\t"
| 896 |
| 897 |
// CLIP_8B_TO_DST(DST, TOP, SIZE) -- one row of TrueMotion prediction:
//   DST[x] = clamp(TOP[x] + DST[-1] - top[-1]),  x = 0 .. SIZE-1
// dst_1 is the row's left pixel replicated into both halfword lanes;
// 'subu.ph' folds in -top_1 before CLIPPING adds the combined offset.
// NOTE: top_1 (top[-1] replicated the same way) is not a macro parameter --
// it is captured from the enclosing CLIP_TO_DST expansion's scope.
// The assembler-level '.if' directives pick the 4-, 8- or 16-byte path.
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                                    \
  int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];                              \
  int temp0, temp1, temp2, temp3;                                              \
  __asm__ volatile (                                                           \
  ".if " #SIZE " < 8                                     \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "subu.ph         %[dst_1],   %[dst_1],   %[top_1]    \n\t"                 \
    CLIPPING(4)                                                                \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
  ".else                                                 \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "ulw             %[temp1],   4(%[top])               \n\t"                 \
    "subu.ph         %[dst_1],   %[dst_1],   %[top_1]    \n\t"                 \
    CLIPPING(8)                                                                \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
    "usw             %[temp1],   4(%[dst])               \n\t"                 \
  ".if " #SIZE " == 16                                   \n\t"                 \
    "ulw             %[temp0],   8(%[top])               \n\t"                 \
    "ulw             %[temp1],   12(%[top])              \n\t"                 \
    CLIPPING(8)                                                                \
    "usw             %[temp0],   8(%[dst])               \n\t"                 \
    "usw             %[temp1],   12(%[dst])              \n\t"                 \
  ".endif                                                \n\t"                 \
  ".endif                                                \n\t"                 \
  : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),             \
    [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                   \
  : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))                        \
  : "memory"                                                                   \
  );                                                                           \
} while (0)
| 928 |
// CLIP_TO_DST(DST, SIZE) -- full TrueMotion block: applies CLIP_8B_TO_DST
// to each of the SIZE rows, advancing DST by one stride (BPS) per row.
// top_1 (top[-1] replicated into both halfword lanes) is computed once here
// and used by every CLIP_8B_TO_DST expansion below.
#define CLIP_TO_DST(DST, SIZE) do {                                            \
  int y;                                                                       \
  const uint8_t* top = (DST) - BPS;                                            \
  const int top_1 = ((int)top[-1] << 16) + top[-1];                            \
  for (y = 0; y < (SIZE); ++y) {                                               \
    CLIP_8B_TO_DST((DST), top, (SIZE));                                        \
    (DST) += BPS;                                                              \
  }                                                                            \
} while (0)
| 938 |
// TRUE_MOTION(DST, SIZE) -- generates a TrueMotion##SIZE() predictor whose
// whole body is a CLIP_TO_DST() expansion over a SIZE x SIZE block.
#define TRUE_MOTION(DST, SIZE)                                                 \
static void TrueMotion##SIZE(uint8_t* (DST)) {                                 \
  CLIP_TO_DST((DST), (SIZE));                                                  \
}

// Instantiate the 4x4, 8x8 and 16x16 TrueMotion predictors.
TRUE_MOTION(dst, 4)
TRUE_MOTION(dst, 8)
TRUE_MOTION(dst, 16)
| 947 |
| 948 #undef TRUE_MOTION |
| 949 #undef CLIP_TO_DST |
| 950 #undef CLIP_8B_TO_DST |
| 951 #undef CLIPPING |
| 952 |
| 953 //------------------------------------------------------------------------------ |
| 954 // Entry point |
| 955 |
extern void VP8DspInitMIPSdspR2(void);

// Installs the MIPS DSP r2 implementations into the shared VP8 decoder
// function-pointer tables. Called once from the dispatch layer when this
// CPU extension is available.
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
  // Inverse transforms.
  VP8TransformDC = TransformDC;
  VP8TransformAC3 = TransformAC3;
  VP8Transform = TransformTwo;

  // Loop filters: normal (edge + inner) and simple variants.
  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
  VP8VFilter8 = VFilter8;
  VP8HFilter8 = HFilter8;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16i = HFilter16i;
  VP8VFilter8i = VFilter8i;
  VP8HFilter8i = HFilter8i;
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;

  // Intra predictors. Only a subset of table slots is overridden here;
  // NOTE(review): unset indices presumably keep the generic implementations
  // installed earlier -- confirm against the table setup in dsp.c.
  VP8PredLuma4[0] = DC4;
  VP8PredLuma4[1] = TrueMotion4;
  VP8PredLuma4[2] = VE4;
  VP8PredLuma4[4] = RD4;
  VP8PredLuma4[6] = LD4;

  VP8PredChroma8[0] = DC8uv;
  VP8PredChroma8[1] = TrueMotion8;
  VP8PredChroma8[4] = DC8uvNoTop;
  VP8PredChroma8[5] = DC8uvNoLeft;

  VP8PredLuma16[1] = TrueMotion16;
}
| 989 |
| 990 #else // !WEBP_USE_MIPS_DSP_R2 |
| 991 |
| 992 WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2) |
| 993 |
| 994 #endif // WEBP_USE_MIPS_DSP_R2 |
OLD | NEW |