third_party/libwebp/dsp/upsampling_msa.c - Issue 2651883004: libwebp-0.6.0-rc1

Side by Side Diff: third_party/libwebp/dsp/upsampling_msa.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 // Copyright 2016 Google Inc. All Rights Reserved.

	2 //

	3 // Use of this source code is governed by a BSD-style license

	4 // that can be found in the COPYING file in the root of the source

	5 // tree. An additional intellectual property rights grant can be found

	6 // in the file PATENTS. All contributing project authors may

	7 // be found in the AUTHORS file in the root of the source tree.

	8 // -----------------------------------------------------------------------------

	9 //

	10 // MSA version of YUV to RGB upsampling functions.

	11 //

	12 // Author: Prashant Patil (prashant.patil@imgtec.com)

	13

	14 #include <string.h>

	15 #include "./dsp.h"

	16

	17 #if defined(WEBP_USE_MSA)

	18

	19 #include "./msa_macro.h"

	20 #include "./yuv.h"

	21

	22 #ifdef FANCY_UPSAMPLING

	23

	// Widens the bytes of the right half of 'in' to 32-bit lanes by interleaving
	// with 'zero', first at byte and then at halfword granularity. The result is
	// written to out0 and out1 (v4u32). A vector named 'zero' must be in scope at
	// the expansion site.
	24 #define ILVR_UW2(in, out0, out1) do { \

	25 const v8i16 t0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in); \

	26 out0 = (v4u32)__msa_ilvr_h((v8i16)zero, t0); \

	27 out1 = (v4u32)__msa_ilvl_h((v8i16)zero, t0); \

	28 } while (0)

	29

	// 16-byte counterpart of ILVR_UW2: interleaves both halves of 'in' with
	// 'zero' at byte and then halfword granularity, writing four vectors
	// out0..out3. Relies on the ILVRL_B2_UB / ILVRL_H2_UW helpers (presumably from
	// msa_macro.h) and on a vector named 'zero' at the expansion site.
	30 #define ILVRL_UW4(in, out0, out1, out2, out3) do { \

	31 v16u8 t0, t1; \

	32 ILVRL_B2_UB(zero, in, t0, t1); \

	33 ILVRL_H2_UW(zero, t0, out0, out1); \

	34 ILVRL_H2_UW(zero, t1, out2, out3); \

	35 } while (0)

	36

	// Multiplies the four 32-bit-lane vectors in0..in3 by (cnst * 256) and packs
	// the odd halfwords of the products into out0 (from in0/in1) and out1 (from
	// in2/in3). Keeping the upper 16 bits of in * cnst * 256 amounts to
	// (in * cnst) >> 8, which presumably matches the scalar MultHi() helper used
	// by the YuvTo*() functions in this file -- confirm against yuv.h.
	37 #define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do { \

	38 const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \

	39 v4u32 temp0, temp1, temp2, temp3; \

	40 MUL4(in0, const0, in1, const0, in2, const0, in3, const0, \

	41 temp0, temp1, temp2, temp3); \

	42 PCKOD_H2_UH(temp1, temp0, temp3, temp2, out0, out1); \

	43 } while (0)

	44

	// 8-value counterpart of MULTHI_16: multiplies in0 and in1 by (cnst * 256)
	// and packs the odd halfwords of both products into the single vector out0.
	45 #define MULTHI_8(in0, in1, cnst, out0) do { \

	46 const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \

	47 v4u32 temp0, temp1; \

	48 MUL2(in0, const0, in1, const0, temp0, temp1); \

	49 out0 = (v8u16)__msa_pckod_h((v8i16)temp1, (v8i16)temp0); \

	50 } while (0)

	51

	// Red channel for 16 pixels: (y + v - 14234) >> 6 using saturating signed
	// 16-bit adds/subtracts, clipped with CLIP_SH2_0_255 and packed to bytes in
	// 'dst'. y0/y1 and v0/v1 are the already-scaled luma and V terms; the constant
	// is the same one used by the scalar YuvToRgb() in this file.
	52 #define CALC_R16(y0, y1, v0, v1, dst) do { \

	53 const v8i16 const_a = (v8i16)__msa_fill_h(14234); \

	54 const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \

	55 const v8i16 a1 = __msa_adds_s_h((v8i16)y1, (v8i16)v1); \

	56 v8i16 b0 = __msa_subs_s_h(a0, const_a); \

	57 v8i16 b1 = __msa_subs_s_h(a1, const_a); \

	58 SRAI_H2_SH(b0, b1, 6); \

	59 CLIP_SH2_0_255(b0, b1); \

	60 dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \

	61 } while (0)

	62

	// Red channel for 8 pixels: same arithmetic as CALC_R16 on a single vector.
	// b0 is packed with itself, so both 8-byte halves of 'dst' receive the same
	// packed bytes.
	63 #define CALC_R8(y0, v0, dst) do { \

	64 const v8i16 const_a = (v8i16)__msa_fill_h(14234); \

	65 const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \

	66 v8i16 b0 = __msa_subs_s_h(a0, const_a); \

	67 b0 = SRAI_H(b0, 6); \

	68 CLIP_SH_0_255(b0); \

	69 dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \

	70 } while (0)

	71

	// Green channel for 16 pixels: (y - u - v + 8708) >> 6 using saturating
	// signed 16-bit arithmetic, clipped with CLIP_SH2_0_255 and packed to bytes in
	// 'dst'. u0/u1 and v0/v1 are the already-scaled chroma terms; the constant is
	// the same one used by the scalar YuvToRgb() in this file.
	72 #define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do { \

	73 const v8i16 const_a = (v8i16)__msa_fill_h(8708); \

	74 v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \

	75 v8i16 a1 = __msa_subs_s_h((v8i16)y1, (v8i16)u1); \

	76 const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \

	77 const v8i16 b1 = __msa_subs_s_h(a1, (v8i16)v1); \

	78 a0 = __msa_adds_s_h(b0, const_a); \

	79 a1 = __msa_adds_s_h(b1, const_a); \

	80 SRAI_H2_SH(a0, a1, 6); \

	81 CLIP_SH2_0_255(a0, a1); \

	82 dst = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0); \

	83 } while (0)

	84

	// Green channel for 8 pixels: same arithmetic as CALC_G16 on a single vector.
	// a0 is packed with itself, so both halves of 'dst' hold the same bytes.
	85 #define CALC_G8(y0, u0, v0, dst) do { \

	86 const v8i16 const_a = (v8i16)__msa_fill_h(8708); \

	87 v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \

	88 const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \

	89 a0 = __msa_adds_s_h(b0, const_a); \

	90 a0 = SRAI_H(a0, 6); \

	91 CLIP_SH_0_255(a0); \

	92 dst = (v16u8)__msa_pckev_b((v16i8)a0, (v16i8)a0); \

	93 } while (0)

	94

	// Blue channel for 16 pixels: (y + u - 17685) >> 6. Unlike CALC_R16/CALC_G16
	// this uses the unsigned saturating add/subtract intrinsics and the
	// unsigned shift/clip helpers (SRAI_H2_UH, CLIP_UH2_0_255) before packing the
	// result to bytes in 'dst'.
	95 #define CALC_B16(y0, y1, u0, u1, dst) do { \

	96 const v8u16 const_a = (v8u16)__msa_fill_h(17685); \

	97 const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \

	98 const v8u16 a1 = __msa_adds_u_h((v8u16)y1, u1); \

	99 v8u16 b0 = __msa_subs_u_h(a0, const_a); \

	100 v8u16 b1 = __msa_subs_u_h(a1, const_a); \

	101 SRAI_H2_UH(b0, b1, 6); \

	102 CLIP_UH2_0_255(b0, b1); \

	103 dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \

	104 } while (0)

	105

	// Blue channel for 8 pixels: same unsigned arithmetic as CALC_B16 on a single
	// vector. b0 is packed with itself, so both halves of 'dst' hold the same
	// bytes.
	106 #define CALC_B8(y0, u0, dst) do { \

	107 const v8u16 const_a = (v8u16)__msa_fill_h(17685); \

	108 const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \

	109 v8u16 b0 = __msa_subs_u_h(a0, const_a); \

	110 b0 = SRAI_H(b0, 6); \

	111 CLIP_UH_0_255(b0); \

	112 dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \

	113 } while (0)

	114

	// Converts 16 pixels: loads one vector each from y, u and v and produces the
	// byte vectors R, G and B. The multipliers (19077, 26149, 13320, 6419, 33050)
	// are the same ones the scalar YuvToRgb() in this file passes to MultHi().
	// Statement order matters: p0..p3 are reused, so the second V multiply
	// (13320) must run before p0..p3 are overwritten with the widened U values,
	// and both U multiplies run on that same widened U.
	115 #define CALC_RGB16(y, u, v, R, G, B) do { \

	116 const v16u8 zero = { 0 }; \

	117 v8u16 y0, y1, u0, u1, v0, v1; \

	118 v4u32 p0, p1, p2, p3; \

	119 const v16u8 in_y = LD_UB(y); \

	120 const v16u8 in_u = LD_UB(u); \

	121 const v16u8 in_v = LD_UB(v); \

	122 ILVRL_UW4(in_y, p0, p1, p2, p3); \

	123 MULTHI_16(p0, p1, p2, p3, 19077, y0, y1); \

	124 ILVRL_UW4(in_v, p0, p1, p2, p3); \

	125 MULTHI_16(p0, p1, p2, p3, 26149, v0, v1); \

	126 CALC_R16(y0, y1, v0, v1, R); \

	127 MULTHI_16(p0, p1, p2, p3, 13320, v0, v1); \

	128 ILVRL_UW4(in_u, p0, p1, p2, p3); \

	129 MULTHI_16(p0, p1, p2, p3, 6419, u0, u1); \

	130 CALC_G16(y0, y1, u0, u1, v0, v1, G); \

	131 MULTHI_16(p0, p1, p2, p3, 33050, u0, u1); \

	132 CALC_B16(y0, y1, u0, u1, B); \

	133 } while (0)

	134

	// 8-pixel counterpart of CALC_RGB16, built from ILVR_UW2 / MULTHI_8 /
	// CALC_{R,G,B}8. It still issues LD_UB on y, u and v (a v16u8 load, so
	// presumably 16 bytes are read from each pointer -- confirm), but only the
	// half selected by ILVR_UW2 is converted. p0/p1 are reused in the same
	// order-dependent way as in CALC_RGB16.
	135 #define CALC_RGB8(y, u, v, R, G, B) do { \

	136 const v16u8 zero = { 0 }; \

	137 v8u16 y0, u0, v0; \

	138 v4u32 p0, p1; \

	139 const v16u8 in_y = LD_UB(y); \

	140 const v16u8 in_u = LD_UB(u); \

	141 const v16u8 in_v = LD_UB(v); \

	142 ILVR_UW2(in_y, p0, p1); \

	143 MULTHI_8(p0, p1, 19077, y0); \

	144 ILVR_UW2(in_v, p0, p1); \

	145 MULTHI_8(p0, p1, 26149, v0); \

	146 CALC_R8(y0, v0, R); \

	147 MULTHI_8(p0, p1, 13320, v0); \

	148 ILVR_UW2(in_u, p0, p1); \

	149 MULTHI_8(p0, p1, 6419, u0); \

	150 CALC_G8(y0, u0, v0, G); \

	151 MULTHI_8(p0, p1, 33050, u0); \

	152 CALC_B8(y0, u0, B); \

	153 } while (0)

	154

	// Interleaves three 16-byte planes (a0, a1, a2) into 16 packed 3-byte pixels
	// and stores the 48 resulting bytes at dst, dst + 16 and dst + 32. a0/a1 are
	// byte-interleaved first; the three shuffle masks then merge in the bytes of
	// a2 (mask indices 16..31 select from a2 in the mask tables below).
	155 #define STORE16_3(a0, a1, a2, dst) do { \

	156 const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \

	157 8, 9, 20, 10 }; \

	158 const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, \

	159 8, 25, 9, 10 }; \

	160 const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7, \

	161 30, 8, 9, 31 }; \

	162 v16u8 out0, out1, out2, tmp0, tmp1, tmp2; \

	163 ILVRL_B2_UB(a1, a0, tmp0, tmp1); \

	164 out0 = VSHF_UB(tmp0, a2, mask0); \

	165 tmp2 = SLDI_UB(tmp1, tmp0, 11); \

	166 out1 = VSHF_UB(tmp2, a2, mask1); \

	167 tmp2 = SLDI_UB(tmp1, tmp1, 6); \

	168 out2 = VSHF_UB(tmp2, a2, mask2); \

	169 ST_UB(out0, dst + 0); \

	170 ST_UB(out1, dst + 16); \

	171 ST_UB(out2, dst + 32); \

	172 } while (0)

	173

	// 8-pixel counterpart of STORE16_3: writes 24 bytes, as one vector store at
	// dst followed by one 64-bit store (element 0 of out1, via SD) at dst + 16.
	// The 255 entries in mask1 fill the upper half of out1, which is never stored.
	174 #define STORE8_3(a0, a1, a2, dst) do { \

	175 int64_t out_m; \

	176 const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \

	177 8, 9, 20, 10 }; \

	178 const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23, \

	179 255, 255, 255, 255, 255, 255, 255, 255 }; \

	180 const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \

	181 v16u8 out0, out1; \

	182 VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1); \

	183 ST_UB(out0, dst); \

	184 out_m = __msa_copy_s_d((v2i64)out1, 0); \

	185 SD(out_m, dst + 16); \

	186 } while (0)

	187

	// Interleaves four 16-byte planes (a0..a3) into 16 packed 4-byte pixels via
	// a byte interleave followed by a halfword interleave, and stores the 64
	// resulting bytes at dst, dst + 16, dst + 32 and dst + 48.
	188 #define STORE16_4(a0, a1, a2, a3, dst) do { \

	189 v16u8 tmp0, tmp1, tmp2, tmp3; \

	190 v16u8 out0, out1, out2, out3; \

	191 ILVRL_B2_UB(a1, a0, tmp0, tmp1); \

	192 ILVRL_B2_UB(a3, a2, tmp2, tmp3); \

	193 ILVRL_H2_UB(tmp2, tmp0, out0, out1); \

	194 ILVRL_H2_UB(tmp3, tmp1, out2, out3); \

	195 ST_UB(out0, dst + 0); \

	196 ST_UB(out1, dst + 16); \

	197 ST_UB(out2, dst + 32); \

	198 ST_UB(out3, dst + 48); \

	199 } while (0)

	200

	// 8-pixel counterpart of STORE16_4: interleaves only the right halves of
	// a0..a3 and stores 32 bytes at dst and dst + 16.
	201 #define STORE8_4(a0, a1, a2, a3, dst) do { \

	202 v16u8 tmp0, tmp1, tmp2, tmp3; \

	203 ILVR_B2_UB(a1, a0, a3, a2, tmp0, tmp1); \

	204 ILVRL_H2_UB(tmp1, tmp0, tmp2, tmp3); \

	205 ST_UB(tmp2, dst + 0); \

	206 ST_UB(tmp3, dst + 16); \

	207 } while (0)

	208

	// Byte-interleaves two 16-byte planes (a0, a1) into 16 packed 2-byte pixels
	// and stores the 32 resulting bytes at dst and dst + 16.
	209 #define STORE2_16(a0, a1, dst) do { \

	210 v16u8 out0, out1; \

	211 ILVRL_B2_UB(a1, a0, out0, out1); \

	212 ST_UB(out0, dst + 0); \

	213 ST_UB(out1, dst + 16); \

	214 } while (0)

	215

	// 8-pixel counterpart of STORE2_16: interleaves the right halves of a0 and
	// a1 and stores the 16 resulting bytes at dst.
	216 #define STORE2_8(a0, a1, dst) do { \

	217 const v16u8 out0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \

	218 ST_UB(out0, dst); \

	219 } while (0)

	220

	// Converts N (16 or 8) pixels to RGBA4444 and stores them at 'dst'.
	// RG = (R & 0xf0) | (G >> 4) and BA = (B & 0xf0) | 0x0f, the vector form of
	// the scalar YuvToRgba4444() in this file. out0/out1 select the byte order
	// passed to STORE2_##N; callers swap them under WEBP_SWAP_16BIT_CSP.
	// The expansion site must declare R, G, B, RG, BA, tmp0 and tmp1.
	// The '|' operator replaces the markup-escaped form found in the listing.
	// NOTE(review): SRAI_B is named like an arithmetic shift. If it sign-extends
	// on this toolchain, G >= 128 would set high bits in tmp1 and corrupt the R
	// nibble after the OR -- confirm its definition in msa_macro.h.
	221 #define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do { \

	222 CALC_RGB##N(y, u, v, R, G, B); \

	223 tmp0 = ANDI_B(R, 0xf0); \

	224 tmp1 = SRAI_B(G, 4); \

	225 RG = tmp0 | tmp1; \

	226 tmp0 = ANDI_B(B, 0xf0); \

	227 BA = ORI_B(tmp0, 0x0f); \

	228 STORE2_##N(out0, out1, dst); \

	229 } while (0)

	230

	// Converts N (16 or 8) pixels to RGB565 and stores them at 'dst'.
	// RG = (R & 0xf8) | (G >> 5) and GB = ((G << 3) & 0xe0) | (B >> 3), the
	// vector form of the scalar YuvToRgb565() in this file. out0/out1 select the
	// byte order passed to STORE2_##N; callers swap them under
	// WEBP_SWAP_16BIT_CSP.
	// The expansion site must declare R, G, B, RG, GB, tmp0 and tmp1.
	// The '|' operator replaces the markup-escaped form found in the listing.
	// NOTE(review): SRAI_B is named like an arithmetic shift. If it sign-extends
	// on this toolchain, G or B values >= 128 would set high bits that leak into
	// the neighbouring field after the OR -- confirm its definition in
	// msa_macro.h.
	231 #define CALC_RGB565(y, u, v, out0, out1, N, dst) do { \

	232 CALC_RGB##N(y, u, v, R, G, B); \

	233 tmp0 = ANDI_B(R, 0xf8); \

	234 tmp1 = SRAI_B(G, 5); \

	235 RG = tmp0 | tmp1; \

	236 tmp0 = SLLI_B(G, 3); \

	237 tmp1 = ANDI_B(tmp0, 0xe0); \

	238 tmp0 = SRAI_B(B, 3); \

	239 GB = tmp0 | tmp1; \

	240 STORE2_##N(out0, out1, dst); \

	241 } while (0)

	242

	// Clamps v to the range [0, 255].
	243 static WEBP_INLINE int Clip8(int v) {

	244 return v < 0 ? 0 : v > 255 ? 255 : v;

	245 }

	246

	// Scalar conversion of one pixel. Writes three bytes to rgb[0..2] in R, G, B
	// order. Each channel is a fixed-point sum of MultHi() terms (MultHi is
	// declared elsewhere, presumably yuv.h) that is shifted right by 6 and
	// clamped to [0, 255].
	247 static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {

	248 const int y1 = MultHi(y, 19077);

	249 const int r1 = y1 + MultHi(v, 26149) - 14234;

	250 const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;

	251 const int b1 = y1 + MultHi(u, 33050) - 17685;

	252 rgb[0] = Clip8(r1 >> 6);

	253 rgb[1] = Clip8(g1 >> 6);

	254 rgb[2] = Clip8(b1 >> 6);

	255 }

	256

	// Scalar conversion of one pixel. Same arithmetic as YuvToRgb(), but the three
	// bytes are written to bgr[0..2] in B, G, R order.
	257 static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {

	258 const int y1 = MultHi(y, 19077);

	259 const int r1 = y1 + MultHi(v, 26149) - 14234;

	260 const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;

	261 const int b1 = y1 + MultHi(u, 33050) - 17685;

	262 bgr[0] = Clip8(b1 >> 6);

	263 bgr[1] = Clip8(g1 >> 6);

	264 bgr[2] = Clip8(r1 >> 6);

	265 }

	266

	// Scalar conversion of one pixel to RGB565. Writes two bytes to rgb[0..1]:
	// rg holds the top 5 bits of R and the top 3 bits of G, gb holds the next 3
	// bits of G and the top 5 bits of B. The two bytes are written in swapped
	// order when WEBP_SWAP_16BIT_CSP is defined.
	// The '|' operator replaces the markup-escaped form found in the listing.
	267 static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {

	268 const int y1 = MultHi(y, 19077);

	269 const int r1 = y1 + MultHi(v, 26149) - 14234;

	270 const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;

	271 const int b1 = y1 + MultHi(u, 33050) - 17685;

	272 const int r = Clip8(r1 >> 6);

	273 const int g = Clip8(g1 >> 6);

	274 const int b = Clip8(b1 >> 6);

	275 const int rg = (r & 0xf8) | (g >> 5);

	276 const int gb = ((g << 3) & 0xe0) | (b >> 3);

	277 #ifdef WEBP_SWAP_16BIT_CSP

	278 rgb[0] = gb;

	279 rgb[1] = rg;

	280 #else

	281 rgb[0] = rg;

	282 rgb[1] = gb;

	283 #endif

	284 }

	285

	// Scalar conversion of one pixel to RGBA4444. Writes two bytes to
	// argb[0..1]: rg holds the top nibbles of R and G, ba holds the top nibble of
	// B with the low (alpha) nibble forced to 0xf. The two bytes are written in
	// swapped order when WEBP_SWAP_16BIT_CSP is defined.
	// The '|' operator replaces the markup-escaped form found in the listing.
	286 static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {

	287 const int y1 = MultHi(y, 19077);

	288 const int r1 = y1 + MultHi(v, 26149) - 14234;

	289 const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;

	290 const int b1 = y1 + MultHi(u, 33050) - 17685;

	291 const int r = Clip8(r1 >> 6);

	292 const int g = Clip8(g1 >> 6);

	293 const int b = Clip8(b1 >> 6);

	294 const int rg = (r & 0xf0) | (g >> 4);

	295 const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits

	296 #ifdef WEBP_SWAP_16BIT_CSP

	297 argb[0] = ba;

	298 argb[1] = rg;

	299 #else

	300 argb[0] = rg;

	301 argb[1] = ba;

	302 #endif

	303 }

	304

	// Scalar conversion of one pixel to four bytes: argb[0] is set to 0xff and
	// argb[1..3] receive R, G, B from YuvToRgb().
	305 static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {

	306 argb[0] = 0xff;

	307 YuvToRgb(y, u, v, argb + 1);

	308 }

	309

	// Scalar conversion of one pixel to four bytes: bgra[0..2] receive B, G, R
	// from YuvToBgr() and bgra[3] is set to 0xff.
	310 static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {

	311 YuvToBgr(y, u, v, bgra);

	312 bgra[3] = 0xff;

	313 }

	314

	// Scalar conversion of one pixel to four bytes: rgba[0..2] receive R, G, B
	// from YuvToRgb() and rgba[3] is set to 0xff.
	315 static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {

	316 YuvToRgb(y, u, v, rgba);

	317 rgba[3] = 0xff;

	318 }

	319

	// Converts 'length' pixels to packed 3-byte R,G,B in 'dst'. y, u and v each
	// supply one byte per pixel (u and v advance in step with y).
	// Full groups of 16 pixels are converted and stored directly. A remainder of
	// 9..15 pixels goes through the 16-wide path and a remainder of 1..8 through
	// the 8-wide path, using 'temp' both as a zero-padded copy of the luma and as
	// the output staging buffer so that only length * 3 bytes reach 'dst'.
	// NOTE(review): in the remainder paths only y is copied into 'temp'; u and v
	// are loaded straight from the caller's pointers, so callers presumably
	// guarantee a full vector is readable there -- confirm.
	320 static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,

	321 const uint8_t* v, uint8_t* dst, int length) {

	322 v16u8 R, G, B;

	323 while (length >= 16) {

	324 CALC_RGB16(y, u, v, R, G, B);

	325 STORE16_3(R, G, B, dst);

	326 y += 16;

	327 u += 16;

	328 v += 16;

	329 dst += 16 * 3;

	330 length -= 16;

	331 }

	332 if (length > 8) {

	333 uint8_t temp[3 * 16] = { 0 };

	334 memcpy(temp, y, length * sizeof(*temp));

	335 CALC_RGB16(temp, u, v, R, G, B);

	336 STORE16_3(R, G, B, temp);

	337 memcpy(dst, temp, length * 3 * sizeof(*dst));

	338 } else if (length > 0) {

	339 uint8_t temp[3 * 8] = { 0 };

	340 memcpy(temp, y, length * sizeof(*temp));

	341 CALC_RGB8(temp, u, v, R, G, B);

	342 STORE8_3(R, G, B, temp);

	343 memcpy(dst, temp, length * 3 * sizeof(*dst));

	344 }

	345 }

	346

	// Converts 'length' pixels to packed 3-byte B,G,R in 'dst'. Same structure as
	// YuvToRgbLine() with the plane order passed to the store macros reversed:
	// full groups of 16 are stored directly; a remainder of 9..15 uses the
	// 16-wide path and 1..8 the 8-wide path through the zero-padded 'temp'
	// buffer, from which only length * 3 bytes are copied out.
	// NOTE(review): u and v are not copied to 'temp' in the remainder paths;
	// callers presumably guarantee a full vector is readable there -- confirm.
	347 static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,

	348 const uint8_t* v, uint8_t* dst, int length) {

	349 v16u8 R, G, B;

	350 while (length >= 16) {

	351 CALC_RGB16(y, u, v, R, G, B);

	352 STORE16_3(B, G, R, dst);

	353 y += 16;

	354 u += 16;

	355 v += 16;

	356 dst += 16 * 3;

	357 length -= 16;

	358 }

	359 if (length > 8) {

	360 uint8_t temp[3 * 16] = { 0 };

	361 memcpy(temp, y, length * sizeof(*temp));

	362 CALC_RGB16(temp, u, v, R, G, B);

	363 STORE16_3(B, G, R, temp);

	364 memcpy(dst, temp, length * 3 * sizeof(*dst));

	365 } else if (length > 0) {

	366 uint8_t temp[3 * 8] = { 0 };

	367 memcpy(temp, y, length * sizeof(*temp));

	368 CALC_RGB8(temp, u, v, R, G, B);

	369 STORE8_3(B, G, R, temp);

	370 memcpy(dst, temp, length * 3 * sizeof(*dst));

	371 }

	372 }

	373

	// Converts 'length' pixels to packed 4-byte R,G,B,A in 'dst', where the A
	// plane is the constant vector built with __msa_ldi_b(0xff).
	// Full groups of 16 are stored directly; a remainder of 9..15 uses the
	// 16-wide path and 1..8 the 8-wide path through the zero-padded 'temp'
	// buffer, from which only length * 4 bytes are copied out.
	// NOTE(review): u and v are not copied to 'temp' in the remainder paths;
	// callers presumably guarantee a full vector is readable there -- confirm.
	374 static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,

	375 const uint8_t* v, uint8_t* dst, int length) {

	376 v16u8 R, G, B;

	377 const v16u8 A = (v16u8)__msa_ldi_b(0xff);

	378 while (length >= 16) {

	379 CALC_RGB16(y, u, v, R, G, B);

	380 STORE16_4(R, G, B, A, dst);

	381 y += 16;

	382 u += 16;

	383 v += 16;

	384 dst += 16 * 4;

	385 length -= 16;

	386 }

	387 if (length > 8) {

	388 uint8_t temp[4 * 16] = { 0 };

	389 memcpy(temp, y, length * sizeof(*temp));

	390 CALC_RGB16(&temp[0], u, v, R, G, B);

	391 STORE16_4(R, G, B, A, temp);

	392 memcpy(dst, temp, length * 4 * sizeof(*dst));

	393 } else if (length > 0) {

	394 uint8_t temp[4 * 8] = { 0 };

	395 memcpy(temp, y, length * sizeof(*temp));

	396 CALC_RGB8(temp, u, v, R, G, B);

	397 STORE8_4(R, G, B, A, temp);

	398 memcpy(dst, temp, length * 4 * sizeof(*dst));

	399 }

	400 }

	401

	// Converts 'length' pixels to packed 4-byte B,G,R,A in 'dst', where the A
	// plane is the constant vector built with __msa_ldi_b(0xff).
	// Same structure as YuvToRgbaLine(): groups of 16 stored directly, then a
	// 16-wide (9..15 left) or 8-wide (1..8 left) pass through the zero-padded
	// 'temp' buffer, from which only length * 4 bytes are copied out.
	// NOTE(review): u and v are not copied to 'temp' in the remainder paths;
	// callers presumably guarantee a full vector is readable there -- confirm.
	402 static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,

	403 const uint8_t* v, uint8_t* dst, int length) {

	404 v16u8 R, G, B;

	405 const v16u8 A = (v16u8)__msa_ldi_b(0xff);

	406 while (length >= 16) {

	407 CALC_RGB16(y, u, v, R, G, B);

	408 STORE16_4(B, G, R, A, dst);

	409 y += 16;

	410 u += 16;

	411 v += 16;

	412 dst += 16 * 4;

	413 length -= 16;

	414 }

	415 if (length > 8) {

	416 uint8_t temp[4 * 16] = { 0 };

	417 memcpy(temp, y, length * sizeof(*temp));

	418 CALC_RGB16(temp, u, v, R, G, B);

	419 STORE16_4(B, G, R, A, temp);

	420 memcpy(dst, temp, length * 4 * sizeof(*dst));

	421 } else if (length > 0) {

	422 uint8_t temp[4 * 8] = { 0 };

	423 memcpy(temp, y, length * sizeof(*temp));

	424 CALC_RGB8(temp, u, v, R, G, B);

	425 STORE8_4(B, G, R, A, temp);

	426 memcpy(dst, temp, length * 4 * sizeof(*dst));

	427 }

	428 }

	429

	// Converts 'length' pixels to packed 4-byte A,R,G,B in 'dst', where the A
	// plane is the constant vector built with __msa_ldi_b(0xff).
	// Same structure as YuvToRgbaLine(): groups of 16 stored directly, then a
	// 16-wide (9..15 left) or 8-wide (1..8 left) pass through the zero-padded
	// 'temp' buffer, from which only length * 4 bytes are copied out.
	// NOTE(review): u and v are not copied to 'temp' in the remainder paths;
	// callers presumably guarantee a full vector is readable there -- confirm.
	430 static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,

	431 const uint8_t* v, uint8_t* dst, int length) {

	432 v16u8 R, G, B;

	433 const v16u8 A = (v16u8)__msa_ldi_b(0xff);

	434 while (length >= 16) {

	435 CALC_RGB16(y, u, v, R, G, B);

	436 STORE16_4(A, R, G, B, dst);

	437 y += 16;

	438 u += 16;

	439 v += 16;

	440 dst += 16 * 4;

	441 length -= 16;

	442 }

	443 if (length > 8) {

	444 uint8_t temp[4 * 16] = { 0 };

	445 memcpy(temp, y, length * sizeof(*temp));

	446 CALC_RGB16(temp, u, v, R, G, B);

	447 STORE16_4(A, R, G, B, temp);

	448 memcpy(dst, temp, length * 4 * sizeof(*dst));

	449 } else if (length > 0) {

	450 uint8_t temp[4 * 8] = { 0 };

	451 memcpy(temp, y, length * sizeof(*temp));

	452 CALC_RGB8(temp, u, v, R, G, B);

	453 STORE8_4(A, R, G, B, temp);

	454 memcpy(dst, temp, length * 4 * sizeof(*dst));

	455 }

	456 }

	457

	// Converts 'length' pixels to 2-byte RGBA4444 in 'dst' via CALC_RGBA4444.
	// The locals R, G, B, RG, BA, tmp0 and tmp1 are the names that macro writes
	// to. Under WEBP_SWAP_16BIT_CSP the BA byte is stored before the RG byte.
	// Groups of 16 are stored directly; a remainder of 9..15 uses the 16-wide
	// variant and 1..8 the 8-wide variant, with 'temp' serving as zero-padded
	// luma input and as output staging, from which length * 2 bytes are copied.
	// NOTE(review): u and v are not copied to 'temp' in the remainder paths;
	// callers presumably guarantee a full vector is readable there -- confirm.
	458 static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,

	459 const uint8_t* v, uint8_t* dst, int length) {

	460 v16u8 R, G, B, RG, BA, tmp0, tmp1;

	461 while (length >= 16) {

	462 #ifdef WEBP_SWAP_16BIT_CSP

	463 CALC_RGBA4444(y, u, v, BA, RG, 16, dst);

	464 #else

	465 CALC_RGBA4444(y, u, v, RG, BA, 16, dst);

	466 #endif

	467 y += 16;

	468 u += 16;

	469 v += 16;

	470 dst += 16 * 2;

	471 length -= 16;

	472 }

	473 if (length > 8) {

	474 uint8_t temp[2 * 16] = { 0 };

	475 memcpy(temp, y, length * sizeof(*temp));

	476 #ifdef WEBP_SWAP_16BIT_CSP

	477 CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);

	478 #else

	479 CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);

	480 #endif

	481 memcpy(dst, temp, length * 2 * sizeof(*dst));

	482 } else if (length > 0) {

	483 uint8_t temp[2 * 8] = { 0 };

	484 memcpy(temp, y, length * sizeof(*temp));

	485 #ifdef WEBP_SWAP_16BIT_CSP

	486 CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);

	487 #else

	488 CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);

	489 #endif

	490 memcpy(dst, temp, length * 2 * sizeof(*dst));

	491 }

	492 }

	493

	// Converts 'length' pixels to 2-byte RGB565 in 'dst' via CALC_RGB565.
	// The locals R, G, B, RG, GB, tmp0 and tmp1 are the names that macro writes
	// to. Under WEBP_SWAP_16BIT_CSP the GB byte is stored before the RG byte.
	// Groups of 16 are stored directly; a remainder of 9..15 uses the 16-wide
	// variant and 1..8 the 8-wide variant, with 'temp' serving as zero-padded
	// luma input and as output staging, from which length * 2 bytes are copied.
	// NOTE(review): u and v are not copied to 'temp' in the remainder paths;
	// callers presumably guarantee a full vector is readable there -- confirm.
	494 static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,

	495 const uint8_t* v, uint8_t* dst, int length) {

	496 v16u8 R, G, B, RG, GB, tmp0, tmp1;

	497 while (length >= 16) {

	498 #ifdef WEBP_SWAP_16BIT_CSP

	499 CALC_RGB565(y, u, v, GB, RG, 16, dst);

	500 #else

	501 CALC_RGB565(y, u, v, RG, GB, 16, dst);

	502 #endif

	503 y += 16;

	504 u += 16;

	505 v += 16;

	506 dst += 16 * 2;

	507 length -= 16;

	508 }

	509 if (length > 8) {

	510 uint8_t temp[2 * 16] = { 0 };

	511 memcpy(temp, y, length * sizeof(*temp));

	512 #ifdef WEBP_SWAP_16BIT_CSP

	513 CALC_RGB565(temp, u, v, GB, RG, 16, temp);

	514 #else

	515 CALC_RGB565(temp, u, v, RG, GB, 16, temp);

	516 #endif

	517 memcpy(dst, temp, length * 2 * sizeof(*dst));

	518 } else if (length > 0) {

	519 uint8_t temp[2 * 8] = { 0 };

	520 memcpy(temp, y, length * sizeof(*temp));

	521 #ifdef WEBP_SWAP_16BIT_CSP

	522 CALC_RGB565(temp, u, v, GB, RG, 8, temp);

	523 #else

	524 CALC_RGB565(temp, u, v, RG, GB, 8, temp);

	525 #endif

	526 memcpy(dst, temp, length * 2 * sizeof(*dst));

	527 }

	528 }

	529

	// Upsamples one chroma plane for 32 output pixels. a and b are the top-row
	// samples at offsets 0 and 1, c and d the current-row samples at offsets 0
	// and 1. Two diagonal terms (diag1, diag2) are built from rounding averages
	// minus 1-bit corrections, averaged with the inputs and interleaved, and the
	// results overwrite a/b (top row) and c/d (bottom row).
	// NOTE(review): this appears to be the (9a + 3b + 3c + d + 8) / 16 kernel
	// used by the other fancy-upsampling backends -- confirm.
	// The bottom-row outputs are skipped when there is no bottom row. The guard
	// tests 'bot_y', which must be in scope at the expansion site. It used to
	// test 'pbot_y', but that pointer is derived as &bot_y[1] and so is not NULL
	// in practice even when bot_y is, which meant the skip never happened.
	// The '|' operator replaces the markup-escaped form found in the listing.
	530 #define UPSAMPLE_32PIXELS(a, b, c, d) do { \

	531 v16u8 s = __msa_aver_u_b(a, d); \

	532 v16u8 t = __msa_aver_u_b(b, c); \

	533 const v16u8 st = s ^ t; \

	534 v16u8 ad = a ^ d; \

	535 v16u8 bc = b ^ c; \

	536 v16u8 t0 = ad | bc; \

	537 v16u8 t1 = t0 | st; \

	538 v16u8 t2 = ANDI_B(t1, 1); \

	539 v16u8 t3 = __msa_aver_u_b(s, t); \

	540 const v16u8 k = t3 - t2; \

	541 v16u8 diag1, diag2; \

	542 AVER_UB2_UB(t, k, s, k, t0, t1); \

	543 bc = bc & st; \

	544 ad = ad & st; \

	545 t = t ^ k; \

	546 s = s ^ k; \

	547 t2 = bc | t; \

	548 t3 = ad | s; \

	549 t2 = ANDI_B(t2, 1); \

	550 t3 = ANDI_B(t3, 1); \

	551 SUB2(t0, t2, t1, t3, diag1, diag2); \

	552 AVER_UB2_UB(a, diag1, b, diag2, t0, t1); \

	553 ILVRL_B2_UB(t1, t0, a, b); \

	554 if (bot_y != NULL) { \

	555 AVER_UB2_UB(c, diag2, d, diag1, t0, t1); \

	556 ILVRL_B2_UB(t1, t0, c, d); \

	557 } \

	558 } while (0)

	559

	// Defines FUNC_NAME, a line-pair upsampler that converts 'len' pixels of the
	// top row (and of the bottom row when bot_y is not NULL) using scalar FUNC
	// for the edge pixels and FUNC##Line for the vectorized middle part. XSTEP is
	// the number of output bytes per pixel.
	//  - The first pixel of each row uses a (3 * near + far + 2) >> 2 blend of
	//    the first top/current chroma samples, with u in the low and v in the
	//    high 16 bits of one 32-bit word.
	//  - The middle is handled 16 chroma samples (32 pixels) at a time: chroma is
	//    loaded at offsets 0 and 1, upsampled with UPSAMPLE_32PIXELS, staged in
	//    temp_u / temp_v (top row at [0], bottom row at [32]) and converted.
	//  - A remaining 'size' < 16 goes through the same path on copies of the
	//    chroma rows and converts only size * 2 pixels.
	//  - When 'len' is even, the last pixel is converted like the first one.
	// When bot_y is NULL the bottom-row cursors are set to NULL and never
	// advanced, so no pointer arithmetic is performed on a NULL base. bot_dst is
	// assumed to be unused in that case (it is only written under bot_y != NULL).
	// The '|' operator replaces the markup-escaped form found in the listing.
	// NOTE(review): the remainder path always copies 17 bytes from each chroma
	// row, even when fewer samples remain; callers presumably provide rows that
	// are readable that far -- confirm.
	560 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \

	561 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \

	562 const uint8_t* top_u, const uint8_t* top_v, \

	563 const uint8_t* cur_u, const uint8_t* cur_v, \

	564 uint8_t* top_dst, uint8_t* bot_dst, int len) \

	565 { \

	566 int size = (len - 1) >> 1; \

	567 uint8_t temp_u[64]; \

	568 uint8_t temp_v[64]; \

	569 const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16)); \

	570 const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16)); \

	571 const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \

	572 const uint8_t* ptop_y = &top_y[1]; \

	573 uint8_t *ptop_dst = top_dst + XSTEP; \

	574 const uint8_t* pbot_y = (bot_y != NULL) ? &bot_y[1] : NULL; \

	575 uint8_t *pbot_dst = (bot_y != NULL) ? bot_dst + XSTEP : NULL; \

	576 \

	577 FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \

	578 if (bot_y != NULL) { \

	579 const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \

	580 FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst); \

	581 } \

	582 while (size >= 16) { \

	583 v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \

	584 LD_UB2(top_u, 1, tu0, tu1); \

	585 LD_UB2(cur_u, 1, cu0, cu1); \

	586 LD_UB2(top_v, 1, tv0, tv1); \

	587 LD_UB2(cur_v, 1, cv0, cv1); \

	588 UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \

	589 UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \

	590 ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \

	591 ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \

	592 FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32); \

	593 if (bot_y != NULL) { \

	594 FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32); \

	595 } \

	596 ptop_y += 32; \

	597 if (bot_y != NULL) { pbot_y += 32; } \

	598 ptop_dst += XSTEP * 32; \

	599 if (bot_y != NULL) { pbot_dst += XSTEP * 32; } \

	600 top_u += 16; \

	601 top_v += 16; \

	602 cur_u += 16; \

	603 cur_v += 16; \

	604 size -= 16; \

	605 } \

	606 if (size > 0) { \

	607 v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \

	608 memcpy(&temp_u[ 0], top_u, 17 * sizeof(uint8_t)); \

	609 memcpy(&temp_u[32], cur_u, 17 * sizeof(uint8_t)); \

	610 memcpy(&temp_v[ 0], top_v, 17 * sizeof(uint8_t)); \

	611 memcpy(&temp_v[32], cur_v, 17 * sizeof(uint8_t)); \

	612 LD_UB2(&temp_u[ 0], 1, tu0, tu1); \

	613 LD_UB2(&temp_u[32], 1, cu0, cu1); \

	614 LD_UB2(&temp_v[ 0], 1, tv0, tv1); \

	615 LD_UB2(&temp_v[32], 1, cv0, cv1); \

	616 UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \

	617 UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \

	618 ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \

	619 ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \

	620 FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2); \

	621 if (bot_y != NULL) { \

	622 FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2); \

	623 } \

	624 top_u += size; \

	625 top_v += size; \

	626 cur_u += size; \

	627 cur_v += size; \

	628 } \

	629 if (!(len & 1)) { \

	630 const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16)); \

	631 const uint32_t c0 = ((cur_u[0]) | ((cur_v[0]) << 16)); \

	632 const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2; \

	633 FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16), \

	634 top_dst + (len - 1) * XSTEP); \

	635 if (bot_y != NULL) { \

	636 const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2; \

	637 FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16), \

	638 bot_dst + (len - 1) * XSTEP); \

	639 } \

	640 } \

	641 }

	642

	// Instantiates one line-pair upsampler per output format. The third argument
	// is XSTEP, the number of output bytes per pixel (3, 4 or 2). The second
	// argument names the scalar per-pixel converter; the macro also calls the
	// matching <name>Line vector routine.
	643 UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)

	644 UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)

	645 UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)

	646 UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)

	647 UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)

	648 UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)

	649 UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)

	650

	651 //------------------------------------------------------------------------------

	652 // Entry point

	653

	// Table of per-colorspace upsamplers, defined elsewhere and indexed by the
	// MODE_* constants.
	654 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];

	655

	656 extern void WebPInitUpsamplersMSA(void);

	657

	// Installs the MSA line-pair upsamplers defined in this file into
	// WebPUpsamplers[]. MODE_rgbA, MODE_bgrA, MODE_Argb and MODE_rgbA_4444 are
	// given the same functions as MODE_RGBA, MODE_BGRA, MODE_ARGB and
	// MODE_RGBA_4444 respectively.
	658 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {

	659 WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;

	660 WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;

	661 WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;

	662 WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;

	663 WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;

	664 WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;

	665 WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;

	666 WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;

	667 WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;

	668 WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;

	669 WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;

	670 }

	671

	672 #endif // FANCY_UPSAMPLING

	673

	674 #endif // WEBP_USE_MSA

	675

	// Fallback for builds where FANCY_UPSAMPLING and WEBP_USE_MSA are not both
	// defined, i.e. when the real WebPInitUpsamplersMSA() above is compiled out.
	// WEBP_DSP_INIT_STUB presumably expands to an empty definition of the named
	// function -- confirm against dsp.h.
	676 #if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))

	677 WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)

	678 #endif

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/upsampling.c ('k') | third_party/libwebp/dsp/upsampling_neon.c » ('j') | no next file with comments »