source/row_msa.cc - Issue 2421843002: Add MSA optimized ARGB4444ToI420 and ARGB4444ToARGB functions

Side by Side Diff: source/row_msa.cc

Issue 2421843002: Add MSA optimized ARGB4444ToI420 and ARGB4444ToARGB functions (Closed)

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 203 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
214 dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);	214 dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);

215 dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);	215 dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);

216 ST_UB(dst0, dst_u);	216 ST_UB(dst0, dst_u);

217 ST_UB(dst1, dst_v);	217 ST_UB(dst1, dst_v);

218 src_uyvy += 64;	218 src_uyvy += 64;

219 dst_u += 16;	219 dst_u += 16;

220 dst_v += 16;	220 dst_v += 16;

221 }	221 }

222 }	222 }

223	223

	// Converts one row of 16-bit ARGB4444 pixels into one row of 8-bit values
	// written to dst_y. Every loop iteration reads 16 pixels (two vectors of
	// eight uint16) and stores 16 bytes.
	// NOTE(review): the loop steps by 16 with no tail handling in this function;
	// presumably the caller only passes widths that are multiples of 16 - confirm.
	224 void ARGB4444ToYRow_MSA(const uint8* src_argb4444, uint8* dst_y, int width) {

	225 int x;

	// NOTE(review): the (uint16*) cast drops the const qualifier of the
	// parameter before the declaration re-adds it; (const uint16*) would be
	// cleaner.
	226 const uint16* src_argb4444_u16 = (uint16*) src_argb4444;

	227 v8u16 src0, src1;

	228 v8u16 vec0, vec1, vec2, vec3, vec4, vec5;

	229 v16u8 dst0;

	// Per-field weights 0x19 (25), 0x81 (129), 0x42 (66) and the additive
	// constant 0x1080. NOTE(review): these look like the usual BT.601 RGB to Y
	// coefficients with a +16 offset and rounding - confirm.
	230 v8u16 const_0x19 = (v8u16) __msa_ldi_h(0x19);

	231 v8u16 const_0x81 = (v8u16) __msa_ldi_h(0x81);

	232 v8u16 const_0x42 = (v8u16) __msa_ldi_h(0x42);

	233 v8u16 const_0x1080 = (v8u16) __msa_fill_h(0x1080);

	// Mask selecting one 4-bit field of a 16-bit pixel.
	234 v8u16 const_0x0F = (v8u16) __msa_ldi_h(0x0F);

	235

	236 for (x = 0; x < width; x += 16) {

	237 LD_UH2(src_argb4444_u16, 8, src0, src1);

	// Bits 0..3 of every pixel (presumably blue).
	238 vec0 = src0 & const_0x0F;

	239 vec1 = src1 & const_0x0F;

	// Shift right by 4 and mask: bits 4..7 of every pixel (presumably green).
	// The shift is arithmetic, but the following mask discards any
	// sign-extended upper bits.
	240 src0 = (v8u16) __msa_srai_h((v8i16) src0, 4);

	241 src1 = (v8u16) __msa_srai_h((v8i16) src1, 4);

	242 vec2 = src0 & const_0x0F;

	243 vec3 = src1 & const_0x0F;

	// Shift right by 4 again and mask: bits 8..11 of every pixel (presumably
	// red). Bits 12..15 (presumably alpha) are never used.
	244 src0 = (v8u16) __msa_srai_h((v8i16) src0, 4);

	245 src1 = (v8u16) __msa_srai_h((v8i16) src1, 4);

	246 vec4 = src0 & const_0x0F;

	247 vec5 = src1 & const_0x0F;

	// Widen every 4-bit value v to 8 bits as (v << 4) | v.
	248 vec0 \|= (v8u16) __msa_slli_h((v8i16) vec0, 4);

	249 vec1 \|= (v8u16) __msa_slli_h((v8i16) vec1, 4);

	250 vec2 \|= (v8u16) __msa_slli_h((v8i16) vec2, 4);

	251 vec3 \|= (v8u16) __msa_slli_h((v8i16) vec3, 4);

	252 vec4 \|= (v8u16) __msa_slli_h((v8i16) vec4, 4);

	253 vec5 \|= (v8u16) __msa_slli_h((v8i16) vec5, 4);

	// Weighted sum per pixel:
	// 0x19 * field0 + 0x81 * field1 + 0x42 * field2 + 0x1080.
	254 vec0 *= const_0x19;
	fbarchard1 2016/10/14 21:35:16 FYI The YUV to RGB functions now take constants as FYI The YUV to RGB functions now take constants as a parameter. We should do that for RGB to YUV at some point too, allowing jpeg and bt.709 coefficients. manojkumar.bhosale 2016/10/19 11:56:27 OK. will then fix them when changes happens. Show quoted text On 2016/10/14 21:35:16, fbarchard1 wrote: > FYI The YUV to RGB functions now take constants as a parameter. We should do > that for RGB to YUV at some point too, allowing jpeg and bt.709 coefficients. OK. will then fix them when changes happens.
	255 vec1 *= const_0x19;

	256 vec2 *= const_0x81;

	257 vec3 *= const_0x81;

	258 vec4 *= const_0x42;

	259 vec5 *= const_0x42;

	260 vec0 += vec2;

	261 vec1 += vec3;

	262 vec0 += vec4;

	263 vec1 += vec5;

	264 vec0 += const_0x1080;

	265 vec1 += const_0x1080;

	// Divide the sum by 256. NOTE(review): the sum can exceed 0x7FFF
	// (255 * 220 + 0x1080), so this arithmetic shift sign-extends; presumably
	// harmless because the pack below keeps only the low byte of each
	// halfword - confirm against the MSA reference, or use a logical shift.
	266 vec0 = (v8u16) __msa_srai_h((v8i16) vec0, 8);

	267 vec1 = (v8u16) __msa_srai_h((v8i16) vec1, 8);

	// Narrow the two 8 x 16-bit vectors into one 16 x 8-bit vector and store it.
	268 dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);

	269 ST_UB(dst0, dst_y);

	// Advance by 16 pixels on both sides (the source pointer counts uint16).
	270 src_argb4444_u16 += 16;

	271 dst_y += 16;

	272 }

	273 }

	274

	// Produces one row of dst_u and one row of dst_v from two rows of ARGB4444
	// pixels: the row at src_argb4444 and the row src_stride_argb4444 bytes
	// after it. Every loop iteration reads 64 bytes (32 two-byte pixels) from
	// each of the two rows and stores 16 bytes to dst_u and 16 bytes to dst_v,
	// i.e. one output sample per 2x2 block of input pixels.
	// NOTE(review): the loop steps by 32 with no tail handling in this function;
	// presumably the caller only passes suitable widths - confirm.
	275 void ARGB4444ToUVRow_MSA(const uint8* src_argb4444,

	276 int src_stride_argb4444,

	277 uint8* dst_u, uint8* dst_v, int width) {

	278 int x;

	// Start of the second input row.
	279 const uint8* src_argb4444_next = src_argb4444 + src_stride_argb4444;

	280 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

	281 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;

	282 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;

	283 v16u8 dst0, dst1;

	// Weights 0x70 (112), 0x4A (74), 0x26 (38), 0x5E (94), 0x12 (18) and the
	// additive constant 0x8080. NOTE(review): these look like the usual BT.601
	// RGB to U/V coefficients with a +128 offset and rounding - confirm.
	284 v8u16 const_0x70 = (v8u16) __msa_ldi_h(0x70);

	285 v8u16 const_0x4A = (v8u16) __msa_ldi_h(0x4A);

	286 v8u16 const_0x26 = (v8u16) __msa_ldi_h(0x26);

	287 v8u16 const_0x5E = (v8u16) __msa_ldi_h(0x5E);

	288 v8u16 const_0x12 = (v8u16) __msa_ldi_h(0x12);

	289 v8u16 const_0x8080 = (v8u16) __msa_fill_h(0x8080);

	290

	291 for (x = 0; x < width; x += 32) {

	// src0..src3 come from the first row, src4..src7 from the second row.
	292 LD_UB4(src_argb4444, 16, src0, src1, src2, src3);

	293 LD_UB4(src_argb4444_next, 16, src4, src5, src6, src7);

	// Low nibble of every byte, added across the two rows (max 15 + 15).
	294 reg0 = __msa_andi_b(src0, 0x0F);

	295 reg1 = __msa_andi_b(src1, 0x0F);

	296 reg2 = __msa_andi_b(src2, 0x0F);

	297 reg3 = __msa_andi_b(src3, 0x0F);

	298 reg0 += __msa_andi_b(src4, 0x0F);

	299 reg1 += __msa_andi_b(src5, 0x0F);

	300 reg2 += __msa_andi_b(src6, 0x0F);

	301 reg3 += __msa_andi_b(src7, 0x0F);

	// High nibble of every byte, moved down by 4 and added across the two rows.
	302 src0 = __msa_andi_b(src0, 0xF0);

	303 src1 = __msa_andi_b(src1, 0xF0);

	304 src2 = __msa_andi_b(src2, 0xF0);

	305 src3 = __msa_andi_b(src3, 0xF0);

	306 src4 = __msa_andi_b(src4, 0xF0);

	307 src5 = __msa_andi_b(src5, 0xF0);

	308 src6 = __msa_andi_b(src6, 0xF0);

	309 src7 = __msa_andi_b(src7, 0xF0);

	310 reg4 = (v16u8) __msa_srli_b((v16i8) src0, 4);

	311 reg5 = (v16u8) __msa_srli_b((v16i8) src1, 4);

	312 reg6 = (v16u8) __msa_srli_b((v16i8) src2, 4);

	313 reg7 = (v16u8) __msa_srli_b((v16i8) src3, 4);

	314 reg4 += (v16u8) __msa_srli_b((v16i8) src4, 4);

	315 reg5 += (v16u8) __msa_srli_b((v16i8) src5, 4);

	316 reg6 += (v16u8) __msa_srli_b((v16i8) src6, 4);

	317 reg7 += (v16u8) __msa_srli_b((v16i8) src7, 4);

	// Split the nibble sums by byte position within each two-byte pixel.
	// Going by the intrinsic names (pckev = pack even, pckod = pack odd -
	// confirm against the MSA reference): reg8/reg9 take the low-nibble sums
	// of odd bytes, reg0/reg1 the low-nibble sums of even bytes, and reg2/reg3
	// the high-nibble sums of even bytes. The high-nibble sums of odd bytes
	// (presumably alpha) are not extracted.
	318 reg8 = (v16u8) __msa_pckod_b((v16i8) reg1, (v16i8) reg0);

	319 reg9 = (v16u8) __msa_pckod_b((v16i8) reg3, (v16i8) reg2);

	320 reg0 = (v16u8) __msa_pckev_b((v16i8) reg1, (v16i8) reg0);

	321 reg1 = (v16u8) __msa_pckev_b((v16i8) reg3, (v16i8) reg2);

	322 reg2 = (v16u8) __msa_pckev_b((v16i8) reg5, (v16i8) reg4);

	323 reg3 = (v16u8) __msa_pckev_b((v16i8) reg7, (v16i8) reg6);

	// Horizontal add with the same vector as both operands, widening bytes to
	// halfwords; presumably this sums horizontally adjacent pixels so each
	// halfword holds the total over a 2x2 block - confirm hadd_u_h semantics.
	324 vec0 = __msa_hadd_u_h(reg0, reg0);

	325 vec1 = __msa_hadd_u_h(reg1, reg1);

	326 vec2 = __msa_hadd_u_h(reg2, reg2);

	327 vec3 = __msa_hadd_u_h(reg3, reg3);

	328 vec4 = __msa_hadd_u_h(reg8, reg8);

	329 vec5 = __msa_hadd_u_h(reg9, reg9);

	// Scale each sum s to an 8-bit range as t = s << 2, then t | (t >> 6).
	330 vec0 = (v8u16) __msa_slli_h((v8i16) vec0, 2);

	331 vec1 = (v8u16) __msa_slli_h((v8i16) vec1, 2);

	332 vec2 = (v8u16) __msa_slli_h((v8i16) vec2, 2);

	333 vec3 = (v8u16) __msa_slli_h((v8i16) vec3, 2);

	334 vec4 = (v8u16) __msa_slli_h((v8i16) vec4, 2);

	335 vec5 = (v8u16) __msa_slli_h((v8i16) vec5, 2);

	336 vec0 \|= (v8u16) __msa_srai_h((v8i16) vec0, 6);

	337 vec1 \|= (v8u16) __msa_srai_h((v8i16) vec1, 6);

	338 vec2 \|= (v8u16) __msa_srai_h((v8i16) vec2, 6);

	339 vec3 \|= (v8u16) __msa_srai_h((v8i16) vec3, 6);

	340 vec4 \|= (v8u16) __msa_srai_h((v8i16) vec4, 6);

	341 vec5 \|= (v8u16) __msa_srai_h((v8i16) vec5, 6);
	fbarchard1 2016/10/14 21:35:16 I'm concerned that this is a lot of code for a for I'm concerned that this is a lot of code for a format/function that is rarely used. ABGRToI420 or ARGBToI420 comes up a bit on android for screen casting, but screen casting doesnt happen on android much... its more for desktops. To reduce the code I would 1. take the approach intel version do, which is just do ARGB4444ToARGBRow_MSA and then use ARGBToY and ARGBToUV. 2. use macros for ARGB4444TOARGB and perhaps for RGBTOYUV manojkumar.bhosale 2016/10/19 11:56:27 Done. Show quoted text On 2016/10/14 21:35:16, fbarchard1 wrote: > I'm concerned that this is a lot of code for a format/function that is rarely > used. > ABGRToI420 or ARGBToI420 comes up a bit on android for screen casting, but > screen casting doesnt happen on android much... its more for desktops. > To reduce the code I would > 1. take the approach intel version do, which is just do ARGB4444ToARGBRow_MSA > and then use ARGBToY and ARGBToUV. > 2. use macros for ARGB4444TOARGB and perhaps for RGBTOYUV Done.
	// With a = vec0/vec1, b = vec2/vec3, c = vec4/vec5 as they stand here, the
	// statements below compute, per halfword:
	//   value stored to dst_u = (0x70 * a - 0x4A * b - 0x26 * c + 0x8080) >> 8
	//   value stored to dst_v = (0x70 * c - 0x5E * b - 0x12 * a + 0x8080) >> 8
	// vec6..vec9 hold the dst_u terms; vec0..vec5 are reused in place for the
	// dst_v terms, so the statement order matters.
	342 vec6 = vec0 * const_0x70;

	343 vec7 = vec1 * const_0x70;

	344 vec8 = vec2 * const_0x4A;

	345 vec9 = vec3 * const_0x4A;

	346 vec0 *= const_0x12;

	347 vec1 *= const_0x12;

	348 vec2 *= const_0x5E;

	349 vec3 *= const_0x5E;

	350 vec6 += const_0x8080;

	351 vec7 += const_0x8080;

	352 vec8 += vec4 * const_0x26;

	353 vec9 += vec5 * const_0x26;

	354 vec4 *= const_0x70;

	355 vec5 *= const_0x70;

	356 vec2 += vec0;

	357 vec3 += vec1;

	358 vec4 += const_0x8080;

	359 vec5 += const_0x8080;

	360 vec0 = vec6 - vec8;

	361 vec1 = vec7 - vec9;

	362 vec2 = vec4 - vec2;

	363 vec3 = vec5 - vec3;

	// Logical shift right by 8, then narrow halfwords to bytes and store.
	364 vec0 = (v8u16) __msa_srli_h((v8i16) vec0, 8);

	365 vec1 = (v8u16) __msa_srli_h((v8i16) vec1, 8);

	366 vec2 = (v8u16) __msa_srli_h((v8i16) vec2, 8);

	367 vec3 = (v8u16) __msa_srli_h((v8i16) vec3, 8);

	368 dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);

	369 dst1 = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);

	370 ST_UB(dst0, dst_u);

	371 ST_UB(dst1, dst_v);

	// Advance both input rows by 64 bytes and both outputs by 16 bytes.
	372 src_argb4444 += 64;

	373 src_argb4444_next += 64;
	fbarchard1 2016/10/14 21:35:16 on other platforms I'd typically unroll less than on other platforms I'd typically unroll less than this... doing same read as ARGBTOY, then subsampling and storing less UV. Allows macros to share for first part, and should achieve nearly same performance with less code.
	374 dst_u += 16;

	375 dst_v += 16;

	376 }

	377 }

	378

	// Expands one row of ARGB4444 pixels (2 bytes per pixel) into dst_argb with
	// one byte per 4-bit field (4 bytes per pixel). Every loop iteration reads
	// 32 source bytes (16 pixels) and stores 64 destination bytes.
	// NOTE(review): the loop steps by 16 with no tail handling in this function;
	// presumably the caller only passes widths that are multiples of 16 - confirm.
	379 void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, uint8* dst_argb,

	380 int width) {

	381 int x;

	382 v16u8 src0, src1;

	383 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

	384 v16u8 dst0, dst1, dst2, dst3;

	385

	386 for (x = 0; x < width; x += 16) {

	387 LD_UB2(src_argb4444, 16, src0, src1);

	// Low nibble of every source byte.
	388 vec0 = (v8u16) __msa_andi_b(src0, 0x0F);

	389 vec1 = (v8u16) __msa_andi_b(src1, 0x0F);

	// High nibble of every source byte, still in the upper 4 bits.
	390 vec2 = (v8u16) __msa_andi_b(src0, 0xF0);

	391 vec3 = (v8u16) __msa_andi_b(src1, 0xF0);

	// Replicate each nibble into both halves of its byte:
	// low | (low << 4) and high | (high >> 4).
	392 vec4 = (v8u16) __msa_slli_b((v16i8) vec0, 4);

	393 vec5 = (v8u16) __msa_slli_b((v16i8) vec1, 4);

	394 vec6 = (v8u16) __msa_srli_b((v16i8) vec2, 4);

	395 vec7 = (v8u16) __msa_srli_b((v16i8) vec3, 4);

	396 vec0 \|= vec4;

	397 vec1 \|= vec5;

	398 vec2 \|= vec6;

	399 vec3 \|= vec7;

	// Interleave bytes of the low-nibble and high-nibble vectors so that each
	// source byte yields two adjacent output bytes.
	// NOTE(review): the resulting byte order depends on the operand order of
	// ilvr_b/ilvl_b - confirm against the MSA reference.
	400 dst0 = (v16u8) __msa_ilvr_b((v16i8) vec2, (v16i8) vec0);

	401 dst1 = (v16u8) __msa_ilvl_b((v16i8) vec2, (v16i8) vec0);

	402 dst2 = (v16u8) __msa_ilvr_b((v16i8) vec3, (v16i8) vec1);

	403 dst3 = (v16u8) __msa_ilvl_b((v16i8) vec3, (v16i8) vec1);

	404 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);

	// Advance by 16 pixels: 32 source bytes, 64 destination bytes.
	405 src_argb4444 += 32;

	406 dst_argb += 64;

	407 }

	408 }

	409

224 #ifdef __cplusplus	410 #ifdef __cplusplus

225 } // extern "C"	411 } // extern "C"

226 } // namespace libyuv	412 } // namespace libyuv

227 #endif	413 #endif

228	414

229 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)	415 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)

OLD	NEW

« source/convert.cc ('K') | « source/row_any.cc ('k') | no next file » | no next file with comments »