OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 83 matching lines...) | |
94 ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); | 94 ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); |
95 ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); | 95 ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); |
96 ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); | 96 ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); |
97 src_u += 16; | 97 src_u += 16; |
98 src_v += 16; | 98 src_v += 16; |
99 src_y += 32; | 99 src_y += 32; |
100 dst_uyvy += 64; | 100 dst_uyvy += 64; |
101 } | 101 } |
102 } | 102 } |
103 | 103 |
104 void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) { | |
105 int x; | |
106 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
107 v16u8 dst0, dst1, dst2, dst3; | |
108 | |
109 for (x = 0; x < width; x += 64) { | |
fbarchard1 (2016/10/05 22:03:58): suggest 32 at a time... or 16. 4 of everything re…
manojkumar.bhosale (2016/10/07 10:42:09): We have done loop unrolling considering the load l…
(A sketch of the suggested 32-pixels-per-iteration shape follows the function below.)
| |
110 LD_UB8(src_yuy2, 16, src0, src1, src2, src3, src4, src5, src6, src7); | |
111 PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
112 dst0, dst1, dst2, dst3); | |
113 ST_UB4(dst0, dst1, dst2, dst3, dst_y, 16); | |
114 src_yuy2 += 128; | |
115 dst_y += 64; | |
fbarchard1 (2016/10/05 22:03:58): this is unrolled quite a lot. other platforms are…
manojkumar.bhosale (2016/10/07 10:42:09): Done.
| |
116 } | |
117 } | |
118 | |
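For reference, a minimal sketch of the reduced unrolling suggested above: 32 Y pixels per iteration instead of 64, reusing the LD/PCKEV/ST helpers from libyuv's macros_msa.h already used in this file. This illustrates the suggestion, not necessarily the exact code that landed.

  void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
    int x;
    v16u8 src0, src1, src2, src3, dst0, dst1;

    for (x = 0; x < width; x += 32) {
      LD_UB4(src_yuy2, 16, src0, src1, src2, src3);     // 64 bytes = 32 pixels
      PCKEV_B2_UB(src1, src0, src3, src2, dst0, dst1);  // even bytes are Y
      ST_UB2(dst0, dst1, dst_y, 16);
      src_yuy2 += 64;
      dst_y += 32;
    }
  }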
119 void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2, | |
120 uint8* dst_u, uint8* dst_v, int width) { | |
121 const uint8* nxt = src_yuy2 + src_stride_yuy2; | |
122 int x; | |
123 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
124 v16u8 vec0, vec1, dst0, dst1; | |
125 | |
126 for (x = 0; x < width; x += 32) { | |
127 LD_UB4(src_yuy2, 16, src0, src1, src2, src3); | |
128 LD_UB4(nxt, 16, src4, src5, src6, src7); | |
129 PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
130 src0, src1, src2, src3); | |
131 AVER_UB2_UB(src0, src2, src1, src3, vec0, vec1); | |
132 dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0); | |
fbarchard1 (2016/10/05 22:03:58): style nit - i like seeing the intrinsics (or inli…
manojkumar.bhosale (2016/10/07 10:42:09): Acknowledged.
(The sketch after this function spells out what the pack macros expand to.)
| |
133 dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0); | |
134 ST_UB(dst0, dst_u); | |
135 ST_UB(dst1, dst_v); | |
136 src_yuy2 += 64; | |
137 nxt += 64; | |
138 dst_u += 16; | |
139 dst_v += 16; | |
140 } | |
141 } | |
142 | |
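On the style note above: the pack macros are thin wrappers over the raw intrinsics. A minimal sketch of what PCKEV_B2_UB expands to, assuming the definitions in libyuv's macros_msa.h (the helper name here is hypothetical):

  #include <msa.h>

  // __msa_pckev_b keeps the even-indexed bytes of both operands, placing
  // those of its second operand in the low half of the result. The _UB
  // macro variant just adds the v16u8 casts around the v16i8 builtin.
  static void PckevB2UB(v16u8 in0, v16u8 in1, v16u8 in2, v16u8 in3,
                        v16u8* out0, v16u8* out1) {
    *out0 = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1);
    *out1 = (v16u8)__msa_pckev_b((v16i8)in2, (v16i8)in3);
  }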
143 void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, | |
fbarchard1 (2016/10/05 22:03:58): prefer this be same as YUY2ToUV and do 32 at a ti…
manojkumar.bhosale (2016/10/07 10:42:09): Done.
(A sketch of the requested 32-pixels-per-iteration shape follows the function below.)
| |
144 int width) { | |
145 int x; | |
146 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
147 v16u8 dst0, dst1, dst2, dst3; | |
148 | |
149 for (x = 0; x < width; x += 64) { | |
150 LD_UB8(src_yuy2, 16, src0, src1, src2, src3, src4, src5, src6, src7); | |
151 PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
152 src0, src1, src2, src3); | |
153 PCKEV_B2_UB(src1, src0, src3, src2, dst0, dst1); | |
154 PCKOD_B2_UB(src1, src0, src3, src2, dst2, dst3); | |
155 ST_UB2(dst0, dst1, dst_u, 16); | |
156 ST_UB2(dst2, dst3, dst_v, 16); | |
157 src_yuy2 += 128; | |
158 dst_u += 32; | |
159 dst_v += 32; | |
160 } | |
161 } | |
162 | |
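As with YUY2ToYRow above, a minimal sketch of the requested 32-pixels-per-iteration shape for this routine, reusing the same macros_msa.h helpers; an illustration of the request rather than the exact code that landed:

  void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                          int width) {
    int x;
    v16u8 src0, src1, src2, src3, dst0, dst1;

    for (x = 0; x < width; x += 32) {
      LD_UB4(src_yuy2, 16, src0, src1, src2, src3);     // 32 YUY2 pixels
      PCKOD_B2_UB(src1, src0, src3, src2, src0, src1);  // odd bytes: UVUV...
      dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);  // U samples
      dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);  // V samples
      ST_UB(dst0, dst_u);
      ST_UB(dst1, dst_v);
      src_yuy2 += 64;
      dst_u += 16;
      dst_v += 16;
    }
  }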
163 void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) { | |
164 int x; | |
165 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
166 v16u8 dst0, dst1, dst2, dst3; | |
167 | |
168 for (x = 0; x < width; x += 64) { | |
169 LD_UB8(src_uyvy, 16, src0, src1, src2, src3, src4, src5, src6, src7); | |
170 PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
171 dst0, dst1, dst2, dst3); | |
172 ST_UB4(dst0, dst1, dst2, dst3, dst_y, 16); | |
173 src_uyvy += 128; | |
174 dst_y += 64; | |
175 } | |
176 } | |
177 | |
178 void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy, | |
179 uint8* dst_u, uint8* dst_v, int width) { | |
180 const uint8* nxt = src_uyvy + src_stride_uyvy; | |
181 int x; | |
182 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
183 v16u8 vec0, vec1, dst0, dst1; | |
184 | |
185 for (x = 0; x < width; x += 32) { | |
186 LD_UB4(src_uyvy, 16, src0, src1, src2, src3); | |
187 LD_UB4(nxt, 16, src4, src5, src6, src7); | |
188 PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
189 src0, src1, src2, src3); | |
190 AVER_UB2_UB(src0, src2, src1, src3, vec0, vec1); | |
191 dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0); | |
192 dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0); | |
fbarchard1 (2016/10/05 22:03:58): the i8 version of vectors seems to be preferred?
manojkumar.bhosale (2016/10/07 10:42:09): As we are operating on pixel data (u8), we have us…
(The sketch after this function illustrates why the casts are needed.)
| |
193 ST_UB(dst0, dst_u); | |
194 ST_UB(dst1, dst_v); | |
195 src_uyvy += 64; | |
196 nxt += 64; | |
197 dst_u += 16; | |
198 dst_v += 16; | |
199 } | |
200 } | |
201 | |
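On the i8-vs-u8 question above: the GCC builtins for the pck* instructions are declared on signed byte vectors (v16i8), while pixel data is naturally unsigned (v16u8), hence the casts. They only reinterpret the vector type; no value conversion happens. A minimal sketch (hypothetical helper name):

  #include <msa.h>

  // Pack the even-indexed bytes of |lo| and |hi| into one vector: even
  // bytes of |lo| land in the low half, those of |hi| in the high half.
  // The casts move between unsigned and signed vector types without
  // touching any bytes.
  static inline v16u8 PackEvenBytes(v16u8 lo, v16u8 hi) {
    return (v16u8)__msa_pckev_b((v16i8)hi, (v16i8)lo);
  }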
202 void UYVYToUV422Row_MSA(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, | |
203 int width) { | |
204 int x; | |
205 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
206 v16u8 dst0, dst1, dst2, dst3; | |
207 | |
208 for (x = 0; x < width; x += 64) { | |
209 LD_UB8(src_uyvy, 16, src0, src1, src2, src3, src4, src5, src6, src7); | |
210 PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
211 src0, src1, src2, src3); | |
212 PCKEV_B2_UB(src1, src0, src3, src2, dst0, dst1); | |
213 PCKOD_B2_UB(src1, src0, src3, src2, dst2, dst3); | |
214 ST_UB2(dst0, dst1, dst_u, 16); | |
215 ST_UB2(dst2, dst3, dst_v, 16); | |
216 src_uyvy += 128; | |
217 dst_u += 32; | |
218 dst_v += 32; | |
219 } | |
220 } | |
221 | |
104 #ifdef __cplusplus | 222 #ifdef __cplusplus |
105 } // extern "C" | 223 } // extern "C" |
106 } // namespace libyuv | 224 } // namespace libyuv |
107 #endif | 225 #endif |
108 | 226 |
109 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 227 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |