OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 35 matching lines...) | |
46 for (x = 0; x < width; x += 16) { | 46 for (x = 0; x < width; x += 16) { |
47 LD_UB4(src, 16, src3, src2, src1, src0); | 47 LD_UB4(src, 16, src3, src2, src1, src0); |
48 VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); | 48 VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); |
49 VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); | 49 VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); |
50 ST_UB4(dst0, dst1, dst2, dst3, dst, 16); | 50 ST_UB4(dst0, dst1, dst2, dst3, dst, 16); |
51 dst += 64; | 51 dst += 64; |
52 src -= 64; | 52 src -= 64; |
53 } | 53 } |
54 } | 54 } |
55 | 55 |
56 void I422ToYUY2Row_MSA(const uint8* src_y, | |
57 const uint8* src_u, | |
58 const uint8* src_v, | |
59 uint8* dst_yuy2, | |
60 int width) { | |
61 int x; | |
62 v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; | |
63 v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; | |
64 | |
65 for (x = 0; x < width; x += 32) { | |
66 src_u0 = LD_UB(src_u); | |
67 src_v0 = LD_UB(src_v); | |
68 LD_UB2(src_y, 16, src_y0, src_y1); | |
69 ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); | |
fbarchard1 2016/09/29 17:10:43
Did you try just 1 vector at a time instead of 2?
manojkumar.bhosale 2016/09/30 09:00:29
all the loop unrolling we have done is based on lo
(A sketch of the single-vector variant discussed here follows the diff below.)
70 ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); | |
71 ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); | |
72 ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); | |
73 src_u += 16; | |
74 src_v += 16; | |
75 src_y += 32; | |
76 dst_yuy2 += 64; | |
77 } | |
78 } | |
79 | |
80 void I422ToUYVYRow_MSA(const uint8* src_y, | |
81 const uint8* src_u, | |
82 const uint8* src_v, | |
83 uint8* dst_uyvy, | |
84 int width) { | |
85 int x; | |
86 v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; | |
87 v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; | |
88 | |
89 for (x = 0; x < width; x += 32) { | |
90 src_u0 = LD_UB(src_u); | |
91 src_v0 = LD_UB(src_v); | |
92 LD_UB2(src_y, 16, src_y0, src_y1); | |
93 ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); | |
94 ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); | |
95 ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); | |
96 ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); | |
97 src_u += 16; | |
98 src_v += 16; | |
99 src_y += 32; | |
100 dst_uyvy += 64; | |
101 } | |
102 } | |
103 | |
56 #ifdef __cplusplus | 104 #ifdef __cplusplus |
57 } // extern "C" | 105 } // extern "C" |
58 } // namespace libyuv | 106 } // namespace libyuv |
59 #endif | 107 #endif |
60 | 108 |
61 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 109 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |
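For reference, here is a minimal sketch of the single-vector variant fbarchard1 asked about: 16 output pixels per iteration instead of 32. This is not the committed code; the function name I422ToYUY2Row_MSA_1x is hypothetical, and it assumes the same LD_UB / ILVRL_B2_UB / ST_UB helpers from libyuv's MSA macro header and the __msa_ilvr_b builtin are in scope, as they are in row_msa.cc.

// Hypothetical single-vector variant (sketch only, not part of this CL).
// Assumes libyuv's MSA helpers (LD_UB, ILVRL_B2_UB, ST_UB) are available.
void I422ToYUY2Row_MSA_1x(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_yuy2,
                          int width) {
  int x;
  v16u8 src_u0, src_v0, src_y0, vec_uv0, dst0, dst1;

  for (x = 0; x < width; x += 16) {
    src_u0 = LD_UB(src_u);  // 8 U samples used (low half of the vector)
    src_v0 = LD_UB(src_v);  // 8 V samples used (low half of the vector)
    src_y0 = LD_UB(src_y);  // 16 Y samples
    // Interleave U and V bytes: U0 V0 U1 V1 ...
    vec_uv0 = (v16u8)__msa_ilvr_b((v16i8)src_v0, (v16i8)src_u0);
    // Interleave Y with UV to produce YUY2: Y0 U0 Y1 V0 ...
    ILVRL_B2_UB(vec_uv0, src_y0, dst0, dst1);
    ST_UB(dst0, dst_yuy2);
    ST_UB(dst1, dst_yuy2 + 16);
    src_u += 8;
    src_v += 8;
    src_y += 16;
    dst_yuy2 += 32;  // 16 pixels * 2 bytes per YUY2 pixel
  }
}

The committed version above instead processes 32 pixels per iteration, trading a little register pressure for fewer loop iterations and stores per row.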