Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(21)

Unified Diff: source/row_msa.cc

Issue 2430313005: Add MSA optimized I422ToARGBRow_MSA and I422ToRGBARow_MSA functions (Closed)
Patch Set: Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: source/row_msa.cc
diff --git a/source/row_msa.cc b/source/row_msa.cc
index 3a55b0b7554a2a0e4506a633e73242c879187f74..d0990b4ff8f799d82157da76e6a145e7e22df81c 100644
--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -19,6 +19,66 @@ namespace libyuv {
extern "C" {
#endif
+#define I422TORGB(in0, in1, in2, ub, vr, ug, vg, \
fbarchard1 2016/10/24 17:36:32 Could use a brief comment to describe what this ma
+ bb, bg, br, yg, out0, out1, out2) { \
+ v8i16 vec0_m; \
+ v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
+ v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \
+ v4i32 max_val_m = __msa_ldi_w(255); \
+ v8i16 zero_m = { 0 }; \
+ \
+ in1 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in1); \
+ in2 = (v16u8) __msa_ilvr_b((v16i8) in2, (v16i8) in2); \
+ vec0_m = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in0); \
+ reg0_m = (v4i32) __msa_ilvr_h(zero_m, vec0_m); \
+ reg1_m = (v4i32) __msa_ilvl_h(zero_m, vec0_m); \
+ reg0_m *= vec_yg; \
+ reg1_m *= vec_yg; \
+ reg0_m = __msa_srai_w(reg0_m, 16); \
+ reg1_m = __msa_srai_w(reg1_m, 16); \
+ reg4_m = reg0_m + br; \
+ reg5_m = reg1_m + br; \
+ reg2_m = reg0_m + bg; \
+ reg3_m = reg1_m + bg; \
+ reg0_m += bb; \
+ reg1_m += bb; \
+ vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in1); \
+ reg6_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \
+ reg7_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \
+ vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in2); \
+ reg8_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \
+ reg9_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \
+ reg0_m -= reg6_m * ub; \
+ reg1_m -= reg7_m * ub; \
+ reg2_m -= reg6_m * ug; \
+ reg3_m -= reg7_m * ug; \
+ reg4_m -= reg8_m * vr; \
+ reg5_m -= reg9_m * vr; \
+ reg2_m -= reg8_m * vg; \
+ reg3_m -= reg9_m * vg; \
+ reg0_m = __msa_srai_w(reg0_m, 6); \
+ reg1_m = __msa_srai_w(reg1_m, 6); \
+ reg2_m = __msa_srai_w(reg2_m, 6); \
+ reg3_m = __msa_srai_w(reg3_m, 6); \
+ reg4_m = __msa_srai_w(reg4_m, 6); \
+ reg5_m = __msa_srai_w(reg5_m, 6); \
+ reg0_m = __msa_maxi_s_w(reg0_m, 0); \
+ reg1_m = __msa_maxi_s_w(reg1_m, 0); \
+ reg2_m = __msa_maxi_s_w(reg2_m, 0); \
+ reg3_m = __msa_maxi_s_w(reg3_m, 0); \
+ reg4_m = __msa_maxi_s_w(reg4_m, 0); \
+ reg5_m = __msa_maxi_s_w(reg5_m, 0); \
+ reg0_m = __msa_min_s_w(reg0_m, max_val_m); \
+ reg1_m = __msa_min_s_w(reg1_m, max_val_m); \
+ reg2_m = __msa_min_s_w(reg2_m, max_val_m); \
+ reg3_m = __msa_min_s_w(reg3_m, max_val_m); \
+ reg4_m = __msa_min_s_w(reg4_m, max_val_m); \
+ reg5_m = __msa_min_s_w(reg5_m, max_val_m); \
+ out0 = __msa_pckev_h((v8i16) reg1_m, (v8i16) reg0_m); \
+ out1 = __msa_pckev_h((v8i16) reg3_m, (v8i16) reg2_m); \
+ out2 = __msa_pckev_h((v8i16) reg5_m, (v8i16) reg4_m); \
+}
+
void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
@@ -101,6 +161,90 @@ void I422ToUYVYRow_MSA(const uint8* src_y,
}
}
+void I422ToARGBRow_MSA(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) {
+ int x;
+ int32 data_u, data_v;
+ int64 data_y;
+ v16u8 src0, src1, src2, dst0, dst1;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v16u8 const_255 = (v16u8) __msa_ldi_b(255);
+ v4i32 zero = { 0 };
+
+ vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
+ vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
+ vec_ug = __msa_fill_w(yuvconstants->kUVToG[0]);
+ vec_vg = __msa_fill_w(yuvconstants->kUVToG[1]);
+ vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
+ vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
+ vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
+ vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
+
+ for (x = 0; x < width; x += 8) {
+ data_y = LD(src_y);
fbarchard1 2016/10/24 17:36:32 Consider a macro to read I422, as there are many Y
+ data_u = LW(src_u);
+ data_v = LW(src_v);
+ src0 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_y);
+ src1 = (v16u8) __msa_insert_w(zero, 0, data_u);
+ src2 = (v16u8) __msa_insert_w(zero, 0, data_v);
fbarchard1 2016/10/24 17:36:32 No way to load 4 or 8 bytes directly into an MSA r
+ I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
+ vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
+ vec0 = (v8i16) __msa_ilvev_b((v16i8) vec1, (v16i8) vec0);
+ vec1 = (v8i16) __msa_ilvev_b((v16i8) const_255, (v16i8) vec2);
+ dst0 = (v16u8) __msa_ilvr_h((v8i16) vec1, (v8i16) vec0);
+ dst1 = (v16u8) __msa_ilvl_h((v8i16) vec1, (v8i16) vec0);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ rgb_buf += 32;
+ }
+}
+
+void I422ToRGBARow_MSA(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) {
+ int x;
+ int64 data_y;
+ int32 data_u, data_v;
+ v16u8 src0, src1, src2, dst0, dst1;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v16u8 const_255 = (v16u8) __msa_ldi_b(255);
+ v4i32 zero = { 0 };
+
+ vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
+ vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
+ vec_ug = __msa_fill_w(yuvconstants->kUVToG[0]);
+ vec_vg = __msa_fill_w(yuvconstants->kUVToG[1]);
+ vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
+ vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
+ vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
+ vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
+
+ for (x = 0; x < width; x += 8) {
+ data_y = LD(src_y);
+ data_u = LW(src_u);
+ data_v = LW(src_v);
+ src0 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_y);
+ src1 = (v16u8) __msa_insert_w(zero, 0, data_u);
+ src2 = (v16u8) __msa_insert_w(zero, 0, data_v);
+ I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
+ vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
+ vec0 = (v8i16) __msa_ilvev_b((v16i8) vec0, (v16i8) const_255);
+ vec1 = (v8i16) __msa_ilvev_b((v16i8) vec2, (v16i8) vec1);
+ dst0 = (v16u8) __msa_ilvr_h(vec1, vec0);
+ dst1 = (v16u8) __msa_ilvl_h(vec1, vec0);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ rgb_buf += 32;
+ }
+}
+
void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
« include/libyuv/macros_msa.h ('K') | « source/row_any.cc ('k') | source/scale_argb.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698