Chromium Code Reviews| Index: source/row_msa.cc |
| diff --git a/source/row_msa.cc b/source/row_msa.cc |
| index b747a77ec5cbc49a400931aec6001f6ce3061181..fde044d380276e2d51e0c8046849531009301450 100644 |
| --- a/source/row_msa.cc |
| +++ b/source/row_msa.cc |
| @@ -19,65 +19,79 @@ namespace libyuv { |
| extern "C" { |
| #endif |
| +// Load one 64-bit word of Y and one 32-bit word each of U and V, placing |
| +// each in element 0 of an otherwise zeroed vector (out_y, out_u, out_v). |
| +#define LOAD_I422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) { \ |
|
fbarchard1
2016/10/26 17:56:31
suggest READYUV422 as name, for consistency with r
|
| + v4i32 zero_m = { 0 }; \ |
| + uint64 y64_m = LD(psrc_y); \ |
| + uint32 u32_m = LW(psrc_u); \ |
| + uint32 v32_m = LW(psrc_v); \ |
| + out_y = (v16u8) __msa_insert_d((v2i64) zero_m, 0, (int64) y64_m); \ |
| + out_u = (v16u8) __msa_insert_w(zero_m, 0, (int32) u32_m); \ |
| + out_v = (v16u8) __msa_insert_w(zero_m, 0, (int32) v32_m); \ |
| +} |
| + |
| // Convert 8 pixels of YUV 420 to RGB. |
| -#define I422TORGB(in0, in1, in2, ub, vr, ug, vg, \ |
| - bb, bg, br, yg, out0, out1, out2) { \ |
| - v8i16 vec0_m; \ |
| - v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ |
| - v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \ |
| - v4i32 max_val_m = __msa_ldi_w(255); \ |
| - v8i16 zero_m = { 0 }; \ |
| - \ |
| - in1 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in1); \ |
| - in2 = (v16u8) __msa_ilvr_b((v16i8) in2, (v16i8) in2); \ |
| - vec0_m = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in0); \ |
| - reg0_m = (v4i32) __msa_ilvr_h(zero_m, vec0_m); \ |
| - reg1_m = (v4i32) __msa_ilvl_h(zero_m, vec0_m); \ |
| - reg0_m *= vec_yg; \ |
| - reg1_m *= vec_yg; \ |
| - reg0_m = __msa_srai_w(reg0_m, 16); \ |
| - reg1_m = __msa_srai_w(reg1_m, 16); \ |
| - reg4_m = reg0_m + br; \ |
| - reg5_m = reg1_m + br; \ |
| - reg2_m = reg0_m + bg; \ |
| - reg3_m = reg1_m + bg; \ |
| - reg0_m += bb; \ |
| - reg1_m += bb; \ |
| - vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in1); \ |
| - reg6_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \ |
| - reg7_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \ |
| - vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in2); \ |
| - reg8_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \ |
| - reg9_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \ |
| - reg0_m -= reg6_m * ub; \ |
| - reg1_m -= reg7_m * ub; \ |
| - reg2_m -= reg6_m * ug; \ |
| - reg3_m -= reg7_m * ug; \ |
| - reg4_m -= reg8_m * vr; \ |
| - reg5_m -= reg9_m * vr; \ |
| - reg2_m -= reg8_m * vg; \ |
| - reg3_m -= reg9_m * vg; \ |
| - reg0_m = __msa_srai_w(reg0_m, 6); \ |
| - reg1_m = __msa_srai_w(reg1_m, 6); \ |
| - reg2_m = __msa_srai_w(reg2_m, 6); \ |
| - reg3_m = __msa_srai_w(reg3_m, 6); \ |
| - reg4_m = __msa_srai_w(reg4_m, 6); \ |
| - reg5_m = __msa_srai_w(reg5_m, 6); \ |
| - reg0_m = __msa_maxi_s_w(reg0_m, 0); \ |
| - reg1_m = __msa_maxi_s_w(reg1_m, 0); \ |
| - reg2_m = __msa_maxi_s_w(reg2_m, 0); \ |
| - reg3_m = __msa_maxi_s_w(reg3_m, 0); \ |
| - reg4_m = __msa_maxi_s_w(reg4_m, 0); \ |
| - reg5_m = __msa_maxi_s_w(reg5_m, 0); \ |
| - reg0_m = __msa_min_s_w(reg0_m, max_val_m); \ |
| - reg1_m = __msa_min_s_w(reg1_m, max_val_m); \ |
| - reg2_m = __msa_min_s_w(reg2_m, max_val_m); \ |
| - reg3_m = __msa_min_s_w(reg3_m, max_val_m); \ |
| - reg4_m = __msa_min_s_w(reg4_m, max_val_m); \ |
| - reg5_m = __msa_min_s_w(reg5_m, max_val_m); \ |
| - out0 = __msa_pckev_h((v8i16) reg1_m, (v8i16) reg0_m); \ |
| - out1 = __msa_pckev_h((v8i16) reg3_m, (v8i16) reg2_m); \ |
| - out2 = __msa_pckev_h((v8i16) reg5_m, (v8i16) reg4_m); \ |
| +// in_y supplies 8 Y bytes in its low half; in_u and in_v supply 4 bytes each |
| +// and are overwritten with their byte-duplicated form. ub, vr, ug, vg, bb, |
| +// bg, br and yg are v4i32 coefficient / bias vectors. out_b, out_g and out_r |
| +// receive 8 halfwords each, clamped to 0..255. The macro reads only its own |
| +// arguments; it does not depend on any caller-scope variable names. |
| +#define I422TORGB(in_y, in_u, in_v, \ |
| + ub, vr, ug, vg, bb, bg, br, yg, \ |
| + out_b, out_g, out_r) { \ |
| + v8i16 vec0_m; \ |
| + v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ |
| + v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \ |
| + v4i32 max_val_m = __msa_ldi_w(255); \ |
| + v8i16 zero_m = { 0 }; \ |
| + \ |
| + in_u = (v16u8) __msa_ilvr_b((v16i8) in_u, (v16i8) in_u); \ |
| + in_v = (v16u8) __msa_ilvr_b((v16i8) in_v, (v16i8) in_v); \ |
| + vec0_m = (v8i16) __msa_ilvr_b((v16i8) in_y, (v16i8) in_y); \ |
| + reg0_m = (v4i32) __msa_ilvr_h(zero_m, vec0_m); \ |
| + reg1_m = (v4i32) __msa_ilvl_h(zero_m, vec0_m); \ |
| + reg0_m *= yg; \ |
| + reg1_m *= yg; \ |
| + reg0_m = __msa_srai_w(reg0_m, 16); \ |
| + reg1_m = __msa_srai_w(reg1_m, 16); \ |
| + reg4_m = reg0_m + br; \ |
| + reg5_m = reg1_m + br; \ |
| + reg2_m = reg0_m + bg; \ |
| + reg3_m = reg1_m + bg; \ |
| + reg0_m += bb; \ |
| + reg1_m += bb; \ |
| + vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in_u); \ |
| + reg6_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \ |
| + reg7_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \ |
| + vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in_v); \ |
| + reg8_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \ |
| + reg9_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \ |
| + reg0_m -= reg6_m * ub; \ |
| + reg1_m -= reg7_m * ub; \ |
| + reg2_m -= reg6_m * ug; \ |
| + reg3_m -= reg7_m * ug; \ |
| + reg4_m -= reg8_m * vr; \ |
| + reg5_m -= reg9_m * vr; \ |
| + reg2_m -= reg8_m * vg; \ |
| + reg3_m -= reg9_m * vg; \ |
| + reg0_m = __msa_srai_w(reg0_m, 6); \ |
| + reg1_m = __msa_srai_w(reg1_m, 6); \ |
| + reg2_m = __msa_srai_w(reg2_m, 6); \ |
| + reg3_m = __msa_srai_w(reg3_m, 6); \ |
| + reg4_m = __msa_srai_w(reg4_m, 6); \ |
| + reg5_m = __msa_srai_w(reg5_m, 6); \ |
| + reg0_m = __msa_maxi_s_w(reg0_m, 0); \ |
| + reg1_m = __msa_maxi_s_w(reg1_m, 0); \ |
| + reg2_m = __msa_maxi_s_w(reg2_m, 0); \ |
| + reg3_m = __msa_maxi_s_w(reg3_m, 0); \ |
| + reg4_m = __msa_maxi_s_w(reg4_m, 0); \ |
| + reg5_m = __msa_maxi_s_w(reg5_m, 0); \ |
| + reg0_m = __msa_min_s_w(reg0_m, max_val_m); \ |
| + reg1_m = __msa_min_s_w(reg1_m, max_val_m); \ |
| + reg2_m = __msa_min_s_w(reg2_m, max_val_m); \ |
| + reg3_m = __msa_min_s_w(reg3_m, max_val_m); \ |
| + reg4_m = __msa_min_s_w(reg4_m, max_val_m); \ |
| + reg5_m = __msa_min_s_w(reg5_m, max_val_m); \ |
| + out_b = __msa_pckev_h((v8i16) reg1_m, (v8i16) reg0_m); \ |
| + out_g = __msa_pckev_h((v8i16) reg3_m, (v8i16) reg2_m); \ |
| + out_r = __msa_pckev_h((v8i16) reg5_m, (v8i16) reg4_m); \ |
| } |
| void MirrorRow_MSA(const uint8* src, uint8* dst, int width) { |
| @@ -166,13 +180,10 @@ void I422ToARGBRow_MSA(const uint8* src_y, const uint8* src_u, |
| const uint8* src_v, uint8* rgb_buf, |
| const struct YuvConstants* yuvconstants, int width) { |
| int x; |
| - int32 data_u, data_v; |
| - int64 data_y; |
| v16u8 src0, src1, src2, dst0, dst1; |
| v8i16 vec0, vec1, vec2; |
| v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| v16u8 const_255 = (v16u8) __msa_ldi_b(255); |
| - v4i32 zero = { 0 }; |
| vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]); |
| vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]); |
| @@ -184,12 +195,7 @@ void I422ToARGBRow_MSA(const uint8* src_y, const uint8* src_u, |
| vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]); |
| for (x = 0; x < width; x += 8) { |
| - data_y = LD(src_y); |
| - data_u = LW(src_u); |
| - data_v = LW(src_v); |
| - src0 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_y); |
| - src1 = (v16u8) __msa_insert_w(zero, 0, data_u); |
| - src2 = (v16u8) __msa_insert_w(zero, 0, data_v); |
| + LOAD_I422(src_y, src_u, src_v, src0, src1, src2); |
| I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, |
| vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); |
| vec0 = (v8i16) __msa_ilvev_b((v16i8) vec1, (v16i8) vec0); |
| @@ -208,13 +214,10 @@ void I422ToRGBARow_MSA(const uint8* src_y, const uint8* src_u, |
| const uint8* src_v, uint8* rgb_buf, |
| const struct YuvConstants* yuvconstants, int width) { |
| int x; |
| - int64 data_y; |
| - int32 data_u, data_v; |
| v16u8 src0, src1, src2, dst0, dst1; |
| v8i16 vec0, vec1, vec2; |
| v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| v16u8 const_255 = (v16u8) __msa_ldi_b(255); |
| - v4i32 zero = { 0 }; |
| vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]); |
| vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]); |
| @@ -226,12 +229,7 @@ void I422ToRGBARow_MSA(const uint8* src_y, const uint8* src_u, |
| vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]); |
| for (x = 0; x < width; x += 8) { |
| - data_y = LD(src_y); |
| - data_u = LW(src_u); |
| - data_v = LW(src_v); |
| - src0 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_y); |
| - src1 = (v16u8) __msa_insert_w(zero, 0, data_u); |
| - src2 = (v16u8) __msa_insert_w(zero, 0, data_v); |
| + LOAD_I422(src_y, src_u, src_v, src0, src1, src2); |
| I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, |
| vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); |
| vec0 = (v8i16) __msa_ilvev_b((v16i8) vec0, (v16i8) const_255); |
| @@ -246,6 +244,100 @@ void I422ToRGBARow_MSA(const uint8* src_y, const uint8* src_u, |
| } |
| } |
| +// Converts a row of planar I422 (src_y, src_u, src_v) plus a separate alpha |
| +// plane (src_a) into interleaved B, G, R, A bytes in rgb_buf. Each loop |
| +// iteration consumes 8 Y, 4 U, 4 V and 8 A bytes and stores 32 output bytes. |
| +void I422AlphaToARGBRow_MSA(const uint8* src_y, const uint8* src_u, |
| + const uint8* src_v, const uint8* src_a, |
| + uint8* rgb_buf, |
| + const struct YuvConstants* yuvconstants, |
| + int width) { |
| + v16u8 y_in, u_in, v_in, a_in, out_lo, out_hi; |
| + v8i16 b_h, g_h, r_h, bg_h, ra_h; |
| + v4i32 zero = { 0 }; |
| + // Conversion coefficients and biases, replicated across all four lanes. |
| + v4i32 vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]); |
| + v4i32 vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]); |
| + v4i32 vec_ug = __msa_fill_w(yuvconstants->kUVToG[0]); |
| + v4i32 vec_vg = __msa_fill_w(yuvconstants->kUVToG[1]); |
| + v4i32 vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]); |
| + v4i32 vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]); |
| + v4i32 vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]); |
| + v4i32 vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]); |
| + |
| + for (int x = 0; x < width; x += 8) { |
| + int64 alpha8 = LD(src_a); |
| + LOAD_I422(src_y, src_u, src_v, y_in, u_in, v_in); |
| + a_in = (v16u8) __msa_insert_d((v2i64) zero, 0, alpha8); |
| + I422TORGB(y_in, u_in, v_in, vec_ub, vec_vr, vec_ug, vec_vg, |
| + vec_bb, vec_bg, vec_br, vec_yg, b_h, g_h, r_h); |
| + // Spread each alpha byte over a halfword to line up with b_h/g_h/r_h. |
| + a_in = (v16u8) __msa_ilvr_b((v16i8) a_in, (v16i8) a_in); |
|
fbarchard1
2016/10/26 17:56:31
consider a macro for STOREARGB since it will come
|
| + // Pair up B with G and R with A, then interleave the pairs per pixel. |
| + bg_h = (v8i16) __msa_ilvev_b((v16i8) g_h, (v16i8) b_h); |
| + ra_h = (v8i16) __msa_ilvev_b((v16i8) a_in, (v16i8) r_h); |
| + out_lo = (v16u8) __msa_ilvr_h((v8i16) ra_h, (v8i16) bg_h); |
| + out_hi = (v16u8) __msa_ilvl_h((v8i16) ra_h, (v8i16) bg_h); |
| + ST_UB2(out_lo, out_hi, rgb_buf, 16); |
| + src_y += 8; |
| + src_u += 4; |
| + src_v += 4; |
| + src_a += 8; |
| + rgb_buf += 32; |
| + } |
| +} |
| + |
| +// Converts a row of planar I422 (src_y, src_u, src_v) into packed 3-byte |
| +// B, G, R pixels in rgb_buf. Each loop iteration consumes 16 Y, 8 U and 8 V |
| +// bytes and stores 48 output bytes. |
| +void I422ToRGB24Row_MSA(const uint8* src_y, const uint8* src_u, |
| + const uint8* src_v, uint8* rgb_buf, |
| + const struct YuvConstants* yuvconstants, int32 width) { |
| + v16u8 y_lo, u_lo, v_lo, y_hi, u_hi, v_hi; |
| + v8i16 b_lo, g_lo, r_lo, b_hi, g_hi, r_hi; |
| + v16u8 bg_lo, bg_mid, bg_hi, r_all, out0, out1, out2; |
| + v2i64 zero = { 0 }; |
| + // Byte selectors for __msa_vshf_b: 0-15 pick from the B/G vector passed as |
| + // the last operand, 16-31 pick from r_all. |
| + v16i8 shuf_lo = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10 }; |
| + v16i8 shuf_mid = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10 }; |
| + v16i8 shuf_hi = |
| + { 26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31 }; |
|
fbarchard1
2016/10/26 17:56:31
indent should be 4 from line above.
|
| + |
| + // Conversion coefficients and biases, replicated across all four lanes. |
| + v4i32 vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]); |
| + v4i32 vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]); |
| + v4i32 vec_ug = __msa_fill_w(yuvconstants->kUVToG[0]); |
| + v4i32 vec_vg = __msa_fill_w(yuvconstants->kUVToG[1]); |
| + v4i32 vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]); |
| + v4i32 vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]); |
| + v4i32 vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]); |
| + v4i32 vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]); |
| + |
| + for (int x = 0; x < width; x += 16) { |
| + int64 u_bits = LD(src_u); |
| + int64 v_bits = LD(src_v); |
| + y_lo = (v16u8) __msa_ld_b((v16u8*) src_y, 0); |
| + u_lo = (v16u8) __msa_insert_d(zero, 0, u_bits); |
| + v_lo = (v16u8) __msa_insert_d(zero, 0, v_bits); |
| + // Rotate so the second group of 8 pixels sits in the low bytes. This must |
| + // happen before I422TORGB, which overwrites its U and V arguments. |
| + y_hi = (v16u8) __msa_sldi_b((v16i8) y_lo, (v16i8) y_lo, 8); |
| + u_hi = (v16u8) __msa_sldi_b((v16i8) u_lo, (v16i8) u_lo, 4); |
| + v_hi = (v16u8) __msa_sldi_b((v16i8) v_lo, (v16i8) v_lo, 4); |
| + I422TORGB(y_lo, u_lo, v_lo, vec_ub, vec_vr, vec_ug, vec_vg, |
| + vec_bb, vec_bg, vec_br, vec_yg, b_lo, g_lo, r_lo); |
| + I422TORGB(y_hi, u_hi, v_hi, vec_ub, vec_vr, vec_ug, vec_vg, |
| + vec_bb, vec_bg, vec_br, vec_yg, b_hi, g_hi, r_hi); |
| + // B/G byte pairs for each group of 8 pixels, and all 16 R bytes together. |
| + bg_lo = (v16u8) __msa_ilvev_b((v16i8) g_lo, (v16i8) b_lo); |
| + bg_hi = (v16u8) __msa_ilvev_b((v16i8) g_hi, (v16i8) b_hi); |
| + r_all = (v16u8) __msa_pckev_b((v16i8) r_hi, (v16i8) r_lo); |
| + // B/G bytes straddling the two groups, needed by the middle store. |
| + bg_mid = (v16u8) __msa_sldi_b((v16i8) bg_hi, (v16i8) bg_lo, 11); |
| + out0 = (v16u8) __msa_vshf_b(shuf_lo, (v16i8) r_all, (v16i8) bg_lo); |
| + out1 = (v16u8) __msa_vshf_b(shuf_mid, (v16i8) r_all, (v16i8) bg_mid); |
| + out2 = (v16u8) __msa_vshf_b(shuf_hi, (v16i8) r_all, (v16i8) bg_hi); |
| + ST_UB2(out0, out1, rgb_buf, 16); |
| + ST_UB(out2, (rgb_buf + 32)); |
| + src_y += 16; |
| + src_u += 8; |
| + src_v += 8; |
| + rgb_buf += 48; |
| + } |
| +} |
| + |
| void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) { |
| int x; |
| v16u8 src0, src1, src2, src3, dst0, dst1; |