Index: source/row_msa.cc
diff --git a/source/row_msa.cc b/source/row_msa.cc
index 1e174fd66f4a5de7770dad0fb1ab19b3e69a1e57..19fdc3b5047021e45c8ef007842683aa03717672 100644
--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -2023,6 +2023,359 @@ void RAWToUVRow_MSA(const uint8* src_rgb0,
   }
 }
+void NV12ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v4i32 vec_ub, vec_vr, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg;
+  v16u8 zero = {0};
+  v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+  v4i32 max = __msa_ldi_w(255);
+
+  vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
+  vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg =
+      __msa_fill_w((yuvconstants->kUVToG[0]) | (yuvconstants->kUVToG[1] << 16));
+  vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
+  vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
+  vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
+  vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    reg2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+    reg3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
fbarchard1 (2017/01/13 18:46:01):
consider macros to read nv12, do yuvtorgb, and wri
manojkumar.bhosale (2017/01/17 10:29:45):
Done.
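For illustration, helper macros along the lines suggested above might look like the sketch below. The names (READNV12, STOREARGB4) and the exact factoring are assumptions for this sketch, not necessarily the shape the follow-up revision took; libyuv's LD and ST_UB macros from macros_msa.h are assumed to be in scope.

/* Sketch: load 8 Y bytes and 8 interleaved UV bytes into the low
 * 64 bits of two vectors, zeroing the upper halves. */
#define READNV12(psrc_y, psrc_uv, out_y, out_uv)                \
  {                                                             \
    uint64 val_y_m = LD(psrc_y);                                \
    uint64 val_uv_m = LD(psrc_uv);                              \
    v16u8 zero_m = {0};                                         \
    out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, val_y_m);   \
    out_uv = (v16u8)__msa_insert_d((v2i64)zero_m, 0, val_uv_m); \
  }

/* Sketch: store four ARGB pixels. in_b/in_g/in_r hold one clamped
 * channel value per 32-bit lane; in_a holds 0xFF in every byte. The
 * even-byte and even-halfword interleaves yield B,G,R,A byte order
 * in memory. */
#define STOREARGB4(in_b, in_g, in_r, in_a, pdst)                       \
  {                                                                    \
    v8i16 vec0_m = (v8i16)__msa_ilvev_b((v16i8)(in_g), (v16i8)(in_b)); \
    v8i16 vec1_m = (v8i16)__msa_ilvev_b((v16i8)(in_a), (v16i8)(in_r)); \
    v16u8 dst_m = (v16u8)__msa_ilvev_h(vec1_m, vec0_m);                \
    ST_UB(dst_m, pdst);                                                \
  }

With helpers like these, each loop iteration reduces to one READNV12, the YUV-to-RGB arithmetic, and two STOREARGB4 stores covering 8 pixels.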
+    reg0 *= vec_yg;
+    reg1 *= vec_yg;
+    reg2 *= vec_ubvr;
+    reg3 *= vec_ubvr;
+    reg0 = __msa_srai_w(reg0, 16);
+    reg1 = __msa_srai_w(reg1, 16);
+    reg4 = __msa_dotp_s_w((v8i16)vec1, (v8i16)vec_ugvg);
+    reg5 = __msa_ilvev_w(reg2, reg2);
+    reg6 = __msa_ilvev_w(reg3, reg3);
+    reg7 = __msa_ilvr_w(reg4, reg4);
+    reg2 = __msa_ilvod_w(reg2, reg2);
+    reg3 = __msa_ilvod_w(reg3, reg3);
+    reg4 = __msa_ilvl_w(reg4, reg4);
+    reg5 = reg0 - reg5;
+    reg6 = reg1 - reg6;
+    reg2 = reg0 - reg2;
+    reg3 = reg1 - reg3;
+    reg7 = reg0 - reg7;
+    reg4 = reg1 - reg4;
+    reg5 += vec_bb;
+    reg6 += vec_bb;
+    reg7 += vec_bg;
+    reg4 += vec_bg;
+    reg2 += vec_br;
+    reg3 += vec_br;
+    reg5 = __msa_srai_w(reg5, 6);
+    reg6 = __msa_srai_w(reg6, 6);
+    reg7 = __msa_srai_w(reg7, 6);
+    reg4 = __msa_srai_w(reg4, 6);
+    reg2 = __msa_srai_w(reg2, 6);
+    reg3 = __msa_srai_w(reg3, 6);
+    reg5 = __msa_maxi_s_w(reg5, 0);
fbarchard1 (2017/01/13 18:46:01):
no free way to pack words to bytes with saturation
manojkumar.bhosale (2017/01/17 10:29:45):
No
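As the thread confirms, MSA offers no single-instruction pack from 32-bit words to bytes with unsigned saturation, so these rows clamp each lane to [0, 255] explicitly before the even-byte interleaves pick out the low bytes. A minimal standalone sketch of that clamp idiom, assuming only <msa.h> and a compiler with MSA enabled (-mmsa):

#include <msa.h>

/* Clamp four signed 32-bit lanes to [0, 255]; afterwards the low byte
 * of each lane holds the saturated result and can be gathered with
 * __msa_ilvev_b as done above. */
static inline v4i32 clamp_u8_w(v4i32 v) {
  v4i32 max255 = __msa_ldi_w(255);
  v = __msa_maxi_s_w(v, 0);     /* lower bound: max(v, 0) */
  v = __msa_min_s_w(max255, v); /* upper bound: min(v, 255) */
  return v;
}

The clamp costs two extra vector operations per register (six registers per 8 pixels here), which is the overhead the "no free way" remark refers to.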
+    reg6 = __msa_maxi_s_w(reg6, 0);
+    reg7 = __msa_maxi_s_w(reg7, 0);
+    reg4 = __msa_maxi_s_w(reg4, 0);
+    reg2 = __msa_maxi_s_w(reg2, 0);
+    reg3 = __msa_maxi_s_w(reg3, 0);
+    reg5 = __msa_min_s_w(max, reg5);
+    reg6 = __msa_min_s_w(max, reg6);
+    reg7 = __msa_min_s_w(max, reg7);
+    reg4 = __msa_min_s_w(max, reg4);
+    reg2 = __msa_min_s_w(max, reg2);
+    reg3 = __msa_min_s_w(max, reg3);
+    vec0 = (v8u16)__msa_ilvev_b((v16i8)reg7, (v16i8)reg5);
+    vec1 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg2);
+    vec2 = (v8u16)__msa_ilvev_b((v16i8)reg4, (v16i8)reg6);
+    vec3 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg3);
+    dst0 = (v16u8)__msa_ilvev_h((v8i16)vec1, (v8i16)vec0);
+    dst1 = (v16u8)__msa_ilvev_h((v8i16)vec3, (v8i16)vec2);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_uv += 8;
+    rgb_buf += 32;
+  }
+}
+
+void NV12ToRGB565Row_MSA(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, dst0;
+  v8u16 vec0, vec1, vec2;
+  v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v4i32 vec_ub, vec_vr, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg;
+  v16u8 zero = {0};
+  v4i32 max = __msa_ldi_w(255);
+
+  vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
+  vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg =
+      __msa_fill_w((yuvconstants->kUVToG[0]) | (yuvconstants->kUVToG[1] << 16));
+  vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
+  vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
+  vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
+  vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    reg2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+    reg3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+    reg0 *= vec_yg;
+    reg1 *= vec_yg;
+    reg2 *= vec_ubvr;
+    reg3 *= vec_ubvr;
+    reg0 = __msa_srai_w(reg0, 16);
+    reg1 = __msa_srai_w(reg1, 16);
+    reg4 = __msa_dotp_s_w((v8i16)vec1, (v8i16)vec_ugvg);
+    reg5 = __msa_ilvev_w(reg2, reg2);
+    reg6 = __msa_ilvev_w(reg3, reg3);
+    reg7 = __msa_ilvr_w(reg4, reg4);
+    reg2 = __msa_ilvod_w(reg2, reg2);
+    reg3 = __msa_ilvod_w(reg3, reg3);
+    reg4 = __msa_ilvl_w(reg4, reg4);
+    reg5 = reg0 - reg5;
+    reg6 = reg1 - reg6;
+    reg2 = reg0 - reg2;
+    reg3 = reg1 - reg3;
+    reg7 = reg0 - reg7;
+    reg4 = reg1 - reg4;
+    reg5 += vec_bb;
+    reg6 += vec_bb;
+    reg7 += vec_bg;
+    reg4 += vec_bg;
+    reg2 += vec_br;
+    reg3 += vec_br;
+    reg5 = __msa_srai_w(reg5, 6);
+    reg6 = __msa_srai_w(reg6, 6);
+    reg7 = __msa_srai_w(reg7, 6);
+    reg4 = __msa_srai_w(reg4, 6);
+    reg2 = __msa_srai_w(reg2, 6);
+    reg3 = __msa_srai_w(reg3, 6);
+    reg5 = __msa_maxi_s_w(reg5, 0);
+    reg6 = __msa_maxi_s_w(reg6, 0);
+    reg7 = __msa_maxi_s_w(reg7, 0);
+    reg4 = __msa_maxi_s_w(reg4, 0);
+    reg2 = __msa_maxi_s_w(reg2, 0);
+    reg3 = __msa_maxi_s_w(reg3, 0);
+    reg5 = __msa_min_s_w(max, reg5);
+    reg6 = __msa_min_s_w(max, reg6);
+    reg7 = __msa_min_s_w(max, reg7);
+    reg4 = __msa_min_s_w(max, reg4);
+    reg2 = __msa_min_s_w(max, reg2);
+    reg3 = __msa_min_s_w(max, reg3);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg6, (v8i16)reg5);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg4, (v8i16)reg7);
+    vec2 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec0 = vec0 >> 3;
+    vec1 = (vec1 >> 2) << 5;
+    vec2 = (vec2 >> 3) << 11;
+    dst0 = (v16u8)(vec0 | vec1 | vec2);
+    ST_UB(dst0, rgb_buf);
+    src_y += 8;
+    src_uv += 8;
+    rgb_buf += 16;
+  }
+}
+
+void NV21ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_vu,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v4i32 vec_ub, vec_vr, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg;
+  v16u8 zero = {0};
+  v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+  v4i32 max = __msa_ldi_w(255);
+
+  vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
+  vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
+  vec_ubvr = __msa_ilvr_w(vec_ub, vec_vr);
+  vec_ugvg =
+      __msa_fill_w((yuvconstants->kUVToG[1] | (yuvconstants->kUVToG[0] << 16)));
+  vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
+  vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
+  vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
+  vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_vu);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    reg2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+    reg3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+    reg0 *= vec_yg;
+    reg1 *= vec_yg;
+    reg2 *= vec_ubvr;
+    reg3 *= vec_ubvr;
+    reg0 = __msa_srai_w(reg0, 16);
+    reg1 = __msa_srai_w(reg1, 16);
+    reg4 = __msa_dotp_s_w((v8i16)vec1, (v8i16)vec_ugvg);
+    reg5 = __msa_ilvod_w(reg2, reg2);
+    reg6 = __msa_ilvod_w(reg3, reg3);
+    reg7 = __msa_ilvr_w(reg4, reg4);
+    reg2 = __msa_ilvev_w(reg2, reg2);
+    reg3 = __msa_ilvev_w(reg3, reg3);
+    reg4 = __msa_ilvl_w(reg4, reg4);
+    reg5 = reg0 - reg5;
+    reg6 = reg1 - reg6;
+    reg2 = reg0 - reg2;
+    reg3 = reg1 - reg3;
+    reg7 = reg0 - reg7;
+    reg4 = reg1 - reg4;
+    reg5 += vec_bb;
+    reg6 += vec_bb;
+    reg7 += vec_bg;
+    reg4 += vec_bg;
+    reg2 += vec_br;
+    reg3 += vec_br;
+    reg5 = __msa_srai_w(reg5, 6);
+    reg6 = __msa_srai_w(reg6, 6);
+    reg7 = __msa_srai_w(reg7, 6);
+    reg4 = __msa_srai_w(reg4, 6);
+    reg2 = __msa_srai_w(reg2, 6);
+    reg3 = __msa_srai_w(reg3, 6);
+    reg5 = __msa_maxi_s_w(reg5, 0);
+    reg6 = __msa_maxi_s_w(reg6, 0);
+    reg7 = __msa_maxi_s_w(reg7, 0);
+    reg4 = __msa_maxi_s_w(reg4, 0);
+    reg2 = __msa_maxi_s_w(reg2, 0);
+    reg3 = __msa_maxi_s_w(reg3, 0);
+    reg5 = __msa_min_s_w(max, reg5);
+    reg6 = __msa_min_s_w(max, reg6);
+    reg7 = __msa_min_s_w(max, reg7);
+    reg4 = __msa_min_s_w(max, reg4);
+    reg2 = __msa_min_s_w(max, reg2);
+    reg3 = __msa_min_s_w(max, reg3);
+    vec0 = (v8u16)__msa_ilvev_b((v16i8)reg7, (v16i8)reg5);
+    vec1 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg2);
+    vec2 = (v8u16)__msa_ilvev_b((v16i8)reg4, (v16i8)reg6);
+    vec3 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg3);
+    dst0 = (v16u8)__msa_ilvev_h((v8i16)vec1, (v8i16)vec0);
+    dst1 = (v16u8)__msa_ilvev_h((v8i16)vec3, (v8i16)vec2);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_vu += 8;
+    rgb_buf += 32;
+  }
+}
+
+void SobelRow_MSA(const uint8* src_sobelx,
+                  const uint8* src_sobely,
+                  uint8* dst_argb,
+                  int width) {
+  int x;
+  v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
+  v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
+  v16i8 const_0x4 = __msa_ldi_b(0x4);
+  v16i8 mask1 = mask0 + const_0x4;
+  v16i8 mask2 = mask1 + const_0x4;
+  v16i8 mask3 = mask2 + const_0x4;
+  v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)const_0xFF, (v16i8)vec0);
+    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)const_0xFF, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)const_0xFF, (v16i8)vec0);
+    dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)const_0xFF, (v16i8)vec0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
+void SobelToPlaneRow_MSA(const uint8* src_sobelx,
+                         const uint8* src_sobely,
+                         uint8* dst_y,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_sobelx += 32;
+    src_sobely += 32;
+    dst_y += 32;
+  }
+}
+
+void SobelXYRow_MSA(const uint8* src_sobelx,
+                    const uint8* src_sobely,
+                    uint8* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, vec2;
+  v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
+  v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
+    vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
+    reg0 = (v16u8)__msa_ilvr_b((v16i8)const_0xFF, (v16i8)vec0);
+    reg1 = (v16u8)__msa_ilvl_b((v16i8)const_0xFF, (v16i8)vec0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv