Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Unified Diff: source/row_msa.cc

Issue 2636483002: Add MSA optimized NV12/21 To RGB row functions (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « source/row_any.cc ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: source/row_msa.cc
diff --git a/source/row_msa.cc b/source/row_msa.cc
index 1e174fd66f4a5de7770dad0fb1ab19b3e69a1e57..19fdc3b5047021e45c8ef007842683aa03717672 100644
--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -2023,6 +2023,359 @@ void RAWToUVRow_MSA(const uint8* src_rgb0,
}
}
+// NV12ToARGBRow_MSA: converts 8 pixels per loop iteration from NV12
+// (planar Y plus interleaved U,V pairs, one pair shared by 2 pixels) to
+// 32-bit ARGB using MIPS MSA vector intrinsics.  Alpha is forced to 255.
+// NOTE(review): assumes width is a multiple of 8 and rgb_buf holds
+// width * 4 bytes -- confirm callers (row_any.cc wrappers) guarantee this.
+void NV12ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint64 val0, val1;
+ v16u8 src0, src1, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3;
+ v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v4i32 vec_ub, vec_vr, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg;
+ v16u8 zero = {0};
+ v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+ v4i32 max = __msa_ldi_w(255);
+
+ // Broadcast conversion constants.  UB and VR are interleaved into one
+ // register so a single multiply produces both U-to-B and V-to-R products;
+ // UG and VG are packed into one 32-bit word so a signed halfword dot
+ // product over each (U,V) pair yields the green chroma term directly.
+ vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
+ vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg =
+ __msa_fill_w((yuvconstants->kUVToG[0]) | (yuvconstants->kUVToG[1] << 16));
+ vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
+ vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
+ vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
+ vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
+
+ for (x = 0; x < width; x += 8) {
+ // Load 8 Y bytes and 8 UV bytes (4 U,V pairs) into the low doubleword.
+ val0 = LD(src_y);
+ val1 = LD(src_uv);
+ src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+ src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+ // Duplicate each Y byte (forming Y | Y << 8, i.e. Y * 0x0101) and
+ // zero-extend the UV bytes; then widen both to 32-bit lanes.
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+ reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+ reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+ reg2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+ reg3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
fbarchard1 2017/01/13 18:46:01 consider macros to read nv12, do yuvtorgb, and wri
manojkumar.bhosale 2017/01/17 10:29:45 Done.
+ // Luma term: (Y * 0x0101 * YG) >> 16.  The UB/VR multiply leaves
+ // U*UB in even lanes and V*VR in odd lanes of reg2/reg3.
+ reg0 *= vec_yg;
+ reg1 *= vec_yg;
+ reg2 *= vec_ubvr;
+ reg3 *= vec_ubvr;
+ reg0 = __msa_srai_w(reg0, 16);
+ reg1 = __msa_srai_w(reg1, 16);
+ // Green chroma term: U*UG + V*VG per pair, via one dot product.
+ reg4 = __msa_dotp_s_w((v8i16)vec1, (v8i16)vec_ugvg);
+ // Duplicate each pair's chroma products so both pixels sharing a
+ // U,V sample receive the same B (reg5/reg6), G (reg7/reg4) and
+ // R (reg2/reg3) contributions.
+ reg5 = __msa_ilvev_w(reg2, reg2);
+ reg6 = __msa_ilvev_w(reg3, reg3);
+ reg7 = __msa_ilvr_w(reg4, reg4);
+ reg2 = __msa_ilvod_w(reg2, reg2);
+ reg3 = __msa_ilvod_w(reg3, reg3);
+ reg4 = __msa_ilvl_w(reg4, reg4);
+ // channel = y - chroma + bias, in 6-bit fixed point; shift down to
+ // integer pixel values afterwards.
+ reg5 = reg0 - reg5;
+ reg6 = reg1 - reg6;
+ reg2 = reg0 - reg2;
+ reg3 = reg1 - reg3;
+ reg7 = reg0 - reg7;
+ reg4 = reg1 - reg4;
+ reg5 += vec_bb;
+ reg6 += vec_bb;
+ reg7 += vec_bg;
+ reg4 += vec_bg;
+ reg2 += vec_br;
+ reg3 += vec_br;
+ reg5 = __msa_srai_w(reg5, 6);
+ reg6 = __msa_srai_w(reg6, 6);
+ reg7 = __msa_srai_w(reg7, 6);
+ reg4 = __msa_srai_w(reg4, 6);
+ reg2 = __msa_srai_w(reg2, 6);
+ reg3 = __msa_srai_w(reg3, 6);
+ // Clamp each channel to [0, 255] (no free word-to-byte saturating
+ // pack on MSA -- see review thread below).
+ reg5 = __msa_maxi_s_w(reg5, 0);
fbarchard1 2017/01/13 18:46:01 no free way to pack words to bytes with saturation
manojkumar.bhosale 2017/01/17 10:29:45 No
+ reg6 = __msa_maxi_s_w(reg6, 0);
+ reg7 = __msa_maxi_s_w(reg7, 0);
+ reg4 = __msa_maxi_s_w(reg4, 0);
+ reg2 = __msa_maxi_s_w(reg2, 0);
+ reg3 = __msa_maxi_s_w(reg3, 0);
+ reg5 = __msa_min_s_w(max, reg5);
+ reg6 = __msa_min_s_w(max, reg6);
+ reg7 = __msa_min_s_w(max, reg7);
+ reg4 = __msa_min_s_w(max, reg4);
+ reg2 = __msa_min_s_w(max, reg2);
+ reg3 = __msa_min_s_w(max, reg3);
+ // Pack the low byte of each word lane into B,G then R,A pairs, then
+ // interleave halfwords to emit B,G,R,A per pixel (32 bytes total).
+ vec0 = (v8u16)__msa_ilvev_b((v16i8)reg7, (v16i8)reg5);
+ vec1 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg2);
+ vec2 = (v8u16)__msa_ilvev_b((v16i8)reg4, (v16i8)reg6);
+ vec3 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg3);
+ dst0 = (v16u8)__msa_ilvev_h((v8i16)vec1, (v8i16)vec0);
+ dst1 = (v16u8)__msa_ilvev_h((v8i16)vec3, (v8i16)vec2);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ src_y += 8;
+ src_uv += 8;
+ rgb_buf += 32;
+ }
+}
+
+// NV12ToRGB565Row_MSA: converts 8 NV12 pixels per loop iteration to
+// 16-bit RGB565 using MIPS MSA intrinsics.  Identical YUV-to-RGB math to
+// NV12ToARGBRow_MSA; only the final packing differs (no alpha channel).
+// NOTE(review): assumes width is a multiple of 8 -- confirm callers.
+void NV12ToRGB565Row_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint64 val0, val1;
+ v16u8 src0, src1, dst0;
+ v8u16 vec0, vec1, vec2;
+ v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v4i32 vec_ub, vec_vr, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg;
+ v16u8 zero = {0};
+ v4i32 max = __msa_ldi_w(255);
+
+ // Broadcast constants: interleaved UB/VR for one combined multiply,
+ // packed UG|VG for the green dot product, plus per-channel biases.
+ vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
+ vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg =
+ __msa_fill_w((yuvconstants->kUVToG[0]) | (yuvconstants->kUVToG[1] << 16));
+ vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
+ vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
+ vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
+ vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
+
+ for (x = 0; x < width; x += 8) {
+ // Load 8 Y bytes and 4 interleaved U,V pairs; widen to 32-bit lanes.
+ val0 = LD(src_y);
+ val1 = LD(src_uv);
+ src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+ src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+ reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+ reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+ reg2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+ reg3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+ // Luma: (Y * 0x0101 * YG) >> 16; chroma products per U,V pair.
+ reg0 *= vec_yg;
+ reg1 *= vec_yg;
+ reg2 *= vec_ubvr;
+ reg3 *= vec_ubvr;
+ reg0 = __msa_srai_w(reg0, 16);
+ reg1 = __msa_srai_w(reg1, 16);
+ reg4 = __msa_dotp_s_w((v8i16)vec1, (v8i16)vec_ugvg);
+ // Duplicate each pair's B/G/R chroma terms for both sharing pixels.
+ reg5 = __msa_ilvev_w(reg2, reg2);
+ reg6 = __msa_ilvev_w(reg3, reg3);
+ reg7 = __msa_ilvr_w(reg4, reg4);
+ reg2 = __msa_ilvod_w(reg2, reg2);
+ reg3 = __msa_ilvod_w(reg3, reg3);
+ reg4 = __msa_ilvl_w(reg4, reg4);
+ // channel = y - chroma + bias, 6-bit fixed point, shifted to pixels.
+ reg5 = reg0 - reg5;
+ reg6 = reg1 - reg6;
+ reg2 = reg0 - reg2;
+ reg3 = reg1 - reg3;
+ reg7 = reg0 - reg7;
+ reg4 = reg1 - reg4;
+ reg5 += vec_bb;
+ reg6 += vec_bb;
+ reg7 += vec_bg;
+ reg4 += vec_bg;
+ reg2 += vec_br;
+ reg3 += vec_br;
+ reg5 = __msa_srai_w(reg5, 6);
+ reg6 = __msa_srai_w(reg6, 6);
+ reg7 = __msa_srai_w(reg7, 6);
+ reg4 = __msa_srai_w(reg4, 6);
+ reg2 = __msa_srai_w(reg2, 6);
+ reg3 = __msa_srai_w(reg3, 6);
+ // Clamp all channels to [0, 255].
+ reg5 = __msa_maxi_s_w(reg5, 0);
+ reg6 = __msa_maxi_s_w(reg6, 0);
+ reg7 = __msa_maxi_s_w(reg7, 0);
+ reg4 = __msa_maxi_s_w(reg4, 0);
+ reg2 = __msa_maxi_s_w(reg2, 0);
+ reg3 = __msa_maxi_s_w(reg3, 0);
+ // Narrow to halfwords, then build RGB565: 5 bits blue (bits 0-4),
+ // 6 bits green (bits 5-10), 5 bits red (bits 11-15).
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg6, (v8i16)reg5);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg4, (v8i16)reg7);
+ vec2 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec0 = vec0 >> 3;
+ vec1 = (vec1 >> 2) << 5;
+ vec2 = (vec2 >> 3) << 11;
+ dst0 = (v16u8)(vec0 | vec1 | vec2);
+ ST_UB(dst0, rgb_buf);
+ src_y += 8;
+ src_uv += 8;
+ rgb_buf += 16;
+ }
+}
+
+// NV21ToARGBRow_MSA: converts 8 NV21 pixels (planar Y plus interleaved
+// V,U pairs -- chroma order swapped relative to NV12) per loop iteration
+// to 32-bit ARGB with MIPS MSA intrinsics.  Alpha is forced to 255.
+// Mirrors NV12ToARGBRow_MSA, but the constant packing and the even/odd
+// lane selections are swapped to account for V preceding U in memory.
+void NV21ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint64 val0, val1;
+ v16u8 src0, src1, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3;
+ v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v4i32 vec_ub, vec_vr, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg;
+ v16u8 zero = {0};
+ v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+ v4i32 max = __msa_ldi_w(255);
+
+ // Interleave VR before UB and pack VG|UG (reverse of the NV12 variant)
+ // so the multiplies line up with the V,U byte order of src_vu.
+ vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
+ vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
+ vec_ubvr = __msa_ilvr_w(vec_ub, vec_vr);
+ vec_ugvg =
+ __msa_fill_w((yuvconstants->kUVToG[1] | (yuvconstants->kUVToG[0] << 16)));
+ vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
+ vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
+ vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
+ vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
+
+ for (x = 0; x < width; x += 8) {
+ // Load 8 Y bytes and 4 interleaved V,U pairs; widen to 32-bit lanes.
+ val0 = LD(src_y);
+ val1 = LD(src_vu);
+ src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+ src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+ reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+ reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+ reg2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+ reg3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+ // Luma: (Y * 0x0101 * YG) >> 16.  The VR/UB multiply leaves V*VR in
+ // even lanes and U*UB in odd lanes (opposite of the NV12 variant).
+ reg0 *= vec_yg;
+ reg1 *= vec_yg;
+ reg2 *= vec_ubvr;
+ reg3 *= vec_ubvr;
+ reg0 = __msa_srai_w(reg0, 16);
+ reg1 = __msa_srai_w(reg1, 16);
+ reg4 = __msa_dotp_s_w((v8i16)vec1, (v8i16)vec_ugvg);
+ // ilvod here picks the U*UB (odd) lanes for blue; ilvev picks the
+ // V*VR (even) lanes for red -- the swap that matches V,U ordering.
+ reg5 = __msa_ilvod_w(reg2, reg2);
+ reg6 = __msa_ilvod_w(reg3, reg3);
+ reg7 = __msa_ilvr_w(reg4, reg4);
+ reg2 = __msa_ilvev_w(reg2, reg2);
+ reg3 = __msa_ilvev_w(reg3, reg3);
+ reg4 = __msa_ilvl_w(reg4, reg4);
+ // channel = y - chroma + bias, 6-bit fixed point, shifted to pixels.
+ reg5 = reg0 - reg5;
+ reg6 = reg1 - reg6;
+ reg2 = reg0 - reg2;
+ reg3 = reg1 - reg3;
+ reg7 = reg0 - reg7;
+ reg4 = reg1 - reg4;
+ reg5 += vec_bb;
+ reg6 += vec_bb;
+ reg7 += vec_bg;
+ reg4 += vec_bg;
+ reg2 += vec_br;
+ reg3 += vec_br;
+ reg5 = __msa_srai_w(reg5, 6);
+ reg6 = __msa_srai_w(reg6, 6);
+ reg7 = __msa_srai_w(reg7, 6);
+ reg4 = __msa_srai_w(reg4, 6);
+ reg2 = __msa_srai_w(reg2, 6);
+ reg3 = __msa_srai_w(reg3, 6);
+ // Clamp all channels to [0, 255].
+ reg5 = __msa_maxi_s_w(reg5, 0);
+ reg6 = __msa_maxi_s_w(reg6, 0);
+ reg7 = __msa_maxi_s_w(reg7, 0);
+ reg4 = __msa_maxi_s_w(reg4, 0);
+ reg2 = __msa_maxi_s_w(reg2, 0);
+ reg3 = __msa_maxi_s_w(reg3, 0);
+ // Pack low bytes into B,G and R,A pairs, then interleave halfwords to
+ // emit B,G,R,A per pixel (32 bytes per iteration).
+ vec0 = (v8u16)__msa_ilvev_b((v16i8)reg7, (v16i8)reg5);
+ vec1 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg2);
+ vec2 = (v8u16)__msa_ilvev_b((v16i8)reg4, (v16i8)reg6);
+ vec3 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg3);
+ dst0 = (v16u8)__msa_ilvev_h((v8i16)vec1, (v8i16)vec0);
+ dst1 = (v16u8)__msa_ilvev_h((v8i16)vec3, (v8i16)vec2);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ src_y += 8;
+ src_vu += 8;
+ rgb_buf += 32;
+ }
+}
+
+// SobelRow_MSA: combines X and Y sobel planes into grayscale ARGB,
+// 16 pixels per loop iteration.  Each output pixel replicates the
+// saturated sum sobelx + sobely into three channels with 0xFF alpha.
+// NOTE(review): assumes width is a multiple of 16 -- confirm callers.
+void SobelRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
+ // Shuffle masks: indices 0-15 select bytes of the summed vector;
+ // index 16 selects the first byte of the 0xFF constant vector, so each
+ // group of four mask entries {n, n, n, 16} emits one gray ARGB pixel.
+ v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
+ v16i8 const_0x4 = __msa_ldi_b(0x4);
+ v16i8 mask1 = mask0 + const_0x4;
+ v16i8 mask2 = mask1 + const_0x4;
+ v16i8 mask3 = mask2 + const_0x4;
+ v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ // Saturating add keeps the combined gradient within [0, 255].
+ vec0 = __msa_adds_u_b(src0, src1);
+ dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)const_0xFF, (v16i8)vec0);
+ dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)const_0xFF, (v16i8)vec0);
+ dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)const_0xFF, (v16i8)vec0);
+ dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)const_0xFF, (v16i8)vec0);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
+
+// SobelToPlaneRow_MSA: combines X and Y sobel planes into a single
+// grayscale plane, 32 pixels per loop iteration, via saturating byte
+// adds (result clamps at 255 rather than wrapping).
+// NOTE(review): assumes width is a multiple of 32 -- confirm callers.
+void SobelToPlaneRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ // Two 16-byte loads from each plane cover 32 pixels per iteration.
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
+ dst0 = __msa_adds_u_b(src0, src2);
+ dst1 = __msa_adds_u_b(src1, src3);
+ ST_UB2(dst0, dst1, dst_y, 16);
+ src_sobelx += 32;
+ src_sobely += 32;
+ dst_y += 32;
+ }
+}
+
+// SobelXYRow_MSA: combines X and Y sobel planes into ARGB, 16 pixels per
+// loop iteration.  Each output pixel carries the X gradient, the Y
+// gradient and their saturated sum in separate channels, with 0xFF alpha
+// (per the interleave order below: sobely, sum, sobelx, 0xFF bytes --
+// TODO(review): confirm channel mapping against SobelXYRow_C).
+void SobelXYRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, vec0, vec1, vec2;
+ v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
+ v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ // Saturated combined gradient, plus byte interleaves of the two
+ // source planes (vec1/vec2) and of alpha with the sum (reg0/reg1).
+ vec0 = __msa_adds_u_b(src0, src1);
+ vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
+ vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
+ reg0 = (v16u8)__msa_ilvr_b((v16i8)const_0xFF, (v16i8)vec0);
+ reg1 = (v16u8)__msa_ilvl_b((v16i8)const_0xFF, (v16i8)vec0);
+ // Second interleave level expands the byte pairs into 4-byte pixels.
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
« no previous file with comments | « source/row_any.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698