Index: source/row_msa.cc
diff --git a/source/row_msa.cc b/source/row_msa.cc
index 1e174fd66f4a5de7770dad0fb1ab19b3e69a1e57..c5c0e98c5cf6960a2f9432002671ff75b7489339 100644
--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -47,65 +47,66 @@ extern "C" {
   }
 // Convert 8 pixels of YUV 420 to RGB.
-#define YUVTORGB(in_y, in_u, in_v, ub, vr, ug, vg, bb, bg, br, yg, out_b, \
-                 out_g, out_r) \
-  { \
-    v8i16 vec0_m; \
-    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
-    v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \
-    v4i32 max_val_m = __msa_ldi_w(255); \
-    v8i16 zero_m = {0}; \
- \
-    in_u = (v16u8)__msa_ilvr_b((v16i8)in_u, (v16i8)in_u); \
-    in_v = (v16u8)__msa_ilvr_b((v16i8)in_v, (v16i8)in_v); \
-    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
-    reg0_m = (v4i32)__msa_ilvr_h(zero_m, vec0_m); \
-    reg1_m = (v4i32)__msa_ilvl_h(zero_m, vec0_m); \
-    reg0_m *= vec_yg; \
-    reg1_m *= vec_yg; \
-    reg0_m = __msa_srai_w(reg0_m, 16); \
-    reg1_m = __msa_srai_w(reg1_m, 16); \
-    reg4_m = reg0_m + br; \
-    reg5_m = reg1_m + br; \
-    reg2_m = reg0_m + bg; \
-    reg3_m = reg1_m + bg; \
-    reg0_m += bb; \
-    reg1_m += bb; \
-    vec0_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_u); \
-    reg6_m = (v4i32)__msa_ilvr_h(zero_m, (v8i16)vec0_m); \
-    reg7_m = (v4i32)__msa_ilvl_h(zero_m, (v8i16)vec0_m); \
-    vec0_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_v); \
-    reg8_m = (v4i32)__msa_ilvr_h(zero_m, (v8i16)vec0_m); \
-    reg9_m = (v4i32)__msa_ilvl_h(zero_m, (v8i16)vec0_m); \
-    reg0_m -= reg6_m * ub; \
-    reg1_m -= reg7_m * ub; \
-    reg2_m -= reg6_m * ug; \
-    reg3_m -= reg7_m * ug; \
-    reg4_m -= reg8_m * vr; \
-    reg5_m -= reg9_m * vr; \
-    reg2_m -= reg8_m * vg; \
-    reg3_m -= reg9_m * vg; \
-    reg0_m = __msa_srai_w(reg0_m, 6); \
-    reg1_m = __msa_srai_w(reg1_m, 6); \
-    reg2_m = __msa_srai_w(reg2_m, 6); \
-    reg3_m = __msa_srai_w(reg3_m, 6); \
-    reg4_m = __msa_srai_w(reg4_m, 6); \
-    reg5_m = __msa_srai_w(reg5_m, 6); \
-    reg0_m = __msa_maxi_s_w(reg0_m, 0); \
-    reg1_m = __msa_maxi_s_w(reg1_m, 0); \
-    reg2_m = __msa_maxi_s_w(reg2_m, 0); \
-    reg3_m = __msa_maxi_s_w(reg3_m, 0); \
-    reg4_m = __msa_maxi_s_w(reg4_m, 0); \
-    reg5_m = __msa_maxi_s_w(reg5_m, 0); \
-    reg0_m = __msa_min_s_w(reg0_m, max_val_m); \
-    reg1_m = __msa_min_s_w(reg1_m, max_val_m); \
-    reg2_m = __msa_min_s_w(reg2_m, max_val_m); \
-    reg3_m = __msa_min_s_w(reg3_m, max_val_m); \
-    reg4_m = __msa_min_s_w(reg4_m, max_val_m); \
-    reg5_m = __msa_min_s_w(reg5_m, max_val_m); \
-    out_b = __msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
-    out_g = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
-    out_r = __msa_pckev_h((v8i16)reg5_m, (v8i16)reg4_m); \
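+// The rewritten macro takes Y plus a single vector of interleaved UV
+// (u0v0u1v1...). The B and R chroma products come from one multiply against
+// the paired {ub, vr, ...} constant, and the G term from one __msa_dotp_s_w
+// against {ug, vg, ...}, replacing the eight separate chroma multiplies of
+// the old version.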
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
+  { \
+    v8i16 vec0_m, vec1_m; \
+    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
+    v4i32 reg5_m, reg6_m, reg7_m; \
+    v4i32 max = __msa_ldi_w(255); \
+    v16i8 zero = {0}; \
+ \
+    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
+    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in_uv); \
+    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0_m); \
+    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0_m); \
+    reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1_m); \
+    reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1_m); \
+    reg0_m *= yg; \
+    reg1_m *= yg; \
+    reg2_m *= ubvr; \
+    reg3_m *= ubvr; \
+    reg0_m = __msa_srai_w(reg0_m, 16); \
+    reg1_m = __msa_srai_w(reg1_m, 16); \
+    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
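+    /* Two adjacent pixels share one UV sample, so duplicate each chroma */ \
+    /* product pairwise: even words carry U*ub (B), odd words V*vr (R).  */ \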
+    reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
+    reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
+    reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
+    reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
+    reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
+    reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
+    reg5_m = reg0_m - reg5_m; \
+    reg6_m = reg1_m - reg6_m; \
+    reg2_m = reg0_m - reg2_m; \
+    reg3_m = reg1_m - reg3_m; \
+    reg7_m = reg0_m - reg7_m; \
+    reg4_m = reg1_m - reg4_m; \
+    reg5_m += bb; \
+    reg6_m += bb; \
+    reg7_m += bg; \
+    reg4_m += bg; \
+    reg2_m += br; \
+    reg3_m += br; \
+    reg5_m = __msa_srai_w(reg5_m, 6); \
+    reg6_m = __msa_srai_w(reg6_m, 6); \
+    reg7_m = __msa_srai_w(reg7_m, 6); \
+    reg4_m = __msa_srai_w(reg4_m, 6); \
+    reg2_m = __msa_srai_w(reg2_m, 6); \
+    reg3_m = __msa_srai_w(reg3_m, 6); \
+    reg5_m = __msa_maxi_s_w(reg5_m, 0); \
+    reg6_m = __msa_maxi_s_w(reg6_m, 0); \
+    reg7_m = __msa_maxi_s_w(reg7_m, 0); \
+    reg4_m = __msa_maxi_s_w(reg4_m, 0); \
+    reg2_m = __msa_maxi_s_w(reg2_m, 0); \
+    reg3_m = __msa_maxi_s_w(reg3_m, 0); \
+    reg5_m = __msa_min_s_w(max, reg5_m); \
+    reg6_m = __msa_min_s_w(max, reg6_m); \
+    reg7_m = __msa_min_s_w(max, reg7_m); \
+    reg4_m = __msa_min_s_w(max, reg4_m); \
+    reg2_m = __msa_min_s_w(max, reg2_m); \
+    reg3_m = __msa_min_s_w(max, reg3_m); \
+    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
+    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
+    out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
   }
 // Pack and Store 8 ARGB values.
@@ -212,15 +213,19 @@ void I422ToARGBRow_MSA(const uint8* src_y,
   v16u8 src0, src1, src2;
   v8i16 vec0, vec1, vec2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v16u8 const_255 = (v16u8)__msa_ldi_b(255);
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
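+  // Pair the chroma coefficients to match the interleaved UV layout:
+  // vec_ubvr = {ub, vr, ub, vr}, vec_ugvg = {ug, vg, ug, vg, ...}.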
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   for (x = 0; x < width; x += 8) {
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
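+    // Interleave U and V into u0v0u1v1... as the combined macro expects.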
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
     STOREARGB(vec0, vec1, vec2, const_255, rgb_buf);
     src_y += 8;
     src_u += 4;
@@ -239,15 +244,19 @@ void I422ToRGBARow_MSA(const uint8* src_y,
   v16u8 src0, src1, src2;
   v8i16 vec0, vec1, vec2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v16u8 const_255 = (v16u8)__msa_ldi_b(255);
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   for (x = 0; x < width; x += 8) {
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
     STOREARGB(const_255, vec0, vec1, vec2, rgb_buf);
     src_y += 8;
     src_u += 4;
@@ -268,17 +277,21 @@ void I422AlphaToARGBRow_MSA(const uint8* src_y,
   v16u8 src0, src1, src2, src3;
   v8i16 vec0, vec1, vec2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v4i32 zero = {0};
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   for (x = 0; x < width; x += 8) {
     data_a = LD(src_a);
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
     src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
     src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
     STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
     src_y += 8;
@@ -297,9 +310,10 @@ void I422ToRGB24Row_MSA(const uint8* src_y,
                         int32 width) {
   int x;
   int64 data_u, data_v;
-  v16u8 src0, src1, src2, src3, src4, src5, dst0, dst1, dst2;
+  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
   v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v16u8 reg0, reg1, reg2, reg3;
   v2i64 zero = {0};
   v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
@@ -309,6 +323,8 @@ void I422ToRGB24Row_MSA(const uint8* src_y,
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   for (x = 0; x < width; x += 16) {
     src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
@@ -316,13 +332,13 @@ void I422ToRGB24Row_MSA(const uint8* src_y,
     data_v = LD(src_v);
     src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
     src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
     src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
-    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 4);
-    src5 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
-    YUVTORGB(src3, src4, src5, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec3, vec4, vec5);
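+    // src3/src4 hold Y and interleaved UV for pixels 8-15: one 8-byte shift
+    // now moves four UV pairs instead of the two 4-byte shifts used before.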
+    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec3, vec4, vec5);
     reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
     reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
     reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
@@ -350,14 +366,18 @@ void I422ToRGB565Row_MSA(const uint8* src_y,
   v16u8 src0, src1, src2, dst0;
   v8i16 vec0, vec1, vec2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   for (x = 0; x < width; x += 8) {
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec2, vec1);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec2, vec1);
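+    // Note the output order: vec0 = B, vec2 = G, vec1 = R, matching the
+    // 5/6/5 bit split applied below.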
     vec0 = __msa_srai_h(vec0, 3);
     vec1 = __msa_srai_h(vec1, 3);
     vec2 = __msa_srai_h(vec2, 2);
@@ -385,15 +405,19 @@ void I422ToARGB4444Row_MSA(const uint8* src_y,
   v8i16 vec0, vec1, vec2;
   v8u16 reg0, reg1, reg2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   for (x = 0; x < width; x += 8) {
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
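+    // ARGB4444 keeps only the top 4 bits of each channel.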
     reg0 = (v8u16)__msa_srai_h(vec0, 4);
     reg1 = (v8u16)__msa_srai_h(vec1, 4);
     reg2 = (v8u16)__msa_srai_h(vec2, 4);
@@ -421,15 +445,19 @@ void I422ToARGB1555Row_MSA(const uint8* src_y,
   v8i16 vec0, vec1, vec2;
   v8u16 reg0, reg1, reg2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   for (x = 0; x < width; x += 8) {
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
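+    // ARGB1555 keeps the top 5 bits of each channel; const_0x8000 supplies
+    // the alpha bit.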
     reg0 = (v8u16)__msa_srai_h(vec0, 3);
     reg1 = (v8u16)__msa_srai_h(vec1, 3);
     reg2 = (v8u16)__msa_srai_h(vec2, 3);
@@ -2023,6 +2051,195 @@ void RAWToUVRow_MSA(const uint8* src_rgb0,
   }
 }
+void NV12ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, res0, res1, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 zero = {0};
+  v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
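+    // NV12 chroma is already interleaved (u0v0u1v1...), so it feeds the
+    // macro directly with no ilvr_b step.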
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
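+    // Interleave B,G,R with the 0xFF alpha and store 8 ARGB pixels (32 bytes).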
+    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)const_255, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_uv += 8;
+    rgb_buf += 32;
+  }
+}
+
+void NV12ToRGB565Row_MSA(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, dst0;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
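+    // Pack to RGB565: B in bits 0-4, G in bits 5-10, R in bits 11-15.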
+    vec0 = vec0 >> 3;
+    vec1 = (vec1 >> 2) << 5;
+    vec2 = (vec2 >> 3) << 11;
+    dst0 = (v16u8)(vec0 | vec1 | vec2);
+    ST_UB(dst0, rgb_buf);
+    src_y += 8;
+    src_uv += 8;
+    rgb_buf += 16;
+  }
+}
+
+void NV21ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_vu,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, res0, res1, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+  v16u8 zero = {0};
+  v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_vu);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
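+    // NV21 stores V first (v0u0v1u1...); swap each byte pair to get the
+    // u0v0... order the macro expects.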
+    src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)const_255, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_vu += 8;
+    rgb_buf += 32;
+  }
+}
+
+void SobelRow_MSA(const uint8* src_sobelx,
+                  const uint8* src_sobely,
+                  uint8* dst_argb,
+                  int width) {
+  int x;
+  v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
+  v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
+  v16i8 const_0x4 = __msa_ldi_b(0x4);
+  v16i8 mask1 = mask0 + const_0x4;
+  v16i8 mask2 = mask1 + const_0x4;
+  v16i8 mask3 = mask2 + const_0x4;
+  v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
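+  // In vshf_b, indices 0-15 select the Sobel byte from vec0 and index 16
+  // selects 0xFF from const_0xFF, expanding each byte to a grey ARGB pixel.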
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)const_0xFF, (v16i8)vec0);
+    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)const_0xFF, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)const_0xFF, (v16i8)vec0);
+    dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)const_0xFF, (v16i8)vec0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
+void SobelToPlaneRow_MSA(const uint8* src_sobelx,
+                         const uint8* src_sobely,
+                         uint8* dst_y,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_sobelx += 32;
+    src_sobely += 32;
+    dst_y += 32;
+  }
+}
+
+void SobelXYRow_MSA(const uint8* src_sobelx,
+                    const uint8* src_sobely,
+                    uint8* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, vec2;
+  v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
+  v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
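+  // Output pixel layout: B = Sobel Y, G = saturated X+Y sum, R = Sobel X,
+  // A = 0xFF.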
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
+    vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
+    reg0 = (v16u8)__msa_ilvr_b((v16i8)const_0xFF, (v16i8)vec0);
+    reg1 = (v16u8)__msa_ilvl_b((v16i8)const_0xFF, (v16i8)vec0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv