Chromium Code Reviews| Index: source/row_msa.cc |
| diff --git a/source/row_msa.cc b/source/row_msa.cc |
| index 52a246cdb2c6a0779e6789dd46f01bbde29077ff..b2ad8aaf6f829125397d5e393b27d30aea5627f0 100644 |
| --- a/source/row_msa.cc |
| +++ b/source/row_msa.cc |
| @@ -101,6 +101,124 @@ void I422ToUYVYRow_MSA(const uint8* src_y, |
| } |
| } |
| +void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) { /* Extracts the Y samples of one packed YUY2 row into dst_y; each pass consumes 128 source bytes and produces 64 output bytes. */ |
| + int x; /* pixel counter; only used for the loop bound */ |
| + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; |
| + v16u8 dst0, dst1, dst2, dst3; |
| + |
| + for (x = 0; x < width; x += 64) { /* 64 pixels per pass. NOTE(review): there is no tail handling, so a full 64-pixel pass runs even when fewer pixels remain - presumably the caller guarantees a suitable width or padding; confirm. */ |

fbarchard1
2016/10/05 22:03:58
suggest 32 at a time... or 16. 4 of everything re
manojkumar.bhosale
2016/10/07 10:42:09
We have done loop unrolling considering the load l
|
| + LD_UB8(src_yuy2, 16, src0, src1, src2, src3, src4, src5, src6, src7); /* presumably loads eight 16-byte vectors at a 16-byte stride (macro defined elsewhere) */ |
| + PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, /* presumably packs the even-indexed bytes of each vector pair; those are the Y samples assuming YUY2 byte order Y,U,Y,V - TODO confirm */ |
| + dst0, dst1, dst2, dst3); |
| + ST_UB4(dst0, dst1, dst2, dst3, dst_y, 16); /* presumably stores the four result vectors contiguously to dst_y */ |
| + src_yuy2 += 128; /* 2 bytes per pixel consumed */ |
| + dst_y += 64; /* 1 byte per pixel produced */ |

fbarchard1
2016/10/05 22:03:58
this is unrolled quite alot. other platforms are
manojkumar.bhosale
2016/10/07 10:42:09
Done.
|
| + } |
| +} |
| + |
| +void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2, /* Produces one row of U and one row of V from two packed YUY2 rows (src_yuy2 and the row src_stride_yuy2 bytes after it), averaging the two rows. */ |
| + uint8* dst_u, uint8* dst_v, int width) { |
| + const uint8* nxt = src_yuy2 + src_stride_yuy2; /* second input row */ |
| + int x; /* pixel counter; only used for the loop bound */ |
| + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; |
| + v16u8 vec0, vec1, dst0, dst1; |
| + |
| + for (x = 0; x < width; x += 32) { /* 32 pixels per pass: 64 bytes read from each row, 16 bytes written to each of dst_u and dst_v. NOTE(review): no tail handling - presumably the caller guarantees a suitable width; confirm. */ |
| + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); /* presumably four 16-byte vectors from the first row */ |
| + LD_UB4(nxt, 16, src4, src5, src6, src7); /* presumably four 16-byte vectors from the second row */ |
| + PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, /* presumably packs the odd-indexed bytes (interleaved U,V assuming YUY2 order Y,U,Y,V - TODO confirm); results overwrite src0..src3, so src0/src1 then hold first-row chroma and src2/src3 second-row chroma */ |
| + src0, src1, src2, src3); |
| + AVER_UB2_UB(src0, src2, src1, src3, vec0, vec1); /* presumably vec0 = average(src0, src2) and vec1 = average(src1, src3), i.e. first row averaged with second row */ |
| + dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0); /* even bytes of the averaged U,V stream; stored to dst_u below */ |

fbarchard1
2016/10/05 22:03:58
style nit - i like seeing the intrinisics (or inli
manojkumar.bhosale
2016/10/07 10:42:09
Acknowledged.
|
| + dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0); /* odd bytes of the averaged U,V stream; stored to dst_v below */ |
| + ST_UB(dst0, dst_u); |
| + ST_UB(dst1, dst_v); |
| + src_yuy2 += 64; /* both input rows advance in lockstep */ |
| + nxt += 64; |
| + dst_u += 16; |
| + dst_v += 16; |
| + } |
| +} |
| + |
| +void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, /* Splits the chroma of a single packed YUY2 row into separate U and V rows; unlike YUY2ToUVRow_MSA no second row is read and nothing is averaged. */ |

fbarchard1
2016/10/05 22:03:58
prefer this be same as YUY2TToUV and do 32 at a ti
manojkumar.bhosale
2016/10/07 10:42:09
Done.
|
| + int width) { |
| + int x; /* pixel counter; only used for the loop bound */ |
| + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; |
| + v16u8 dst0, dst1, dst2, dst3; |
| + |
| + for (x = 0; x < width; x += 64) { /* 64 pixels per pass: 128 bytes read, 32 bytes written to each of dst_u and dst_v. NOTE(review): no tail handling - presumably the caller guarantees a suitable width; confirm. */ |
| + LD_UB8(src_yuy2, 16, src0, src1, src2, src3, src4, src5, src6, src7); /* presumably eight 16-byte vectors at a 16-byte stride */ |
| + PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, /* presumably keeps the odd-indexed bytes (interleaved U,V assuming YUY2 order Y,U,Y,V - TODO confirm); results overwrite src0..src3 */ |
| + src0, src1, src2, src3); |
| + PCKEV_B2_UB(src1, src0, src3, src2, dst0, dst1); /* presumably even bytes of the U,V stream; stored to dst_u below */ |
| + PCKOD_B2_UB(src1, src0, src3, src2, dst2, dst3); /* presumably odd bytes of the U,V stream; stored to dst_v below */ |
| + ST_UB2(dst0, dst1, dst_u, 16); |
| + ST_UB2(dst2, dst3, dst_v, 16); |
| + src_yuy2 += 128; |
| + dst_u += 32; |
| + dst_v += 32; |
| + } |
| +} |
| + |
| +void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) { /* Extracts the Y samples of one packed UYVY row into dst_y; same structure as YUY2ToYRow_MSA but packing with PCKOD instead of PCKEV. */ |
| + int x; /* pixel counter; only used for the loop bound */ |
| + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; |
| + v16u8 dst0, dst1, dst2, dst3; |
| + |
| + for (x = 0; x < width; x += 64) { /* 64 pixels per pass: 128 bytes read, 64 bytes written. NOTE(review): no tail handling - presumably the caller guarantees a suitable width; confirm. */ |
| + LD_UB8(src_uyvy, 16, src0, src1, src2, src3, src4, src5, src6, src7); /* presumably eight 16-byte vectors at a 16-byte stride */ |
| + PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, /* presumably packs the odd-indexed bytes of each vector pair; those are the Y samples assuming UYVY byte order U,Y,V,Y - TODO confirm */ |
| + dst0, dst1, dst2, dst3); |
| + ST_UB4(dst0, dst1, dst2, dst3, dst_y, 16); /* presumably stores the four result vectors contiguously to dst_y */ |
| + src_uyvy += 128; /* 2 bytes per pixel consumed */ |
| + dst_y += 64; /* 1 byte per pixel produced */ |
| + } |
| +} |
| + |
| +void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy, /* Produces one row of U and one row of V from two packed UYVY rows (src_uyvy and the row src_stride_uyvy bytes after it), averaging the two rows. */ |
| + uint8* dst_u, uint8* dst_v, int width) { |
| + const uint8 *nxt = src_uyvy + src_stride_uyvy; /* second input row */ |
| + int x; /* pixel counter; only used for the loop bound */ |
| + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; |
| + v16u8 vec0, vec1, dst0, dst1; |
| + |
| + for (x = 0; x < width; x += 32) { /* 32 pixels per pass: 64 bytes read from each row, 16 bytes written to each of dst_u and dst_v. NOTE(review): no tail handling - presumably the caller guarantees a suitable width; confirm. */ |
| + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); /* presumably four 16-byte vectors from the first row */ |
| + LD_UB4(nxt, 16, src4, src5, src6, src7); /* presumably four 16-byte vectors from the second row */ |
| + PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, /* presumably packs the even-indexed bytes (interleaved U,V assuming UYVY order U,Y,V,Y - TODO confirm); results overwrite src0..src3, so src0/src1 then hold first-row chroma and src2/src3 second-row chroma */ |
| + src0, src1, src2, src3); |
| + AVER_UB2_UB(src0, src2, src1, src3, vec0, vec1); /* presumably vec0 = average(src0, src2) and vec1 = average(src1, src3), i.e. first row averaged with second row */ |
| + dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0); /* even bytes of the averaged U,V stream; stored to dst_u below */ |
| + dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0); /* odd bytes of the averaged U,V stream; stored to dst_v below */ |

fbarchard1
2016/10/05 22:03:58
the i8 version of vectors seems to be preferred?
manojkumar.bhosale
2016/10/07 10:42:09
As we are operating on pixel data (u8), we have us
|
| + ST_UB(dst0, dst_u); |
| + ST_UB(dst1, dst_v); |
| + src_uyvy += 64; /* both input rows advance in lockstep */ |
| + nxt += 64; |
| + dst_u += 16; |
| + dst_v += 16; |
| + } |
| +} |
| + |
| +void UYVYToUV422Row_MSA(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, /* Splits the chroma of a single packed UYVY row into separate U and V rows; unlike UYVYToUVRow_MSA no second row is read and nothing is averaged. */ |
| + int width) { |
| + int x; /* pixel counter; only used for the loop bound */ |
| + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; |
| + v16u8 dst0, dst1, dst2, dst3; |
| + |
| + for (x = 0; x < width; x += 64) { /* 64 pixels per pass: 128 bytes read, 32 bytes written to each of dst_u and dst_v. NOTE(review): no tail handling - presumably the caller guarantees a suitable width; confirm. */ |
| + LD_UB8(src_uyvy, 16, src0, src1, src2, src3, src4, src5, src6, src7); /* presumably eight 16-byte vectors at a 16-byte stride */ |
| + PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, /* presumably keeps the even-indexed bytes (interleaved U,V assuming UYVY order U,Y,V,Y - TODO confirm); results overwrite src0..src3 */ |
| + src0, src1, src2, src3); |
| + PCKEV_B2_UB(src1, src0, src3, src2, dst0, dst1); /* presumably even bytes of the U,V stream; stored to dst_u below */ |
| + PCKOD_B2_UB(src1, src0, src3, src2, dst2, dst3); /* presumably odd bytes of the U,V stream; stored to dst_v below */ |
| + ST_UB2(dst0, dst1, dst_u, 16); |
| + ST_UB2(dst2, dst3, dst_v, 16); |
| + src_uyvy += 128; |
| + dst_u += 32; |
| + dst_v += 32; |
| + } |
| +} |
| + |
| #ifdef __cplusplus |
| } // extern "C" |
| } // namespace libyuv |