OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 1020 matching lines...) | |
1031 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16); | 1031 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16); |
1032 dst0 = __msa_subs_u_b(src0, src2); | 1032 dst0 = __msa_subs_u_b(src0, src2); |
1033 dst1 = __msa_subs_u_b(src1, src3); | 1033 dst1 = __msa_subs_u_b(src1, src3); |
1034 ST_UB2(dst0, dst1, dst_argb, 16); | 1034 ST_UB2(dst0, dst1, dst_argb, 16); |
1035 src_argb0 += 32; | 1035 src_argb0 += 32; |
1036 src_argb1 += 32; | 1036 src_argb1 += 32; |
1037 dst_argb += 32; | 1037 dst_argb += 32; |
1038 } | 1038 } |
1039 } | 1039 } |
1040 | 1040 |
1041 void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) { | |
1042 int x; | |
1043 v16u8 src0, src1, dst0, dst1; | |
1044 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; | |
1045 v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; | |
1046 v8i16 zero = {0}; | |
1047 v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; | |
1048 | |
1049 for (x = 0; x < width; x += 8) { | |
1050 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); | |
fbarchard1 (2016/12/12 20:09:03): Can this function be further optimized? Suggest a
manojkumar.bhosale (2016/12/15 06:35:21): Acknowledged.
| |
1051 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); | |
1052 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); | |
1053 vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); | |
1054 vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); | |
1055 vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); | |
1056 vec4 = (v8u16)__msa_fill_h(vec0[3]); | |
1057 vec5 = (v8u16)__msa_fill_h(vec0[7]); | |
1058 vec6 = (v8u16)__msa_fill_h(vec1[3]); | |
1059 vec7 = (v8u16)__msa_fill_h(vec1[7]); | |
1060 vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); | |
1061 vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); | |
1062 vec6 = (v8u16)__msa_fill_h(vec2[3]); | |
1063 vec7 = (v8u16)__msa_fill_h(vec2[7]); | |
1064 vec8 = (v8u16)__msa_fill_h(vec3[3]); | |
1065 vec9 = (v8u16)__msa_fill_h(vec3[7]); | |
1066 vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); | |
1067 vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); | |
1068 reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4); | |
1069 reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); | |
1070 reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); | |
1071 reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); | |
1072 reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); | |
1073 reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6); | |
1074 reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); | |
1075 reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); | |
1076 reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); | |
1077 reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); | |
1078 reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); | |
1079 reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); | |
1080 reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); | |
1081 reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); | |
1082 reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); | |
1083 reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); | |
1084 reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); | |
1085 reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); | |
1086 reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); | |
1087 reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); | |
1088 reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); | |
1089 reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); | |
1090 reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); | |
1091 reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); | |
1092 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); | |
1093 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); | |
1094 vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); | |
1095 vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); | |
1096 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); | |
1097 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); | |
1098 dst0 = __msa_bmnz_v(dst0, src0, mask); | |
1099 dst1 = __msa_bmnz_v(dst1, src1, mask); | |
1100 ST_UB2(dst0, dst1, dst_argb, 16); | |
1101 src_argb += 32; | |
1102 dst_argb += 32; | |
1103 } | |
1104 } | |
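As context for fbarchard1's optimization question above, here is a minimal scalar sketch of what this row function computes. It is not libyuv's C path; the function name is hypothetical and <stdint.h> types stand in for libyuv's uint8/uint32 typedefs. The MSA code widens each byte b to b * 257 (by interleaving the vector with itself), multiplies by the equally widened alpha, and keeps the top bits (srai_w by 24), which approximates b * a / 255; the alpha byte itself passes through unchanged via the bmnz mask.

#include <stdint.h>

/* Hypothetical scalar reference for the attenuate math above. */
static void ARGBAttenuateRow_Ref(const uint8_t* src_argb, uint8_t* dst_argb,
                                 int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];
    for (j = 0; j < 3; ++j) { /* B, G, R */
      uint32_t c = src_argb[j];
      /* (c * 257) * (a * 257) fits in 32 bits: at most 65535 * 65535. */
      dst_argb[j] = (uint8_t)(((c * 257) * (a * 257)) >> 24);
    }
    dst_argb[3] = (uint8_t)a; /* alpha is copied, not attenuated. */
    src_argb += 4;
    dst_argb += 4;
  }
}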
1105 | |
1106 void ARGBToRGB565DitherRow_MSA(const uint8* src_argb, | |
1107 uint8* dst_rgb, | |
1108 uint32 dither4, | |
1109 int width) { | |
1110 int x; | |
1111 v16u8 src0, src1, dst0, vec0, vec1; | |
1112 v8i16 vec_d0; | |
1113 v8i16 reg0, reg1, reg2; | |
1114 v16i8 zero = {0}; | |
1115 v8i16 max = __msa_ldi_h(0xFF); | |
1116 | |
1117 vec_d0 = (v8i16)__msa_fill_w(dither4); | |
1118 vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); | |
1119 | |
1120 for (x = 0; x < width; x += 8) { | |
1121 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); | |
1122 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); | |
1123 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); | |
1124 vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); | |
1125 reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); | |
1126 reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); | |
1127 reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); | |
1128 reg0 += vec_d0; | |
1129 reg1 += vec_d0; | |
1130 reg2 += vec_d0; | |
1131 reg0 = __msa_maxi_s_h((v8i16)reg0, 0); | |
1132 reg1 = __msa_maxi_s_h((v8i16)reg1, 0); | |
1133 reg2 = __msa_maxi_s_h((v8i16)reg2, 0); | |
1134 reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); | |
1135 reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); | |
1136 reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); | |
1137 reg0 = __msa_srai_h(reg0, 3); | |
1138 reg2 = __msa_srai_h(reg2, 3); | |
1139 reg1 = __msa_srai_h(reg1, 2); | |
1140 reg2 = __msa_slli_h(reg2, 11); | |
1141 reg1 = __msa_slli_h(reg1, 5); | |
1142 reg0 |= reg1; | |
1143 dst0 = (v16u8)(reg0 | reg2); | |
1144 ST_UB(dst0, dst_rgb); | |
1145 src_argb += 32; | |
1146 dst_rgb += 16; | |
1147 } | |
1148 } | |
1149 | |
1150 void ARGBShuffleRow_MSA(const uint8* src_argb, | |
1151 uint8* dst_argb, | |
1152 const uint8* shuffler, | |
1153 int width) { | |
1154 int x; | |
1155 v16u8 src0, src1, dst0, dst1; | |
1156 v16i8 vec0; | |
1157 v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; | |
1158 int32 val = LW((int32*)shuffler); | |
1159 | |
1160 vec0 = (v16i8)__msa_fill_w(val); | |
1161 shuffler_vec += vec0; | |
1162 | |
1163 for (x = 0; x < width; x += 8) { | |
1164 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); | |
1165 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); | |
1166 dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); | |
1167 dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); | |
1168 ST_UB2(dst0, dst1, dst_argb, 16); | |
1169 src_argb += 32; | |
1170 dst_argb += 32; | |
1171 } | |
1172 } | |
1173 | |
1174 void ARGBShadeRow_MSA(const uint8* src_argb, | |
1175 uint8* dst_argb, | |
1176 int width, | |
1177 uint32 value) { | |
1178 int x; | |
1179 v16u8 src0, dst0; | |
1180 v8u16 vec0, vec1; | |
1181 v4u32 reg0, reg1, reg2, reg3, rgba_scale; | |
1182 v8i16 zero = {0}; | |
1183 | |
1184 rgba_scale[0] = value; | |
1185 rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); | |
1186 rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); | |
1187 | |
1188 for (x = 0; x < width; x += 4) { | |
1189 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); | |
1190 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); | |
1191 vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); | |
1192 reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); | |
1193 reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); | |
1194 reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); | |
1195 reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); | |
1196 reg0 *= rgba_scale; | |
1197 reg1 *= rgba_scale; | |
1198 reg2 *= rgba_scale; | |
1199 reg3 *= rgba_scale; | |
1200 reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); | |
1201 reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); | |
1202 reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); | |
1203 reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); | |
1204 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); | |
1205 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); | |
1206 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); | |
1207 ST_UB(dst0, dst_argb); | |
1208 src_argb += 16; | |
1209 dst_argb += 16; | |
1210 } | |
1211 } | |
1212 | |
1213 void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) { | |
1214 int x; | |
1215 v16u8 src0, src1, vec0, vec1, dst0, dst1; | |
1216 v8u16 reg0; | |
1217 v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26); | |
1218 v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); | |
fbarchard1 (2016/12/12 20:09:03): note these constants may need to be passed in as p
manojkumar.bhosale (2016/12/15 06:35:21): Acknowledged.
| |
1219 | |
1220 for (x = 0; x < width; x += 8) { | |
1221 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); | |
1222 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); | |
1223 vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); | |
1224 vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); | |
1225 reg0 = __msa_dotp_u_h(vec0, const_0x4B0F); | |
1226 reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26); | |
1227 reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7); | |
1228 vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); | |
1229 vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); | |
1230 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); | |
1231 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); | |
1232 ST_UB2(dst0, dst1, dst_argb, 16); | |
1233 src_argb += 32; | |
1234 dst_argb += 32; | |
1235 } | |
1236 } | |
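fbarchard1's comment above is cut off mid-word; it appears to suggest passing the gray constants in as parameters. Below is a hedged scalar sketch of that idea, with the weights read off the vector constants in the MSA code: 0x4B0F dotted with the (B, G) halfwords gives 15 * B + 75 * G, 0x26 against the (R, A) halfwords adds 38 * R, and srari_h(..., 7) divides by 128 with rounding. The name and signature are illustrative only, not libyuv API.

#include <stdint.h>

/* Hypothetical variant with the gray weights hoisted into arguments. */
static void ARGBGrayRow_C_Weights(const uint8_t* src_argb, uint8_t* dst_argb,
                                  int width, int bw, int gw, int rw) {
  int i;
  for (i = 0; i < width; ++i) {
    /* Round-to-nearest weighted sum, matching __msa_srari_h(reg0, 7). */
    uint8_t y = (uint8_t)((src_argb[0] * bw + src_argb[1] * gw +
                           src_argb[2] * rw + 64) >> 7);
    dst_argb[0] = y;
    dst_argb[1] = y;
    dst_argb[2] = y;
    dst_argb[3] = src_argb[3]; /* alpha is preserved. */
    src_argb += 4;
    dst_argb += 4;
  }
}

/* Usage matching the constants above: ARGBGrayRow_C_Weights(src, dst, w, 15, 75, 38); */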
1237 | |
1238 void ARGBSepiaRow_MSA(uint8* dst_argb, int width) { | |
1239 int x; | |
1240 v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; | |
1241 v8u16 reg0, reg1, reg2; | |
1242 v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); | |
1243 v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); | |
1244 v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); | |
1245 v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); | |
1246 v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); | |
1247 v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); | |
1248 v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); | |
1249 | |
1250 for (x = 0; x < width; x += 8) { | |
1251 src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); | |
1252 src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); | |
1253 vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); | |
1254 vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); | |
1255 vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); | |
1256 reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); | |
1257 reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); | |
1258 reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); | |
1259 reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); | |
1260 reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); | |
1261 reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); | |
1262 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); | |
1263 reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); | |
1264 reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); | |
1265 reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); | |
1266 reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); | |
1267 vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); | |
1268 vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); | |
1269 vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); | |
1270 vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); | |
1271 vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); | |
1272 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); | |
1273 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); | |
1274 ST_UB2(dst0, dst1, dst_argb, 16); | |
1275 dst_argb += 32; | |
1276 } | |
1277 } | |
1278 | |
1041 void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, | 1279 void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, |
1042 uint8* dst_argb, | 1280 uint8* dst_argb, |
1043 int width) { | 1281 int width) { |
1044 int x; | 1282 int x; |
1045 v16u8 src0, src1; | 1283 v16u8 src0, src1; |
1046 v8u16 vec0, vec1, vec2, vec3; | 1284 v8u16 vec0, vec1, vec2, vec3; |
1047 v16u8 dst0, dst1, dst2, dst3; | 1285 v16u8 dst0, dst1, dst2, dst3; |
1048 | 1286 |
1049 for (x = 0; x < width; x += 16) { | 1287 for (x = 0; x < width; x += 16) { |
1050 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0); | 1288 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0); |
(...skipping 15 matching lines...) | |
1066 dst_argb += 64; | 1304 dst_argb += 64; |
1067 } | 1305 } |
1068 } | 1306 } |
1069 | 1307 |
1070 #ifdef __cplusplus | 1308 #ifdef __cplusplus |
1071 } // extern "C" | 1309 } // extern "C" |
1072 } // namespace libyuv | 1310 } // namespace libyuv |
1073 #endif | 1311 #endif |
1074 | 1312 |
1075 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 1313 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |