Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(104)

Side by Side Diff: source/row_msa.cc

Issue 2559693002: Add MSA optimized ARGB Attenuate/RGB565/Shuffle/Shader/Gray/Sepia row functions (Closed)
Patch Set: Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_any.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 1020 matching lines...) Expand 10 before | Expand all | Expand 10 after
1031 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16); 1031 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
1032 dst0 = __msa_subs_u_b(src0, src2); 1032 dst0 = __msa_subs_u_b(src0, src2);
1033 dst1 = __msa_subs_u_b(src1, src3); 1033 dst1 = __msa_subs_u_b(src1, src3);
1034 ST_UB2(dst0, dst1, dst_argb, 16); 1034 ST_UB2(dst0, dst1, dst_argb, 16);
1035 src_argb0 += 32; 1035 src_argb0 += 32;
1036 src_argb1 += 32; 1036 src_argb1 += 32;
1037 dst_argb += 32; 1037 dst_argb += 32;
1038 } 1038 }
1039 } 1039 }
1040 1040
// ARGBAttenuateRow_MSA: writes to dst_argb a copy of src_argb in which every
// byte has been multiplied by a per-pixel factor taken from byte 3 of the same
// 4-byte pixel, except that byte 3 itself is copied from the source unchanged
// (see the mask/bmnz step at the end of the loop).
//
//   src_argb  input row, read 32 bytes per iteration.
//   dst_argb  output row, written 32 bytes per iteration.
//   width     pixel count; the loop advances 8 pixels (32 bytes) at a time and
//             has no tail handling, so it always loads and stores whole
//             32-byte groups.
//             NOTE(review): presumably callers guarantee a multiple of 8 or go
//             through a remainder wrapper - confirm.
//
// NOTE(review): the step descriptions below follow the names of the MSA
// intrinsics (interleave / fill / pack / shift); confirm against the MSA
// reference before relying on them.
1041 void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
1042 int x;
1043 v16u8 src0, src1, dst0, dst1;
1044 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1045 v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
1046 v8i16 zero = {0};
// 255 in byte 3 of every 4-byte pixel: selects the bytes that bmnz takes from
// the source at the end of each iteration.
1047 v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
1048
1049 for (x = 0; x < width; x += 8) {
1050 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
fbarchard1 2016/12/12 20:09:03 Can this function be further optimized? Suggest a
manojkumar.bhosale 2016/12/15 06:35:21 Acknowledged.
1051 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
// Interleave each source vector with itself, giving 16-bit lanes built from
// pairs of identical bytes.
1052 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
1053 vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
1054 vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
1055 vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
// Lanes 3 and 7 of each widened vector are broadcast and then packed so that
// one vector carries the factor for two pixels (four lanes each).
1056 vec4 = (v8u16)__msa_fill_h(vec0[3]);
1057 vec5 = (v8u16)__msa_fill_h(vec0[7]);
1058 vec6 = (v8u16)__msa_fill_h(vec1[3]);
1059 vec7 = (v8u16)__msa_fill_h(vec1[7]);
1060 vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
1061 vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
1062 vec6 = (v8u16)__msa_fill_h(vec2[3]);
1063 vec7 = (v8u16)__msa_fill_h(vec2[7]);
1064 vec8 = (v8u16)__msa_fill_h(vec3[3]);
1065 vec9 = (v8u16)__msa_fill_h(vec3[7]);
1066 vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
1067 vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
// Widen the factors to 32-bit lanes by interleaving with zero.
1068 reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
1069 reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
1070 reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
1071 reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
1072 reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
1073 reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
1074 reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
1075 reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
// Multiply by the (equally widened) doubled source bytes, 32 bits per lane.
1076 reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
1077 reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
1078 reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
1079 reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
1080 reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
1081 reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
1082 reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
1083 reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
// Shift each product right by 24. The shift used is the arithmetic (signed)
// variant applied to unsigned products; only the low byte of each lane
// survives the packs below.
// NOTE(review): confirm the sign fill in the upper bits is always discarded.
1084 reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
1085 reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
1086 reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
1087 reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
1088 reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
1089 reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
1090 reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
1091 reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
// Narrow 32 -> 16 -> 8 bits by keeping the even (low) halves.
1092 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1093 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1094 vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
1095 vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
1096 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1097 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
// Take the masked bytes (byte 3 of each pixel) from the original source.
1098 dst0 = __msa_bmnz_v(dst0, src0, mask);
1099 dst1 = __msa_bmnz_v(dst1, src1, mask);
1100 ST_UB2(dst0, dst1, dst_argb, 16);
1101 src_argb += 32;
1102 dst_argb += 32;
1103 }
1104 }
1105
// ARGBToRGB565DitherRow_MSA: converts 4-byte pixels to 16-bit pixels with a
// dither value added to three of the four bytes before they are truncated to
// 5, 6 and 5 bits.
//
//   src_argb  input row, read 32 bytes (8 pixels) per iteration.
//   dst_rgb   output row, written 16 bytes (8 pixels) per iteration.
//   dither4   32-bit value holding four dither bytes; it is broadcast to every
//             word and its bytes are widened to 16-bit lanes before the loop.
//   width     pixel count; the loop steps 8 pixels at a time with no tail
//             handling.
//             NOTE(review): presumably width is a multiple of 8 or a wrapper
//             handles the remainder - confirm.
1106 void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
1107 uint8* dst_rgb,
1108 uint32 dither4,
1109 int width) {
1110 int x;
1111 v16u8 src0, src1, dst0, vec0, vec1;
1112 v8i16 vec_d0;
1113 v8i16 reg0, reg1, reg2;
1114 v16i8 zero = {0};
// Upper clamp bound (255) for the dithered 16-bit lanes.
1115 v8i16 max = __msa_ldi_h(0xFF);
1116
// Loop-invariant: dither bytes expanded to one 16-bit lane per pixel column.
1117 vec_d0 = (v8i16)__msa_fill_w(dither4);
1118 vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);
1119
1120 for (x = 0; x < width; x += 8) {
1121 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
1122 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
// Split even/odd bytes, then zero-extend three of the four per-pixel bytes
// into 16-bit lanes (reg0, reg1, reg2).
// NOTE(review): which colour channel each holds depends on the in-memory byte
// order of the input - confirm.
1123 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
1124 vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
1125 reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
1126 reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
1127 reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
// Add the dither, then clamp to [0, 255].
1128 reg0 += vec_d0;
1129 reg1 += vec_d0;
1130 reg2 += vec_d0;
1131 reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
1132 reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
1133 reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
1134 reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
1135 reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
1136 reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
// Reduce to 5/6/5 bits and assemble: reg0 in bits 0-4, reg1 in bits 5-10,
// reg2 in bits 11-15.
1137 reg0 = __msa_srai_h(reg0, 3);
1138 reg2 = __msa_srai_h(reg2, 3);
1139 reg1 = __msa_srai_h(reg1, 2);
1140 reg2 = __msa_slli_h(reg2, 11);
1141 reg1 = __msa_slli_h(reg1, 5);
1142 reg0 |= reg1;
1143 dst0 = (v16u8)(reg0 | reg2);
1144 ST_UB(dst0, dst_rgb);
1145 src_argb += 32;
1146 dst_rgb += 16;
1147 }
1148 }
1149
// ARGBShuffleRow_MSA: reorders the bytes inside every 4-byte pixel according
// to a 4-entry index table.
//
//   src_argb  input row, read 32 bytes (8 pixels) per iteration.
//   dst_argb  output row, written 32 bytes (8 pixels) per iteration.
//   shuffler  pointer to the index table; exactly 4 bytes are read from it
//             (as one 32-bit word) before the loop.
//   width     pixel count; the loop steps 8 pixels at a time with no tail
//             handling.
//             NOTE(review): presumably width is a multiple of 8 or a wrapper
//             handles the remainder - confirm.
1150 void ARGBShuffleRow_MSA(const uint8* src_argb,
1151 uint8* dst_argb,
1152 const uint8* shuffler,
1153 int width) {
1154 int x;
1155 v16u8 src0, src1, dst0, dst1;
1156 v16i8 vec0;
// Base byte offset of each of the four pixels held in one 16-byte vector.
1157 v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
// NOTE(review): reads the table through an int32 pointer cast; LW is defined
// outside this view - confirm it tolerates an unaligned address.
1158 int32 val = LW((int32*)shuffler);
1159
// Replicate the 4 table bytes across the vector and add the per-pixel base,
// giving a full 16-entry byte-shuffle control vector.
1160 vec0 = (v16i8)__msa_fill_w(val);
1161 shuffler_vec += vec0;
1162
1163 for (x = 0; x < width; x += 8) {
1164 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
1165 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
// Both data operands are the same vector, so indices select within it.
1166 dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
1167 dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
1168 ST_UB2(dst0, dst1, dst_argb, 16);
1169 src_argb += 32;
1170 dst_argb += 32;
1171 }
1172 }
1173
// ARGBShadeRow_MSA: multiplies every byte of each 4-byte pixel by the
// corresponding byte of a 32-bit scale value and keeps bits 24..31 of the
// 32-bit product.
//
//   src_argb  input row, read 16 bytes (4 pixels) per iteration.
//   dst_argb  output row, written 16 bytes (4 pixels) per iteration.
//   width     pixel count; the loop steps 4 pixels at a time with no tail
//             handling.
//             NOTE(review): presumably width is a multiple of 4 or a wrapper
//             handles the remainder - confirm.
//   value     packed 4-byte scale, one byte per pixel byte.
1174 void ARGBShadeRow_MSA(const uint8* src_argb,
1175 uint8* dst_argb,
1176 int width,
1177 uint32 value) {
1178 int x;
1179 v16u8 src0, dst0;
1180 v8u16 vec0, vec1;
1181 v4u32 reg0, reg1, reg2, reg3, rgba_scale;
1182 v8i16 zero = {0};
1183
// NOTE(review): rgba_scale has no initializer and only lane 0 is assigned
// before the whole vector is passed to the interleaves below. It looks like
// only the bytes of lane 0 reach the final 32-bit lanes - confirm, or
// initialize the whole vector.
1184 rgba_scale[0] = value;
// Double each scale byte into a 16-bit lane, then zero-extend to 32 bits.
1185 rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
1186 rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);
1187
1188 for (x = 0; x < width; x += 4) {
1189 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
// Same widening for the source bytes: doubled to 16 bits, zero-extended to 32.
1190 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
1191 vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
1192 reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
1193 reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
1194 reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
1195 reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
1196 reg0 *= rgba_scale;
1197 reg1 *= rgba_scale;
1198 reg2 *= rgba_scale;
1199 reg3 *= rgba_scale;
// Arithmetic shift right by 24 on unsigned products; only the low byte of
// each lane is kept by the packs that follow.
1200 reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
1201 reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
1202 reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
1203 reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
// Narrow 32 -> 16 -> 8 bits.
1204 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1205 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1206 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1207 ST_UB(dst0, dst_argb);
1208 src_argb += 16;
1209 dst_argb += 16;
1210 }
1211 }
1212
// ARGBGrayRow_MSA: replaces three bytes of every 4-byte pixel with a single
// weighted sum of the pixel's bytes (weights 0x0F, 0x4B and 0x26, result
// shifted right by 7 with rounding) and writes the result to dst_argb.
//
//   src_argb  input row, read 32 bytes (8 pixels) per iteration.
//   dst_argb  output row, written 32 bytes (8 pixels) per iteration.
//   width     pixel count; the loop steps 8 pixels at a time with no tail
//             handling.
//             NOTE(review): presumably width is a multiple of 8 or a wrapper
//             handles the remainder - confirm.
1213 void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
1214 int x;
1215 v16u8 src0, src1, vec0, vec1, dst0, dst1;
1216 v8u16 reg0;
// Byte-pair weights, viewed as v16u8: 0x26 (38) paired with 0, and
// 0x0F (15) paired with 0x4B (75). 15 + 75 + 38 = 128, matching the >> 7.
1217 v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
1218 v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
fbarchard1 2016/12/12 20:09:03 note these constants may need to be passed in as p
manojkumar.bhosale 2016/12/15 06:35:21 Acknowledged.
1219
1220 for (x = 0; x < width; x += 8) {
1221 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
1222 src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
// vec0: even 16-bit halves of each pixel (bytes 0,1); vec1: odd halves
// (bytes 2,3).
1223 vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
1224 vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
// Weighted sum of bytes 0..2 (byte 3 has weight 0), then rounding shift by 7.
1225 reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
1226 reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
1227 reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
// Replicate the sum and re-interleave it with bytes taken from vec1.
// NOTE(review): presumably this carries byte 3 (alpha) through unchanged -
// confirm.
1228 vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
1229 vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
1230 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
1231 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
1232 ST_UB2(dst0, dst1, dst_argb, 16);
1233 src_argb += 32;
1234 dst_argb += 32;
1235 }
1236 }
1237
// ARGBSepiaRow_MSA: in-place transform of a row of 4-byte pixels. Three
// weighted sums of each pixel's bytes are computed (one per output byte),
// shifted right by 7, and written back over the same buffer together with a
// byte taken from the original pixel.
//
//   dst_argb  row buffer; 32 bytes (8 pixels) are loaded from it and stored
//             back to the same address each iteration.
//   width     pixel count; the loop steps 8 pixels at a time with no tail
//             handling.
//             NOTE(review): presumably width is a multiple of 8 or a wrapper
//             handles the remainder - confirm.
1238 void ARGBSepiaRow_MSA(uint8* dst_argb, int width) {
1239 int x;
1240 v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
1241 v8u16 reg0, reg1, reg2;
// Weight pairs for the three sums. Each 0xHHLL constant supplies the weights
// for the first two bytes of a pixel; the single-byte constant supplies the
// weight for the third byte (paired with 0 for the fourth).
1242 v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
1243 v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
1244 v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
1245 v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
1246 v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
1247 v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
// Clamp bound (255) applied to reg1 and reg2 below.
1248 v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);
1249
1250 for (x = 0; x < width; x += 8) {
1251 src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
1252 src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
// vec0: bytes 0,1 of each pixel; vec1: bytes 2,3 of each pixel.
1253 vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
1254 vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
// vec3: odd bytes of vec1, saved before vec1 is reused below.
// NOTE(review): presumably the original alpha bytes - confirm.
1255 vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
1256 reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
1257 reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
1258 reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
1259 reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
1260 reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
1261 reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
// Shift right by 7. The arithmetic (signed) shift is applied to unsigned
// 16-bit sums.
// NOTE(review): sums with bit 15 set are sign-filled here; confirm the
// unsigned clamp below still yields the intended 255 in that case.
1262 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
1263 reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
1264 reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
// Only reg1 and reg2 are clamped to 255; reg0 is packed without a clamp.
1265 reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
1266 reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
// Narrow to bytes and interleave reg0, reg1, reg2 results with vec3 back into
// 4-byte pixels.
1267 vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
1268 vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
1269 vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
1270 vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
1271 vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
1272 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
1273 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
1274 ST_UB2(dst0, dst1, dst_argb, 16);
1275 dst_argb += 32;
1276 }
1277 }
1278
1041 void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, 1279 void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
1042 uint8* dst_argb, 1280 uint8* dst_argb,
1043 int width) { 1281 int width) {
1044 int x; 1282 int x;
1045 v16u8 src0, src1; 1283 v16u8 src0, src1;
1046 v8u16 vec0, vec1, vec2, vec3; 1284 v8u16 vec0, vec1, vec2, vec3;
1047 v16u8 dst0, dst1, dst2, dst3; 1285 v16u8 dst0, dst1, dst2, dst3;
1048 1286
1049 for (x = 0; x < width; x += 16) { 1287 for (x = 0; x < width; x += 16) {
1050 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0); 1288 src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
(...skipping 15 matching lines...) Expand all
1066 dst_argb += 64; 1304 dst_argb += 64;
1067 } 1305 }
1068 } 1306 }
1069 1307
1070 #ifdef __cplusplus 1308 #ifdef __cplusplus
1071 } // extern "C" 1309 } // extern "C"
1072 } // namespace libyuv 1310 } // namespace libyuv
1073 #endif 1311 #endif
1074 1312
1075 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 1313 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
OLD | NEW
« no previous file with comments | « source/row_any.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698