Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(407)

Side by Side Diff: source/row_msa.cc

Issue 2421843002: Add MSA optimized ARGB4444ToI420 and ARGB4444ToARGB functions (Closed)
Patch Set: Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« source/convert.cc ('K') | « source/row_any.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 203 matching lines...) Expand 10 before | Expand all | Expand 10 after
214 dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0); 214 dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
215 dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0); 215 dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
216 ST_UB(dst0, dst_u); 216 ST_UB(dst0, dst_u);
217 ST_UB(dst1, dst_v); 217 ST_UB(dst1, dst_v);
218 src_uyvy += 64; 218 src_uyvy += 64;
219 dst_u += 16; 219 dst_u += 16;
220 dst_v += 16; 220 dst_v += 16;
221 } 221 }
222 } 222 }
223 223
// ARGB4444ToYRow_MSA: produces one output byte per 16-bit source pixel,
// 16 pixels per loop iteration.
//
//   src_argb4444  source row; reinterpreted as 16-bit values below.
//   dst_y         destination row, one byte per pixel.
//   width         pixel count. The loop advances in steps of 16 with no tail
//                 handling, so the final iteration still consumes and
//                 produces a full group of 16 when width is not a multiple
//                 of 16 (presumably the caller pads or handles the
//                 remainder -- confirm).
//
// Per pixel the stored byte is the low byte of
//   (f0 * 0x19 + f1 * 0x81 + f2 * 0x42 + 0x1080) >> 8
// where f0, f1 and f2 are bits 0-3, 4-7 and 8-11 of the pixel, each widened
// to 8 bits. Bits 12-15 are never read. Going by the format name, f0/f1/f2
// are presumably blue/green/red and the unused field alpha -- confirm.
224 void ARGB4444ToYRow_MSA(const uint8* src_argb4444, uint8* dst_y, int width) {
225 int x;
226 const uint16* src_argb4444_u16 = (uint16*) src_argb4444;
227 v8u16 src0, src1;
228 v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
229 v16u8 dst0;
// Per-lane multipliers, rounding/offset term and nibble mask.
230 v8u16 const_0x19 = (v8u16) __msa_ldi_h(0x19);
231 v8u16 const_0x81 = (v8u16) __msa_ldi_h(0x81);
232 v8u16 const_0x42 = (v8u16) __msa_ldi_h(0x42);
233 v8u16 const_0x1080 = (v8u16) __msa_fill_h(0x1080);
234 v8u16 const_0x0F = (v8u16) __msa_ldi_h(0x0F);
235
236 for (x = 0; x < width; x += 16) {
// Load 16 source pixels (assumes LD_UH2 fills src0 and src1 with 8
// halfwords each -- confirm against the macro definition).
237 LD_UH2(src_argb4444_u16, 8, src0, src1);
// Isolate the three low 4-bit fields one at a time. The shifts are
// arithmetic, but any sign bits they pull in are discarded by the 0x0F mask.
238 vec0 = src0 & const_0x0F;
239 vec1 = src1 & const_0x0F;
240 src0 = (v8u16) __msa_srai_h((v8i16) src0, 4);
241 src1 = (v8u16) __msa_srai_h((v8i16) src1, 4);
242 vec2 = src0 & const_0x0F;
243 vec3 = src1 & const_0x0F;
244 src0 = (v8u16) __msa_srai_h((v8i16) src0, 4);
245 src1 = (v8u16) __msa_srai_h((v8i16) src1, 4);
246 vec4 = src0 & const_0x0F;
247 vec5 = src1 & const_0x0F;
// Widen each 4-bit value to 8 bits by copying it into the high nibble
// (v | v << 4).
248 vec0 |= (v8u16) __msa_slli_h((v8i16) vec0, 4);
249 vec1 |= (v8u16) __msa_slli_h((v8i16) vec1, 4);
250 vec2 |= (v8u16) __msa_slli_h((v8i16) vec2, 4);
251 vec3 |= (v8u16) __msa_slli_h((v8i16) vec3, 4);
252 vec4 |= (v8u16) __msa_slli_h((v8i16) vec4, 4);
253 vec5 |= (v8u16) __msa_slli_h((v8i16) vec5, 4);
// Weighted sum of the three fields, accumulated into vec0/vec1, plus the
// 0x1080 term.
254 vec0 *= const_0x19;
fbarchard1 2016/10/14 21:35:16 FYI The YUV to RGB functions now take constants as
manojkumar.bhosale 2016/10/19 11:56:27 OK. will then fix them when changes happens.
255 vec1 *= const_0x19;
256 vec2 *= const_0x81;
257 vec3 *= const_0x81;
258 vec4 *= const_0x42;
259 vec5 *= const_0x42;
260 vec0 += vec2;
261 vec1 += vec3;
262 vec0 += vec4;
263 vec1 += vec5;
264 vec0 += const_0x1080;
265 vec1 += const_0x1080;
// NOTE(review): the sum can reach 255 * (0x19 + 0x81 + 0x42) + 0x1080 =
// 60324, which is above 0x7FFF, so this arithmetic shift sign-fills the high
// byte. Only the low byte of each halfword appears to be kept by the
// pckev_b below, so the stored value should be unaffected -- confirm.
266 vec0 = (v8u16) __msa_srai_h((v8i16) vec0, 8);
267 vec1 = (v8u16) __msa_srai_h((v8i16) vec1, 8);
268 dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
269 ST_UB(dst0, dst_y);
// Advance by 16 pixels: 16 halfwords of input, 16 bytes of output.
270 src_argb4444_u16 += 16;
271 dst_y += 16;
272 }
273 }
274
// ARGB4444ToUVRow_MSA: reads two source rows and writes one byte to dst_u
// and one byte to dst_v for every 2x2 group of 16-bit source pixels.
//
//   src_argb4444         first source row.
//   src_stride_argb4444  byte offset from the first row to the second row.
//   dst_u, dst_v         destination rows.
//   width                pixel count. Each iteration covers 32 pixels
//                        (64 source bytes per row) and advances dst_u/dst_v
//                        by 16; there is no tail handling, so the last
//                        iteration is always a full one (presumably the
//                        caller pads or handles the remainder -- confirm).
//
// With s0, s1, s2 denoting the 2x2-combined values of the low nibble of the
// even bytes, the high nibble of the even bytes and the low nibble of the
// odd bytes, the stored bytes are the low bytes of
//   dst_u: ((s0 * 0x70 + 0x8080) - (s1 * 0x4A + s2 * 0x26)) >> 8
//   dst_v: ((s2 * 0x70 + 0x8080) - (s1 * 0x5E + s0 * 0x12)) >> 8
// computed in 16-bit lanes. Going by the format name, s0/s1/s2 are
// presumably blue/green/red -- confirm. The high nibble of the odd bytes
// (presumably alpha) is summed but never used afterwards.
275 void ARGB4444ToUVRow_MSA(const uint8* src_argb4444,
276 int src_stride_argb4444,
277 uint8* dst_u, uint8* dst_v, int width) {
278 int x;
// Second input row, src_stride_argb4444 bytes after the first.
279 const uint8* src_argb4444_next = src_argb4444 + src_stride_argb4444;
280 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
281 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
282 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
283 v16u8 dst0, dst1;
284 v8u16 const_0x70 = (v8u16) __msa_ldi_h(0x70);
285 v8u16 const_0x4A = (v8u16) __msa_ldi_h(0x4A);
286 v8u16 const_0x26 = (v8u16) __msa_ldi_h(0x26);
287 v8u16 const_0x5E = (v8u16) __msa_ldi_h(0x5E);
288 v8u16 const_0x12 = (v8u16) __msa_ldi_h(0x12);
289 v8u16 const_0x8080 = (v8u16) __msa_fill_h(0x8080);
290
291 for (x = 0; x < width; x += 32) {
// Load 64 bytes from each of the two rows (assumes LD_UB4 fills four
// 16-byte vectors at a 16-byte stride -- confirm against the macro).
292 LD_UB4(src_argb4444, 16, src0, src1, src2, src3);
293 LD_UB4(src_argb4444_next, 16, src4, src5, src6, src7);
// reg0..reg3: per-byte sum of the low nibbles of the two rows.
294 reg0 = __msa_andi_b(src0, 0x0F);
295 reg1 = __msa_andi_b(src1, 0x0F);
296 reg2 = __msa_andi_b(src2, 0x0F);
297 reg3 = __msa_andi_b(src3, 0x0F);
298 reg0 += __msa_andi_b(src4, 0x0F);
299 reg1 += __msa_andi_b(src5, 0x0F);
300 reg2 += __msa_andi_b(src6, 0x0F);
301 reg3 += __msa_andi_b(src7, 0x0F);
// reg4..reg7: per-byte sum of the high nibbles of the two rows, moved down
// into the low nibble position first.
302 src0 = __msa_andi_b(src0, 0xF0);
303 src1 = __msa_andi_b(src1, 0xF0);
304 src2 = __msa_andi_b(src2, 0xF0);
305 src3 = __msa_andi_b(src3, 0xF0);
306 src4 = __msa_andi_b(src4, 0xF0);
307 src5 = __msa_andi_b(src5, 0xF0);
308 src6 = __msa_andi_b(src6, 0xF0);
309 src7 = __msa_andi_b(src7, 0xF0);
310 reg4 = (v16u8) __msa_srli_b((v16i8) src0, 4);
311 reg5 = (v16u8) __msa_srli_b((v16i8) src1, 4);
312 reg6 = (v16u8) __msa_srli_b((v16i8) src2, 4);
313 reg7 = (v16u8) __msa_srli_b((v16i8) src3, 4);
314 reg4 += (v16u8) __msa_srli_b((v16i8) src4, 4);
315 reg5 += (v16u8) __msa_srli_b((v16i8) src5, 4);
316 reg6 += (v16u8) __msa_srli_b((v16i8) src6, 4);
317 reg7 += (v16u8) __msa_srli_b((v16i8) src7, 4);
// Separate the sums by byte position: reg8/reg9 take the odd bytes of the
// low-nibble sums, reg0/reg1 the even bytes of the low-nibble sums and
// reg2/reg3 the even bytes of the high-nibble sums. The odd bytes of the
// high-nibble sums are not extracted.
318 reg8 = (v16u8) __msa_pckod_b((v16i8) reg1, (v16i8) reg0);
319 reg9 = (v16u8) __msa_pckod_b((v16i8) reg3, (v16i8) reg2);
320 reg0 = (v16u8) __msa_pckev_b((v16i8) reg1, (v16i8) reg0);
321 reg1 = (v16u8) __msa_pckev_b((v16i8) reg3, (v16i8) reg2);
322 reg2 = (v16u8) __msa_pckev_b((v16i8) reg5, (v16i8) reg4);
323 reg3 = (v16u8) __msa_pckev_b((v16i8) reg7, (v16i8) reg6);
// Horizontal step: hadd_u_h with the same operand twice should add each
// pair of adjacent bytes into a halfword, giving a four-sample (2x2) sum
// per lane -- confirm against the MSA reference.
324 vec0 = __msa_hadd_u_h(reg0, reg0);
325 vec1 = __msa_hadd_u_h(reg1, reg1);
326 vec2 = __msa_hadd_u_h(reg2, reg2);
327 vec3 = __msa_hadd_u_h(reg3, reg3);
328 vec4 = __msa_hadd_u_h(reg8, reg8);
329 vec5 = __msa_hadd_u_h(reg9, reg9);
// Scale the four-nibble sum (at most 60) to an 8-bit range: multiply by 4,
// then OR in the top two bits of the result as the low two bits.
330 vec0 = (v8u16) __msa_slli_h((v8i16) vec0, 2);
331 vec1 = (v8u16) __msa_slli_h((v8i16) vec1, 2);
332 vec2 = (v8u16) __msa_slli_h((v8i16) vec2, 2);
333 vec3 = (v8u16) __msa_slli_h((v8i16) vec3, 2);
334 vec4 = (v8u16) __msa_slli_h((v8i16) vec4, 2);
335 vec5 = (v8u16) __msa_slli_h((v8i16) vec5, 2);
336 vec0 |= (v8u16) __msa_srai_h((v8i16) vec0, 6);
337 vec1 |= (v8u16) __msa_srai_h((v8i16) vec1, 6);
338 vec2 |= (v8u16) __msa_srai_h((v8i16) vec2, 6);
339 vec3 |= (v8u16) __msa_srai_h((v8i16) vec3, 6);
340 vec4 |= (v8u16) __msa_srai_h((v8i16) vec4, 6);
341 vec5 |= (v8u16) __msa_srai_h((v8i16) vec5, 6);
fbarchard1 2016/10/14 21:35:16 I'm concerned that this is a lot of code for a for
manojkumar.bhosale 2016/10/19 11:56:27 Done.
// dst_u terms: vec6/vec7 hold the positive part (vec0/vec1 * 0x70 + 0x8080),
// vec8/vec9 the part to subtract (vec2/vec3 * 0x4A + vec4/vec5 * 0x26).
// dst_v terms: vec4/vec5 become the positive part (* 0x70 + 0x8080) and
// vec2/vec3 the part to subtract (vec2/vec3 * 0x5E + vec0/vec1 * 0x12).
// vec0..vec5 are overwritten in place, so the statement order matters.
342 vec6 = vec0 * const_0x70;
343 vec7 = vec1 * const_0x70;
344 vec8 = vec2 * const_0x4A;
345 vec9 = vec3 * const_0x4A;
346 vec0 *= const_0x12;
347 vec1 *= const_0x12;
348 vec2 *= const_0x5E;
349 vec3 *= const_0x5E;
350 vec6 += const_0x8080;
351 vec7 += const_0x8080;
352 vec8 += vec4 * const_0x26;
353 vec9 += vec5 * const_0x26;
354 vec4 *= const_0x70;
355 vec5 *= const_0x70;
356 vec2 += vec0;
357 vec3 += vec1;
358 vec4 += const_0x8080;
359 vec5 += const_0x8080;
360 vec0 = vec6 - vec8;
361 vec1 = vec7 - vec9;
362 vec2 = vec4 - vec2;
363 vec3 = vec5 - vec3;
// Keep the high byte of each 16-bit result and pack 16 of them per output.
364 vec0 = (v8u16) __msa_srli_h((v8i16) vec0, 8);
365 vec1 = (v8u16) __msa_srli_h((v8i16) vec1, 8);
366 vec2 = (v8u16) __msa_srli_h((v8i16) vec2, 8);
367 vec3 = (v8u16) __msa_srli_h((v8i16) vec3, 8);
368 dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
369 dst1 = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
370 ST_UB(dst0, dst_u);
371 ST_UB(dst1, dst_v);
// Advance both rows by 64 bytes (32 pixels) and each output by 16 bytes.
372 src_argb4444 += 64;
373 src_argb4444_next += 64;
fbarchard1 2016/10/14 21:35:16 on other platforms I'd typically unroll less than
374 dst_u += 16;
375 dst_v += 16;
376 }
377 }
378
// ARGB4444ToARGBRow_MSA: expands every 4-bit field of the source into an
// 8-bit output byte by duplicating the nibble (n -> n << 4 | n), so each
// 2-byte source pixel becomes 4 output bytes.
//
//   src_argb4444  source row, 2 bytes per pixel.
//   dst_argb      destination row, 4 bytes per pixel.
//   width         pixel count. Each iteration handles 16 pixels (32 source
//                 bytes in, 64 bytes out) with no tail handling, so the last
//                 iteration is always a full one (presumably the caller pads
//                 or handles the remainder -- confirm).
//
// Output byte order per source byte is: expanded low nibble, then expanded
// high nibble.
379 void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, uint8* dst_argb,
380 int width) {
381 int x;
382 v16u8 src0, src1;
383 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
384 v16u8 dst0, dst1, dst2, dst3;
385
386 for (x = 0; x < width; x += 16) {
// Load 32 source bytes (assumes LD_UB2 fills two 16-byte vectors at a
// 16-byte stride -- confirm against the macro).
387 LD_UB2(src_argb4444, 16, src0, src1);
// vec0/vec1: low nibble of every byte; vec2/vec3: high nibble, still in the
// high position.
388 vec0 = (v8u16) __msa_andi_b(src0, 0x0F);
389 vec1 = (v8u16) __msa_andi_b(src1, 0x0F);
390 vec2 = (v8u16) __msa_andi_b(src0, 0xF0);
391 vec3 = (v8u16) __msa_andi_b(src1, 0xF0);
// Copy each nibble into the other half of its byte. The shifts are
// byte-wise, so nothing crosses a byte boundary.
392 vec4 = (v8u16) __msa_slli_b((v16i8) vec0, 4);
393 vec5 = (v8u16) __msa_slli_b((v16i8) vec1, 4);
394 vec6 = (v8u16) __msa_srli_b((v16i8) vec2, 4);
395 vec7 = (v8u16) __msa_srli_b((v16i8) vec3, 4);
396 vec0 |= vec4;
397 vec1 |= vec5;
398 vec2 |= vec6;
399 vec3 |= vec7;
// Interleave the expanded low-nibble bytes with the expanded high-nibble
// bytes; ilvr/ilvl produce the two halves of each interleaved result.
400 dst0 = (v16u8) __msa_ilvr_b((v16i8) vec2, (v16i8) vec0);
401 dst1 = (v16u8) __msa_ilvl_b((v16i8) vec2, (v16i8) vec0);
402 dst2 = (v16u8) __msa_ilvr_b((v16i8) vec3, (v16i8) vec1);
403 dst3 = (v16u8) __msa_ilvl_b((v16i8) vec3, (v16i8) vec1);
404 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
// Advance by 16 pixels: 32 bytes of input, 64 bytes of output.
405 src_argb4444 += 32;
406 dst_argb += 64;
407 }
408 }
409
224 #ifdef __cplusplus 410 #ifdef __cplusplus
225 } // extern "C" 411 } // extern "C"
226 } // namespace libyuv 412 } // namespace libyuv
227 #endif 413 #endif
228 414
229 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 415 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
OLDNEW
« source/convert.cc ('K') | « source/row_any.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698