
Side by Side Diff: source/scale_msa.cc

Issue 2559683002: Add MSA optimized remaining scale row functions (Closed)
Patch Set: Changes as per review comments (created 4 years ago)
1 /*
2  * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3  *
4  * Use of this source code is governed by a BSD-style license
5  * that can be found in the LICENSE file in the root of the source
6  * tree. An additional intellectual property rights grant can be found
7  * in the file PATENTS. All contributing project authors may
8  * be found in the AUTHORS file in the root of the source tree.
9  */
10
(...skipping 151 matching lines...)
162     reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
163     reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
164     dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
165     ST_UB(dst0, dst_argb);
166     src_argb += stepx * 4;
167     nxt_argb += stepx * 4;
168   }
169 }
170
171 171
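// Note (annotation, not part of the patch): ScaleRowDown2_MSA keeps the
// second pixel of each horizontal pair (__msa_pckod_b selects the odd-indexed
// bytes), roughly the scalar dst[x] = src_ptr[2 * x + 1]; each iteration
// consumes 64 source bytes and produces 32 destination bytes.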
172 void ScaleRowDown2_MSA(const uint8_t* src_ptr,
173 ptrdiff_t src_stride,
174 uint8_t* dst,
175 int dst_width) {
176 int x;
177 v16u8 src0, src1, src2, src3, dst0, dst1;
178
179 for (x = 0; x < dst_width; x += 32) {
180 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
181 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
182 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
183 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
184 dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
185 dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
186 ST_UB2(dst0, dst1, dst, 16);
187 src_ptr += 64;
188 dst += 32;
189 }
190 }
191
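// Note (annotation, not part of the patch): ScaleRowDown2Linear_MSA averages
// each horizontal pair with rounding (__msa_aver_u_b computes (a + b + 1) >> 1),
// roughly the scalar dst[x] = (src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1.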
192 void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
193 ptrdiff_t src_stride,
194 uint8_t* dst,
195 int dst_width) {
196 int x;
197 v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
198
199 for (x = 0; x < dst_width; x += 32) {
200 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
201 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
202 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
203 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
204 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
205 vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
206 vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
207 vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
208 dst0 = __msa_aver_u_b(vec1, vec0);
209 dst1 = __msa_aver_u_b(vec3, vec2);
210 ST_UB2(dst0, dst1, dst, 16);
211 src_ptr += 64;
212 dst += 32;
213 }
214 }
215
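// Note (annotation, not part of the patch): ScaleRowDown2Box_MSA applies a
// rounding 2x2 box filter over two source rows: __msa_hadd_u_h sums
// horizontal byte pairs into halfwords, the two row sums are added, and
// __msa_srari_h(..., 2) rounds and shifts, roughly
// dst[x] = (s[2x] + s[2x + 1] + t[2x] + t[2x + 1] + 2) >> 2 with t = s + stride.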
216 void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
217 ptrdiff_t src_stride,
218 uint8_t* dst,
219 int dst_width) {
220 int x;
221 const uint8_t* nxt_ptr = src_ptr + src_stride;
222 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
223 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
224
225 for (x = 0; x < dst_width; x += 32) {
226 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
227 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
228 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
229 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
230 src4 = (v16u8)__msa_ld_b((v16i8*)nxt_ptr, 0);
231 src5 = (v16u8)__msa_ld_b((v16i8*)nxt_ptr, 16);
232 src6 = (v16u8)__msa_ld_b((v16i8*)nxt_ptr, 32);
233 src7 = (v16u8)__msa_ld_b((v16i8*)nxt_ptr, 48);
234 vec0 = __msa_hadd_u_h(src0, src0);
235 vec1 = __msa_hadd_u_h(src1, src1);
236 vec2 = __msa_hadd_u_h(src2, src2);
237 vec3 = __msa_hadd_u_h(src3, src3);
238 vec4 = __msa_hadd_u_h(src4, src4);
239 vec5 = __msa_hadd_u_h(src5, src5);
240 vec6 = __msa_hadd_u_h(src6, src6);
241 vec7 = __msa_hadd_u_h(src7, src7);
242 vec0 += vec4;
243 vec1 += vec5;
244 vec2 += vec6;
245 vec3 += vec7;
246 vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
247 vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
248 vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
249 vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
250 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
251 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
252 ST_UB2(dst0, dst1, dst, 16);
253 src_ptr += 64;
254 nxt_ptr += 64;
255 dst += 32;
256 }
257 }
258
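// Note (annotation, not part of the patch): ScaleRowDown4_MSA keeps one pixel
// out of every four: pckev followed by pckod leaves source bytes 2, 6, 10, ...,
// roughly dst[x] = src_ptr[4 * x + 2].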
259 void ScaleRowDown4_MSA(const uint8_t* src_ptr,
260 ptrdiff_t src_stride,
261 uint8_t* dst,
262 int dst_width) {
263 int x;
264 v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
265
266 for (x = 0; x < dst_width; x += 16) {
267 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
268 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
269 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
270 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
271 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
272 vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
273 dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
274 ST_UB(dst0, dst);
275 src_ptr += 64;
276 dst += 16;
277 }
278 }
279
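// Note (annotation, not part of the patch): ScaleRowDown4Box_MSA averages each
// 4x4 block of source pixels with rounding: byte pairs are summed to halfwords,
// four rows are accumulated, halfword pairs are summed to words, and
// __msa_srari_w(..., 4) yields roughly dst[x] = (sum of the 16 pixels + 8) >> 4.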
280 void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
281 ptrdiff_t src_stride,
282 uint8_t* dst,
283 int dst_width) {
284 int x;
285 const uint8_t* nxt0_ptr = src_ptr + src_stride;
286 const uint8_t* nxt1_ptr = src_ptr + src_stride * 2;
287 const uint8_t* nxt2_ptr = src_ptr + src_stride * 3;
288 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
289 v8u16 vec0, vec1, vec2, vec3;
290 v4u32 reg0, reg1, reg2, reg3;
291
292 for (x = 0; x < dst_width; x += 16) {
293 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
294 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
295 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
296 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
297 src4 = (v16u8)__msa_ld_b((v16i8*)nxt0_ptr, 0);
298 src5 = (v16u8)__msa_ld_b((v16i8*)nxt0_ptr, 16);
299 src6 = (v16u8)__msa_ld_b((v16i8*)nxt0_ptr, 32);
300 src7 = (v16u8)__msa_ld_b((v16i8*)nxt0_ptr, 48);
301 vec0 = __msa_hadd_u_h(src0, src0);
302 vec1 = __msa_hadd_u_h(src1, src1);
303 vec2 = __msa_hadd_u_h(src2, src2);
304 vec3 = __msa_hadd_u_h(src3, src3);
305 vec0 += __msa_hadd_u_h(src4, src4);
306 vec1 += __msa_hadd_u_h(src5, src5);
307 vec2 += __msa_hadd_u_h(src6, src6);
308 vec3 += __msa_hadd_u_h(src7, src7);
309 src0 = (v16u8)__msa_ld_b((v16i8*)nxt1_ptr, 0);
310 src1 = (v16u8)__msa_ld_b((v16i8*)nxt1_ptr, 16);
311 src2 = (v16u8)__msa_ld_b((v16i8*)nxt1_ptr, 32);
312 src3 = (v16u8)__msa_ld_b((v16i8*)nxt1_ptr, 48);
313 src4 = (v16u8)__msa_ld_b((v16i8*)nxt2_ptr, 0);
314 src5 = (v16u8)__msa_ld_b((v16i8*)nxt2_ptr, 16);
315 src6 = (v16u8)__msa_ld_b((v16i8*)nxt2_ptr, 32);
316 src7 = (v16u8)__msa_ld_b((v16i8*)nxt2_ptr, 48);
317 vec0 += __msa_hadd_u_h(src0, src0);
318 vec1 += __msa_hadd_u_h(src1, src1);
319 vec2 += __msa_hadd_u_h(src2, src2);
320 vec3 += __msa_hadd_u_h(src3, src3);
321 vec0 += __msa_hadd_u_h(src4, src4);
322 vec1 += __msa_hadd_u_h(src5, src5);
323 vec2 += __msa_hadd_u_h(src6, src6);
324 vec3 += __msa_hadd_u_h(src7, src7);
325 reg0 = __msa_hadd_u_w(vec0, vec0);
326 reg1 = __msa_hadd_u_w(vec1, vec1);
327 reg2 = __msa_hadd_u_w(vec2, vec2);
328 reg3 = __msa_hadd_u_w(vec3, vec3);
329 reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
330 reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
331 reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
332 reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
333 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
334 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
335 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
336 ST_UB(dst0, dst);
337 src_ptr += 64;
338 nxt0_ptr += 64;
339 nxt1_ptr += 64;
340 nxt2_ptr += 64;
341 dst += 16;
342 }
343 }
344
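// Note (annotation, not part of the patch): ScaleRowDown38_MSA point-samples
// 3 pixels from every 8: the shuffle mask keeps bytes 0, 3 and 6 of each
// 8-byte group, so 32 source pixels produce 12 output pixels per iteration
// (dst_width must be a multiple of 3).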
345 void ScaleRowDown38_MSA(const uint8_t* src_ptr,
346 ptrdiff_t src_stride,
347 uint8_t* dst,
348 int dst_width) {
349 int x, width;
350 uint64_t dst0;
351 uint32_t dst1;
352 v16u8 src0, src1, vec0;
353 v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
354
355 assert(dst_width % 3 == 0);
356 width = dst_width / 3;
357
358 for (x = 0; x < width; x += 4) {
359 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
360 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
361 vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
362 dst0 = __msa_copy_u_d((v2i64)vec0, 0);
363 dst1 = __msa_copy_u_w((v4i32)vec0, 2);
364 SD(dst0, dst);
365 SW(dst1, dst + 8);
366 src_ptr += 32;
367 dst += 12;
368 }
369 }
370
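// Note (annotation, not part of the patch): ScaleRowDown38_2_Box_MSA averages
// over two source rows with fixed-point weights: 10922 is roughly 65536 / 6
// and 16384 is 65536 / 4, so after the >> 16 two of every three outputs appear
// to be the mean of a 3x2 block and the third the mean of a 2x2 block,
// matching the 8 -> 3 column pattern.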
371 void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
372 ptrdiff_t src_stride,
373 uint8_t* dst_ptr,
374 int dst_width) {
375 const uint8_t* nxt_ptr = src_ptr + src_stride;
376 int x, width;
377 uint64_t dst0;
378 uint32_t dst1;
379 v16u8 src0, src1, src2, src3, out;
380 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
381 v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
382 v8i16 zero = {0};
383 v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
384 v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
385 v4u32 const_10922 = (v4u32)__msa_fill_w(10922);
386 v4u32 const_16384 = (v4u32)__msa_fill_w(16384);
387
388 assert((dst_width % 3 == 0) && (dst_width > 0));
389 width = dst_width / 3;
390
391 for (x = 0; x < width; x += 4) {
392 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
393 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
394 src2 = (v16u8)__msa_ld_b((v16i8*)nxt_ptr, 0);
395 src3 = (v16u8)__msa_ld_b((v16i8*)nxt_ptr, 16);
396 vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
397 vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
398 vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
399 vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
400 vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
401 vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
402 vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
403 vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
404 vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
405 vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
406 vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
407 vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
408 vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
409 vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
410 vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
411 tmp0 = __msa_hadd_u_w(vec4, vec4);
412 tmp1 = __msa_hadd_u_w(vec5, vec5);
413 tmp2 = __msa_hadd_u_w(vec6, vec6);
414 tmp3 = __msa_hadd_u_w(vec7, vec7);
415 tmp4 = __msa_hadd_u_w(vec0, vec0);
416 vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
417 vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
418 tmp0 = __msa_hadd_u_w(vec0, vec0);
419 tmp1 = __msa_hadd_u_w(vec1, vec1);
420 tmp0 *= const_10922;
421 tmp1 *= const_10922;
422 tmp4 *= const_16384;
423 tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
424 tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
425 tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
426 vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
427 vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
428 out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
429 dst0 = __msa_copy_u_d((v2i64)out, 0);
430 dst1 = __msa_copy_u_w((v4i32)out, 2);
431 SD(dst0, dst_ptr);
432 SW(dst1, dst_ptr + 8);
433 src_ptr += 32;
434 nxt_ptr += 32;
435 dst_ptr += 12;
436 }
437 }
438
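// Note (annotation, not part of the patch): ScaleRowDown38_3_Box_MSA is the
// three-row variant: 7281 is roughly 65536 / 9 (mean of a 3x3 block) and
// 10922 roughly 65536 / 6 (mean of the trailing 2x3 block), again producing
// 3 output pixels per 8 source columns.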
439 void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
440 ptrdiff_t src_stride,
441 uint8_t* dst_ptr,
442 int dst_width) {
443 int x, width;
444 const uint8_t* nxt0_ptr = src_ptr + src_stride;
445 const uint8_t* nxt1_ptr = src_ptr + src_stride * 2;
446 uint64_t dst0;
447 uint32_t dst1;
448 v16u8 src0, src1, src2, src3, src4, src5, out;
449 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
450 v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
451 v8u16 zero = {0};
452 v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
453 v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
454 v4u32 const_7281 = (v4u32)__msa_fill_w(7281);
455 v4u32 const_10922 = (v4u32)__msa_fill_w(10922);
456
457 assert((dst_width % 3 == 0) && (dst_width > 0));
458 width = dst_width / 3;
459
460 for (x = 0; x < width; x += 4) {
461 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
462 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
463 src2 = (v16u8)__msa_ld_b((v16i8*)nxt0_ptr, 0);
464 src3 = (v16u8)__msa_ld_b((v16i8*)nxt0_ptr, 16);
465 src4 = (v16u8)__msa_ld_b((v16i8*)nxt1_ptr, 0);
466 src5 = (v16u8)__msa_ld_b((v16i8*)nxt1_ptr, 16);
467 vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
468 vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
469 vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
470 vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
471 vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
472 vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
473 vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
474 vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
475 vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
476 vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
477 vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
478 vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
479 vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
480 vec5 = __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
481 vec6 = __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
482 vec7 = __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
483 vec0 += vec4;
484 vec1 += vec5;
485 vec2 += vec6;
486 vec3 += vec7;
487 vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
488 vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
489 vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
490 vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
491 vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
492 vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
493 vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
494 tmp0 = __msa_hadd_u_w(vec4, vec4);
495 tmp1 = __msa_hadd_u_w(vec5, vec5);
496 tmp2 = __msa_hadd_u_w(vec6, vec6);
497 tmp3 = __msa_hadd_u_w(vec7, vec7);
498 tmp4 = __msa_hadd_u_w(vec0, vec0);
499 vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
500 vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
501 tmp0 = __msa_hadd_u_w(vec0, vec0);
502 tmp1 = __msa_hadd_u_w(vec1, vec1);
503 tmp0 *= const_7281;
504 tmp1 *= const_7281;
505 tmp4 *= const_10922;
506 tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
507 tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
508 tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
509 vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
510 vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
511 out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
512 dst0 = __msa_copy_u_d((v2i64)out, 0);
513 dst1 = __msa_copy_u_w((v4i32)out, 2);
514 SD(dst0, dst_ptr);
515 SW(dst1, dst_ptr + 8);
516 src_ptr += 32;
517 nxt0_ptr += 32;
518 nxt1_ptr += 32;
519 dst_ptr += 12;
520 }
521 }
522
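// Note (annotation, not part of the patch): ScaleAddRow_MSA widens 16 source
// bytes to 16-bit lanes and accumulates them into the destination, roughly
// dst_ptr[x] += src_ptr[x] for 16 pixels per iteration.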
523 void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
524 int x;
525 v16u8 src0;
526 v8u16 vec0, vec1, dst0, dst1;
527 v16i8 zero = {0};
528
529 assert(src_width > 0);
530
531 for (x = 0; x < src_width; x += 16) {
532 src0 = LD_UB(src_ptr);
533 dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
534 dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
535 vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
536 vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
537 dst0 += vec0;
538 dst1 += vec1;
539 ST_UH2(dst0, dst1, dst_ptr, 8);
540 src_ptr += 16;
541 dst_ptr += 16;
542 }
543 }
544
545 #ifdef __cplusplus
546 }  // extern "C"
547 }  // namespace libyuv
548 #endif
549
550 #endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)