Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(180)

Side by Side Diff: source/scale_msa.cc

Issue 2559683002: Add MSA optimized remaining scale row functions (Closed)
Patch Set: Variable renaming and rework Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« include/libyuv/macros_msa.h ('K') | « source/scale_any.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

11 #include <assert.h>
12
11 #include "libyuv/scale_row.h" 13 #include "libyuv/scale_row.h"
12 14
13 // This module is for GCC MSA 15 // This module is for GCC MSA
14 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 16 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
15 #include "libyuv/macros_msa.h" 17 #include "libyuv/macros_msa.h"
16 18
17 #ifdef __cplusplus 19 #ifdef __cplusplus
18 namespace libyuv { 20 namespace libyuv {
19 extern "C" { 21 extern "C" {
20 #endif 22 #endif
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after
162 reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); 164 reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
163 reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); 165 reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
164 dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); 166 dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
165 ST_UB(dst0, dst_argb); 167 ST_UB(dst0, dst_argb);
166 src_argb += stepx * 4; 168 src_argb += stepx * 4;
167 nxt_argb += stepx * 4; 169 nxt_argb += stepx * 4;
168 dst_argb += 16; 170 dst_argb += 16;
169 } 171 }
170 } 172 }
171 173
174 void ScaleRowDown2_MSA(const uint8_t* src_ptr,
175 ptrdiff_t src_stride,
176 uint8_t* dst,
177 int dst_width) {
178 int x;
179 v16u8 src0, src1, src2, src3, dst0, dst1;
180
181 for (x = 0; x < dst_width; x += 32) {
182 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
183 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
184 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
185 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
186 dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
187 dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
188 ST_UB2(dst0, dst1, dst, 16);
189 src_ptr += 64;
190 dst += 32;
191 }
192 }
193
194 void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
195 ptrdiff_t src_stride,
196 uint8_t* dst,
197 int dst_width) {
198 int x;
199 v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
200
201 for (x = 0; x < dst_width; x += 32) {
202 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
203 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
204 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
205 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
206 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
207 vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
208 vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
209 vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
210 dst0 = __msa_aver_u_b(vec1, vec0);
211 dst1 = __msa_aver_u_b(vec3, vec2);
212 ST_UB2(dst0, dst1, dst, 16);
213 src_ptr += 64;
214 dst += 32;
215 }
216 }
217
218 void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
219 ptrdiff_t src_stride,
220 uint8_t* dst,
221 int dst_width) {
222 int x;
223 const uint8_t* s = src_ptr;
224 const uint8_t* t = src_ptr + src_stride;
225 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
226 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
227
228 for (x = 0; x < dst_width; x += 32) {
229 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
230 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
231 src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
232 src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
233 src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
234 src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
235 src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
236 src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
237 vec0 = __msa_hadd_u_h(src0, src0);
238 vec1 = __msa_hadd_u_h(src1, src1);
239 vec2 = __msa_hadd_u_h(src2, src2);
240 vec3 = __msa_hadd_u_h(src3, src3);
241 vec0 += __msa_hadd_u_h(src4, src4);
242 vec1 += __msa_hadd_u_h(src5, src5);
243 vec2 += __msa_hadd_u_h(src6, src6);
244 vec3 += __msa_hadd_u_h(src7, src7);
245 vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
246 vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
247 vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
248 vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
249 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
250 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
251 ST_UB2(dst0, dst1, dst, 16);
252 s += 64;
253 t += 64;
254 dst += 32;
255 }
256 }
257
258 void ScaleRowDown4_MSA(const uint8_t* src_ptr,
259 ptrdiff_t src_stride,
260 uint8_t* dst,
261 int dst_width) {
262 int x;
263 v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
264
265 for (x = 0; x < dst_width; x += 16) {
266 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
267 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
268 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
269 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
270 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
271 vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
272 dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
273 ST_UB(dst0, dst);
274 src_ptr += 64;
275 dst += 16;
276 }
277 }
278
// Scales down 4x in both directions with box filtering: each output byte
// is the rounded average of a 4x4 block spanning four source rows.
// Consumes 64 bytes from each of the four rows and produces 16 destination
// bytes per iteration.
void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  int x;
  const uint8_t* s = src_ptr;
  const uint8_t* t0 = s + src_stride;      // Rows 1..3 of the 4-row block.
  const uint8_t* t1 = s + src_stride * 2;
  const uint8_t* t2 = s + src_stride * 3;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
  v8u16 vec0, vec1, vec2, vec3;
  v4u32 reg0, reg1, reg2, reg3;

  for (x = 0; x < dst_width; x += 16) {
    // Load rows 0 and 1.
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
    // Horizontal pairwise sums of rows 0 and 1, accumulated into vec0..3.
    vec0 = __msa_hadd_u_h(src0, src0);
    vec1 = __msa_hadd_u_h(src1, src1);
    vec2 = __msa_hadd_u_h(src2, src2);
    vec3 = __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    // Reuse the src registers for rows 2 and 3.
    src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
    vec0 += __msa_hadd_u_h(src0, src0);
    vec1 += __msa_hadd_u_h(src1, src1);
    vec2 += __msa_hadd_u_h(src2, src2);
    vec3 += __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    // Widen to 32 bits, summing adjacent column pairs: each lane now holds
    // the full 4x4 (16-pixel) block sum.
    reg0 = __msa_hadd_u_w(vec0, vec0);
    reg1 = __msa_hadd_u_w(vec1, vec1);
    reg2 = __msa_hadd_u_w(vec2, vec2);
    reg3 = __msa_hadd_u_w(vec3, vec3);
    // Divide by 16 with rounding.
    reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
    reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
    reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
    reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
    // Narrow 32 -> 16 -> 8 bits and store 16 result bytes.
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst);
    s += 64;
    t0 += 64;
    t1 += 64;
    t2 += 64;
    dst += 16;
  }
}
344
345 void ScaleRowDown38_MSA(const uint8_t* src_ptr,
346 ptrdiff_t src_stride,
347 uint8_t* dst,
348 int dst_width) {
349 int x, width;
350 uint64_t dst0;
351 uint32_t dst1;
352 v16u8 src0, src1, vec0;
353 v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
354
355 assert(dst_width % 3 == 0);
356 width = dst_width / 3;
357
358 for (x = 0; x < width; x += 4) {
359 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
360 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
361 vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
362 dst0 = __msa_copy_u_d((v2i64)vec0, 0);
363 dst1 = __msa_copy_u_w((v4i32)vec0, 2);
364 SD(dst0, dst);
365 SW(dst1, dst + 8);
366 src_ptr += 32;
367 dst += 12;
368 }
369 }
370
// Scales down to 3/8 width with box filtering over 2 source rows: each
// group of 8 source columns becomes 3 output bytes, the first two being
// averages of 3x2 = 6 pixels and the third an average of 2x2 = 4 pixels.
// Averaging is done with Q16 fixed-point multipliers:
//   0x2AAA ~= 65536 / 6  and  0x4000 == 65536 / 4.
// Produces 12 output bytes per 32 input bytes; dst_width must be a
// positive multiple of 3 (asserted).
void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  int x, width;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;  // Second source row.
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, src2, src3, out;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
  v8i16 zero = {0};
  // Gathers three column sums plus a zero pad per 4-lane group.
  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
  // Interleaves two 6-pixel averages with one 4-pixel average per triplet.
  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
  v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);

  assert((dst_width % 3 == 0) && (dst_width > 0));
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    // Interleave the two rows byte-wise, then hadd pairs so each 16-bit
    // lane holds the vertical sum (row0 + row1) of one column.
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    // Regroup column sums as {c0,c1,c2,0, c3,c4,c5,0} (zeros from `zero`).
    vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
    vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
    vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
    vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
    // Collect the leftover columns (6 and 7 of each 8-column group).
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    tmp0 = __msa_hadd_u_w(vec4, vec4);
    tmp1 = __msa_hadd_u_w(vec5, vec5);
    tmp2 = __msa_hadd_u_w(vec6, vec6);
    tmp3 = __msa_hadd_u_w(vec7, vec7);
    // tmp4: sums of 2 columns x 2 rows = 4 pixels.
    tmp4 = __msa_hadd_u_w(vec0, vec0);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    // tmp0/tmp1: sums of 3 columns x 2 rows = 6 pixels.
    tmp0 = __msa_hadd_u_w(vec0, vec0);
    tmp1 = __msa_hadd_u_w(vec1, vec1);
    // Fixed-point divide: multiply by reciprocal and drop 16 fraction bits.
    tmp0 *= const_0x2AAA;
    tmp1 *= const_0x2AAA;
    tmp4 *= const_0x4000;
    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
    // Shuffle the averages into output order and store 8 + 4 = 12 bytes.
    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
    dst0 = __msa_copy_u_d((v2i64)out, 0);
    dst1 = __msa_copy_u_w((v4i32)out, 2);
    SD(dst0, dst_ptr);
    SW(dst1, dst_ptr + 8);
    s += 32;
    t += 32;
    dst_ptr += 12;
  }
}
439
// Scales down to 3/8 width with box filtering over 3 source rows: each
// group of 8 source columns becomes 3 output bytes, the first two being
// averages of 3x3 = 9 pixels and the third an average of 2x3 = 6 pixels.
// Averaging uses Q16 fixed-point multipliers:
//   0x1C71 ~= 65536 / 9  and  0x2AAA ~= 65536 / 6.
// Produces 12 output bytes per 32 input bytes; dst_width must be a
// positive multiple of 3 (asserted).
void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  int x, width;
  const uint8_t* s = src_ptr;
  const uint8_t* t0 = s + src_stride;      // Rows 1 and 2 of the block.
  const uint8_t* t1 = s + src_stride * 2;
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, src2, src3, src4, src5, out;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
  v8u16 zero = {0};
  // Gathers three column sums plus a zero pad per 4-lane group.
  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
  // Interleaves two 9-pixel averages with one 6-pixel average per triplet.
  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
  v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);

  assert((dst_width % 3 == 0) && (dst_width > 0));
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
    src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
    // Interleave rows 0 and 1 byte-wise; zero-interleave row 2 so its
    // hadd below simply widens the bytes to 16 bits.
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
    vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
    vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
    vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
    // vec0..3: per-column vertical sums over all three rows.
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
    vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
    vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
    vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
    // Regroup column sums as {c0,c1,c2,0, c3,c4,c5,0} (zeros from `zero`).
    vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
    vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
    vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
    vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
    // Collect the leftover columns (6 and 7 of each 8-column group).
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    tmp0 = __msa_hadd_u_w(vec4, vec4);
    tmp1 = __msa_hadd_u_w(vec5, vec5);
    tmp2 = __msa_hadd_u_w(vec6, vec6);
    tmp3 = __msa_hadd_u_w(vec7, vec7);
    // tmp4: sums of 2 columns x 3 rows = 6 pixels.
    tmp4 = __msa_hadd_u_w(vec0, vec0);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    // tmp0/tmp1: sums of 3 columns x 3 rows = 9 pixels.
    tmp0 = __msa_hadd_u_w(vec0, vec0);
    tmp1 = __msa_hadd_u_w(vec1, vec1);
    // Fixed-point divide: multiply by reciprocal and drop 16 fraction bits.
    tmp0 *= const_0x1C71;
    tmp1 *= const_0x1C71;
    tmp4 *= const_0x2AAA;
    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
    // Shuffle the averages into output order and store 8 + 4 = 12 bytes.
    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
    dst0 = __msa_copy_u_d((v2i64)out, 0);
    dst1 = __msa_copy_u_w((v4i32)out, 2);
    SD(dst0, dst_ptr);
    SW(dst1, dst_ptr + 8);
    s += 32;
    t0 += 32;
    t1 += 32;
    dst_ptr += 12;
  }
}
520
521 void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
522 int x;
523 v16u8 src0;
524 v8u16 vec0, vec1, dst0, dst1;
525 v16i8 zero = {0};
526
527 assert(src_width > 0);
528
529 for (x = 0; x < src_width; x += 16) {
530 src0 = LD_UB(src_ptr);
531 dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
532 dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
533 dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
534 dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
535 ST_UH2(dst0, dst1, dst_ptr, 8);
536 src_ptr += 16;
537 dst_ptr += 16;
538 }
539 }
540
172 #ifdef __cplusplus 541 #ifdef __cplusplus
173 } // extern "C" 542 } // extern "C"
174 } // namespace libyuv 543 } // namespace libyuv
175 #endif 544 #endif
176 545
177 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 546 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
OLDNEW
« include/libyuv/macros_msa.h ('K') | « source/scale_any.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698