1 /* | 1 /* |
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
| 11 #include <assert.h> |
| 12 |
11 #include "libyuv/scale_row.h" | 13 #include "libyuv/scale_row.h" |
12 | 14 |
13 // This module is for GCC MSA | 15 // This module is for GCC MSA |
14 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 16 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |
15 #include "libyuv/macros_msa.h" | 17 #include "libyuv/macros_msa.h" |
16 | 18 |
17 #ifdef __cplusplus | 19 #ifdef __cplusplus |
18 namespace libyuv { | 20 namespace libyuv { |
19 extern "C" { | 21 extern "C" { |
20 #endif | 22 #endif |
(...skipping 141 matching lines...)
162 reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); | 164 reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); |
163 reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); | 165 reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); |
164 dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); | 166 dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); |
165 ST_UB(dst0, dst_argb); | 167 ST_UB(dst0, dst_argb); |
166 src_argb += stepx * 4; | 168 src_argb += stepx * 4; |
167 nxt_argb += stepx * 4; | 169 nxt_argb += stepx * 4; |
168 dst_argb += 16; | 170 dst_argb += 16; |
169 } | 171 } |
170 } | 172 } |
171 | 173 |
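 | // ScaleRowDown2: 1/2 point sample - keeps the odd pixel of each pair; |
 | // consumes 64 source bytes and writes 32 destination bytes per loop. |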
| 174 void ScaleRowDown2_MSA(const uint8_t* src_ptr, |
| 175 ptrdiff_t src_stride, |
| 176 uint8_t* dst, |
| 177 int dst_width) { |
| 178 int x; |
| 179 v16u8 src0, src1, src2, src3, dst0, dst1; |
| 180 |
| 181 for (x = 0; x < dst_width; x += 32) { |
| 182 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); |
| 183 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); |
| 184 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); |
| 185 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); |
| 186 dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); |
| 187 dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); |
| 188 ST_UB2(dst0, dst1, dst, 16); |
| 189 src_ptr += 64; |
| 190 dst += 32; |
| 191 } |
| 192 } |
| 193 |
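 | // ScaleRowDown2Linear: 1/2 horizontal filter - rounding average of each |
 | // even/odd pixel pair (aver_u_b), 64 source bytes -> 32 bytes per loop. |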
| 194 void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, |
| 195 ptrdiff_t src_stride, |
| 196 uint8_t* dst, |
| 197 int dst_width) { |
| 198 int x; |
| 199 v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; |
| 200 |
| 201 for (x = 0; x < dst_width; x += 32) { |
| 202 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); |
| 203 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); |
| 204 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); |
| 205 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); |
| 206 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); |
| 207 vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); |
| 208 vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); |
| 209 vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); |
| 210 dst0 = __msa_aver_u_b(vec1, vec0); |
| 211 dst1 = __msa_aver_u_b(vec3, vec2); |
| 212 ST_UB2(dst0, dst1, dst, 16); |
| 213 src_ptr += 64; |
| 214 dst += 32; |
| 215 } |
| 216 } |
| 217 |
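 | // ScaleRowDown2Box: 2x2 box filter - sums pixel pairs from two rows |
 | // (hadd_u_h) and rounds the sums with a shift of 2 (srari_h). |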
| 218 void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, |
| 219 ptrdiff_t src_stride, |
| 220 uint8_t* dst, |
| 221 int dst_width) { |
| 222 int x; |
| 223 const uint8_t* s = src_ptr; |
| 224 const uint8_t* t = src_ptr + src_stride; |
| 225 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; |
| 226 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 227 |
| 228 for (x = 0; x < dst_width; x += 32) { |
| 229 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 230 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); |
| 231 src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); |
| 232 src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); |
| 233 src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); |
| 234 src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); |
| 235 src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); |
| 236 src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); |
| 237 vec0 = __msa_hadd_u_h(src0, src0); |
| 238 vec1 = __msa_hadd_u_h(src1, src1); |
| 239 vec2 = __msa_hadd_u_h(src2, src2); |
| 240 vec3 = __msa_hadd_u_h(src3, src3); |
| 241 vec0 += __msa_hadd_u_h(src4, src4); |
| 242 vec1 += __msa_hadd_u_h(src5, src5); |
| 243 vec2 += __msa_hadd_u_h(src6, src6); |
| 244 vec3 += __msa_hadd_u_h(src7, src7); |
| 245 vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); |
| 246 vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); |
| 247 vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); |
| 248 vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); |
| 249 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); |
| 250 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); |
| 251 ST_UB2(dst0, dst1, dst, 16); |
| 252 s += 64; |
| 253 t += 64; |
| 254 dst += 32; |
| 255 } |
| 256 } |
| 257 |
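 | // ScaleRowDown4: 1/4 point sample - keeps pixel 2 of every group of 4. |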
| 258 void ScaleRowDown4_MSA(const uint8_t* src_ptr, |
| 259 ptrdiff_t src_stride, |
| 260 uint8_t* dst, |
| 261 int dst_width) { |
| 262 int x; |
| 263 v16u8 src0, src1, src2, src3, vec0, vec1, dst0; |
| 264 |
| 265 for (x = 0; x < dst_width; x += 16) { |
| 266 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); |
| 267 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); |
| 268 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); |
| 269 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); |
| 270 vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); |
| 271 vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); |
| 272 dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); |
| 273 ST_UB(dst0, dst); |
| 274 src_ptr += 64; |
| 275 dst += 16; |
| 276 } |
| 277 } |
| 278 |
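 | // ScaleRowDown4Box: 4x4 box filter - accumulates four source rows, then |
 | // rounds the 16-pixel sums with a shift of 4 (srari_w). |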
| 279 void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, |
| 280 ptrdiff_t src_stride, |
| 281 uint8_t* dst, |
| 282 int dst_width) { |
| 283 int x; |
| 284 const uint8_t* s = src_ptr; |
| 285 const uint8_t* t0 = s + src_stride; |
| 286 const uint8_t* t1 = s + src_stride * 2; |
| 287 const uint8_t* t2 = s + src_stride * 3; |
| 288 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; |
| 289 v8u16 vec0, vec1, vec2, vec3; |
| 290 v4u32 reg0, reg1, reg2, reg3; |
| 291 |
| 292 for (x = 0; x < dst_width; x += 16) { |
| 293 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 294 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); |
| 295 src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); |
| 296 src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); |
| 297 src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); |
| 298 src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); |
| 299 src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); |
| 300 src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); |
| 301 vec0 = __msa_hadd_u_h(src0, src0); |
| 302 vec1 = __msa_hadd_u_h(src1, src1); |
| 303 vec2 = __msa_hadd_u_h(src2, src2); |
| 304 vec3 = __msa_hadd_u_h(src3, src3); |
| 305 vec0 += __msa_hadd_u_h(src4, src4); |
| 306 vec1 += __msa_hadd_u_h(src5, src5); |
| 307 vec2 += __msa_hadd_u_h(src6, src6); |
| 308 vec3 += __msa_hadd_u_h(src7, src7); |
| 309 src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); |
| 310 src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); |
| 311 src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); |
| 312 src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); |
| 313 src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); |
| 314 src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); |
| 315 src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); |
| 316 src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); |
| 317 vec0 += __msa_hadd_u_h(src0, src0); |
| 318 vec1 += __msa_hadd_u_h(src1, src1); |
| 319 vec2 += __msa_hadd_u_h(src2, src2); |
| 320 vec3 += __msa_hadd_u_h(src3, src3); |
| 321 vec0 += __msa_hadd_u_h(src4, src4); |
| 322 vec1 += __msa_hadd_u_h(src5, src5); |
| 323 vec2 += __msa_hadd_u_h(src6, src6); |
| 324 vec3 += __msa_hadd_u_h(src7, src7); |
| 325 reg0 = __msa_hadd_u_w(vec0, vec0); |
| 326 reg1 = __msa_hadd_u_w(vec1, vec1); |
| 327 reg2 = __msa_hadd_u_w(vec2, vec2); |
| 328 reg3 = __msa_hadd_u_w(vec3, vec3); |
| 329 reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); |
| 330 reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); |
| 331 reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); |
| 332 reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); |
| 333 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); |
| 334 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); |
| 335 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); |
| 336 ST_UB(dst0, dst); |
| 337 s += 64; |
| 338 t0 += 64; |
| 339 t1 += 64; |
| 340 t2 += 64; |
| 341 dst += 16; |
| 342 } |
| 343 } |
| 344 |
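 | // ScaleRowDown38: 3/8 point sample - the shuffle mask selects 12 of |
 | // every 32 source pixels (3 output pixels per 8 input pixels). |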
| 345 void ScaleRowDown38_MSA(const uint8_t* src_ptr, |
| 346 ptrdiff_t src_stride, |
| 347 uint8_t* dst, |
| 348 int dst_width) { |
| 349 int x, width; |
| 350 uint64_t dst0; |
| 351 uint32_t dst1; |
| 352 v16u8 src0, src1, vec0; |
| 353 v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; |
| 354 |
| 355 assert(dst_width % 3 == 0); |
| 356 width = dst_width / 3; |
| 357 |
| 358 for (x = 0; x < width; x += 4) { |
| 359 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); |
| 360 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); |
| 361 vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); |
| 362 dst0 = __msa_copy_u_d((v2i64)vec0, 0); |
| 363 dst1 = __msa_copy_u_w((v4i32)vec0, 2); |
| 364 SD(dst0, dst); |
| 365 SW(dst1, dst + 8); |
| 366 src_ptr += 32; |
| 367 dst += 12; |
| 368 } |
| 369 } |
| 370 |
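 | // ScaleRowDown38_2_Box: 3/8 scale with a 2-row box filter. Box sums are |
 | // divided with fixed-point multipliers: 0x2AAA ~= 65536/6 for the 3x2 |
 | // boxes and 0x4000 = 65536/4 for the trailing 2x2 box. |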
| 371 void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, |
| 372 ptrdiff_t src_stride, |
| 373 uint8_t* dst_ptr, |
| 374 int dst_width) { |
| 375 int x, width; |
| 376 const uint8_t* s = src_ptr; |
| 377 const uint8_t* t = src_ptr + src_stride; |
| 378 uint64_t dst0; |
| 379 uint32_t dst1; |
| 380 v16u8 src0, src1, src2, src3, out; |
| 381 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 382 v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; |
| 383 v8i16 zero = {0}; |
| 384 v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; |
| 385 v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; |
| 386 v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); |
| 387 v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); |
| 388 |
| 389 assert((dst_width % 3 == 0) && (dst_width > 0)); |
| 390 width = dst_width / 3; |
| 391 |
| 392 for (x = 0; x < width; x += 4) { |
| 393 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 394 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); |
| 395 src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); |
| 396 src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); |
| 397 vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); |
| 398 vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); |
| 399 vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); |
| 400 vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); |
| 401 vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); |
| 402 vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); |
| 403 vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); |
| 404 vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); |
| 405 vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); |
| 406 vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); |
| 407 vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); |
| 408 vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); |
| 409 vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); |
| 410 vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); |
| 411 vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); |
| 412 tmp0 = __msa_hadd_u_w(vec4, vec4); |
| 413 tmp1 = __msa_hadd_u_w(vec5, vec5); |
| 414 tmp2 = __msa_hadd_u_w(vec6, vec6); |
| 415 tmp3 = __msa_hadd_u_w(vec7, vec7); |
| 416 tmp4 = __msa_hadd_u_w(vec0, vec0); |
| 417 vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); |
| 418 vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); |
| 419 tmp0 = __msa_hadd_u_w(vec0, vec0); |
| 420 tmp1 = __msa_hadd_u_w(vec1, vec1); |
| 421 tmp0 *= const_0x2AAA; |
| 422 tmp1 *= const_0x2AAA; |
| 423 tmp4 *= const_0x4000; |
| 424 tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); |
| 425 tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); |
| 426 tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); |
| 427 vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); |
| 428 vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); |
| 429 out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); |
| 430 dst0 = __msa_copy_u_d((v2i64)out, 0); |
| 431 dst1 = __msa_copy_u_w((v4i32)out, 2); |
| 432 SD(dst0, dst_ptr); |
| 433 SW(dst1, dst_ptr + 8); |
| 434 s += 32; |
| 435 t += 32; |
| 436 dst_ptr += 12; |
| 437 } |
| 438 } |
| 439 |
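 | // ScaleRowDown38_3_Box: 3/8 scale with a 3-row box filter. Multipliers |
 | // 0x1C71 ~= 65536/9 (3x3 boxes) and 0x2AAA ~= 65536/6 (2x3 box). |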
| 440 void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, |
| 441 ptrdiff_t src_stride, |
| 442 uint8_t* dst_ptr, |
| 443 int dst_width) { |
| 444 int x, width; |
| 445 const uint8_t* s = src_ptr; |
| 446 const uint8_t* t0 = s + src_stride; |
| 447 const uint8_t* t1 = s + src_stride * 2; |
| 448 uint64_t dst0; |
| 449 uint32_t dst1; |
| 450 v16u8 src0, src1, src2, src3, src4, src5, out; |
| 451 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 452 v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; |
| 453 v8u16 zero = {0}; |
| 454 v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; |
| 455 v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; |
| 456 v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); |
| 457 v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); |
| 458 |
| 459 assert((dst_width % 3 == 0) && (dst_width > 0)); |
| 460 width = dst_width / 3; |
| 461 |
| 462 for (x = 0; x < width; x += 4) { |
| 463 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 464 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); |
| 465 src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); |
| 466 src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); |
| 467 src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); |
| 468 src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); |
| 469 vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); |
| 470 vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); |
| 471 vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); |
| 472 vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); |
| 473 vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); |
| 474 vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); |
| 475 vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); |
| 476 vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); |
| 477 vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); |
| 478 vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); |
| 479 vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); |
| 480 vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); |
| 481 vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); |
| 482 vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); |
| 483 vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); |
| 484 vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); |
| 485 vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); |
| 486 vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); |
| 487 vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); |
| 488 vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); |
| 489 vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); |
| 490 vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); |
| 491 vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); |
| 492 tmp0 = __msa_hadd_u_w(vec4, vec4); |
| 493 tmp1 = __msa_hadd_u_w(vec5, vec5); |
| 494 tmp2 = __msa_hadd_u_w(vec6, vec6); |
| 495 tmp3 = __msa_hadd_u_w(vec7, vec7); |
| 496 tmp4 = __msa_hadd_u_w(vec0, vec0); |
| 497 vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); |
| 498 vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); |
| 499 tmp0 = __msa_hadd_u_w(vec0, vec0); |
| 500 tmp1 = __msa_hadd_u_w(vec1, vec1); |
| 501 tmp0 *= const_0x1C71; |
| 502 tmp1 *= const_0x1C71; |
| 503 tmp4 *= const_0x2AAA; |
| 504 tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); |
| 505 tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); |
| 506 tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); |
| 507 vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); |
| 508 vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); |
| 509 out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); |
| 510 dst0 = __msa_copy_u_d((v2i64)out, 0); |
| 511 dst1 = __msa_copy_u_w((v4i32)out, 2); |
| 512 SD(dst0, dst_ptr); |
| 513 SW(dst1, dst_ptr + 8); |
| 514 s += 32; |
| 515 t0 += 32; |
| 516 t1 += 32; |
| 517 dst_ptr += 12; |
| 518 } |
| 519 } |
| 520 |
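 | // ScaleAddRow: widens 16 source bytes to uint16 and adds them into the |
 | // accumulator row used by the box filters. |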
| 521 void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { |
| 522 int x; |
| 523 v16u8 src0; |
| 524 v8u16 vec0, vec1, dst0, dst1; |
| 525 v16i8 zero = {0}; |
| 526 |
| 527 assert(src_width > 0); |
| 528 |
| 529 for (x = 0; x < src_width; x += 16) { |
| 530 src0 = LD_UB(src_ptr); |
| 531 dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); |
| 532 dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); |
| 533 dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); |
| 534 dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); |
| 535 ST_UH2(dst0, dst1, dst_ptr, 8); |
| 536 src_ptr += 16; |
| 537 dst_ptr += 16; |
| 538 } |
| 539 } |
| 540 |
172 #ifdef __cplusplus | 541 #ifdef __cplusplus |
173 } // extern "C" | 542 } // extern "C" |
174 } // namespace libyuv | 543 } // namespace libyuv |
175 #endif | 544 #endif |
176 | 545 |
177 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 546 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |