| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 221 uint32_t val0_m, val1_m; \ | 221 uint32_t val0_m, val1_m; \ |
| 222 \ | 222 \ |
| 223 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ | 223 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ |
| 224 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ | 224 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ |
| 225 \ | 225 \ |
| 226 SW(val0_m, pdst_m1); \ | 226 SW(val0_m, pdst_m1); \ |
| 227 SW(val1_m, pdst_m1 + 4); \ | 227 SW(val1_m, pdst_m1 + 4); \ |
| 228 } | 228 } |
| 229 #endif // (__mips_isa_rev >= 6) | 229 #endif // (__mips_isa_rev >= 6) |
| 230 | 230 |
| 231 /* Description : Load 4 words with stride |
| 232 Arguments : Inputs - psrc (source pointer to load from) |
| 233 - stride |
| 234 Outputs - out0, out1, out2, out3 |
| 235 Details : Loads word in 'out0' from (psrc) |
| 236 Loads word in 'out1' from (psrc + stride) |
| 237 Loads word in 'out2' from (psrc + 2 * stride) |
| 238 Loads word in 'out3' from (psrc + 3 * stride) |
| 239 */ |
| 240 #define LW4(psrc, stride, out0, out1, out2, out3) { \ |
| 241 out0 = LW((psrc)); \ |
| 242 out1 = LW((psrc) + stride); \ |
| 243 out2 = LW((psrc) + 2 * stride); \ |
| 244 out3 = LW((psrc) + 3 * stride); \ |
| 245 } |
| 246 |
| 231 /* Description : Store 4 words with stride | 247 /* Description : Store 4 words with stride |
| 232 Arguments : Inputs - in0, in1, in2, in3, pdst, stride | 248 Arguments : Inputs - in0, in1, in2, in3, pdst, stride |
| 233 Details : Stores word from 'in0' to (pdst) | 249 Details : Stores word from 'in0' to (pdst) |
| 234 Stores word from 'in1' to (pdst + stride) | 250 Stores word from 'in1' to (pdst + stride) |
| 235 Stores word from 'in2' to (pdst + 2 * stride) | 251 Stores word from 'in2' to (pdst + 2 * stride) |
| 236 Stores word from 'in3' to (pdst + 3 * stride) | 252 Stores word from 'in3' to (pdst + 3 * stride) |
| 237 */ | 253 */ |
| 238 #define SW4(in0, in1, in2, in3, pdst, stride) { \ | 254 #define SW4(in0, in1, in2, in3, pdst, stride) { \ |
| 239 SW(in0, (pdst)) \ | 255 SW(in0, (pdst)) \ |
| 240 SW(in1, (pdst) + stride); \ | 256 SW(in1, (pdst) + stride); \ |
| (...skipping 88 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 329 #define LD_H16(RTYPE, psrc, stride, \ | 345 #define LD_H16(RTYPE, psrc, stride, \ |
| 330 out0, out1, out2, out3, out4, out5, out6, out7, \ | 346 out0, out1, out2, out3, out4, out5, out6, out7, \ |
| 331 out8, out9, out10, out11, out12, out13, out14, out15) { \ | 347 out8, out9, out10, out11, out12, out13, out14, out15) { \ |
| 332 LD_H8(RTYPE, (psrc), stride, \ | 348 LD_H8(RTYPE, (psrc), stride, \ |
| 333 out0, out1, out2, out3, out4, out5, out6, out7); \ | 349 out0, out1, out2, out3, out4, out5, out6, out7); \ |
| 334 LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ | 350 LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ |
| 335 out8, out9, out10, out11, out12, out13, out14, out15); \ | 351 out8, out9, out10, out11, out12, out13, out14, out15); \ |
| 336 } | 352 } |
| 337 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) | 353 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) |
| 338 | 354 |
| 355 /* Description : Load as 4x4 block of signed halfword elements from 1D source |
| 356 data into 4 vectors (Each vector with 4 signed halfwords) |
| 357 Arguments : Inputs - psrc |
| 358 Outputs - out0, out1, out2, out3 |
| 359 */ |
| 360 #define LD4x4_SH(psrc, out0, out1, out2, out3) { \ |
| 361 out0 = LD_SH(psrc); \ |
| 362 out2 = LD_SH(psrc + 8); \ |
| 363 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ |
| 364 out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ |
| 365 } |
| 366 |
| 339 /* Description : Store vectors of 16 byte elements with stride | 367 /* Description : Store vectors of 16 byte elements with stride |
| 340 Arguments : Inputs - in0, in1, stride | 368 Arguments : Inputs - in0, in1, stride |
| 341 Outputs - pdst (destination pointer to store to) | 369 Outputs - pdst (destination pointer to store to) |
| 342 Details : Stores 16 byte elements from 'in0' to (pdst) | 370 Details : Stores 16 byte elements from 'in0' to (pdst) |
| 343 Stores 16 byte elements from 'in1' to (pdst + stride) | 371 Stores 16 byte elements from 'in1' to (pdst + stride) |
| 344 */ | 372 */ |
| 345 #define ST_B2(RTYPE, in0, in1, pdst, stride) { \ | 373 #define ST_B2(RTYPE, in0, in1, pdst, stride) { \ |
| 346 ST_B(RTYPE, in0, (pdst)); \ | 374 ST_B(RTYPE, in0, (pdst)); \ |
| 347 ST_B(RTYPE, in1, (pdst) + stride); \ | 375 ST_B(RTYPE, in1, (pdst) + stride); \ |
| 348 } | 376 } |
| (...skipping 29 matching lines...) Expand all Loading... |
| 378 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ | 406 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ |
| 379 } | 407 } |
| 380 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) | 408 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) |
| 381 | 409 |
| 382 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \ | 410 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \ |
| 383 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ | 411 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ |
| 384 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ | 412 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ |
| 385 } | 413 } |
| 386 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) | 414 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) |
| 387 | 415 |
| 416 /* Description : Store as 2x4 byte block to destination memory from input vector |
| 417 Arguments : Inputs - in, stidx, pdst, stride |
| 418 Return Type - unsigned byte |
| 419 Details : Index stidx halfword element from 'in' vector is copied and |
| 420 stored on first line |
| 421 Index stidx+1 halfword element from 'in' vector is copied and |
| 422 stored on second line |
| 423 Index stidx+2 halfword element from 'in' vector is copied and |
| 424 stored on third line |
| 425 Index stidx+3 halfword element from 'in' vector is copied and |
| 426 stored on fourth line |
| 427 */ |
| 428 #define ST2x4_UB(in, stidx, pdst, stride) { \ |
| 429 uint16_t out0_m, out1_m, out2_m, out3_m; \ |
| 430 uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ |
| 431 \ |
| 432 out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ |
| 433 out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ |
| 434 out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ |
| 435 out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ |
| 436 \ |
| 437 SH(out0_m, pblk_2x4_m); \ |
| 438 SH(out1_m, pblk_2x4_m + stride); \ |
| 439 SH(out2_m, pblk_2x4_m + 2 * stride); \ |
| 440 SH(out3_m, pblk_2x4_m + 3 * stride); \ |
| 441 } |
| 442 |
| 388 /* Description : Store as 4x4 byte block to destination memory from input vector | 443 /* Description : Store as 4x4 byte block to destination memory from input vector |
| 389 Arguments : Inputs - in0, in1, pdst, stride | 444 Arguments : Inputs - in0, in1, pdst, stride |
| 390 Return Type - unsigned byte | 445 Return Type - unsigned byte |
| 391 Details : Idx0 word element from input vector 'in0' is copied and stored | 446 Details : Idx0 word element from input vector 'in0' is copied and stored |
| 392 on first line | 447 on first line |
| 393 Idx1 word element from input vector 'in0' is copied and stored | 448 Idx1 word element from input vector 'in0' is copied and stored |
| 394 on second line | 449 on second line |
| 395 Idx2 word element from input vector 'in1' is copied and stored | 450 Idx2 word element from input vector 'in1' is copied and stored |
| 396 on third line | 451 on third line |
| 397 Idx3 word element from input vector 'in1' is copied and stored | 452 Idx3 word element from input vector 'in1' is copied and stored |
| (...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 483 Details : Byte elements from 'zero_m' vector are slid into 'in0' by | 538 Details : Byte elements from 'zero_m' vector are slid into 'in0' by |
| 484 number of elements specified by 'slide_val' | 539 number of elements specified by 'slide_val' |
| 485 */ | 540 */ |
| 486 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ | 541 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ |
| 487 v16i8 zero_m = { 0 }; \ | 542 v16i8 zero_m = { 0 }; \ |
| 488 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ | 543 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ |
| 489 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ | 544 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ |
| 490 } | 545 } |
| 491 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) | 546 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) |
| 492 | 547 |
| 548 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \ |
| 549 out0, out1, out2, out3, slide_val) { \ |
| 550 SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ |
| 551 SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ |
| 552 } |
| 553 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) |
| 554 |
| 493 /* Description : Immediate number of columns to slide | 555 /* Description : Immediate number of columns to slide |
| 494 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val | 556 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val |
| 495 Outputs - out0, out1 | 557 Outputs - out0, out1 |
| 496 Return Type - as per RTYPE | 558 Return Type - as per RTYPE |
| 497 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by | 559 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by |
| 498 number of elements specified by 'slide_val' | 560 number of elements specified by 'slide_val' |
| 499 */ | 561 */ |
| 500 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \ | 562 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \ |
| 501 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ | 563 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ |
| 502 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ | 564 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ |
| (...skipping 171 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 674 }) | 736 }) |
| 675 #define CLIP_SH2_0_255(in0, in1) { \ | 737 #define CLIP_SH2_0_255(in0, in1) { \ |
| 676 in0 = CLIP_SH_0_255(in0); \ | 738 in0 = CLIP_SH_0_255(in0); \ |
| 677 in1 = CLIP_SH_0_255(in1); \ | 739 in1 = CLIP_SH_0_255(in1); \ |
| 678 } | 740 } |
| 679 #define CLIP_SH4_0_255(in0, in1, in2, in3) { \ | 741 #define CLIP_SH4_0_255(in0, in1, in2, in3) { \ |
| 680 CLIP_SH2_0_255(in0, in1); \ | 742 CLIP_SH2_0_255(in0, in1); \ |
| 681 CLIP_SH2_0_255(in2, in3); \ | 743 CLIP_SH2_0_255(in2, in3); \ |
| 682 } | 744 } |
| 683 | 745 |
| 746 /* Description : Insert 2 input words into word elements 0 and 1 of the |
| 747 destination vector |
| 748 Arguments : Inputs - in0, in1 (values inserted at word index 0 and 1) |
| 749 Outputs - out (output vector) |
| 750 Return Type - as per RTYPE |
| 751 */ |
| 752 #define INSERT_W2(RTYPE, in0, in1, out) { \ |
| 753 out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ |
| 754 out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ |
| 755 } |
| 756 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) |
| 757 |
| 684 /* Description : Interleave even byte elements from vectors | 758 /* Description : Interleave even byte elements from vectors |
| 685 Arguments : Inputs - in0, in1, in2, in3 | 759 Arguments : Inputs - in0, in1, in2, in3 |
| 686 Outputs - out0, out1 | 760 Outputs - out0, out1 |
| 687 Return Type - as per RTYPE | 761 Return Type - as per RTYPE |
| 688 Details : Even byte elements of 'in0' and even byte | 762 Details : Even byte elements of 'in0' and even byte |
| 689 elements of 'in1' are interleaved and copied to 'out0' | 763 elements of 'in1' are interleaved and copied to 'out0' |
| 690 Even byte elements of 'in2' and even byte | 764 Even byte elements of 'in2' and even byte |
| 691 elements of 'in3' are interleaved and copied to 'out1' | 765 elements of 'in3' are interleaved and copied to 'out1' |
| 692 */ | 766 */ |
| 693 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ | 767 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| (...skipping 13 matching lines...) Expand all Loading... |
| 707 elements of 'in3' are interleaved and copied to 'out1' | 781 elements of 'in3' are interleaved and copied to 'out1' |
| 708 */ | 782 */ |
| 709 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ | 783 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 710 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ | 784 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ |
| 711 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ | 785 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ |
| 712 } | 786 } |
| 713 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) | 787 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) |
| 714 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) | 788 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) |
| 715 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) | 789 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) |
| 716 | 790 |
| 791 /* Description : Interleave even double word elements from vectors |
| 792 Arguments : Inputs - in0, in1, in2, in3 |
| 793 Outputs - out0, out1 |
| 794 Return Type - as per RTYPE |
| 795 Details : Even double word elements of 'in0' and even double word |
| 796 elements of 'in1' are interleaved and copied to 'out0' |
| 797 Even double word elements of 'in2' and even double word |
| 798 elements of 'in3' are interleaved and copied to 'out1' |
| 799 */ |
| 800 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 801 out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ |
| 802 out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ |
| 803 } |
| 804 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) |
| 805 |
| 717 /* Description : Interleave left half of byte elements from vectors | 806 /* Description : Interleave left half of byte elements from vectors |
| 718 Arguments : Inputs - in0, in1, in2, in3 | 807 Arguments : Inputs - in0, in1, in2, in3 |
| 719 Outputs - out0, out1 | 808 Outputs - out0, out1 |
| 720 Return Type - as per RTYPE | 809 Return Type - as per RTYPE |
| 721 Details : Left half of byte elements of in0 and left half of byte | 810 Details : Left half of byte elements of in0 and left half of byte |
| 722 elements of in1 are interleaved and copied to out0. | 811 elements of in1 are interleaved and copied to out0. |
| 723 Left half of byte elements of in2 and left half of byte | 812 Left half of byte elements of in2 and left half of byte |
| 724 elements of in3 are interleaved and copied to out1. | 813 elements of in3 are interleaved and copied to out1. |
| 725 */ | 814 */ |
| 726 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ | 815 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 793 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ | 882 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 794 out0, out1, out2, out3) { \ | 883 out0, out1, out2, out3) { \ |
| 795 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ | 884 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 796 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ | 885 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 797 } | 886 } |
| 798 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) | 887 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) |
| 799 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) | 888 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) |
| 800 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) | 889 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) |
| 801 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) | 890 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) |
| 802 | 891 |
| 892 #define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 893 in8, in9, in10, in11, in12, in13, in14, in15, \ |
| 894 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 895 ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 896 out0, out1, out2, out3); \ |
| 897 ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \ |
| 898 out4, out5, out6, out7); \ |
| 899 } |
| 900 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) |
| 901 |
| 803 /* Description : Interleave right half of halfword elements from vectors | 902 /* Description : Interleave right half of halfword elements from vectors |
| 804 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 | 903 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| 805 Outputs - out0, out1, out2, out3 | 904 Outputs - out0, out1, out2, out3 |
| 806 Return Type - signed halfword | 905 Return Type - signed halfword |
| 807 Details : Right half of halfword elements of in0 and right half of | 906 Details : Right half of halfword elements of in0 and right half of |
| 808 halfword elements of in1 are interleaved and copied to out0. | 907 halfword elements of in1 are interleaved and copied to out0. |
| 809 Right half of halfword elements of in2 and right half of | 908 Right half of halfword elements of in2 and right half of |
| 810 halfword elements of in3 are interleaved and copied to out1. | 909 halfword elements of in3 are interleaved and copied to out1. |
| 811 Similar for other pairs | 910 Similar for other pairs |
| 812 */ | 911 */ |
| (...skipping 10 matching lines...) Expand all Loading... |
| 823 } | 922 } |
| 824 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) | 923 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) |
| 825 | 924 |
| 826 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ | 925 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 827 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ | 926 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ |
| 828 out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ | 927 out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ |
| 829 } | 928 } |
| 830 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) | 929 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) |
| 831 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) | 930 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) |
| 832 | 931 |
| 932 #define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 933 out0, out1, out2, out3) { \ |
| 934 ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 935 ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 936 } |
| 937 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) |
| 938 |
| 833 /* Description : Interleave right half of double word elements from vectors | 939 /* Description : Interleave right half of double word elements from vectors |
| 834 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 | 940 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| 835 Outputs - out0, out1, out2, out3 | 941 Outputs - out0, out1, out2, out3 |
| 836 Return Type - as per RTYPE | 942 Return Type - as per RTYPE |
| 837 Details : Right half of double word elements of in0 and right half of | 943 Details : Right half of double word elements of in0 and right half of |
| 838 double word elements of in1 are interleaved and copied to out0. | 944 double word elements of in1 are interleaved and copied to out0. |
| 839 Right half of double word elements of in2 and right half of | 945 Right half of double word elements of in2 and right half of |
| 840 double word elements of in3 are interleaved and copied to out1. | 946 double word elements of in3 are interleaved and copied to out1. |
| 841 */ | 947 */ |
| 842 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ | 948 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| (...skipping 248 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1091 } | 1197 } |
| 1092 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) | 1198 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) |
| 1093 | 1199 |
| 1094 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ | 1200 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1095 out0, out1, out2, out3) { \ | 1201 out0, out1, out2, out3) { \ |
| 1096 ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ | 1202 ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 1097 ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ | 1203 ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 1098 } | 1204 } |
| 1099 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) | 1205 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) |
| 1100 | 1206 |
| 1207 /* Description : Arithmetic shift right all elements of vector |
| 1208 (generic for all data types) |
| 1209 Arguments : Inputs - in0, in1, in2, in3, shift |
| 1210 Outputs - in0, in1, in2, in3 (in place) |
| 1211 Return Type - as per input vector RTYPE |
| 1212 Details : Each element of vector 'in0' is right shifted by 'shift' and |
| 1213 result is in place written to 'in0' |
| 1214 Here, 'shift' is GP variable passed in |
| 1215 Similar for other pairs |
| 1216 */ |
| 1217 #define SRA_4V(in0, in1, in2, in3, shift) { \ |
| 1218 in0 = in0 >> shift; \ |
| 1219 in1 = in1 >> shift; \ |
| 1220 in2 = in2 >> shift; \ |
| 1221 in3 = in3 >> shift; \ |
| 1222 } |
| 1223 |
| 1101 /* Description : Shift right arithmetic rounded (immediate) | 1224 /* Description : Shift right arithmetic rounded (immediate) |
| 1102 Arguments : Inputs - in0, in1, in2, in3, shift | 1225 Arguments : Inputs - in0, in1, in2, in3, shift |
| 1103 Outputs - in0, in1, in2, in3 (in place) | 1226 Outputs - in0, in1, in2, in3 (in place) |
| 1104 Return Type - as per RTYPE | 1227 Return Type - as per RTYPE |
| 1105 Details : Each element of vector 'in0' is shifted right arithmetic by | 1228 Details : Each element of vector 'in0' is shifted right arithmetic by |
| 1106 value in 'shift'. | 1229 value in 'shift'. |
| 1107 The last discarded bit is added to shifted value for rounding | 1230 The last discarded bit is added to shifted value for rounding |
| 1108 and the result is in place written to 'in0' | 1231 and the result is in place written to 'in0' |
| 1109 Similar for other pairs | 1232 Similar for other pairs |
| 1110 */ | 1233 */ |
| (...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1171 out1 = in2 - in3; \ | 1294 out1 = in2 - in3; \ |
| 1172 } | 1295 } |
| 1173 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ | 1296 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1174 out0, out1, out2, out3) { \ | 1297 out0, out1, out2, out3) { \ |
| 1175 out0 = in0 - in1; \ | 1298 out0 = in0 - in1; \ |
| 1176 out1 = in2 - in3; \ | 1299 out1 = in2 - in3; \ |
| 1177 out2 = in4 - in5; \ | 1300 out2 = in4 - in5; \ |
| 1178 out3 = in6 - in7; \ | 1301 out3 = in6 - in7; \ |
| 1179 } | 1302 } |
| 1180 | 1303 |
| 1304 /* Description : Sign extend halfword elements from right half of the vector |
| 1305 Arguments : Inputs - in (input halfword vector) |
| 1306 Outputs - out (sign extended word vectors) |
| 1307 Return Type - signed word |
| 1308 Details : Sign bit of halfword elements from input vector 'in' is |
| 1309 extracted and interleaved with same vector 'in' to generate |
| 1310 4 word elements keeping sign intact |
| 1311 */ |
| 1312 #define UNPCK_R_SH_SW(in, out) { \ |
| 1313 v8i16 sign_m; \ |
| 1314 \ |
| 1315 sign_m = __msa_clti_s_h((v8i16)in, 0); \ |
| 1316 out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ |
| 1317 } |
| 1318 |
| 1181 /* Description : Zero extend unsigned byte elements to halfword elements | 1319 /* Description : Zero extend unsigned byte elements to halfword elements |
| 1182 Arguments : Inputs - in (1 input unsigned byte vector) | 1320 Arguments : Inputs - in (1 input unsigned byte vector) |
| 1183 Outputs - out0, out1 (unsigned 2 halfword vectors) | 1321 Outputs - out0, out1 (unsigned 2 halfword vectors) |
| 1184 Return Type - signed halfword | 1322 Return Type - signed halfword |
| 1185 Details : Zero extended right half of vector is returned in 'out0' | 1323 Details : Zero extended right half of vector is returned in 'out0' |
| 1186 Zero extended left half of vector is returned in 'out1' | 1324 Zero extended left half of vector is returned in 'out1' |
| 1187 */ | 1325 */ |
| 1188 #define UNPCK_UB_SH(in, out0, out1) { \ | 1326 #define UNPCK_UB_SH(in, out0, out1) { \ |
| 1189 v16i8 zero_m = { 0 }; \ | 1327 v16i8 zero_m = { 0 }; \ |
| 1190 \ | 1328 \ |
| (...skipping 24 matching lines...) Expand all Loading... |
| 1215 out1 = in1 + in6; \ | 1353 out1 = in1 + in6; \ |
| 1216 out2 = in2 + in5; \ | 1354 out2 = in2 + in5; \ |
| 1217 out3 = in3 + in4; \ | 1355 out3 = in3 + in4; \ |
| 1218 \ | 1356 \ |
| 1219 out4 = in3 - in4; \ | 1357 out4 = in3 - in4; \ |
| 1220 out5 = in2 - in5; \ | 1358 out5 = in2 - in5; \ |
| 1221 out6 = in1 - in6; \ | 1359 out6 = in1 - in6; \ |
| 1222 out7 = in0 - in7; \ | 1360 out7 = in0 - in7; \ |
| 1223 } | 1361 } |
| 1224 | 1362 |
| 1363 /* Description : Transposes input 8x8 byte block |
| 1364 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| 1365 (input 8x8 byte block) |
| 1366 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| 1367 (output 8x8 byte block) |
| 1368 Return Type - as per RTYPE |
| 1369 Details : |
| 1370 */ |
| 1371 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1372 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 1373 v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 1374 v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ |
| 1375 \ |
| 1376 ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ |
| 1377 tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ |
| 1378 ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ |
| 1379 ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ |
| 1380 ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ |
| 1381 ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ |
| 1382 SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ |
| 1383 SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ |
| 1384 } |
| 1385 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) |
| 1386 |
| 1387 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors |
| 1388 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, |
| 1389 in8, in9, in10, in11, in12, in13, in14, in15 |
| 1390 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| 1391 Return Type - unsigned byte |
| 1392 Details : |
| 1393 */ |
| 1394 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1395 in8, in9, in10, in11, in12, in13, in14, in15, \ |
| 1396 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 1397 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 1398 v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ |
| 1399 \ |
| 1400 ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ |
| 1401 ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ |
| 1402 ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ |
| 1403 ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ |
| 1404 \ |
| 1405 tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ |
| 1406 tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ |
| 1407 tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ |
| 1408 tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ |
| 1409 out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ |
| 1410 tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ |
| 1411 out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ |
| 1412 tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ |
| 1413 \ |
| 1414 ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ |
| 1415 out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ |
| 1416 out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ |
| 1417 \ |
| 1418 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ |
| 1419 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ |
| 1420 out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ |
| 1421 out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ |
| 1422 \ |
| 1423 ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ |
| 1424 out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ |
| 1425 out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ |
| 1426 \ |
| 1427 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ |
| 1428 tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ |
| 1429 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ |
| 1430 tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ |
| 1431 out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ |
| 1432 out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ |
| 1433 } |
| 1434 |
| 1435 /* Description : Transposes 4x4 block with half word elements in vectors |
| 1436 Arguments : Inputs - in0, in1, in2, in3 |
| 1437 Outputs - out0, out1, out2, out3 |
| 1438 Return Type - signed halfword |
| 1439 Details : |
| 1440 */ |
| 1441 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ |
| 1442 v8i16 s0_m, s1_m; \ |
| 1443 \ |
| 1444 ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ |
| 1445 ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ |
| 1446 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ |
| 1447 out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ |
| 1448 } |
| 1449 |
| 1225 /* Description : Transposes 4x8 block with half word elements in vectors | 1450 /* Description : Transposes 4x8 block with half word elements in vectors |
| 1226 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 | 1451 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| 1227 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 | 1452 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| 1228 Return Type - signed halfword | 1453 Return Type - signed halfword |
| 1229 Details : | 1454 Details : |
| 1230 */ | 1455 */ |
| 1231 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ | 1456 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1232 out0, out1, out2, out3, out4, out5, out6, out7) { \ | 1457 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 1233 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ | 1458 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 1234 v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ | 1459 v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ |
| (...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1287 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ | 1512 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ |
| 1288 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ | 1513 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ |
| 1289 tmp3_m, tmp7_m, out0, out2, out4, out6); \ | 1514 tmp3_m, tmp7_m, out0, out2, out4, out6); \ |
| 1290 out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ | 1515 out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ |
| 1291 out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ | 1516 out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ |
| 1292 out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ | 1517 out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ |
| 1293 out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ | 1518 out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ |
| 1294 } | 1519 } |
/* Signed-halfword flavour of TRANSPOSE8x8_H: all arguments are forwarded
   unchanged, preceded by v8i16 as the leading (result type) argument. */
#define TRANSPOSE8x8_SH_SH(...) \
  TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
| 1296 | 1521 |
/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : The rows are first combined pairwise by ILVRL_W2_SW
                 (in0 with in1, in2 with in3). The right doublewords of the
                 resulting intermediates form out0 and out2, the left
                 doublewords form out1 and out3.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
                                                                       \
    /* Word-level interleave of each row pair. */                      \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                 \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                 \
                                                                       \
    /* Doubleword-level interleave of the two intermediates. */        \
    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);              \
    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);              \
    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);              \
    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);              \
  }
| 1539 |
/* Description : Add block 4x4
   Arguments   : Inputs  - in0, in1, in2, in3 (vectors added to the block)
                         - pdst   (destination pointer; read, then written)
                         - stride (offset between consecutive rows at pdst)
                 Outputs - none (the result is stored through pdst)
                 Return Type - unsigned bytes
   Details     : Four words are loaded from the destination with LW4 and
                 widened to halfwords by interleaving with zero. The inputs
                 (paired up by ILVR_D2_SH: in0 with in1, in2 with in3) are
                 added to them, the sums are clipped between 0-255, packed
                 back to bytes and stored to the same four rows with SW4.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)          \
  {                                                                \
    uint32_t src0_m, src1_m, src2_m, src3_m;                       \
    uint32_t out0_m, out1_m, out2_m, out3_m;                       \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                          \
    v16i8 dst0_m = { 0 };                                          \
    v16i8 dst1_m = { 0 };                                          \
    v16i8 zero_m = { 0 };                                          \
                                                                   \
    /* Gather the four input rows into two vectors. */             \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);                \
    /* Fetch the current destination rows, two per vector. */      \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);             \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                          \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                          \
    /* Widen bytes to halfwords, add, clip and narrow again. */    \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);    \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);          \
    CLIP_SH2_0_255(res0_m, res1_m);                                \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);   \
                                                                   \
    /* Words 0 and 1 of each packed vector are the output rows. */ \
    out0_m = __msa_copy_u_w((v4i32)dst0_m, 0);                     \
    out1_m = __msa_copy_u_w((v4i32)dst0_m, 1);                     \
    out2_m = __msa_copy_u_w((v4i32)dst1_m, 0);                     \
    out3_m = __msa_copy_u_w((v4i32)dst1_m, 1);                     \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);             \
  }
| 1570 |
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - value of the macro expression
                 Return Type - unsigned byte
   Details     : The even byte elements of 'in0' and 'in1' (treated as signed
                 bytes) are packed into one vector with __msa_pckev_b, and
                 every byte of that vector is xor'ed with 128 to shift the
                 range from signed to unsigned byte.
*/
#define PCKEV_XORI128_UB(in0, in1) \
  ((v16u8)__msa_xori_b(            \
      (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0), 128))
| 1312 | 1586 |
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs - in0, in1, in2, in3 (vectors to convert)
                        - dst0, dst1, dst2, dst3 (vectors to average with)
                        - pdst, stride (where the block is stored)
   Details     : (in0, in1) and (in2, in3) are each converted with
                 PCKEV_XORI128_UB, dst0..dst3 are combined pairwise by
                 ILVR_D2_UB, the two sets are averaged by AVER_UB2_UB and the
                 result is written out through ST8x4_UB.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                   \
                                dst0, dst1, dst2, dst3, pdst, stride) \
  {                                                                   \
    uint8_t *pdst_m = (uint8_t *)(pdst);                              \
    v16u8 tmp0_m = PCKEV_XORI128_UB(in0, in1);                        \
    v16u8 tmp1_m = PCKEV_XORI128_UB(in2, in3);                        \
    v16u8 tmp2_m, tmp3_m;                                             \
                                                                      \
    ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);               \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);      \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                         \
  }
| 1603 |
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1 (vectors whose even bytes are packed)
                        - pdst (address handed to ST_SB for the store)
*/
#define PCKEV_ST_SB(in0, in1, pdst)                      \
  {                                                      \
    v16i8 tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
                                                         \
    ST_SB(tmp_m, (pdst));                                \
  }
| 1323 | 1614 |
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs  - in0, in1 (source byte vectors)
                         - mask  (shuffle control passed to __msa_vshf_b)
                         - coeff (filter coefficients, used as unsigned bytes)
                         - shift (rounding right-shift amount)
                 Outputs - value of the statement expression
                 Return Type - unsigned halfword
   Details     : Bytes picked from 'in0' / 'in1' according to 'mask' are
                 combined with 'coeff' in an unsigned dot product. The
                 halfword sums are shifted right by 'shift' with rounding
                 and then passed through __msa_sat_u_h.
                 NOTE(review): 'shift' is also handed to __msa_sat_u_h as its
                 second operand - confirm that is intended for every caller.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)                 \
  ({                                                                     \
    v16i8 shuf_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0);    \
    v8u16 filt_m = __msa_dotp_u_h((v16u8)shuf_m, (v16u8)coeff);          \
                                                                         \
    filt_m = (v8u16)__msa_srari_h((v8i16)filt_m, shift);                 \
    filt_m = __msa_sat_u_h(filt_m, shift);                               \
                                                                         \
    filt_m;                                                              \
  })
| 1338 #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */ | 1629 #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */ |
| OLD | NEW |