OLD | NEW |
1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
9 // | 9 // |
10 // ARM NEON version of dsp functions and loop filtering. | 10 // ARM NEON version of dsp functions and loop filtering. |
(...skipping 371 matching lines...)
382 vst4_lane_u8(v - 2 + 2 * stride, v0, 2); | 382 vst4_lane_u8(v - 2 + 2 * stride, v0, 2); |
383 vst4_lane_u8(v - 2 + 3 * stride, v0, 3); | 383 vst4_lane_u8(v - 2 + 3 * stride, v0, 3); |
384 vst4_lane_u8(v - 2 + 4 * stride, v0, 4); | 384 vst4_lane_u8(v - 2 + 4 * stride, v0, 4); |
385 vst4_lane_u8(v - 2 + 5 * stride, v0, 5); | 385 vst4_lane_u8(v - 2 + 5 * stride, v0, 5); |
386 vst4_lane_u8(v - 2 + 6 * stride, v0, 6); | 386 vst4_lane_u8(v - 2 + 6 * stride, v0, 6); |
387 vst4_lane_u8(v - 2 + 7 * stride, v0, 7); | 387 vst4_lane_u8(v - 2 + 7 * stride, v0, 7); |
388 } | 388 } |
389 | 389 |
390 #endif // !WORK_AROUND_GCC | 390 #endif // !WORK_AROUND_GCC |
391 | 391 |
392 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t. | 392 // Zero extend 'v' to an int16x8_t. |
393 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { | 393 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) { |
394 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); | 394 return vreinterpretq_s16_u16(vmovl_u8(v)); |
395 } | 395 } |
396 | 396 |
397 // Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result | 397 // Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result |
398 // to the corresponding rows of 'dst'. | 398 // to the corresponding rows of 'dst'. |
399 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, | 399 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, |
400 const int16x8_t dst01, | 400 const int16x8_t dst01, |
401 const int16x8_t dst23) { | 401 const int16x8_t dst23) { |
402 // Unsigned saturate to 8b. | 402 // Unsigned saturate to 8b. |
403 const uint8x8_t dst01_u8 = vqmovun_s16(dst01); | 403 const uint8x8_t dst01_u8 = vqmovun_s16(dst01); |
404 const uint8x8_t dst23_u8 = vqmovun_s16(dst23); | 404 const uint8x8_t dst23_u8 = vqmovun_s16(dst23); |
(...skipping 11 matching lines...)
416 uint32x2_t dst23 = vdup_n_u32(0); | 416 uint32x2_t dst23 = vdup_n_u32(0); |
417 | 417 |
418 // Load the source pixels. | 418 // Load the source pixels. |
419 dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0); | 419 dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0); |
420 dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0); | 420 dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0); |
421 dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1); | 421 dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1); |
422 dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1); | 422 dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1); |
423 | 423 |
424 { | 424 { |
425 // Convert to 16b. | 425 // Convert to 16b. |
426 const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); | 426 const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01)); |
427 const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); | 427 const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23)); |
428 | 428 |
429 // Descale with rounding. | 429 // Descale with rounding. |
430 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); | 430 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); |
431 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); | 431 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); |
432 // Add the inverse transform. | 432 // Add the inverse transform. |
433 SaturateAndStore4x4(dst, out01, out23); | 433 SaturateAndStore4x4(dst, out01, out23); |
434 } | 434 } |
435 } | 435 } |
436 | 436 |
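A hedged scalar model of the "descale with rounding" step above (an illustrative sketch, not part of this CL): per lane, vrsraq_n_s16(a, b, 3) is a rounding shift-right-accumulate, i.e. a + ((b + 4) >> 3).

static int16_t DescaleRoundAdd(int16_t dst, int16_t row) {
  // vrsraq_n_s16(dst_s16, row, 3): round 'row' to nearest on the >> 3,
  // then accumulate onto the zero-extended destination pixel.
  return (int16_t)(dst + ((row + 4) >> 3));
}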
437 //----------------------------------------------------------------------------- | 437 //----------------------------------------------------------------------------- |
(...skipping 34 matching lines...)
472 | 472 |
473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { | 473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { |
474 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) | 474 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) |
475 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0) | 475 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0) |
476 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0) | 476 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0) |
477 return s2; | 477 return s2; |
478 } | 478 } |
479 | 479 |
480 //------------------------------------------------------------------------------ | 480 //------------------------------------------------------------------------------ |
481 | 481 |
| 482 static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s, |
| 483 const int8x16_t delta, |
| 484 int8x16_t* const op0, int8x16_t* const oq0) { |
| 485 const int8x16_t kCst3 = vdupq_n_s8(0x03); |
| 486 const int8x16_t kCst4 = vdupq_n_s8(0x04); |
| 487 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); |
| 488 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4); |
| 489 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); |
| 490 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); |
| 491 *op0 = vqaddq_s8(p0s, delta3); |
| 492 *oq0 = vqsubq_s8(q0s, delta4); |
| 493 } |
| 494 |
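ApplyFilter2NoFlip() keeps its outputs in the sign-flipped domain so callers can chain them into the next filter stage without the double FlipSign() the removed TODO complained about. A hedged scalar model, assuming FlipSign() is the usual 'pixel ^ 0x80' recentering and Clamp8() an assumed helper for signed 8-bit saturation:

static int8_t Clamp8(int v) {  // assumed helper: saturate to [-128, 127]
  return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
}

static void ApplyFilter2Model(int8_t p0s, int8_t q0s, int8_t delta,
                              int8_t* const op0, int8_t* const oq0) {
  const int delta3 = Clamp8(delta + 3) >> 3;  // vqaddq_s8 + vshrq_n_s8
  const int delta4 = Clamp8(delta + 4) >> 3;
  *op0 = Clamp8(p0s + delta3);                // vqaddq_s8
  *oq0 = Clamp8(q0s - delta4);                // vqsubq_s8
}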
| 495 #if defined(WEBP_USE_INTRINSICS) |
| 496 |
482 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, | 497 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, |
483 const int8x16_t delta, | 498 const int8x16_t delta, |
484 uint8x16_t* const op0, uint8x16_t* const oq0) { | 499 uint8x16_t* const op0, uint8x16_t* const oq0) { |
485 const int8x16_t kCst3 = vdupq_n_s8(0x03); | 500 const int8x16_t kCst3 = vdupq_n_s8(0x03); |
486 const int8x16_t kCst4 = vdupq_n_s8(0x04); | 501 const int8x16_t kCst4 = vdupq_n_s8(0x04); |
487 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); | 502 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); |
488 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4); | 503 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4); |
489 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); | 504 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); |
490 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); | 505 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); |
491 const int8x16_t sp0 = vqaddq_s8(p0s, delta3); | 506 const int8x16_t sp0 = vqaddq_s8(p0s, delta3); |
492 const int8x16_t sq0 = vqsubq_s8(q0s, delta4); | 507 const int8x16_t sq0 = vqsubq_s8(q0s, delta4); |
493 *op0 = FlipSignBack(sp0); | 508 *op0 = FlipSignBack(sp0); |
494 *oq0 = FlipSignBack(sq0); | 509 *oq0 = FlipSignBack(sq0); |
495 } | 510 } |
496 | 511 |
497 #if defined(USE_INTRINSICS) | |
498 | |
499 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, | 512 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, |
500 const uint8x16_t q0, const uint8x16_t q1, | 513 const uint8x16_t q0, const uint8x16_t q1, |
501 const uint8x16_t mask, | 514 const uint8x16_t mask, |
502 uint8x16_t* const op0, uint8x16_t* const oq0) { | 515 uint8x16_t* const op0, uint8x16_t* const oq0) { |
503 const int8x16_t p1s = FlipSign(p1); | 516 const int8x16_t p1s = FlipSign(p1); |
504 const int8x16_t p0s = FlipSign(p0); | 517 const int8x16_t p0s = FlipSign(p0); |
505 const int8x16_t q0s = FlipSign(q0); | 518 const int8x16_t q0s = FlipSign(q0); |
506 const int8x16_t q1s = FlipSign(q1); | 519 const int8x16_t q1s = FlipSign(q1); |
507 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); | 520 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); |
508 const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); | 521 const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); |
(...skipping 110 matching lines...)
619 "vswp d5, d24 \n" | 632 "vswp d5, d24 \n" |
620 STORE8x2(d4, d5, [%[p]], %[stride]) | 633 STORE8x2(d4, d5, [%[p]], %[stride]) |
621 STORE8x2(d24, d25, [%[p]], %[stride]) | 634 STORE8x2(d24, d25, [%[p]], %[stride]) |
622 | 635 |
623 : [p] "+r"(p) | 636 : [p] "+r"(p) |
624 : [stride] "r"(stride), [thresh] "r"(thresh) | 637 : [stride] "r"(stride), [thresh] "r"(thresh) |
625 : "memory", "r4", "r5", "r6", QRegs | 638 : "memory", "r4", "r5", "r6", QRegs |
626 ); | 639 ); |
627 } | 640 } |
628 | 641 |
629 #endif // USE_INTRINSICS | 642 #endif // WEBP_USE_INTRINSICS |
630 | 643 |
631 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { | 644 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { |
632 uint32_t k; | 645 uint32_t k; |
633 for (k = 3; k != 0; --k) { | 646 for (k = 3; k != 0; --k) { |
634 p += 4 * stride; | 647 p += 4 * stride; |
635 SimpleVFilter16(p, stride, thresh); | 648 SimpleVFilter16(p, stride, thresh); |
636 } | 649 } |
637 } | 650 } |
638 | 651 |
639 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { | 652 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { |
(...skipping 74 matching lines...)
714 int8x16_t p0s = FlipSign(p0); | 727 int8x16_t p0s = FlipSign(p0); |
715 int8x16_t q0s = FlipSign(q0); | 728 int8x16_t q0s = FlipSign(q0); |
716 const int8x16_t q1s = FlipSign(q1); | 729 const int8x16_t q1s = FlipSign(q1); |
717 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); | 730 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); |
718 | 731 |
719 // do_filter2 part (simple loopfilter on pixels with hev) | 732 // do_filter2 part (simple loopfilter on pixels with hev) |
720 { | 733 { |
721 const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s); | 734 const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s); |
722 const int8x16_t simple_lf_delta = | 735 const int8x16_t simple_lf_delta = |
723 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask)); | 736 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask)); |
724 uint8x16_t tmp_p0, tmp_q0; | 737 ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s); |
725 ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0); | |
726 // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here | |
727 p0s = FlipSign(tmp_p0); | |
728 q0s = FlipSign(tmp_q0); | |
729 } | 738 } |
730 | 739 |
731 // do_filter4 part (complex loopfilter on pixels without hev) | 740 // do_filter4 part (complex loopfilter on pixels without hev) |
732 { | 741 { |
733 const int8x16_t delta0 = GetBaseDelta0(p0s, q0s); | 742 const int8x16_t delta0 = GetBaseDelta0(p0s, q0s); |
734 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask | 743 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask |
735 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); | 744 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); |
736 const int8x16_t complex_lf_delta = | 745 const int8x16_t complex_lf_delta = |
737 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); | 746 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); |
738 ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1); | 747 ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1); |
(...skipping 51 matching lines...)
790 int8x16_t q0s = FlipSign(q0); | 799 int8x16_t q0s = FlipSign(q0); |
791 const int8x16_t q1s = FlipSign(q1); | 800 const int8x16_t q1s = FlipSign(q1); |
792 const int8x16_t q2s = FlipSign(q2); | 801 const int8x16_t q2s = FlipSign(q2); |
793 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); | 802 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); |
794 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); | 803 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); |
795 | 804 |
796 // do_filter2 part (simple loopfilter on pixels with hev) | 805 // do_filter2 part (simple loopfilter on pixels with hev) |
797 { | 806 { |
798 const int8x16_t simple_lf_delta = | 807 const int8x16_t simple_lf_delta = |
799 vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask)); | 808 vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask)); |
800 uint8x16_t tmp_p0, tmp_q0; | 809 ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s); |
801 ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0); | |
802 // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here | |
803 p0s = FlipSign(tmp_p0); | |
804 q0s = FlipSign(tmp_q0); | |
805 } | 810 } |
806 | 811 |
807 // do_filter6 part (complex loopfilter on pixels without hev) | 812 // do_filter6 part (complex loopfilter on pixels without hev) |
808 { | 813 { |
809 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask | 814 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask |
810 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); | 815 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); |
811 const int8x16_t complex_lf_delta = | 816 const int8x16_t complex_lf_delta = |
812 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); | 817 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); |
813 ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta, | 818 ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta, |
814 op2, op1, op0, oq0, oq1, oq2); | 819 op2, op1, op0, oq0, oq1, oq2); |
(...skipping 164 matching lines...)
979 | 984 |
980 // libwebp uses a trick to avoid some extra addition that libvpx does. | 985 // libwebp uses a trick to avoid some extra addition that libvpx does. |
981 // Instead of: | 986 // Instead of: |
982 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); | 987 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); |
983 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the | 988 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the |
984 // same issue with kC1 and vqdmulh that we work around by down-shifting kC2. | 989 // same issue with kC1 and vqdmulh that we work around by down-shifting kC2. |
985 | 990 |
986 static const int16_t kC1 = 20091; | 991 static const int16_t kC1 = 20091; |
987 static const int16_t kC2 = 17734; // half of the canonical kC2 (35468). See comment above. | 992 static const int16_t kC2 = 17734; // half of the canonical kC2 (35468). See comment above. |
988 | 993 |
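As a hedged aside on the constants (a scalar sketch under stated assumptions, not shipped code): vqdmulh_s16 returns the high half of a doubled product, (2 * a * b) >> 16 with saturation. Storing kC2 halved lets that doubling restore the canonical 35468 multiplier, which would not fit in a signed 16-bit lane.

static int16_t MulKC2(int16_t a) {
  // Scalar reference; the NEON form would be vqdmulhq_n_s16(v, 17734),
  // i.e. (2 * a * 17734) >> 16 == (a * 35468) >> 16.
  return (int16_t)(((int32_t)a * 35468) >> 16);
}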
989 #if defined(USE_INTRINSICS) | 994 #if defined(WEBP_USE_INTRINSICS) |
990 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, | 995 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, |
991 int16x8x2_t* const out) { | 996 int16x8x2_t* const out) { |
992 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 | 997 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 |
993 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 | 998 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 |
994 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... | 999 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... |
995 // b0 d0 b1 d1 b2 d2 ... | 1000 // b0 d0 b1 d1 b2 d2 ... |
996 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); | 1001 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); |
997 } | 1002 } |
998 | 1003 |
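For reference, a scalar model of what the two zips achieve (a sketch; the comments in Transpose8x2() show the lane movement): the first vzipq_s16 interleaves the two input rows, and zipping the halves of that result yields the transposed columns.

static void Transpose4x4Model(const int16_t in[4][4], int16_t out[4][4]) {
  // Net effect of the two vzipq_s16 steps on the packed 4x4 block.
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) out[c][r] = in[r][c];
  }
}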
999 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { | 1004 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { |
(...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1156 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" | 1161 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" |
1157 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" | 1162 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" |
1158 "vst1.32 d1[1], [%[dst]] \n" | 1163 "vst1.32 d1[1], [%[dst]] \n" |
1159 | 1164 |
1160 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ | 1165 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ |
1161 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ | 1166 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ |
1162 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ | 1167 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ |
1163 ); | 1168 ); |
1164 } | 1169 } |
1165 | 1170 |
1166 #endif // USE_INTRINSICS | 1171 #endif // WEBP_USE_INTRINSICS |
1167 | 1172 |
1168 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { | 1173 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { |
1169 TransformOne(in, dst); | 1174 TransformOne(in, dst); |
1170 if (do_two) { | 1175 if (do_two) { |
1171 TransformOne(in + 16, dst + 4); | 1176 TransformOne(in + 16, dst + 4); |
1172 } | 1177 } |
1173 } | 1178 } |
1174 | 1179 |
1175 static void TransformDC(const int16_t* in, uint8_t* dst) { | 1180 static void TransformDC(const int16_t* in, uint8_t* dst) { |
1176 const int16x8_t DC = vdupq_n_s16(in[0]); | 1181 const int16x8_t DC = vdupq_n_s16(in[0]); |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1234 } | 1239 } |
1235 | 1240 |
1236 #undef STORE_WHT | 1241 #undef STORE_WHT |
1237 | 1242 |
1238 //------------------------------------------------------------------------------ | 1243 //------------------------------------------------------------------------------ |
1239 | 1244 |
1240 #define MUL(a, b) (((a) * (b)) >> 16) | 1245 #define MUL(a, b) (((a) * (b)) >> 16) |
1241 static void TransformAC3(const int16_t* in, uint8_t* dst) { | 1246 static void TransformAC3(const int16_t* in, uint8_t* dst) { |
1242 static const int kC1_full = 20091 + (1 << 16); | 1247 static const int kC1_full = 20091 + (1 << 16); |
1243 static const int kC2_full = 35468; | 1248 static const int kC2_full = 35468; |
1244 const int16x4_t A = vdup_n_s16(in[0]); | 1249 const int16x4_t A = vld1_dup_s16(in); |
1245 const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full)); | 1250 const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full)); |
1246 const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full)); | 1251 const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full)); |
1247 const int c1 = MUL(in[1], kC2_full); | 1252 const int c1 = MUL(in[1], kC2_full); |
1248 const int d1 = MUL(in[1], kC1_full); | 1253 const int d1 = MUL(in[1], kC1_full); |
1249 const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 | | 1254 const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 | |
1250 (uint64_t)( c1 & 0xffff) << 16 | | 1255 (uint64_t)( c1 & 0xffff) << 16 | |
1251 (uint64_t)(-c1 & 0xffff) << 32 | | 1256 (uint64_t)(-c1 & 0xffff) << 32 | |
1252 (uint64_t)(-d1 & 0xffff) << 48; | 1257 (uint64_t)(-d1 & 0xffff) << 48; |
1253 const int16x4_t CD = vcreate_s16(cd); | 1258 const int16x4_t CD = vcreate_s16(cd); |
1254 const int16x4_t B = vqadd_s16(A, CD); | 1259 const int16x4_t B = vqadd_s16(A, CD); |
1255 const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4)); | 1260 const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4)); |
1256 const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4)); | 1261 const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4)); |
1257 Add4x4(m0_m1, m2_m3, dst); | 1262 Add4x4(m0_m1, m2_m3, dst); |
1258 } | 1263 } |
1259 #undef MUL | 1264 #undef MUL |
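Note how kC1_full folds the '+ (1 << 16)' from the comment near kC1 into the constant: under the arithmetic right shift this file already relies on, MUL(x, kC1_full) equals x + ((x * 20091) >> 16), saving a separate addition. A small model (widened to 64 bits to sidestep overflow; illustrative only):

static int MulKC1Full(int x) {
  // The (x << 16) term contributes exactly 'x' after the >> 16,
  // so the extra addition is absorbed into the multiply.
  return (int)(((int64_t)x * (20091 + (1 << 16))) >> 16);
}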
1260 | 1265 |
1261 #endif // WEBP_USE_NEON | 1266 //------------------------------------------------------------------------------ |
| 1267 // 4x4 |
| 1268 |
| 1269 static void DC4(uint8_t* dst) { // DC |
| 1270 const uint8x8_t A = vld1_u8(dst - BPS); // top row |
| 1271 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top |
| 1272 const uint16x4_t p1 = vpadd_u16(p0, p0); |
| 1273 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); |
| 1274 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); |
| 1275 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); |
| 1276 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); |
| 1277 const uint16x8_t s0 = vaddq_u16(L0, L1); |
| 1278 const uint16x8_t s1 = vaddq_u16(L2, L3); |
| 1279 const uint16x8_t s01 = vaddq_u16(s0, s1); |
| 1280 const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1)); |
| 1281 const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3 |
| 1282 const uint8x8_t dc = vdup_lane_u8(dc0, 0); |
| 1283 int i; |
| 1284 for (i = 0; i < 4; ++i) { |
| 1285 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0); |
| 1286 } |
| 1287 } |
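A scalar model of DC4 (sketch; same BPS stride convention as the file): the DC value is the rounded mean of the four top and four left neighbours, replicated over the 4x4 block.

static void DC4_Model(uint8_t* dst) {
  int sum = 4;  // rounder for the final >> 3
  int i, j;
  for (i = 0; i < 4; ++i) sum += dst[i - BPS] + dst[-1 + i * BPS];
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) dst[i + j * BPS] = (uint8_t)(sum >> 3);
  }
}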
| 1288 |
| 1289 // TrueMotion (4x4 + 8x8) |
| 1290 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { |
| 1291 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' |
| 1292 const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]' |
| 1293 const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1] |
| 1294 int y; |
| 1295 for (y = 0; y < size; y += 4) { |
| 1296 // left edge |
| 1297 const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1)); |
| 1298 const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1)); |
| 1299 const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1)); |
| 1300 const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1)); |
| 1301 const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1] |
| 1302 const int16x8_t r1 = vaddq_s16(L1, d); |
| 1303 const int16x8_t r2 = vaddq_s16(L2, d); |
| 1304 const int16x8_t r3 = vaddq_s16(L3, d); |
| 1305 // Saturate and store the result. |
| 1306 const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0)); |
| 1307 const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1)); |
| 1308 const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2)); |
| 1309 const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3)); |
| 1310 if (size == 4) { |
| 1311 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0); |
| 1312 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0); |
| 1313 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0); |
| 1314 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0); |
| 1315 } else { |
| 1316 vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32); |
| 1317 vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32); |
| 1318 vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32); |
| 1319 vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32); |
| 1320 } |
| 1321 dst += 4 * BPS; |
| 1322 } |
| 1323 } |
| 1324 |
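A hedged scalar model of TrueMotion() (sketch only): each output pixel is the row's left pixel plus the column's top pixel minus the top-left, clamped to 8 bits, exactly the L[r] + A[c] - A[-1] noted in the comments.

static void TrueMotion_Model(uint8_t* dst, int size) {
  const uint8_t* const top = dst - BPS;  // A[0..size-1], with A[-1] at top[-1]
  int x, y;
  for (y = 0; y < size; ++y) {
    const int base = dst[-1] - top[-1];  // L[r] - A[-1]
    for (x = 0; x < size; ++x) {
      const int v = base + top[x];
      dst[x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    dst += BPS;
  }
}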
| 1325 static void TM4(uint8_t* dst) { TrueMotion(dst, 4); } |
| 1326 |
| 1327 static void VE4(uint8_t* dst) { // vertical |
| 1328 // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS. |
| 1329 const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row |
| 1330 const uint64x1_t A1 = vshr_n_u64(A0, 8); |
| 1331 const uint64x1_t A2 = vshr_n_u64(A0, 16); |
| 1332 const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); |
| 1333 const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); |
| 1334 const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); |
| 1335 const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00); |
| 1336 const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0); |
| 1337 int i; |
| 1338 for (i = 0; i < 4; ++i) { |
| 1339 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0); |
| 1340 } |
| 1341 } |
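The vhadd_u8/vrhadd_u8 pair in VE4() is an exact 3-tap smoother: a truncating (a + c) >> 1 followed by a rounding half-add against b reproduces (a + 2*b + c + 2) >> 2 with no bias, because the even intermediate can never carry across the final shift. As a reference model:

static uint8_t Avg3(uint8_t a, uint8_t b, uint8_t c) {
  return (uint8_t)((a + 2 * b + c + 2) >> 2);  // == vrhadd(vhadd(a, c), b)
}

RD4() and LD4() below apply the same average to byte-shifted copies of the edge pixels.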
| 1342 |
| 1343 static void RD4(uint8_t* dst) { // Down-right |
| 1344 const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1); |
| 1345 const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); |
| 1346 const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); |
| 1347 const uint32_t I = dst[-1 + 0 * BPS]; |
| 1348 const uint32_t J = dst[-1 + 1 * BPS]; |
| 1349 const uint32_t K = dst[-1 + 2 * BPS]; |
| 1350 const uint32_t L = dst[-1 + 3 * BPS]; |
| 1351 const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24)); |
| 1352 const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); |
| 1353 const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); |
| 1354 const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); |
| 1355 const uint8_t D = vget_lane_u8(XABCD_u8, 4); |
| 1356 const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); |
| 1357 const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); |
| 1358 const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); |
| 1359 const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); |
| 1360 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); |
| 1361 const uint32x2_t r3 = vreinterpret_u32_u8(avg2); |
| 1362 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); |
| 1363 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); |
| 1364 const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); |
| 1365 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0); |
| 1366 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0); |
| 1367 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0); |
| 1368 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0); |
| 1369 } |
| 1370 |
| 1371 static void LD4(uint8_t* dst) { // Down-left |
| 1372 // Note: using the same shift trick as VE4() is slower here. |
| 1373 const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0); |
| 1374 const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1); |
| 1375 const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2); |
| 1376 const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6); |
| 1377 const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0); |
| 1378 const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); |
| 1379 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); |
| 1380 const uint32x2_t r0 = vreinterpret_u32_u8(avg2); |
| 1381 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); |
| 1382 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); |
| 1383 const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); |
| 1384 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0); |
| 1385 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0); |
| 1386 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0); |
| 1387 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0); |
| 1388 } |
| 1389 |
| 1390 //------------------------------------------------------------------------------ |
| 1391 // Chroma |
| 1392 |
| 1393 static void VE8uv(uint8_t* dst) { // vertical |
| 1394 const uint8x8_t top = vld1_u8(dst - BPS); |
| 1395 int j; |
| 1396 for (j = 0; j < 8; ++j) { |
| 1397 vst1_u8(dst + j * BPS, top); |
| 1398 } |
| 1399 } |
| 1400 |
| 1401 static void HE8uv(uint8_t* dst) { // horizontal |
| 1402 int j; |
| 1403 for (j = 0; j < 8; ++j) { |
| 1404 const uint8x8_t left = vld1_dup_u8(dst - 1); |
| 1405 vst1_u8(dst, left); |
| 1406 dst += BPS; |
| 1407 } |
| 1408 } |
| 1409 |
| 1410 static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) { |
| 1411 uint16x8_t sum_top; |
| 1412 uint16x8_t sum_left; |
| 1413 uint8x8_t dc0; |
| 1414 |
| 1415 if (do_top) { |
| 1416 const uint8x8_t A = vld1_u8(dst - BPS); // top row |
| 1417 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top |
| 1418 const uint16x4_t p1 = vpadd_u16(p0, p0); |
| 1419 const uint16x4_t p2 = vpadd_u16(p1, p1); |
| 1420 sum_top = vcombine_u16(p2, p2); |
| 1421 } |
| 1422 |
| 1423 if (do_left) { |
| 1424 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); |
| 1425 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); |
| 1426 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); |
| 1427 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); |
| 1428 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1)); |
| 1429 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1)); |
| 1430 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1)); |
| 1431 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1)); |
| 1432 const uint16x8_t s0 = vaddq_u16(L0, L1); |
| 1433 const uint16x8_t s1 = vaddq_u16(L2, L3); |
| 1434 const uint16x8_t s2 = vaddq_u16(L4, L5); |
| 1435 const uint16x8_t s3 = vaddq_u16(L6, L7); |
| 1436 const uint16x8_t s01 = vaddq_u16(s0, s1); |
| 1437 const uint16x8_t s23 = vaddq_u16(s2, s3); |
| 1438 sum_left = vaddq_u16(s01, s23); |
| 1439 } |
| 1440 |
| 1441 if (do_top && do_left) { |
| 1442 const uint16x8_t sum = vaddq_u16(sum_left, sum_top); |
| 1443 dc0 = vrshrn_n_u16(sum, 4); |
| 1444 } else if (do_top) { |
| 1445 dc0 = vrshrn_n_u16(sum_top, 3); |
| 1446 } else if (do_left) { |
| 1447 dc0 = vrshrn_n_u16(sum_left, 3); |
| 1448 } else { |
| 1449 dc0 = vdup_n_u8(0x80); |
| 1450 } |
| 1451 |
| 1452 { |
| 1453 const uint8x8_t dc = vdup_lane_u8(dc0, 0); |
| 1454 int i; |
| 1455 for (i = 0; i < 8; ++i) { |
| 1456 vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc)); |
| 1457 } |
| 1458 } |
| 1459 } |
| 1460 |
| 1461 static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); } |
| 1462 static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); } |
| 1463 static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); } |
| 1464 static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); } |
| 1465 |
| 1466 static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); } |
| 1467 |
| 1468 //------------------------------------------------------------------------------ |
| 1469 // 16x16 |
| 1470 |
| 1471 static void VE16(uint8_t* dst) { // vertical |
| 1472 const uint8x16_t top = vld1q_u8(dst - BPS); |
| 1473 int j; |
| 1474 for (j = 0; j < 16; ++j) { |
| 1475 vst1q_u8(dst + j * BPS, top); |
| 1476 } |
| 1477 } |
| 1478 |
| 1479 static void HE16(uint8_t* dst) { // horizontal |
| 1480 int j; |
| 1481 for (j = 0; j < 16; ++j) { |
| 1482 const uint8x16_t left = vld1q_dup_u8(dst - 1); |
| 1483 vst1q_u8(dst, left); |
| 1484 dst += BPS; |
| 1485 } |
| 1486 } |
| 1487 |
| 1488 static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) { |
| 1489 uint16x8_t sum_top; |
| 1490 uint16x8_t sum_left; |
| 1491 uint8x8_t dc0; |
| 1492 |
| 1493 if (do_top) { |
| 1494 const uint8x16_t A = vld1q_u8(dst - BPS); // top row |
| 1495 const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top |
| 1496 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); |
| 1497 const uint16x4_t p2 = vpadd_u16(p1, p1); |
| 1498 const uint16x4_t p3 = vpadd_u16(p2, p2); |
| 1499 sum_top = vcombine_u16(p3, p3); |
| 1500 } |
| 1501 |
| 1502 if (do_left) { |
| 1503 int i; |
| 1504 sum_left = vdupq_n_u16(0); |
| 1505 for (i = 0; i < 16; i += 8) { |
| 1506 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1)); |
| 1507 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1)); |
| 1508 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1)); |
| 1509 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1)); |
| 1510 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1)); |
| 1511 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1)); |
| 1512 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1)); |
| 1513 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1)); |
| 1514 const uint16x8_t s0 = vaddq_u16(L0, L1); |
| 1515 const uint16x8_t s1 = vaddq_u16(L2, L3); |
| 1516 const uint16x8_t s2 = vaddq_u16(L4, L5); |
| 1517 const uint16x8_t s3 = vaddq_u16(L6, L7); |
| 1518 const uint16x8_t s01 = vaddq_u16(s0, s1); |
| 1519 const uint16x8_t s23 = vaddq_u16(s2, s3); |
| 1520 const uint16x8_t sum = vaddq_u16(s01, s23); |
| 1521 sum_left = vaddq_u16(sum_left, sum); |
| 1522 } |
| 1523 } |
| 1524 |
| 1525 if (do_top && do_left) { |
| 1526 const uint16x8_t sum = vaddq_u16(sum_left, sum_top); |
| 1527 dc0 = vrshrn_n_u16(sum, 5); |
| 1528 } else if (do_top) { |
| 1529 dc0 = vrshrn_n_u16(sum_top, 4); |
| 1530 } else if (do_left) { |
| 1531 dc0 = vrshrn_n_u16(sum_left, 4); |
| 1532 } else { |
| 1533 dc0 = vdup_n_u8(0x80); |
| 1534 } |
| 1535 |
| 1536 { |
| 1537 const uint8x16_t dc = vdupq_lane_u8(dc0, 0); |
| 1538 int i; |
| 1539 for (i = 0; i < 16; ++i) { |
| 1540 vst1q_u8(dst + i * BPS, dc); |
| 1541 } |
| 1542 } |
| 1543 } |
| 1544 |
| 1545 static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); } |
| 1546 static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); } |
| 1547 static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); } |
| 1548 static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); } |
| 1549 |
| 1550 static void TM16(uint8_t* dst) { |
| 1551 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' |
| 1552 const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]' |
| 1553 // A[c] - A[-1] |
| 1554 const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL)); |
| 1555 const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL)); |
| 1556 int y; |
| 1557 for (y = 0; y < 16; y += 4) { |
| 1558 // left edge |
| 1559 const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1)); |
| 1560 const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1)); |
| 1561 const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1)); |
| 1562 const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1)); |
| 1563 const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1] |
| 1564 const int16x8_t r1_lo = vaddq_s16(L1, d_lo); |
| 1565 const int16x8_t r2_lo = vaddq_s16(L2, d_lo); |
| 1566 const int16x8_t r3_lo = vaddq_s16(L3, d_lo); |
| 1567 const int16x8_t r0_hi = vaddq_s16(L0, d_hi); |
| 1568 const int16x8_t r1_hi = vaddq_s16(L1, d_hi); |
| 1569 const int16x8_t r2_hi = vaddq_s16(L2, d_hi); |
| 1570 const int16x8_t r3_hi = vaddq_s16(L3, d_hi); |
| 1571 // Saturate and store the result. |
| 1572 const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi)); |
| 1573 const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi)); |
| 1574 const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi)); |
| 1575 const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi)); |
| 1576 vst1q_u8(dst + 0 * BPS, row0); |
| 1577 vst1q_u8(dst + 1 * BPS, row1); |
| 1578 vst1q_u8(dst + 2 * BPS, row2); |
| 1579 vst1q_u8(dst + 3 * BPS, row3); |
| 1580 dst += 4 * BPS; |
| 1581 } |
| 1582 } |
1262 | 1583 |
1263 //------------------------------------------------------------------------------ | 1584 //------------------------------------------------------------------------------ |
1264 // Entry point | 1585 // Entry point |
1265 | 1586 |
1266 extern void VP8DspInitNEON(void); | 1587 extern void VP8DspInitNEON(void); |
1267 | 1588 |
1268 void VP8DspInitNEON(void) { | 1589 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) { |
1269 #if defined(WEBP_USE_NEON) | |
1270 VP8Transform = TransformTwo; | 1590 VP8Transform = TransformTwo; |
1271 VP8TransformAC3 = TransformAC3; | 1591 VP8TransformAC3 = TransformAC3; |
1272 VP8TransformDC = TransformDC; | 1592 VP8TransformDC = TransformDC; |
1273 VP8TransformWHT = TransformWHT; | 1593 VP8TransformWHT = TransformWHT; |
1274 | 1594 |
1275 VP8VFilter16 = VFilter16; | 1595 VP8VFilter16 = VFilter16; |
1276 VP8VFilter16i = VFilter16i; | 1596 VP8VFilter16i = VFilter16i; |
1277 VP8HFilter16 = HFilter16; | 1597 VP8HFilter16 = HFilter16; |
1278 #if !defined(WORK_AROUND_GCC) | 1598 #if !defined(WORK_AROUND_GCC) |
1279 VP8HFilter16i = HFilter16i; | 1599 VP8HFilter16i = HFilter16i; |
1280 #endif | 1600 #endif |
1281 VP8VFilter8 = VFilter8; | 1601 VP8VFilter8 = VFilter8; |
1282 VP8VFilter8i = VFilter8i; | 1602 VP8VFilter8i = VFilter8i; |
1283 #if !defined(WORK_AROUND_GCC) | 1603 #if !defined(WORK_AROUND_GCC) |
1284 VP8HFilter8 = HFilter8; | 1604 VP8HFilter8 = HFilter8; |
1285 VP8HFilter8i = HFilter8i; | 1605 VP8HFilter8i = HFilter8i; |
1286 #endif | 1606 #endif |
1287 VP8SimpleVFilter16 = SimpleVFilter16; | 1607 VP8SimpleVFilter16 = SimpleVFilter16; |
1288 VP8SimpleHFilter16 = SimpleHFilter16; | 1608 VP8SimpleHFilter16 = SimpleHFilter16; |
1289 VP8SimpleVFilter16i = SimpleVFilter16i; | 1609 VP8SimpleVFilter16i = SimpleVFilter16i; |
1290 VP8SimpleHFilter16i = SimpleHFilter16i; | 1610 VP8SimpleHFilter16i = SimpleHFilter16i; |
1291 #endif // WEBP_USE_NEON | 1611 |
| 1612 VP8PredLuma4[0] = DC4; |
| 1613 VP8PredLuma4[1] = TM4; |
| 1614 VP8PredLuma4[2] = VE4; |
| 1615 VP8PredLuma4[4] = RD4; |
| 1616 VP8PredLuma4[6] = LD4; |
| 1617 |
| 1618 VP8PredLuma16[0] = DC16TopLeft; |
| 1619 VP8PredLuma16[1] = TM16; |
| 1620 VP8PredLuma16[2] = VE16; |
| 1621 VP8PredLuma16[3] = HE16; |
| 1622 VP8PredLuma16[4] = DC16NoTop; |
| 1623 VP8PredLuma16[5] = DC16NoLeft; |
| 1624 VP8PredLuma16[6] = DC16NoTopLeft; |
| 1625 |
| 1626 VP8PredChroma8[0] = DC8uv; |
| 1627 VP8PredChroma8[1] = TM8uv; |
| 1628 VP8PredChroma8[2] = VE8uv; |
| 1629 VP8PredChroma8[3] = HE8uv; |
| 1630 VP8PredChroma8[4] = DC8uvNoTop; |
| 1631 VP8PredChroma8[5] = DC8uvNoLeft; |
| 1632 VP8PredChroma8[6] = DC8uvNoTopLeft; |
1292 } | 1633 } |
| 1634 |
| 1635 #else // !WEBP_USE_NEON |
| 1636 |
| 1637 WEBP_DSP_INIT_STUB(VP8DspInitNEON) |
| 1638 |
| 1639 #endif // WEBP_USE_NEON |