Chromium Code Reviews

Side by Side Diff: third_party/libwebp/dsp/dec_neon.c

Issue 1546003002: libwebp: update to 0.5.0 (Closed)
Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: rebase (created 4 years, 11 months ago)
1 // Copyright 2012 Google Inc. All Rights Reserved. 1 // Copyright 2012 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // ARM NEON version of dsp functions and loop filtering. 10 // ARM NEON version of dsp functions and loop filtering.
(...skipping 371 matching lines...)
382 vst4_lane_u8(v - 2 + 2 * stride, v0, 2); 382 vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
383 vst4_lane_u8(v - 2 + 3 * stride, v0, 3); 383 vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
384 vst4_lane_u8(v - 2 + 4 * stride, v0, 4); 384 vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
385 vst4_lane_u8(v - 2 + 5 * stride, v0, 5); 385 vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
386 vst4_lane_u8(v - 2 + 6 * stride, v0, 6); 386 vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
387 vst4_lane_u8(v - 2 + 7 * stride, v0, 7); 387 vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
388 } 388 }
389 389
390 #endif // !WORK_AROUND_GCC 390 #endif // !WORK_AROUND_GCC
391 391
392 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t. 392 // Zero extend 'v' to an int16x8_t.
393 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { 393 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
394 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); 394 return vreinterpretq_s16_u16(vmovl_u8(v));
395 } 395 }
396 396
397 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result 397 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
398 // to the corresponding rows of 'dst'. 398 // to the corresponding rows of 'dst'.
399 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, 399 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
400 const int16x8_t dst01, 400 const int16x8_t dst01,
401 const int16x8_t dst23) { 401 const int16x8_t dst23) {
402 // Unsigned saturate to 8b. 402 // Unsigned saturate to 8b.
403 const uint8x8_t dst01_u8 = vqmovun_s16(dst01); 403 const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
404 const uint8x8_t dst23_u8 = vqmovun_s16(dst23); 404 const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
(...skipping 11 matching lines...)
416 uint32x2_t dst23 = vdup_n_u32(0); 416 uint32x2_t dst23 = vdup_n_u32(0);
417 417
418 // Load the source pixels. 418 // Load the source pixels.
419 dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0); 419 dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
420 dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0); 420 dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
421 dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1); 421 dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
422 dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1); 422 dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
423 423
424 { 424 {
425 // Convert to 16b. 425 // Convert to 16b.
426 const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); 426 const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
427 const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); 427 const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));
428 428
429 // Descale with rounding. 429 // Descale with rounding.
430 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); 430 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
431 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); 431 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
432 // Add the inverse transform. 432 // Add the inverse transform.
433 SaturateAndStore4x4(dst, out01, out23); 433 SaturateAndStore4x4(dst, out01, out23);
434 } 434 }
435 } 435 }
436 436
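The block above reconstructs a 4x4 output: it loads the four destination rows, widens them to 16 bits, accumulates the rounded, descaled inverse-transform rows with vrsraq_n_s16 (for a shift of 3 this computes d + ((r + 4) >> 3)), and clamps back to 8 bits in SaturateAndStore4x4(). A minimal scalar sketch of that add-and-clamp step, assuming the two int16x8_t inputs hold rows 0-1 and 2-3 of a 4x4 residual block (the helper name and row layout here are illustrative, not from the patch):

#include <stdint.h>

static void AddResidual4x4Scalar(const int16_t res[16], uint8_t* dst, int bps) {
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      // Rounded descale by 3 bits, add to the predictor, clamp to [0, 255].
      const int v = dst[y * bps + x] + ((res[y * 4 + x] + 4) >> 3);
      dst[y * bps + x] = (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
  }
}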
437 //----------------------------------------------------------------------------- 437 //-----------------------------------------------------------------------------
(...skipping 34 matching lines...)
472 472
473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { 473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
474 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) 474 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
475 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0) 475 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)
476 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0) 476 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)
477 return s2; 477 return s2;
478 } 478 }
479 479
480 //------------------------------------------------------------------------------ 480 //------------------------------------------------------------------------------
481 481
482 static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
483 const int8x16_t delta,
484 int8x16_t* const op0, int8x16_t* const oq0) {
485 const int8x16_t kCst3 = vdupq_n_s8(0x03);
486 const int8x16_t kCst4 = vdupq_n_s8(0x04);
487 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
488 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
489 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
490 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
491 *op0 = vqaddq_s8(p0s, delta3);
492 *oq0 = vqsubq_s8(q0s, delta4);
493 }
494
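ApplyFilter2NoFlip() is the new variant that stays in the sign-flipped (signed) domain, so the callers further down no longer need the extra FlipSign() round trip flagged by the removed TODO. As a hedged illustration, assuming FlipSign()/FlipSignBack() in this file simply XOR each byte with 0x80 to move between unsigned [0, 255] and signed [-128, 127] (their definitions are elided from this diff):

#include <assert.h>
#include <stdint.h>

// Scalar model of the assumed sign-flip trick: XOR with 0x80 maps unsigned
// pixels onto signed values so saturating signed arithmetic can be used, and
// the same XOR maps the filtered result back.
static int8_t flip_sign(uint8_t v) { return (int8_t)(v ^ 0x80); }
static uint8_t flip_sign_back(int8_t v) { return (uint8_t)(v ^ 0x80); }

int main(void) {
  assert(flip_sign(200) == 72);                   // 200 - 128
  assert(flip_sign_back(flip_sign(200)) == 200);  // exact round trip
  return 0;
}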
495 #if defined(WEBP_USE_INTRINSICS)
496
482 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, 497 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
483 const int8x16_t delta, 498 const int8x16_t delta,
484 uint8x16_t* const op0, uint8x16_t* const oq0) { 499 uint8x16_t* const op0, uint8x16_t* const oq0) {
485 const int8x16_t kCst3 = vdupq_n_s8(0x03); 500 const int8x16_t kCst3 = vdupq_n_s8(0x03);
486 const int8x16_t kCst4 = vdupq_n_s8(0x04); 501 const int8x16_t kCst4 = vdupq_n_s8(0x04);
487 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); 502 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
488 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4); 503 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
489 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); 504 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
490 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); 505 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
491 const int8x16_t sp0 = vqaddq_s8(p0s, delta3); 506 const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
492 const int8x16_t sq0 = vqsubq_s8(q0s, delta4); 507 const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
493 *op0 = FlipSignBack(sp0); 508 *op0 = FlipSignBack(sp0);
494 *oq0 = FlipSignBack(sq0); 509 *oq0 = FlipSignBack(sq0);
495 } 510 }
496 511
497 #if defined(USE_INTRINSICS)
498
499 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, 512 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
500 const uint8x16_t q0, const uint8x16_t q1, 513 const uint8x16_t q0, const uint8x16_t q1,
501 const uint8x16_t mask, 514 const uint8x16_t mask,
502 uint8x16_t* const op0, uint8x16_t* const oq0) { 515 uint8x16_t* const op0, uint8x16_t* const oq0) {
503 const int8x16_t p1s = FlipSign(p1); 516 const int8x16_t p1s = FlipSign(p1);
504 const int8x16_t p0s = FlipSign(p0); 517 const int8x16_t p0s = FlipSign(p0);
505 const int8x16_t q0s = FlipSign(q0); 518 const int8x16_t q0s = FlipSign(q0);
506 const int8x16_t q1s = FlipSign(q1); 519 const int8x16_t q1s = FlipSign(q1);
507 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); 520 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
508 const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); 521 const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
(...skipping 110 matching lines...)
619 "vswp d5, d24 \n" 632 "vswp d5, d24 \n"
620 STORE8x2(d4, d5, [%[p]], %[stride]) 633 STORE8x2(d4, d5, [%[p]], %[stride])
621 STORE8x2(d24, d25, [%[p]], %[stride]) 634 STORE8x2(d24, d25, [%[p]], %[stride])
622 635
623 : [p] "+r"(p) 636 : [p] "+r"(p)
624 : [stride] "r"(stride), [thresh] "r"(thresh) 637 : [stride] "r"(stride), [thresh] "r"(thresh)
625 : "memory", "r4", "r5", "r6", QRegs 638 : "memory", "r4", "r5", "r6", QRegs
626 ); 639 );
627 } 640 }
628 641
629 #endif // USE_INTRINSICS 642 #endif // WEBP_USE_INTRINSICS
630 643
631 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { 644 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
632 uint32_t k; 645 uint32_t k;
633 for (k = 3; k != 0; --k) { 646 for (k = 3; k != 0; --k) {
634 p += 4 * stride; 647 p += 4 * stride;
635 SimpleVFilter16(p, stride, thresh); 648 SimpleVFilter16(p, stride, thresh);
636 } 649 }
637 } 650 }
638 651
639 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { 652 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
(...skipping 74 matching lines...)
714 int8x16_t p0s = FlipSign(p0); 727 int8x16_t p0s = FlipSign(p0);
715 int8x16_t q0s = FlipSign(q0); 728 int8x16_t q0s = FlipSign(q0);
716 const int8x16_t q1s = FlipSign(q1); 729 const int8x16_t q1s = FlipSign(q1);
717 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); 730 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
718 731
719 // do_filter2 part (simple loopfilter on pixels with hev) 732 // do_filter2 part (simple loopfilter on pixels with hev)
720 { 733 {
721 const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s); 734 const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
722 const int8x16_t simple_lf_delta = 735 const int8x16_t simple_lf_delta =
723 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask)); 736 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
724 uint8x16_t tmp_p0, tmp_q0; 737 ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
725 ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
726 // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
727 p0s = FlipSign(tmp_p0);
728 q0s = FlipSign(tmp_q0);
729 } 738 }
730 739
731 // do_filter4 part (complex loopfilter on pixels without hev) 740 // do_filter4 part (complex loopfilter on pixels without hev)
732 { 741 {
733 const int8x16_t delta0 = GetBaseDelta0(p0s, q0s); 742 const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
734 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask 743 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
735 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); 744 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
736 const int8x16_t complex_lf_delta = 745 const int8x16_t complex_lf_delta =
737 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); 746 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
738 ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1); 747 ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
(...skipping 51 matching lines...)
790 int8x16_t q0s = FlipSign(q0); 799 int8x16_t q0s = FlipSign(q0);
791 const int8x16_t q1s = FlipSign(q1); 800 const int8x16_t q1s = FlipSign(q1);
792 const int8x16_t q2s = FlipSign(q2); 801 const int8x16_t q2s = FlipSign(q2);
793 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); 802 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
794 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); 803 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
795 804
796 // do_filter2 part (simple loopfilter on pixels with hev) 805 // do_filter2 part (simple loopfilter on pixels with hev)
797 { 806 {
798 const int8x16_t simple_lf_delta = 807 const int8x16_t simple_lf_delta =
799 vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask)); 808 vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
800 uint8x16_t tmp_p0, tmp_q0; 809 ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
801 ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
802 // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
803 p0s = FlipSign(tmp_p0);
804 q0s = FlipSign(tmp_q0);
805 } 810 }
806 811
807 // do_filter6 part (complex loopfilter on pixels without hev) 812 // do_filter6 part (complex loopfilter on pixels without hev)
808 { 813 {
809 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask 814 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
810 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); 815 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
811 const int8x16_t complex_lf_delta = 816 const int8x16_t complex_lf_delta =
812 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); 817 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
813 ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta, 818 ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
814 op2, op1, op0, oq0, oq1, oq2); 819 op2, op1, op0, oq0, oq1, oq2);
(...skipping 164 matching lines...)
979 984
980 // libwebp uses a trick to avoid some extra addition that libvpx does. 985 // libwebp uses a trick to avoid some extra addition that libvpx does.
981 // Instead of: 986 // Instead of:
982 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); 987 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
983 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the 988 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
984 // same issue with kC1 and vqdmulh that we work around by down shifting kC2 989 // same issue with kC1 and vqdmulh that we work around by down shifting kC2
985 990
986 static const int16_t kC1 = 20091; 991 static const int16_t kC1 = 20091;
987 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. 992 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
988 993
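The comment above is why kC2 is stored at half its nominal value: vqdmulh multiplies, doubles, and keeps the high half, roughly (2 * a * k) >> 16, so storing 35468 / 2 recovers the plain (a * 35468) >> 16 product, whereas kC1 + (1 << 16) = 85627 does not fit in an int16_t at all and that term is handled differently (details are in the elided TransformPass()). A small scalar check of the kC2 identity, illustrative only, with saturation ignored:

#include <assert.h>
#include <stdint.h>

// Scalar stand-in for vqdmulh_s16: multiply, double, keep the high 16 bits
// (saturation, which only matters for -32768 * -32768, is omitted here).
static int16_t qdmulh16(int16_t a, int16_t k) {
  return (int16_t)((2 * (int32_t)a * k) >> 16);
}

int main(void) {
  const int16_t a = 1234;  // arbitrary positive coefficient
  // Storing kC2 / 2 = 17734 lets the doubling of vqdmulh restore the real
  // constant: (2 * a * 17734) >> 16 == (a * 35468) >> 16.
  assert(qdmulh16(a, 17734) == (int16_t)(((int32_t)a * 35468) >> 16));
  // kC1 + (1 << 16) = 85627 exceeds INT16_MAX, so the same halving trick
  // cannot be applied to kC1 directly; see the comment above.
  return 0;
}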
989 #if defined(USE_INTRINSICS) 994 #if defined(WEBP_USE_INTRINSICS)
990 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, 995 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
991 int16x8x2_t* const out) { 996 int16x8x2_t* const out) {
992 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 997 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
993 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 998 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
994 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... 999 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
995 // b0 d0 b1 d1 b2 d2 ... 1000 // b0 d0 b1 d1 b2 d2 ...
996 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); 1001 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
997 } 1002 }
998 1003
999 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { 1004 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
(...skipping 156 matching lines...)
1156 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 1161 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
1157 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 1162 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
1158 "vst1.32 d1[1], [%[dst]] \n" 1163 "vst1.32 d1[1], [%[dst]] \n"
1159 1164
1160 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ 1165 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
1161 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ 1166 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
1162 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ 1167 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */
1163 ); 1168 );
1164 } 1169 }
1165 1170
1166 #endif // USE_INTRINSICS 1171 #endif // WEBP_USE_INTRINSICS
1167 1172
1168 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { 1173 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
1169 TransformOne(in, dst); 1174 TransformOne(in, dst);
1170 if (do_two) { 1175 if (do_two) {
1171 TransformOne(in + 16, dst + 4); 1176 TransformOne(in + 16, dst + 4);
1172 } 1177 }
1173 } 1178 }
1174 1179
1175 static void TransformDC(const int16_t* in, uint8_t* dst) { 1180 static void TransformDC(const int16_t* in, uint8_t* dst) {
1176 const int16x8_t DC = vdupq_n_s16(in[0]); 1181 const int16x8_t DC = vdupq_n_s16(in[0]);
(...skipping 57 matching lines...)
1234 } 1239 }
1235 1240
1236 #undef STORE_WHT 1241 #undef STORE_WHT
1237 1242
1238 //------------------------------------------------------------------------------ 1243 //------------------------------------------------------------------------------
1239 1244
1240 #define MUL(a, b) (((a) * (b)) >> 16) 1245 #define MUL(a, b) (((a) * (b)) >> 16)
1241 static void TransformAC3(const int16_t* in, uint8_t* dst) { 1246 static void TransformAC3(const int16_t* in, uint8_t* dst) {
1242 static const int kC1_full = 20091 + (1 << 16); 1247 static const int kC1_full = 20091 + (1 << 16);
1243 static const int kC2_full = 35468; 1248 static const int kC2_full = 35468;
1244 const int16x4_t A = vdup_n_s16(in[0]); 1249 const int16x4_t A = vld1_dup_s16(in);
1245 const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full)); 1250 const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
1246 const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full)); 1251 const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
1247 const int c1 = MUL(in[1], kC2_full); 1252 const int c1 = MUL(in[1], kC2_full);
1248 const int d1 = MUL(in[1], kC1_full); 1253 const int d1 = MUL(in[1], kC1_full);
1249 const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 | 1254 const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 |
1250 (uint64_t)( c1 & 0xffff) << 16 | 1255 (uint64_t)( c1 & 0xffff) << 16 |
1251 (uint64_t)(-c1 & 0xffff) << 32 | 1256 (uint64_t)(-c1 & 0xffff) << 32 |
1252 (uint64_t)(-d1 & 0xffff) << 48; 1257 (uint64_t)(-d1 & 0xffff) << 48;
1253 const int16x4_t CD = vcreate_s16(cd); 1258 const int16x4_t CD = vcreate_s16(cd);
1254 const int16x4_t B = vqadd_s16(A, CD); 1259 const int16x4_t B = vqadd_s16(A, CD);
1255 const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4)); 1260 const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
1256 const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4)); 1261 const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
1257 Add4x4(m0_m1, m2_m3, dst); 1262 Add4x4(m0_m1, m2_m3, dst);
1258 } 1263 }
1259 #undef MUL 1264 #undef MUL
1260 1265
1261 #endif // WEBP_USE_NEON 1266 //------------------------------------------------------------------------------
1267 // 4x4
1268
1269 static void DC4(uint8_t* dst) { // DC
1270 const uint8x8_t A = vld1_u8(dst - BPS); // top row
1271 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
1272 const uint16x4_t p1 = vpadd_u16(p0, p0);
1273 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
1274 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
1275 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
1276 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
1277 const uint16x8_t s0 = vaddq_u16(L0, L1);
1278 const uint16x8_t s1 = vaddq_u16(L2, L3);
1279 const uint16x8_t s01 = vaddq_u16(s0, s1);
1280 const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
1281 const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
1282 const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1283 int i;
1284 for (i = 0; i < 4; ++i) {
1285 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
1286 }
1287 }
1288
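DC4() above computes the rounded average of the four pixels above the block and the four pixels to its left (the "(sum + 4) >> 3" noted next to vrshrn_n_u16) and fills the 4x4 block with it. A scalar sketch of the same predictor for the case where both the top row and the left column are available (the helper name is illustrative):

#include <stdint.h>
#include <string.h>

static void DC4Scalar(uint8_t* dst, int bps) {  // bps: bytes per scanline
  int i, sum = 0;
  for (i = 0; i < 4; ++i) sum += dst[i - bps];      // four pixels above
  for (i = 0; i < 4; ++i) sum += dst[i * bps - 1];  // four pixels to the left
  {
    const uint8_t dc = (uint8_t)((sum + 4) >> 3);   // rounded average
    for (i = 0; i < 4; ++i) memset(dst + i * bps, dc, 4);
  }
}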
1289 // TrueMotion (4x4 + 8x8)
1290 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
1291 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
1292 const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
1293 const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1]
1294 int y;
1295 for (y = 0; y < size; y += 4) {
1296 // left edge
1297 const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
1298 const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
1299 const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
1300 const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
1301 const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1]
1302 const int16x8_t r1 = vaddq_s16(L1, d);
1303 const int16x8_t r2 = vaddq_s16(L2, d);
1304 const int16x8_t r3 = vaddq_s16(L3, d);
1305 // Saturate and store the result.
1306 const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
1307 const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
1308 const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
1309 const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
1310 if (size == 4) {
1311 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
1312 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
1313 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
1314 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
1315 } else {
1316 vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
1317 vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
1318 vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
1319 vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
1320 }
1321 dst += 4 * BPS;
1322 }
1323 }
1324
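TrueMotion() above vectorizes the TM rule spelled out in its comments: each output pixel is the left-edge pixel of its row plus the top-row pixel of its column minus the top-left pixel, clamped to 8 bits (the vqmovun_s16 step). A scalar sketch following those same comments (the clip helper is illustrative):

#include <stdint.h>

static uint8_t clip_8b(int v) { return (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v; }

// dst[y][x] = clip(L[y] + A[x] - A[-1]), using the notation of the comments.
static void TrueMotionScalar(uint8_t* dst, int bps, int size) {
  const uint8_t* const top = dst - bps;  // row 'A[0..size-1]'
  const int top_left = top[-1];          // pixel 'A[-1]'
  int x, y;
  for (y = 0; y < size; ++y) {
    const int left = dst[y * bps - 1];   // pixel 'L[y]'
    for (x = 0; x < size; ++x) {
      dst[y * bps + x] = clip_8b(left + top[x] - top_left);
    }
  }
}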
1325 static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
1326
1327 static void VE4(uint8_t* dst) { // vertical
1328 // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
1329 const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row
1330 const uint64x1_t A1 = vshr_n_u64(A0, 8);
1331 const uint64x1_t A2 = vshr_n_u64(A0, 16);
1332 const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
1333 const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
1334 const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
1335 const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
1336 const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
1337 int i;
1338 for (i = 0; i < 4; ++i) {
1339 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
1340 }
1341 }
1342
1343 static void RD4(uint8_t* dst) { // Down-right
1344 const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
1345 const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
1346 const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
1347 const uint32_t I = dst[-1 + 0 * BPS];
1348 const uint32_t J = dst[-1 + 1 * BPS];
1349 const uint32_t K = dst[-1 + 2 * BPS];
1350 const uint32_t L = dst[-1 + 3 * BPS];
1351 const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
1352 const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
1353 const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
1354 const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
1355 const uint8_t D = vget_lane_u8(XABCD_u8, 4);
1356 const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
1357 const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
1358 const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
1359 const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
1360 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1361 const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
1362 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1363 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1364 const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1365 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1366 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1367 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1368 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1369 }
1370
1371 static void LD4(uint8_t* dst) { // Down-left
1372 // Note using the same shift trick as VE4() is slower here.
1373 const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
1374 const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
1375 const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
1376 const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
1377 const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
1378 const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
1379 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1380 const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
1381 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1382 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1383 const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1384 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1385 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1386 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1387 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1388 }
1389
1390 //------------------------------------------------------------------------------
1391 // Chroma
1392
1393 static void VE8uv(uint8_t* dst) { // vertical
1394 const uint8x8_t top = vld1_u8(dst - BPS);
1395 int j;
1396 for (j = 0; j < 8; ++j) {
1397 vst1_u8(dst + j * BPS, top);
1398 }
1399 }
1400
1401 static void HE8uv(uint8_t* dst) { // horizontal
1402 int j;
1403 for (j = 0; j < 8; ++j) {
1404 const uint8x8_t left = vld1_dup_u8(dst - 1);
1405 vst1_u8(dst, left);
1406 dst += BPS;
1407 }
1408 }
1409
1410 static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
1411 uint16x8_t sum_top;
1412 uint16x8_t sum_left;
1413 uint8x8_t dc0;
1414
1415 if (do_top) {
1416 const uint8x8_t A = vld1_u8(dst - BPS); // top row
1417 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
1418 const uint16x4_t p1 = vpadd_u16(p0, p0);
1419 const uint16x4_t p2 = vpadd_u16(p1, p1);
1420 sum_top = vcombine_u16(p2, p2);
1421 }
1422
1423 if (do_left) {
1424 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
1425 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
1426 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
1427 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
1428 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
1429 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
1430 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
1431 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
1432 const uint16x8_t s0 = vaddq_u16(L0, L1);
1433 const uint16x8_t s1 = vaddq_u16(L2, L3);
1434 const uint16x8_t s2 = vaddq_u16(L4, L5);
1435 const uint16x8_t s3 = vaddq_u16(L6, L7);
1436 const uint16x8_t s01 = vaddq_u16(s0, s1);
1437 const uint16x8_t s23 = vaddq_u16(s2, s3);
1438 sum_left = vaddq_u16(s01, s23);
1439 }
1440
1441 if (do_top && do_left) {
1442 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1443 dc0 = vrshrn_n_u16(sum, 4);
1444 } else if (do_top) {
1445 dc0 = vrshrn_n_u16(sum_top, 3);
1446 } else if (do_left) {
1447 dc0 = vrshrn_n_u16(sum_left, 3);
1448 } else {
1449 dc0 = vdup_n_u8(0x80);
1450 }
1451
1452 {
1453 const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1454 int i;
1455 for (i = 0; i < 8; ++i) {
1456 vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
1457 }
1458 }
1459 }
1460
1461 static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
1462 static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
1463 static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
1464 static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }
1465
1466 static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
1467
1468 //------------------------------------------------------------------------------
1469 // 16x16
1470
1471 static void VE16(uint8_t* dst) { // vertical
1472 const uint8x16_t top = vld1q_u8(dst - BPS);
1473 int j;
1474 for (j = 0; j < 16; ++j) {
1475 vst1q_u8(dst + j * BPS, top);
1476 }
1477 }
1478
1479 static void HE16(uint8_t* dst) { // horizontal
1480 int j;
1481 for (j = 0; j < 16; ++j) {
1482 const uint8x16_t left = vld1q_dup_u8(dst - 1);
1483 vst1q_u8(dst, left);
1484 dst += BPS;
1485 }
1486 }
1487
1488 static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
1489 uint16x8_t sum_top;
1490 uint16x8_t sum_left;
1491 uint8x8_t dc0;
1492
1493 if (do_top) {
1494 const uint8x16_t A = vld1q_u8(dst - BPS); // top row
1495 const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
1496 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
1497 const uint16x4_t p2 = vpadd_u16(p1, p1);
1498 const uint16x4_t p3 = vpadd_u16(p2, p2);
1499 sum_top = vcombine_u16(p3, p3);
1500 }
1501
1502 if (do_left) {
1503 int i;
1504 sum_left = vdupq_n_u16(0);
1505 for (i = 0; i < 16; i += 8) {
1506 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
1507 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
1508 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
1509 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
1510 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
1511 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
1512 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
1513 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
1514 const uint16x8_t s0 = vaddq_u16(L0, L1);
1515 const uint16x8_t s1 = vaddq_u16(L2, L3);
1516 const uint16x8_t s2 = vaddq_u16(L4, L5);
1517 const uint16x8_t s3 = vaddq_u16(L6, L7);
1518 const uint16x8_t s01 = vaddq_u16(s0, s1);
1519 const uint16x8_t s23 = vaddq_u16(s2, s3);
1520 const uint16x8_t sum = vaddq_u16(s01, s23);
1521 sum_left = vaddq_u16(sum_left, sum);
1522 }
1523 }
1524
1525 if (do_top && do_left) {
1526 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1527 dc0 = vrshrn_n_u16(sum, 5);
1528 } else if (do_top) {
1529 dc0 = vrshrn_n_u16(sum_top, 4);
1530 } else if (do_left) {
1531 dc0 = vrshrn_n_u16(sum_left, 4);
1532 } else {
1533 dc0 = vdup_n_u8(0x80);
1534 }
1535
1536 {
1537 const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
1538 int i;
1539 for (i = 0; i < 16; ++i) {
1540 vst1q_u8(dst + i * BPS, dc);
1541 }
1542 }
1543 }
1544
1545 static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
1546 static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
1547 static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
1548 static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }
1549
1550 static void TM16(uint8_t* dst) {
1551 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
1552 const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'
1553 // A[c] - A[-1]
1554 const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
1555 const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
1556 int y;
1557 for (y = 0; y < 16; y += 4) {
1558 // left edge
1559 const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
1560 const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
1561 const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
1562 const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
1563 const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1]
1564 const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
1565 const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
1566 const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
1567 const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
1568 const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
1569 const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
1570 const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
1571 // Saturate and store the result.
1572 const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
1573 const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
1574 const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
1575 const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
1576 vst1q_u8(dst + 0 * BPS, row0);
1577 vst1q_u8(dst + 1 * BPS, row1);
1578 vst1q_u8(dst + 2 * BPS, row2);
1579 vst1q_u8(dst + 3 * BPS, row3);
1580 dst += 4 * BPS;
1581 }
1582 }
1262 1583
1263 //------------------------------------------------------------------------------ 1584 //------------------------------------------------------------------------------
1264 // Entry point 1585 // Entry point
1265 1586
1266 extern void VP8DspInitNEON(void); 1587 extern void VP8DspInitNEON(void);
1267 1588
1268 void VP8DspInitNEON(void) { 1589 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
1269 #if defined(WEBP_USE_NEON)
1270 VP8Transform = TransformTwo; 1590 VP8Transform = TransformTwo;
1271 VP8TransformAC3 = TransformAC3; 1591 VP8TransformAC3 = TransformAC3;
1272 VP8TransformDC = TransformDC; 1592 VP8TransformDC = TransformDC;
1273 VP8TransformWHT = TransformWHT; 1593 VP8TransformWHT = TransformWHT;
1274 1594
1275 VP8VFilter16 = VFilter16; 1595 VP8VFilter16 = VFilter16;
1276 VP8VFilter16i = VFilter16i; 1596 VP8VFilter16i = VFilter16i;
1277 VP8HFilter16 = HFilter16; 1597 VP8HFilter16 = HFilter16;
1278 #if !defined(WORK_AROUND_GCC) 1598 #if !defined(WORK_AROUND_GCC)
1279 VP8HFilter16i = HFilter16i; 1599 VP8HFilter16i = HFilter16i;
1280 #endif 1600 #endif
1281 VP8VFilter8 = VFilter8; 1601 VP8VFilter8 = VFilter8;
1282 VP8VFilter8i = VFilter8i; 1602 VP8VFilter8i = VFilter8i;
1283 #if !defined(WORK_AROUND_GCC) 1603 #if !defined(WORK_AROUND_GCC)
1284 VP8HFilter8 = HFilter8; 1604 VP8HFilter8 = HFilter8;
1285 VP8HFilter8i = HFilter8i; 1605 VP8HFilter8i = HFilter8i;
1286 #endif 1606 #endif
1287 VP8SimpleVFilter16 = SimpleVFilter16; 1607 VP8SimpleVFilter16 = SimpleVFilter16;
1288 VP8SimpleHFilter16 = SimpleHFilter16; 1608 VP8SimpleHFilter16 = SimpleHFilter16;
1289 VP8SimpleVFilter16i = SimpleVFilter16i; 1609 VP8SimpleVFilter16i = SimpleVFilter16i;
1290 VP8SimpleHFilter16i = SimpleHFilter16i; 1610 VP8SimpleHFilter16i = SimpleHFilter16i;
1291 #endif // WEBP_USE_NEON 1611
1612 VP8PredLuma4[0] = DC4;
1613 VP8PredLuma4[1] = TM4;
1614 VP8PredLuma4[2] = VE4;
1615 VP8PredLuma4[4] = RD4;
1616 VP8PredLuma4[6] = LD4;
1617
1618 VP8PredLuma16[0] = DC16TopLeft;
1619 VP8PredLuma16[1] = TM16;
1620 VP8PredLuma16[2] = VE16;
1621 VP8PredLuma16[3] = HE16;
1622 VP8PredLuma16[4] = DC16NoTop;
1623 VP8PredLuma16[5] = DC16NoLeft;
1624 VP8PredLuma16[6] = DC16NoTopLeft;
1625
1626 VP8PredChroma8[0] = DC8uv;
1627 VP8PredChroma8[1] = TM8uv;
1628 VP8PredChroma8[2] = VE8uv;
1629 VP8PredChroma8[3] = HE8uv;
1630 VP8PredChroma8[4] = DC8uvNoTop;
1631 VP8PredChroma8[5] = DC8uvNoLeft;
1632 VP8PredChroma8[6] = DC8uvNoTopLeft;
1292 } 1633 }
1634
1635 #else // !WEBP_USE_NEON
1636
1637 WEBP_DSP_INIT_STUB(VP8DspInitNEON)
1638
1639 #endif // WEBP_USE_NEON