Chromium Code Reviews

Unified Diff: third_party/libwebp/dsp/enc_neon.c

Issue 1546003002: libwebp: update to 0.5.0 (Closed)
Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: rebase, created 4 years, 11 months ago
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
(...skipping 14 matching lines...)

// Inverse transform.
// This code is pretty much the same as TransformOne in the dec_neon.c, except
// for subtraction to *ref. See the comments there for algorithmic explanations.

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.

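For orientation: these are the VP8 inverse-transform rotation constants, roughly sqrt(2)*cos(pi/8) and sqrt(2)*sin(pi/8) in 16-bit fixed point. The scalar decoder applies them along the lines sketched here (modeled on dsp/dec.c; treat the exact macro shape as an assumption). kC2 is stored halved because the NEON code multiplies with vqdmulh, which doubles the product:

    // Scalar sketch, assuming the dsp/dec.c formulation:
    #define MUL(a, b) (((a) * (b)) >> 16)
    static const int kC1_full = 20091 + (1 << 16);  // ~1.30656 in 16.16
    static const int kC2_full = 35468;              // ~0.54120 in 16.16
    // Inside each transform pass:
    //   c = MUL(in[4], kC2_full) - MUL(in[12], kC1_full);
    //   d = MUL(in[4], kC1_full) + MUL(in[12], kC2_full);
    // vqdmulh_s16(x, 17734) == (x * 35468) >> 16, i.e. MUL(x, kC2_full).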
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
-// USE_INTRINSICS define.
+// WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster speed than inlined-assembly.
-#if defined(USE_INTRINSICS)
+#if defined(WEBP_USE_INTRINSICS)

// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
                                            const int16x8_t dst01,
(...skipping 186 matching lines...)
    "vst1.32 d0[1], [%[dst]], %[kBPS]  \n"
    "vst1.32 d1[0], [%[dst]], %[kBPS]  \n"
    "vst1.32 d1[1], [%[dst]]           \n"

    : [in] "+r"(in), [dst] "+r"(dst)  // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
  );
}

-#endif  // USE_INTRINSICS
+#endif  // WEBP_USE_INTRINSICS

static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}

// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
  return vreinterpretq_u8_u32(out);
}
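A scalar sketch of the same gather, for orientation; it assumes only that BPS is the fixed bytes-per-row stride used throughout this file:

    static void Load4x4_C(const uint8_t* src, uint8_t out[16]) {
      int row, col;
      for (row = 0; row < 4; ++row) {    // one vld1q_lane_u32 per row above
        for (col = 0; col < 4; ++col) {
          out[row * 4 + col] = src[row * BPS + col];
        }
      }
    }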

// Forward transform.

-#if defined(USE_INTRINSICS)
+#if defined(WEBP_USE_INTRINSICS)

static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
                                         const int16x4_t C, const int16x4_t D,
                                         int16x8_t* const out01,
                                         int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
(...skipping 264 matching lines...)
  }
}
#undef LOAD_LANE_16b

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

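The metric that both the old and new versions below implement is a weighted sum of absolute Walsh-Hadamard coefficients, computed for the source and the reconstruction and then differenced. A plain-C sketch, modeled on the generic implementation in dsp/enc.c (a best-effort reconstruction, not the verbatim upstream code):

    #include <stdlib.h>  // abs()

    static int TTransformSketch(const uint8_t* in, const uint16_t* w) {
      int sum = 0;
      int tmp[16];
      int i;
      // horizontal pass: 4-point Hadamard on each row
      for (i = 0; i < 4; ++i, in += BPS) {
        const int a0 = in[0] + in[2], a1 = in[1] + in[3];
        const int a2 = in[1] - in[3], a3 = in[0] - in[2];
        tmp[0 + i * 4] = a0 + a1;
        tmp[1 + i * 4] = a3 + a2;
        tmp[2 + i * 4] = a3 - a2;
        tmp[3 + i * 4] = a0 - a1;
      }
      // vertical pass down each column, then the weighted abs-sum
      for (i = 0; i < 4; ++i, ++w) {
        const int a0 = tmp[0 + i] + tmp[8 + i], a1 = tmp[4 + i] + tmp[12 + i];
        const int a2 = tmp[4 + i] - tmp[12 + i], a3 = tmp[0 + i] - tmp[8 + i];
        sum += w[0] * abs(a0 + a1) + w[4] * abs(a3 + a2)
             + w[8] * abs(a3 - a2) + w[12] * abs(a0 - a1);
      }
      return sum;
    }
    // Disto4x4(a, b, w) then returns
    //   abs(TTransformSketch(b, w) - TTransformSketch(a, w)) >> 5.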
-// This code works but is *slower* than the inlined-asm version below
-// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
-// USE_INTRINSICS define.
-// With gcc-4.8, it's only slightly slower than the inlined.
-#if defined(USE_INTRINSICS)
-
-// Zero extend an uint16x4_t 'v' to an int32x4_t.
-static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
-  return vreinterpretq_s32_u32(vmovl_u16(v));
-}
-
-// Does a regular 4x4 transpose followed by an adjustment of the upper columns
-// in the inner rows to restore the source order of differences,
-// i.e., a0 - a1 | a3 - a2.
-static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) {
-  int32x4x4_t out = Transpose4x4(rows);
-  // restore source order in the columns containing differences.
-  const int32x2_t r1h = vget_high_s32(out.val[1]);
-  const int32x2_t r2h = vget_high_s32(out.val[2]);
-  out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h);
-  out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h);
-  return out;
-}
-
-static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1,
-                                                   const uint8x8_t r2r3) {
-  // a0 = in[0] + in[2] | a1 = in[1] + in[3]
-  const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3);
-  // a3 = in[0] - in[2] | a2 = in[1] - in[3]
-  const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3);
-  const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1));  // a0 + a1
-  const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2));  // a3 + a2
-  // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations.
-  // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3
-  // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2
-  const int16x8x2_t transpose =
-      vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2));
-  // tmp[3] = a0 - a1 | tmp[2] = a3 - a2
-  const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]),
-                                      vget_low_s16(transpose.val[1]));
-  const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]),
-                                      vget_high_s16(transpose.val[1]));
-  // [0]: tmp[3] [1]: tmp[2]
-  const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2);
-  const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } };
-  return res;
-}
-
-static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) {
-  // a0 = tmp[0 + i] + tmp[8 + i];
-  const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]);
-  // a1 = tmp[4 + i] + tmp[12+ i];
-  const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]);
-  // a2 = tmp[4 + i] - tmp[12+ i];
-  const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]);
-  // a3 = tmp[0 + i] - tmp[8 + i];
-  const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]);
-  const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1));  // abs(a0 + a1)
-  const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2));  // abs(a3 + a2)
-  const int32x4_t b2 = vabdq_s32(a3, a2);              // abs(a3 - a2)
-  const int32x4_t b3 = vabdq_s32(a0, a1);              // abs(a0 - a1)
-  const int32x4x4_t res = { { b0, b1, b2, b3 } };
-  return res;
-}
-
-// Calculate the weighted sum of the rows in 'b'.
-static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b,
-                                      const int32x4_t w0, const int32x4_t w1,
-                                      const int32x4_t w2, const int32x4_t w3) {
-  const int32x4_t s0 = vmulq_s32(w0, b.val[0]);
-  const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]);
-  const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]);
-  const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]);
-  const int64x2_t sum1 = vpaddlq_s32(s3);
-  const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1));
-  return sum2;
-}
-
-#define LOAD_LANE_32b(src, VALUE, LANE) \
-    (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
-
-// Hadamard transform
-// Returns the weighted sum of the absolute value of transformed coefficients.
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  uint32x4_t d0d1 = { 0, 0, 0, 0 };
-  uint32x4_t d2d3 = { 0, 0, 0, 0 };
-  LOAD_LANE_32b(a + 0 * BPS, d0d1, 0);  // a00 a01 a02 a03
-  LOAD_LANE_32b(a + 1 * BPS, d0d1, 1);  // a10 a11 a12 a13
-  LOAD_LANE_32b(b + 0 * BPS, d0d1, 2);  // b00 b01 b02 b03
-  LOAD_LANE_32b(b + 1 * BPS, d0d1, 3);  // b10 b11 b12 b13
-  LOAD_LANE_32b(a + 2 * BPS, d2d3, 0);  // a20 a21 a22 a23
-  LOAD_LANE_32b(a + 3 * BPS, d2d3, 1);  // a30 a31 a32 a33
-  LOAD_LANE_32b(b + 2 * BPS, d2d3, 2);  // b20 b21 b22 b23
-  LOAD_LANE_32b(b + 3 * BPS, d2d3, 3);  // b30 b31 b32 b33
-
-  {
-    // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31
-    // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33
-    const uint16x8x2_t tmp =
-        vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3));
-    const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]);
-    const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]);
-    const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8),
-                                                    vget_low_u8(d2d3u8));
-    const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8),
-                                                    vget_high_u8(d2d3u8));
-    const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a);
-    const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b);
-    const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a);
-    const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b);
-    const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0));
-    const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4));
-    const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8));
-    const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12));
-    const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3);
-    const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3);
-    const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1),
-                                    vreinterpret_s32_s64(sum2));
-    const int32x2_t res = vshr_n_s32(diff, 5);
-    return vget_lane_s32(res, 0);
-  }
-}
-
+// a 0123, b 0123
+// a 4567, b 4567
+// a 89ab, b 89ab
+// a cdef, b cdef
+//
+// transpose
+//
+// a 048c, b 048c
+// a 159d, b 159d
+// a 26ae, b 26ae
+// a 37bf, b 37bf
+//
+static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) {
+  const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]);
+  const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]);
+  const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]),
+                                        vreinterpret_u16_u8(d2_tmp1.val[0]));
+  const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]),
+                                        vreinterpret_u16_u8(d2_tmp1.val[1]));
+
+  d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]);
+  d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]);
+  d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]);
+  d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]);
+  return d4_in;
+}
+
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
+  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
+  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
+  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
+                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
+  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
+                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
+  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
+  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
+  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
+  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
+  return q4_in;
+}
+
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) {
+  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
+  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
+  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0],
+                                                        d4_in.val[2]));
+  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1],
+                                                        d4_in.val[3]));
+  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0],
+                                                        d4_in.val[2]));
+  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1],
+                                                        d4_in.val[3]));
+  int16x8x4_t q4_out;
+  // tmp[0] = a0 + a1
+  // tmp[1] = a3 + a2
+  // tmp[2] = a3 - a2
+  // tmp[3] = a0 - a1
+  INIT_VECTOR4(q4_out,
+               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
+               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
+  return q4_out;
+}
+
+static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) {
+  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
+  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
+  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
+  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
+
+  q4_in.val[0] = vaddq_s16(q_a0, q_a1);
+  q4_in.val[1] = vaddq_s16(q_a3, q_a2);
+  q4_in.val[2] = vabdq_s16(q_a3, q_a2);
+  q4_in.val[3] = vabdq_s16(q_a0, q_a1);
+  q4_in.val[0] = vabsq_s16(q4_in.val[0]);
+  q4_in.val[1] = vabsq_s16(q4_in.val[1]);
+  return q4_in;
+}
+
+static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
+  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
+  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
+  int16x4x4_t d4_w;
+  INIT_VECTOR4(d4_w,
+               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
+               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
+               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
+               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
+  return d4_w;
+}
+
+static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
+                                      const int16x4x4_t d4_w) {
+  int32x2_t d_sum;
+  // sum += w[ 0] * abs(b0);
+  // sum += w[ 4] * abs(b1);
+  // sum += w[ 8] * abs(b2);
+  // sum += w[12] * abs(b3);
+  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
+  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
+  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
+  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
+  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
+  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
+  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
+  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
+
+  q_sum0 = vaddq_s32(q_sum0, q_sum1);
+  q_sum2 = vaddq_s32(q_sum2, q_sum3);
+  q_sum2 = vaddq_s32(q_sum0, q_sum2);
+  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
+  d_sum = vpadd_s32(d_sum, d_sum);
+  return d_sum;
+}
+
+#define LOAD_LANE_32b(src, VALUE, LANE) \
+    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
+  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
+  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
+  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
+  uint8x8x4_t d4_in;
+
+  // load data a, b
+  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
+  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
+  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
+  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
+  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
+  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
+  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
+  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
+  INIT_VECTOR4(d4_in,
+               vreinterpret_u8_u32(d_in_ab_0123),
+               vreinterpret_u8_u32(d_in_ab_4567),
+               vreinterpret_u8_u32(d_in_ab_89ab),
+               vreinterpret_u8_u32(d_in_ab_cdef));
+
+  {
+    // horizontal pass
+    const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in);
+    const int16x8x4_t q4_h = DistoHorizontalPass(d4_t);
+    const int16x4x4_t d4_w = DistoLoadW(w);
+    // vertical pass
+    const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h);
+    const int16x8x4_t q4_v = DistoVerticalPass(q4_t);
+    int32x2_t d_sum = DistoSum(q4_v, d4_w);
+
+    // abs(sum2 - sum1) >> 5
+    d_sum = vabs_s32(d_sum);
+    d_sum = vshr_n_s32(d_sum, 5);
+    return vget_lane_s32(d_sum, 0);
+  }
+}
#undef LOAD_LANE_32b

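Worth noting about the new intrinsics version: the loads in Disto4x4 put block 'a' in the low half and block 'b' in the high half of each d-register, so the vmull_s16/vmlsl_s16 pair in DistoSum accumulates w*|coeff_a| - w*|coeff_b| in a single pass; the final vabs_s32 and vshr_n_s32 then yield abs(sum2 - sum1) >> 5 without ever forming the two sums separately. A hedged scalar model of that fusion (the array names are illustrative only):

    #include <stdlib.h>

    // abs_a[]/abs_b[] stand in for the per-coefficient magnitudes that the
    // vertical pass leaves in the low/high register halves.
    static int DistoSumModel(const int16_t abs_a[16], const int16_t abs_b[16],
                             const uint16_t w[16]) {
      int k, acc = 0;
      for (k = 0; k < 16; ++k) {
        acc += w[k] * abs_a[k];  // vmull_s16 on the low (block 'a') half
        acc -= w[k] * abs_b[k];  // vmlsl_s16 on the high (block 'b') half
      }
      return abs(acc) >> 5;
    }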
-#else
-
-// Hadamard transform
-// Returns the weighted sum of the absolute value of transformed coefficients.
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int kBPS = BPS;
-  const uint8_t* A = a;
-  const uint8_t* B = b;
-  const uint16_t* W = w;
-  int sum;
-  __asm__ volatile (
-    "vld1.32 d0[0], [%[a]], %[kBPS]  \n"
-    "vld1.32 d0[1], [%[a]], %[kBPS]  \n"
-    "vld1.32 d2[0], [%[a]], %[kBPS]  \n"
-    "vld1.32 d2[1], [%[a]]           \n"
-
-    "vld1.32 d1[0], [%[b]], %[kBPS]  \n"
-    "vld1.32 d1[1], [%[b]], %[kBPS]  \n"
-    "vld1.32 d3[0], [%[b]], %[kBPS]  \n"
-    "vld1.32 d3[1], [%[b]]           \n"
-
-    // a d0/d2, b d1/d3
-    // d0/d1: 01 01 01 01
-    // d2/d3: 23 23 23 23
-    // But: it goes 01 45 23 67
-    // Notice the middle values are transposed
-    "vtrn.16 q0, q1   \n"
-
-    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
-    "vaddl.u8 q2, d0, d2   \n"
-    "vaddl.u8 q10, d1, d3  \n"
-    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
-    "vsubl.u8 q3, d0, d2   \n"
-    "vsubl.u8 q11, d1, d3  \n"
-
-    // tmp[0] = a0 + a1
-    "vpaddl.s16 q0, q2   \n"
-    "vpaddl.s16 q8, q10  \n"
-
-    // tmp[1] = a3 + a2
-    "vpaddl.s16 q1, q3   \n"
-    "vpaddl.s16 q9, q11  \n"
-
-    // No pair subtract
-    // q2 = {a0, a3}
-    // q3 = {a1, a2}
-    "vtrn.16 q2, q3    \n"
-    "vtrn.16 q10, q11  \n"
-
-    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
-    "vsubl.s16 q12, d4, d6    \n"
-    "vsubl.s16 q13, d5, d7    \n"
-    "vsubl.s16 q14, d20, d22  \n"
-    "vsubl.s16 q15, d21, d23  \n"
-
-    // separate tmp[3] and tmp[2]
-    // q12 = tmp[3]
-    // q13 = tmp[2]
-    "vtrn.32 q12, q13  \n"
-    "vtrn.32 q14, q15  \n"
-
-    // Transpose tmp for a
-    "vswp d1, d26      \n"  // vtrn.64
-    "vswp d3, d24      \n"  // vtrn.64
-    "vtrn.32 q0, q1    \n"
-    "vtrn.32 q13, q12  \n"
-
-    // Transpose tmp for b
-    "vswp d17, d30     \n"  // vtrn.64
-    "vswp d19, d28     \n"  // vtrn.64
-    "vtrn.32 q8, q9    \n"
-    "vtrn.32 q15, q14  \n"
-
-    // The first Q register is a, the second b.
-    // q0/8 tmp[0-3]
-    // q13/15 tmp[4-7]
-    // q1/9 tmp[8-11]
-    // q12/14 tmp[12-15]
-
-    // These are still in 01 45 23 67 order. We fix it easily in the addition
-    // case but the subtraction propagates them.
-    "vswp d3, d27   \n"
-    "vswp d19, d31  \n"
-
-    // a0 = tmp[0] + tmp[8]
-    "vadd.s32 q2, q0, q1  \n"
-    "vadd.s32 q3, q8, q9  \n"
-
-    // a1 = tmp[4] + tmp[12]
-    "vadd.s32 q10, q13, q12  \n"
-    "vadd.s32 q11, q15, q14  \n"
-
-    // a2 = tmp[4] - tmp[12]
-    "vsub.s32 q13, q13, q12  \n"
-    "vsub.s32 q15, q15, q14  \n"
-
-    // a3 = tmp[0] - tmp[8]
-    "vsub.s32 q0, q0, q1  \n"
-    "vsub.s32 q8, q8, q9  \n"
-
-    // b0 = a0 + a1
-    "vadd.s32 q1, q2, q10  \n"
-    "vadd.s32 q9, q3, q11  \n"
-
-    // b1 = a3 + a2
-    "vadd.s32 q12, q0, q13  \n"
-    "vadd.s32 q14, q8, q15  \n"
-
-    // b2 = a3 - a2
-    "vsub.s32 q0, q0, q13  \n"
-    "vsub.s32 q8, q8, q15  \n"
-
-    // b3 = a0 - a1
-    "vsub.s32 q2, q2, q10  \n"
-    "vsub.s32 q3, q3, q11  \n"
-
-    "vld1.64 {q10, q11}, [%[w]]  \n"
-
-    // abs(b0)
-    "vabs.s32 q1, q1  \n"
-    "vabs.s32 q9, q9  \n"
-    // abs(b1)
-    "vabs.s32 q12, q12  \n"
-    "vabs.s32 q14, q14  \n"
-    // abs(b2)
-    "vabs.s32 q0, q0  \n"
-    "vabs.s32 q8, q8  \n"
-    // abs(b3)
-    "vabs.s32 q2, q2  \n"
-    "vabs.s32 q3, q3  \n"
-
-    // expand w before using.
-    "vmovl.u16 q13, d20  \n"
-    "vmovl.u16 q15, d21  \n"
-
-    // w[0] * abs(b0)
-    "vmul.u32 q1, q1, q13  \n"
-    "vmul.u32 q9, q9, q13  \n"
-
-    // w[4] * abs(b1)
-    "vmla.u32 q1, q12, q15  \n"
-    "vmla.u32 q9, q14, q15  \n"
-
-    // expand w before using.
-    "vmovl.u16 q13, d22  \n"
-    "vmovl.u16 q15, d23  \n"
-
-    // w[8] * abs(b1)
-    "vmla.u32 q1, q0, q13  \n"
-    "vmla.u32 q9, q8, q13  \n"
-
-    // w[12] * abs(b1)
-    "vmla.u32 q1, q2, q15  \n"
-    "vmla.u32 q9, q3, q15  \n"
-
-    // Sum the arrays
-    "vpaddl.u32 q1, q1  \n"
-    "vpaddl.u32 q9, q9  \n"
-    "vadd.u64 d2, d3    \n"
-    "vadd.u64 d18, d19  \n"
-
-    // Hadamard transform needs 4 bits of extra precision (2 bits in each
-    // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum
-    // precision for coeff is 8bit of input + 4bits of Hadamard transform +
-    // 16bits for w[] + 2 bits of abs() summation.
-    //
-    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
-    // A-OK.
-
-    // sum2 - sum1
-    "vsub.u32 d0, d2, d18  \n"
-    // abs(sum2 - sum1)
-    "vabs.s32 d0, d0  \n"
-    // abs(sum2 - sum1) >> 5
-    "vshr.u32 d0, #5  \n"
-
-    // It would be better to move the value straight into r0 but I'm not
-    // entirely sure how this works with inline assembly.
-    "vmov.32 %[sum], d0[0]  \n"
-
-    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
-    : [kBPS] "r"(kBPS)
-    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
-      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
-  ) ;
-
-  return sum;
-}
-
-#endif  // USE_INTRINSICS
-
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
-        histo->distribution[out[k]]++;
+        ++distribution[out[k]];
      }
    }
  }
+  VP8SetHistogramData(distribution, histo);
}
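The histogram collection now accumulates into a plain local array and hands it off once at the end via VP8SetHistogramData, which in the 0.5.0 sources reduces the raw distribution to the summary fields the encoder consumes. A sketch of that reduction (field and helper names follow the 0.5.0 headers but should be read as assumptions):

    static void SetHistogramDataSketch(
        const int distribution[MAX_COEFF_THRESH + 1],
        VP8Histogram* const histo) {
      int max_value = 0, last_non_zero = 1;
      int k;
      for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
        const int value = distribution[k];
        if (value > 0) {
          if (value > max_value) max_value = value;
          last_non_zero = k;
        }
      }
      histo->max_value = max_value;
      histo->last_non_zero = last_non_zero;
    }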

//------------------------------------------------------------------------------

static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
                                        const uint8_t* const b,
                                        uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
(...skipping 123 matching lines...)
  vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out + 0) != 0) return 1;
  if (*(uint64_t*)(out + 4) != 0) return 1;
  if (*(uint64_t*)(out + 8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}

+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  int nz;
+  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  return nz;
+}
+
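The new Quantize2Blocks helper fans out to QuantizeBlock and packs the two non-zero flags into one return value: bit 0 for coefficients 0..15, bit 1 for coefficients 16..31. A hypothetical caller (names are illustrative) would test them as:

    static void QuantizePairExample(int16_t in[32], int16_t out[32],
                                    const VP8Matrix* const mtx) {
      const int nz = Quantize2Blocks(in, out, mtx);
      if (nz & 1) { /* first 4x4 block has non-zero levels */ }
      if (nz & 2) { /* second 4x4 block has non-zero levels */ }
    }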
#endif  // !WORK_AROUND_GCC

-#endif  // WEBP_USE_NEON
-
//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitNEON(void);

-void VP8EncDspInitNEON(void) {
-#if defined(WEBP_USE_NEON)
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;

  VP8FTransformWHT = FTransformWHT;

  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8CollectHistogram = CollectHistogram;
  VP8SSE16x16 = SSE16x16;
  VP8SSE16x8 = SSE16x8;
  VP8SSE8x8 = SSE8x8;
  VP8SSE4x4 = SSE4x4;
#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
#endif
-#endif  // WEBP_USE_NEON
}
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
+
+#endif  // WEBP_USE_NEON
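The new #else branch uses WEBP_DSP_INIT_STUB so that callers may reference VP8EncDspInitNEON unconditionally, even in builds without NEON support. A sketch of what the macro presumably expands to (modeled on dsp/dsp.h; treat it as an assumption):

    // Emits an empty, TSan-exempt initializer for non-NEON builds.
    #define WEBP_DSP_INIT_STUB(func) \
      extern void func(void);        \
      WEBP_TSAN_IGNORE_FUNCTION void func(void) {}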