| OLD | NEW |
| 1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // ARM NEON version of speed-critical encoding functions. | 10 // ARM NEON version of speed-critical encoding functions. |
| (...skipping 14 matching lines...) |
| 25 | 25 |
| 26 // Inverse transform. | 26 // Inverse transform. |
| 27 // This code is pretty much the same as TransformOne in dec_neon.c, except | 27 // This code is pretty much the same as TransformOne in dec_neon.c, except |
| 28 // for the subtraction to *ref. See the comments there for algorithmic explanations. | 28 // for the subtraction to *ref. See the comments there for algorithmic explanations. |
| 29 | 29 |
| 30 static const int16_t kC1 = 20091; | 30 static const int16_t kC1 = 20091; |
| 31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. | 31 static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. |
| 32 | 32 |
| 33 // This code works but is *slower* than the inlined-asm version below | 33 // This code works but is *slower* than the inlined-asm version below |
| 34 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to | 34 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to |
| 35 // USE_INTRINSICS define. | 35 // WEBP_USE_INTRINSICS define. |
| 36 // With gcc-4.8, it's a little faster than the inlined assembly. | 36 // With gcc-4.8, it's a little faster than the inlined assembly. |
| 37 #if defined(USE_INTRINSICS) | 37 #if defined(WEBP_USE_INTRINSICS) |
| 38 | 38 |
| 39 // Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t. | 39 // Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t. |
| 40 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { | 40 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { |
| 41 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); | 41 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); |
| 42 } | 42 } |
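The double reinterpret here is free at runtime: 'v' is just eight packed bytes, vmovl_u8 zero-extends them to 16 bits, and the unsigned result is re-tagged as signed (0..255 always fits). A minimal scalar sketch of the conversion:

```c
#include <stdint.h>

// Scalar sketch of ConvertU8ToS16: zero-extend eight unsigned bytes
// to eight signed 16-bit values, as vmovl_u8 + the reinterprets do.
static void ConvertU8ToS16_Scalar(const uint8_t in[8], int16_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i) out[i] = (int16_t)in[i];  // 0..255 fits in int16_t
}
```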
| 43 | 43 |
| 44 // Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result | 44 // Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result |
| 45 // to the corresponding rows of 'dst'. | 45 // to the corresponding rows of 'dst'. |
| 46 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, | 46 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, |
| 47 const int16x8_t dst01, | 47 const int16x8_t dst01, |
| (...skipping 186 matching lines...) |
| 234 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" | 234 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" |
| 235 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" | 235 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" |
| 236 "vst1.32 d1[1], [%[dst]] \n" | 236 "vst1.32 d1[1], [%[dst]] \n" |
| 237 | 237 |
| 238 : [in] "+r"(in), [dst] "+r"(dst) // modified registers | 238 : [in] "+r"(in), [dst] "+r"(dst) // modified registers |
| 239 : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants | 239 : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants |
| 240 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered | 240 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered |
| 241 ); | 241 ); |
| 242 } | 242 } |
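For readers new to GCC extended asm, the three colon-separated lists above are what make the block safe to schedule: "+r" operands are both read and written (the pointers are post-incremented by the stride), the second list holds read-only inputs, and the clobber list names every register plus the memory the block touches. A minimal sketch of the same pattern, assuming an ARM/NEON target (the helper itself is illustrative, not from this file):

```c
#include <stdint.h>

// Illustrative only: the extended-asm constraint pattern used above,
// reduced to a 4-byte copy through lane 0 of d0.
static void CopyRow4(uint8_t* const dst, const uint8_t* const src) {
  __asm__ volatile (
    "vld1.32 d0[0], [%[src]] \n"
    "vst1.32 d0[0], [%[dst]] \n"
    :                                   // no modified registers
    : [dst] "r"(dst), [src] "r"(src)    // read-only inputs
    : "memory", "d0"                    // clobbered
  );
}
```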
| 243 | 243 |
| 244 #endif // USE_INTRINSICS | 244 #endif // WEBP_USE_INTRINSICS |
| 245 | 245 |
| 246 static void ITransform(const uint8_t* ref, | 246 static void ITransform(const uint8_t* ref, |
| 247 const int16_t* in, uint8_t* dst, int do_two) { | 247 const int16_t* in, uint8_t* dst, int do_two) { |
| 248 ITransformOne(ref, in, dst); | 248 ITransformOne(ref, in, dst); |
| 249 if (do_two) { | 249 if (do_two) { |
| 250 ITransformOne(ref + 4, in + 16, dst + 4); | 250 ITransformOne(ref + 4, in + 16, dst + 4); |
| 251 } | 251 } |
| 252 } | 252 } |
| 253 | 253 |
| 254 // Load all 4x4 pixels into a single uint8x16_t variable. | 254 // Load all 4x4 pixels into a single uint8x16_t variable. |
| 255 static uint8x16_t Load4x4(const uint8_t* src) { | 255 static uint8x16_t Load4x4(const uint8_t* src) { |
| 256 uint32x4_t out = vdupq_n_u32(0); | 256 uint32x4_t out = vdupq_n_u32(0); |
| 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); | 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); |
| 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); | 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); |
| 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); | 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); |
| 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); | 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); |
| 261 return vreinterpretq_u8_u32(out); | 261 return vreinterpretq_u8_u32(out); |
| 262 } | 262 } |
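Load4x4 gathers four rows sitting BPS bytes apart into one 16-byte q register through per-lane 32-bit loads. A scalar sketch of the same gather:

```c
#include <stdint.h>

// Scalar sketch of Load4x4: four 4-byte rows, one stride apart,
// packed into the contiguous 16 bytes a q register would hold.
static void Load4x4_Scalar(const uint8_t* src, int bps, uint8_t out[16]) {
  int j, i;
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) out[j * 4 + i] = src[j * bps + i];
  }
}
```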
| 263 | 263 |
| 264 // Forward transform. | 264 // Forward transform. |
| 265 | 265 |
| 266 #if defined(USE_INTRINSICS) | 266 #if defined(WEBP_USE_INTRINSICS) |
| 267 | 267 |
| 268 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B, | 268 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B, |
| 269 const int16x4_t C, const int16x4_t D, | 269 const int16x4_t C, const int16x4_t D, |
| 270 int16x8_t* const out01, | 270 int16x8_t* const out01, |
| 271 int16x8_t* const out32) { | 271 int16x8_t* const out32) { |
| 272 const int16x4x2_t AB = vtrn_s16(A, B); | 272 const int16x4x2_t AB = vtrn_s16(A, B); |
| 273 const int16x4x2_t CD = vtrn_s16(C, D); | 273 const int16x4x2_t CD = vtrn_s16(C, D); |
| 274 const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]), | 274 const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]), |
| 275 vreinterpret_s32_s16(CD.val[0])); | 275 vreinterpret_s32_s16(CD.val[0])); |
| 276 const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]), | 276 const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]), |
| (...skipping 264 matching lines...) |
| 541 } | 541 } |
| 542 } | 542 } |
| 543 #undef LOAD_LANE_16b | 543 #undef LOAD_LANE_16b |
| 544 | 544 |
| 545 //------------------------------------------------------------------------------ | 545 //------------------------------------------------------------------------------ |
| 546 // Texture distortion | 546 // Texture distortion |
| 547 // | 547 // |
| 548 // We try to match the spectral content (weighted) between source and | 548 // We try to match the spectral content (weighted) between source and |
| 549 // reconstructed samples. | 549 // reconstructed samples. |
| 550 | 550 |
| 551 // This code works but is *slower* than the inlined-asm version below | 551 // a 0123, b 0123 |
| 552 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to | 552 // a 4567, b 4567 |
| 553 // USE_INTRINSICS define. | 553 // a 89ab, b 89ab |
| 554 // With gcc-4.8, it's only slightly slower than the inlined assembly. | 554 // a cdef, b cdef |
| 555 #if defined(USE_INTRINSICS) | 555 // |
| 556 // transpose |
| 557 // |
| 558 // a 048c, b 048c |
| 559 // a 159d, b 159d |
| 560 // a 26ae, b 26ae |
| 561 // a 37bf, b 37bf |
| 562 // |
| 563 static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) { |
| 564 const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]); |
| 565 const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]); |
| 566 const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]), |
| 567 vreinterpret_u16_u8(d2_tmp1.val[0])); |
| 568 const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]), |
| 569 vreinterpret_u16_u8(d2_tmp1.val[1])); |
| 556 | 570 |
| 557 // Zero extend an uint16x4_t 'v' to an int32x4_t. | 571 d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]); |
| 558 static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) { | 572 d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]); |
| 559 return vreinterpretq_s32_u32(vmovl_u16(v)); | 573 d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]); |
| 574 d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]); |
| 575 return d4_in; |
| 560 } | 576 } |
| 561 | 577 |
| 562 // Does a regular 4x4 transpose followed by an adjustment of the upper columns | 578 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { |
| 563 // in the inner rows to restore the source order of differences, | 579 const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]); |
| 564 // i.e., a0 - a1 | a3 - a2. | 580 const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]); |
| 565 static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) { | 581 const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]), |
| 566 int32x4x4_t out = Transpose4x4(rows); | 582 vreinterpretq_s32_s16(q2_tmp1.val[0])); |
| 567 // restore source order in the columns containing differences. | 583 const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]), |
| 568 const int32x2_t r1h = vget_high_s32(out.val[1]); | 584 vreinterpretq_s32_s16(q2_tmp1.val[1])); |
| 569 const int32x2_t r2h = vget_high_s32(out.val[2]); | 585 q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]); |
| 570 out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h); | 586 q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]); |
| 571 out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h); | 587 q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]); |
| 572 return out; | 588 q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]); |
| 589 return q4_in; |
| 573 } | 590 } |
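Both helpers are the classic two-level vtrn transpose: the element-size trn pairs adjacent elements, the double-size trn pairs adjacent pairs, and the out-of-order val[] assignments at the end restore row order. Per 4x4 block the net effect is an ordinary transpose (the u8 version handles two blocks at once, since each d register carries a row of 'a' in its low half and of 'b' in its high half). A scalar sketch:

```c
#include <stdint.h>

// Scalar sketch of the 4x4 transpose the vtrn pairs implement:
// out[i][j] = in[j][i].
static void Transpose4x4_Scalar(const uint8_t in[4][4], uint8_t out[4][4]) {
  int i, j;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) out[i][j] = in[j][i];
  }
}
```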
| 574 | 591 |
| 575 static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1, | 592 static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) { |
| 576 const uint8x8_t r2r3) { | 593 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} |
| 577 // a0 = in[0] + in[2] | a1 = in[1] + in[3] | 594 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} |
| 578 const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3); | 595 const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0], |
| 579 // a3 = in[0] - in[2] | a2 = in[1] - in[3] | 596 d4_in.val[2])); |
| 580 const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3); | 597 const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1], |
| 581 const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1)); // a0 + a1 | 598 d4_in.val[3])); |
| 582 const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2)); // a3 + a2 | 599 const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0], |
| 583 // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations. | 600 d4_in.val[2])); |
| 584 // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 | 601 const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1], |
| 585 // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 | 602 d4_in.val[3])); |
| 586 const int16x8x2_t transpose = | 603 int16x8x4_t q4_out; |
| 587 vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2)); | 604 // tmp[0] = a0 + a1 |
| 588 // tmp[3] = a0 - a1 | tmp[2] = a3 - a2 | 605 // tmp[1] = a3 + a2 |
| 589 const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]), | 606 // tmp[2] = a3 - a2 |
| 590 vget_low_s16(transpose.val[1])); | 607 // tmp[3] = a0 - a1 |
| 591 const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]), | 608 INIT_VECTOR4(q4_out, |
| 592 vget_high_s16(transpose.val[1])); | 609 vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), |
| 593 // [0]: tmp[3] [1]: tmp[2] | 610 vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); |
| 594 const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2); | 611 return q4_out; |
| 595 const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } }; | |
| 596 return res; | |
| 597 } | 612 } |
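Because the input was transposed first, each widening add/subtract above performs the row butterfly for all four rows of both blocks at once. Per row it is the standard Hadamard butterfly; a scalar sketch using the a0..a3/tmp naming from the comments:

```c
#include <stdint.h>

// Scalar sketch of the horizontal pass for one 4-pixel row.
static void HadamardRow(const uint8_t in[4], int tmp[4]) {
  const int a0 = in[0] + in[2];
  const int a1 = in[1] + in[3];
  const int a2 = in[1] - in[3];
  const int a3 = in[0] - in[2];
  tmp[0] = a0 + a1;
  tmp[1] = a3 + a2;
  tmp[2] = a3 - a2;
  tmp[3] = a0 - a1;
}
```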
| 598 | 613 |
| 599 static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) { | 614 static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) { |
| 600 // a0 = tmp[0 + i] + tmp[8 + i]; | 615 const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); |
| 601 const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]); | 616 const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]); |
| 602 // a1 = tmp[4 + i] + tmp[12+ i]; | 617 const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]); |
| 603 const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]); | 618 const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]); |
| 604 // a2 = tmp[4 + i] - tmp[12+ i]; | 619 |
| 605 const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]); | 620 q4_in.val[0] = vaddq_s16(q_a0, q_a1); |
| 606 // a3 = tmp[0 + i] - tmp[8 + i]; | 621 q4_in.val[1] = vaddq_s16(q_a3, q_a2); |
| 607 const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]); | 622 q4_in.val[2] = vabdq_s16(q_a3, q_a2); |
| 608 const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1)); // abs(a0 + a1) | 623 q4_in.val[3] = vabdq_s16(q_a0, q_a1); |
| 609 const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2)); // abs(a3 + a2) | 624 q4_in.val[0] = vabsq_s16(q4_in.val[0]); |
| 610 const int32x4_t b2 = vabdq_s32(a3, a2); // abs(a3 - a2) | 625 q4_in.val[1] = vabsq_s16(q4_in.val[1]); |
| 611 const int32x4_t b3 = vabdq_s32(a0, a1); // abs(a0 - a1) | 626 return q4_in; |
| 612 const int32x4x4_t res = { { b0, b1, b2, b3 } }; | |
| 613 return res; | |
| 614 } | 627 } |
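The vertical pass runs the same butterfly along the other axis and folds the absolute values in directly (vabdq_s16 fuses the subtract and the abs for the last two outputs). A scalar sketch for one column, using the tmp[] indexing from the comments in the old intrinsics:

```c
#include <stdlib.h>  // abs()

// Scalar sketch of the vertical pass for column 'col' of tmp[16],
// producing the four absolute transformed coefficients b0..b3.
static void HadamardColAbs(const int tmp[16], int col, int b[4]) {
  const int a0 = tmp[0 + col] + tmp[8 + col];
  const int a1 = tmp[4 + col] + tmp[12 + col];
  const int a2 = tmp[4 + col] - tmp[12 + col];
  const int a3 = tmp[0 + col] - tmp[8 + col];
  b[0] = abs(a0 + a1);
  b[1] = abs(a3 + a2);
  b[2] = abs(a3 - a2);
  b[3] = abs(a0 - a1);
}
```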
| 615 | 628 |
| 616 // Calculate the weighted sum of the rows in 'b'. | 629 static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) { |
| 617 static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b, | 630 const uint16x8_t q_w07 = vld1q_u16(&w[0]); |
| 618 const int32x4_t w0, const int32x4_t w1, | 631 const uint16x8_t q_w8f = vld1q_u16(&w[8]); |
| 619 const int32x4_t w2, const int32x4_t w3) { | 632 int16x4x4_t d4_w; |
| 620 const int32x4_t s0 = vmulq_s32(w0, b.val[0]); | 633 INIT_VECTOR4(d4_w, |
| 621 const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]); | 634 vget_low_s16(vreinterpretq_s16_u16(q_w07)), |
| 622 const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]); | 635 vget_high_s16(vreinterpretq_s16_u16(q_w07)), |
| 623 const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]); | 636 vget_low_s16(vreinterpretq_s16_u16(q_w8f)), |
| 624 const int64x2_t sum1 = vpaddlq_s32(s3); | 637 vget_high_s16(vreinterpretq_s16_u16(q_w8f))); |
| 625 const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1)); | 638 return d4_w; |
| 626 return sum2; | 639 } |
| 640 |
| 641 static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in, |
| 642 const int16x4x4_t d4_w) { |
| 643 int32x2_t d_sum; |
| 644 // sum += w[ 0] * abs(b0); |
| 645 // sum += w[ 4] * abs(b1); |
| 646 // sum += w[ 8] * abs(b2); |
| 647 // sum += w[12] * abs(b3); |
| 648 int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0])); |
| 649 int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1])); |
| 650 int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2])); |
| 651 int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3])); |
| 652 q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0])); |
| 653 q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1])); |
| 654 q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2])); |
| 655 q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3])); |
| 656 |
| 657 q_sum0 = vaddq_s32(q_sum0, q_sum1); |
| 658 q_sum2 = vaddq_s32(q_sum2, q_sum3); |
| 659 q_sum2 = vaddq_s32(q_sum0, q_sum2); |
| 660 d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2)); |
| 661 d_sum = vpadd_s32(d_sum, d_sum); |
| 662 return d_sum; |
| 627 } | 663 } |
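DistoSum leans on the packed layout: each q register carries block 'a' coefficients in its low half and block 'b' in its high half, so vmull accumulates +w*|coeff_a| while vmlsl folds in -w*|coeff_b|, and the pairwise adds reduce everything to sum_a - sum_b in one lane. A scalar sketch of that reduction, with w[k] paired to transformed coefficient k as in the comments:

```c
#include <stdint.h>

// Scalar sketch of DistoSum: one weighted pass computes the signed
// difference of the two blocks' sums directly (Disto4x4 then takes
// abs(diff) >> 5).
static int DistoSum_Scalar(const int abs_a[16], const int abs_b[16],
                           const uint16_t w[16]) {
  int k, diff = 0;
  for (k = 0; k < 16; ++k) diff += (int)w[k] * (abs_a[k] - abs_b[k]);
  return diff;
}
```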
| 628 | 664 |
| 629 #define LOAD_LANE_32b(src, VALUE, LANE) \ | 665 #define LOAD_LANE_32b(src, VALUE, LANE) \ |
| 630 (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) | 666 (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) |
| 631 | 667 |
| 632 // Hadamard transform | 668 // Hadamard transform |
| 633 // Returns the weighted sum of the absolute value of transformed coefficients. | 669 // Returns the weighted sum of the absolute value of transformed coefficients. |
| 634 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, | 670 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, |
| 635 const uint16_t* const w) { | 671 const uint16_t* const w) { |
| 636 uint32x4_t d0d1 = { 0, 0, 0, 0 }; | 672 uint32x2_t d_in_ab_0123 = vdup_n_u32(0); |
| 637 uint32x4_t d2d3 = { 0, 0, 0, 0 }; | 673 uint32x2_t d_in_ab_4567 = vdup_n_u32(0); |
| 638 LOAD_LANE_32b(a + 0 * BPS, d0d1, 0); // a00 a01 a02 a03 | 674 uint32x2_t d_in_ab_89ab = vdup_n_u32(0); |
| 639 LOAD_LANE_32b(a + 1 * BPS, d0d1, 1); // a10 a11 a12 a13 | 675 uint32x2_t d_in_ab_cdef = vdup_n_u32(0); |
| 640 LOAD_LANE_32b(b + 0 * BPS, d0d1, 2); // b00 b01 b02 b03 | 676 uint8x8x4_t d4_in; |
| 641 LOAD_LANE_32b(b + 1 * BPS, d0d1, 3); // b10 b11 b12 b13 | 677 |
| 642 LOAD_LANE_32b(a + 2 * BPS, d2d3, 0); // a20 a21 a22 a23 | 678 // load data a, b |
| 643 LOAD_LANE_32b(a + 3 * BPS, d2d3, 1); // a30 a31 a32 a33 | 679 LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0); |
| 644 LOAD_LANE_32b(b + 2 * BPS, d2d3, 2); // b20 b21 b22 b23 | 680 LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0); |
| 645 LOAD_LANE_32b(b + 3 * BPS, d2d3, 3); // b30 b31 b32 b33 | 681 LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0); |
| 682 LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0); |
| 683 LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1); |
| 684 LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1); |
| 685 LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1); |
| 686 LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1); |
| 687 INIT_VECTOR4(d4_in, |
| 688 vreinterpret_u8_u32(d_in_ab_0123), |
| 689 vreinterpret_u8_u32(d_in_ab_4567), |
| 690 vreinterpret_u8_u32(d_in_ab_89ab), |
| 691 vreinterpret_u8_u32(d_in_ab_cdef)); |
| 646 | 692 |
| 647 { | 693 { |
| 648 // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31 | 694 // horizontal pass |
| 649 // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33 | 695 const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in); |
| 650 const uint16x8x2_t tmp = | 696 const int16x8x4_t q4_h = DistoHorizontalPass(d4_t); |
| 651 vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3)); | 697 const int16x4x4_t d4_w = DistoLoadW(w); |
| 652 const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]); | 698 // vertical pass |
| 653 const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]); | 699 const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h); |
| 654 const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8), | 700 const int16x8x4_t q4_v = DistoVerticalPass(q4_t); |
| 655 vget_low_u8(d2d3u8)); | 701 int32x2_t d_sum = DistoSum(q4_v, d4_w); |
| 656 const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8), | 702 |
| 657 vget_high_u8(d2d3u8)); | 703 // abs(sum2 - sum1) >> 5 |
| 658 const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a); | 704 d_sum = vabs_s32(d_sum); |
| 659 const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b); | 705 d_sum = vshr_n_s32(d_sum, 5); |
| 660 const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a); | 706 return vget_lane_s32(d_sum, 0); |
| 661 const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b); | |
| 662 const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0)); | |
| 663 const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4)); | |
| 664 const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8)); | |
| 665 const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12)); | |
| 666 const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3); | |
| 667 const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3); | |
| 668 const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1), | |
| 669 vreinterpret_s32_s64(sum2)); | |
| 670 const int32x2_t res = vshr_n_s32(diff, 5); | |
| 671 return vget_lane_s32(res, 0); | |
| 672 } | 707 } |
| 673 } | 708 } |
| 674 | |
| 675 #undef LOAD_LANE_32b | 709 #undef LOAD_LANE_32b |
| 676 | 710 |
| 677 #else | |
| 678 | |
| 679 // Hadamard transform | |
| 680 // Returns the weighted sum of the absolute value of transformed coefficients. | |
| 681 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, | |
| 682 const uint16_t* const w) { | |
| 683 const int kBPS = BPS; | |
| 684 const uint8_t* A = a; | |
| 685 const uint8_t* B = b; | |
| 686 const uint16_t* W = w; | |
| 687 int sum; | |
| 688 __asm__ volatile ( | |
| 689 "vld1.32 d0[0], [%[a]], %[kBPS] \n" | |
| 690 "vld1.32 d0[1], [%[a]], %[kBPS] \n" | |
| 691 "vld1.32 d2[0], [%[a]], %[kBPS] \n" | |
| 692 "vld1.32 d2[1], [%[a]] \n" | |
| 693 | |
| 694 "vld1.32 d1[0], [%[b]], %[kBPS] \n" | |
| 695 "vld1.32 d1[1], [%[b]], %[kBPS] \n" | |
| 696 "vld1.32 d3[0], [%[b]], %[kBPS] \n" | |
| 697 "vld1.32 d3[1], [%[b]] \n" | |
| 698 | |
| 699 // a d0/d2, b d1/d3 | |
| 700 // d0/d1: 01 01 01 01 | |
| 701 // d2/d3: 23 23 23 23 | |
| 702 // But: it goes 01 45 23 67 | |
| 703 // Notice the middle values are transposed | |
| 704 "vtrn.16 q0, q1 \n" | |
| 705 | |
| 706 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} | |
| 707 "vaddl.u8 q2, d0, d2 \n" | |
| 708 "vaddl.u8 q10, d1, d3 \n" | |
| 709 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} | |
| 710 "vsubl.u8 q3, d0, d2 \n" | |
| 711 "vsubl.u8 q11, d1, d3 \n" | |
| 712 | |
| 713 // tmp[0] = a0 + a1 | |
| 714 "vpaddl.s16 q0, q2 \n" | |
| 715 "vpaddl.s16 q8, q10 \n" | |
| 716 | |
| 717 // tmp[1] = a3 + a2 | |
| 718 "vpaddl.s16 q1, q3 \n" | |
| 719 "vpaddl.s16 q9, q11 \n" | |
| 720 | |
| 721 // No pair subtract | |
| 722 // q2 = {a0, a3} | |
| 723 // q3 = {a1, a2} | |
| 724 "vtrn.16 q2, q3 \n" | |
| 725 "vtrn.16 q10, q11 \n" | |
| 726 | |
| 727 // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2} | |
| 728 "vsubl.s16 q12, d4, d6 \n" | |
| 729 "vsubl.s16 q13, d5, d7 \n" | |
| 730 "vsubl.s16 q14, d20, d22 \n" | |
| 731 "vsubl.s16 q15, d21, d23 \n" | |
| 732 | |
| 733 // separate tmp[3] and tmp[2] | |
| 734 // q12 = tmp[3] | |
| 735 // q13 = tmp[2] | |
| 736 "vtrn.32 q12, q13 \n" | |
| 737 "vtrn.32 q14, q15 \n" | |
| 738 | |
| 739 // Transpose tmp for a | |
| 740 "vswp d1, d26 \n" // vtrn.64 | |
| 741 "vswp d3, d24 \n" // vtrn.64 | |
| 742 "vtrn.32 q0, q1 \n" | |
| 743 "vtrn.32 q13, q12 \n" | |
| 744 | |
| 745 // Transpose tmp for b | |
| 746 "vswp d17, d30 \n" // vtrn.64 | |
| 747 "vswp d19, d28 \n" // vtrn.64 | |
| 748 "vtrn.32 q8, q9 \n" | |
| 749 "vtrn.32 q15, q14 \n" | |
| 750 | |
| 751 // The first Q register is a, the second b. | |
| 752 // q0/8 tmp[0-3] | |
| 753 // q13/15 tmp[4-7] | |
| 754 // q1/9 tmp[8-11] | |
| 755 // q12/14 tmp[12-15] | |
| 756 | |
| 757 // These are still in 01 45 23 67 order. We fix it easily in the addition | |
| 758 // case but the subtraction propagates them. | |
| 759 "vswp d3, d27 \n" | |
| 760 "vswp d19, d31 \n" | |
| 761 | |
| 762 // a0 = tmp[0] + tmp[8] | |
| 763 "vadd.s32 q2, q0, q1 \n" | |
| 764 "vadd.s32 q3, q8, q9 \n" | |
| 765 | |
| 766 // a1 = tmp[4] + tmp[12] | |
| 767 "vadd.s32 q10, q13, q12 \n" | |
| 768 "vadd.s32 q11, q15, q14 \n" | |
| 769 | |
| 770 // a2 = tmp[4] - tmp[12] | |
| 771 "vsub.s32 q13, q13, q12 \n" | |
| 772 "vsub.s32 q15, q15, q14 \n" | |
| 773 | |
| 774 // a3 = tmp[0] - tmp[8] | |
| 775 "vsub.s32 q0, q0, q1 \n" | |
| 776 "vsub.s32 q8, q8, q9 \n" | |
| 777 | |
| 778 // b0 = a0 + a1 | |
| 779 "vadd.s32 q1, q2, q10 \n" | |
| 780 "vadd.s32 q9, q3, q11 \n" | |
| 781 | |
| 782 // b1 = a3 + a2 | |
| 783 "vadd.s32 q12, q0, q13 \n" | |
| 784 "vadd.s32 q14, q8, q15 \n" | |
| 785 | |
| 786 // b2 = a3 - a2 | |
| 787 "vsub.s32 q0, q0, q13 \n" | |
| 788 "vsub.s32 q8, q8, q15 \n" | |
| 789 | |
| 790 // b3 = a0 - a1 | |
| 791 "vsub.s32 q2, q2, q10 \n" | |
| 792 "vsub.s32 q3, q3, q11 \n" | |
| 793 | |
| 794 "vld1.64 {q10, q11}, [%[w]] \n" | |
| 795 | |
| 796 // abs(b0) | |
| 797 "vabs.s32 q1, q1 \n" | |
| 798 "vabs.s32 q9, q9 \n" | |
| 799 // abs(b1) | |
| 800 "vabs.s32 q12, q12 \n" | |
| 801 "vabs.s32 q14, q14 \n" | |
| 802 // abs(b2) | |
| 803 "vabs.s32 q0, q0 \n" | |
| 804 "vabs.s32 q8, q8 \n" | |
| 805 // abs(b3) | |
| 806 "vabs.s32 q2, q2 \n" | |
| 807 "vabs.s32 q3, q3 \n" | |
| 808 | |
| 809 // expand w before using. | |
| 810 "vmovl.u16 q13, d20 \n" | |
| 811 "vmovl.u16 q15, d21 \n" | |
| 812 | |
| 813 // w[0] * abs(b0) | |
| 814 "vmul.u32 q1, q1, q13 \n" | |
| 815 "vmul.u32 q9, q9, q13 \n" | |
| 816 | |
| 817 // w[4] * abs(b1) | |
| 818 "vmla.u32 q1, q12, q15 \n" | |
| 819 "vmla.u32 q9, q14, q15 \n" | |
| 820 | |
| 821 // expand w before using. | |
| 822 "vmovl.u16 q13, d22 \n" | |
| 823 "vmovl.u16 q15, d23 \n" | |
| 824 | |
| 825 // w[8] * abs(b2) | |
| 826 "vmla.u32 q1, q0, q13 \n" | |
| 827 "vmla.u32 q9, q8, q13 \n" | |
| 828 | |
| 829 // w[12] * abs(b3) | |
| 830 "vmla.u32 q1, q2, q15 \n" | |
| 831 "vmla.u32 q9, q3, q15 \n" | |
| 832 | |
| 833 // Sum the arrays | |
| 834 "vpaddl.u32 q1, q1 \n" | |
| 835 "vpaddl.u32 q9, q9 \n" | |
| 836 "vadd.u64 d2, d3 \n" | |
| 837 "vadd.u64 d18, d19 \n" | |
| 838 | |
| 839 // The Hadamard transform needs 4 bits of extra precision (2 bits in each | |
| 840 // direction) for dynamic range. Weights w[] are 16 bits at most, so the maximum | |
| 841 // precision for coeff is 8 bits of input + 4 bits of Hadamard transform + | |
| 842 // 16 bits for w[] + 2 bits of abs() summation. | |
| 843 // | |
| 844 // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is | |
| 845 // A-OK. | |
| 846 | |
| 847 // sum2 - sum1 | |
| 848 "vsub.u32 d0, d2, d18 \n" | |
| 849 // abs(sum2 - sum1) | |
| 850 "vabs.s32 d0, d0 \n" | |
| 851 // abs(sum2 - sum1) >> 5 | |
| 852 "vshr.u32 d0, #5 \n" | |
| 853 | |
| 854 // It would be better to move the value straight into r0 but I'm not | |
| 855 // entirely sure how this works with inline assembly. | |
| 856 "vmov.32 %[sum], d0[0] \n" | |
| 857 | |
| 858 : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W) | |
| 859 : [kBPS] "r"(kBPS) | |
| 860 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", | |
| 861 "q10", "q11", "q12", "q13", "q14", "q15" // clobbered | |
| 862 ) ; | |
| 863 | |
| 864 return sum; | |
| 865 } | |
| 866 | |
| 867 #endif // USE_INTRINSICS | |
| 868 | |
| 869 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, | 711 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, |
| 870 const uint16_t* const w) { | 712 const uint16_t* const w) { |
| 871 int D = 0; | 713 int D = 0; |
| 872 int x, y; | 714 int x, y; |
| 873 for (y = 0; y < 16 * BPS; y += 4 * BPS) { | 715 for (y = 0; y < 16 * BPS; y += 4 * BPS) { |
| 874 for (x = 0; x < 16; x += 4) { | 716 for (x = 0; x < 16; x += 4) { |
| 875 D += Disto4x4(a + x + y, b + x + y, w); | 717 D += Disto4x4(a + x + y, b + x + y, w); |
| 876 } | 718 } |
| 877 } | 719 } |
| 878 return D; | 720 return D; |
| 879 } | 721 } |
| 880 | 722 |
| 881 //------------------------------------------------------------------------------ | 723 //------------------------------------------------------------------------------ |
| 882 | 724 |
| 883 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, | 725 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, |
| 884 int start_block, int end_block, | 726 int start_block, int end_block, |
| 885 VP8Histogram* const histo) { | 727 VP8Histogram* const histo) { |
| 886 const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); | 728 const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); |
| 887 int j; | 729 int j; |
| 730 int distribution[MAX_COEFF_THRESH + 1] = { 0 }; |
| 888 for (j = start_block; j < end_block; ++j) { | 731 for (j = start_block; j < end_block; ++j) { |
| 889 int16_t out[16]; | 732 int16_t out[16]; |
| 890 FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); | 733 FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); |
| 891 { | 734 { |
| 892 int k; | 735 int k; |
| 893 const int16x8_t a0 = vld1q_s16(out + 0); | 736 const int16x8_t a0 = vld1q_s16(out + 0); |
| 894 const int16x8_t b0 = vld1q_s16(out + 8); | 737 const int16x8_t b0 = vld1q_s16(out + 8); |
| 895 const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0)); | 738 const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0)); |
| 896 const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0)); | 739 const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0)); |
| 897 const uint16x8_t a2 = vshrq_n_u16(a1, 3); | 740 const uint16x8_t a2 = vshrq_n_u16(a1, 3); |
| 898 const uint16x8_t b2 = vshrq_n_u16(b1, 3); | 741 const uint16x8_t b2 = vshrq_n_u16(b1, 3); |
| 899 const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh); | 742 const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh); |
| 900 const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh); | 743 const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh); |
| 901 vst1q_s16(out + 0, vreinterpretq_s16_u16(a3)); | 744 vst1q_s16(out + 0, vreinterpretq_s16_u16(a3)); |
| 902 vst1q_s16(out + 8, vreinterpretq_s16_u16(b3)); | 745 vst1q_s16(out + 8, vreinterpretq_s16_u16(b3)); |
| 903 // Convert coefficients to bin. | 746 // Convert coefficients to bin. |
| 904 for (k = 0; k < 16; ++k) { | 747 for (k = 0; k < 16; ++k) { |
| 905 histo->distribution[out[k]]++; | 748 ++distribution[out[k]]; |
| 906 } | 749 } |
| 907 } | 750 } |
| 908 } | 751 } |
| 752 VP8SetHistogramData(distribution, histo); |
| 909 } | 753 } |
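The vector section above turns each coefficient into a histogram bin as min(|coeff| >> 3, MAX_COEFF_THRESH) before the scalar counting loop. A per-coefficient scalar sketch (the constant's value is an assumption here; the real definition lives in the dsp headers):

```c
#include <stdint.h>

#define MAX_COEFF_THRESH 31  // assumed value; defined in the real headers

// Scalar sketch of the binning done by vabsq/vshrq/vminq above.
static int CoeffToBin(int16_t coeff) {
  int v = (coeff < 0) ? -coeff : coeff;                  // vabsq_s16
  v >>= 3;                                               // vshrq_n_u16(., 3)
  return (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;  // vminq_u16
}
```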
| 910 | 754 |
| 911 //------------------------------------------------------------------------------ | 755 //------------------------------------------------------------------------------ |
| 912 | 756 |
| 913 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, | 757 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, |
| 914 const uint8_t* const b, | 758 const uint8_t* const b, |
| 915 uint32x4_t* const sum) { | 759 uint32x4_t* const sum) { |
| 916 const uint8x16_t a0 = vld1q_u8(a); | 760 const uint8x16_t a0 = vld1q_u8(a); |
| 917 const uint8x16_t b0 = vld1q_u8(b); | 761 const uint8x16_t b0 = vld1q_u8(b); |
| 918 const uint8x16_t abs_diff = vabdq_u8(a0, b0); | 762 const uint8x16_t abs_diff = vabdq_u8(a0, b0); |
| (...skipping 123 matching lines...) |
| 1042 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); | 886 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); |
| 1043 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); | 887 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); |
| 1044 // test zeros | 888 // test zeros |
| 1045 if (*(uint64_t*)(out + 0) != 0) return 1; | 889 if (*(uint64_t*)(out + 0) != 0) return 1; |
| 1046 if (*(uint64_t*)(out + 4) != 0) return 1; | 890 if (*(uint64_t*)(out + 4) != 0) return 1; |
| 1047 if (*(uint64_t*)(out + 8) != 0) return 1; | 891 if (*(uint64_t*)(out + 8) != 0) return 1; |
| 1048 if (*(uint64_t*)(out + 12) != 0) return 1; | 892 if (*(uint64_t*)(out + 12) != 0) return 1; |
| 1049 return 0; | 893 return 0; |
| 1050 } | 894 } |
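The zero test above reads the 16 stored coefficients back as four 64-bit words, so a block can be declared all-zero in four compares. A scalar sketch of the same test; memcpy is used here to sidestep the strict-aliasing casts of the original:

```c
#include <stdint.h>
#include <string.h>

// Scalar sketch of the zero test: any nonzero 8-byte word means the
// quantized block still has coefficients to encode.
static int HasNonZero(const int16_t out[16]) {
  int i;
  for (i = 0; i < 16; i += 4) {
    uint64_t v;
    memcpy(&v, out + i, sizeof(v));
    if (v != 0) return 1;
  }
  return 0;
}
```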
| 1051 | 895 |
| 896 static int Quantize2Blocks(int16_t in[32], int16_t out[32], |
| 897 const VP8Matrix* const mtx) { |
| 898 int nz; |
| 899 nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; |
| 900 nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; |
| 901 return nz; |
| 902 } |
| 903 |
| 1052 #endif // !WORK_AROUND_GCC | 904 #endif // !WORK_AROUND_GCC |
| 1053 | 905 |
| 1054 #endif // WEBP_USE_NEON | |
| 1055 | |
| 1056 //------------------------------------------------------------------------------ | 906 //------------------------------------------------------------------------------ |
| 1057 // Entry point | 907 // Entry point |
| 1058 | 908 |
| 1059 extern void VP8EncDspInitNEON(void); | 909 extern void VP8EncDspInitNEON(void); |
| 1060 | 910 |
| 1061 void VP8EncDspInitNEON(void) { | 911 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { |
| 1062 #if defined(WEBP_USE_NEON) | |
| 1063 VP8ITransform = ITransform; | 912 VP8ITransform = ITransform; |
| 1064 VP8FTransform = FTransform; | 913 VP8FTransform = FTransform; |
| 1065 | 914 |
| 1066 VP8FTransformWHT = FTransformWHT; | 915 VP8FTransformWHT = FTransformWHT; |
| 1067 | 916 |
| 1068 VP8TDisto4x4 = Disto4x4; | 917 VP8TDisto4x4 = Disto4x4; |
| 1069 VP8TDisto16x16 = Disto16x16; | 918 VP8TDisto16x16 = Disto16x16; |
| 1070 VP8CollectHistogram = CollectHistogram; | 919 VP8CollectHistogram = CollectHistogram; |
| 1071 VP8SSE16x16 = SSE16x16; | 920 VP8SSE16x16 = SSE16x16; |
| 1072 VP8SSE16x8 = SSE16x8; | 921 VP8SSE16x8 = SSE16x8; |
| 1073 VP8SSE8x8 = SSE8x8; | 922 VP8SSE8x8 = SSE8x8; |
| 1074 VP8SSE4x4 = SSE4x4; | 923 VP8SSE4x4 = SSE4x4; |
| 1075 #if !defined(WORK_AROUND_GCC) | 924 #if !defined(WORK_AROUND_GCC) |
| 1076 VP8EncQuantizeBlock = QuantizeBlock; | 925 VP8EncQuantizeBlock = QuantizeBlock; |
| 926 VP8EncQuantize2Blocks = Quantize2Blocks; |
| 1077 #endif | 927 #endif |
| 1078 #endif // WEBP_USE_NEON | |
| 1079 } | 928 } |
| 929 |
| 930 #else // !WEBP_USE_NEON |
| 931 |
| 932 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON) |
| 933 |
| 934 #endif // WEBP_USE_NEON |