| Index: third_party/libwebp/dsp/neon.h
|
| diff --git a/third_party/libwebp/dsp/neon.h b/third_party/libwebp/dsp/neon.h
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..7e06eaeef967daa57edd8b75913f88cd4fed2c79
|
| --- /dev/null
|
| +++ b/third_party/libwebp/dsp/neon.h
|
| @@ -0,0 +1,82 @@
|
| +// Copyright 2014 Google Inc. All Rights Reserved.
|
| +//
|
| +// Use of this source code is governed by a BSD-style license
|
| +// that can be found in the COPYING file in the root of the source
|
| +// tree. An additional intellectual property rights grant can be found
|
| +// in the file PATENTS. All contributing project authors may
|
| +// be found in the AUTHORS file in the root of the source tree.
|
| +// -----------------------------------------------------------------------------
|
| +//
|
| +// NEON common code.
|
| +
|
| +#ifndef WEBP_DSP_NEON_H_
|
| +#define WEBP_DSP_NEON_H_
|
| +
|
| +#include <arm_neon.h>
|
| +
|
| +#include "./dsp.h"
|
| +
|
| +// Right now, some intrinsic functions seem slower, so we disable them
|
| +// everywhere except aarch64 where the inline assembly is incompatible.
|
| +#if defined(__aarch64__)
|
| +#define USE_INTRINSICS // use intrinsics when possible
|
| +#endif
|
| +
|
| +#define INIT_VECTOR2(v, a, b) do {  /* sets v.val[0..1] from a, b */ \
|
| +  (v).val[0] = (a); \
|
| +  (v).val[1] = (b); \
|
| +} while (0)
|
| +
|
| +#define INIT_VECTOR3(v, a, b, c) do {  /* sets v.val[0..2] from a..c */ \
|
| +  (v).val[0] = (a); \
|
| +  (v).val[1] = (b); \
|
| +  (v).val[2] = (c); \
|
| +} while (0)
|
| +
|
| +#define INIT_VECTOR4(v, a, b, c, d) do {  /* sets v.val[0..3] from a..d */ \
|
| +  (v).val[0] = (a); \
|
| +  (v).val[1] = (b); \
|
| +  (v).val[2] = (c); \
|
| +  (v).val[3] = (d); \
|
| +} while (0)
|
| +
|
| +// If using intrinsics, this flag avoids some functions that make gcc-4.6.3
|
| +// crash ("internal compiler error: in immed_double_const, at emit-rtl.c").
|
| +// (probably similar to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
|
| +#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
|
| +#define WORK_AROUND_GCC
|
| +#endif
|
| +
|
| +// Transposes a 4x4 matrix of 32-bit values: 'rows' holds one matrix row per
|
| +// vector and each returned vector holds one matrix column, i.e. lane j of
|
| +// out.val[i] equals lane i of rows.val[j].
|
| +static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
|
| +  // Reinterpret each row as two 64-bit halves so lanes can be moved in pairs.
|
| +  const uint64x2_t r0 = vreinterpretq_u64_s32(rows.val[0]);
|
| +  const uint64x2_t r1 = vreinterpretq_u64_s32(rows.val[1]);
|
| +  const uint64x2_t r2 = vreinterpretq_u64_s32(rows.val[2]);
|
| +  const uint64x2_t r3 = vreinterpretq_u64_s32(rows.val[3]);
|
| +
|
| +  // Transpose the 64-bit halves (there's no vswp equivalent): loXY packs the
|
| +  // low halves of rows X and Y, hiXY packs their high halves.
|
| +  const uint64x2_t lo02 = vcombine_u64(vget_low_u64(r0), vget_low_u64(r2));
|
| +  const uint64x2_t lo13 = vcombine_u64(vget_low_u64(r1), vget_low_u64(r3));
|
| +  const uint64x2_t hi02 = vcombine_u64(vget_high_u64(r0), vget_high_u64(r2));
|
| +  const uint64x2_t hi13 = vcombine_u64(vget_high_u64(r1), vget_high_u64(r3));
|
| +
|
| +  // Transpose the 32-bit lanes of each pair: lo* yields output vectors 0 and
|
| +  // 1, hi* yields output vectors 2 and 3.
|
| +  const int32x4x2_t cols01 = vtrnq_s32(vreinterpretq_s32_u64(lo02),
|
| +                                       vreinterpretq_s32_u64(lo13));
|
| +  const int32x4x2_t cols23 = vtrnq_s32(vreinterpretq_s32_u64(hi02),
|
| +                                       vreinterpretq_s32_u64(hi13));
|
| +  int32x4x4_t out;
|
| +
|
| +  out.val[0] = cols01.val[0];
|
| +  out.val[1] = cols01.val[1];
|
| +  out.val[2] = cols23.val[0];
|
| +  out.val[3] = cols23.val[1];
|
| +  return out;
|
| +}
|
| +
|
| +#endif // WEBP_DSP_NEON_H_
|
|
|