OLD | NEW |
(Empty) | |
| 1 // Copyright 2014 Google Inc. All Rights Reserved. |
| 2 // |
| 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- |
| 9 // |
| 10 // NEON common code. |
| 11 |
| 12 #ifndef WEBP_DSP_NEON_H_ |
| 13 #define WEBP_DSP_NEON_H_ |
| 14 |
| 15 #include <arm_neon.h> |
| 16 |
| 17 #include "./dsp.h" |
| 18 |
// Some intrinsic functions currently appear to be slower than the inline
// assembly, so intrinsics are only enabled on aarch64, where the inline
// assembly is incompatible.
#ifdef __aarch64__
#define USE_INTRINSICS   // use intrinsics when possible
#endif
| 24 |
// Stores 'a' and 'b' into elements 0 and 1 of the 'val' array member of the
// aggregate 'v' (any type exposing a 'val' array of at least two elements).
// All parameters are parenthesized so that expression arguments such as
// '*ptr' expand correctly. 'v' is expanded once per assignment, so it must
// not have side effects.
#define INIT_VECTOR2(v, a, b) do {  \
  (v).val[0] = (a);                 \
  (v).val[1] = (b);                 \
} while (0)
| 29 |
// Stores 'a', 'b' and 'c' into elements 0..2 of the 'val' array member of the
// aggregate 'v' (any type exposing a 'val' array of at least three elements).
// All parameters are parenthesized so that expression arguments such as
// '*ptr' expand correctly. 'v' is expanded once per assignment, so it must
// not have side effects.
#define INIT_VECTOR3(v, a, b, c) do {  \
  (v).val[0] = (a);                    \
  (v).val[1] = (b);                    \
  (v).val[2] = (c);                    \
} while (0)
| 35 |
// Stores 'a', 'b', 'c' and 'd' into elements 0..3 of the 'val' array member
// of the aggregate 'v' (any type exposing a 'val' array of at least four
// elements). All parameters are parenthesized so that expression arguments
// such as '*ptr' expand correctly. 'v' is expanded once per assignment, so it
// must not have side effects.
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
} while (0)
| 42 |
| 43 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3 |
| 44 // crash ("internal compiler error: in immed_double_const, at emit-rtl."). |
| 45 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) |
| 46 #if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) |
| 47 #define WORK_AROUND_GCC |
| 48 #endif |
| 49 |
| 50 static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) { |
| 51 uint64x2x2_t row01, row23; |
| 52 |
| 53 row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); |
| 54 row01.val[1] = vreinterpretq_u64_s32(rows.val[1]); |
| 55 row23.val[0] = vreinterpretq_u64_s32(rows.val[2]); |
| 56 row23.val[1] = vreinterpretq_u64_s32(rows.val[3]); |
| 57 // Transpose 64-bit values (there's no vswp equivalent) |
| 58 { |
| 59 const uint64x1_t row0h = vget_high_u64(row01.val[0]); |
| 60 const uint64x1_t row2l = vget_low_u64(row23.val[0]); |
| 61 const uint64x1_t row1h = vget_high_u64(row01.val[1]); |
| 62 const uint64x1_t row3l = vget_low_u64(row23.val[1]); |
| 63 row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l); |
| 64 row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0])); |
| 65 row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l); |
| 66 row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1])); |
| 67 } |
| 68 { |
| 69 const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]), |
| 70 vreinterpretq_s32_u64(row01.val[1])); |
| 71 const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]), |
| 72 vreinterpretq_s32_u64(row23.val[1])); |
| 73 int32x4x4_t out; |
| 74 out.val[0] = out01.val[0]; |
| 75 out.val[1] = out01.val[1]; |
| 76 out.val[2] = out23.val[0]; |
| 77 out.val[3] = out23.val[1]; |
| 78 return out; |
| 79 } |
| 80 } |
| 81 |
| 82 #endif // WEBP_DSP_NEON_H_ |
OLD | NEW |