OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
12 #include "./vp9_rtcd.h" | 12 #include "./vp9_rtcd.h" |
13 #include "./vpx_config.h" | 13 #include "./vpx_config.h" |
14 | 14 |
15 #include "vpx/vpx_integer.h" | 15 #include "vpx/vpx_integer.h" |
16 | 16 |
17 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, | 17 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, |
18 const uint16x8_t vec_hi) { | 18 const uint16x8_t vec_hi) { |
19 const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), | 19 const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), |
20 vget_high_u16(vec_lo)); | 20 vget_high_u16(vec_lo)); |
21 const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), | 21 const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), |
22 vget_high_u16(vec_hi)); | 22 vget_high_u16(vec_hi)); |
23 const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); | 23 const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); |
24 const uint64x2_t b = vpaddlq_u32(a); | 24 const uint64x2_t b = vpaddlq_u32(a); |
25 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), | 25 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), |
26 vreinterpret_u32_u64(vget_high_u64(b))); | 26 vreinterpret_u32_u64(vget_high_u64(b))); |
27 return vget_lane_u32(c, 0); | 27 return vget_lane_u32(c, 0); |
28 } | 28 } |
29 static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_lo, | 29 static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { |
30 const uint16x8_t vec_hi) { | 30 const uint32x4_t a = vpaddlq_u16(vec_16x8); |
31 const uint32x4_t a = vpaddlq_u16(vaddq_u16(vec_lo, vec_hi)); | |
32 const uint64x2_t b = vpaddlq_u32(a); | 31 const uint64x2_t b = vpaddlq_u32(a); |
33 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), | 32 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), |
34 vreinterpret_u32_u64(vget_high_u64(b))); | 33 vreinterpret_u32_u64(vget_high_u64(b))); |
35 return vget_lane_u32(c, 0); | 34 return vget_lane_u32(c, 0); |
36 } | 35 } |
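
Note: the refactor above narrows horizontal_add_16x8 to a single uint16x8_t argument; the cross-vector vaddq_u16 moves out to the call sites, so the new single-accumulator vp9_sad8x8_neon below can reuse the same reduction. Lane-wise, the vpaddlq/vadd chain just sums all eight 16-bit lanes. A minimal scalar sketch of that reduction (standalone name assumed, not part of the patch):

#include <stdint.h>

/* Scalar equivalent of horizontal_add_16x8: sum the eight 16-bit
 * lanes into one 32-bit result (illustration, not the intrinsics). */
static unsigned int horizontal_add_16x8_scalar(const uint16_t lanes[8]) {
  unsigned int sum = 0;
  int i;
  for (i = 0; i < 8; ++i)
    sum += lanes[i];
  return sum;
}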
37 | 36 |
38 unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride, | 37 unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride, |
39 const uint8_t *ref, int ref_stride) { | 38 const uint8_t *ref, int ref_stride) { |
40 int i; | 39 int i; |
41 uint16x8_t vec_accum_lo = vdupq_n_u16(0); | 40 uint16x8_t vec_accum_lo = vdupq_n_u16(0); |
(...skipping 44 matching lines...)
86 ref += ref_stride; | 85 ref += ref_stride; |
87 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), | 86 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), |
88 vget_low_u8(vec_ref_00)); | 87 vget_low_u8(vec_ref_00)); |
89 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), | 88 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), |
90 vget_high_u8(vec_ref_00)); | 89 vget_high_u8(vec_ref_00)); |
91 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), | 90 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), |
92 vget_low_u8(vec_ref_16)); | 91 vget_low_u8(vec_ref_16)); |
93 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), | 92 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), |
94 vget_high_u8(vec_ref_16)); | 93 vget_high_u8(vec_ref_16)); |
95 } | 94 } |
96 return horizontal_add_16x8(vec_accum_lo, vec_accum_hi); | 95 return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); |
97 } | 96 } |
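
Note: each vabal_u8 call adds one absolute byte difference (at most 255) into each 16-bit accumulator lane. From the visible loads at offsets 0 and 16, every lane of vec_accum_lo/vec_accum_hi takes two such additions per row; if the elided lines feed offsets 32 and 48 into the same two accumulators in the same pattern, that is four per row, bounding each lane at 64 * 4 * 255 = 65280, which still fits in 16 bits. For cross-checking, a plain-C reference in the spirit of the scalar fallback ("scalar_sad" is a hypothetical name, not the library's):

#include <stdint.h>
#include <stdlib.h>

/* Plain-C SAD over a width x height block (sketch for verification,
 * not the library's C implementation). */
static unsigned int scalar_sad(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int width, int height) {
  int x, y;
  unsigned int sad = 0;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x)
      sad += abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}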
98 | 97 |
99 unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, | 98 unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, |
100 const uint8_t *ref, int ref_stride) { | 99 const uint8_t *ref, int ref_stride) { |
101 int i; | 100 int i; |
102 uint16x8_t vec_accum_lo = vdupq_n_u16(0); | 101 uint16x8_t vec_accum_lo = vdupq_n_u16(0); |
103 uint16x8_t vec_accum_hi = vdupq_n_u16(0); | 102 uint16x8_t vec_accum_hi = vdupq_n_u16(0); |
104 | 103 |
105 for (i = 0; i < 16; ++i) { | 104 for (i = 0; i < 16; ++i) { |
106 const uint8x16_t vec_src = vld1q_u8(src); | 105 const uint8x16_t vec_src = vld1q_u8(src); |
107 const uint8x16_t vec_ref = vld1q_u8(ref); | 106 const uint8x16_t vec_ref = vld1q_u8(ref); |
108 src += src_stride; | 107 src += src_stride; |
109 ref += ref_stride; | 108 ref += ref_stride; |
110 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), | 109 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), |
111 vget_low_u8(vec_ref)); | 110 vget_low_u8(vec_ref)); |
112 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), | 111 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), |
113 vget_high_u8(vec_ref)); | 112 vget_high_u8(vec_ref)); |
114 } | 113 } |
115 return horizontal_add_16x8(vec_accum_lo, vec_accum_hi); | 114 return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); |
116 } | 115 } |
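
Note: the 16x16 loop is the same pattern with one 16-byte load per row, so each 16-bit lane accumulates at most 16 * 255 = 4080, nowhere near overflow. A quick spot check against the scalar sketch above (buffers, strides, and the helper name are assumptions):

#include <assert.h>

/* Hypothetical spot check: the NEON path should match the scalar
 * reference on the same 16x16 block (src/ref assumed filled). */
static void check_sad16x16(const uint8_t *src, const uint8_t *ref) {
  assert(vp9_sad16x16_neon(src, 16, ref, 16) ==
         scalar_sad(src, 16, ref, 16, 16, 16));
}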
| 116 |
| 117 unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride, |
| 118 const uint8_t *ref, int ref_stride) { |
| 119 int i; |
| 120 uint16x8_t vec_accum = vdupq_n_u16(0); |
| 121 |
| 122 for (i = 0; i < 8; ++i) { |
| 123 const uint8x8_t vec_src = vld1_u8(src); |
| 124 const uint8x8_t vec_ref = vld1_u8(ref); |
| 125 src += src_stride; |
| 126 ref += ref_stride; |
| 127 vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); |
| 128 } |
| 129 return horizontal_add_16x8(vec_accum); |
| 130 } |
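
Note: the new 8x8 path is what motivates the one-argument helper; with only eight bytes per row, a single uint16x8_t accumulator suffices (at most 8 * 255 = 2040 per lane), and there is no second vector to fold in before the reduction. Lane-wise, vabal_u8 behaves like the following sketch (illustration only, not the intrinsic):

#include <stdint.h>

/* Lane-wise meaning of vabal_u8(acc, a, b): accumulate the absolute
 * difference of 8-bit inputs, widened into 16-bit lanes. */
static void vabal_u8_scalar(uint16_t acc[8], const uint8_t a[8],
                            const uint8_t b[8]) {
  int i;
  for (i = 0; i < 8; ++i)
    acc[i] = (uint16_t)(acc[i] + (a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]));
}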