OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
12 #include "./vp9_rtcd.h" | 12 |
13 #include "./vpx_config.h" | 13 #include "./vpx_config.h" |
14 | 14 |
15 #include "vpx/vpx_integer.h" | 15 #include "vpx/vpx_integer.h" |
16 | 16 |
| 17 unsigned int vpx_sad8x16_neon( |
| 18 unsigned char *src_ptr, |
| 19 int src_stride, |
| 20 unsigned char *ref_ptr, |
| 21 int ref_stride) { |
| 22 uint8x8_t d0, d8; |
| 23 uint16x8_t q12; |
| 24 uint32x4_t q1; |
| 25 uint64x2_t q3; |
| 26 uint32x2_t d5; |
| 27 int i; |
| 28 |
| 29 d0 = vld1_u8(src_ptr); |
| 30 src_ptr += src_stride; |
| 31 d8 = vld1_u8(ref_ptr); |
| 32 ref_ptr += ref_stride; |
| 33 q12 = vabdl_u8(d0, d8); |
| 34 |
| 35 for (i = 0; i < 15; i++) { |
| 36 d0 = vld1_u8(src_ptr); |
| 37 src_ptr += src_stride; |
| 38 d8 = vld1_u8(ref_ptr); |
| 39 ref_ptr += ref_stride; |
| 40 q12 = vabal_u8(q12, d0, d8); |
| 41 } |
| 42 |
| 43 q1 = vpaddlq_u16(q12); |
| 44 q3 = vpaddlq_u32(q1); |
| 45 d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), |
| 46 vreinterpret_u32_u64(vget_high_u64(q3))); |
| 47 |
| 48 return vget_lane_u32(d5, 0); |
| 49 } |
| 50 |
| 51 unsigned int vpx_sad4x4_neon( |
| 52 unsigned char *src_ptr, |
| 53 int src_stride, |
| 54 unsigned char *ref_ptr, |
| 55 int ref_stride) { |
| 56 uint8x8_t d0, d8; |
| 57 uint16x8_t q12; |
| 58 uint32x2_t d1; |
| 59 uint64x1_t d3; |
| 60 int i; |
| 61 |
| 62 d0 = vld1_u8(src_ptr); |
| 63 src_ptr += src_stride; |
| 64 d8 = vld1_u8(ref_ptr); |
| 65 ref_ptr += ref_stride; |
| 66 q12 = vabdl_u8(d0, d8); |
| 67 |
| 68 for (i = 0; i < 3; i++) { |
| 69 d0 = vld1_u8(src_ptr); |
| 70 src_ptr += src_stride; |
| 71 d8 = vld1_u8(ref_ptr); |
| 72 ref_ptr += ref_stride; |
| 73 q12 = vabal_u8(q12, d0, d8); |
| 74 } |
| 75 |
| 76 d1 = vpaddl_u16(vget_low_u16(q12)); |
| 77 d3 = vpaddl_u32(d1); |
| 78 |
| 79 return vget_lane_u32(vreinterpret_u32_u64(d3), 0); |
| 80 } |
| 81 |
| 82 unsigned int vpx_sad16x8_neon( |
| 83 unsigned char *src_ptr, |
| 84 int src_stride, |
| 85 unsigned char *ref_ptr, |
| 86 int ref_stride) { |
| 87 uint8x16_t q0, q4; |
| 88 uint16x8_t q12, q13; |
| 89 uint32x4_t q1; |
| 90 uint64x2_t q3; |
| 91 uint32x2_t d5; |
| 92 int i; |
| 93 |
| 94 q0 = vld1q_u8(src_ptr); |
| 95 src_ptr += src_stride; |
| 96 q4 = vld1q_u8(ref_ptr); |
| 97 ref_ptr += ref_stride; |
| 98 q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); |
| 99 q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); |
| 100 |
| 101 for (i = 0; i < 7; i++) { |
| 102 q0 = vld1q_u8(src_ptr); |
| 103 src_ptr += src_stride; |
| 104 q4 = vld1q_u8(ref_ptr); |
| 105 ref_ptr += ref_stride; |
| 106 q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); |
| 107 q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); |
| 108 } |
| 109 |
| 110 q12 = vaddq_u16(q12, q13); |
| 111 q1 = vpaddlq_u16(q12); |
| 112 q3 = vpaddlq_u32(q1); |
| 113 d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), |
| 114 vreinterpret_u32_u64(vget_high_u64(q3))); |
| 115 |
| 116 return vget_lane_u32(d5, 0); |
| 117 } |
| 118 |
17 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, | 119 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, |
18 const uint16x8_t vec_hi) { | 120 const uint16x8_t vec_hi) { |
19 const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), | 121 const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), |
20 vget_high_u16(vec_lo)); | 122 vget_high_u16(vec_lo)); |
21 const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), | 123 const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), |
22 vget_high_u16(vec_hi)); | 124 vget_high_u16(vec_hi)); |
23 const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); | 125 const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); |
24 const uint64x2_t b = vpaddlq_u32(a); | 126 const uint64x2_t b = vpaddlq_u32(a); |
25 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), | 127 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), |
26 vreinterpret_u32_u64(vget_high_u64(b))); | 128 vreinterpret_u32_u64(vget_high_u64(b))); |
27 return vget_lane_u32(c, 0); | 129 return vget_lane_u32(c, 0); |
28 } | 130 } |
29 static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { | 131 static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { |
30 const uint32x4_t a = vpaddlq_u16(vec_16x8); | 132 const uint32x4_t a = vpaddlq_u16(vec_16x8); |
31 const uint64x2_t b = vpaddlq_u32(a); | 133 const uint64x2_t b = vpaddlq_u32(a); |
32 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), | 134 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), |
33 vreinterpret_u32_u64(vget_high_u64(b))); | 135 vreinterpret_u32_u64(vget_high_u64(b))); |
34 return vget_lane_u32(c, 0); | 136 return vget_lane_u32(c, 0); |
35 } | 137 } |
36 | 138 |
37 unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride, | 139 unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, |
38 const uint8_t *ref, int ref_stride) { | 140 const uint8_t *ref, int ref_stride) { |
39 int i; | 141 int i; |
40 uint16x8_t vec_accum_lo = vdupq_n_u16(0); | 142 uint16x8_t vec_accum_lo = vdupq_n_u16(0); |
41 uint16x8_t vec_accum_hi = vdupq_n_u16(0); | 143 uint16x8_t vec_accum_hi = vdupq_n_u16(0); |
42 for (i = 0; i < 64; ++i) { | 144 for (i = 0; i < 64; ++i) { |
43 const uint8x16_t vec_src_00 = vld1q_u8(src); | 145 const uint8x16_t vec_src_00 = vld1q_u8(src); |
44 const uint8x16_t vec_src_16 = vld1q_u8(src + 16); | 146 const uint8x16_t vec_src_16 = vld1q_u8(src + 16); |
45 const uint8x16_t vec_src_32 = vld1q_u8(src + 32); | 147 const uint8x16_t vec_src_32 = vld1q_u8(src + 32); |
46 const uint8x16_t vec_src_48 = vld1q_u8(src + 48); | 148 const uint8x16_t vec_src_48 = vld1q_u8(src + 48); |
47 const uint8x16_t vec_ref_00 = vld1q_u8(ref); | 149 const uint8x16_t vec_ref_00 = vld1q_u8(ref); |
(...skipping 15 matching lines...) |
63 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), | 165 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), |
64 vget_high_u8(vec_ref_32)); | 166 vget_high_u8(vec_ref_32)); |
65 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), | 167 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), |
66 vget_low_u8(vec_ref_48)); | 168 vget_low_u8(vec_ref_48)); |
67 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), | 169 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), |
68 vget_high_u8(vec_ref_48)); | 170 vget_high_u8(vec_ref_48)); |
69 } | 171 } |
70 return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); | 172 return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); |
71 } | 173 } |
72 | 174 |
73 unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride, | 175 unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride, |
74 const uint8_t *ref, int ref_stride) { | 176 const uint8_t *ref, int ref_stride) { |
75 int i; | 177 int i; |
76 uint16x8_t vec_accum_lo = vdupq_n_u16(0); | 178 uint16x8_t vec_accum_lo = vdupq_n_u16(0); |
77 uint16x8_t vec_accum_hi = vdupq_n_u16(0); | 179 uint16x8_t vec_accum_hi = vdupq_n_u16(0); |
78 | 180 |
79 for (i = 0; i < 32; ++i) { | 181 for (i = 0; i < 32; ++i) { |
80 const uint8x16_t vec_src_00 = vld1q_u8(src); | 182 const uint8x16_t vec_src_00 = vld1q_u8(src); |
81 const uint8x16_t vec_src_16 = vld1q_u8(src + 16); | 183 const uint8x16_t vec_src_16 = vld1q_u8(src + 16); |
82 const uint8x16_t vec_ref_00 = vld1q_u8(ref); | 184 const uint8x16_t vec_ref_00 = vld1q_u8(ref); |
83 const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); | 185 const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); |
84 src += src_stride; | 186 src += src_stride; |
85 ref += ref_stride; | 187 ref += ref_stride; |
86 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), | 188 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), |
87 vget_low_u8(vec_ref_00)); | 189 vget_low_u8(vec_ref_00)); |
88 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), | 190 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), |
89 vget_high_u8(vec_ref_00)); | 191 vget_high_u8(vec_ref_00)); |
90 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), | 192 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), |
91 vget_low_u8(vec_ref_16)); | 193 vget_low_u8(vec_ref_16)); |
92 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), | 194 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), |
93 vget_high_u8(vec_ref_16)); | 195 vget_high_u8(vec_ref_16)); |
94 } | 196 } |
95 return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); | 197 return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); |
96 } | 198 } |
97 | 199 |
98 unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, | 200 unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, |
99 const uint8_t *ref, int ref_stride) { | 201 const uint8_t *ref, int ref_stride) { |
100 int i; | 202 int i; |
101 uint16x8_t vec_accum_lo = vdupq_n_u16(0); | 203 uint16x8_t vec_accum_lo = vdupq_n_u16(0); |
102 uint16x8_t vec_accum_hi = vdupq_n_u16(0); | 204 uint16x8_t vec_accum_hi = vdupq_n_u16(0); |
103 | 205 |
104 for (i = 0; i < 16; ++i) { | 206 for (i = 0; i < 16; ++i) { |
105 const uint8x16_t vec_src = vld1q_u8(src); | 207 const uint8x16_t vec_src = vld1q_u8(src); |
106 const uint8x16_t vec_ref = vld1q_u8(ref); | 208 const uint8x16_t vec_ref = vld1q_u8(ref); |
107 src += src_stride; | 209 src += src_stride; |
108 ref += ref_stride; | 210 ref += ref_stride; |
109 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), | 211 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), |
110 vget_low_u8(vec_ref)); | 212 vget_low_u8(vec_ref)); |
111 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), | 213 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), |
112 vget_high_u8(vec_ref)); | 214 vget_high_u8(vec_ref)); |
113 } | 215 } |
114 return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); | 216 return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); |
115 } | 217 } |
116 | 218 |
117 unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride, | 219 unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride, |
118 const uint8_t *ref, int ref_stride) { | 220 const uint8_t *ref, int ref_stride) { |
119 int i; | 221 int i; |
120 uint16x8_t vec_accum = vdupq_n_u16(0); | 222 uint16x8_t vec_accum = vdupq_n_u16(0); |
121 | 223 |
122 for (i = 0; i < 8; ++i) { | 224 for (i = 0; i < 8; ++i) { |
123 const uint8x8_t vec_src = vld1_u8(src); | 225 const uint8x8_t vec_src = vld1_u8(src); |
124 const uint8x8_t vec_ref = vld1_u8(ref); | 226 const uint8x8_t vec_ref = vld1_u8(ref); |
125 src += src_stride; | 227 src += src_stride; |
126 ref += ref_stride; | 228 ref += ref_stride; |
127 vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); | 229 vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); |
128 } | 230 } |
129 return horizontal_add_16x8(vec_accum); | 231 return horizontal_add_16x8(vec_accum); |
130 } | 232 } |
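
A minimal standalone sketch, not part of the patch above: it compares the renamed NEON 16x16 SAD against a plain scalar reference on a synthetic block. It assumes an ARM/NEON build of libvpx so that vpx_sad16x16_neon, with the signature shown in the new file, is available to link against; the sad_c helper and the main driver here are illustrative additions only, not libvpx code.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Prototype matching the definition in the new file above (assumed linked in). */
unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride);

/* Scalar reference: sum of absolute differences over a w x h block. */
static unsigned int sad_c(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          int w, int h) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c)
      sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

int main(void) {
  uint8_t src[16 * 16], ref[16 * 16];
  int i;
  /* Fill source and reference blocks with pseudo-random pixels. */
  for (i = 0; i < 16 * 16; ++i) {
    src[i] = (uint8_t)(rand() & 0xff);
    ref[i] = (uint8_t)(rand() & 0xff);
  }
  /* The two results should match for any input. */
  printf("neon=%u c=%u\n",
         vpx_sad16x16_neon(src, 16, ref, 16),
         sad_c(src, 16, ref, 16, 16, 16));
  return 0;
}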