OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
12 #include "./vp9_rtcd.h" | 12 |
13 #include "./vpx_config.h" | 13 #include "./vpx_config.h" |
14 | 14 #include "./vpx_dsp_rtcd.h" |
15 #include "vpx/vpx_integer.h" | 15 #include "vpx/vpx_integer.h" |
16 | 16 |
17 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, | 17 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, |
18 const uint16x8_t vec_hi) { | 18 const uint16x8_t vec_hi) { |
19 const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), | 19 const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), |
20 vget_high_u16(vec_lo)); | 20 vget_high_u16(vec_lo)); |
21 const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), | 21 const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), |
22 vget_high_u16(vec_hi)); | 22 vget_high_u16(vec_hi)); |
23 const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); | 23 const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); |
24 const uint64x2_t b = vpaddlq_u32(a); | 24 const uint64x2_t b = vpaddlq_u32(a); |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
73 *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), | 73 *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), |
74 vget_low_u8(vec_ref_00)); | 74 vget_low_u8(vec_ref_00)); |
75 *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), | 75 *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), |
76 vget_high_u8(vec_ref_00)); | 76 vget_high_u8(vec_ref_00)); |
77 *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), | 77 *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), |
78 vget_low_u8(vec_ref_16)); | 78 vget_low_u8(vec_ref_16)); |
79 *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), | 79 *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), |
80 vget_high_u8(vec_ref_16)); | 80 vget_high_u8(vec_ref_16)); |
81 } | 81 } |
82 | 82 |
83 void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride, | 83 void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, |
84 const uint8_t* const ref[4], int ref_stride, | 84 const uint8_t* const ref[4], int ref_stride, |
85 unsigned int *res) { | 85 uint32_t *res) { |
86 int i; | 86 int i; |
87 uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); | 87 uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); |
88 uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); | 88 uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); |
89 uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); | 89 uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); |
90 uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); | 90 uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); |
91 uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); | 91 uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); |
92 uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); | 92 uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); |
93 uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); | 93 uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); |
94 uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); | 94 uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); |
95 const uint8_t *ref0, *ref1, *ref2, *ref3; | 95 const uint8_t *ref0, *ref1, *ref2, *ref3; |
(...skipping 23 matching lines...) Expand all Loading... |
119 ref2 += ref_stride; | 119 ref2 += ref_stride; |
120 ref3 += ref_stride; | 120 ref3 += ref_stride; |
121 } | 121 } |
122 | 122 |
123 res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); | 123 res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); |
124 res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); | 124 res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); |
125 res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); | 125 res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); |
126 res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); | 126 res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); |
127 } | 127 } |
128 | 128 |
129 void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride, | 129 void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, |
130 const uint8_t* const ref[4], int ref_stride, | 130 const uint8_t* const ref[4], int ref_stride, |
131 unsigned int *res) { | 131 uint32_t *res) { |
132 int i; | 132 int i; |
133 uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); | 133 uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); |
134 uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); | 134 uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); |
135 uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); | 135 uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); |
136 uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); | 136 uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); |
137 uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); | 137 uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); |
138 uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); | 138 uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); |
139 uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); | 139 uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); |
140 uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); | 140 uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); |
141 const uint8_t *ref0, *ref1, *ref2, *ref3; | 141 const uint8_t *ref0, *ref1, *ref2, *ref3; |
(...skipping 21 matching lines...) Expand all Loading... |
163 ref2 += ref_stride; | 163 ref2 += ref_stride; |
164 ref3 += ref_stride; | 164 ref3 += ref_stride; |
165 } | 165 } |
166 | 166 |
167 res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); | 167 res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); |
168 res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); | 168 res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); |
169 res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); | 169 res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); |
170 res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); | 170 res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); |
171 } | 171 } |
172 | 172 |
173 void vp9_sad16x16x4d_neon(const uint8_t *src, int src_stride, | 173 void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, |
174 const uint8_t* const ref[4], int ref_stride, | 174 const uint8_t* const ref[4], int ref_stride, |
175 unsigned int *res) { | 175 uint32_t *res) { |
176 int i; | 176 int i; |
177 uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); | 177 uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); |
178 uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); | 178 uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); |
179 uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); | 179 uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); |
180 uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); | 180 uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); |
181 uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); | 181 uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); |
182 uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); | 182 uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); |
183 uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); | 183 uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); |
184 uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); | 184 uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); |
185 const uint8_t *ref0, *ref1, *ref2, *ref3; | 185 const uint8_t *ref0, *ref1, *ref2, *ref3; |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
217 ref1 += ref_stride; | 217 ref1 += ref_stride; |
218 ref2 += ref_stride; | 218 ref2 += ref_stride; |
219 ref3 += ref_stride; | 219 ref3 += ref_stride; |
220 } | 220 } |
221 | 221 |
222 res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); | 222 res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); |
223 res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); | 223 res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); |
224 res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); | 224 res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); |
225 res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); | 225 res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); |
226 } | 226 } |
OLD | NEW |