OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 #include <arm_neon.h> | |
12 #include "./vp9_rtcd.h" | |
13 #include "./vpx_config.h" | |
14 | |
15 #include "vpx/vpx_integer.h" | |
16 | |
17 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, | |
18 const uint16x8_t vec_hi) { | |
19 const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), | |
20 vget_high_u16(vec_lo)); | |
21 const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), | |
22 vget_high_u16(vec_hi)); | |
23 const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); | |
24 const uint64x2_t b = vpaddlq_u32(a); | |
25 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), | |
26 vreinterpret_u32_u64(vget_high_u64(b))); | |
27 return vget_lane_u32(c, 0); | |
28 } | |
29 | |
30 // Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, | |
31 // vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo | |
32 // and vec_sum_ref_hi. | |
33 static void sad_neon_64(const uint8x16_t vec_src_00, | |
34 const uint8x16_t vec_src_16, | |
35 const uint8x16_t vec_src_32, | |
36 const uint8x16_t vec_src_48, | |
37 const uint8_t *ref, | |
38 uint16x8_t *vec_sum_ref_lo, | |
39 uint16x8_t *vec_sum_ref_hi) { | |
40 const uint8x16_t vec_ref_00 = vld1q_u8(ref); | |
41 const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); | |
42 const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); | |
43 const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); | |
44 | |
45 *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), | |
46 vget_low_u8(vec_ref_00)); | |
47 *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), | |
48 vget_high_u8(vec_ref_00)); | |
49 *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), | |
50 vget_low_u8(vec_ref_16)); | |
51 *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), | |
52 vget_high_u8(vec_ref_16)); | |
53 *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), | |
54 vget_low_u8(vec_ref_32)); | |
55 *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), | |
56 vget_high_u8(vec_ref_32)); | |
57 *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), | |
58 vget_low_u8(vec_ref_48)); | |
59 *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), | |
60 vget_high_u8(vec_ref_48)); | |
61 } | |
62 | |
63 // Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, | |
64 // and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. | |
65 static void sad_neon_32(const uint8x16_t vec_src_00, | |
66 const uint8x16_t vec_src_16, | |
67 const uint8_t *ref, | |
68 uint16x8_t *vec_sum_ref_lo, | |
69 uint16x8_t *vec_sum_ref_hi) { | |
70 const uint8x16_t vec_ref_00 = vld1q_u8(ref); | |
71 const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); | |
72 | |
73 *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), | |
74 vget_low_u8(vec_ref_00)); | |
75 *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), | |
76 vget_high_u8(vec_ref_00)); | |
77 *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), | |
78 vget_low_u8(vec_ref_16)); | |
79 *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), | |
80 vget_high_u8(vec_ref_16)); | |
81 } | |
82 | |
83 void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride, | |
84 const uint8_t* const ref[4], int ref_stride, | |
85 unsigned int *res) { | |
86 int i; | |
87 uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); | |
88 uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); | |
89 uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); | |
90 uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); | |
91 uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); | |
92 uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); | |
93 uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); | |
94 uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); | |
95 const uint8_t *ref0, *ref1, *ref2, *ref3; | |
96 ref0 = ref[0]; | |
97 ref1 = ref[1]; | |
98 ref2 = ref[2]; | |
99 ref3 = ref[3]; | |
100 | |
101 for (i = 0; i < 64; ++i) { | |
102 const uint8x16_t vec_src_00 = vld1q_u8(src); | |
103 const uint8x16_t vec_src_16 = vld1q_u8(src + 16); | |
104 const uint8x16_t vec_src_32 = vld1q_u8(src + 32); | |
105 const uint8x16_t vec_src_48 = vld1q_u8(src + 48); | |
106 | |
107 sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, | |
108 &vec_sum_ref0_lo, &vec_sum_ref0_hi); | |
109 sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, | |
110 &vec_sum_ref1_lo, &vec_sum_ref1_hi); | |
111 sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, | |
112 &vec_sum_ref2_lo, &vec_sum_ref2_hi); | |
113 sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, | |
114 &vec_sum_ref3_lo, &vec_sum_ref3_hi); | |
115 | |
116 src += src_stride; | |
117 ref0 += ref_stride; | |
118 ref1 += ref_stride; | |
119 ref2 += ref_stride; | |
120 ref3 += ref_stride; | |
121 } | |
122 | |
123 res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); | |
124 res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); | |
125 res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); | |
126 res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); | |
127 } | |
128 | |
129 void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride, | |
130 const uint8_t* const ref[4], int ref_stride, | |
131 unsigned int *res) { | |
132 int i; | |
133 uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); | |
134 uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); | |
135 uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); | |
136 uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); | |
137 uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); | |
138 uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); | |
139 uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); | |
140 uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); | |
141 const uint8_t *ref0, *ref1, *ref2, *ref3; | |
142 ref0 = ref[0]; | |
143 ref1 = ref[1]; | |
144 ref2 = ref[2]; | |
145 ref3 = ref[3]; | |
146 | |
147 for (i = 0; i < 32; ++i) { | |
148 const uint8x16_t vec_src_00 = vld1q_u8(src); | |
149 const uint8x16_t vec_src_16 = vld1q_u8(src + 16); | |
150 | |
151 sad_neon_32(vec_src_00, vec_src_16, ref0, | |
152 &vec_sum_ref0_lo, &vec_sum_ref0_hi); | |
153 sad_neon_32(vec_src_00, vec_src_16, ref1, | |
154 &vec_sum_ref1_lo, &vec_sum_ref1_hi); | |
155 sad_neon_32(vec_src_00, vec_src_16, ref2, | |
156 &vec_sum_ref2_lo, &vec_sum_ref2_hi); | |
157 sad_neon_32(vec_src_00, vec_src_16, ref3, | |
158 &vec_sum_ref3_lo, &vec_sum_ref3_hi); | |
159 | |
160 src += src_stride; | |
161 ref0 += ref_stride; | |
162 ref1 += ref_stride; | |
163 ref2 += ref_stride; | |
164 ref3 += ref_stride; | |
165 } | |
166 | |
167 res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); | |
168 res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); | |
169 res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); | |
170 res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); | |
171 } | |
172 | |
173 void vp9_sad16x16x4d_neon(const uint8_t *src, int src_stride, | |
174 const uint8_t* const ref[4], int ref_stride, | |
175 unsigned int *res) { | |
176 int i; | |
177 uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); | |
178 uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); | |
179 uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); | |
180 uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); | |
181 uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); | |
182 uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); | |
183 uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); | |
184 uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); | |
185 const uint8_t *ref0, *ref1, *ref2, *ref3; | |
186 ref0 = ref[0]; | |
187 ref1 = ref[1]; | |
188 ref2 = ref[2]; | |
189 ref3 = ref[3]; | |
190 | |
191 for (i = 0; i < 16; ++i) { | |
192 const uint8x16_t vec_src = vld1q_u8(src); | |
193 const uint8x16_t vec_ref0 = vld1q_u8(ref0); | |
194 const uint8x16_t vec_ref1 = vld1q_u8(ref1); | |
195 const uint8x16_t vec_ref2 = vld1q_u8(ref2); | |
196 const uint8x16_t vec_ref3 = vld1q_u8(ref3); | |
197 | |
198 vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), | |
199 vget_low_u8(vec_ref0)); | |
200 vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), | |
201 vget_high_u8(vec_ref0)); | |
202 vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), | |
203 vget_low_u8(vec_ref1)); | |
204 vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), | |
205 vget_high_u8(vec_ref1)); | |
206 vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), | |
207 vget_low_u8(vec_ref2)); | |
208 vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), | |
209 vget_high_u8(vec_ref2)); | |
210 vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), | |
211 vget_low_u8(vec_ref3)); | |
212 vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), | |
213 vget_high_u8(vec_ref3)); | |
214 | |
215 src += src_stride; | |
216 ref0 += ref_stride; | |
217 ref1 += ref_stride; | |
218 ref2 += ref_stride; | |
219 ref3 += ref_stride; | |
220 } | |
221 | |
222 res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); | |
223 res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); | |
224 res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); | |
225 res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); | |
226 } | |
OLD | NEW |