/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "vpx_ports/mem.h"

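/*
 * Each vp8_variance{WxH}_neon function below accumulates, for a WxH block,
 * the sum of pixel differences and the sum of squared differences (SSE)
 * between src_ptr and ref_ptr, then returns
 *     variance = sse - (sum * sum) / (W * H).
 * The division is implemented as a right shift because W * H is a power of
 * two: 16x16 -> >> 8, 16x8 and 8x16 -> >> 7, 8x8 -> >> 6.
 */
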
unsigned int vp8_variance16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    /* q8s32 accumulates the sum of differences; q9s32 and q10s32 accumulate
     * the sum of squared differences. */
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 8; i++) {  // two rows of 16 pixels per iteration
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        /* Widening subtract; reinterpreting the u16 result as s16 yields the
         * signed per-pixel difference in [-255, 255]. */
        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    /* Reduce the sum and SSE accumulators to scalars. */
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum * sum / (16 * 16) */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance16x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum * sum / (16 * 8) */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d2u8, d4u8, d6u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    /* Rows are only 8 pixels wide, so 64-bit loads are used here. */
    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d2u8, d6u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum * sum / (8 * 16) */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop, four rows per pass
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d1u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d3u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d5u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d7u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d1u8, d5u8);
        q13u16 = vsubl_u8(d2u8, d6u8);
        q14u16 = vsubl_u8(d3u8, d7u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum * sum / (8 * 8) */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}