OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
12 #include "./vp9_rtcd.h" | 12 #include "./vp9_rtcd.h" |
| 13 #include "./vpx_dsp_rtcd.h" |
13 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
14 | 15 |
15 #include "vpx_ports/mem.h" | 16 #include "vpx_ports/mem.h" |
16 #include "vpx/vpx_integer.h" | 17 #include "vpx/vpx_integer.h" |
17 | 18 |
18 #include "vp9/common/vp9_common.h" | 19 #include "vp9/common/vp9_common.h" |
19 #include "vp9/common/vp9_filter.h" | 20 #include "vp9/common/vp9_filter.h" |
20 | 21 |
21 #include "vp9/encoder/vp9_variance.h" | 22 #include "vp9/encoder/vp9_variance.h" |
22 | 23 |
23 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { | |
24 const int32x4_t a = vpaddlq_s16(v_16x8); | |
25 const int64x2_t b = vpaddlq_s32(a); | |
26 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), | |
27 vreinterpret_s32_s64(vget_high_s64(b))); | |
28 return vget_lane_s32(c, 0); | |
29 } | |
30 | |
31 static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { | |
32 const int64x2_t b = vpaddlq_s32(v_32x4); | |
33 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), | |
34 vreinterpret_s32_s64(vget_high_s64(b))); | |
35 return vget_lane_s32(c, 0); | |
36 } | |
37 | |
38 // w * h must be less than 2048 or local variable v_sum may overflow. | |
39 static void variance_neon_w8(const uint8_t *a, int a_stride, | |
40 const uint8_t *b, int b_stride, | |
41 int w, int h, uint32_t *sse, int *sum) { | |
42 int i, j; | |
43 int16x8_t v_sum = vdupq_n_s16(0); | |
44 int32x4_t v_sse_lo = vdupq_n_s32(0); | |
45 int32x4_t v_sse_hi = vdupq_n_s32(0); | |
46 | |
47 for (i = 0; i < h; ++i) { | |
48 for (j = 0; j < w; j += 8) { | |
49 const uint8x8_t v_a = vld1_u8(&a[j]); | |
50 const uint8x8_t v_b = vld1_u8(&b[j]); | |
51 const uint16x8_t v_diff = vsubl_u8(v_a, v_b); | |
52 const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); | |
53 v_sum = vaddq_s16(v_sum, sv_diff); | |
54 v_sse_lo = vmlal_s16(v_sse_lo, | |
55 vget_low_s16(sv_diff), | |
56 vget_low_s16(sv_diff)); | |
57 v_sse_hi = vmlal_s16(v_sse_hi, | |
58 vget_high_s16(sv_diff), | |
59 vget_high_s16(sv_diff)); | |
60 } | |
61 a += a_stride; | |
62 b += b_stride; | |
63 } | |
64 | |
65 *sum = horizontal_add_s16x8(v_sum); | |
66 *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); | |
67 } | |
68 | |
69 void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, | |
70 const uint8_t *ref_ptr, int ref_stride, | |
71 unsigned int *sse, int *sum) { | |
72 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8, | |
73 8, sse, sum); | |
74 } | |
75 | |
76 unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, | |
77 const uint8_t *b, int b_stride, | |
78 unsigned int *sse) { | |
79 int sum; | |
80 variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); | |
81 return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 | |
82 } | |
83 | |
84 void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, | |
85 const uint8_t *ref_ptr, int ref_stride, | |
86 unsigned int *sse, int *sum) { | |
87 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16, | |
88 16, sse, sum); | |
89 } | |
90 | |
91 unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, | |
92 const uint8_t *b, int b_stride, | |
93 unsigned int *sse) { | |
94 int sum; | |
95 variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); | |
96 return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 | |
97 } | |
98 | |
99 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, | 24 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, |
100 uint8_t *output_ptr, | 25 uint8_t *output_ptr, |
101 unsigned int src_pixels_per_line, | 26 unsigned int src_pixels_per_line, |
102 int pixel_step, | 27 int pixel_step, |
103 unsigned int output_height, | 28 unsigned int output_height, |
104 unsigned int output_width, | 29 unsigned int output_width, |
105 const int16_t *vp9_filter) { | 30 const int16_t *vp9_filter) { |
106 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); | 31 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); |
107 const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); | 32 const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); |
108 unsigned int i; | 33 unsigned int i; |
(...skipping 46 matching lines...)
155 int dst_stride, | 80 int dst_stride, |
156 unsigned int *sse) { | 81 unsigned int *sse) { |
157 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); | 82 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); |
158 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); | 83 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); |
159 | 84 |
160 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, | 85 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, |
161 9, 8, | 86 9, 8, |
162 BILINEAR_FILTERS_2TAP(xoffset)); | 87 BILINEAR_FILTERS_2TAP(xoffset)); |
163 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, | 88 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, |
164 8, BILINEAR_FILTERS_2TAP(yoffset)); | 89 8, BILINEAR_FILTERS_2TAP(yoffset)); |
165 return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse); | 90 return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); |
166 } | 91 } |
167 | 92 |
168 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, | 93 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, |
169 int src_stride, | 94 int src_stride, |
170 int xoffset, | 95 int xoffset, |
171 int yoffset, | 96 int yoffset, |
172 const uint8_t *dst, | 97 const uint8_t *dst, |
173 int dst_stride, | 98 int dst_stride, |
174 unsigned int *sse) { | 99 unsigned int *sse) { |
175 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); | 100 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); |
176 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); | 101 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); |
177 | 102 |
178 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, | 103 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, |
179 17, 16, | 104 17, 16, |
180 BILINEAR_FILTERS_2TAP(xoffset)); | 105 BILINEAR_FILTERS_2TAP(xoffset)); |
181 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, | 106 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, |
182 16, BILINEAR_FILTERS_2TAP(yoffset)); | 107 16, BILINEAR_FILTERS_2TAP(yoffset)); |
183 return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse); | 108 return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); |
184 } | |
185 | |
186 void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride, | |
187 const uint8_t *ref_ptr, int ref_stride, | |
188 unsigned int *sse, int *sum) { | |
189 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32, | |
190 32, sse, sum); | |
191 } | |
192 | |
193 unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, | |
194 const uint8_t *b, int b_stride, | |
195 unsigned int *sse) { | |
196 int sum; | |
197 variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); | |
198 return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 | |
199 } | |
200 | |
201 unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride, | |
202 const uint8_t *b, int b_stride, | |
203 unsigned int *sse) { | |
204 int sum1, sum2; | |
205 uint32_t sse1, sse2; | |
206 variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); | |
207 variance_neon_w8(a + (32 * a_stride), a_stride, | |
208 b + (32 * b_stride), b_stride, 32, 32, | |
209 &sse2, &sum2); | |
210 *sse = sse1 + sse2; | |
211 sum1 += sum2; | |
212 return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 | |
213 } | |
214 | |
215 unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride, | |
216 const uint8_t *b, int b_stride, | |
217 unsigned int *sse) { | |
218 int sum1, sum2; | |
219 uint32_t sse1, sse2; | |
220 variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); | |
221 variance_neon_w8(a + (16 * a_stride), a_stride, | |
222 b + (16 * b_stride), b_stride, 64, 16, | |
223 &sse2, &sum2); | |
224 *sse = sse1 + sse2; | |
225 sum1 += sum2; | |
226 return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 | |
227 } | |
228 | |
229 unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride, | |
230 const uint8_t *b, int b_stride, | |
231 unsigned int *sse) { | |
232 int sum1, sum2; | |
233 uint32_t sse1, sse2; | |
234 | |
235 variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); | |
236 variance_neon_w8(a + (16 * a_stride), a_stride, | |
237 b + (16 * b_stride), b_stride, 64, 16, | |
238 &sse2, &sum2); | |
239 sse1 += sse2; | |
240 sum1 += sum2; | |
241 | |
242 variance_neon_w8(a + (16 * 2 * a_stride), a_stride, | |
243 b + (16 * 2 * b_stride), b_stride, | |
244 64, 16, &sse2, &sum2); | |
245 sse1 += sse2; | |
246 sum1 += sum2; | |
247 | |
248 variance_neon_w8(a + (16 * 3 * a_stride), a_stride, | |
249 b + (16 * 3 * b_stride), b_stride, | |
250 64, 16, &sse2, &sum2); | |
251 *sse = sse1 + sse2; | |
252 sum1 += sum2; | |
253 return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 | |
254 } | 109 } |
255 | 110 |
256 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, | 111 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, |
257 int src_stride, | 112 int src_stride, |
258 int xoffset, | 113 int xoffset, |
259 int yoffset, | 114 int yoffset, |
260 const uint8_t *dst, | 115 const uint8_t *dst, |
261 int dst_stride, | 116 int dst_stride, |
262 unsigned int *sse) { | 117 unsigned int *sse) { |
263 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); | 118 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); |
264 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); | 119 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); |
265 | 120 |
266 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, | 121 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, |
267 33, 32, | 122 33, 32, |
268 BILINEAR_FILTERS_2TAP(xoffset)); | 123 BILINEAR_FILTERS_2TAP(xoffset)); |
269 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, | 124 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, |
270 32, BILINEAR_FILTERS_2TAP(yoffset)); | 125 32, BILINEAR_FILTERS_2TAP(yoffset)); |
271 return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse); | 126 return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); |
272 } | 127 } |
273 | 128 |
274 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, | 129 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, |
275 int src_stride, | 130 int src_stride, |
276 int xoffset, | 131 int xoffset, |
277 int yoffset, | 132 int yoffset, |
278 const uint8_t *dst, | 133 const uint8_t *dst, |
279 int dst_stride, | 134 int dst_stride, |
280 unsigned int *sse) { | 135 unsigned int *sse) { |
281 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); | 136 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); |
282 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); | 137 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); |
283 | 138 |
284 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, | 139 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, |
285 65, 64, | 140 65, 64, |
286 BILINEAR_FILTERS_2TAP(xoffset)); | 141 BILINEAR_FILTERS_2TAP(xoffset)); |
287 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, | 142 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, |
288 64, BILINEAR_FILTERS_2TAP(yoffset)); | 143 64, BILINEAR_FILTERS_2TAP(yoffset)); |
289 return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse); | 144 return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); |
290 } | 145 } |
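
Some context on the kernels this patch touches. The horizontal_add_s16x8 and horizontal_add_s32x4 helpers removed above reduce a vector to a scalar with pairwise widening adds (vpaddlq_s16, then vpaddlq_s32) followed by a final add of the two 64-bit halves; widening at every step keeps the intermediate lanes from overflowing. A scalar equivalent, for illustration only (the helper name here is made up, not part of the patch):

#include <stdint.h>

/* Scalar equivalent of horizontal_add_s16x8: sum the eight lanes.
 * The NEON version widens 16 -> 32 -> 64 bits as it folds pairs. */
static int horizontal_add_ref(const int16_t v[8]) {
  int32_t total = 0;
  int i;
  for (i = 0; i < 8; ++i) total += v[i];
  return total;
}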
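variance_neon_w8, also deleted here (it moved to vpx_dsp along with the vp9_variance*_neon and vp9_get*var_neon wrappers), accumulates the sum of pixel differences and the sum of squared differences over a w x h block; the wrappers then apply the identity variance = sse - sum^2 / (w * h), where w * h is a power of two so the division becomes a right shift (>> 6 for 8x8, >> 8 for 16x16, >> 12 for 64x64). A minimal scalar sketch of the same computation, again illustrative rather than part of the patch:

#include <stdint.h>

/* Scalar model of variance_neon_w8: per-block difference sum and
 * sum of squared differences. */
static void variance_ref(const uint8_t *a, int a_stride,
                         const uint8_t *b, int b_stride,
                         int w, int h, uint32_t *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }
}

/* For an 8x8 block: variance = *sse - (((int64_t)*sum * *sum) >> 6),
 * exactly the expression in the removed vp9_variance8x8_neon. */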
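The sub-pixel wrappers that stay in this file run a separable two-tap bilinear filter in two passes: the first pass filters (h + 1) rows horizontally (pixel_step == 1) into fdata3, the second filters vertically (pixel_step == intermediate row width) into temp2, and the full-pixel variance kernel then runs on temp2. That is why the patch only needs to retarget the final call from vp9_variance*_neon to vpx_variance*_neon. A scalar sketch of one filtering pass, assuming the usual libvpx conventions (FILTER_BITS == 7 and two taps summing to 128, i.e. {128 - 8 * offset, 8 * offset} from BILINEAR_FILTERS_2TAP); the function name is made up for illustration:

#include <stdint.h>

#define FILTER_BITS 7  /* assumed: the two taps sum to 1 << FILTER_BITS */

/* Scalar model of one var_filter_block2d_bil_* pass.  pixel_step is 1
 * for the horizontal pass and the intermediate row width for the
 * vertical pass; output is written densely, out_w pixels per row. */
static void filter_pass_ref(const uint8_t *src, uint8_t *dst,
                            unsigned int src_stride, int pixel_step,
                            unsigned int out_h, unsigned int out_w,
                            const int16_t *filter) {
  unsigned int i, j;
  for (i = 0; i < out_h; ++i) {
    for (j = 0; j < out_w; ++j) {
      const int sum = src[j] * filter[0] + src[j + pixel_step] * filter[1];
      /* Round-to-nearest shift back to 8 bits. */
      dst[j] = (uint8_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
    }
    src += src_stride;
    dst += out_w;
  }
}

Since every tap is in [0, 128], the NEON version can hold the taps in uint8 lanes (the vmov_n_u8 casts at the top of var_filter_block2d_bil_w8) and do the whole pass in widening unsigned arithmetic.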