Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1363)

Side by Side Diff: source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <arm_neon.h> 11 #include <arm_neon.h>
12 #include "./vp9_rtcd.h" 12 #include "./vp9_rtcd.h"
13 #include "./vpx_dsp_rtcd.h"
13 #include "./vpx_config.h" 14 #include "./vpx_config.h"
14 15
15 #include "vpx_ports/mem.h" 16 #include "vpx_ports/mem.h"
16 #include "vpx/vpx_integer.h" 17 #include "vpx/vpx_integer.h"
17 18
18 #include "vp9/common/vp9_common.h" 19 #include "vp9/common/vp9_common.h"
19 #include "vp9/common/vp9_filter.h" 20 #include "vp9/common/vp9_filter.h"
20 21
21 #include "vp9/encoder/vp9_variance.h" 22 #include "vp9/encoder/vp9_variance.h"
22 23
23 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
24 const int32x4_t a = vpaddlq_s16(v_16x8);
25 const int64x2_t b = vpaddlq_s32(a);
26 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
27 vreinterpret_s32_s64(vget_high_s64(b)));
28 return vget_lane_s32(c, 0);
29 }
30
31 static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
32 const int64x2_t b = vpaddlq_s32(v_32x4);
33 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
34 vreinterpret_s32_s64(vget_high_s64(b)));
35 return vget_lane_s32(c, 0);
36 }
37
38 // w * h must be less than 2048 or local variable v_sum may overflow.
39 static void variance_neon_w8(const uint8_t *a, int a_stride,
40 const uint8_t *b, int b_stride,
41 int w, int h, uint32_t *sse, int *sum) {
42 int i, j;
43 int16x8_t v_sum = vdupq_n_s16(0);
44 int32x4_t v_sse_lo = vdupq_n_s32(0);
45 int32x4_t v_sse_hi = vdupq_n_s32(0);
46
47 for (i = 0; i < h; ++i) {
48 for (j = 0; j < w; j += 8) {
49 const uint8x8_t v_a = vld1_u8(&a[j]);
50 const uint8x8_t v_b = vld1_u8(&b[j]);
51 const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
52 const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
53 v_sum = vaddq_s16(v_sum, sv_diff);
54 v_sse_lo = vmlal_s16(v_sse_lo,
55 vget_low_s16(sv_diff),
56 vget_low_s16(sv_diff));
57 v_sse_hi = vmlal_s16(v_sse_hi,
58 vget_high_s16(sv_diff),
59 vget_high_s16(sv_diff));
60 }
61 a += a_stride;
62 b += b_stride;
63 }
64
65 *sum = horizontal_add_s16x8(v_sum);
66 *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
67 }
68
69 void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
70 const uint8_t *ref_ptr, int ref_stride,
71 unsigned int *sse, int *sum) {
72 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,
73 8, sse, sum);
74 }
75
76 unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
77 const uint8_t *b, int b_stride,
78 unsigned int *sse) {
79 int sum;
80 variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
81 return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8
82 }
83
84 void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
85 const uint8_t *ref_ptr, int ref_stride,
86 unsigned int *sse, int *sum) {
87 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,
88 16, sse, sum);
89 }
90
91 unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
92 const uint8_t *b, int b_stride,
93 unsigned int *sse) {
94 int sum;
95 variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
96 return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16
97 }
98
99 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, 24 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
100 uint8_t *output_ptr, 25 uint8_t *output_ptr,
101 unsigned int src_pixels_per_line, 26 unsigned int src_pixels_per_line,
102 int pixel_step, 27 int pixel_step,
103 unsigned int output_height, 28 unsigned int output_height,
104 unsigned int output_width, 29 unsigned int output_width,
105 const int16_t *vp9_filter) { 30 const int16_t *vp9_filter) {
106 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); 31 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);
107 const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); 32 const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);
108 unsigned int i; 33 unsigned int i;
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
155 int dst_stride, 80 int dst_stride,
156 unsigned int *sse) { 81 unsigned int *sse) {
157 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); 82 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
158 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); 83 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
159 84
160 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 85 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
161 9, 8, 86 9, 8,
162 BILINEAR_FILTERS_2TAP(xoffset)); 87 BILINEAR_FILTERS_2TAP(xoffset));
163 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 88 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
164 8, BILINEAR_FILTERS_2TAP(yoffset)); 89 8, BILINEAR_FILTERS_2TAP(yoffset));
165 return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse); 90 return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
166 } 91 }
167 92
168 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, 93 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
169 int src_stride, 94 int src_stride,
170 int xoffset, 95 int xoffset,
171 int yoffset, 96 int yoffset,
172 const uint8_t *dst, 97 const uint8_t *dst,
173 int dst_stride, 98 int dst_stride,
174 unsigned int *sse) { 99 unsigned int *sse) {
175 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); 100 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
176 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); 101 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
177 102
178 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 103 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
179 17, 16, 104 17, 16,
180 BILINEAR_FILTERS_2TAP(xoffset)); 105 BILINEAR_FILTERS_2TAP(xoffset));
181 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 106 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
182 16, BILINEAR_FILTERS_2TAP(yoffset)); 107 16, BILINEAR_FILTERS_2TAP(yoffset));
183 return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse); 108 return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
184 }
185
186 void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
187 const uint8_t *ref_ptr, int ref_stride,
188 unsigned int *sse, int *sum) {
189 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,
190 32, sse, sum);
191 }
192
193 unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
194 const uint8_t *b, int b_stride,
195 unsigned int *sse) {
196 int sum;
197 variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
198 return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32
199 }
200
201 unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
202 const uint8_t *b, int b_stride,
203 unsigned int *sse) {
204 int sum1, sum2;
205 uint32_t sse1, sse2;
206 variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
207 variance_neon_w8(a + (32 * a_stride), a_stride,
208 b + (32 * b_stride), b_stride, 32, 32,
209 &sse2, &sum2);
210 *sse = sse1 + sse2;
211 sum1 += sum2;
212 return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
213 }
214
215 unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
216 const uint8_t *b, int b_stride,
217 unsigned int *sse) {
218 int sum1, sum2;
219 uint32_t sse1, sse2;
220 variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
221 variance_neon_w8(a + (16 * a_stride), a_stride,
222 b + (16 * b_stride), b_stride, 64, 16,
223 &sse2, &sum2);
224 *sse = sse1 + sse2;
225 sum1 += sum2;
226 return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
227 }
228
229 unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
230 const uint8_t *b, int b_stride,
231 unsigned int *sse) {
232 int sum1, sum2;
233 uint32_t sse1, sse2;
234
235 variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
236 variance_neon_w8(a + (16 * a_stride), a_stride,
237 b + (16 * b_stride), b_stride, 64, 16,
238 &sse2, &sum2);
239 sse1 += sse2;
240 sum1 += sum2;
241
242 variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
243 b + (16 * 2 * b_stride), b_stride,
244 64, 16, &sse2, &sum2);
245 sse1 += sse2;
246 sum1 += sum2;
247
248 variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
249 b + (16 * 3 * b_stride), b_stride,
250 64, 16, &sse2, &sum2);
251 *sse = sse1 + sse2;
252 sum1 += sum2;
253 return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
254 } 109 }
255 110
256 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, 111 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
257 int src_stride, 112 int src_stride,
258 int xoffset, 113 int xoffset,
259 int yoffset, 114 int yoffset,
260 const uint8_t *dst, 115 const uint8_t *dst,
261 int dst_stride, 116 int dst_stride,
262 unsigned int *sse) { 117 unsigned int *sse) {
263 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); 118 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
264 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); 119 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
265 120
266 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 121 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
267 33, 32, 122 33, 32,
268 BILINEAR_FILTERS_2TAP(xoffset)); 123 BILINEAR_FILTERS_2TAP(xoffset));
269 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 124 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
270 32, BILINEAR_FILTERS_2TAP(yoffset)); 125 32, BILINEAR_FILTERS_2TAP(yoffset));
271 return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse); 126 return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
272 } 127 }
273 128
274 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, 129 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
275 int src_stride, 130 int src_stride,
276 int xoffset, 131 int xoffset,
277 int yoffset, 132 int yoffset,
278 const uint8_t *dst, 133 const uint8_t *dst,
279 int dst_stride, 134 int dst_stride,
280 unsigned int *sse) { 135 unsigned int *sse) {
281 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); 136 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
282 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); 137 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
283 138
284 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 139 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
285 65, 64, 140 65, 64,
286 BILINEAR_FILTERS_2TAP(xoffset)); 141 BILINEAR_FILTERS_2TAP(xoffset));
287 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 142 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
288 64, BILINEAR_FILTERS_2TAP(yoffset)); 143 64, BILINEAR_FILTERS_2TAP(yoffset));
289 return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse); 144 return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
290 } 145 }
OLDNEW
« no previous file with comments | « source/libvpx/vp9/decoder/vp9_detokenize.c ('k') | source/libvpx/vp9/encoder/vp9_aq_complexity.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698