source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c - Issue 1124333011: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: only update to last nights LKGR Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <arm_neon.h>	11 #include <arm_neon.h>

12 #include "./vp9_rtcd.h"	12 #include "./vp9_rtcd.h"

13 #include "./vpx_config.h"	13 #include "./vpx_config.h"

14	14

15 #include "vpx_ports/mem.h"	15 #include "vpx_ports/mem.h"

16 #include "vpx/vpx_integer.h"	16 #include "vpx/vpx_integer.h"

17	17

18 #include "vp9/common/vp9_common.h"	18 #include "vp9/common/vp9_common.h"

19 #include "vp9/common/vp9_filter.h"	19 #include "vp9/common/vp9_filter.h"

20	20

21 #include "vp9/encoder/vp9_variance.h"	21 #include "vp9/encoder/vp9_variance.h"

22	22

23 enum { kWidth8 = 8 };

24 enum { kHeight8 = 8 };

25 enum { kHeight8PlusOne = 9 };

26 enum { kWidth16 = 16 };

27 enum { kHeight16 = 16 };

28 enum { kHeight16PlusOne = 17 };

29 enum { kWidth32 = 32 };

30 enum { kHeight32 = 32 };

31 enum { kHeight32PlusOne = 33 };

32 enum { kWidth64 = 64 };

33 enum { kHeight64 = 64 };

34 enum { kHeight64PlusOne = 65 };

35 enum { kPixelStepOne = 1 };

36 enum { kAlign16 = 16 };

37

38 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {	23 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {

39 const int32x4_t a = vpaddlq_s16(v_16x8);	24 const int32x4_t a = vpaddlq_s16(v_16x8);

40 const int64x2_t b = vpaddlq_s32(a);	25 const int64x2_t b = vpaddlq_s32(a);

41 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),	26 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),

42 vreinterpret_s32_s64(vget_high_s64(b)));	27 vreinterpret_s32_s64(vget_high_s64(b)));

43 return vget_lane_s32(c, 0);	28 return vget_lane_s32(c, 0);

44 }	29 }

45	30

46 static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {	31 static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {

47 const int64x2_t b = vpaddlq_s32(v_32x4);	32 const int64x2_t b = vpaddlq_s32(v_32x4);

(...skipping 29 matching lines...) Expand all Loading...
77 b += b_stride;	62 b += b_stride;

78 }	63 }

79	64

80 *sum = horizontal_add_s16x8(v_sum);	65 *sum = horizontal_add_s16x8(v_sum);

81 *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));	66 *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));

82 }	67 }

83	68

84 void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,	69 void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,

85 const uint8_t *ref_ptr, int ref_stride,	70 const uint8_t *ref_ptr, int ref_stride,

86 unsigned int sse, int sum) {	71 unsigned int sse, int sum) {

87 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8,	72 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,

88 kHeight8, sse, sum);	73 8, sse, sum);

89 }	74 }

90	75

91 unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,	76 unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,

92 const uint8_t *b, int b_stride,	77 const uint8_t *b, int b_stride,

93 unsigned int *sse) {	78 unsigned int *sse) {

94 int sum;	79 int sum;

95 variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);	80 variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);

96 return sse - (((int64_t)sum sum) >> 6); // >> 6 = / 8 * 8	81 return sse - (((int64_t)sum sum) >> 6); // >> 6 = / 8 * 8

97 }	82 }

98	83

99 void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,	84 void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,

100 const uint8_t *ref_ptr, int ref_stride,	85 const uint8_t *ref_ptr, int ref_stride,

101 unsigned int sse, int sum) {	86 unsigned int sse, int sum) {

102 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,	87 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,

103 kHeight16, sse, sum);	88 16, sse, sum);

104 }	89 }

105	90

106 unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,	91 unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,

107 const uint8_t *b, int b_stride,	92 const uint8_t *b, int b_stride,

108 unsigned int *sse) {	93 unsigned int *sse) {

109 int sum;	94 int sum;

110 variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);	95 variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);

111 return sse - (((int64_t)sum sum) >> 8); // >> 8 = / 16 * 16	96 return sse - (((int64_t)sum sum) >> 8); // >> 8 = / 16 * 16

112 }	97 }

113	98

114 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,	99 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,

115 uint8_t *output_ptr,	100 uint8_t *output_ptr,

116 unsigned int src_pixels_per_line,	101 unsigned int src_pixels_per_line,

117 int pixel_step,	102 int pixel_step,

118 unsigned int output_height,	103 unsigned int output_height,

119 unsigned int output_width,	104 unsigned int output_width,

120 const int16_t *vp9_filter) {	105 const int16_t *vp9_filter) {

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
162 }	147 }

163 }	148 }

164	149

165 unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,	150 unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,

166 int src_stride,	151 int src_stride,

167 int xoffset,	152 int xoffset,

168 int yoffset,	153 int yoffset,

169 const uint8_t *dst,	154 const uint8_t *dst,

170 int dst_stride,	155 int dst_stride,

171 unsigned int *sse) {	156 unsigned int *sse) {

172 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);	157 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);

173 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);	158 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);

174	159

175 var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,	160 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,

176 kHeight8PlusOne, kWidth8,	161 9, 8,

177 BILINEAR_FILTERS_2TAP(xoffset));	162 BILINEAR_FILTERS_2TAP(xoffset));

178 var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,	163 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,

179 kWidth8, BILINEAR_FILTERS_2TAP(yoffset));	164 8, BILINEAR_FILTERS_2TAP(yoffset));

180 return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);	165 return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse);

181 }	166 }

182	167

183 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,	168 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,

184 int src_stride,	169 int src_stride,

185 int xoffset,	170 int xoffset,

186 int yoffset,	171 int yoffset,

187 const uint8_t *dst,	172 const uint8_t *dst,

188 int dst_stride,	173 int dst_stride,

189 unsigned int *sse) {	174 unsigned int *sse) {

190 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16);	175 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);

191 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);	176 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);

192	177

193 var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,	178 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,

194 kHeight16PlusOne, kWidth16,	179 17, 16,

195 BILINEAR_FILTERS_2TAP(xoffset));	180 BILINEAR_FILTERS_2TAP(xoffset));

196 var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,	181 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,

197 kWidth16, BILINEAR_FILTERS_2TAP(yoffset));	182 16, BILINEAR_FILTERS_2TAP(yoffset));

198 return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);	183 return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse);

199 }	184 }

200	185

201 void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,	186 void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,

202 const uint8_t *ref_ptr, int ref_stride,	187 const uint8_t *ref_ptr, int ref_stride,

203 unsigned int sse, int sum) {	188 unsigned int sse, int sum) {

204 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32,	189 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,

205 kHeight32, sse, sum);	190 32, sse, sum);

206 }	191 }

207	192

208 unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,	193 unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,

209 const uint8_t *b, int b_stride,	194 const uint8_t *b, int b_stride,

210 unsigned int *sse) {	195 unsigned int *sse) {

211 int sum;	196 int sum;

212 variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum);	197 variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);

213 return sse - (((int64_t)sum sum) >> 10); // >> 10 = / 32 * 32	198 return sse - (((int64_t)sum sum) >> 10); // >> 10 = / 32 * 32

214 }	199 }

215	200

216 unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,	201 unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,

217 const uint8_t *b, int b_stride,	202 const uint8_t *b, int b_stride,

218 unsigned int *sse) {	203 unsigned int *sse) {

219 int sum1, sum2;	204 int sum1, sum2;

220 uint32_t sse1, sse2;	205 uint32_t sse1, sse2;

221 variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, &sse1, &sum1);	206 variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);

222 variance_neon_w8(a + (kHeight32 * a_stride), a_stride,	207 variance_neon_w8(a + (32 * a_stride), a_stride,

223 b + (kHeight32 * b_stride), b_stride, kWidth32, kHeight32,	208 b + (32 * b_stride), b_stride, 32, 32,

224 &sse2, &sum2);	209 &sse2, &sum2);

225 *sse = sse1 + sse2;	210 *sse = sse1 + sse2;

226 sum1 += sum2;	211 sum1 += sum2;

227 return sse - (((int64_t)sum1 sum1) >> 11); // >> 11 = / 32 * 64	212 return sse - (((int64_t)sum1 sum1) >> 11); // >> 11 = / 32 * 64

228 }	213 }

229	214

230 unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,	215 unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,

231 const uint8_t *b, int b_stride,	216 const uint8_t *b, int b_stride,

232 unsigned int *sse) {	217 unsigned int *sse) {

233 int sum1, sum2;	218 int sum1, sum2;

234 uint32_t sse1, sse2;	219 uint32_t sse1, sse2;

235 variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1);	220 variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);

236 variance_neon_w8(a + (kHeight16 * a_stride), a_stride,	221 variance_neon_w8(a + (16 * a_stride), a_stride,

237 b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16,	222 b + (16 * b_stride), b_stride, 64, 16,

238 &sse2, &sum2);	223 &sse2, &sum2);

239 *sse = sse1 + sse2;	224 *sse = sse1 + sse2;

240 sum1 += sum2;	225 sum1 += sum2;

241 return sse - (((int64_t)sum1 sum1) >> 11); // >> 11 = / 32 * 64	226 return sse - (((int64_t)sum1 sum1) >> 11); // >> 11 = / 32 * 64

242 }	227 }

243	228

244 unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,	229 unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,

245 const uint8_t *b, int b_stride,	230 const uint8_t *b, int b_stride,

246 unsigned int *sse) {	231 unsigned int *sse) {

247 int sum1, sum2;	232 int sum1, sum2;

248 uint32_t sse1, sse2;	233 uint32_t sse1, sse2;

249	234

250 variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1);	235 variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);

251 variance_neon_w8(a + (kHeight16 * a_stride), a_stride,	236 variance_neon_w8(a + (16 * a_stride), a_stride,

252 b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16,	237 b + (16 * b_stride), b_stride, 64, 16,

253 &sse2, &sum2);	238 &sse2, &sum2);

254 sse1 += sse2;	239 sse1 += sse2;

255 sum1 += sum2;	240 sum1 += sum2;

256	241

257 variance_neon_w8(a + (kHeight16 * 2 * a_stride), a_stride,	242 variance_neon_w8(a + (16 * 2 * a_stride), a_stride,

258 b + (kHeight16 * 2 * b_stride), b_stride,	243 b + (16 * 2 * b_stride), b_stride,

259 kWidth64, kHeight16, &sse2, &sum2);	244 64, 16, &sse2, &sum2);

260 sse1 += sse2;	245 sse1 += sse2;

261 sum1 += sum2;	246 sum1 += sum2;

262	247

263 variance_neon_w8(a + (kHeight16 * 3 * a_stride), a_stride,	248 variance_neon_w8(a + (16 * 3 * a_stride), a_stride,

264 b + (kHeight16 * 3 * b_stride), b_stride,	249 b + (16 * 3 * b_stride), b_stride,

265 kWidth64, kHeight16, &sse2, &sum2);	250 64, 16, &sse2, &sum2);

266 *sse = sse1 + sse2;	251 *sse = sse1 + sse2;

267 sum1 += sum2;	252 sum1 += sum2;

268 return sse - (((int64_t)sum1 sum1) >> 12); // >> 12 = / 64 * 64	253 return sse - (((int64_t)sum1 sum1) >> 12); // >> 12 = / 64 * 64

269 }	254 }

270	255

271 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,	256 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,

272 int src_stride,	257 int src_stride,

273 int xoffset,	258 int xoffset,

274 int yoffset,	259 int yoffset,

275 const uint8_t *dst,	260 const uint8_t *dst,

276 int dst_stride,	261 int dst_stride,

277 unsigned int *sse) {	262 unsigned int *sse) {

278 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32);	263 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);

279 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32);	264 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);

280	265

281 var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,	266 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,

282 kHeight32PlusOne, kWidth32,	267 33, 32,

283 BILINEAR_FILTERS_2TAP(xoffset));	268 BILINEAR_FILTERS_2TAP(xoffset));

284 var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32,	269 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,

285 kWidth32, BILINEAR_FILTERS_2TAP(yoffset));	270 32, BILINEAR_FILTERS_2TAP(yoffset));

286 return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse);	271 return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse);

287 }	272 }

288	273

289 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,	274 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,

290 int src_stride,	275 int src_stride,

291 int xoffset,	276 int xoffset,

292 int yoffset,	277 int yoffset,

293 const uint8_t *dst,	278 const uint8_t *dst,

294 int dst_stride,	279 int dst_stride,

295 unsigned int *sse) {	280 unsigned int *sse) {

296 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight64 * kWidth64);	281 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);

297 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight64PlusOne * kWidth64);	282 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);

298	283

299 var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,	284 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,

300 kHeight64PlusOne, kWidth64,	285 65, 64,

301 BILINEAR_FILTERS_2TAP(xoffset));	286 BILINEAR_FILTERS_2TAP(xoffset));

302 var_filter_block2d_bil_w16(fdata3, temp2, kWidth64, kWidth64, kHeight64,	287 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,

303 kWidth64, BILINEAR_FILTERS_2TAP(yoffset));	288 64, BILINEAR_FILTERS_2TAP(yoffset));

304 return vp9_variance64x64_neon(temp2, kWidth64, dst, dst_stride, sse);	289 return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse);

305 }	290 }

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c ('k') | source/libvpx/vp9/encoder/vp9_aq_complexity.c » ('j') | no next file with comments »