OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
12 #include "./vp9_rtcd.h" | 12 #include "./vp9_rtcd.h" |
| 13 #include "./vpx_dsp_rtcd.h" |
13 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
14 | 15 |
15 #include "vpx_ports/mem.h" | 16 #include "vpx_ports/mem.h" |
16 #include "vpx/vpx_integer.h" | 17 #include "vpx/vpx_integer.h" |
17 | 18 |
18 #include "vp9/common/vp9_common.h" | 19 #include "vp9/common/vp9_common.h" |
19 #include "vp9/common/vp9_filter.h" | 20 #include "vp9/common/vp9_filter.h" |
20 | 21 |
21 #include "vp9/encoder/vp9_variance.h" | 22 #include "vp9/encoder/vp9_variance.h" |
22 | 23 |
23 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { | |
24 const int32x4_t a = vpaddlq_s16(v_16x8); | |
25 const int64x2_t b = vpaddlq_s32(a); | |
26 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), | |
27 vreinterpret_s32_s64(vget_high_s64(b))); | |
28 return vget_lane_s32(c, 0); | |
29 } | |
30 | |
31 static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { | |
32 const int64x2_t b = vpaddlq_s32(v_32x4); | |
33 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), | |
34 vreinterpret_s32_s64(vget_high_s64(b))); | |
35 return vget_lane_s32(c, 0); | |
36 } | |
37 | |
38 // w * h must be less than 2048 or local variable v_sum may overflow. | |
39 static void variance_neon_w8(const uint8_t *a, int a_stride, | |
40 const uint8_t *b, int b_stride, | |
41 int w, int h, uint32_t *sse, int *sum) { | |
42 int i, j; | |
43 int16x8_t v_sum = vdupq_n_s16(0); | |
44 int32x4_t v_sse_lo = vdupq_n_s32(0); | |
45 int32x4_t v_sse_hi = vdupq_n_s32(0); | |
46 | |
47 for (i = 0; i < h; ++i) { | |
48 for (j = 0; j < w; j += 8) { | |
49 const uint8x8_t v_a = vld1_u8(&a[j]); | |
50 const uint8x8_t v_b = vld1_u8(&b[j]); | |
51 const uint16x8_t v_diff = vsubl_u8(v_a, v_b); | |
52 const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); | |
53 v_sum = vaddq_s16(v_sum, sv_diff); | |
54 v_sse_lo = vmlal_s16(v_sse_lo, | |
55 vget_low_s16(sv_diff), | |
56 vget_low_s16(sv_diff)); | |
57 v_sse_hi = vmlal_s16(v_sse_hi, | |
58 vget_high_s16(sv_diff), | |
59 vget_high_s16(sv_diff)); | |
60 } | |
61 a += a_stride; | |
62 b += b_stride; | |
63 } | |
64 | |
65 *sum = horizontal_add_s16x8(v_sum); | |
66 *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); | |
67 } | |
68 | |
69 void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, | |
70 const uint8_t *ref_ptr, int ref_stride, | |
71 unsigned int *sse, int *sum) { | |
72 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8, | |
73 8, sse, sum); | |
74 } | |
75 | |
76 unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, | |
77 const uint8_t *b, int b_stride, | |
78 unsigned int *sse) { | |
79 int sum; | |
80 variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); | |
81 return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 | |
82 } | |
83 | |
84 void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, | |
85 const uint8_t *ref_ptr, int ref_stride, | |
86 unsigned int *sse, int *sum) { | |
87 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16, | |
88 16, sse, sum); | |
89 } | |
90 | |
91 unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, | |
92 const uint8_t *b, int b_stride, | |
93 unsigned int *sse) { | |
94 int sum; | |
95 variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); | |
96 return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 | |
97 } | |
98 | |
99 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, | 24 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, |
100 uint8_t *output_ptr, | 25 uint8_t *output_ptr, |
101 unsigned int src_pixels_per_line, | 26 unsigned int src_pixels_per_line, |
102 int pixel_step, | 27 int pixel_step, |
103 unsigned int output_height, | 28 unsigned int output_height, |
104 unsigned int output_width, | 29 unsigned int output_width, |
105 const int16_t *vp9_filter) { | 30 const int16_t *vp9_filter) { |
106 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); | 31 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); |
107 const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); | 32 const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); |
108 unsigned int i; | 33 unsigned int i; |
(...skipping 46 matching lines...)
155 int dst_stride, | 80 int dst_stride, |
156 unsigned int *sse) { | 81 unsigned int *sse) { |
157 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); | 82 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); |
158 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); | 83 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); |
159 | 84 |
160 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, | 85 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, |
161 9, 8, | 86 9, 8, |
162 BILINEAR_FILTERS_2TAP(xoffset)); | 87 BILINEAR_FILTERS_2TAP(xoffset)); |
163 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, | 88 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, |
164 8, BILINEAR_FILTERS_2TAP(yoffset)); | 89 8, BILINEAR_FILTERS_2TAP(yoffset)); |
165 return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse); | 90 return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); |
166 } | 91 } |
167 | 92 |
168 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, | 93 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, |
169 int src_stride, | 94 int src_stride, |
170 int xoffset, | 95 int xoffset, |
171 int yoffset, | 96 int yoffset, |
172 const uint8_t *dst, | 97 const uint8_t *dst, |
173 int dst_stride, | 98 int dst_stride, |
174 unsigned int *sse) { | 99 unsigned int *sse) { |
175 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); | 100 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); |
176 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); | 101 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); |
177 | 102 |
178 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, | 103 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, |
179 17, 16, | 104 17, 16, |
180 BILINEAR_FILTERS_2TAP(xoffset)); | 105 BILINEAR_FILTERS_2TAP(xoffset)); |
181 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, | 106 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, |
182 16, BILINEAR_FILTERS_2TAP(yoffset)); | 107 16, BILINEAR_FILTERS_2TAP(yoffset)); |
183 return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse); | 108 return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); |
184 } | |
185 | |
186 void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride, | |
187 const uint8_t *ref_ptr, int ref_stride, | |
188 unsigned int *sse, int *sum) { | |
189 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32, | |
190 32, sse, sum); | |
191 } | |
192 | |
193 unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, | |
194 const uint8_t *b, int b_stride, | |
195 unsigned int *sse) { | |
196 int sum; | |
197 variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); | |
198 return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 | |
199 } | |
200 | |
201 unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride, | |
202 const uint8_t *b, int b_stride, | |
203 unsigned int *sse) { | |
204 int sum1, sum2; | |
205 uint32_t sse1, sse2; | |
206 variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); | |
207 variance_neon_w8(a + (32 * a_stride), a_stride, | |
208 b + (32 * b_stride), b_stride, 32, 32, | |
209 &sse2, &sum2); | |
210 *sse = sse1 + sse2; | |
211 sum1 += sum2; | |
212 return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 | |
213 } | |
214 | |
215 unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride, | |
216 const uint8_t *b, int b_stride, | |
217 unsigned int *sse) { | |
218 int sum1, sum2; | |
219 uint32_t sse1, sse2; | |
220 variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); | |
221 variance_neon_w8(a + (16 * a_stride), a_stride, | |
222 b + (16 * b_stride), b_stride, 64, 16, | |
223 &sse2, &sum2); | |
224 *sse = sse1 + sse2; | |
225 sum1 += sum2; | |
226 return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 | |
227 } | |
228 | |
229 unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride, | |
230 const uint8_t *b, int b_stride, | |
231 unsigned int *sse) { | |
232 int sum1, sum2; | |
233 uint32_t sse1, sse2; | |
234 | |
235 variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); | |
236 variance_neon_w8(a + (16 * a_stride), a_stride, | |
237 b + (16 * b_stride), b_stride, 64, 16, | |
238 &sse2, &sum2); | |
239 sse1 += sse2; | |
240 sum1 += sum2; | |
241 | |
242 variance_neon_w8(a + (16 * 2 * a_stride), a_stride, | |
243 b + (16 * 2 * b_stride), b_stride, | |
244 64, 16, &sse2, &sum2); | |
245 sse1 += sse2; | |
246 sum1 += sum2; | |
247 | |
248 variance_neon_w8(a + (16 * 3 * a_stride), a_stride, | |
249 b + (16 * 3 * b_stride), b_stride, | |
250 64, 16, &sse2, &sum2); | |
251 *sse = sse1 + sse2; | |
252 sum1 += sum2; | |
253 return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 | |
254 } | 109 } |
255 | 110 |
256 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, | 111 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, |
257 int src_stride, | 112 int src_stride, |
258 int xoffset, | 113 int xoffset, |
259 int yoffset, | 114 int yoffset, |
260 const uint8_t *dst, | 115 const uint8_t *dst, |
261 int dst_stride, | 116 int dst_stride, |
262 unsigned int *sse) { | 117 unsigned int *sse) { |
263 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); | 118 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); |
264 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); | 119 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); |
265 | 120 |
266 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, | 121 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, |
267 33, 32, | 122 33, 32, |
268 BILINEAR_FILTERS_2TAP(xoffset)); | 123 BILINEAR_FILTERS_2TAP(xoffset)); |
269 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, | 124 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, |
270 32, BILINEAR_FILTERS_2TAP(yoffset)); | 125 32, BILINEAR_FILTERS_2TAP(yoffset)); |
271 return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse); | 126 return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); |
272 } | 127 } |
273 | 128 |
274 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, | 129 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, |
275 int src_stride, | 130 int src_stride, |
276 int xoffset, | 131 int xoffset, |
277 int yoffset, | 132 int yoffset, |
278 const uint8_t *dst, | 133 const uint8_t *dst, |
279 int dst_stride, | 134 int dst_stride, |
280 unsigned int *sse) { | 135 unsigned int *sse) { |
281 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); | 136 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); |
282 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); | 137 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); |
283 | 138 |
284 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, | 139 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, |
285 65, 64, | 140 65, 64, |
286 BILINEAR_FILTERS_2TAP(xoffset)); | 141 BILINEAR_FILTERS_2TAP(xoffset)); |
287 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, | 142 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, |
288 64, BILINEAR_FILTERS_2TAP(yoffset)); | 143 64, BILINEAR_FILTERS_2TAP(yoffset)); |
289 return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse); | 144 return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); |
290 } | 145 } |
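
Some context on the kernels this patch touches. The horizontal_add_s16x8 and horizontal_add_s32x4 helpers removed above reduce a vector to a scalar with pairwise widening adds (vpaddlq_s16, then vpaddlq_s32) followed by a final add of the two 64-bit halves; widening at every step keeps the intermediate lanes from overflowing. A scalar equivalent, for illustration only (the helper name here is made up, not part of the patch):

#include <stdint.h>

/* Scalar equivalent of horizontal_add_s16x8: sum the eight lanes.
 * The NEON version widens 16 -> 32 -> 64 bits as it folds pairs. */
static int horizontal_add_ref(const int16_t v[8]) {
  int32_t total = 0;
  int i;
  for (i = 0; i < 8; ++i) total += v[i];
  return total;
}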
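variance_neon_w8, also deleted here (it moved to vpx_dsp along with the vp9_variance*_neon and vp9_get*var_neon wrappers), accumulates the sum of pixel differences and the sum of squared differences over a w x h block; the wrappers then apply the identity variance = sse - sum^2 / (w * h), where w * h is a power of two so the division becomes a right shift (>> 6 for 8x8, >> 8 for 16x16, >> 12 for 64x64). A minimal scalar sketch of the same computation, again illustrative rather than part of the patch:

#include <stdint.h>

/* Scalar model of variance_neon_w8: per-block difference sum and
 * sum of squared differences. */
static void variance_ref(const uint8_t *a, int a_stride,
                         const uint8_t *b, int b_stride,
                         int w, int h, uint32_t *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }
}

/* For an 8x8 block: variance = *sse - (((int64_t)*sum * *sum) >> 6),
 * exactly the expression in the removed vp9_variance8x8_neon. */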
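The sub-pixel wrappers that stay in this file run a separable two-tap bilinear filter in two passes: the first pass filters (h + 1) rows horizontally (pixel_step == 1) into fdata3, the second filters vertically (pixel_step == intermediate row width) into temp2, and the full-pixel variance kernel then runs on temp2. That is why the patch only needs to retarget the final call from vp9_variance*_neon to vpx_variance*_neon. A scalar sketch of one filtering pass, assuming the usual libvpx conventions (FILTER_BITS == 7 and two taps summing to 128, i.e. {128 - 8 * offset, 8 * offset} from BILINEAR_FILTERS_2TAP); the function name is made up for illustration:

#include <stdint.h>

#define FILTER_BITS 7  /* assumed: the two taps sum to 1 << FILTER_BITS */

/* Scalar model of one var_filter_block2d_bil_* pass.  pixel_step is 1
 * for the horizontal pass and the intermediate row width for the
 * vertical pass; output is written densely, out_w pixels per row. */
static void filter_pass_ref(const uint8_t *src, uint8_t *dst,
                            unsigned int src_stride, int pixel_step,
                            unsigned int out_h, unsigned int out_w,
                            const int16_t *filter) {
  unsigned int i, j;
  for (i = 0; i < out_h; ++i) {
    for (j = 0; j < out_w; ++j) {
      const int sum = src[j] * filter[0] + src[j + pixel_step] * filter[1];
      /* Round-to-nearest shift back to 8 bits. */
      dst[j] = (uint8_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
    }
    src += src_stride;
    dst += out_w;
  }
}

Since every tap is in [0, 128], the NEON version can hold the taps in uint8 lanes (the vmov_n_u8 casts at the top of var_filter_block2d_bil_w8) and do the whole pass in widening unsigned arithmetic.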