OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
12 #include "./vp9_rtcd.h" | 12 #include "./vp9_rtcd.h" |
| 13 #include "./vpx_config.h" |
13 | 14 |
14 #include "vpx_ports/mem.h" | 15 #include "vpx_ports/mem.h" |
15 #include "vpx/vpx_integer.h" | 16 #include "vpx/vpx_integer.h" |
16 | 17 |
17 #include "vp9/common/vp9_common.h" | 18 #include "vp9/common/vp9_common.h" |
18 #include "vp9/common/vp9_filter.h" | 19 #include "vp9/common/vp9_filter.h" |
19 | 20 |
20 #include "vp9/encoder/vp9_variance.h" | 21 #include "vp9/encoder/vp9_variance.h" |
21 | 22 |
22 enum { kWidth8 = 8 }; | 23 enum { kWidth8 = 8 }; |
23 enum { kHeight8 = 8 }; | 24 enum { kHeight8 = 8 }; |
24 enum { kHeight8PlusOne = 9 }; | 25 enum { kHeight8PlusOne = 9 }; |
25 enum { kWidth16 = 16 }; | 26 enum { kWidth16 = 16 }; |
26 enum { kHeight16 = 16 }; | 27 enum { kHeight16 = 16 }; |
27 enum { kHeight16PlusOne = 17 }; | 28 enum { kHeight16PlusOne = 17 }; |
28 enum { kWidth32 = 32 }; | 29 enum { kWidth32 = 32 }; |
29 enum { kHeight32 = 32 }; | 30 enum { kHeight32 = 32 }; |
30 enum { kHeight32PlusOne = 33 }; | 31 enum { kHeight32PlusOne = 33 }; |
| 32 enum { kWidth64 = 64 }; |
| 33 enum { kHeight64 = 64 }; |
| 34 enum { kHeight64PlusOne = 65 }; |
31 enum { kPixelStepOne = 1 }; | 35 enum { kPixelStepOne = 1 }; |
32 enum { kAlign16 = 16 }; | 36 enum { kAlign16 = 16 }; |
33 | 37 |
34 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { | 38 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { |
35 const int32x4_t a = vpaddlq_s16(v_16x8); | 39 const int32x4_t a = vpaddlq_s16(v_16x8); |
36 const int64x2_t b = vpaddlq_s32(a); | 40 const int64x2_t b = vpaddlq_s32(a); |
37 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), | 41 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), |
38 vreinterpret_s32_s64(vget_high_s64(b))); | 42 vreinterpret_s32_s64(vget_high_s64(b))); |
39 return vget_lane_s32(c, 0); | 43 return vget_lane_s32(c, 0); |
40 } | 44 } |
41 | 45 |
42 static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { | 46 static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { |
43 const int64x2_t b = vpaddlq_s32(v_32x4); | 47 const int64x2_t b = vpaddlq_s32(v_32x4); |
44 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), | 48 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), |
45 vreinterpret_s32_s64(vget_high_s64(b))); | 49 vreinterpret_s32_s64(vget_high_s64(b))); |
46 return vget_lane_s32(c, 0); | 50 return vget_lane_s32(c, 0); |
47 } | 51 } |
48 | 52 |
 | 53 // w * h must not exceed 1024, or the int16 lanes of v_sum may overflow: |
 | // each lane accumulates (w / 8) * h diffs of magnitude up to 255, and |
 | // 32767 / 255 allows at most 128 accumulations per lane. |
49 static void variance_neon_w8(const uint8_t *a, int a_stride, | 54 static void variance_neon_w8(const uint8_t *a, int a_stride, |
50 const uint8_t *b, int b_stride, | 55 const uint8_t *b, int b_stride, |
51 int w, int h, unsigned int *sse, int *sum) { | 56 int w, int h, uint32_t *sse, int *sum) { |
52 int i, j; | 57 int i, j; |
53 int16x8_t v_sum = vdupq_n_s16(0); | 58 int16x8_t v_sum = vdupq_n_s16(0); |
54 int32x4_t v_sse_lo = vdupq_n_s32(0); | 59 int32x4_t v_sse_lo = vdupq_n_s32(0); |
55 int32x4_t v_sse_hi = vdupq_n_s32(0); | 60 int32x4_t v_sse_hi = vdupq_n_s32(0); |
56 | 61 |
57 for (i = 0; i < h; ++i) { | 62 for (i = 0; i < h; ++i) { |
58 for (j = 0; j < w; j += 8) { | 63 for (j = 0; j < w; j += 8) { |
59 const uint8x8_t v_a = vld1_u8(&a[j]); | 64 const uint8x8_t v_a = vld1_u8(&a[j]); |
60 const uint8x8_t v_b = vld1_u8(&b[j]); | 65 const uint8x8_t v_b = vld1_u8(&b[j]); |
61 const uint16x8_t v_diff = vsubl_u8(v_a, v_b); | 66 const uint16x8_t v_diff = vsubl_u8(v_a, v_b); |
(...skipping 19 matching lines...) Expand all Loading... |
81 unsigned int *sse, int *sum) { | 86 unsigned int *sse, int *sum) { |
82 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8, | 87 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8, |
83 kHeight8, sse, sum); | 88 kHeight8, sse, sum); |
84 } | 89 } |
85 | 90 |
86 unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, | 91 unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, |
87 const uint8_t *b, int b_stride, | 92 const uint8_t *b, int b_stride, |
88 unsigned int *sse) { | 93 unsigned int *sse) { |
89 int sum; | 94 int sum; |
90 variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); | 95 variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); |
91 return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); | 96 return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / (8 * 8) |
92 } | 97 } |
93 | 98 |
94 void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, | 99 void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, |
95 const uint8_t *ref_ptr, int ref_stride, | 100 const uint8_t *ref_ptr, int ref_stride, |
96 unsigned int *sse, int *sum) { | 101 unsigned int *sse, int *sum) { |
97 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16, | 102 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16, |
98 kHeight16, sse, sum); | 103 kHeight16, sse, sum); |
99 } | 104 } |
100 | 105 |
101 unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, | 106 unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, |
102 const uint8_t *b, int b_stride, | 107 const uint8_t *b, int b_stride, |
103 unsigned int *sse) { | 108 unsigned int *sse) { |
104 int sum; | 109 int sum; |
105 variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum); | 110 variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum); |
106 return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16)); | 111 return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / (16 * 16) |
107 } | 112 } |
108 | 113 |
109 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, | 114 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, |
110 uint8_t *output_ptr, | 115 uint8_t *output_ptr, |
111 unsigned int src_pixels_per_line, | 116 unsigned int src_pixels_per_line, |
112 int pixel_step, | 117 int pixel_step, |
113 unsigned int output_height, | 118 unsigned int output_height, |
114 unsigned int output_width, | 119 unsigned int output_width, |
115 const int16_t *vp9_filter) { | 120 const int16_t *vp9_filter) { |
116 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); | 121 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
198 unsigned int *sse, int *sum) { | 203 unsigned int *sse, int *sum) { |
199 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32, | 204 variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32, |
200 kHeight32, sse, sum); | 205 kHeight32, sse, sum); |
201 } | 206 } |
202 | 207 |
203 unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, | 208 unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, |
204 const uint8_t *b, int b_stride, | 209 const uint8_t *b, int b_stride, |
205 unsigned int *sse) { | 210 unsigned int *sse) { |
206 int sum; | 211 int sum; |
207 variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum); | 212 variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum); |
208 return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32)); | 213 return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / (32 * 32) |
| 214 } |
| 215 |
| 216 unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride, |
| 217 const uint8_t *b, int b_stride, |
| 218 unsigned int *sse) { |
| 219 int sum1, sum2; |
| 220 uint32_t sse1, sse2; |
| 221 variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, &sse1, &sum1); |
| 222 variance_neon_w8(a + (kHeight32 * a_stride), a_stride, |
| 223 b + (kHeight32 * b_stride), b_stride, kWidth32, kHeight32, |
| 224 &sse2, &sum2); |
| 225 *sse = sse1 + sse2; |
| 226 sum1 += sum2; |
 | 227 return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / (32 * 64) |
| 228 } |
| 229 |
| 230 unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride, |
| 231 const uint8_t *b, int b_stride, |
| 232 unsigned int *sse) { |
| 233 int sum1, sum2; |
| 234 uint32_t sse1, sse2; |
| 235 variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1); |
| 236 variance_neon_w8(a + (kHeight16 * a_stride), a_stride, |
| 237 b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16, |
| 238 &sse2, &sum2); |
| 239 *sse = sse1 + sse2; |
| 240 sum1 += sum2; |
 | 241 return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / (64 * 32) |
| 242 } |
| 243 |
| 244 unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride, |
| 245 const uint8_t *b, int b_stride, |
| 246 unsigned int *sse) { |
| 247 int sum1, sum2; |
| 248 uint32_t sse1, sse2; |
| 249 |
| 250 variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1); |
| 251 variance_neon_w8(a + (kHeight16 * a_stride), a_stride, |
| 252 b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16, |
| 253 &sse2, &sum2); |
| 254 sse1 += sse2; |
| 255 sum1 += sum2; |
| 256 |
| 257 variance_neon_w8(a + (kHeight16 * 2 * a_stride), a_stride, |
| 258 b + (kHeight16 * 2 * b_stride), b_stride, |
| 259 kWidth64, kHeight16, &sse2, &sum2); |
| 260 sse1 += sse2; |
| 261 sum1 += sum2; |
| 262 |
| 263 variance_neon_w8(a + (kHeight16 * 3 * a_stride), a_stride, |
| 264 b + (kHeight16 * 3 * b_stride), b_stride, |
| 265 kWidth64, kHeight16, &sse2, &sum2); |
| 266 *sse = sse1 + sse2; |
| 267 sum1 += sum2; |
 | 268 return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / (64 * 64) |
209 } | 269 } |
210 | 270 |
211 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, | 271 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, |
212 int src_stride, | 272 int src_stride, |
213 int xoffset, | 273 int xoffset, |
214 int yoffset, | 274 int yoffset, |
215 const uint8_t *dst, | 275 const uint8_t *dst, |
216 int dst_stride, | 276 int dst_stride, |
217 unsigned int *sse) { | 277 unsigned int *sse) { |
218 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32); | 278 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32); |
219 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32); | 279 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32); |
220 | 280 |
221 var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, | 281 var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, |
222 kHeight32PlusOne, kWidth32, | 282 kHeight32PlusOne, kWidth32, |
223 BILINEAR_FILTERS_2TAP(xoffset)); | 283 BILINEAR_FILTERS_2TAP(xoffset)); |
224 var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32, | 284 var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32, |
225 kWidth32, BILINEAR_FILTERS_2TAP(yoffset)); | 285 kWidth32, BILINEAR_FILTERS_2TAP(yoffset)); |
226 return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse); | 286 return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse); |
227 } | 287 } |
| 288 |
| 289 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, |
| 290 int src_stride, |
| 291 int xoffset, |
| 292 int yoffset, |
| 293 const uint8_t *dst, |
| 294 int dst_stride, |
| 295 unsigned int *sse) { |
| 296 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight64 * kWidth64); |
| 297 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight64PlusOne * kWidth64); |
| 298 |
| 299 var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, |
| 300 kHeight64PlusOne, kWidth64, |
| 301 BILINEAR_FILTERS_2TAP(xoffset)); |
| 302 var_filter_block2d_bil_w16(fdata3, temp2, kWidth64, kWidth64, kHeight64, |
| 303 kWidth64, BILINEAR_FILTERS_2TAP(yoffset)); |
| 304 return vp9_variance64x64_neon(temp2, kWidth64, dst, dst_stride, sse); |
| 305 } |
OLD | NEW |