source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c - Issue 1169543007: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c

Issue 1169543007: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <arm_neon.h>	11 #include <arm_neon.h>

12 #include "./vp9_rtcd.h"	12 #include "./vp9_rtcd.h"

13 #include "./vpx_dsp_rtcd.h"	13 #include "./vpx_dsp_rtcd.h"

14 #include "./vpx_config.h"	14 #include "./vpx_config.h"

15	15

16 #include "vpx_ports/mem.h"	16 #include "vpx_ports/mem.h"

17 #include "vpx/vpx_integer.h"	17 #include "vpx/vpx_integer.h"

18	18

19 #include "vp9/common/vp9_common.h"

20 #include "vp9/common/vp9_filter.h"	19 #include "vp9/common/vp9_filter.h"

21	20

22 #include "vp9/encoder/vp9_variance.h"	21 static uint8_t bilinear_filters[8][2] = {

	22 { 128, 0, },

	23 { 112, 16, },

	24 { 96, 32, },

	25 { 80, 48, },

	26 { 64, 64, },

	27 { 48, 80, },

	28 { 32, 96, },

	29 { 16, 112, },

	30 };

23	31

24 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,	32 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,

25 uint8_t *output_ptr,	33 uint8_t *output_ptr,

26 unsigned int src_pixels_per_line,	34 unsigned int src_pixels_per_line,

27 int pixel_step,	35 int pixel_step,

28 unsigned int output_height,	36 unsigned int output_height,

29 unsigned int output_width,	37 unsigned int output_width,

30 const int16_t *vp9_filter) {	38 const uint8_t *vp9_filter) {

31 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);	39 const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);

32 const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);	40 const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);

33 unsigned int i;	41 unsigned int i;

34 for (i = 0; i < output_height; ++i) {	42 for (i = 0; i < output_height; ++i) {

35 const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);	43 const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);

36 const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);	44 const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);

37 const uint16x8_t a = vmull_u8(src_0, f0);	45 const uint16x8_t a = vmull_u8(src_0, f0);

38 const uint16x8_t b = vmlal_u8(a, src_1, f1);	46 const uint16x8_t b = vmlal_u8(a, src_1, f1);

39 const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);	47 const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);

40 vst1_u8(&output_ptr[0], out);	48 vst1_u8(&output_ptr[0], out);

41 // Next row...	49 // Next row...

42 src_ptr += src_pixels_per_line;	50 src_ptr += src_pixels_per_line;

43 output_ptr += output_width;	51 output_ptr += output_width;

44 }	52 }

45 }	53 }

46	54

47 static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,	55 static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,

48 uint8_t *output_ptr,	56 uint8_t *output_ptr,

49 unsigned int src_pixels_per_line,	57 unsigned int src_pixels_per_line,

50 int pixel_step,	58 int pixel_step,

51 unsigned int output_height,	59 unsigned int output_height,

52 unsigned int output_width,	60 unsigned int output_width,

53 const int16_t *vp9_filter) {	61 const uint8_t *vp9_filter) {

54 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);	62 const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);

55 const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);	63 const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);

56 unsigned int i, j;	64 unsigned int i, j;

57 for (i = 0; i < output_height; ++i) {	65 for (i = 0; i < output_height; ++i) {

58 for (j = 0; j < output_width; j += 16) {	66 for (j = 0; j < output_width; j += 16) {

59 const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);	67 const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);

60 const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);	68 const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);

61 const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);	69 const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);

62 const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);	70 const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);

63 const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);	71 const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);

64 const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);	72 const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);

65 const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);	73 const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);

(... 11 matching lines skipped: OLD lines 66-76 / NEW lines 74-84 ...)
77 int xoffset,	85 int xoffset,

78 int yoffset,	86 int yoffset,

79 const uint8_t *dst,	87 const uint8_t *dst,

80 int dst_stride,	88 int dst_stride,

81 unsigned int *sse) {	89 unsigned int *sse) {

82 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);	90 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);

83 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);	91 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);

84	92

85 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,	93 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,

86 9, 8,	94 9, 8,

87 BILINEAR_FILTERS_2TAP(xoffset));	95 bilinear_filters[xoffset]);

88 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,	96 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,

89 8, BILINEAR_FILTERS_2TAP(yoffset));	97 8, bilinear_filters[yoffset]);

90 return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);	98 return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);

91 }	99 }

92	100

93 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,	101 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,

94 int src_stride,	102 int src_stride,

95 int xoffset,	103 int xoffset,

96 int yoffset,	104 int yoffset,

97 const uint8_t *dst,	105 const uint8_t *dst,

98 int dst_stride,	106 int dst_stride,

99 unsigned int *sse) {	107 unsigned int *sse) {

100 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);	108 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);

101 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);	109 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);

102	110

103 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,	111 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,

104 17, 16,	112 17, 16,

105 BILINEAR_FILTERS_2TAP(xoffset));	113 bilinear_filters[xoffset]);

106 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,	114 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,

107 16, BILINEAR_FILTERS_2TAP(yoffset));	115 16, bilinear_filters[yoffset]);

108 return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);	116 return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);

109 }	117 }

110	118

111 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,	119 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,

112 int src_stride,	120 int src_stride,

113 int xoffset,	121 int xoffset,

114 int yoffset,	122 int yoffset,

115 const uint8_t *dst,	123 const uint8_t *dst,

116 int dst_stride,	124 int dst_stride,

117 unsigned int *sse) {	125 unsigned int *sse) {

118 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);	126 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);

119 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);	127 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);

120	128

121 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,	129 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,

122 33, 32,	130 33, 32,

123 BILINEAR_FILTERS_2TAP(xoffset));	131 bilinear_filters[xoffset]);

124 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,	132 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,

125 32, BILINEAR_FILTERS_2TAP(yoffset));	133 32, bilinear_filters[yoffset]);

126 return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);	134 return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);

127 }	135 }

128	136

129 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,	137 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,

130 int src_stride,	138 int src_stride,

131 int xoffset,	139 int xoffset,

132 int yoffset,	140 int yoffset,

133 const uint8_t *dst,	141 const uint8_t *dst,

134 int dst_stride,	142 int dst_stride,

135 unsigned int *sse) {	143 unsigned int *sse) {

136 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);	144 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);

137 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);	145 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);

138	146

139 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,	147 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,

140 65, 64,	148 65, 64,

141 BILINEAR_FILTERS_2TAP(xoffset));	149 bilinear_filters[xoffset]);

142 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,	150 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,

143 64, BILINEAR_FILTERS_2TAP(yoffset));	151 64, bilinear_filters[yoffset]);

144 return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);	152 return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);

145 }	153 }

OLD	NEW

« No previous file with comments | « Previous file: source/libvpx/vp9/decoder/vp9_detokenize.c ('k') | Next file: source/libvpx/vp9/encoder/vp9_aq_variance.c ('j') » | No next file with comments »