source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c - Issue 554673004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c

Issue 554673004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10 #include "./vpx_config.h"	10 #include "./vpx_config.h"

11	11

12 #include "vp9/encoder/vp9_variance.h"	12 #include "vp9/encoder/vp9_variance.h"

13 #include "vpx_ports/mem.h"	13 #include "vpx_ports/mem.h"

14	14

15 typedef void (*get_var_avx2) (	15 typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,

16 const unsigned char *src_ptr,	16 const uint8_t *ref, int ref_stride,

17 int source_stride,	17 unsigned int *sse, int *sum);

18 const unsigned char *ref_ptr,

19 int recon_stride,

20 unsigned int *SSE,

21 int *Sum

22 );

23	18

24 void vp9_get16x16var_avx2	19 void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,

25 (	20 const uint8_t *ref, int ref_stride,

26 const unsigned char *src_ptr,	21 unsigned int *sse, int *sum);

27 int source_stride,

28 const unsigned char *ref_ptr,

29 int recon_stride,

30 unsigned int *SSE,

31 int *Sum

32 );

33	22

34 void vp9_get32x32var_avx2	23 void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,

35 (	24 const uint8_t *ref, int ref_stride,

36 const unsigned char *src_ptr,	25 unsigned int *sse, int *sum);

37 int source_stride,

38 const unsigned char *ref_ptr,

39 int recon_stride,

40 unsigned int *SSE,

41 int *Sum

42 );

43	26

44 unsigned int vp9_sub_pixel_variance32xh_avx2	27 unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,

45 (	28 int x_offset, int y_offset,

46 const uint8_t *src,	29 const uint8_t *dst, int dst_stride,

47 int src_stride,	30 int height,

48 int x_offset,	31 unsigned int *sse);

49 int y_offset,

50 const uint8_t *dst,

51 int dst_stride,

52 int height,

53 unsigned int *sse

54 );

55	32

56 unsigned int vp9_sub_pixel_avg_variance32xh_avx2	33 unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,

57 (	34 int src_stride,

58 const uint8_t *src,	35 int x_offset,

59 int src_stride,	36 int y_offset,

60 int x_offset,	37 const uint8_t *dst,

61 int y_offset,	38 int dst_stride,

62 const uint8_t *dst,	39 const uint8_t *sec,

63 int dst_stride,	40 int sec_stride,

64 const uint8_t *sec,	41 int height,

65 int sec_stride,	42 unsigned int *sseptr);

66 int height,

67 unsigned int *sseptr

68 );

69	43

70 static void variance_avx2(const unsigned char *src_ptr, int source_stride,	44 static void variance_avx2(const uint8_t *src, int src_stride,

71 const unsigned char *ref_ptr, int recon_stride,	45 const uint8_t *ref, int ref_stride,

72 int w, int h, unsigned int *sse, int *sum,	46 int w, int h, unsigned int *sse, int *sum,

73 get_var_avx2 var_fn, int block_size) {	47 get_var_avx2 var_fn, int block_size) {

74 unsigned int sse0;

75 int sum0;

76 int i, j;	48 int i, j;

77	49

78 *sse = 0;	50 *sse = 0;

79 *sum = 0;	51 *sum = 0;

80	52

81 for (i = 0; i < h; i += 16) {	53 for (i = 0; i < h; i += 16) {

82 for (j = 0; j < w; j += block_size) {	54 for (j = 0; j < w; j += block_size) {

83 // processing 16 rows horizontally each call	55 unsigned int sse0;

84 var_fn(src_ptr + source_stride * i + j, source_stride,	56 int sum0;

85 ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);	57 var_fn(&src[src_stride * i + j], src_stride,

	58 &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);

86 *sse += sse0;	59 *sse += sse0;

87 *sum += sum0;	60 *sum += sum0;

88 }	61 }

89 }	62 }

90 }	63 }

91	64

92 unsigned int vp9_variance16x16_avx2

93 (

94 const unsigned char *src_ptr,

95 int source_stride,

96 const unsigned char *ref_ptr,

97 int recon_stride,

98 unsigned int *sse) {

99 unsigned int var;

100 int avg;

101	65

102 variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,	66 unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,

103 &var, &avg, vp9_get16x16var_avx2, 16);	67 const uint8_t *ref, int ref_stride,

104 *sse = var;	68 unsigned int *sse) {

105 return (var - (((unsigned int)avg * avg) >> 8));	69 int sum;

	70 variance_avx2(src, src_stride, ref, ref_stride, 16, 16,

	71 sse, &sum, vp9_get16x16var_avx2, 16);

	72 return *sse - (((unsigned int)sum * sum) >> 8);

106 }	73 }

107	74

108 unsigned int vp9_mse16x16_avx2(	75 unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,

109 const unsigned char *src_ptr,	76 const uint8_t *ref, int ref_stride,

110 int source_stride,	77 unsigned int *sse) {

111 const unsigned char *ref_ptr,	78 int sum;

112 int recon_stride,	79 vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);

113 unsigned int *sse) {	80 return *sse;

114 unsigned int sse0;

115 int sum0;

116 vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,

117 &sum0);

118 *sse = sse0;

119 return sse0;

120 }	81 }

121	82

122 unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,	83 unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,

123 int source_stride,	84 const uint8_t *ref, int ref_stride,

124 const uint8_t *ref_ptr,

125 int recon_stride,

126 unsigned int *sse) {	85 unsigned int *sse) {

127 unsigned int var;	86 int sum;

128 int avg;	87 variance_avx2(src, src_stride, ref, ref_stride, 32, 16,

129	88 sse, &sum, vp9_get32x32var_avx2, 32);

130 // processing 32 elements vertically in parallel	89 return *sse - (((int64_t)sum * sum) >> 9);

131 variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,

132 &var, &avg, vp9_get32x32var_avx2, 32);

133 *sse = var;

134 return (var - (((int64_t)avg * avg) >> 10));

135 }	90 }

136	91

137 unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,	92 unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,

138 int source_stride,	93 const uint8_t *ref, int ref_stride,

139 const uint8_t *ref_ptr,

140 int recon_stride,

141 unsigned int *sse) {	94 unsigned int *sse) {

142 unsigned int var;	95 int sum;

143 int avg;	96 variance_avx2(src, src_stride, ref, ref_stride, 32, 32,

144	97 sse, &sum, vp9_get32x32var_avx2, 32);

145 // processing 32 elements vertically in parallel	98 return *sse - (((int64_t)sum * sum) >> 10);

146 variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,

147 &var, &avg, vp9_get32x32var_avx2, 32);

148 *sse = var;

149 return (var - (((int64_t)avg * avg) >> 9));

150 }	99 }

151	100

152	101 unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,

153 unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,	102 const uint8_t *ref, int ref_stride,

154 int source_stride,

155 const uint8_t *ref_ptr,

156 int recon_stride,

157 unsigned int *sse) {	103 unsigned int *sse) {

158 unsigned int var;	104 int sum;

159 int avg;	105 variance_avx2(src, src_stride, ref, ref_stride, 64, 64,

160	106 sse, &sum, vp9_get32x32var_avx2, 32);

161 // processing 32 elements vertically in parallel	107 return *sse - (((int64_t)sum * sum) >> 12);

162 variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,

163 &var, &avg, vp9_get32x32var_avx2, 32);

164 *sse = var;

165 return (var - (((int64_t)avg * avg) >> 12));

166 }	108 }

167	109

168 unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,	110 unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,

169 int source_stride,	111 const uint8_t *ref, int ref_stride,

170 const uint8_t *ref_ptr,

171 int recon_stride,

172 unsigned int *sse) {	112 unsigned int *sse) {

173 unsigned int var;	113 int sum;

174 int avg;	114 variance_avx2(src, src_stride, ref, ref_stride, 64, 32,

175	115 sse, &sum, vp9_get32x32var_avx2, 32);

176 // processing 32 elements vertically in parallel	116 return *sse - (((int64_t)sum * sum) >> 11);

177 variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,

178 &var, &avg, vp9_get32x32var_avx2, 32);

179

180 *sse = var;

181 return (var - (((int64_t)avg * avg) >> 11));

182 }	117 }

183	118

184 unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,	119 unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,

185 int src_stride,	120 int src_stride,

186 int x_offset,	121 int x_offset,

187 int y_offset,	122 int y_offset,

188 const uint8_t *dst,	123 const uint8_t *dst,

189 int dst_stride,	124 int dst_stride,

190 unsigned int *sse_ptr) {	125 unsigned int *sse) {

191 // processing 32 elements in parallel	126 unsigned int sse1;

192 unsigned int sse;	127 const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,

193 int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,	128 y_offset, dst, dst_stride,

194 y_offset, dst, dst_stride,	129 64, &sse1);

195 64, &sse);

196 // processing the next 32 elements in parallel

197 unsigned int sse2;	130 unsigned int sse2;

198 int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,	131 const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,

199 x_offset, y_offset,	132 x_offset, y_offset,

200 dst + 32, dst_stride,	133 dst + 32, dst_stride,

201 64, &sse2);	134 64, &sse2);

202 se += se2;	135 const int se = se1 + se2;

203 sse += sse2;	136 *sse = sse1 + sse2;

204 *sse_ptr = sse;	137 return *sse - (((int64_t)se * se) >> 12);

205 return sse - (((int64_t)se * se) >> 12);

206 }	138 }

207	139

208 unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,	140 unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,

209 int src_stride,	141 int src_stride,

210 int x_offset,	142 int x_offset,

211 int y_offset,	143 int y_offset,

212 const uint8_t *dst,	144 const uint8_t *dst,

213 int dst_stride,	145 int dst_stride,

214 unsigned int *sse_ptr) {	146 unsigned int *sse) {

215 // processing 32 element in parallel	147 const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,

216 unsigned int sse;	148 y_offset, dst, dst_stride,

217 int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,	149 32, sse);

218 y_offset, dst, dst_stride,	150 return *sse - (((int64_t)se * se) >> 10);

219 32, &sse);

220 *sse_ptr = sse;

221 return sse - (((int64_t)se * se) >> 10);

222 }	151 }

223	152

224 unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,	153 unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,

225 int src_stride,	154 int src_stride,

226 int x_offset,	155 int x_offset,

227 int y_offset,	156 int y_offset,

228 const uint8_t *dst,	157 const uint8_t *dst,

229 int dst_stride,	158 int dst_stride,

230 unsigned int *sseptr,	159 unsigned int *sse,

231 const uint8_t *sec) {	160 const uint8_t *sec) {

232 // processing 32 elements in parallel	161 unsigned int sse1;

233 unsigned int sse;	162 const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,

	163 y_offset, dst, dst_stride,

	164 sec, 64, 64, &sse1);

	165 unsigned int sse2;

	166 const int se2 =

	167 vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,

	168 y_offset, dst + 32, dst_stride,

	169 sec + 32, 64, 64, &sse2);

	170 const int se = se1 + se2;

234	171

235 int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,	172 *sse = sse1 + sse2;

236 y_offset, dst, dst_stride,

237 sec, 64, 64, &sse);

238 unsigned int sse2;

239 // processing the next 32 elements in parallel

240 int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,

241 y_offset, dst + 32, dst_stride,

242 sec + 32, 64, 64, &sse2);

243 se += se2;

244 sse += sse2;

245 *sseptr = sse;

246	173

247 return sse - (((int64_t)se * se) >> 12);	174 return *sse - (((int64_t)se * se) >> 12);

248 }	175 }

249	176

250 unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,	177 unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,

251 int src_stride,	178 int src_stride,

252 int x_offset,	179 int x_offset,

253 int y_offset,	180 int y_offset,

254 const uint8_t *dst,	181 const uint8_t *dst,

255 int dst_stride,	182 int dst_stride,

256 unsigned int *sseptr,	183 unsigned int *sse,

257 const uint8_t *sec) {	184 const uint8_t *sec) {

258 // processing 32 element in parallel	185 // processing 32 element in parallel

259 unsigned int sse;	186 const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,

260 int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,	187 y_offset, dst, dst_stride,

261 y_offset, dst, dst_stride,	188 sec, 32, 32, sse);

262 sec, 32, 32, &sse);	189 return *sse - (((int64_t)se * se) >> 10);

263 *sseptr = sse;

264 return sse - (((int64_t)se * se) >> 10);

265 }	190 }

266

267

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm ('k') | source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm » ('j') | no next file with comments »