OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <emmintrin.h> // SSE2 | 11 #include <emmintrin.h> // SSE2 |
12 | 12 |
| 13 #include "./vp9_rtcd.h" |
13 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
14 | 15 |
15 #include "vp9/encoder/vp9_variance.h" | 16 #include "vp9/encoder/vp9_variance.h" |
16 #include "vpx_ports/mem.h" | 17 #include "vpx_ports/mem.h" |
17 | 18 |
18 typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride, | |
19 const unsigned char *ref, int ref_stride, | |
20 unsigned int *sse, int *sum); | |
21 | |
22 unsigned int vp9_get_mb_ss_sse2(const int16_t *src) { | |
23 __m128i vsum = _mm_setzero_si128(); | |
24 int i; | |
25 | |
26 for (i = 0; i < 32; ++i) { | |
27 const __m128i v = _mm_loadu_si128((const __m128i *)src); | |
28 vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); | |
29 src += 8; | |
30 } | |
31 | |
32 vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); | |
33 vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); | |
34 return _mm_cvtsi128_si32(vsum); | |
35 } | |
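For reference, a scalar sketch of what vp9_get_mb_ss_sse2 vectorizes: the sum of squares over 256 int16 values (a 16x16 block of residuals; the SIMD loop handles 8 values per iteration for 32 iterations). Illustration only, with a hypothetical name, not part of the patch:

// Scalar equivalent (sketch): sum of squares of 256 int16 residuals.
// Assumes pixel-bounded residual magnitudes so the total fits in 32 bits.
static unsigned int get_mb_ss_ref(const int16_t *src) {
  unsigned int sum = 0;
  int i;
  for (i = 0; i < 256; ++i)
    sum += src[i] * src[i];
  return sum;
}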
36 | |
37 #define READ64(p, stride, i) \ | |
38 _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ | |
39 _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) | |
40 | |
41 unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride, | |
42 const uint8_t *ref, int ref_stride, | |
43 unsigned int *sse, int *sum) { | |
44 const __m128i zero = _mm_setzero_si128(); | |
45 const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); | |
46 const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); | |
47 const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); | |
48 const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); | |
49 const __m128i diff0 = _mm_sub_epi16(src0, ref0); | |
50 const __m128i diff1 = _mm_sub_epi16(src1, ref1); | |
51 | |
52 // sum | |
53 __m128i vsum = _mm_add_epi16(diff0, diff1); | |
54 vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); | |
55 vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); | |
56 vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); | |
57 *sum = (int16_t)_mm_extract_epi16(vsum, 0); | |
58 | |
59 // sse | |
60 vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), | |
61 _mm_madd_epi16(diff1, diff1)); | |
62 vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); | |
63 vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); | |
64 *sse = _mm_cvtsi128_si32(vsum); | |
65 | |
66 return 0; | |
67 } | |
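The 8x8 and 16x16 kernels below share the same contract: each returns the block's sum of pixel differences through *sum and the sum of squared differences through *sse. A scalar sketch of that contract (illustrative name, not part of the patch):

// Scalar equivalent of the getNxNvar kernels: accumulate the sum and the
// sum of squared differences between an n x n source and reference block.
static void get_nxn_var_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride, int n,
                            unsigned int *sse, int *sum) {
  int i, j;
  *sum = 0;
  *sse = 0;
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n; ++j) {
      const int diff = src[j] - ref[j];
      *sum += diff;
      *sse += diff * diff;
    }
    src += src_stride;
    ref += ref_stride;
  }
}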
68 | |
69 unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride, | |
70 const uint8_t *ref, int ref_stride, | |
71 unsigned int *sse, int *sum) { | |
72 const __m128i zero = _mm_setzero_si128(); | |
73 __m128i vsum = _mm_setzero_si128(); | |
74 __m128i vsse = _mm_setzero_si128(); | |
75 int i; | |
76 | |
77 for (i = 0; i < 8; i += 2) { | |
78 const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( | |
79 (const __m128i *)(src + i * src_stride)), zero); | |
80 const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( | |
81 (const __m128i *)(ref + i * ref_stride)), zero); | |
82 const __m128i diff0 = _mm_sub_epi16(src0, ref0); | |
83 | |
84 const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( | |
85 (const __m128i *)(src + (i + 1) * src_stride)), zero); | |
86 const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( | |
87 (const __m128i *)(ref + (i + 1) * ref_stride)), zero); | |
88 const __m128i diff1 = _mm_sub_epi16(src1, ref1); | |
89 | |
90 vsum = _mm_add_epi16(vsum, diff0); | |
91 vsum = _mm_add_epi16(vsum, diff1); | |
92 vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); | |
93 vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); | |
94 } | |
95 | |
96 // sum | |
97 vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); | |
98 vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); | |
99 vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); | |
100 *sum = (int16_t)_mm_extract_epi16(vsum, 0); | |
101 | |
102 // sse | |
103 vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); | |
104 vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); | |
105 *sse = _mm_cvtsi128_si32(vsse); | |
106 | |
107 return 0; | |
108 } | |
109 | |
110 unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride, | |
111 const uint8_t *ref, int ref_stride, | |
112 unsigned int *sse, int *sum) { | |
113 const __m128i zero = _mm_setzero_si128(); | |
114 __m128i vsum = _mm_setzero_si128(); | |
115 __m128i vsse = _mm_setzero_si128(); | |
116 int i; | |
117 | |
118 for (i = 0; i < 16; ++i) { | |
119 const __m128i s = _mm_loadu_si128((const __m128i *)src); | |
120 const __m128i r = _mm_loadu_si128((const __m128i *)ref); | |
121 | |
122 const __m128i src0 = _mm_unpacklo_epi8(s, zero); | |
123 const __m128i ref0 = _mm_unpacklo_epi8(r, zero); | |
124 const __m128i diff0 = _mm_sub_epi16(src0, ref0); | |
125 | |
126 const __m128i src1 = _mm_unpackhi_epi8(s, zero); | |
127 const __m128i ref1 = _mm_unpackhi_epi8(r, zero); | |
128 const __m128i diff1 = _mm_sub_epi16(src1, ref1); | |
129 | |
130 vsum = _mm_add_epi16(vsum, diff0); | |
131 vsum = _mm_add_epi16(vsum, diff1); | |
132 vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); | |
133 vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); | |
134 | |
135 src += src_stride; | |
136 ref += ref_stride; | |
137 } | |
138 | |
139 // sum | |
140 vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); | |
141 vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); | |
142 *sum = (int16_t)_mm_extract_epi16(vsum, 0) + | |
143 (int16_t)_mm_extract_epi16(vsum, 1); | |
144 | |
145 // sse | |
146 vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); | |
147 vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); | |
148 *sse = _mm_cvtsi128_si32(vsse); | |
149 | |
150 return 0; | |
151 } | |
152 | |
153 | |
154 static void variance_sse2(const unsigned char *src, int src_stride, | |
155 const unsigned char *ref, int ref_stride, | |
156 int w, int h, unsigned int *sse, int *sum, | |
157 variance_fn_t var_fn, int block_size) { | |
158 int i, j; | |
159 | |
160 *sse = 0; | |
161 *sum = 0; | |
162 | |
163 for (i = 0; i < h; i += block_size) { | |
164 for (j = 0; j < w; j += block_size) { | |
165 unsigned int sse0; | |
166 int sum0; | |
167 var_fn(src + src_stride * i + j, src_stride, | |
168 ref + ref_stride * i + j, ref_stride, &sse0, &sum0); | |
169 *sse += sse0; | |
170 *sum += sum0; | |
171 } | |
172 } | |
173 } | |
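variance_sse2 tiles a w x h block into block_size x block_size sub-blocks and accumulates each kernel call's partial SSE and sum. As an illustration, mirroring the 32x16 front end further below (hypothetical wrapper name):

// Sketch: accumulate SSE and sum over a 32x16 block; the helper visits the
// block as two 16x16 tiles and calls vp9_get16x16var_sse2 on each.
static void accumulate_32x16_example(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     unsigned int *sse, int *sum) {
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, sum, vp9_get16x16var_sse2, 16);
}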
174 | |
175 unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride, | |
176 const unsigned char *ref, int ref_stride, | |
177 unsigned int *sse) { | |
178 int sum; | |
179 vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); | |
180 return *sse - (((unsigned int)sum * sum) >> 4); | |
181 } | |
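This and the remaining variance front ends all finish with the same identity: variance = SSE - sum^2 / N, where N = w * h, so the final shift is log2(N). A minimal sketch of that last step (hypothetical helper, not part of the patch):

// variance = sse - (sum * sum) / N, with the divide done as a shift by
// log2(N). For blocks of 512 pixels or more (the 32x16 and larger variants),
// |sum| can reach 255 * N, so sum * sum no longer fits in 32 bits; those
// front ends widen to int64_t before squaring, as seen below.
static unsigned int variance_final(unsigned int sse, int sum, int log2_n) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> log2_n);
}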
182 | |
183 unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride, | |
184 const uint8_t *ref, int ref_stride, | |
185 unsigned int *sse) { | |
186 int sum; | |
187 variance_sse2(src, src_stride, ref, ref_stride, 8, 4, | |
188 sse, &sum, vp9_get4x4var_sse2, 4); | |
189 return *sse - (((unsigned int)sum * sum) >> 5); | |
190 } | |
191 | |
192 unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride, | |
193 const uint8_t *ref, int ref_stride, | |
194 unsigned int *sse) { | |
195 int sum; | |
196 variance_sse2(src, src_stride, ref, ref_stride, 4, 8, | |
197 sse, &sum, vp9_get4x4var_sse2, 4); | |
198 return *sse - (((unsigned int)sum * sum) >> 5); | |
199 } | |
200 | |
201 unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride, | |
202 const unsigned char *ref, int ref_stride, | |
203 unsigned int *sse) { | |
204 int sum; | |
205 vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); | |
206 return *sse - (((unsigned int)sum * sum) >> 6); | |
207 } | |
208 | |
209 unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride, | |
210 const unsigned char *ref, int ref_stride, | |
211 unsigned int *sse) { | |
212 int sum; | |
213 variance_sse2(src, src_stride, ref, ref_stride, 16, 8, | |
214 sse, &sum, vp9_get8x8var_sse2, 8); | |
215 return *sse - (((unsigned int)sum * sum) >> 7); | |
216 } | |
217 | |
218 unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride, | |
219 const unsigned char *ref, int ref_stride, | |
220 unsigned int *sse) { | |
221 int sum; | |
222 variance_sse2(src, src_stride, ref, ref_stride, 8, 16, | |
223 sse, &sum, vp9_get8x8var_sse2, 8); | |
224 return *sse - (((unsigned int)sum * sum) >> 7); | |
225 } | |
226 | |
227 unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride, | |
228 const unsigned char *ref, int ref_stride, | |
229 unsigned int *sse) { | |
230 int sum; | |
231 vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); | |
232 return *sse - (((unsigned int)sum * sum) >> 8); | |
233 } | |
234 | |
235 unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride, | |
236 const uint8_t *ref, int ref_stride, | |
237 unsigned int *sse) { | |
238 int sum; | |
239 variance_sse2(src, src_stride, ref, ref_stride, 32, 32, | |
240 sse, &sum, vp9_get16x16var_sse2, 16); | |
241 return *sse - (((int64_t)sum * sum) >> 10); | |
242 } | |
243 | |
244 unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride, | |
245 const uint8_t *ref, int ref_stride, | |
246 unsigned int *sse) { | |
247 int sum; | |
248 variance_sse2(src, src_stride, ref, ref_stride, 32, 16, | |
249 sse, &sum, vp9_get16x16var_sse2, 16); | |
250 return *sse - (((int64_t)sum * sum) >> 9); | |
251 } | |
252 | |
253 unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride, | |
254 const uint8_t *ref, int ref_stride, | |
255 unsigned int *sse) { | |
256 int sum; | |
257 variance_sse2(src, src_stride, ref, ref_stride, 16, 32, | |
258 sse, &sum, vp9_get16x16var_sse2, 16); | |
259 return *sse - (((int64_t)sum * sum) >> 9); | |
260 } | |
261 | |
262 unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride, | |
263 const uint8_t *ref, int ref_stride, | |
264 unsigned int *sse) { | |
265 int sum; | |
266 variance_sse2(src, src_stride, ref, ref_stride, 64, 64, | |
267 sse, &sum, vp9_get16x16var_sse2, 16); | |
268 return *sse - (((int64_t)sum * sum) >> 12); | |
269 } | |
270 | |
271 unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride, | |
272 const uint8_t *ref, int ref_stride, | |
273 unsigned int *sse) { | |
274 int sum; | |
275 variance_sse2(src, src_stride, ref, ref_stride, 64, 32, | |
276 sse, &sum, vp9_get16x16var_sse2, 16); | |
277 return *sse - (((int64_t)sum * sum) >> 11); | |
278 } | |
279 | |
280 unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride, | |
281 const uint8_t *ref, int ref_stride, | |
282 unsigned int *sse) { | |
283 int sum; | |
284 variance_sse2(src, src_stride, ref, ref_stride, 32, 64, | |
285 sse, &sum, vp9_get16x16var_sse2, 16); | |
286 return *sse - (((int64_t)sum * sum) >> 11); | |
287 } | |
288 | |
289 unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride, | |
290 const uint8_t *ref, int ref_stride, | |
291 unsigned int *sse) { | |
292 vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); | |
293 return *sse; | |
294 } | |
295 | |
296 unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride, | |
297 const uint8_t *ref, int ref_stride, | |
298 unsigned int *sse) { | |
299 vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); | |
300 return *sse; | |
301 } | |
302 | |
303 unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride, | |
304 const uint8_t *ref, int ref_stride, | |
305 unsigned int *sse) { | |
306 vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); | |
307 return *sse; | |
308 } | |
309 | |
310 unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride, | |
311 const uint8_t *ref, int ref_stride, | |
312 unsigned int *sse) { | |
313 vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); | |
314 return *sse; | |
315 } | |
316 | |
317 // The 2 unused parameters are placeholders for the PIC-enabled build. | 19 // The 2 unused parameters are placeholders for the PIC-enabled build. |
318 #define DECL(w, opt) \ | 20 #define DECL(w, opt) \ |
319 int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ | 21 int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ |
320 ptrdiff_t src_stride, \ | 22 ptrdiff_t src_stride, \ |
321 int x_offset, int y_offset, \ | 23 int x_offset, int y_offset, \ |
322 const uint8_t *dst, \ | 24 const uint8_t *dst, \ |
323 ptrdiff_t dst_stride, \ | 25 ptrdiff_t dst_stride, \ |
324 int height, unsigned int *sse, \ | 26 int height, unsigned int *sse, \ |
325 void *unused0, void *unused) | 27 void *unused0, void *unused) |
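For reference, DECL(8, sse2) expands to the prototype of the corresponding assembly kernel; a mechanical expansion of the macro above, shown for illustration only:

// Expansion of DECL(8, sse2): prototype of the asm sub-pixel variance kernel.
int vp9_sub_pixel_variance8xh_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                   int x_offset, int y_offset,
                                   const uint8_t *dst, ptrdiff_t dst_stride,
                                   int height, unsigned int *sse,
                                   void *unused0, void *unused);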
326 #define DECLS(opt1, opt2) \ | 28 #define DECLS(opt1, opt2) \ |
(...skipping 144 matching lines...)
471 FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \ | 173 FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \ |
472 FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \ | 174 FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \ |
473 FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \ | 175 FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \ |
474 FN(4, 4, 4, 2, 2, opt2, (unsigned int)) | 176 FN(4, 4, 4, 2, 2, opt2, (unsigned int)) |
475 | 177 |
476 FNS(sse2, sse); | 178 FNS(sse2, sse); |
477 FNS(ssse3, ssse3); | 179 FNS(ssse3, ssse3); |
478 | 180 |
479 #undef FNS | 181 #undef FNS |
480 #undef FN | 182 #undef FN |