source/libvpx/vp9/common/x86/vp9_asm_stubs.c - Issue 1162573005: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_asm_stubs.c

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <assert.h>	11 #include "./vp9_rtcd.h"

12

13 #include "./vpx_config.h"	12 #include "./vpx_config.h"

14 #include "./vp9_rtcd.h"	13 #include "vp9/common/x86/convolve.h"

15 #include "vpx_ports/mem.h"

16

17 typedef void filter8_1dfunction (

18 const unsigned char *src_ptr,

19 const ptrdiff_t src_pitch,

20 unsigned char *output_ptr,

21 ptrdiff_t out_pitch,

22 unsigned int output_height,

23 const short *filter

24 );

25

26 #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \

27 void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \

28 uint8_t *dst, ptrdiff_t dst_stride, \

29 const int16_t *filter_x, int x_step_q4, \

30 const int16_t *filter_y, int y_step_q4, \

31 int w, int h) { \

32 if (step_q4 == 16 && filter[3] != 128) { \

33 if (filter[0] || filter[1] || filter[2]) { \

34 while (w >= 16) { \

35 vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \

36 src_stride, \

37 dst, \

38 dst_stride, \

39 h, \

40 filter); \

41 src += 16; \

42 dst += 16; \

43 w -= 16; \

44 } \

45 while (w >= 8) { \

46 vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \

47 src_stride, \

48 dst, \

49 dst_stride, \

50 h, \

51 filter); \

52 src += 8; \

53 dst += 8; \

54 w -= 8; \

55 } \

56 while (w >= 4) { \

57 vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \

58 src_stride, \

59 dst, \

60 dst_stride, \

61 h, \

62 filter); \

63 src += 4; \

64 dst += 4; \

65 w -= 4; \

66 } \

67 } else { \

68 while (w >= 16) { \

69 vp9_filter_block1d16_##dir##2_##avg##opt(src, \

70 src_stride, \

71 dst, \

72 dst_stride, \

73 h, \

74 filter); \

75 src += 16; \

76 dst += 16; \

77 w -= 16; \

78 } \

79 while (w >= 8) { \

80 vp9_filter_block1d8_##dir##2_##avg##opt(src, \

81 src_stride, \

82 dst, \

83 dst_stride, \

84 h, \

85 filter); \

86 src += 8; \

87 dst += 8; \

88 w -= 8; \

89 } \

90 while (w >= 4) { \

91 vp9_filter_block1d4_##dir##2_##avg##opt(src, \

92 src_stride, \

93 dst, \

94 dst_stride, \

95 h, \

96 filter); \

97 src += 4; \

98 dst += 4; \

99 w -= 4; \

100 } \

101 } \

102 } \

103 if (w) { \

104 vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \

105 filter_x, x_step_q4, filter_y, y_step_q4, \

106 w, h); \

107 } \

108 }

109

110 #define FUN_CONV_2D(avg, opt) \

111 void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \

112 uint8_t *dst, ptrdiff_t dst_stride, \

113 const int16_t *filter_x, int x_step_q4, \

114 const int16_t *filter_y, int y_step_q4, \

115 int w, int h) { \

116 assert(w <= 64); \

117 assert(h <= 64); \

118 if (x_step_q4 == 16 && y_step_q4 == 16) { \

119 if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \

120 filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \

121 DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]); \

122 vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \

123 filter_x, x_step_q4, filter_y, y_step_q4, \

124 w, h + 7); \

125 vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \

126 filter_x, x_step_q4, filter_y, \

127 y_step_q4, w, h); \

128 } else { \

129 DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 65]); \

130 vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \

131 filter_x, x_step_q4, filter_y, y_step_q4, \

132 w, h + 1); \

133 vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \

134 filter_x, x_step_q4, filter_y, \

135 y_step_q4, w, h); \

136 } \

137 } else { \

138 vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \

139 filter_x, x_step_q4, filter_y, y_step_q4, w, h); \

140 } \

141 }

142

143 #if CONFIG_VP9_HIGHBITDEPTH

144

145 typedef void highbd_filter8_1dfunction (

146 const uint16_t *src_ptr,

147 const ptrdiff_t src_pitch,

148 uint16_t *output_ptr,

149 ptrdiff_t out_pitch,

150 unsigned int output_height,

151 const int16_t *filter,

152 int bd

153 );

154

155 #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \

156 void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \

157 ptrdiff_t src_stride, \

158 uint8_t *dst8, \

159 ptrdiff_t dst_stride, \

160 const int16_t *filter_x, \

161 int x_step_q4, \

162 const int16_t *filter_y, \

163 int y_step_q4, \

164 int w, int h, int bd) { \

165 if (step_q4 == 16 && filter[3] != 128) { \

166 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \

167 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \

168 if (filter[0] || filter[1] || filter[2]) { \

169 while (w >= 16) { \

170 vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \

171 src_stride, \

172 dst, \

173 dst_stride, \

174 h, \

175 filter, \

176 bd); \

177 src += 16; \

178 dst += 16; \

179 w -= 16; \

180 } \

181 while (w >= 8) { \

182 vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \

183 src_stride, \

184 dst, \

185 dst_stride, \

186 h, \

187 filter, \

188 bd); \

189 src += 8; \

190 dst += 8; \

191 w -= 8; \

192 } \

193 while (w >= 4) { \

194 vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \

195 src_stride, \

196 dst, \

197 dst_stride, \

198 h, \

199 filter, \

200 bd); \

201 src += 4; \

202 dst += 4; \

203 w -= 4; \

204 } \

205 } else { \

206 while (w >= 16) { \

207 vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \

208 src_stride, \

209 dst, \

210 dst_stride, \

211 h, \

212 filter, \

213 bd); \

214 src += 16; \

215 dst += 16; \

216 w -= 16; \

217 } \

218 while (w >= 8) { \

219 vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \

220 src_stride, \

221 dst, \

222 dst_stride, \

223 h, \

224 filter, \

225 bd); \

226 src += 8; \

227 dst += 8; \

228 w -= 8; \

229 } \

230 while (w >= 4) { \

231 vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \

232 src_stride, \

233 dst, \

234 dst_stride, \

235 h, \

236 filter, \

237 bd); \

238 src += 4; \

239 dst += 4; \

240 w -= 4; \

241 } \

242 } \

243 } \

244 if (w) { \

245 vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \

246 filter_x, x_step_q4, filter_y, y_step_q4, \

247 w, h, bd); \

248 } \

249 }

250

251 #define HIGH_FUN_CONV_2D(avg, opt) \

252 void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \

253 uint8_t *dst, ptrdiff_t dst_stride, \

254 const int16_t *filter_x, int x_step_q4, \

255 const int16_t *filter_y, int y_step_q4, \

256 int w, int h, int bd) { \

257 assert(w <= 64); \

258 assert(h <= 64); \

259 if (x_step_q4 == 16 && y_step_q4 == 16) { \

260 if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \

261 filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \

262 DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \

263 vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \

264 CONVERT_TO_BYTEPTR(fdata2), 64, \

265 filter_x, x_step_q4, \

266 filter_y, y_step_q4, \

267 w, h + 7, bd); \

268 vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \

269 64, dst, dst_stride, \

270 filter_x, x_step_q4, \

271 filter_y, y_step_q4, \

272 w, h, bd); \

273 } else { \

274 DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \

275 vp9_highbd_convolve8_horiz_##opt(src, src_stride, \

276 CONVERT_TO_BYTEPTR(fdata2), 64, \

277 filter_x, x_step_q4, \

278 filter_y, y_step_q4, \

279 w, h + 1, bd); \

280 vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \

281 dst, dst_stride, \

282 filter_x, x_step_q4, \

283 filter_y, y_step_q4, \

284 w, h, bd); \

285 } \

286 } else { \

287 vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \

288 filter_x, x_step_q4, filter_y, y_step_q4, w, \

289 h, bd); \

290 } \

291 }

292 #endif // CONFIG_VP9_HIGHBITDEPTH

293

294 #if HAVE_AVX2 && HAVE_SSSE3

295 filter8_1dfunction vp9_filter_block1d16_v8_avx2;

296 filter8_1dfunction vp9_filter_block1d16_h8_avx2;

297 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;

298 #if ARCH_X86_64

299 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;

300 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;

301 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;

302 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3

303 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3

304 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3

305 #else // ARCH_X86

306 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;

307 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;

308 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;

309 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3

310 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3

311 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3

312 #endif // ARCH_X86_64 / ARCH_X86

313 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;

314 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;

315 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;

316 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;

317 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;

318 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;

319 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3

320 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3

321 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3

322 #define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3

323 #define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3

324 #define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3

325 #define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3

326 // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,

327 // uint8_t *dst, ptrdiff_t dst_stride,

328 // const int16_t *filter_x, int x_step_q4,

329 // const int16_t *filter_y, int y_step_q4,

330 // int w, int h);

331 // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,

332 // uint8_t *dst, ptrdiff_t dst_stride,

333 // const int16_t *filter_x, int x_step_q4,

334 // const int16_t *filter_y, int y_step_q4,

335 // int w, int h);

336 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);

337 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

338

339 // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,

340 // uint8_t *dst, ptrdiff_t dst_stride,

341 // const int16_t *filter_x, int x_step_q4,

342 // const int16_t *filter_y, int y_step_q4,

343 // int w, int h);

344 FUN_CONV_2D(, avx2);

345 #endif // HAVE_AVX2 && HAVE_SSSE3

346 #if HAVE_SSSE3

347 #if ARCH_X86_64

348 filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;

349 filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;

350 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;

351 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;

352 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;

353 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;

354 #define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3

355 #define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3

356 #define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3

357 #define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3

358 #define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3

359 #else // ARCH_X86

360 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;

361 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;

362 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;

363 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;

364 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;

365 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;

366 #endif // ARCH_X86_64 / ARCH_X86

367 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;

368 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;

369 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;

370 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;

371 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;

372 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;

373

374 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;

375 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;

376 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;

377 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;

378 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;

379 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;

380 filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;

381 filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;

382 filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;

383 filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;

384 filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;

385 filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;

386

387 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,

388 // uint8_t *dst, ptrdiff_t dst_stride,

389 // const int16_t *filter_x, int x_step_q4,

390 // const int16_t *filter_y, int y_step_q4,

391 // int w, int h);

392 // void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,

393 // uint8_t *dst, ptrdiff_t dst_stride,

394 // const int16_t *filter_x, int x_step_q4,

395 // const int16_t *filter_y, int y_step_q4,

396 // int w, int h);

397 // void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,

398 // uint8_t *dst, ptrdiff_t dst_stride,

399 // const int16_t *filter_x, int x_step_q4,

400 // const int16_t *filter_y, int y_step_q4,

401 // int w, int h);

402 // void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,

403 // uint8_t *dst, ptrdiff_t dst_stride,

404 // const int16_t *filter_x, int x_step_q4,

405 // const int16_t *filter_y, int y_step_q4,

406 // int w, int h);

407 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);

408 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);

409 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);

410 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,

411 ssse3);

412

413 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,

414 // uint8_t *dst, ptrdiff_t dst_stride,

415 // const int16_t *filter_x, int x_step_q4,

416 // const int16_t *filter_y, int y_step_q4,

417 // int w, int h);

418 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,

419 // uint8_t *dst, ptrdiff_t dst_stride,

420 // const int16_t *filter_x, int x_step_q4,

421 // const int16_t *filter_y, int y_step_q4,

422 // int w, int h);

423 FUN_CONV_2D(, ssse3);

424 FUN_CONV_2D(avg_ , ssse3);

425 #endif // HAVE_SSSE3

426	14

427 #if HAVE_SSE2	15 #if HAVE_SSE2

428 filter8_1dfunction vp9_filter_block1d16_v8_sse2;	16 filter8_1dfunction vp9_filter_block1d16_v8_sse2;

429 filter8_1dfunction vp9_filter_block1d16_h8_sse2;	17 filter8_1dfunction vp9_filter_block1d16_h8_sse2;

430 filter8_1dfunction vp9_filter_block1d8_v8_sse2;	18 filter8_1dfunction vp9_filter_block1d8_v8_sse2;

431 filter8_1dfunction vp9_filter_block1d8_h8_sse2;	19 filter8_1dfunction vp9_filter_block1d8_h8_sse2;

432 filter8_1dfunction vp9_filter_block1d4_v8_sse2;	20 filter8_1dfunction vp9_filter_block1d4_v8_sse2;

433 filter8_1dfunction vp9_filter_block1d4_h8_sse2;	21 filter8_1dfunction vp9_filter_block1d4_h8_sse2;

434 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;	22 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;

435 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;	23 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;

(...skipping 129 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
565 // int w, int h, int bd);	153 // int w, int h, int bd);

566 // void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,	154 // void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,

567 // uint8_t *dst, ptrdiff_t dst_stride,	155 // uint8_t *dst, ptrdiff_t dst_stride,

568 // const int16_t *filter_x, int x_step_q4,	156 // const int16_t *filter_x, int x_step_q4,

569 // const int16_t *filter_y, int y_step_q4,	157 // const int16_t *filter_y, int y_step_q4,

570 // int w, int h, int bd);	158 // int w, int h, int bd);

571 HIGH_FUN_CONV_2D(, sse2);	159 HIGH_FUN_CONV_2D(, sse2);

572 HIGH_FUN_CONV_2D(avg_ , sse2);	160 HIGH_FUN_CONV_2D(avg_ , sse2);

573 #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64	161 #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64

574 #endif // HAVE_SSE2	162 #endif // HAVE_SSE2

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/common/x86/convolve.h ('k') | source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c » ('j') | no next file with comments »