Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(105)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_asm_stubs.c

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <assert.h> 11 #include "./vp9_rtcd.h"
12
13 #include "./vpx_config.h" 12 #include "./vpx_config.h"
14 #include "./vp9_rtcd.h" 13 #include "vp9/common/x86/convolve.h"
15 #include "vpx_ports/mem.h"
16
17 typedef void filter8_1dfunction (
18 const unsigned char *src_ptr,
19 const ptrdiff_t src_pitch,
20 unsigned char *output_ptr,
21 ptrdiff_t out_pitch,
22 unsigned int output_height,
23 const short *filter
24 );
25
26 #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
27 void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
28 uint8_t *dst, ptrdiff_t dst_stride, \
29 const int16_t *filter_x, int x_step_q4, \
30 const int16_t *filter_y, int y_step_q4, \
31 int w, int h) { \
32 if (step_q4 == 16 && filter[3] != 128) { \
33 if (filter[0] || filter[1] || filter[2]) { \
34 while (w >= 16) { \
35 vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
36 src_stride, \
37 dst, \
38 dst_stride, \
39 h, \
40 filter); \
41 src += 16; \
42 dst += 16; \
43 w -= 16; \
44 } \
45 while (w >= 8) { \
46 vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
47 src_stride, \
48 dst, \
49 dst_stride, \
50 h, \
51 filter); \
52 src += 8; \
53 dst += 8; \
54 w -= 8; \
55 } \
56 while (w >= 4) { \
57 vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
58 src_stride, \
59 dst, \
60 dst_stride, \
61 h, \
62 filter); \
63 src += 4; \
64 dst += 4; \
65 w -= 4; \
66 } \
67 } else { \
68 while (w >= 16) { \
69 vp9_filter_block1d16_##dir##2_##avg##opt(src, \
70 src_stride, \
71 dst, \
72 dst_stride, \
73 h, \
74 filter); \
75 src += 16; \
76 dst += 16; \
77 w -= 16; \
78 } \
79 while (w >= 8) { \
80 vp9_filter_block1d8_##dir##2_##avg##opt(src, \
81 src_stride, \
82 dst, \
83 dst_stride, \
84 h, \
85 filter); \
86 src += 8; \
87 dst += 8; \
88 w -= 8; \
89 } \
90 while (w >= 4) { \
91 vp9_filter_block1d4_##dir##2_##avg##opt(src, \
92 src_stride, \
93 dst, \
94 dst_stride, \
95 h, \
96 filter); \
97 src += 4; \
98 dst += 4; \
99 w -= 4; \
100 } \
101 } \
102 } \
103 if (w) { \
104 vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
105 filter_x, x_step_q4, filter_y, y_step_q4, \
106 w, h); \
107 } \
108 }
109
110 #define FUN_CONV_2D(avg, opt) \
111 void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
112 uint8_t *dst, ptrdiff_t dst_stride, \
113 const int16_t *filter_x, int x_step_q4, \
114 const int16_t *filter_y, int y_step_q4, \
115 int w, int h) { \
116 assert(w <= 64); \
117 assert(h <= 64); \
118 if (x_step_q4 == 16 && y_step_q4 == 16) { \
119 if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
120 filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
121 DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]); \
122 vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
123 filter_x, x_step_q4, filter_y, y_step_q4, \
124 w, h + 7); \
125 vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
126 filter_x, x_step_q4, filter_y, \
127 y_step_q4, w, h); \
128 } else { \
129 DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 65]); \
130 vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
131 filter_x, x_step_q4, filter_y, y_step_q4, \
132 w, h + 1); \
133 vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
134 filter_x, x_step_q4, filter_y, \
135 y_step_q4, w, h); \
136 } \
137 } else { \
138 vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
139 filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
140 } \
141 }
142
143 #if CONFIG_VP9_HIGHBITDEPTH
144
145 typedef void highbd_filter8_1dfunction (
146 const uint16_t *src_ptr,
147 const ptrdiff_t src_pitch,
148 uint16_t *output_ptr,
149 ptrdiff_t out_pitch,
150 unsigned int output_height,
151 const int16_t *filter,
152 int bd
153 );
154
155 #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
156 void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
157 ptrdiff_t src_stride, \
158 uint8_t *dst8, \
159 ptrdiff_t dst_stride, \
160 const int16_t *filter_x, \
161 int x_step_q4, \
162 const int16_t *filter_y, \
163 int y_step_q4, \
164 int w, int h, int bd) { \
165 if (step_q4 == 16 && filter[3] != 128) { \
166 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
167 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
168 if (filter[0] || filter[1] || filter[2]) { \
169 while (w >= 16) { \
170 vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
171 src_stride, \
172 dst, \
173 dst_stride, \
174 h, \
175 filter, \
176 bd); \
177 src += 16; \
178 dst += 16; \
179 w -= 16; \
180 } \
181 while (w >= 8) { \
182 vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
183 src_stride, \
184 dst, \
185 dst_stride, \
186 h, \
187 filter, \
188 bd); \
189 src += 8; \
190 dst += 8; \
191 w -= 8; \
192 } \
193 while (w >= 4) { \
194 vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
195 src_stride, \
196 dst, \
197 dst_stride, \
198 h, \
199 filter, \
200 bd); \
201 src += 4; \
202 dst += 4; \
203 w -= 4; \
204 } \
205 } else { \
206 while (w >= 16) { \
207 vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
208 src_stride, \
209 dst, \
210 dst_stride, \
211 h, \
212 filter, \
213 bd); \
214 src += 16; \
215 dst += 16; \
216 w -= 16; \
217 } \
218 while (w >= 8) { \
219 vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
220 src_stride, \
221 dst, \
222 dst_stride, \
223 h, \
224 filter, \
225 bd); \
226 src += 8; \
227 dst += 8; \
228 w -= 8; \
229 } \
230 while (w >= 4) { \
231 vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
232 src_stride, \
233 dst, \
234 dst_stride, \
235 h, \
236 filter, \
237 bd); \
238 src += 4; \
239 dst += 4; \
240 w -= 4; \
241 } \
242 } \
243 } \
244 if (w) { \
245 vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
246 filter_x, x_step_q4, filter_y, y_step_q4, \
247 w, h, bd); \
248 } \
249 }
250
251 #define HIGH_FUN_CONV_2D(avg, opt) \
252 void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
253 uint8_t *dst, ptrdiff_t dst_stride, \
254 const int16_t *filter_x, int x_step_q4, \
255 const int16_t *filter_y, int y_step_q4, \
256 int w, int h, int bd) { \
257 assert(w <= 64); \
258 assert(h <= 64); \
259 if (x_step_q4 == 16 && y_step_q4 == 16) { \
260 if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
261 filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
262 DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
263 vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
264 CONVERT_TO_BYTEPTR(fdata2), 64, \
265 filter_x, x_step_q4, \
266 filter_y, y_step_q4, \
267 w, h + 7, bd); \
268 vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
269 64, dst, dst_stride, \
270 filter_x, x_step_q4, \
271 filter_y, y_step_q4, \
272 w, h, bd); \
273 } else { \
274 DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
275 vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
276 CONVERT_TO_BYTEPTR(fdata2), 64, \
277 filter_x, x_step_q4, \
278 filter_y, y_step_q4, \
279 w, h + 1, bd); \
280 vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
281 dst, dst_stride, \
282 filter_x, x_step_q4, \
283 filter_y, y_step_q4, \
284 w, h, bd); \
285 } \
286 } else { \
287 vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
288 filter_x, x_step_q4, filter_y, y_step_q4, w, \
289 h, bd); \
290 } \
291 }
292 #endif // CONFIG_VP9_HIGHBITDEPTH
293
294 #if HAVE_AVX2 && HAVE_SSSE3
295 filter8_1dfunction vp9_filter_block1d16_v8_avx2;
296 filter8_1dfunction vp9_filter_block1d16_h8_avx2;
297 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
298 #if ARCH_X86_64
299 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
300 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
301 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
302 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
303 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
304 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
305 #else // ARCH_X86
306 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
307 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
308 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
309 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
310 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
311 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
312 #endif // ARCH_X86_64 / ARCH_X86
313 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
314 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
315 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
316 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
317 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
318 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
319 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
320 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
321 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
322 #define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
323 #define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
324 #define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
325 #define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
326 // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
327 // uint8_t *dst, ptrdiff_t dst_stride,
328 // const int16_t *filter_x, int x_step_q4,
329 // const int16_t *filter_y, int y_step_q4,
330 // int w, int h);
331 // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
332 // uint8_t *dst, ptrdiff_t dst_stride,
333 // const int16_t *filter_x, int x_step_q4,
334 // const int16_t *filter_y, int y_step_q4,
335 // int w, int h);
336 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
337 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
338
339 // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
340 // uint8_t *dst, ptrdiff_t dst_stride,
341 // const int16_t *filter_x, int x_step_q4,
342 // const int16_t *filter_y, int y_step_q4,
343 // int w, int h);
344 FUN_CONV_2D(, avx2);
345 #endif // HAVE_AVX2 && HAVE_SSSE3
346 #if HAVE_SSSE3
347 #if ARCH_X86_64
348 filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
349 filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
350 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
351 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
352 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
353 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
354 #define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
355 #define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
356 #define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
357 #define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
358 #define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
359 #else // ARCH_X86
360 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
361 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
362 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
363 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
364 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
365 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
366 #endif // ARCH_X86_64 / ARCH_X86
367 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
368 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
369 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
370 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
371 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
372 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
373
374 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
375 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
376 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
377 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
378 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
379 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
380 filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
381 filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
382 filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
383 filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
384 filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
385 filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
386
387 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
388 // uint8_t *dst, ptrdiff_t dst_stride,
389 // const int16_t *filter_x, int x_step_q4,
390 // const int16_t *filter_y, int y_step_q4,
391 // int w, int h);
392 // void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
393 // uint8_t *dst, ptrdiff_t dst_stride,
394 // const int16_t *filter_x, int x_step_q4,
395 // const int16_t *filter_y, int y_step_q4,
396 // int w, int h);
397 // void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
398 // uint8_t *dst, ptrdiff_t dst_stride,
399 // const int16_t *filter_x, int x_step_q4,
400 // const int16_t *filter_y, int y_step_q4,
401 // int w, int h);
402 // void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
403 // uint8_t *dst, ptrdiff_t dst_stride,
404 // const int16_t *filter_x, int x_step_q4,
405 // const int16_t *filter_y, int y_step_q4,
406 // int w, int h);
407 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
408 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
409 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
410 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
411 ssse3);
412
413 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
414 // uint8_t *dst, ptrdiff_t dst_stride,
415 // const int16_t *filter_x, int x_step_q4,
416 // const int16_t *filter_y, int y_step_q4,
417 // int w, int h);
418 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
419 // uint8_t *dst, ptrdiff_t dst_stride,
420 // const int16_t *filter_x, int x_step_q4,
421 // const int16_t *filter_y, int y_step_q4,
422 // int w, int h);
423 FUN_CONV_2D(, ssse3);
424 FUN_CONV_2D(avg_ , ssse3);
425 #endif // HAVE_SSSE3
426 14
427 #if HAVE_SSE2 15 #if HAVE_SSE2
428 filter8_1dfunction vp9_filter_block1d16_v8_sse2; 16 filter8_1dfunction vp9_filter_block1d16_v8_sse2;
429 filter8_1dfunction vp9_filter_block1d16_h8_sse2; 17 filter8_1dfunction vp9_filter_block1d16_h8_sse2;
430 filter8_1dfunction vp9_filter_block1d8_v8_sse2; 18 filter8_1dfunction vp9_filter_block1d8_v8_sse2;
431 filter8_1dfunction vp9_filter_block1d8_h8_sse2; 19 filter8_1dfunction vp9_filter_block1d8_h8_sse2;
432 filter8_1dfunction vp9_filter_block1d4_v8_sse2; 20 filter8_1dfunction vp9_filter_block1d4_v8_sse2;
433 filter8_1dfunction vp9_filter_block1d4_h8_sse2; 21 filter8_1dfunction vp9_filter_block1d4_h8_sse2;
434 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; 22 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
435 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; 23 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
(...skipping 129 matching lines...) Expand 10 before | Expand all | Expand 10 after
565 // int w, int h, int bd); 153 // int w, int h, int bd);
566 // void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, 154 // void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
567 // uint8_t *dst, ptrdiff_t dst_stride, 155 // uint8_t *dst, ptrdiff_t dst_stride,
568 // const int16_t *filter_x, int x_step_q4, 156 // const int16_t *filter_x, int x_step_q4,
569 // const int16_t *filter_y, int y_step_q4, 157 // const int16_t *filter_y, int y_step_q4,
570 // int w, int h, int bd); 158 // int w, int h, int bd);
571 HIGH_FUN_CONV_2D(, sse2); 159 HIGH_FUN_CONV_2D(, sse2);
572 HIGH_FUN_CONV_2D(avg_ , sse2); 160 HIGH_FUN_CONV_2D(avg_ , sse2);
573 #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 161 #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
574 #endif // HAVE_SSE2 162 #endif // HAVE_SSE2
OLD | NEW
« no previous file with comments | « source/libvpx/vp9/common/x86/convolve.h ('k') | source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698