OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 18 matching lines...) Expand all Loading... |
29 { 64, 64, 64, 64, 64, 64, 64, 64 }, | 29 { 64, 64, 64, 64, 64, 64, 64, 64 }, |
30 { 56, 56, 56, 56, 72, 72, 72, 72 }, | 30 { 56, 56, 56, 56, 72, 72, 72, 72 }, |
31 { 48, 48, 48, 48, 80, 80, 80, 80 }, | 31 { 48, 48, 48, 48, 80, 80, 80, 80 }, |
32 { 40, 40, 40, 40, 88, 88, 88, 88 }, | 32 { 40, 40, 40, 40, 88, 88, 88, 88 }, |
33 { 32, 32, 32, 32, 96, 96, 96, 96 }, | 33 { 32, 32, 32, 32, 96, 96, 96, 96 }, |
34 { 24, 24, 24, 24, 104, 104, 104, 104 }, | 34 { 24, 24, 24, 24, 104, 104, 104, 104 }, |
35 { 16, 16, 16, 16, 112, 112, 112, 112 }, | 35 { 16, 16, 16, 16, 112, 112, 112, 112 }, |
36 { 8, 8, 8, 8, 120, 120, 120, 120 } | 36 { 8, 8, 8, 8, 120, 120, 120, 120 } |
37 }; | 37 }; |
38 | 38 |
/*
 * Shared signature of the one-dimensional block filter kernels declared in
 * this file (the vp9_filter_block1d* family).
 *
 *   src_ptr       - first source pixel the kernel reads.
 *   src_pitch     - distance between consecutive source rows.
 *   output_ptr    - first destination pixel the kernel writes.
 *   out_pitch     - distance between consecutive destination rows.
 *   output_height - number of rows to process (callers in this file pass the
 *                   block height).
 *   filter        - filter coefficients as 16-bit values (callers in this
 *                   file pass their filter_x / filter_y array).
 *
 * NOTE(review): pitches are presumably measured in bytes/pixels of 8-bit
 * data -- confirm against the kernel implementations.
 */
typedef void filter8_1dfunction(const unsigned char *src_ptr,
                                const unsigned int src_pitch,
                                unsigned char *output_ptr,
                                unsigned int out_pitch,
                                unsigned int output_height,
                                const short *filter);
| 47 |
39 #if HAVE_SSSE3 | 48 #if HAVE_SSSE3 |
40 void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, | 49 filter8_1dfunction vp9_filter_block1d16_v8_ssse3; |
41 const unsigned int src_pitch, | 50 filter8_1dfunction vp9_filter_block1d16_h8_ssse3; |
42 unsigned char *output_ptr, | 51 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; |
43 unsigned int out_pitch, | 52 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; |
44 unsigned int output_height, | 53 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
45 const short *filter); | 54 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; |
46 | 55 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; |
47 void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, | 56 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; |
48 const unsigned int src_pitch, | 57 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; |
49 unsigned char *output_ptr, | 58 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; |
50 unsigned int out_pitch, | 59 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; |
51 unsigned int output_height, | 60 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; |
52 const short *filter); | |
53 | |
54 void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, | |
55 const unsigned int src_pitch, | |
56 unsigned char *output_ptr, | |
57 unsigned int out_pitch, | |
58 unsigned int output_height, | |
59 const short *filter); | |
60 | |
61 void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, | |
62 const unsigned int src_pitch, | |
63 unsigned char *output_ptr, | |
64 unsigned int out_pitch, | |
65 unsigned int output_height, | |
66 const short *filter); | |
67 | |
68 void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr, | |
69 const unsigned int src_pitch, | |
70 unsigned char *output_ptr, | |
71 unsigned int out_pitch, | |
72 unsigned int output_height, | |
73 const short *filter); | |
74 | |
75 void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, | |
76 const unsigned int src_pitch, | |
77 unsigned char *output_ptr, | |
78 unsigned int out_pitch, | |
79 unsigned int output_height, | |
80 const short *filter); | |
81 | |
82 void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr, | |
83 const unsigned int src_pitch, | |
84 unsigned char *output_ptr, | |
85 unsigned int out_pitch, | |
86 unsigned int output_height, | |
87 const short *filter); | |
88 | |
89 void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr, | |
90 const unsigned int src_pitch, | |
91 unsigned char *output_ptr, | |
92 unsigned int out_pitch, | |
93 unsigned int output_height, | |
94 const short *filter); | |
95 | |
96 void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr, | |
97 const unsigned int src_pitch, | |
98 unsigned char *output_ptr, | |
99 unsigned int out_pitch, | |
100 unsigned int output_height, | |
101 const short *filter); | |
102 | |
103 void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr, | |
104 const unsigned int src_pitch, | |
105 unsigned char *output_ptr, | |
106 unsigned int out_pitch, | |
107 unsigned int output_height, | |
108 const short *filter); | |
109 | |
110 void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr, | |
111 const unsigned int src_pitch, | |
112 unsigned char *output_ptr, | |
113 unsigned int out_pitch, | |
114 unsigned int output_height, | |
115 const short *filter); | |
116 | |
117 void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, | |
118 const unsigned int src_pitch, | |
119 unsigned char *output_ptr, | |
120 unsigned int out_pitch, | |
121 unsigned int output_height, | |
122 const short *filter); | |
123 | 61 |
124 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 62 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
125 uint8_t *dst, ptrdiff_t dst_stride, | 63 uint8_t *dst, ptrdiff_t dst_stride, |
126 const int16_t *filter_x, int x_step_q4, | 64 const int16_t *filter_x, int x_step_q4, |
127 const int16_t *filter_y, int y_step_q4, | 65 const int16_t *filter_y, int y_step_q4, |
128 int w, int h) { | 66 int w, int h) { |
129 /* Ensure the filter can be compressed to int16_t. */ | 67 /* Ensure the filter can be compressed to int16_t. */ |
130 if (x_step_q4 == 16 && filter_x[3] != 128) { | 68 if (x_step_q4 == 16 && filter_x[3] != 128) { |
131 while (w >= 16) { | 69 while (w >= 16) { |
132 vp9_filter_block1d16_h8_ssse3(src, src_stride, | 70 vp9_filter_block1d16_h8_ssse3(src, src_stride, |
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
272 filter_x, x_step_q4, filter_y, y_step_q4, | 210 filter_x, x_step_q4, filter_y, y_step_q4, |
273 w, h); | 211 w, h); |
274 } | 212 } |
275 } | 213 } |
276 | 214 |
277 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 215 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
278 uint8_t *dst, ptrdiff_t dst_stride, | 216 uint8_t *dst, ptrdiff_t dst_stride, |
279 const int16_t *filter_x, int x_step_q4, | 217 const int16_t *filter_x, int x_step_q4, |
280 const int16_t *filter_y, int y_step_q4, | 218 const int16_t *filter_y, int y_step_q4, |
281 int w, int h) { | 219 int w, int h) { |
282 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71); | 220 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
283 | 221 |
284 assert(w <= 64); | 222 assert(w <= 64); |
285 assert(h <= 64); | 223 assert(h <= 64); |
286 if (x_step_q4 == 16 && y_step_q4 == 16) { | 224 if (x_step_q4 == 16 && y_step_q4 == 16) { |
287 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, | 225 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, |
288 filter_x, x_step_q4, filter_y, y_step_q4, | 226 filter_x, x_step_q4, filter_y, y_step_q4, |
289 w, h + 7); | 227 w, h + 7); |
290 vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, | 228 vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, |
291 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | 229 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
292 } else { | 230 } else { |
293 vp9_convolve8_c(src, src_stride, dst, dst_stride, | 231 vp9_convolve8_c(src, src_stride, dst, dst_stride, |
294 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | 232 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
295 } | 233 } |
296 } | 234 } |
297 | 235 |
298 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 236 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
299 uint8_t *dst, ptrdiff_t dst_stride, | 237 uint8_t *dst, ptrdiff_t dst_stride, |
300 const int16_t *filter_x, int x_step_q4, | 238 const int16_t *filter_x, int x_step_q4, |
301 const int16_t *filter_y, int y_step_q4, | 239 const int16_t *filter_y, int y_step_q4, |
302 int w, int h) { | 240 int w, int h) { |
303 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71); | 241 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
304 | 242 |
305 assert(w <= 64); | 243 assert(w <= 64); |
306 assert(h <= 64); | 244 assert(h <= 64); |
307 if (x_step_q4 == 16 && y_step_q4 == 16) { | 245 if (x_step_q4 == 16 && y_step_q4 == 16) { |
308 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, | 246 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, |
309 filter_x, x_step_q4, filter_y, y_step_q4, | 247 filter_x, x_step_q4, filter_y, y_step_q4, |
310 w, h + 7); | 248 w, h + 7); |
311 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, | 249 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, |
312 filter_x, x_step_q4, filter_y, y_step_q4, | 250 filter_x, x_step_q4, filter_y, y_step_q4, |
313 w, h); | 251 w, h); |
314 } else { | 252 } else { |
315 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, | 253 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |
316 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | 254 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
317 } | 255 } |
318 } | 256 } |
319 #endif | 257 #endif |
| 258 |
| 259 #if HAVE_SSE2 |
| 260 filter8_1dfunction vp9_filter_block1d16_v8_sse2; |
| 261 filter8_1dfunction vp9_filter_block1d16_h8_sse2; |
| 262 filter8_1dfunction vp9_filter_block1d8_v8_sse2; |
| 263 filter8_1dfunction vp9_filter_block1d8_h8_sse2; |
| 264 filter8_1dfunction vp9_filter_block1d4_v8_sse2; |
| 265 filter8_1dfunction vp9_filter_block1d4_h8_sse2; |
| 266 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; |
| 267 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; |
| 268 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; |
| 269 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; |
| 270 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; |
| 271 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; |
| 272 |
/*
 * Horizontal convolution, SSE2 entry point.
 *
 * The SSE2 kernels are used only when x_step_q4 is 16 and filter_x[3] is not
 * 128. In that case the block is consumed left to right in column strips of
 * 16, then 8, then 4. Whatever width is left afterwards (the whole block if
 * the fast path was not taken, otherwise fewer than 4 columns) is handed to
 * vp9_convolve8_horiz_c().
 *
 *   src, src_stride   - source block and its row stride.
 *   dst, dst_stride   - destination block and its row stride.
 *   filter_x, x_step_q4 - horizontal filter coefficients and step.
 *   filter_y, y_step_q4 - only forwarded to the C fallback.
 *   w, h              - block width and height.
 */
void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  /* Ensure the filter can be compressed to int16_t. */
  if (x_step_q4 == 16 && filter_x[3] != 128) {
    while (w >= 16) {
      vp9_filter_block1d16_h8_sse2(src, src_stride,
                                   dst, dst_stride,
                                   h, filter_x);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_h8_sse2(src, src_stride,
                                  dst, dst_stride,
                                  h, filter_x);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_h8_sse2(src, src_stride,
                                  dst, dst_stride,
                                  h, filter_x);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  /* src/dst already point past the columns the kernels processed. */
  if (w) {
    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4,
                          w, h);
  }
}
| 311 |
/*
 * Vertical convolution, SSE2 entry point.
 *
 * The SSE2 kernels are used only when y_step_q4 is 16 and filter_y[3] is not
 * 128. In that case the block is consumed left to right in column strips of
 * 16, then 8, then 4. Whatever width is left afterwards (the whole block if
 * the fast path was not taken, otherwise fewer than 4 columns) is handed to
 * vp9_convolve8_vert_c().
 *
 *   src, src_stride   - source block and its row stride.
 *   dst, dst_stride   - destination block and its row stride.
 *   filter_x, x_step_q4 - only forwarded to the C fallback.
 *   filter_y, y_step_q4 - vertical filter coefficients and step.
 *   w, h              - block width and height.
 */
void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  if (y_step_q4 == 16 && filter_y[3] != 128) {
    /* The kernels are handed a pointer 3 rows above the block's first row. */
    while (w >= 16) {
      vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
                                   dst, dst_stride,
                                   h, filter_y);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
                                  dst, dst_stride,
                                  h, filter_y);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
                                  dst, dst_stride,
                                  h, filter_y);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  /* src/dst already point past the columns the kernels processed. */
  if (w) {
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4,
                         w, h);
  }
}
| 349 |
/*
 * Horizontal "avg" convolution, SSE2 entry point.
 *
 * The SSE2 _avg kernels are used only when x_step_q4 is 16 and filter_x[3]
 * is not 128. In that case the block is consumed left to right in column
 * strips of 16, then 8, then 4. Whatever width is left afterwards (the whole
 * block if the fast path was not taken, otherwise fewer than 4 columns) is
 * handed to vp9_convolve8_avg_horiz_c().
 *
 *   src, src_stride   - source block and its row stride.
 *   dst, dst_stride   - destination block and its row stride.
 *   filter_x, x_step_q4 - horizontal filter coefficients and step.
 *   filter_y, y_step_q4 - only forwarded to the C fallback.
 *   w, h              - block width and height.
 */
void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  if (x_step_q4 == 16 && filter_x[3] != 128) {
    while (w >= 16) {
      vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
                                       dst, dst_stride,
                                       h, filter_x);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
                                      dst, dst_stride,
                                      h, filter_x);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
                                      dst, dst_stride,
                                      h, filter_x);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  /* src/dst already point past the columns the kernels processed. */
  if (w) {
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
  }
}
| 387 |
/*
 * Vertical "avg" convolution, SSE2 entry point.
 *
 * The SSE2 _avg kernels are used only when y_step_q4 is 16 and filter_y[3]
 * is not 128. In that case the block is consumed left to right in column
 * strips of 16, then 8, then 4. Whatever width is left afterwards (the whole
 * block if the fast path was not taken, otherwise fewer than 4 columns) is
 * handed to vp9_convolve8_avg_vert_c().
 *
 *   src, src_stride   - source block and its row stride.
 *   dst, dst_stride   - destination block and its row stride.
 *   filter_x, x_step_q4 - only forwarded to the C fallback.
 *   filter_y, y_step_q4 - vertical filter coefficients and step.
 *   w, h              - block width and height.
 */
void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h) {
  if (y_step_q4 == 16 && filter_y[3] != 128) {
    /* The kernels are handed a pointer 3 rows above the block's first row. */
    while (w >= 16) {
      vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
                                       dst, dst_stride,
                                       h, filter_y);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
                                      dst, dst_stride,
                                      h, filter_y);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
                                      dst, dst_stride,
                                      h, filter_y);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  /* src/dst already point past the columns the kernels processed. */
  if (w) {
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
  }
}
| 425 |
| 426 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, |
| 427 uint8_t *dst, ptrdiff_t dst_stride, |
| 428 const int16_t *filter_x, int x_step_q4, |
| 429 const int16_t *filter_y, int y_step_q4, |
| 430 int w, int h) { |
| 431 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
| 432 |
| 433 assert(w <= 64); |
| 434 assert(h <= 64); |
| 435 if (x_step_q4 == 16 && y_step_q4 == 16) { |
| 436 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, |
| 437 filter_x, x_step_q4, filter_y, y_step_q4, |
| 438 w, h + 7); |
| 439 vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, |
| 440 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
| 441 } else { |
| 442 vp9_convolve8_c(src, src_stride, dst, dst_stride, |
| 443 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
| 444 } |
| 445 } |
| 446 |
| 447 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, |
| 448 uint8_t *dst, ptrdiff_t dst_stride, |
| 449 const int16_t *filter_x, int x_step_q4, |
| 450 const int16_t *filter_y, int y_step_q4, |
| 451 int w, int h) { |
| 452 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
| 453 |
| 454 assert(w <= 64); |
| 455 assert(h <= 64); |
| 456 if (x_step_q4 == 16 && y_step_q4 == 16) { |
| 457 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, |
| 458 filter_x, x_step_q4, filter_y, y_step_q4, |
| 459 w, h + 7); |
| 460 vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, |
| 461 filter_x, x_step_q4, filter_y, y_step_q4, |
| 462 w, h); |
| 463 } else { |
| 464 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |
| 465 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
| 466 } |
| 467 } |
| 468 #endif |
OLD | NEW |