| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 18 matching lines...) Expand all Loading... |
| 29 { 64, 64, 64, 64, 64, 64, 64, 64 }, | 29 { 64, 64, 64, 64, 64, 64, 64, 64 }, |
| 30 { 56, 56, 56, 56, 72, 72, 72, 72 }, | 30 { 56, 56, 56, 56, 72, 72, 72, 72 }, |
| 31 { 48, 48, 48, 48, 80, 80, 80, 80 }, | 31 { 48, 48, 48, 48, 80, 80, 80, 80 }, |
| 32 { 40, 40, 40, 40, 88, 88, 88, 88 }, | 32 { 40, 40, 40, 40, 88, 88, 88, 88 }, |
| 33 { 32, 32, 32, 32, 96, 96, 96, 96 }, | 33 { 32, 32, 32, 32, 96, 96, 96, 96 }, |
| 34 { 24, 24, 24, 24, 104, 104, 104, 104 }, | 34 { 24, 24, 24, 24, 104, 104, 104, 104 }, |
| 35 { 16, 16, 16, 16, 112, 112, 112, 112 }, | 35 { 16, 16, 16, 16, 112, 112, 112, 112 }, |
| 36 { 8, 8, 8, 8, 120, 120, 120, 120 } | 36 { 8, 8, 8, 8, 120, 120, 120, 120 } |
| 37 }; | 37 }; |
| 38 | 38 |
/*
 * Common signature of the 1-D filter kernels declared in this file.
 *
 * The wrappers in this file call these kernels as
 *   kernel(src, src_stride, dst, dst_stride, h, filter)
 * so src_pitch/out_pitch receive the source/destination strides,
 * output_height receives the number of rows to produce, and filter receives
 * the coefficient table (filter_x or filter_y).
 */
typedef void filter8_1dfunction(const unsigned char *src_ptr,
                                const unsigned int src_pitch,
                                unsigned char *output_ptr,
                                unsigned int out_pitch,
                                unsigned int output_height,
                                const short *filter);
| 47 |
| 39 #if HAVE_SSSE3 | 48 #if HAVE_SSSE3 |
| 40 void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, | 49 filter8_1dfunction vp9_filter_block1d16_v8_ssse3; |
| 41 const unsigned int src_pitch, | 50 filter8_1dfunction vp9_filter_block1d16_h8_ssse3; |
| 42 unsigned char *output_ptr, | 51 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; |
| 43 unsigned int out_pitch, | 52 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; |
| 44 unsigned int output_height, | 53 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
| 45 const short *filter); | 54 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; |
| 46 | 55 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; |
| 47 void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, | 56 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; |
| 48 const unsigned int src_pitch, | 57 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; |
| 49 unsigned char *output_ptr, | 58 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; |
| 50 unsigned int out_pitch, | 59 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; |
| 51 unsigned int output_height, | 60 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; |
| 52 const short *filter); | |
| 53 | |
| 54 void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, | |
| 55 const unsigned int src_pitch, | |
| 56 unsigned char *output_ptr, | |
| 57 unsigned int out_pitch, | |
| 58 unsigned int output_height, | |
| 59 const short *filter); | |
| 60 | |
| 61 void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, | |
| 62 const unsigned int src_pitch, | |
| 63 unsigned char *output_ptr, | |
| 64 unsigned int out_pitch, | |
| 65 unsigned int output_height, | |
| 66 const short *filter); | |
| 67 | |
| 68 void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr, | |
| 69 const unsigned int src_pitch, | |
| 70 unsigned char *output_ptr, | |
| 71 unsigned int out_pitch, | |
| 72 unsigned int output_height, | |
| 73 const short *filter); | |
| 74 | |
| 75 void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, | |
| 76 const unsigned int src_pitch, | |
| 77 unsigned char *output_ptr, | |
| 78 unsigned int out_pitch, | |
| 79 unsigned int output_height, | |
| 80 const short *filter); | |
| 81 | |
| 82 void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr, | |
| 83 const unsigned int src_pitch, | |
| 84 unsigned char *output_ptr, | |
| 85 unsigned int out_pitch, | |
| 86 unsigned int output_height, | |
| 87 const short *filter); | |
| 88 | |
| 89 void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr, | |
| 90 const unsigned int src_pitch, | |
| 91 unsigned char *output_ptr, | |
| 92 unsigned int out_pitch, | |
| 93 unsigned int output_height, | |
| 94 const short *filter); | |
| 95 | |
| 96 void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr, | |
| 97 const unsigned int src_pitch, | |
| 98 unsigned char *output_ptr, | |
| 99 unsigned int out_pitch, | |
| 100 unsigned int output_height, | |
| 101 const short *filter); | |
| 102 | |
| 103 void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr, | |
| 104 const unsigned int src_pitch, | |
| 105 unsigned char *output_ptr, | |
| 106 unsigned int out_pitch, | |
| 107 unsigned int output_height, | |
| 108 const short *filter); | |
| 109 | |
| 110 void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr, | |
| 111 const unsigned int src_pitch, | |
| 112 unsigned char *output_ptr, | |
| 113 unsigned int out_pitch, | |
| 114 unsigned int output_height, | |
| 115 const short *filter); | |
| 116 | |
| 117 void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, | |
| 118 const unsigned int src_pitch, | |
| 119 unsigned char *output_ptr, | |
| 120 unsigned int out_pitch, | |
| 121 unsigned int output_height, | |
| 122 const short *filter); | |
| 123 | 61 |
| 124 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 62 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 125 uint8_t *dst, ptrdiff_t dst_stride, | 63 uint8_t *dst, ptrdiff_t dst_stride, |
| 126 const int16_t *filter_x, int x_step_q4, | 64 const int16_t *filter_x, int x_step_q4, |
| 127 const int16_t *filter_y, int y_step_q4, | 65 const int16_t *filter_y, int y_step_q4, |
| 128 int w, int h) { | 66 int w, int h) { |
| 129 /* Ensure the filter can be compressed to int16_t. */ | 67 /* Ensure the filter can be compressed to int16_t. */ |
| 130 if (x_step_q4 == 16 && filter_x[3] != 128) { | 68 if (x_step_q4 == 16 && filter_x[3] != 128) { |
| 131 while (w >= 16) { | 69 while (w >= 16) { |
| 132 vp9_filter_block1d16_h8_ssse3(src, src_stride, | 70 vp9_filter_block1d16_h8_ssse3(src, src_stride, |
| (...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 272 filter_x, x_step_q4, filter_y, y_step_q4, | 210 filter_x, x_step_q4, filter_y, y_step_q4, |
| 273 w, h); | 211 w, h); |
| 274 } | 212 } |
| 275 } | 213 } |
| 276 | 214 |
| 277 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 215 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 278 uint8_t *dst, ptrdiff_t dst_stride, | 216 uint8_t *dst, ptrdiff_t dst_stride, |
| 279 const int16_t *filter_x, int x_step_q4, | 217 const int16_t *filter_x, int x_step_q4, |
| 280 const int16_t *filter_y, int y_step_q4, | 218 const int16_t *filter_y, int y_step_q4, |
| 281 int w, int h) { | 219 int w, int h) { |
| 282 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71); | 220 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
| 283 | 221 |
| 284 assert(w <= 64); | 222 assert(w <= 64); |
| 285 assert(h <= 64); | 223 assert(h <= 64); |
| 286 if (x_step_q4 == 16 && y_step_q4 == 16) { | 224 if (x_step_q4 == 16 && y_step_q4 == 16) { |
| 287 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, | 225 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, |
| 288 filter_x, x_step_q4, filter_y, y_step_q4, | 226 filter_x, x_step_q4, filter_y, y_step_q4, |
| 289 w, h + 7); | 227 w, h + 7); |
| 290 vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, | 228 vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, |
| 291 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | 229 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
| 292 } else { | 230 } else { |
| 293 vp9_convolve8_c(src, src_stride, dst, dst_stride, | 231 vp9_convolve8_c(src, src_stride, dst, dst_stride, |
| 294 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | 232 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
| 295 } | 233 } |
| 296 } | 234 } |
| 297 | 235 |
| 298 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 236 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 299 uint8_t *dst, ptrdiff_t dst_stride, | 237 uint8_t *dst, ptrdiff_t dst_stride, |
| 300 const int16_t *filter_x, int x_step_q4, | 238 const int16_t *filter_x, int x_step_q4, |
| 301 const int16_t *filter_y, int y_step_q4, | 239 const int16_t *filter_y, int y_step_q4, |
| 302 int w, int h) { | 240 int w, int h) { |
| 303 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71); | 241 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
| 304 | 242 |
| 305 assert(w <= 64); | 243 assert(w <= 64); |
| 306 assert(h <= 64); | 244 assert(h <= 64); |
| 307 if (x_step_q4 == 16 && y_step_q4 == 16) { | 245 if (x_step_q4 == 16 && y_step_q4 == 16) { |
| 308 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, | 246 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, |
| 309 filter_x, x_step_q4, filter_y, y_step_q4, | 247 filter_x, x_step_q4, filter_y, y_step_q4, |
| 310 w, h + 7); | 248 w, h + 7); |
| 311 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, | 249 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, |
| 312 filter_x, x_step_q4, filter_y, y_step_q4, | 250 filter_x, x_step_q4, filter_y, y_step_q4, |
| 313 w, h); | 251 w, h); |
| 314 } else { | 252 } else { |
| 315 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, | 253 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |
| 316 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | 254 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
| 317 } | 255 } |
| 318 } | 256 } |
| 319 #endif | 257 #endif |
| 258 |
| 259 #if HAVE_SSE2 |
| 260 filter8_1dfunction vp9_filter_block1d16_v8_sse2; |
| 261 filter8_1dfunction vp9_filter_block1d16_h8_sse2; |
| 262 filter8_1dfunction vp9_filter_block1d8_v8_sse2; |
| 263 filter8_1dfunction vp9_filter_block1d8_h8_sse2; |
| 264 filter8_1dfunction vp9_filter_block1d4_v8_sse2; |
| 265 filter8_1dfunction vp9_filter_block1d4_h8_sse2; |
| 266 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; |
| 267 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; |
| 268 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; |
| 269 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; |
| 270 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; |
| 271 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; |
| 272 |
/*
 * Horizontal convolution, SSE2 path.
 *
 * The SSE2 kernels are used only when x_step_q4 is 16 and filter_x[3] is not
 * 128. The block is then consumed left to right in 16-, 8- and 4-column
 * strips. Whatever width is left over (all of it when the fast path is not
 * taken) is handed to vp9_convolve8_horiz_c().
 *
 * NOTE(review): the previous comment said the check ensures the filter "can
 * be compressed to int16_t", but filter_x is already int16_t -- confirm
 * which narrower representation the kernels actually rely on.
 */
void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  /* && short-circuits, so filter_x[3] is only read when the step is 16. */
  const int simd_ok = (x_step_q4 == 16) && (filter_x[3] != 128);

  if (simd_ok) {
    for (; w >= 16; w -= 16, src += 16, dst += 16) {
      vp9_filter_block1d16_h8_sse2(src, src_stride, dst, dst_stride,
                                   h, filter_x);
    }
    for (; w >= 8; w -= 8, src += 8, dst += 8) {
      vp9_filter_block1d8_h8_sse2(src, src_stride, dst, dst_stride,
                                  h, filter_x);
    }
    for (; w >= 4; w -= 4, src += 4, dst += 4) {
      vp9_filter_block1d4_h8_sse2(src, src_stride, dst, dst_stride,
                                  h, filter_x);
    }
  }

  /* Remaining columns go through the C implementation. */
  if (w != 0) {
    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4,
                          w, h);
  }
}
| 311 |
/*
 * Vertical convolution, SSE2 path.
 *
 * The SSE2 kernels are used only when y_step_q4 is 16 and filter_y[3] is not
 * 128. Each kernel is given a source pointer 3 rows above the current
 * position. The block is consumed in 16-, 8- and 4-column strips, and any
 * leftover width (all of it when the fast path is not taken) is handed to
 * vp9_convolve8_vert_c().
 */
void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* && short-circuits, so filter_y[3] is only read when the step is 16. */
  const int simd_ok = (y_step_q4 == 16) && (filter_y[3] != 128);

  if (simd_ok) {
    for (; w >= 16; w -= 16, src += 16, dst += 16) {
      const uint8_t *const top = src - src_stride * 3;
      vp9_filter_block1d16_v8_sse2(top, src_stride, dst, dst_stride,
                                   h, filter_y);
    }
    for (; w >= 8; w -= 8, src += 8, dst += 8) {
      const uint8_t *const top = src - src_stride * 3;
      vp9_filter_block1d8_v8_sse2(top, src_stride, dst, dst_stride,
                                  h, filter_y);
    }
    for (; w >= 4; w -= 4, src += 4, dst += 4) {
      const uint8_t *const top = src - src_stride * 3;
      vp9_filter_block1d4_v8_sse2(top, src_stride, dst, dst_stride,
                                  h, filter_y);
    }
  }

  /* Remaining columns go through the C implementation (src not offset). */
  if (w != 0) {
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4,
                         w, h);
  }
}
| 349 |
/*
 * Horizontal convolution, `_avg` flavour, SSE2 path.
 *
 * Same dispatch as the non-avg horizontal wrapper, but using the `_avg`
 * kernels (presumably these combine the filtered result with what is
 * already in dst -- confirm against the kernel sources). The SSE2 kernels
 * are used only when x_step_q4 is 16 and filter_x[3] is not 128. Leftover
 * width is handed to vp9_convolve8_avg_horiz_c().
 */
void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  /* && short-circuits, so filter_x[3] is only read when the step is 16. */
  const int simd_ok = (x_step_q4 == 16) && (filter_x[3] != 128);

  if (simd_ok) {
    for (; w >= 16; w -= 16, src += 16, dst += 16) {
      vp9_filter_block1d16_h8_avg_sse2(src, src_stride, dst, dst_stride,
                                       h, filter_x);
    }
    for (; w >= 8; w -= 8, src += 8, dst += 8) {
      vp9_filter_block1d8_h8_avg_sse2(src, src_stride, dst, dst_stride,
                                      h, filter_x);
    }
    for (; w >= 4; w -= 4, src += 4, dst += 4) {
      vp9_filter_block1d4_h8_avg_sse2(src, src_stride, dst, dst_stride,
                                      h, filter_x);
    }
  }

  /* Remaining columns go through the C implementation. */
  if (w != 0) {
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
  }
}
| 387 |
/*
 * Vertical convolution, `_avg` flavour, SSE2 path.
 *
 * Same dispatch as the non-avg vertical wrapper, but using the `_avg`
 * kernels (presumably these combine the filtered result with what is
 * already in dst -- confirm against the kernel sources). The SSE2 kernels
 * are used only when y_step_q4 is 16 and filter_y[3] is not 128, and each
 * is given a source pointer 3 rows above the current position. Leftover
 * width is handed to vp9_convolve8_avg_vert_c().
 */
void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h) {
  /* && short-circuits, so filter_y[3] is only read when the step is 16. */
  const int simd_ok = (y_step_q4 == 16) && (filter_y[3] != 128);

  if (simd_ok) {
    for (; w >= 16; w -= 16, src += 16, dst += 16) {
      const uint8_t *const top = src - src_stride * 3;
      vp9_filter_block1d16_v8_avg_sse2(top, src_stride, dst, dst_stride,
                                       h, filter_y);
    }
    for (; w >= 8; w -= 8, src += 8, dst += 8) {
      const uint8_t *const top = src - src_stride * 3;
      vp9_filter_block1d8_v8_avg_sse2(top, src_stride, dst, dst_stride,
                                      h, filter_y);
    }
    for (; w >= 4; w -= 4, src += 4, dst += 4) {
      const uint8_t *const top = src - src_stride * 3;
      vp9_filter_block1d4_v8_avg_sse2(top, src_stride, dst, dst_stride,
                                      h, filter_y);
    }
  }

  /* Remaining columns go through the C implementation (src not offset). */
  if (w != 0) {
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
  }
}
| 425 |
| 426 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, |
| 427 uint8_t *dst, ptrdiff_t dst_stride, |
| 428 const int16_t *filter_x, int x_step_q4, |
| 429 const int16_t *filter_y, int y_step_q4, |
| 430 int w, int h) { |
| 431 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
| 432 |
| 433 assert(w <= 64); |
| 434 assert(h <= 64); |
| 435 if (x_step_q4 == 16 && y_step_q4 == 16) { |
| 436 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, |
| 437 filter_x, x_step_q4, filter_y, y_step_q4, |
| 438 w, h + 7); |
| 439 vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, |
| 440 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
| 441 } else { |
| 442 vp9_convolve8_c(src, src_stride, dst, dst_stride, |
| 443 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
| 444 } |
| 445 } |
| 446 |
| 447 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, |
| 448 uint8_t *dst, ptrdiff_t dst_stride, |
| 449 const int16_t *filter_x, int x_step_q4, |
| 450 const int16_t *filter_y, int y_step_q4, |
| 451 int w, int h) { |
| 452 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
| 453 |
| 454 assert(w <= 64); |
| 455 assert(h <= 64); |
| 456 if (x_step_q4 == 16 && y_step_q4 == 16) { |
| 457 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, |
| 458 filter_x, x_step_q4, filter_y, y_step_q4, |
| 459 w, h + 7); |
| 460 vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, |
| 461 filter_x, x_step_q4, filter_y, y_step_q4, |
| 462 w, h); |
| 463 } else { |
| 464 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |
| 465 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
| 466 } |
| 467 } |
| 468 #endif |
| OLD | NEW |