OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 350 matching lines...)
361 : "+r"(src_ptr), // %0 | 361 : "+r"(src_ptr), // %0 |
362 "+r"(dst_ptr), // %1 | 362 "+r"(dst_ptr), // %1 |
363 "+r"(dst_width), // %2 | 363 "+r"(dst_width), // %2 |
364 "+r"(stridex3) // %3 | 364 "+r"(stridex3) // %3 |
365 : "r"((intptr_t)(src_stride)) // %4 | 365 : "r"((intptr_t)(src_stride)) // %4 |
366 : "memory", "cc", NACL_R14 | 366 : "memory", "cc", NACL_R14 |
367 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 367 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
368 ); | 368 ); |
369 } | 369 } |
370 | 370 |
| 371 |
| 372 #ifdef HAS_SCALEROWDOWN4_AVX2 |
| 373 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
| 374 uint8* dst_ptr, int dst_width) { |
| 375 asm volatile ( |
| 376 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 377 "vpsrld $0x18,%%ymm5,%%ymm5 \n" |
| 378 "vpslld $0x10,%%ymm5,%%ymm5 \n" |
| 379 LABELALIGN |
| 380 "1: \n" |
| 381 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 382 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 383 "lea " MEMLEA(0x40,0) ",%0 \n" |
| 384 "vpand %%ymm5,%%ymm0,%%ymm0 \n" |
| 385 "vpand %%ymm5,%%ymm1,%%ymm1 \n" |
| 386 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
| 387 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
| 388 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
| 389 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" |
| 390 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
| 391 "vmovdqu %%xmm0," MEMACCESS(1) " \n" |
| 392 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 393 "sub $0x10,%2 \n" |
| 394 "jg 1b \n" |
| 395 "vzeroupper \n" |
| 396 : "+r"(src_ptr), // %0 |
| 397 "+r"(dst_ptr), // %1 |
| 398 "+r"(dst_width) // %2 |
| 399 :: "memory", "cc", "xmm0", "xmm1", "xmm5" |
| 400 ); |
| 401 } |
| 402 |
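For reference, a rough scalar sketch of what ScaleRowDown4_AVX2 above computes: the vpcmpeqb/vpsrld/vpslld sequence builds a 0x00FF0000 mask in each dword, so the loop keeps byte 2 of every 4-byte group, i.e. point-samples one pixel out of four; each iteration consumes 0x40 source bytes and writes 0x10 destination bytes, matching the lea/sub increments. This is illustrative only, not the library's own C fallback, and the ScaleRowDown4_Sketch name and uint8_t/ptrdiff_t signature are assumptions made to mirror the SIMD routine.

#include <stdint.h>
#include <stddef.h>

// Illustrative scalar sketch (hypothetical helper, not libyuv code):
// every output pixel is the source pixel at offset 2 within its group of 4.
static void ScaleRowDown4_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                 uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;  // unused by the point-sampling variant; kept for symmetry
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4 + 2];
  }
}
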
| 403 void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
| 404 uint8* dst_ptr, int dst_width) { |
| 405 asm volatile ( |
| 406 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" |
| 407 "vpsrlw $0xf,%%ymm4,%%ymm4 \n" |
| 408 "vpsllw $0x3,%%ymm4,%%ymm5 \n" |
| 409 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" |
| 410 |
| 411 LABELALIGN |
| 412 "1: \n" |
| 413 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 414 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 415 MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 |
| 416 MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 |
| 417 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" |
| 418 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" |
| 419 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" |
| 420 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
| 421 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" |
| 422 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" |
| 423 MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2 |
| 424 MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3 |
| 425 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" |
| 426 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
| 427 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" |
| 428 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" |
| 429 MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2 |
| 430 MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3 |
| 431 "lea " MEMLEA(0x40,0) ",%0 \n" |
| 432 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" |
| 433 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
| 434 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" |
| 435 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" |
| 436 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" |
| 437 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
| 438 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" |
| 439 "vpsrlw $0x4,%%ymm0,%%ymm0 \n" |
| 440 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" |
| 441 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
| 442 "vmovdqu %%xmm0," MEMACCESS(1) " \n" |
| 443 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 444 "sub $0x10,%2 \n" |
| 445 "jg 1b \n" |
| 446 "vzeroupper \n" |
| 447 : "+r"(src_ptr), // %0 |
| 448 "+r"(dst_ptr), // %1 |
| 449 "+r"(dst_width) // %2 |
| 450 : "r"((intptr_t)(src_stride)), // %3 |
| 451 "r"((intptr_t)(src_stride * 3)) // %4 |
| 452 : "memory", "cc", NACL_R14 |
| 453 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 454 ); |
| 455 } |
| 456 #endif // HAS_SCALEROWDOWN4_AVX2 |
| 457 |
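For the box-filter version above, the ones vector fed to vpmaddubsw sums horizontal byte pairs, the four row loads (at %0, %0+%3, %0+2*%3 and %0+%4, where %4 carries 3*src_stride) accumulate vertically, vphaddw completes the 4-wide horizontal sum, and the final add of 8 (ymm5) followed by vpsrlw $4 is a rounded divide by 16. A scalar sketch of that behaviour follows; it is illustrative only, not the library's C fallback, and ScaleRowDown4Box_Sketch is an assumed name.

#include <stdint.h>
#include <stddef.h>

// Illustrative scalar sketch (hypothetical helper, not libyuv code):
// each output pixel is the rounded average of a 4x4 block of source pixels.
static void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  int x, r, c;
  for (x = 0; x < dst_width; ++x) {
    uint32_t sum = 0;
    for (r = 0; r < 4; ++r) {
      for (c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + x * 4 + c];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  // divide by 16 with rounding
  }
}
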
371 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, | 458 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
372 uint8* dst_ptr, int dst_width) { | 459 uint8* dst_ptr, int dst_width) { |
373 asm volatile ( | 460 asm volatile ( |
374 "movdqa %0,%%xmm3 \n" | 461 "movdqa %0,%%xmm3 \n" |
375 "movdqa %1,%%xmm4 \n" | 462 "movdqa %1,%%xmm4 \n" |
376 "movdqa %2,%%xmm5 \n" | 463 "movdqa %2,%%xmm5 \n" |
377 : | 464 : |
378 : "m"(kShuf0), // %0 | 465 : "m"(kShuf0), // %0 |
379 "m"(kShuf1), // %1 | 466 "m"(kShuf1), // %1 |
380 "m"(kShuf2) // %2 | 467 "m"(kShuf2) // %2 |
(...skipping 815 matching lines...)
1196 ); | 1283 ); |
1197 return num; | 1284 return num; |
1198 } | 1285 } |
1199 | 1286 |
1200 #endif // defined(__x86_64__) || defined(__i386__) | 1287 #endif // defined(__x86_64__) || defined(__i386__) |
1201 | 1288 |
1202 #ifdef __cplusplus | 1289 #ifdef __cplusplus |
1203 } // extern "C" | 1290 } // extern "C" |
1204 } // namespace libyuv | 1291 } // namespace libyuv |
1205 #endif | 1292 #endif |