OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 3449 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3460 "+r"(src_argb1), // %1 | 3460 "+r"(src_argb1), // %1 |
3461 "+r"(dst_argb), // %2 | 3461 "+r"(dst_argb), // %2 |
3462 "+r"(width) // %3 | 3462 "+r"(width) // %3 |
3463 : "m"(kShuffleAlpha) // %4 | 3463 : "m"(kShuffleAlpha) // %4 |
3464 : "memory", "cc" | 3464 : "memory", "cc" |
3465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 3465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
3466 ); | 3466 ); |
3467 } | 3467 } |
3468 #endif // HAS_ARGBBLENDROW_SSSE3 | 3468 #endif // HAS_ARGBBLENDROW_SSSE3 |
3469 | 3469 |
3470 | |
3471 #ifdef HAS_BLENDPLANEROW_SSSE3 | 3470 #ifdef HAS_BLENDPLANEROW_SSSE3 |
3472 // Blend 8 pixels at a time. | 3471 // Blend 8 pixels at a time. |
3473 // =((G2*C2)+(H2*(D2))+32768+127)/256 | 3472 // =((G2*C2)+(H2*(D2))+32768+127)/256 |
3474 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, | 3473 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, |
3475 const uint8* alpha, uint8* dst, int width) { | 3474 const uint8* alpha, uint8* dst, int width) { |
3476 asm volatile ( | 3475 asm volatile ( |
3477 "pcmpeqb %%xmm5,%%xmm5 \n" | 3476 "pcmpeqb %%xmm5,%%xmm5 \n" |
3478 "psllw $0x8,%%xmm5 \n" | 3477 "psllw $0x8,%%xmm5 \n" |
3479 "mov $0x80808080,%%eax \n" | 3478 "mov $0x80808080,%%eax \n" |
3480 "movd %%eax,%%xmm6 \n" | 3479 "movd %%eax,%%xmm6 \n" |
(...skipping 26 matching lines...) Expand all Loading... |
3507 : "+r"(src0), // %0 | 3506 : "+r"(src0), // %0 |
3508 "+r"(src1), // %1 | 3507 "+r"(src1), // %1 |
3509 "+r"(alpha), // %2 | 3508 "+r"(alpha), // %2 |
3510 "+r"(dst), // %3 | 3509 "+r"(dst), // %3 |
3511 "+r"(width) // %4 | 3510 "+r"(width) // %4 |
3512 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" | 3511 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" |
3513 ); | 3512 ); |
3514 } | 3513 } |
3515 #endif // HAS_BLENDPLANEROW_SSSE3 | 3514 #endif // HAS_BLENDPLANEROW_SSSE3 |
3516 | 3515 |
| 3516 #ifdef HAS_BLENDPLANEROW_AVX2 |
| 3517 // Blend 16 pixels at a time. |
| 3518 // dst = (alpha * (src0 - 128) + (255 - alpha) * (src1 - 128) + 32768 + 127) / 256 |
// The -128 bias puts the source bytes into the signed range that vpmaddubsw
// takes for its second multiplicand; since alpha + (255 - alpha) == 255, the
// bias costs 255 * 128 = 32640, so the expression above is equal to
//   (alpha * src0 + (255 - alpha) * src1 + 255) / 256.
// src0, src1: the two source rows. alpha: per-pixel weight applied to src0
// (src1 is weighted by 255 - alpha). dst: output row. width: pixel count.
// The loop tests width only after each 16-pixel step, so it always runs at
// least once and reads/writes whole 16-byte groups.
// NOTE(review): callers presumably pass a positive multiple of 16 - confirm.
| 3519 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, |
| 3520 const uint8* alpha, uint8* dst, int width) { |
| 3521 asm volatile ( |
| 3522 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | // ymm5 = all ones
| 3523 "vpsllw $0x8,%%ymm5,%%ymm5 \n" | // ymm5 = 0xff00 in every 16-bit word
| 3524 "mov $0x80808080,%%eax \n" |
| 3525 "vmovd %%eax,%%xmm6 \n" |
| 3526 "vbroadcastss %%xmm6,%%ymm6 \n" | // ymm6 = 0x80 in every byte
| 3527 "mov $0x807f807f,%%eax \n" |
| 3528 "vmovd %%eax,%%xmm7 \n" |
| 3529 "vbroadcastss %%xmm7,%%ymm7 \n" | // ymm7 = 0x807f (32768 + 127) in every word
// Rebase src0, src1 and dst relative to alpha so that advancing alpha (%2)
// alone steps all four rows.
| 3530 "sub %2,%0 \n" |
| 3531 "sub %2,%1 \n" |
| 3532 "sub %2,%3 \n" |
| 3533
| 3534 // 16 pixel loop. |
| 3535 LABELALIGN |
| 3536 "1: \n" |
| 3537 "vmovdqu (%2),%%xmm0 \n" | // load 16 alpha bytes into the low 128 bits
| 3538 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | // swap middle quadwords: bytes 8..15 move to the low half of the upper lane
| 3539 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" | // duplicate each alpha byte into a byte pair
| 3540 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" | // invert the high byte: pair = (alpha, 255 - alpha)
| 3541 "vmovdqu (%0,%2,1),%%xmm1 \n" | // load 16 bytes of src0
| 3542 "vmovdqu (%1,%2,1),%%xmm2 \n" | // load 16 bytes of src1
| 3543 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | // same quadword arrangement as alpha
| 3544 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
| 3545 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" | // interleave into byte pairs (src0, src1)
| 3546 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" | // subtract 128 from every source byte
| 3547 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" | // word = alpha*(src0-128) + (255-alpha)*(src1-128)
| 3548 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" | // add 32768 + 127
| 3549 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | // divide by 256
| 3550 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | // pack words to bytes within each 128-bit lane
| 3551 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | // bring both lanes' 8 result bytes into the low 128 bits
| 3552 "vmovdqu %%xmm0,(%3,%2,1) \n" | // store 16 blended bytes to dst
| 3553 "lea 0x10(%2),%2 \n" | // advance alpha (and so every row) by 16
| 3554 "sub $0x10,%4 \n" | // width -= 16
| 3555 "jg 1b \n" | // repeat while width > 0
| 3556 "vzeroupper \n" | // zero the upper 128 bits of all ymm registers before leaving
| 3557 : "+r"(src0), // %0 |
| 3558 "+r"(src1), // %1 |
| 3559 "+r"(alpha), // %2 |
| 3560 "+r"(dst), // %3 |
| 3561 "+r"(width) // %4 |
| 3562 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" | // the ymm registers used above are listed under their xmm names
| 3563 ); |
| 3564 } |
| 3565 #endif // HAS_BLENDPLANEROW_AVX2 |
3517 | 3566 |
3518 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 3567 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
3519 // Shuffle table duplicating alpha | 3568 // Shuffle table duplicating alpha |
// 16-entry byte-index table: index 3 six times, then two 128 entries, then
// index 7 six times, then two 128 entries.
// NOTE(review): presumably a pshufb control mask in which bytes 3 and 7 are the
// alpha bytes of the first two 4-byte pixels and 128 (bit 7 set) yields a zero
// byte; the consumer is outside this view - confirm.
3520 static uvec8 kShuffleAlpha0 = { | 3569 static uvec8 kShuffleAlpha0 = { |
3521 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u | 3570 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u |
3522 }; | 3571 }; |
// 16-entry byte-index table: index 11 six times, then two 128 entries, then
// index 15 six times, then two 128 entries.
// NOTE(review): presumably a pshufb control mask in which bytes 11 and 15 are
// the alpha bytes of the third and fourth 4-byte pixels and 128 (bit 7 set)
// yields a zero byte; the consumer is outside this view - confirm.
3523 static uvec8 kShuffleAlpha1 = { | 3572 static uvec8 kShuffleAlpha1 = { |
3524 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | 3573 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
3525 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u | 3574 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u |
3526 }; | 3575 }; |
(...skipping 2011 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5538 ); | 5587 ); |
5539 } | 5588 } |
5540 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5589 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5541 | 5590 |
5542 #endif // defined(__x86_64__) || defined(__i386__) | 5591 #endif // defined(__x86_64__) || defined(__i386__) |
5543 | 5592 |
5544 #ifdef __cplusplus | 5593 #ifdef __cplusplus |
5545 } // extern "C" | 5594 } // extern "C" |
5546 } // namespace libyuv | 5595 } // namespace libyuv |
5547 #endif | 5596 #endif |
OLD | NEW |