Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(322)

Side by Side Diff: source/row_gcc.cc

Issue 1505673003: Optimize yuv alpha blend AVX2 code to do 32 pixels at a time. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: merge cpuid changes Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/planar_functions.cc ('k') | source/row_win.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // VERSION 2 1 // VERSION 2
2 /* 2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * 4 *
5 * Use of this source code is governed by a BSD-style license 5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source 6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found 7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may 8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree. 9 * be found in the AUTHORS file in the root of the source tree.
10 */ 10 */
(...skipping 3451 matching lines...) Expand 10 before | Expand all | Expand 10 after
3462 "+r"(width) // %3 3462 "+r"(width) // %3
3463 : "m"(kShuffleAlpha) // %4 3463 : "m"(kShuffleAlpha) // %4
3464 : "memory", "cc" 3464 : "memory", "cc"
3465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 3465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3466 ); 3466 );
3467 } 3467 }
3468 #endif // HAS_ARGBBLENDROW_SSSE3 3468 #endif // HAS_ARGBBLENDROW_SSSE3
3469 3469
3470 #ifdef HAS_BLENDPLANEROW_SSSE3 3470 #ifdef HAS_BLENDPLANEROW_SSSE3
3471 // Blend 8 pixels at a time. 3471 // Blend 8 pixels at a time.
3472 // =((G2*C2)+(H2*(D2))+32768+127)/256 3472 // unsigned version of math
3473 // =((A2*C2)+(B2*(255-C2))+255)/256
3474 // signed version of math
3475 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
3473 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, 3476 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
3474 const uint8* alpha, uint8* dst, int width) { 3477 const uint8* alpha, uint8* dst, int width) {
3475 asm volatile ( 3478 asm volatile (
3476 "pcmpeqb %%xmm5,%%xmm5 \n" 3479 "pcmpeqb %%xmm5,%%xmm5 \n"
3477 "psllw $0x8,%%xmm5 \n" 3480 "psllw $0x8,%%xmm5 \n"
3478 "mov $0x80808080,%%eax \n" 3481 "mov $0x80808080,%%eax \n"
3479 "movd %%eax,%%xmm6 \n" 3482 "movd %%eax,%%xmm6 \n"
3480 "pshufd $0x0,%%xmm6,%%xmm6 \n" 3483 "pshufd $0x0,%%xmm6,%%xmm6 \n"
3481 "mov $0x807f807f,%%eax \n" 3484 "mov $0x807f807f,%%eax \n"
3482 "movd %%eax,%%xmm7 \n" 3485 "movd %%eax,%%xmm7 \n"
(...skipping 24 matching lines...) Expand all
3507 "+r"(src1), // %1 3510 "+r"(src1), // %1
3508 "+r"(alpha), // %2 3511 "+r"(alpha), // %2
3509 "+r"(dst), // %3 3512 "+r"(dst), // %3
3510 "+r"(width) // %4 3513 "+r"(width) // %4
3511 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" 3514 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
3512 ); 3515 );
3513 } 3516 }
3514 #endif // HAS_BLENDPLANEROW_SSSE3 3517 #endif // HAS_BLENDPLANEROW_SSSE3
3515 3518
3516 #ifdef HAS_BLENDPLANEROW_AVX2 3519 #ifdef HAS_BLENDPLANEROW_AVX2
3517 // Blend 16 pixels at a time. 3520 // Blend 32 pixels at a time.
3518 // =((G2*C2)+(H2*(D2))+32768+127)/256 3521 // unsigned version of math
3522 // =((A2*C2)+(B2*(255-C2))+255)/256
3523 // signed version of math
3524 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
3519 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, 3525 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
3520 const uint8* alpha, uint8* dst, int width) { 3526 const uint8* alpha, uint8* dst, int width) {
3521 asm volatile ( 3527 asm volatile (
3522 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3528 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3523 "vpsllw $0x8,%%ymm5,%%ymm5 \n" 3529 "vpsllw $0x8,%%ymm5,%%ymm5 \n"
3524 "mov $0x80808080,%%eax \n" 3530 "mov $0x80808080,%%eax \n"
3525 "vmovd %%eax,%%xmm6 \n" 3531 "vmovd %%eax,%%xmm6 \n"
3526 "vbroadcastss %%xmm6,%%ymm6 \n" 3532 "vbroadcastss %%xmm6,%%ymm6 \n"
3527 "mov $0x807f807f,%%eax \n" 3533 "mov $0x807f807f,%%eax \n"
3528 "vmovd %%eax,%%xmm7 \n" 3534 "vmovd %%eax,%%xmm7 \n"
3529 "vbroadcastss %%xmm7,%%ymm7 \n" 3535 "vbroadcastss %%xmm7,%%ymm7 \n"
3530 "sub %2,%0 \n" 3536 "sub %2,%0 \n"
3531 "sub %2,%1 \n" 3537 "sub %2,%1 \n"
3532 "sub %2,%3 \n" 3538 "sub %2,%3 \n"
3533 3539
3534 // 16 pixel loop. 3540 // 32 pixel loop.
3535 LABELALIGN 3541 LABELALIGN
3536 "1: \n" 3542 "1: \n"
3537 "vmovdqu (%2),%%xmm0 \n" 3543 "vmovdqu (%2),%%ymm0 \n"
3538 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3544 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
3539 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" 3545 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
3546 "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
3540 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" 3547 "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
3541 "vmovdqu (%0,%2,1),%%xmm1 \n" 3548 "vmovdqu (%0,%2,1),%%ymm1 \n"
3542 "vmovdqu (%1,%2,1),%%xmm2 \n" 3549 "vmovdqu (%1,%2,1),%%ymm2 \n"
3543 "vpermq $0xd8,%%ymm1,%%ymm1 \n" 3550 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
3544 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
3545 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" 3551 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
3552 "vpsubb %%ymm6,%%ymm1,%%ymm4 \n"
3546 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" 3553 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
3554 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
3547 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" 3555 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
3556 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
3548 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" 3557 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
3558 "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
3549 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3559 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3550 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" 3560 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
3551 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3561 "vmovdqu %%ymm0,(%3,%2,1) \n"
3552 "vmovdqu %%xmm0,(%3,%2,1) \n" 3562 "lea 0x20(%2),%2 \n"
3553 "lea 0x10(%2),%2 \n" 3563 "sub $0x20,%4 \n"
3554 "sub $0x10,%4 \n"
3555 "jg 1b \n" 3564 "jg 1b \n"
3556 "vzeroupper \n" 3565 "vzeroupper \n"
3557 : "+r"(src0), // %0 3566 : "+r"(src0), // %0
3558 "+r"(src1), // %1 3567 "+r"(src1), // %1
3559 "+r"(alpha), // %2 3568 "+r"(alpha), // %2
3560 "+r"(dst), // %3 3569 "+r"(dst), // %3
3561 "+r"(width) // %4 3570 "+r"(width) // %4
3562 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" 3571 :: "memory", "cc", "eax",
3572 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3563 ); 3573 );
3564 } 3574 }
3565 #endif // HAS_BLENDPLANEROW_AVX2 3575 #endif // HAS_BLENDPLANEROW_AVX2
3566 3576
3567 #ifdef HAS_ARGBATTENUATEROW_SSSE3 3577 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3568 // Shuffle table duplicating alpha 3578 // Shuffle table duplicating alpha
3569 static uvec8 kShuffleAlpha0 = { 3579 static uvec8 kShuffleAlpha0 = {
3570 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u 3580 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
3571 }; 3581 };
3572 static uvec8 kShuffleAlpha1 = { 3582 static uvec8 kShuffleAlpha1 = {
(...skipping 2014 matching lines...) Expand 10 before | Expand all | Expand 10 after
5587 ); 5597 );
5588 } 5598 }
5589 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5599 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5590 5600
5591 #endif // defined(__x86_64__) || defined(__i386__) 5601 #endif // defined(__x86_64__) || defined(__i386__)
5592 5602
5593 #ifdef __cplusplus 5603 #ifdef __cplusplus
5594 } // extern "C" 5604 } // extern "C"
5595 } // namespace libyuv 5605 } // namespace libyuv
5596 #endif 5606 #endif
OLDNEW
« no previous file with comments | « source/planar_functions.cc ('k') | source/row_win.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698