source/row_win.cc - Issue 1505433002: AVX2 YUV alpha blender and improved unittests

Side by Side Diff: source/row_win.cc

Issue 1505433002: AVX2 YUV alpha blender and improved unittests (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master

Patch Set: off by 1 fix on win Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 507 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
518 // v * (256 + 8)	518 // v * (256 + 8)

519 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3	519 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3

520 __declspec(naked)	520 __declspec(naked)

521 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,	521 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,

522 int width) {	522 int width) {

523 __asm {	523 __asm {

524 mov eax, 0x01080108 // generate multiplier to repeat 5 bits	524 mov eax, 0x01080108 // generate multiplier to repeat 5 bits

525 vmovd xmm5, eax	525 vmovd xmm5, eax

526 vbroadcastss ymm5, xmm5	526 vbroadcastss ymm5, xmm5

527 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits	527 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits

528 movd xmm6, eax	528 vmovd xmm6, eax

529 vbroadcastss ymm6, xmm6	529 vbroadcastss ymm6, xmm6

530 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red	530 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red

531 vpsllw ymm3, ymm3, 11	531 vpsllw ymm3, ymm3, 11

532 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green	532 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green

533 vpsllw ymm4, ymm4, 10	533 vpsllw ymm4, ymm4, 10

534 vpsrlw ymm4, ymm4, 5	534 vpsrlw ymm4, ymm4, 5

535 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha	535 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha

536 vpsllw ymm7, ymm7, 8	536 vpsllw ymm7, ymm7, 8

537	537

538 mov eax, [esp + 4] // src_rgb565	538 mov eax, [esp + 4] // src_rgb565

(...skipping 30 matching lines...) Expand all Loading...
569	569

570 #ifdef HAS_ARGB1555TOARGBROW_AVX2	570 #ifdef HAS_ARGB1555TOARGBROW_AVX2

571 __declspec(naked)	571 __declspec(naked)

572 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,	572 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,

573 int width) {	573 int width) {

574 __asm {	574 __asm {

575 mov eax, 0x01080108 // generate multiplier to repeat 5 bits	575 mov eax, 0x01080108 // generate multiplier to repeat 5 bits

576 vmovd xmm5, eax	576 vmovd xmm5, eax

577 vbroadcastss ymm5, xmm5	577 vbroadcastss ymm5, xmm5

578 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits	578 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits

579 movd xmm6, eax	579 vmovd xmm6, eax

580 vbroadcastss ymm6, xmm6	580 vbroadcastss ymm6, xmm6

581 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red	581 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red

582 vpsllw ymm3, ymm3, 11	582 vpsllw ymm3, ymm3, 11

583 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green	583 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green

584 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha	584 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha

585 vpsllw ymm7, ymm7, 8	585 vpsllw ymm7, ymm7, 8

586	586

587 mov eax, [esp + 4] // src_argb1555	587 mov eax, [esp + 4] // src_argb1555

588 mov edx, [esp + 8] // dst_argb	588 mov edx, [esp + 8] // dst_argb

589 mov ecx, [esp + 12] // width	589 mov ecx, [esp + 12] // width

(...skipping 3509 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
4099 movq xmm2, qword ptr [edx + esi] // src1	4099 movq xmm2, qword ptr [edx + esi] // src1

4100 punpcklbw xmm1, xmm2	4100 punpcklbw xmm1, xmm2

4101 psubb xmm1, xmm6 // bias src0/1 - 128	4101 psubb xmm1, xmm6 // bias src0/1 - 128

4102 pmaddubsw xmm0, xmm1	4102 pmaddubsw xmm0, xmm1

4103 paddw xmm0, xmm7 // unbias result - 32768 and round.	4103 paddw xmm0, xmm7 // unbias result - 32768 and round.

4104 psrlw xmm0, 8	4104 psrlw xmm0, 8

4105 packuswb xmm0, xmm0	4105 packuswb xmm0, xmm0

4106 movq qword ptr [edi + esi], xmm0	4106 movq qword ptr [edi + esi], xmm0

4107 lea esi, [esi + 8]	4107 lea esi, [esi + 8]

4108 sub ecx, 8	4108 sub ecx, 8

4109 jge convertloop8	4109 jg convertloop8

4110	4110

4111 pop edi	4111 pop edi

4112 pop esi	4112 pop esi

4113 ret	4113 ret

4114 }	4114 }

4115 }	4115 }

4116 #endif // HAS_BLENDPLANEROW_SSSE3	4116 #endif // HAS_BLENDPLANEROW_SSSE3

4117	4117

	4118 #ifdef HAS_BLENDPLANEROW_AVX2

	4119 // Blend 16 pixels at a time: per byte, dst = (a * (src0 - 128) + (255 - a) * (src1 - 128) + 32768 + 127) >> 8.

	4120 // =((G2*C2)+(H2*(D2))+32768+127)/256

	4121 __declspec(naked)

	4122 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,

	4123 const uint8* alpha, uint8* dst, int width) {

	4124 __asm {

	4125 push esi // save esi; it is used as the alpha pointer / shared index (restored before ret)

	4126 push edi // save edi; it is used for dst (restored before ret)

	4127 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 (start from all ones)

	4128 vpsllw ymm5, ymm5, 8 // each 16-bit word becomes 0xff00

	4129 mov eax, 0x80808080 // 128 for biasing image to signed.

	4130 vmovd xmm6, eax

	4131 vbroadcastss ymm6, xmm6 // replicate the 32-bit constant across ymm6

	4132 mov eax, 0x807f807f // 32768 + 127 for unbias and round.

	4133 vmovd xmm7, eax

	4134 vbroadcastss ymm7, xmm7 // replicate the 32-bit constant across ymm7

	4135 mov eax, [esp + 8 + 4] // src0 (the extra 8 skips the two registers pushed above)

	4136 mov edx, [esp + 8 + 8] // src1

	4137 mov esi, [esp + 8 + 12] // alpha

	4138 mov edi, [esp + 8 + 16] // dst

	4139 mov ecx, [esp + 8 + 20] // width

	4140 sub eax, esi // eax = src0 - alpha, so [eax + esi] addresses src0

	4141 sub edx, esi // edx = src1 - alpha, so [edx + esi] addresses src1

	4142 sub edi, esi // edi = dst - alpha, so [edi + esi] addresses dst

	4143

	4144 // 16 pixel loop.

	4145 convertloop16:

	4146 vmovdqu xmm0, [esi] // alpha (16 bytes)

	4147 vpermq ymm0, ymm0, 0xd8 // qword order 0,2,1,3: 8 alpha bytes in the low half of each 128-bit lane

	4148 vpunpcklbw ymm0, ymm0, ymm0 // duplicate each alpha byte into both bytes of a 16-bit word

	4149 vpxor ymm0, ymm0, ymm5 // a, 255-a (high byte of each word is inverted)

	4150 vmovdqu xmm1, [eax + esi] // src0

	4151 vmovdqu xmm2, [edx + esi] // src1

	4152 vpermq ymm1, ymm1, 0xd8 // same per-lane split as done for alpha

	4153 vpermq ymm2, ymm2, 0xd8

	4154 vpunpcklbw ymm1, ymm1, ymm2 // interleave bytes: src0 in the low byte, src1 in the high byte of each word

	4155 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128

	4156 vpmaddubsw ymm0, ymm0, ymm1 // a * (src0 - 128) + (255 - a) * (src1 - 128) per 16-bit word

	4157 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.

	4158 vpsrlw ymm0, ymm0, 8 // keep the high byte of each word

	4159 vpackuswb ymm0, ymm0, ymm0 // pack words to bytes within each 128-bit lane

	4160 vpermq ymm0, ymm0, 0xd8 // bring both lanes' 8 result bytes together in the low 16 bytes

	4161 vmovdqu [edi + esi], xmm0 // store 16 blended bytes to dst

	4162 lea esi, [esi + 16] // advance the shared index by 16 bytes without touching flags

	4163 sub ecx, 16

	4164 jg convertloop16 // loop while width remains > 0; NOTE(review): a final partial group still reads/writes a full 16 bytes - presumably callers handle widths that are not a multiple of 16, confirm

	4165

	4166 pop edi

	4167 pop esi

	4168 vzeroupper // clear the upper halves of the ymm registers before returning

	4169 ret

	4170 }

	4171 }

	4172 #endif // HAS_BLENDPLANEROW_AVX2

	4173

4118 #ifdef HAS_ARGBBLENDROW_SSSE3	4174 #ifdef HAS_ARGBBLENDROW_SSSE3

4119 // Shuffle table for isolating alpha.	4175 // Shuffle table for isolating alpha.

4120 static const uvec8 kShuffleAlpha = {	4176 static const uvec8 kShuffleAlpha = {

4121 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,	4177 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,

4122 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80	4178 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80

4123 };	4179 };

4124	4180

4125 // Blend 8 pixels at a time.	4181 // Blend 8 pixels at a time.

4126 __declspec(naked)	4182 __declspec(naked)

4127 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,	4183 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,

(...skipping 2174 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
6302 }	6358 }

6303 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3	6359 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3

6304	6360

6305 #endif // defined(_M_X64)	6361 #endif // defined(_M_X64)

6306 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))	6362 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))

6307	6363

6308 #ifdef __cplusplus	6364 #ifdef __cplusplus

6309 } // extern "C"	6365 } // extern "C"

6310 } // namespace libyuv	6366 } // namespace libyuv

6311 #endif	6367 #endif

OLD	NEW

« no previous file with comments | « source/row_gcc.cc ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »