source/row_win.cc - Issue 1505673003: Optimize yuv alpha blend AVX2 code to do 32 pixels at time.

Side by Side Diff: source/row_win.cc

Issue 1505673003: Optimize yuv alpha blend AVX2 code to do 32 pixels at time. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master

Patch Set: merge cpuid changes Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 4047 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
4058 jg convertloop	4058 jg convertloop

4059	4059

4060 pop edi	4060 pop edi

4061 ret	4061 ret

4062 }	4062 }

4063 }	4063 }

4064 #endif // HAS_YUY2TOYROW_SSE2	4064 #endif // HAS_YUY2TOYROW_SSE2

4065	4065

4066 #ifdef HAS_BLENDPLANEROW_SSSE3	4066 #ifdef HAS_BLENDPLANEROW_SSSE3

4067 // Blend 8 pixels at a time.	4067 // Blend 8 pixels at a time.

4068 // =((G2*C2)+(H2*(D2))+32768+127)/256	4068 // unsigned version of math

	4069 // =((A2*C2)+(B2*(255-C2))+255)/256

	4070 // signed version of math

	4071 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256

4069 __declspec(naked)	4072 __declspec(naked)

4070 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,	4073 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,

4071 const uint8* alpha, uint8* dst, int width) {	4074 const uint8* alpha, uint8* dst, int width) {

4072 __asm {	4075 __asm {

4073 push esi	4076 push esi

4074 push edi	4077 push edi

4075 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00	4078 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00

4076 psllw xmm5, 8	4079 psllw xmm5, 8

4077 mov eax, 0x80808080 // 128 for biasing image to signed.	4080 mov eax, 0x80808080 // 128 for biasing image to signed.

4078 movd xmm6, eax	4081 movd xmm6, eax

(...skipping 30 matching lines...) Expand all Loading...
4109 jg convertloop8	4112 jg convertloop8

4110	4113

4111 pop edi	4114 pop edi

4112 pop esi	4115 pop esi

4113 ret	4116 ret

4114 }	4117 }

4115 }	4118 }

4116 #endif // HAS_BLENDPLANEROW_SSSE3	4119 #endif // HAS_BLENDPLANEROW_SSSE3

4117	4120

4118 #ifdef HAS_BLENDPLANEROW_AVX2	4121 #ifdef HAS_BLENDPLANEROW_AVX2

4119 // Blend 16 pixels at a time.	4122 // Blend 32 pixels at a time.

4120 // =((G2*C2)+(H2*(D2))+32768+127)/256	4123 // unsigned version of math

	4124 // =((A2*C2)+(B2*(255-C2))+255)/256

	4125 // signed version of math

	4126 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256

4121 __declspec(naked)	4127 __declspec(naked)

4122 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,	4128 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,

4123 const uint8* alpha, uint8* dst, int width) {	4129 const uint8* alpha, uint8* dst, int width) {

4124 __asm {	4130 __asm {

4125 push esi	4131 push esi

4126 push edi	4132 push edi

4127 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00	4133 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00

4128 vpsllw ymm5, ymm5, 8	4134 vpsllw ymm5, ymm5, 8

4129 mov eax, 0x80808080 // 128 for biasing image to signed.	4135 mov eax, 0x80808080 // 128 for biasing image to signed.

4130 vmovd xmm6, eax	4136 vmovd xmm6, eax

4131 vbroadcastss ymm6, xmm6	4137 vbroadcastss ymm6, xmm6

4132 mov eax, 0x807f807f // 32768 + 127 for unbias and round.	4138 mov eax, 0x807f807f // 32768 + 127 for unbias and round.

4133 vmovd xmm7, eax	4139 vmovd xmm7, eax

4134 vbroadcastss ymm7, xmm7	4140 vbroadcastss ymm7, xmm7

4135 mov eax, [esp + 8 + 4] // src0	4141 mov eax, [esp + 8 + 4] // src0

4136 mov edx, [esp + 8 + 8] // src1	4142 mov edx, [esp + 8 + 8] // src1

4137 mov esi, [esp + 8 + 12] // alpha	4143 mov esi, [esp + 8 + 12] // alpha

4138 mov edi, [esp + 8 + 16] // dst	4144 mov edi, [esp + 8 + 16] // dst

4139 mov ecx, [esp + 8 + 20] // width	4145 mov ecx, [esp + 8 + 20] // width

4140 sub eax, esi	4146 sub eax, esi

4141 sub edx, esi	4147 sub edx, esi

4142 sub edi, esi	4148 sub edi, esi

4143	4149

4144 // 16 pixel loop.	4150 // 32 pixel loop.

4145 convertloop16:	4151 convertloop32:

4146 vmovdqu xmm0, [esi] // alpha	4152 vmovdqu ymm0, [esi] // alpha

4147 vpermq ymm0, ymm0, 0xd8	4153 vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31

4148 vpunpcklbw ymm0, ymm0, ymm0	4154 vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23

	4155 vpxor ymm3, ymm3, ymm5 // a, 255-a

4149 vpxor ymm0, ymm0, ymm5 // a, 255-a	4156 vpxor ymm0, ymm0, ymm5 // a, 255-a

4150 vmovdqu xmm1, [eax + esi] // src0	4157 vmovdqu ymm1, [eax + esi] // src0

4151 vmovdqu xmm2, [edx + esi] // src1	4158 vmovdqu ymm2, [edx + esi] // src1

4152 vpermq ymm1, ymm1, 0xd8	4159 vpunpckhbw ymm4, ymm1, ymm2

4153 vpermq ymm2, ymm2, 0xd8

4154 vpunpcklbw ymm1, ymm1, ymm2	4160 vpunpcklbw ymm1, ymm1, ymm2

	4161 vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128

4155 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128	4162 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128

	4163 vpmaddubsw ymm3, ymm3, ymm4

4156 vpmaddubsw ymm0, ymm0, ymm1	4164 vpmaddubsw ymm0, ymm0, ymm1

	4165 vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.

4157 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.	4166 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.

	4167 vpsrlw ymm3, ymm3, 8

4158 vpsrlw ymm0, ymm0, 8	4168 vpsrlw ymm0, ymm0, 8

4159 vpackuswb ymm0, ymm0, ymm0	4169 vpackuswb ymm0, ymm0, ymm3

4160 vpermq ymm0, ymm0, 0xd8	4170 vmovdqu [edi + esi], ymm0

4161 vmovdqu [edi + esi], xmm0	4171 lea esi, [esi + 32]

4162 lea esi, [esi + 16]	4172 sub ecx, 32

4163 sub ecx, 16	4173 jg convertloop32

4164 jg convertloop16

4165	4174

4166 pop edi	4175 pop edi

4167 pop esi	4176 pop esi

4168 vzeroupper	4177 vzeroupper

4169 ret	4178 ret

4170 }	4179 }

4171 }	4180 }

4172 #endif // HAS_BLENDPLANEROW_AVX2	4181 #endif // HAS_BLENDPLANEROW_AVX2

4173	4182

4174 #ifdef HAS_ARGBBLENDROW_SSSE3	4183 #ifdef HAS_ARGBBLENDROW_SSSE3

(...skipping 2183 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
6358 }	6367 }

6359 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3	6368 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3

6360	6369

6361 #endif // defined(_M_X64)	6370 #endif // defined(_M_X64)

6362 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))	6371 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))

6363	6372

6364 #ifdef __cplusplus	6373 #ifdef __cplusplus

6365 } // extern "C"	6374 } // extern "C"

6366 } // namespace libyuv	6375 } // namespace libyuv

6367 #endif	6376 #endif

OLD	NEW

« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »