Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(144)

Side by Side Diff: source/row_win.cc

Issue 1505673003: Optimize yuv alpha blend AVX2 code to do 32 pixels at a time. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: merge cpuid changes Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 4047 matching lines...) Expand 10 before | Expand all | Expand 10 after
4058 jg convertloop 4058 jg convertloop
4059 4059
4060 pop edi 4060 pop edi
4061 ret 4061 ret
4062 } 4062 }
4063 } 4063 }
4064 #endif // HAS_YUY2TOYROW_SSE2 4064 #endif // HAS_YUY2TOYROW_SSE2
4065 4065
4066 #ifdef HAS_BLENDPLANEROW_SSSE3 4066 #ifdef HAS_BLENDPLANEROW_SSSE3
4067 // Blend 8 pixels at a time. 4067 // Blend 8 pixels at a time.
4068 // =((G2*C2)+(H2*(D2))+32768+127)/256 4068 // unsigned version of math
4069 // =((A2*C2)+(B2*(255-C2))+255)/256
4070 // signed version of math
4071 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4069 __declspec(naked) 4072 __declspec(naked)
4070 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, 4073 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
4071 const uint8* alpha, uint8* dst, int width) { 4074 const uint8* alpha, uint8* dst, int width) {
4072 __asm { 4075 __asm {
4073 push esi 4076 push esi
4074 push edi 4077 push edi
4075 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 4078 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4076 psllw xmm5, 8 4079 psllw xmm5, 8
4077 mov eax, 0x80808080 // 128 for biasing image to signed. 4080 mov eax, 0x80808080 // 128 for biasing image to signed.
4078 movd xmm6, eax 4081 movd xmm6, eax
(...skipping 30 matching lines...) Expand all
4109 jg convertloop8 4112 jg convertloop8
4110 4113
4111 pop edi 4114 pop edi
4112 pop esi 4115 pop esi
4113 ret 4116 ret
4114 } 4117 }
4115 } 4118 }
4116 #endif // HAS_BLENDPLANEROW_SSSE3 4119 #endif // HAS_BLENDPLANEROW_SSSE3
4117 4120
4118 #ifdef HAS_BLENDPLANEROW_AVX2 4121 #ifdef HAS_BLENDPLANEROW_AVX2
4119 // Blend 16 pixels at a time. 4122 // Blend 32 pixels at a time.
4120 // =((G2*C2)+(H2*(D2))+32768+127)/256 4123 // unsigned version of math
4124 // =((A2*C2)+(B2*(255-C2))+255)/256
4125 // signed version of math
4126 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4121 __declspec(naked) 4127 __declspec(naked)
4122 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, 4128 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
4123 const uint8* alpha, uint8* dst, int width) { 4129 const uint8* alpha, uint8* dst, int width) {
4124 __asm { 4130 __asm {
4125 push esi 4131 push esi
4126 push edi 4132 push edi
4127 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 4133 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
4128 vpsllw ymm5, ymm5, 8 4134 vpsllw ymm5, ymm5, 8
4129 mov eax, 0x80808080 // 128 for biasing image to signed. 4135 mov eax, 0x80808080 // 128 for biasing image to signed.
4130 vmovd xmm6, eax 4136 vmovd xmm6, eax
4131 vbroadcastss ymm6, xmm6 4137 vbroadcastss ymm6, xmm6
4132 mov eax, 0x807f807f // 32768 + 127 for unbias and round. 4138 mov eax, 0x807f807f // 32768 + 127 for unbias and round.
4133 vmovd xmm7, eax 4139 vmovd xmm7, eax
4134 vbroadcastss ymm7, xmm7 4140 vbroadcastss ymm7, xmm7
4135 mov eax, [esp + 8 + 4] // src0 4141 mov eax, [esp + 8 + 4] // src0
4136 mov edx, [esp + 8 + 8] // src1 4142 mov edx, [esp + 8 + 8] // src1
4137 mov esi, [esp + 8 + 12] // alpha 4143 mov esi, [esp + 8 + 12] // alpha
4138 mov edi, [esp + 8 + 16] // dst 4144 mov edi, [esp + 8 + 16] // dst
4139 mov ecx, [esp + 8 + 20] // width 4145 mov ecx, [esp + 8 + 20] // width
4140 sub eax, esi 4146 sub eax, esi
4141 sub edx, esi 4147 sub edx, esi
4142 sub edi, esi 4148 sub edi, esi
4143 4149
4144 // 16 pixel loop. 4150 // 32 pixel loop.
4145 convertloop16: 4151 convertloop32:
4146 vmovdqu xmm0, [esi] // alpha 4152 vmovdqu ymm0, [esi] // alpha
4147 vpermq ymm0, ymm0, 0xd8 4153 vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
4148 vpunpcklbw ymm0, ymm0, ymm0 4154 vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
4155 vpxor ymm3, ymm3, ymm5 // a, 255-a
4149 vpxor ymm0, ymm0, ymm5 // a, 255-a 4156 vpxor ymm0, ymm0, ymm5 // a, 255-a
4150 vmovdqu xmm1, [eax + esi] // src0 4157 vmovdqu ymm1, [eax + esi] // src0
4151 vmovdqu xmm2, [edx + esi] // src1 4158 vmovdqu ymm2, [edx + esi] // src1
4152 vpermq ymm1, ymm1, 0xd8 4159 vpunpckhbw ymm4, ymm1, ymm2
4153 vpermq ymm2, ymm2, 0xd8
4154 vpunpcklbw ymm1, ymm1, ymm2 4160 vpunpcklbw ymm1, ymm1, ymm2
4161 vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
4155 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 4162 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
4163 vpmaddubsw ymm3, ymm3, ymm4
4156 vpmaddubsw ymm0, ymm0, ymm1 4164 vpmaddubsw ymm0, ymm0, ymm1
4165 vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
4157 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. 4166 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
4167 vpsrlw ymm3, ymm3, 8
4158 vpsrlw ymm0, ymm0, 8 4168 vpsrlw ymm0, ymm0, 8
4159 vpackuswb ymm0, ymm0, ymm0 4169 vpackuswb ymm0, ymm0, ymm3
4160 vpermq ymm0, ymm0, 0xd8 4170 vmovdqu [edi + esi], ymm0
4161 vmovdqu [edi + esi], xmm0 4171 lea esi, [esi + 32]
4162 lea esi, [esi + 16] 4172 sub ecx, 32
4163 sub ecx, 16 4173 jg convertloop32
4164 jg convertloop16
4165 4174
4166 pop edi 4175 pop edi
4167 pop esi 4176 pop esi
4168 vzeroupper 4177 vzeroupper
4169 ret 4178 ret
4170 } 4179 }
4171 } 4180 }
4172 #endif // HAS_BLENDPLANEROW_AVX2 4181 #endif // HAS_BLENDPLANEROW_AVX2
4173 4182
4174 #ifdef HAS_ARGBBLENDROW_SSSE3 4183 #ifdef HAS_ARGBBLENDROW_SSSE3
(...skipping 2183 matching lines...) Expand 10 before | Expand all | Expand 10 after
6358 } 6367 }
6359 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6368 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6360 6369
6361 #endif // defined(_M_X64) 6370 #endif // defined(_M_X64)
6362 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6371 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6363 6372
6364 #ifdef __cplusplus 6373 #ifdef __cplusplus
6365 } // extern "C" 6374 } // extern "C"
6366 } // namespace libyuv 6375 } // namespace libyuv
6367 #endif 6376 #endif
OLDNEW
« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698