Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(170)

Side by Side Diff: source/row_win.cc

Issue 1505673003: Optimize yuv alpha blend AVX2 code to do 32 pixels at time. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: gcc port of avx2 that does 32 pixels (Created 5 years ago)
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« source/cpu_id.cc ('K') | « source/row_gcc.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 4123 matching lines...) Expand 10 before | Expand all | Expand 10 after
4134 vbroadcastss ymm7, xmm7 4134 vbroadcastss ymm7, xmm7
4135 mov eax, [esp + 8 + 4] // src0 4135 mov eax, [esp + 8 + 4] // src0
4136 mov edx, [esp + 8 + 8] // src1 4136 mov edx, [esp + 8 + 8] // src1
4137 mov esi, [esp + 8 + 12] // alpha 4137 mov esi, [esp + 8 + 12] // alpha
4138 mov edi, [esp + 8 + 16] // dst 4138 mov edi, [esp + 8 + 16] // dst
4139 mov ecx, [esp + 8 + 20] // width 4139 mov ecx, [esp + 8 + 20] // width
4140 sub eax, esi 4140 sub eax, esi
4141 sub edx, esi 4141 sub edx, esi
4142 sub edi, esi 4142 sub edi, esi
4143 4143
4144 // 16 pixel loop. 4144 // 32 pixel loop.
4145 convertloop16: 4145 convertloop16:
4146 vmovdqu xmm0, [esi] // alpha 4146 vmovdqu ymm0, [esi] // alpha
4147 vpermq ymm0, ymm0, 0xd8 4147 vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
4148 vpunpcklbw ymm0, ymm0, ymm0 4148 vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
4149 vpxor ymm3, ymm3, ymm5 // a, 255-a
4149 vpxor ymm0, ymm0, ymm5 // a, 255-a 4150 vpxor ymm0, ymm0, ymm5 // a, 255-a
4150 vmovdqu xmm1, [eax + esi] // src0 4151 vmovdqu ymm1, [eax + esi] // src0
4151 vmovdqu xmm2, [edx + esi] // src1 4152 vmovdqu ymm2, [edx + esi] // src1
4152 vpermq ymm1, ymm1, 0xd8 4153 vpunpckhbw ymm4, ymm1, ymm2
4153 vpermq ymm2, ymm2, 0xd8
4154 vpunpcklbw ymm1, ymm1, ymm2 4154 vpunpcklbw ymm1, ymm1, ymm2
4155 vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
4155 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 4156 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
4157 vpmaddubsw ymm3, ymm3, ymm4
4156 vpmaddubsw ymm0, ymm0, ymm1 4158 vpmaddubsw ymm0, ymm0, ymm1
4159 vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
4157 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. 4160 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
4161 vpsrlw ymm3, ymm3, 8
4158 vpsrlw ymm0, ymm0, 8 4162 vpsrlw ymm0, ymm0, 8
4159 vpackuswb ymm0, ymm0, ymm0 4163 vpackuswb ymm0, ymm0, ymm3
4160 vpermq ymm0, ymm0, 0xd8 4164 vmovdqu [edi + esi], ymm0
4161 vmovdqu [edi + esi], xmm0 4165 lea esi, [esi + 32]
4162 lea esi, [esi + 16] 4166 sub ecx, 32
4163 sub ecx, 16
4164 jg convertloop16 4167 jg convertloop16
4165 4168
4166 pop edi 4169 pop edi
4167 pop esi 4170 pop esi
4168 vzeroupper 4171 vzeroupper
4169 ret 4172 ret
4170 } 4173 }
4171 } 4174 }
4172 #endif // HAS_BLENDPLANEROW_AVX2 4175 #endif // HAS_BLENDPLANEROW_AVX2
4173 4176
(...skipping 2184 matching lines...) Expand 10 before | Expand all | Expand 10 after
6358 } 6361 }
6359 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6362 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6360 6363
6361 #endif // defined(_M_X64) 6364 #endif // defined(_M_X64)
6362 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6365 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6363 6366
6364 #ifdef __cplusplus 6367 #ifdef __cplusplus
6365 } // extern "C" 6368 } // extern "C"
6366 } // namespace libyuv 6369 } // namespace libyuv
6367 #endif 6370 #endif
OLD | NEW
« source/cpu_id.cc ('K') | « source/row_gcc.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698