Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(133)

Side by Side Diff: source/row_win.cc

Issue 2421993002: Port HalfFloatRow_SSE2 to AVX2 but not using F16C. (Closed)
Patch Set: disable f16 Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 6041 matching lines...) Expand 10 before | Expand all | Expand 10 after
6052 } 6052 }
6053 } 6053 }
6054 #endif // HAS_HALFFLOATROW_SSE2 6054 #endif // HAS_HALFFLOATROW_SSE2
6055 6055
6056 #ifdef HAS_HALFFLOATROW_AVX2 6056 #ifdef HAS_HALFFLOATROW_AVX2
6057 __declspec(naked) 6057 __declspec(naked)
6058 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { 6058 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
6059 __asm { 6059 __asm {
6060 mov eax, [esp + 4] /* src */ 6060 mov eax, [esp + 4] /* src */
6061 mov edx, [esp + 8] /* dst */ 6061 mov edx, [esp + 8] /* dst */
6062 movd xmm4, dword ptr [esp + 12] /* scale */
6063 mov ecx, [esp + 16] /* width */
6064
6065 vmulss xmm4, xmm4, kExpBias
6066 vbroadcastss ymm4, xmm4
6067 vpxor ymm5, ymm5, ymm5
6068
6069 // 16 pixel loop.
6070 convertloop:
6071 vmovdqu ymm2, [eax] // 16 shorts
6072 lea eax, [eax + 32]
6073 vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
6074 vpunpcklwd ymm2, ymm2, ymm5
6075 vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
6076 vcvtdq2ps ymm2, ymm2
6077 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
6078 vmulps ymm2, ymm2, ymm4
6079 vpsrld ymm3, ymm3, 13 // convert 8 floats to half floats, truncating
6080 vpsrld ymm2, ymm2, 13
6081 vpackssdw ymm2, ymm2, ymm3
6082 vmovdqu [edx], ymm2
6083 lea edx, [edx + 32]
6084 sub ecx, 16
6085 jg convertloop
6086 vzeroupper
6087 ret
6088 }
6089 }
6090 #endif // HAS_HALFFLOATROW_AVX2
6091
6092 #ifdef HAS_HALFFLOATROW_F16C
6093 __declspec(naked)
6094 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
6095 __asm {
6096 mov eax, [esp + 4] /* src */
6097 mov edx, [esp + 8] /* dst */
6062 vbroadcastss ymm4, [esp + 12] /* scale */ 6098 vbroadcastss ymm4, [esp + 12] /* scale */
6063 mov ecx, [esp + 16] /* width */ 6099 mov ecx, [esp + 16] /* width */
6064 6100
6065 // 8 pixel loop. 6101 // 16 pixel loop.
6066 convertloop: 6102 convertloop:
6067 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints 6103 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
6068 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts 6104 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
6069 lea eax, [eax + 32] 6105 lea eax, [eax + 32]
6070 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats 6106 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
6071 vcvtdq2ps ymm3, ymm3 6107 vcvtdq2ps ymm3, ymm3
6072 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 6108 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
6073 vmulps ymm3, ymm3, ymm4 6109 vmulps ymm3, ymm3, ymm4
6074 vcvtps2ph xmm2, ymm2, 3 // convert 8 floats to half floats, truncating 6110 vcvtps2ph xmm2, ymm2, 3 // convert 8 floats to half floats, truncating
6075 vcvtps2ph xmm3, ymm3, 3 6111 vcvtps2ph xmm3, ymm3, 3
6076 vmovdqu [edx], xmm2 6112 vmovdqu [edx], xmm2
6077 vmovdqu [edx + 16], xmm3 6113 vmovdqu [edx + 16], xmm3
6078 lea edx, [edx + 32] 6114 lea edx, [edx + 32]
6079 sub ecx, 16 6115 sub ecx, 16
6080 jg convertloop 6116 jg convertloop
6081 vzeroupper 6117 vzeroupper
6082 ret 6118 ret
6083 } 6119 }
6084 } 6120 }
6085 #endif // HAS_HALFFLOATROW_AVX2 6121 #endif // HAS_HALFFLOATROW_F16C
6086 6122
6087 #ifdef HAS_ARGBCOLORTABLEROW_X86 6123 #ifdef HAS_ARGBCOLORTABLEROW_X86
6088 // Transform ARGB pixels with color table. 6124 // Transform ARGB pixels with color table.
6089 __declspec(naked) 6125 __declspec(naked)
6090 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, 6126 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
6091 int width) { 6127 int width) {
6092 __asm { 6128 __asm {
6093 push esi 6129 push esi
6094 mov eax, [esp + 4 + 4] /* dst_argb */ 6130 mov eax, [esp + 4 + 4] /* dst_argb */
6095 mov esi, [esp + 4 + 8] /* table_argb */ 6131 mov esi, [esp + 4 + 8] /* table_argb */
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after
6249 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6285 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6250 6286
6251 #endif // defined(_M_X64) 6287 #endif // defined(_M_X64)
6252 6288
6253 #ifdef __cplusplus 6289 #ifdef __cplusplus
6254 } // extern "C" 6290 } // extern "C"
6255 } // namespace libyuv 6291 } // namespace libyuv
6256 #endif 6292 #endif
6257 6293
6258 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6294 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
OLDNEW
« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698