OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 6041 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6052 } | 6052 } |
6053 } | 6053 } |
6054 #endif // HAS_HALFFLOATROW_SSE2 | 6054 #endif // HAS_HALFFLOATROW_SSE2 |
6055 | 6055 |
6056 #ifdef HAS_HALFFLOATROW_AVX2 | 6056 #ifdef HAS_HALFFLOATROW_AVX2 |
6057 __declspec(naked) | 6057 __declspec(naked) |
6058 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { | 6058 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { |
6059 __asm { | 6059 __asm { |
6060 mov eax, [esp + 4] /* src */ | 6060 mov eax, [esp + 4] /* src */ |
6061 mov edx, [esp + 8] /* dst */ | 6061 mov edx, [esp + 8] /* dst */ |
| 6062 movd xmm4, dword ptr [esp + 12] /* scale */ |
| 6063 mov ecx, [esp + 16] /* width */ |
| 6064 |
| 6065 vmulss xmm4, xmm4, kExpBias |
| 6066 vbroadcastss ymm4, xmm4 |
| 6067 vpxor ymm5, ymm5, ymm5 |
| 6068 |
| 6069 // 16 pixel loop. |
| 6070 convertloop: |
| 6071 vmovdqu ymm2, [eax] // 16 shorts |
| 6072 lea eax, [eax + 32] |
| 6073 vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints |
| 6074 vpunpcklwd ymm2, ymm2, ymm5 |
| 6075 vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats |
| 6076 vcvtdq2ps ymm2, ymm2 |
| 6077 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. |
| 6078 vmulps ymm2, ymm2, ymm4 |
| 6079 vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate |
| 6080 vpsrld ymm2, ymm2, 13 |
| 6081 vpackssdw ymm2, ymm2, ymm3 |
| 6082 vmovdqu [edx], ymm2 |
| 6083 lea edx, [edx + 32] |
| 6084 sub ecx, 16 |
| 6085 jg convertloop |
| 6086 vzeroupper |
| 6087 ret |
| 6088 } |
| 6089 } |
| 6090 #endif // HAS_HALFFLOATROW_AVX2 |
| 6091 |
| 6092 #ifdef HAS_HALFFLOATROW_F16C |
| 6093 __declspec(naked) |
| 6094 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { |
| 6095 __asm { |
| 6096 mov eax, [esp + 4] /* src */ |
| 6097 mov edx, [esp + 8] /* dst */ |
6062 vbroadcastss ymm4, [esp + 12] /* scale */ | 6098 vbroadcastss ymm4, [esp + 12] /* scale */ |
6063 mov ecx, [esp + 16] /* width */ | 6099 mov ecx, [esp + 16] /* width */ |
6064 | 6100 |
6065 // 8 pixel loop. | 6101 // 16 pixel loop. |
6066 convertloop: | 6102 convertloop: |
6067 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints | 6103 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints |
6068 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts | 6104 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts |
6069 lea eax, [eax + 32] | 6105 lea eax, [eax + 32] |
6070 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats | 6106 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats |
6071 vcvtdq2ps ymm3, ymm3 | 6107 vcvtdq2ps ymm3, ymm3 |
6072 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 | 6108 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 |
6073 vmulps ymm3, ymm3, ymm4 | 6109 vmulps ymm3, ymm3, ymm4 |
6074 vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate | 6110 vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate |
6075 vcvtps2ph xmm3, ymm3, 3 | 6111 vcvtps2ph xmm3, ymm3, 3 |
6076 vmovdqu [edx], xmm2 | 6112 vmovdqu [edx], xmm2 |
6077 vmovdqu [edx + 16], xmm3 | 6113 vmovdqu [edx + 16], xmm3 |
6078 lea edx, [edx + 32] | 6114 lea edx, [edx + 32] |
6079 sub ecx, 16 | 6115 sub ecx, 16 |
6080 jg convertloop | 6116 jg convertloop |
6081 vzeroupper | 6117 vzeroupper |
6082 ret | 6118 ret |
6083 } | 6119 } |
6084 } | 6120 } |
6085 #endif // HAS_HALFFLOATROW_AVX2 | 6121 #endif // HAS_HALFFLOATROW_F16C |
6086 | 6122 |
6087 #ifdef HAS_ARGBCOLORTABLEROW_X86 | 6123 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
6089 // Transform ARGB pixels with color table. | 6124 // Transform ARGB pixels with color table. |
6089 __declspec(naked) | 6125 __declspec(naked) |
6090 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 6126 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, |
6091 int width) { | 6127 int width) { |
6092 __asm { | 6128 __asm { |
6093 push esi | 6129 push esi |
6094 mov eax, [esp + 4 + 4] /* dst_argb */ | 6130 mov eax, [esp + 4 + 4] /* dst_argb */ |
6095 mov esi, [esp + 4 + 8] /* table_argb */ | 6131 mov esi, [esp + 4 + 8] /* table_argb */ |
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6249 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6285 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6250 | 6286 |
6251 #endif // defined(_M_X64) | 6287 #endif // defined(_M_X64) |
6252 | 6288 |
6253 #ifdef __cplusplus | 6289 #ifdef __cplusplus |
6254 } // extern "C" | 6290 } // extern "C" |
6255 } // namespace libyuv | 6291 } // namespace libyuv |
6256 #endif | 6292 #endif |
6257 | 6293 |
6258 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6294 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
OLD | NEW |