OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 6041 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6052 } | 6052 } |
6053 } | 6053 } |
6054 #endif // HAS_HALFFLOATROW_SSE2 | 6054 #endif // HAS_HALFFLOATROW_SSE2 |
6055 | 6055 |
6056 #ifdef HAS_HALFFLOATROW_AVX2 | 6056 #ifdef HAS_HALFFLOATROW_AVX2 |
6057 __declspec(naked) | 6057 __declspec(naked) |
6058 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { | 6058 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { |
6059 __asm { | 6059 __asm { |
6060 mov eax, [esp + 4] /* src */ | 6060 mov eax, [esp + 4] /* src */ |
6061 mov edx, [esp + 8] /* dst */ | 6061 mov edx, [esp + 8] /* dst */ |
| 6062 movd xmm4, dword ptr [esp + 12] /* scale */ |
| 6063 mov ecx, [esp + 16] /* width */ |
| 6064 |
| 6065 vmulss xmm4, xmm4, kExpBias |
| 6066 vbroadcastss ymm4, xmm4 |
| 6067 vpxor ymm5, ymm5, ymm5 |
| 6068 |
| 6069 // 16 pixel loop. |
| 6070 convertloop: |
| 6071 vmovdqu ymm2, [eax] // 16 shorts |
| 6072 lea eax, [eax + 32] |
| 6073 vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints |
| 6074 vpunpcklwd ymm2, ymm2, ymm5 |
| 6075 vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats |
| 6076 vcvtdq2ps ymm2, ymm2 |
| 6077 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. |
| 6078 vmulps ymm2, ymm2, ymm4 |
| 6079 vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate |
| 6080 vpsrld ymm2, ymm2, 13 |
| 6081 vpackssdw ymm2, ymm2, ymm3 |
| 6082 vmovdqu [edx], ymm2 |
| 6083 lea edx, [edx + 32] |
| 6084 sub ecx, 16 |
| 6085 jg convertloop |
| 6086 vzeroupper |
| 6087 ret |
| 6088 } |
| 6089 } |
| 6090 #endif // HAS_HALFFLOATROW_AVX2 |
| 6091 |
| 6092 #ifdef HAS_HALFFLOATROW_F16C |
| 6093 __declspec(naked) |
| 6094 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { |
| 6095 __asm { |
| 6096 mov eax, [esp + 4] /* src */ |
| 6097 mov edx, [esp + 8] /* dst */ |
6062 vbroadcastss ymm4, [esp + 12] /* scale */ | 6098 vbroadcastss ymm4, [esp + 12] /* scale */ |
6063 mov ecx, [esp + 16] /* width */ | 6099 mov ecx, [esp + 16] /* width */ |
6064 | 6100 |
6065 // 8 pixel loop. | 6101 // 16 pixel loop. |
6066 convertloop: | 6102 convertloop: |
6067 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints | 6103 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints |
6068 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts | 6104 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts |
6069 lea eax, [eax + 32] | 6105 lea eax, [eax + 32] |
6070 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats | 6106 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats |
6071 vcvtdq2ps ymm3, ymm3 | 6107 vcvtdq2ps ymm3, ymm3 |
6072 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 | 6108 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 |
6073 vmulps ymm3, ymm3, ymm4 | 6109 vmulps ymm3, ymm3, ymm4 |
6074 vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate | 6110 vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate |
6075 vcvtps2ph xmm3, ymm3, 3 | 6111 vcvtps2ph xmm3, ymm3, 3 |
6076 vmovdqu [edx], xmm2 | 6112 vmovdqu [edx], xmm2 |
6077 vmovdqu [edx + 16], xmm3 | 6113 vmovdqu [edx + 16], xmm3 |
6078 lea edx, [edx + 32] | 6114 lea edx, [edx + 32] |
6079 sub ecx, 16 | 6115 sub ecx, 16 |
6080 jg convertloop | 6116 jg convertloop |
6081 vzeroupper | 6117 vzeroupper |
6082 ret | 6118 ret |
6083 } | 6119 } |
6084 } | 6120 } |
6085 #endif // HAS_HALFFLOATROW_AVX2 | 6121 #endif // HAS_HALFFLOATROW_F16C |
6086 | 6122 |
6087 #ifdef HAS_ARGBCOLORTABLEROW_X86 | 6123 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
6089 // Transform ARGB pixels with color table. | 6124 // Transform ARGB pixels with color table. |
6089 __declspec(naked) | 6125 __declspec(naked) |
6090 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 6126 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, |
6091 int width) { | 6127 int width) { |
6092 __asm { | 6128 __asm { |
6093 push esi | 6129 push esi |
6094 mov eax, [esp + 4 + 4] /* dst_argb */ | 6130 mov eax, [esp + 4 + 4] /* dst_argb */ |
6095 mov esi, [esp + 4 + 8] /* table_argb */ | 6131 mov esi, [esp + 4 + 8] /* table_argb */ |
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6249 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6285 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6250 | 6286 |
6251 #endif // defined(_M_X64) | 6287 #endif // defined(_M_X64) |
6252 | 6288 |
6253 #ifdef __cplusplus | 6289 #ifdef __cplusplus |
6254 } // extern "C" | 6290 } // extern "C" |
6255 } // namespace libyuv | 6291 } // namespace libyuv |
6256 #endif | 6292 #endif |
6257 | 6293 |
6258 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6294 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
OLD | NEW |