OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 4047 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4058 jg convertloop | 4058 jg convertloop |
4059 | 4059 |
4060 pop edi | 4060 pop edi |
4061 ret | 4061 ret |
4062 } | 4062 } |
4063 } | 4063 } |
4064 #endif // HAS_YUY2TOYROW_SSE2 | 4064 #endif // HAS_YUY2TOYROW_SSE2 |
4065 | 4065 |
4066 #ifdef HAS_BLENDPLANEROW_SSSE3 | 4066 #ifdef HAS_BLENDPLANEROW_SSSE3 |
4067 // Blend 8 pixels at a time. | 4067 // Blend 8 pixels at a time. |
4068 // =((G2*C2)+(H2*(D2))+32768+127)/256 | 4068 // unsigned version of math |
| 4069 // =((A2*C2)+(B2*(255-C2))+255)/256 |
| 4070 // signed version of math |
| 4071 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 |
4069 __declspec(naked) | 4072 __declspec(naked) |
4070 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, | 4073 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, |
4071 const uint8* alpha, uint8* dst, int width) { | 4074 const uint8* alpha, uint8* dst, int width) { |
4072 __asm { | 4075 __asm { |
4073 push esi | 4076 push esi |
4074 push edi | 4077 push edi |
4075 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 | 4078 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
4076 psllw xmm5, 8 | 4079 psllw xmm5, 8 |
4077 mov eax, 0x80808080 // 128 for biasing image to signed. | 4080 mov eax, 0x80808080 // 128 for biasing image to signed. |
4078 movd xmm6, eax | 4081 movd xmm6, eax |
(...skipping 30 matching lines...) Expand all Loading... |
4109 jg convertloop8 | 4112 jg convertloop8 |
4110 | 4113 |
4111 pop edi | 4114 pop edi |
4112 pop esi | 4115 pop esi |
4113 ret | 4116 ret |
4114 } | 4117 } |
4115 } | 4118 } |
4116 #endif // HAS_BLENDPLANEROW_SSSE3 | 4119 #endif // HAS_BLENDPLANEROW_SSSE3 |
4117 | 4120 |
4118 #ifdef HAS_BLENDPLANEROW_AVX2 | 4121 #ifdef HAS_BLENDPLANEROW_AVX2 |
4119 // Blend 16 pixels at a time. | 4122 // Blend 32 pixels at a time. |
4120 // =((G2*C2)+(H2*(D2))+32768+127)/256 | 4123 // unsigned version of math |
| 4124 // =((A2*C2)+(B2*(255-C2))+255)/256 |
| 4125 // signed version of math |
| 4126 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 |
4121 __declspec(naked) | 4127 __declspec(naked) | // naked: the asm body below does its own register saves and ret.
4122 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, | 4128 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, | // per byte: dst = (src0*alpha + src1*(255-alpha) + 255) / 256, evaluated in the signed form.
4123 const uint8* alpha, uint8* dst, int width) { | 4129 const uint8* alpha, uint8* dst, int width) { |
4124 __asm { | 4130 __asm { |
4125 push esi | 4131 push esi | // esi and edi are saved here and restored just before ret.
4126 push edi | 4132 push edi |
4127 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 | 4133 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 |
4128 vpsllw ymm5, ymm5, 8 | 4134 vpsllw ymm5, ymm5, 8 | // ymm5 = 0xff00 in every 16-bit word.
4129 mov eax, 0x80808080 // 128 for biasing image to signed. | 4135 mov eax, 0x80808080 // 128 for biasing image to signed. |
4130 vmovd xmm6, eax | 4136 vmovd xmm6, eax |
4131 vbroadcastss ymm6, xmm6 | 4137 vbroadcastss ymm6, xmm6 | // ymm6 = 0x80 in every byte.
4132 mov eax, 0x807f807f // 32768 + 127 for unbias and round. | 4138 mov eax, 0x807f807f // 32768 + 127 for unbias and round. |
4133 vmovd xmm7, eax | 4139 vmovd xmm7, eax |
4134 vbroadcastss ymm7, xmm7 | 4140 vbroadcastss ymm7, xmm7 | // ymm7 = 0x807f in every 16-bit word.
4135 mov eax, [esp + 8 + 4] // src0 | 4141 mov eax, [esp + 8 + 4] // src0 | // the +8 in each offset accounts for the two pushes above.
4136 mov edx, [esp + 8 + 8] // src1 | 4142 mov edx, [esp + 8 + 8] // src1 |
4137 mov esi, [esp + 8 + 12] // alpha | 4143 mov esi, [esp + 8 + 12] // alpha |
4138 mov edi, [esp + 8 + 16] // dst | 4144 mov edi, [esp + 8 + 16] // dst |
4139 mov ecx, [esp + 8 + 20] // width | 4145 mov ecx, [esp + 8 + 20] // width |
4140 sub eax, esi | 4146 sub eax, esi | // eax, edx, edi become offsets from the alpha pointer, so advancing esi alone steps all four rows.
4141 sub edx, esi | 4147 sub edx, esi |
4142 sub edi, esi | 4148 sub edi, esi |
4143 | 4149 |
4144 // 16 pixel loop. | 4150 // 32 pixel loop. |
4145 convertloop16: | 4151 convertloop32: |
4146 vmovdqu xmm0, [esi] // alpha | 4152 vmovdqu ymm0, [esi] // alpha | // new: loads 32 alpha bytes per pass (old: 16).
4147 vpermq ymm0, ymm0, 0xd8 | 4153 vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 | // new: unpacking a register with itself duplicates each alpha byte into a 16-bit pair.
4148 vpunpcklbw ymm0, ymm0, ymm0 | 4154 vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 |
| 4155 vpxor ymm3, ymm3, ymm5 // a, 255-a | // xor with 0xff00 flips the high byte of each pair: a ^ 0xff = 255 - a.
4149 vpxor ymm0, ymm0, ymm5 // a, 255-a | 4156 vpxor ymm0, ymm0, ymm5 // a, 255-a |
4150 vmovdqu xmm1, [eax + esi] // src0 | 4157 vmovdqu ymm1, [eax + esi] // src0 | // eax + esi = current src0 position, since eax holds src0 - alpha.
4151 vmovdqu xmm2, [edx + esi] // src1 | 4158 vmovdqu ymm2, [edx + esi] // src1 |
4152 vpermq ymm1, ymm1, 0xd8 | 4159 vpunpckhbw ymm4, ymm1, ymm2 | // new: interleave into (src0, src1) byte pairs, high half of each 128-bit lane.
4153 vpermq ymm2, ymm2, 0xd8 | |
4154 vpunpcklbw ymm1, ymm1, ymm2 | 4160 vpunpcklbw ymm1, ymm1, ymm2 | // interleave into (src0, src1) byte pairs, low half of each 128-bit lane.
| 4161 vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 |
4155 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 | 4162 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 |
| 4163 vpmaddubsw ymm3, ymm3, ymm4 | // new: per pair, a*(src0-128) + (255-a)*(src1-128) as a signed 16-bit sum.
4156 vpmaddubsw ymm0, ymm0, ymm1 | 4164 vpmaddubsw ymm0, ymm0, ymm1 |
| 4165 vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. |
4157 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. | 4166 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. |
| 4167 vpsrlw ymm3, ymm3, 8 | // shift right by 8 is the /256 of the formula.
4158 vpsrlw ymm0, ymm0, 8 | 4168 vpsrlw ymm0, ymm0, 8 |
4159 vpackuswb ymm0, ymm0, ymm0 | 4169 vpackuswb ymm0, ymm0, ymm3 | // new: packs both halves of each lane back to bytes, already in pixel order, so the old vpermq fix-up is not needed.
4160 vpermq ymm0, ymm0, 0xd8 | 4170 vmovdqu [edi + esi], ymm0 | // new: stores 32 result bytes to dst.
4161 vmovdqu [edi + esi], xmm0 | 4171 lea esi, [esi + 32] | // NOTE(review): each pass consumes a full 32 bytes; presumably callers pass width as a multiple of 32 or handle the tail elsewhere - confirm.
4162 lea esi, [esi + 16] | 4172 sub ecx, 32 |
4163 sub ecx, 16 | 4173 jg convertloop32 | // new: loops while remaining width > 0; the test is at the bottom, so the body always runs at least once.
4164 jg convertloop16 | |
4165 | 4174 |
4166 pop edi | 4175 pop edi |
4167 pop esi | 4176 pop esi |
4168 vzeroupper | 4177 vzeroupper | // clears the upper ymm halves before returning to the caller.
4169 ret | 4178 ret |
4170 } | 4179 } |
4172 #endif // HAS_BLENDPLANEROW_AVX2 | 4181 #endif // HAS_BLENDPLANEROW_AVX2 |
4173 | 4182 |
4174 #ifdef HAS_ARGBBLENDROW_SSSE3 | 4183 #ifdef HAS_ARGBBLENDROW_SSSE3 |
(...skipping 2183 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6358 } | 6367 } |
6359 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6368 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6360 | 6369 |
6361 #endif // defined(_M_X64) | 6370 #endif // defined(_M_X64) |
6362 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6371 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6363 | 6372 |
6364 #ifdef __cplusplus | 6373 #ifdef __cplusplus |
6365 } // extern "C" | 6374 } // extern "C" |
6366 } // namespace libyuv | 6375 } // namespace libyuv |
6367 #endif | 6376 #endif |
OLD | NEW |