OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 507 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
518 // v * (256 + 8) | 518 // v * (256 + 8) |
519 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 | 519 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
520 __declspec(naked) | 520 __declspec(naked) |
521 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, | 521 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, |
522 int width) { | 522 int width) { |
523 __asm { | 523 __asm { |
524 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 524 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
525 vmovd xmm5, eax | 525 vmovd xmm5, eax |
526 vbroadcastss ymm5, xmm5 | 526 vbroadcastss ymm5, xmm5 |
527 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits | 527 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
528 movd xmm6, eax | 528 vmovd xmm6, eax |
529 vbroadcastss ymm6, xmm6 | 529 vbroadcastss ymm6, xmm6 |
530 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red | 530 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
531 vpsllw ymm3, ymm3, 11 | 531 vpsllw ymm3, ymm3, 11 |
532 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green | 532 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green |
533 vpsllw ymm4, ymm4, 10 | 533 vpsllw ymm4, ymm4, 10 |
534 vpsrlw ymm4, ymm4, 5 | 534 vpsrlw ymm4, ymm4, 5 |
535 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha | 535 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
536 vpsllw ymm7, ymm7, 8 | 536 vpsllw ymm7, ymm7, 8 |
537 | 537 |
538 mov eax, [esp + 4] // src_rgb565 | 538 mov eax, [esp + 4] // src_rgb565 |
(...skipping 30 matching lines...) Expand all Loading... |
569 | 569 |
570 #ifdef HAS_ARGB1555TOARGBROW_AVX2 | 570 #ifdef HAS_ARGB1555TOARGBROW_AVX2 |
571 __declspec(naked) | 571 __declspec(naked) |
572 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, | 572 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, |
573 int width) { | 573 int width) { |
574 __asm { | 574 __asm { |
575 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 575 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
576 vmovd xmm5, eax | 576 vmovd xmm5, eax |
577 vbroadcastss ymm5, xmm5 | 577 vbroadcastss ymm5, xmm5 |
578 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits | 578 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
579 movd xmm6, eax | 579 vmovd xmm6, eax |
580 vbroadcastss ymm6, xmm6 | 580 vbroadcastss ymm6, xmm6 |
581 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red | 581 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
582 vpsllw ymm3, ymm3, 11 | 582 vpsllw ymm3, ymm3, 11 |
583 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green | 583 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green |
584 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha | 584 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
585 vpsllw ymm7, ymm7, 8 | 585 vpsllw ymm7, ymm7, 8 |
586 | 586 |
587 mov eax, [esp + 4] // src_argb1555 | 587 mov eax, [esp + 4] // src_argb1555 |
588 mov edx, [esp + 8] // dst_argb | 588 mov edx, [esp + 8] // dst_argb |
589 mov ecx, [esp + 12] // width | 589 mov ecx, [esp + 12] // width |
(...skipping 3509 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4099 movq xmm2, qword ptr [edx + esi] // src1 | 4099 movq xmm2, qword ptr [edx + esi] // src1 |
4100 punpcklbw xmm1, xmm2 | 4100 punpcklbw xmm1, xmm2 |
4101 psubb xmm1, xmm6 // bias src0/1 - 128 | 4101 psubb xmm1, xmm6 // bias src0/1 - 128 |
4102 pmaddubsw xmm0, xmm1 | 4102 pmaddubsw xmm0, xmm1 |
4103 paddw xmm0, xmm7 // unbias result - 32768 and round. | 4103 paddw xmm0, xmm7 // unbias result - 32768 and round. |
4104 psrlw xmm0, 8 | 4104 psrlw xmm0, 8 |
4105 packuswb xmm0, xmm0 | 4105 packuswb xmm0, xmm0 |
4106 movq qword ptr [edi + esi], xmm0 | 4106 movq qword ptr [edi + esi], xmm0 |
4107 lea esi, [esi + 8] | 4107 lea esi, [esi + 8] |
4108 sub ecx, 8 | 4108 sub ecx, 8 |
4109 jge convertloop8 | 4109 jg convertloop8 |
4110 | 4110 |
4111 pop edi | 4111 pop edi |
4112 pop esi | 4112 pop esi |
4113 ret | 4113 ret |
4114 } | 4114 } |
4115 } | 4115 } |
4116 #endif // HAS_BLENDPLANEROW_SSSE3 | 4116 #endif // HAS_BLENDPLANEROW_SSSE3 |
4117 | 4117 |
| 4118 #ifdef HAS_BLENDPLANEROW_AVX2 |
| 4119 // Blend 16 pixels at a time: each dst byte mixes src0 weighted by alpha with src1 weighted by 255 - alpha. |
| 4120 // =((G2*C2)+(H2*(D2))+32768+127)/256 |
| 4121 __declspec(naked) |
| 4122 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, |
| 4123 const uint8* alpha, uint8* dst, int width) { |
| 4124 __asm { |
| 4125 push esi |
| 4126 push edi |
| 4127 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 (xor'd with alpha pairs in the loop) |
| 4128 vpsllw ymm5, ymm5, 8 |
| 4129 mov eax, 0x80808080 // 128 in every byte, for biasing image to signed. |
| 4130 vmovd xmm6, eax |
| 4131 vbroadcastss ymm6, xmm6 |
| 4132 mov eax, 0x807f807f // 32768 + 127 in every 16-bit word, for unbias and round. |
| 4133 vmovd xmm7, eax |
| 4134 vbroadcastss ymm7, xmm7 |
| 4135 mov eax, [esp + 8 + 4] // src0 (the extra 8 skips the two registers pushed above) |
| 4136 mov edx, [esp + 8 + 8] // src1 |
| 4137 mov esi, [esp + 8 + 12] // alpha |
| 4138 mov edi, [esp + 8 + 16] // dst |
| 4139 mov ecx, [esp + 8 + 20] // width |
| 4140 sub eax, esi |
| 4141 sub edx, esi |
| 4142 sub edi, esi |
| 4143 |
| 4144 // 16 pixel loop. eax/edx/edi now hold offsets from alpha, so esi alone indexes all four buffers. |
| 4145 convertloop16: |
| 4146 vmovdqu xmm0, [esi] // 16 alpha bytes |
| 4147 vpermq ymm0, ymm0, 0xd8 |
| 4148 vpunpcklbw ymm0, ymm0, ymm0 |
| 4149 vpxor ymm0, ymm0, ymm5 // each word: low byte a, high byte 255-a |
| 4150 vmovdqu xmm1, [eax + esi] // 16 bytes of src0 |
| 4151 vmovdqu xmm2, [edx + esi] // 16 bytes of src1 |
| 4152 vpermq ymm1, ymm1, 0xd8 |
| 4153 vpermq ymm2, ymm2, 0xd8 |
| 4154 vpunpcklbw ymm1, ymm1, ymm2 |
| 4155 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 so they act as the signed operand of vpmaddubsw |
| 4156 vpmaddubsw ymm0, ymm0, ymm1 |
| 4157 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. |
| 4158 vpsrlw ymm0, ymm0, 8 |
| 4159 vpackuswb ymm0, ymm0, ymm0 |
| 4160 vpermq ymm0, ymm0, 0xd8 |
| 4161 vmovdqu [edi + esi], xmm0 |
| 4162 lea esi, [esi + 16] |
| 4163 sub ecx, 16 |
| 4164 jg convertloop16 |
| 4165 |
| 4166 pop edi |
| 4167 pop esi |
| 4168 vzeroupper |
| 4169 ret |
| 4170 } |
| 4171 } |
| 4172 #endif // HAS_BLENDPLANEROW_AVX2 |
| 4173 |
4118 #ifdef HAS_ARGBBLENDROW_SSSE3 | 4174 #ifdef HAS_ARGBBLENDROW_SSSE3 |
4119 // Shuffle table for isolating alpha: selects source bytes 3, 7, 11, 15 (twice each), alternating with 0x80 (presumably a pshufb zero-fill - confirm at use site). | 4175 // Shuffle table for isolating alpha: selects source bytes 3, 7, 11, 15 (twice each), alternating with 0x80 (presumably a pshufb zero-fill - confirm at use site). |
4120 static const uvec8 kShuffleAlpha = { | 4176 static const uvec8 kShuffleAlpha = { |
4121 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, | 4177 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
4122 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | 4178 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
4123 }; | 4179 }; |
4124 | 4180 |
4125 // Blend 8 pixels at a time. | 4181 // Blend 8 pixels at a time. |
4126 __declspec(naked) | 4182 __declspec(naked) |
4127 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 4183 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
(...skipping 2174 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6302 } | 6358 } |
6303 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6359 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6304 | 6360 |
6305 #endif // defined(_M_X64) | 6361 #endif // defined(_M_X64) |
6306 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6362 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6307 | 6363 |
6308 #ifdef __cplusplus | 6364 #ifdef __cplusplus |
6309 } // extern "C" | 6365 } // extern "C" |
6310 } // namespace libyuv | 6366 } // namespace libyuv |
6311 #endif | 6367 #endif |
OLD | NEW |