Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(137)

Side by Side Diff: source/row_win.cc

Issue 1505433002: AVX2 YUV alpha blender and improved unittests (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: off by 1 fix on win Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_gcc.cc ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 507 matching lines...) Expand 10 before | Expand all | Expand 10 after
518 // v * (256 + 8) 518 // v * (256 + 8)
519 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 519 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
520 __declspec(naked) 520 __declspec(naked)
521 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, 521 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
522 int width) { 522 int width) {
523 __asm { 523 __asm {
524 mov eax, 0x01080108 // generate multiplier to repeat 5 bits 524 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
525 vmovd xmm5, eax 525 vmovd xmm5, eax
526 vbroadcastss ymm5, xmm5 526 vbroadcastss ymm5, xmm5
527 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits 527 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
528 movd xmm6, eax 528 vmovd xmm6, eax
529 vbroadcastss ymm6, xmm6 529 vbroadcastss ymm6, xmm6
530 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red 530 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
531 vpsllw ymm3, ymm3, 11 531 vpsllw ymm3, ymm3, 11
532 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green 532 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
533 vpsllw ymm4, ymm4, 10 533 vpsllw ymm4, ymm4, 10
534 vpsrlw ymm4, ymm4, 5 534 vpsrlw ymm4, ymm4, 5
535 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha 535 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
536 vpsllw ymm7, ymm7, 8 536 vpsllw ymm7, ymm7, 8
537 537
538 mov eax, [esp + 4] // src_rgb565 538 mov eax, [esp + 4] // src_rgb565
(...skipping 30 matching lines...) Expand all
569 569
570 #ifdef HAS_ARGB1555TOARGBROW_AVX2 570 #ifdef HAS_ARGB1555TOARGBROW_AVX2
571 __declspec(naked) 571 __declspec(naked)
572 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, 572 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
573 int width) { 573 int width) {
574 __asm { 574 __asm {
575 mov eax, 0x01080108 // generate multiplier to repeat 5 bits 575 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
576 vmovd xmm5, eax 576 vmovd xmm5, eax
577 vbroadcastss ymm5, xmm5 577 vbroadcastss ymm5, xmm5
578 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits 578 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
579 movd xmm6, eax 579 vmovd xmm6, eax
580 vbroadcastss ymm6, xmm6 580 vbroadcastss ymm6, xmm6
581 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red 581 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
582 vpsllw ymm3, ymm3, 11 582 vpsllw ymm3, ymm3, 11
583 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green 583 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
584 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha 584 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
585 vpsllw ymm7, ymm7, 8 585 vpsllw ymm7, ymm7, 8
586 586
587 mov eax, [esp + 4] // src_argb1555 587 mov eax, [esp + 4] // src_argb1555
588 mov edx, [esp + 8] // dst_argb 588 mov edx, [esp + 8] // dst_argb
589 mov ecx, [esp + 12] // width 589 mov ecx, [esp + 12] // width
(...skipping 3509 matching lines...) Expand 10 before | Expand all | Expand 10 after
4099 movq xmm2, qword ptr [edx + esi] // src1 4099 movq xmm2, qword ptr [edx + esi] // src1
4100 punpcklbw xmm1, xmm2 4100 punpcklbw xmm1, xmm2
4101 psubb xmm1, xmm6 // bias src0/1 - 128 4101 psubb xmm1, xmm6 // bias src0/1 - 128
4102 pmaddubsw xmm0, xmm1 4102 pmaddubsw xmm0, xmm1
4103 paddw xmm0, xmm7 // unbias result - 32768 and round. 4103 paddw xmm0, xmm7 // unbias result - 32768 and round.
4104 psrlw xmm0, 8 4104 psrlw xmm0, 8
4105 packuswb xmm0, xmm0 4105 packuswb xmm0, xmm0
4106 movq qword ptr [edi + esi], xmm0 4106 movq qword ptr [edi + esi], xmm0
4107 lea esi, [esi + 8] 4107 lea esi, [esi + 8]
4108 sub ecx, 8 4108 sub ecx, 8
4109 jge convertloop8 4109 jg convertloop8
4110 4110
4111 pop edi 4111 pop edi
4112 pop esi 4112 pop esi
4113 ret 4113 ret
4114 } 4114 }
4115 } 4115 }
4116 #endif // HAS_BLENDPLANEROW_SSSE3 4116 #endif // HAS_BLENDPLANEROW_SSSE3
4117 4117
4118 #ifdef HAS_BLENDPLANEROW_AVX2
4119 // Blend two 8-bit planes, 16 pixels per loop pass: src0 is weighted by alpha and src1 by (255 - alpha), then the sum is rounded and scaled back down to 8 bits.
4120 // Spreadsheet-style form of the per-pixel math: =((G2*C2)+(H2*(D2))+32768+127)/256 (the cell names presumably denote the two weights and the two biased sources - confirm).
4121 __declspec(naked) // no compiler prologue/epilogue: arguments are read straight off the stack below
4122 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
4123 const uint8* alpha, uint8* dst, int width) {
4124 __asm {
4125 push esi // esi and edi are used as pointers below, so save them
4126 push edi
4127 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 (all ones, then each word shifted left 8)
4128 vpsllw ymm5, ymm5, 8
4129 mov eax, 0x80808080 // 128 for biasing image to signed.
4130 vmovd xmm6, eax
4131 vbroadcastss ymm6, xmm6 // replicate the dword across the whole register
4132 mov eax, 0x807f807f // 32768 + 127 for unbias and round.
4133 vmovd xmm7, eax
4134 vbroadcastss ymm7, xmm7 // replicate the dword across the whole register
4135 mov eax, [esp + 8 + 4] // src0 (the extra 8 skips the two registers pushed above)
4136 mov edx, [esp + 8 + 8] // src1
4137 mov esi, [esp + 8 + 12] // alpha
4138 mov edi, [esp + 8 + 16] // dst
4139 mov ecx, [esp + 8 + 20] // width
4140 sub eax, esi // make src0, src1 and dst offsets relative to alpha,
4141 sub edx, esi // so advancing esi alone steps all four pointers
4142 sub edi, esi
4143
4144 // 16 pixel loop. The body always runs at least once and handles 16 bytes per pass; NOTE(review): width is presumably a positive multiple of 16 - confirm against callers.
4145 convertloop16:
4146 vmovdqu xmm0, [esi] // alpha
4147 vpermq ymm0, ymm0, 0xd8 // qword order 0,2,1,3: puts 8 alpha bytes in the low half of each 128-bit lane
4148 vpunpcklbw ymm0, ymm0, ymm0 // duplicate every alpha byte into a 16-bit word
4149 vpxor ymm0, ymm0, ymm5 // a, 255-a
4150 vmovdqu xmm1, [eax + esi] // src0
4151 vmovdqu xmm2, [edx + esi] // src1
4152 vpermq ymm1, ymm1, 0xd8 // same lane split as applied to alpha
4153 vpermq ymm2, ymm2, 0xd8
4154 vpunpcklbw ymm1, ymm1, ymm2 // interleave bytes: src0 in the low byte, src1 in the high byte of each word
4155 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
4156 vpmaddubsw ymm0, ymm0, ymm1 // per word: a * (src0 - 128) + (255 - a) * (src1 - 128); weights unsigned, sources signed
4157 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
4158 vpsrlw ymm0, ymm0, 8 // keep the high byte of each word
4159 vpackuswb ymm0, ymm0, ymm0 // words -> bytes within each 128-bit lane
4160 vpermq ymm0, ymm0, 0xd8 // gather the 8 result bytes of each lane into the low 128 bits
4161 vmovdqu [edi + esi], xmm0 // store 16 blended pixels to dst
4162 lea esi, [esi + 16] // advance to the next 16 pixels without touching flags
4163 sub ecx, 16
4164 jg convertloop16 // continue while pixels remain
4165
4166 pop edi
4167 pop esi
4168 vzeroupper // clear the upper halves of the ymm registers before returning
4169 ret
4170 }
4171 }
4172 #endif // HAS_BLENDPLANEROW_AVX2
4173
4118 #ifdef HAS_ARGBBLENDROW_SSSE3 4174 #ifdef HAS_ARGBBLENDROW_SSSE3
4119 // Shuffle table for isolating alpha. 4175 // Shuffle table for isolating alpha.
4120 static const uvec8 kShuffleAlpha = { 4176 static const uvec8 kShuffleAlpha = {
4121 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 4177 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4122 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 4178 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4123 }; 4179 };
4124 4180
4125 // Blend 8 pixels at a time. 4181 // Blend 8 pixels at a time.
4126 __declspec(naked) 4182 __declspec(naked)
4127 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 4183 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
(...skipping 2174 matching lines...) Expand 10 before | Expand all | Expand 10 after
6302 } 6358 }
6303 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6359 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6304 6360
6305 #endif // defined(_M_X64) 6361 #endif // defined(_M_X64)
6306 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6362 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6307 6363
6308 #ifdef __cplusplus 6364 #ifdef __cplusplus
6309 } // extern "C" 6365 } // extern "C"
6310 } // namespace libyuv 6366 } // namespace libyuv
6311 #endif 6367 #endif
OLDNEW
« no previous file with comments | « source/row_gcc.cc ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698