| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "libyuv/row.h" | 11 #include "libyuv/row.h" |
| 12 | 12 |
| 13 #if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) | 13 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \ |
| 14 defined(_MSC_VER) && !defined(__clang__) |
| 14 #include <emmintrin.h> | 15 #include <emmintrin.h> |
| 15 #include <tmmintrin.h> // For _mm_maddubs_epi16 | 16 #include <tmmintrin.h> // For _mm_maddubs_epi16 |
| 16 #endif | 17 #endif |
| 17 | 18 |
| 18 #ifdef __cplusplus | 19 #ifdef __cplusplus |
| 19 namespace libyuv { | 20 namespace libyuv { |
| 20 extern "C" { | 21 extern "C" { |
| 21 #endif | 22 #endif |
| 22 | 23 |
| 23 // This module is for Visual C. | 24 // This module is for Visual C. |
| 24 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ | 25 #if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \ |
| 25 (defined(_M_IX86) || defined(_M_X64)) | 26 defined(_MSC_VER) && !defined(__clang__) |
| 26 | |
| 27 // YUV to RGB conversion constants. | |
| 28 // Y contribution to R,G,B. Scale and bias. | |
| 29 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ | |
| 30 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ | |
| 31 | |
| 32 // U and V contributions to R,G,B. | |
| 33 #define UB -128 /* -min(128, round(2.018 * 64)) */ | |
| 34 #define UG 25 /* -round(-0.391 * 64) */ | |
| 35 #define VG 52 /* -round(-0.813 * 64) */ | |
| 36 #define VR -102 /* -round(1.596 * 64) */ | |
| 37 | |
| 38 // Bias values to subtract 16 from Y and 128 from U and V. | |
| 39 #define BB (UB * 128 - YGB) | |
| 40 #define BG (UG * 128 + VG * 128 - YGB) | |
| 41 #define BR (VR * 128 - YGB) | |
| 42 | 27 |
| 43 struct YuvConstants { | 28 struct YuvConstants { |
| 44 lvec8 kUVToB; // 0 | 29 lvec8 kUVToB; // 0 |
| 45 lvec8 kUVToG; // 32 | 30 lvec8 kUVToG; // 32 |
| 46 lvec8 kUVToR; // 64 | 31 lvec8 kUVToR; // 64 |
| 47 lvec16 kUVBiasB; // 96 | 32 lvec16 kUVBiasB; // 96 |
| 48 lvec16 kUVBiasG; // 128 | 33 lvec16 kUVBiasG; // 128 |
| 49 lvec16 kUVBiasR; // 160 | 34 lvec16 kUVBiasR; // 160 |
| 50 lvec16 kYToRgb; // 192 | 35 lvec16 kYToRgb; // 192 |
| 51 }; | 36 }; |
| 52 | 37 |
| 38 // BT.601 YUV to RGB reference |
| 39 // R = (Y - 16) * 1.164 - V * -1.596 |
| 40 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 |
| 41 // B = (Y - 16) * 1.164 - U * -2.018 |
| 42 |
| 43 // Y contribution to R,G,B. Scale and bias. |
| 44 // TODO(fbarchard): Consider moving constants into a common header. |
| 45 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ |
| 46 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ |
| 47 |
| 48 // U and V contributions to R,G,B. |
| 49 #define UB -128 /* max(-128, round(-2.018 * 64)) */ |
| 50 #define UG 25 /* round(0.391 * 64) */ |
| 51 #define VG 52 /* round(0.813 * 64) */ |
| 52 #define VR -102 /* round(-1.596 * 64) */ |
| 53 |
| 54 // Bias values to subtract 16 from Y and 128 from U and V. |
| 55 #define BB (UB * 128 + YGB) |
| 56 #define BG (UG * 128 + VG * 128 + YGB) |
| 57 #define BR (VR * 128 + YGB) |
| 58 |
| 53 // BT601 constants for YUV to RGB. | 59 // BT601 constants for YUV to RGB. |
| 54 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { | 60 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { |
| 55 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, | 61 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, |
| 56 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, | 62 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, |
| 57 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, | 63 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, |
| 58 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, | 64 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, |
| 59 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, | 65 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, |
| 60 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, | 66 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, |
| 61 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 67 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
| 62 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 68 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
| 63 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 69 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
| 64 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 70 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
| 65 }; | 71 }; |
| 66 | 72 |
| 67 // BT601 constants for NV21 where chroma plane is VU instead of UV. | 73 // BT601 constants for NV21 where chroma plane is VU instead of UV. |
| 68 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { | 74 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
| 69 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, | 75 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, |
| 70 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, | 76 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, |
| 71 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, | 77 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, |
| 72 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, | 78 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, |
| 73 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, | 79 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, |
| 74 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, | 80 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, |
| 75 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 81 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
| 76 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 82 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
| 77 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 83 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
| 78 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 84 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
| 79 }; | 85 }; |
| 80 | 86 |
| 87 #undef YG |
| 88 #undef YGB |
| 89 #undef UB |
| 90 #undef UG |
| 91 #undef VG |
| 92 #undef VR |
| 93 #undef BB |
| 94 #undef BG |
| 95 #undef BR |
| 96 |
| 97 // JPEG YUV to RGB reference |
| 98 // * R = Y - V * -1.40200 |
| 99 // * G = Y - U * 0.34414 - V * 0.71414 |
| 100 // * B = Y - U * -1.77200 |
| 101 |
| 102 // Y contribution to R,G,B. Scale and bias. |
| 103 // TODO(fbarchard): Consider moving constants into a common header. |
| 104 #define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ |
| 105 #define YGBJ 32 /* 64 / 2 */ |
| 106 |
| 107 // U and V contributions to R,G,B. |
| 108 #define UBJ -113 /* round(-1.77200 * 64) */ |
| 109 #define UGJ 22 /* round(0.34414 * 64) */ |
| 110 #define VGJ 46 /* round(0.71414 * 64) */ |
| 111 #define VRJ -90 /* round(-1.40200 * 64) */ |
| 112 |
| 113 // Bias values to subtract 16 from Y and 128 from U and V. |
| 114 #define BBJ (UBJ * 128 + YGBJ) |
| 115 #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) |
| 116 #define BRJ (VRJ * 128 + YGBJ) |
| 117 |
| 118 // JPEG constants for YUV to RGB. |
| 119 static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { |
| 120 { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, |
| 121 UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, |
| 122 { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 123 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 124 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 125 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, |
| 126 { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, |
| 127 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, |
| 128 { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, |
| 129 BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, |
| 130 { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, |
| 131 BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, |
| 132 { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, |
| 133 BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, |
| 134 { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, |
| 135 YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } |
| 136 }; |
| 137 |
| 138 #undef YGJ |
| 139 #undef YGBJ |
| 140 #undef UBJ |
| 141 #undef UGJ |
| 142 #undef VGJ |
| 143 #undef VRJ |
| 144 #undef BBJ |
| 145 #undef BGJ |
| 146 #undef BRJ |
| 147 |
| 81 // 64 bit | 148 // 64 bit |
| 82 #if defined(_M_X64) | 149 #if defined(_M_X64) |
| 83 | 150 #if defined(HAS_I422TOARGBROW_SSSE3) |
| 84 __declspec(align(16)) | |
| 85 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 151 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
| 86 const uint8* u_buf, | 152 const uint8* u_buf, |
| 87 const uint8* v_buf, | 153 const uint8* v_buf, |
| 88 uint8* dst_argb, | 154 uint8* dst_argb, |
| 89 int width) { | 155 int width) { |
| 90 __m128i xmm0, xmm1, xmm2, xmm3; | 156 __m128i xmm0, xmm1, xmm2, xmm3; |
| 91 const __m128i xmm5 = _mm_set1_epi8(-1); | 157 const __m128i xmm5 = _mm_set1_epi8(-1); |
| 92 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; | 158 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; |
| 93 | 159 |
| 94 while (width > 0) { | 160 while (width > 0) { |
| (...skipping 29 matching lines...) Expand all Loading... |
| 124 | 190 |
| 125 _mm_storeu_si128((__m128i *)dst_argb, xmm0); | 191 _mm_storeu_si128((__m128i *)dst_argb, xmm0); |
| 126 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); | 192 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); |
| 127 | 193 |
| 128 y_buf += 8; | 194 y_buf += 8; |
| 129 u_buf += 4; | 195 u_buf += 4; |
| 130 dst_argb += 32; | 196 dst_argb += 32; |
| 131 width -= 8; | 197 width -= 8; |
| 132 } | 198 } |
| 133 } | 199 } |
| 134 | 200 #endif |
| 135 // 32 bit | 201 // 32 bit |
| 136 #else // defined(_M_X64) | 202 #else // defined(_M_X64) |
| 137 | |
| 138 #ifdef HAS_ARGBTOYROW_SSSE3 | 203 #ifdef HAS_ARGBTOYROW_SSSE3 |
| 139 | 204 |
| 140 // Constants for ARGB. | 205 // Constants for ARGB. |
| 141 static const vec8 kARGBToY = { | 206 static const vec8 kARGBToY = { |
| 142 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 | 207 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 |
| 143 }; | 208 }; |
| 144 | 209 |
| 145 // JPeg full range. | 210 // JPeg full range. |
| 146 static const vec8 kARGBToYJ = { | 211 static const vec8 kARGBToYJ = { |
| 147 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 | 212 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 |
| (...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 250 static const uvec8 kShuffleMaskARGBToRGB24_0 = { | 315 static const uvec8 kShuffleMaskARGBToRGB24_0 = { |
| 251 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u | 316 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u |
| 252 }; | 317 }; |
| 253 | 318 |
| 254 // Shuffle table for converting ARGB to RAW. | 319 // Shuffle table for converting ARGB to RAW. |
| 255 static const uvec8 kShuffleMaskARGBToRAW_0 = { | 320 static const uvec8 kShuffleMaskARGBToRAW_0 = { |
| 256 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u | 321 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u |
| 257 }; | 322 }; |
| 258 | 323 |
| 259 // Duplicates gray value 3 times and fills in alpha opaque. | 324 // Duplicates gray value 3 times and fills in alpha opaque. |
| 260 __declspec(naked) __declspec(align(16)) | 325 __declspec(naked) |
| 261 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 326 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
| 262 __asm { | 327 __asm { |
| 263 mov eax, [esp + 4] // src_y | 328 mov eax, [esp + 4] // src_y |
| 264 mov edx, [esp + 8] // dst_argb | 329 mov edx, [esp + 8] // dst_argb |
| 265 mov ecx, [esp + 12] // pix | 330 mov ecx, [esp + 12] // pix |
| 266 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 331 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| 267 pslld xmm5, 24 | 332 pslld xmm5, 24 |
| 268 | 333 |
| 269 convertloop: | 334 convertloop: |
| 270 movq xmm0, qword ptr [eax] | 335 movq xmm0, qword ptr [eax] |
| 271 lea eax, [eax + 8] | 336 lea eax, [eax + 8] |
| 272 punpcklbw xmm0, xmm0 | 337 punpcklbw xmm0, xmm0 |
| 273 movdqa xmm1, xmm0 | 338 movdqa xmm1, xmm0 |
| 274 punpcklwd xmm0, xmm0 | 339 punpcklwd xmm0, xmm0 |
| 275 punpckhwd xmm1, xmm1 | 340 punpckhwd xmm1, xmm1 |
| 276 por xmm0, xmm5 | 341 por xmm0, xmm5 |
| 277 por xmm1, xmm5 | 342 por xmm1, xmm5 |
| 278 movdqu [edx], xmm0 | 343 movdqu [edx], xmm0 |
| 279 movdqu [edx + 16], xmm1 | 344 movdqu [edx + 16], xmm1 |
| 280 lea edx, [edx + 32] | 345 lea edx, [edx + 32] |
| 281 sub ecx, 8 | 346 sub ecx, 8 |
| 282 jg convertloop | 347 jg convertloop |
| 283 ret | 348 ret |
| 284 } | 349 } |
| 285 } | 350 } |
| 286 | 351 |
| 287 __declspec(naked) __declspec(align(16)) | 352 #ifdef HAS_J400TOARGBROW_AVX2 |
| 353 // Duplicates gray value 3 times and fills in alpha opaque. |
| 354 __declspec(naked) |
| 355 void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { |
| 356 __asm { |
| 357 mov eax, [esp + 4] // src_y |
| 358 mov edx, [esp + 8] // dst_argb |
| 359 mov ecx, [esp + 12] // pix |
| 360 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
| 361 vpslld ymm5, ymm5, 24 |
| 362 |
| 363 convertloop: |
| 364 vmovdqu xmm0, [eax] |
| 365 lea eax, [eax + 16] |
| 366 vpermq ymm0, ymm0, 0xd8 |
| 367 vpunpcklbw ymm0, ymm0, ymm0 |
| 368 vpermq ymm0, ymm0, 0xd8 |
| 369 vpunpckhwd ymm1, ymm0, ymm0 |
| 370 vpunpcklwd ymm0, ymm0, ymm0 |
| 371 vpor ymm0, ymm0, ymm5 |
| 372 vpor ymm1, ymm1, ymm5 |
| 373 vmovdqu [edx], ymm0 |
| 374 vmovdqu [edx + 32], ymm1 |
| 375 lea edx, [edx + 64] |
| 376 sub ecx, 16 |
| 377 jg convertloop |
| 378 vzeroupper |
| 379 ret |
| 380 } |
| 381 } |
| 382 #endif // HAS_J400TOARGBROW_AVX2 |
| 383 |
| 384 __declspec(naked) |
| 288 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 385 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
| 289 __asm { | 386 __asm { |
| 290 mov eax, [esp + 4] // src_rgb24 | 387 mov eax, [esp + 4] // src_rgb24 |
| 291 mov edx, [esp + 8] // dst_argb | 388 mov edx, [esp + 8] // dst_argb |
| 292 mov ecx, [esp + 12] // pix | 389 mov ecx, [esp + 12] // pix |
| 293 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 390 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| 294 pslld xmm5, 24 | 391 pslld xmm5, 24 |
| 295 movdqa xmm4, kShuffleMaskRGB24ToARGB | 392 movdqa xmm4, kShuffleMaskRGB24ToARGB |
| 296 | 393 |
| 297 convertloop: | 394 convertloop: |
| (...skipping 17 matching lines...) Expand all Loading... |
| 315 movdqu [edx + 16], xmm1 | 412 movdqu [edx + 16], xmm1 |
| 316 por xmm3, xmm5 | 413 por xmm3, xmm5 |
| 317 movdqu [edx + 48], xmm3 | 414 movdqu [edx + 48], xmm3 |
| 318 lea edx, [edx + 64] | 415 lea edx, [edx + 64] |
| 319 sub ecx, 16 | 416 sub ecx, 16 |
| 320 jg convertloop | 417 jg convertloop |
| 321 ret | 418 ret |
| 322 } | 419 } |
| 323 } | 420 } |
| 324 | 421 |
| 325 __declspec(naked) __declspec(align(16)) | 422 __declspec(naked) |
| 326 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, | 423 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, |
| 327 int pix) { | 424 int pix) { |
| 328 __asm { | 425 __asm { |
| 329 mov eax, [esp + 4] // src_raw | 426 mov eax, [esp + 4] // src_raw |
| 330 mov edx, [esp + 8] // dst_argb | 427 mov edx, [esp + 8] // dst_argb |
| 331 mov ecx, [esp + 12] // pix | 428 mov ecx, [esp + 12] // pix |
| 332 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 429 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| 333 pslld xmm5, 24 | 430 pslld xmm5, 24 |
| 334 movdqa xmm4, kShuffleMaskRAWToARGB | 431 movdqa xmm4, kShuffleMaskRAWToARGB |
| 335 | 432 |
| (...skipping 25 matching lines...) Expand all Loading... |
| 361 } | 458 } |
| 362 } | 459 } |
| 363 | 460 |
| 364 // pmul method to replicate bits. | 461 // pmul method to replicate bits. |
| 365 // Math to replicate bits: | 462 // Math to replicate bits: |
| 366 // (v << 8) | (v << 3) | 463 // (v << 8) | (v << 3) |
| 367 // v * 256 + v * 8 | 464 // v * 256 + v * 8 |
| 368 // v * (256 + 8) | 465 // v * (256 + 8) |
| 369 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 | 466 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
| 370 // 20 instructions. | 467 // 20 instructions. |
| 371 __declspec(naked) __declspec(align(16)) | 468 __declspec(naked) |
| 372 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, | 469 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, |
| 373 int pix) { | 470 int pix) { |
| 374 __asm { | 471 __asm { |
| 375 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 472 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 376 movd xmm5, eax | 473 movd xmm5, eax |
| 377 pshufd xmm5, xmm5, 0 | 474 pshufd xmm5, xmm5, 0 |
| 378 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits | 475 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| 379 movd xmm6, eax | 476 movd xmm6, eax |
| 380 pshufd xmm6, xmm6, 0 | 477 pshufd xmm6, xmm6, 0 |
| 381 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red | 478 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
| (...skipping 28 matching lines...) Expand all Loading... |
| 410 punpckhbw xmm2, xmm0 | 507 punpckhbw xmm2, xmm0 |
| 411 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB | 508 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
| 412 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB | 509 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
| 413 lea eax, [eax + 16] | 510 lea eax, [eax + 16] |
| 414 sub ecx, 8 | 511 sub ecx, 8 |
| 415 jg convertloop | 512 jg convertloop |
| 416 ret | 513 ret |
| 417 } | 514 } |
| 418 } | 515 } |
| 419 | 516 |
| 517 #ifdef HAS_RGB565TOARGBROW_AVX2 |
| 518 // pmul method to replicate bits. |
| 519 // Math to replicate bits: |
| 520 // (v << 8) | (v << 3) |
| 521 // v * 256 + v * 8 |
| 522 // v * (256 + 8) |
| 523 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
| 524 __declspec(naked) |
| 525 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, |
| 526 int pix) { |
| 527 __asm { |
| 528 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 529 vmovd xmm5, eax |
| 530 vbroadcastss ymm5, xmm5 |
| 531 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| 532 movd xmm6, eax |
| 533 vbroadcastss ymm6, xmm6 |
| 534 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| 535 vpsllw ymm3, ymm3, 11 |
| 536 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green |
| 537 vpsllw ymm4, ymm4, 10 |
| 538 vpsrlw ymm4, ymm4, 5 |
| 539 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| 540 vpsllw ymm7, ymm7, 8 |
| 541 |
| 542 mov eax, [esp + 4] // src_rgb565 |
| 543 mov edx, [esp + 8] // dst_argb |
| 544 mov ecx, [esp + 12] // pix |
| 545 sub edx, eax |
| 546 sub edx, eax |
| 547 |
| 548 convertloop: |
| 549 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 |
| 550 vpand ymm1, ymm0, ymm3 // R in upper 5 bits |
| 551 vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| 552 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| 553 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| 554 vpsllw ymm1, ymm1, 8 |
| 555 vpor ymm1, ymm1, ymm2 // RB |
| 556 vpand ymm0, ymm0, ymm4 // G in middle 6 bits |
| 557 vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) |
| 558 vpor ymm0, ymm0, ymm7 // AG |
| 559 vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| 560 vpermq ymm1, ymm1, 0xd8 |
| 561 vpunpckhbw ymm2, ymm1, ymm0 |
| 562 vpunpcklbw ymm1, ymm1, ymm0 |
| 563 vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB |
| 564 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB |
| 565 lea eax, [eax + 32] |
| 566 sub ecx, 16 |
| 567 jg convertloop |
| 568 vzeroupper |
| 569 ret |
| 570 } |
| 571 } |
| 572 #endif // HAS_RGB565TOARGBROW_AVX2 |
| 573 |
| 574 #ifdef HAS_ARGB1555TOARGBROW_AVX2 |
| 575 __declspec(naked) |
| 576 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, |
| 577 int pix) { |
| 578 __asm { |
| 579 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 580 vmovd xmm5, eax |
| 581 vbroadcastss ymm5, xmm5 |
| 582 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| 583 movd xmm6, eax |
| 584 vbroadcastss ymm6, xmm6 |
| 585 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| 586 vpsllw ymm3, ymm3, 11 |
| 587 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green |
| 588 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| 589 vpsllw ymm7, ymm7, 8 |
| 590 |
| 591 mov eax, [esp + 4] // src_argb1555 |
| 592 mov edx, [esp + 8] // dst_argb |
| 593 mov ecx, [esp + 12] // pix |
| 594 sub edx, eax |
| 595 sub edx, eax |
| 596 |
| 597 convertloop: |
| 598 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 |
| 599 vpsllw ymm1, ymm0, 1 // R in upper 5 bits |
| 600 vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| 601 vpand ymm1, ymm1, ymm3 |
| 602 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| 603 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| 604 vpsllw ymm1, ymm1, 8 |
| 605 vpor ymm1, ymm1, ymm2 // RB |
| 606 vpsraw ymm2, ymm0, 8 // A |
| 607 vpand ymm0, ymm0, ymm4 // G in middle 5 bits |
| 608 vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) |
| 609 vpand ymm2, ymm2, ymm7 |
| 610 vpor ymm0, ymm0, ymm2 // AG |
| 611 vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| 612 vpermq ymm1, ymm1, 0xd8 |
| 613 vpunpckhbw ymm2, ymm1, ymm0 |
| 614 vpunpcklbw ymm1, ymm1, ymm0 |
| 615 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB |
| 616 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB |
| 617 lea eax, [eax + 32] |
| 618 sub ecx, 16 |
| 619 jg convertloop |
| 620 vzeroupper |
| 621 ret |
| 622 } |
| 623 } |
| 624 #endif // HAS_ARGB1555TOARGBROW_AVX2 |
| 625 |
| 626 #ifdef HAS_ARGB4444TOARGBROW_AVX2 |
| 627 __declspec(naked) |
| 628 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, |
| 629 int pix) { |
| 630 __asm { |
| 631 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
| 632 vmovd xmm4, eax |
| 633 vbroadcastss ymm4, xmm4 |
| 634 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles |
| 635 mov eax, [esp + 4] // src_argb4444 |
| 636 mov edx, [esp + 8] // dst_argb |
| 637 mov ecx, [esp + 12] // pix |
| 638 sub edx, eax |
| 639 sub edx, eax |
| 640 |
| 641 convertloop: |
| 642 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 |
| 643 vpand ymm2, ymm0, ymm5 // mask high nibbles |
| 644 vpand ymm0, ymm0, ymm4 // mask low nibbles |
| 645 vpsrlw ymm3, ymm2, 4 |
| 646 vpsllw ymm1, ymm0, 4 |
| 647 vpor ymm2, ymm2, ymm3 |
| 648 vpor ymm0, ymm0, ymm1 |
| 649 vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| 650 vpermq ymm2, ymm2, 0xd8 |
| 651 vpunpckhbw ymm1, ymm0, ymm2 |
| 652 vpunpcklbw ymm0, ymm0, ymm2 |
| 653 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB |
| 654 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB |
| 655 lea eax, [eax + 32] |
| 656 sub ecx, 16 |
| 657 jg convertloop |
| 658 vzeroupper |
| 659 ret |
| 660 } |
| 661 } |
| 662 #endif // HAS_ARGB4444TOARGBROW_AVX2 |
| 663 |
| 420 // 24 instructions | 664 // 24 instructions |
| 421 __declspec(naked) __declspec(align(16)) | 665 __declspec(naked) |
| 422 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, | 666 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, |
| 423 int pix) { | 667 int pix) { |
| 424 __asm { | 668 __asm { |
| 425 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 669 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 426 movd xmm5, eax | 670 movd xmm5, eax |
| 427 pshufd xmm5, xmm5, 0 | 671 pshufd xmm5, xmm5, 0 |
| 428 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits | 672 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| 429 movd xmm6, eax | 673 movd xmm6, eax |
| 430 pshufd xmm6, xmm6, 0 | 674 pshufd xmm6, xmm6, 0 |
| 431 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red | 675 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 464 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB | 708 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
| 465 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB | 709 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
| 466 lea eax, [eax + 16] | 710 lea eax, [eax + 16] |
| 467 sub ecx, 8 | 711 sub ecx, 8 |
| 468 jg convertloop | 712 jg convertloop |
| 469 ret | 713 ret |
| 470 } | 714 } |
| 471 } | 715 } |
| 472 | 716 |
| 473 // 18 instructions. | 717 // 18 instructions. |
| 474 __declspec(naked) __declspec(align(16)) | 718 __declspec(naked) |
| 475 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, | 719 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, |
| 476 int pix) { | 720 int pix) { |
| 477 __asm { | 721 __asm { |
| 478 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f | 722 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
| 479 movd xmm4, eax | 723 movd xmm4, eax |
| 480 pshufd xmm4, xmm4, 0 | 724 pshufd xmm4, xmm4, 0 |
| 481 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles | 725 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles |
| 482 pslld xmm5, 4 | 726 pslld xmm5, 4 |
| 483 mov eax, [esp + 4] // src_argb4444 | 727 mov eax, [esp + 4] // src_argb4444 |
| 484 mov edx, [esp + 8] // dst_argb | 728 mov edx, [esp + 8] // dst_argb |
| (...skipping 17 matching lines...) Expand all Loading... |
| 502 punpckhbw xmm1, xmm2 | 746 punpckhbw xmm1, xmm2 |
| 503 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB | 747 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB |
| 504 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB | 748 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB |
| 505 lea eax, [eax + 16] | 749 lea eax, [eax + 16] |
| 506 sub ecx, 8 | 750 sub ecx, 8 |
| 507 jg convertloop | 751 jg convertloop |
| 508 ret | 752 ret |
| 509 } | 753 } |
| 510 } | 754 } |
| 511 | 755 |
| 512 __declspec(naked) __declspec(align(16)) | 756 __declspec(naked) |
| 513 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 757 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 514 __asm { | 758 __asm { |
| 515 mov eax, [esp + 4] // src_argb | 759 mov eax, [esp + 4] // src_argb |
| 516 mov edx, [esp + 8] // dst_rgb | 760 mov edx, [esp + 8] // dst_rgb |
| 517 mov ecx, [esp + 12] // pix | 761 mov ecx, [esp + 12] // pix |
| 518 movdqa xmm6, kShuffleMaskARGBToRGB24 | 762 movdqa xmm6, kShuffleMaskARGBToRGB24 |
| 519 | 763 |
| 520 convertloop: | 764 convertloop: |
| 521 movdqu xmm0, [eax] // fetch 16 pixels of argb | 765 movdqu xmm0, [eax] // fetch 16 pixels of argb |
| 522 movdqu xmm1, [eax + 16] | 766 movdqu xmm1, [eax + 16] |
| (...skipping 17 matching lines...) Expand all Loading... |
| 540 por xmm2, xmm3 // 12 bytes from 3 for 2 | 784 por xmm2, xmm3 // 12 bytes from 3 for 2 |
| 541 movdqu [edx + 16], xmm1 // store 1 | 785 movdqu [edx + 16], xmm1 // store 1 |
| 542 movdqu [edx + 32], xmm2 // store 2 | 786 movdqu [edx + 32], xmm2 // store 2 |
| 543 lea edx, [edx + 48] | 787 lea edx, [edx + 48] |
| 544 sub ecx, 16 | 788 sub ecx, 16 |
| 545 jg convertloop | 789 jg convertloop |
| 546 ret | 790 ret |
| 547 } | 791 } |
| 548 } | 792 } |
| 549 | 793 |
| 550 __declspec(naked) __declspec(align(16)) | 794 __declspec(naked) |
| 551 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 795 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 552 __asm { | 796 __asm { |
| 553 mov eax, [esp + 4] // src_argb | 797 mov eax, [esp + 4] // src_argb |
| 554 mov edx, [esp + 8] // dst_rgb | 798 mov edx, [esp + 8] // dst_rgb |
| 555 mov ecx, [esp + 12] // pix | 799 mov ecx, [esp + 12] // pix |
| 556 movdqa xmm6, kShuffleMaskARGBToRAW | 800 movdqa xmm6, kShuffleMaskARGBToRAW |
| 557 | 801 |
| 558 convertloop: | 802 convertloop: |
| 559 movdqu xmm0, [eax] // fetch 16 pixels of argb | 803 movdqu xmm0, [eax] // fetch 16 pixels of argb |
| 560 movdqu xmm1, [eax + 16] | 804 movdqu xmm1, [eax + 16] |
| (...skipping 17 matching lines...) Expand all Loading... |
| 578 por xmm2, xmm3 // 12 bytes from 3 for 2 | 822 por xmm2, xmm3 // 12 bytes from 3 for 2 |
| 579 movdqu [edx + 16], xmm1 // store 1 | 823 movdqu [edx + 16], xmm1 // store 1 |
| 580 movdqu [edx + 32], xmm2 // store 2 | 824 movdqu [edx + 32], xmm2 // store 2 |
| 581 lea edx, [edx + 48] | 825 lea edx, [edx + 48] |
| 582 sub ecx, 16 | 826 sub ecx, 16 |
| 583 jg convertloop | 827 jg convertloop |
| 584 ret | 828 ret |
| 585 } | 829 } |
| 586 } | 830 } |
| 587 | 831 |
| 588 __declspec(naked) __declspec(align(16)) | 832 // 4 pixels |
| 833 __declspec(naked) |
| 589 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 834 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 590 __asm { | 835 __asm { |
| 591 mov eax, [esp + 4] // src_argb | 836 mov eax, [esp + 4] // src_argb |
| 592 mov edx, [esp + 8] // dst_rgb | 837 mov edx, [esp + 8] // dst_rgb |
| 593 mov ecx, [esp + 12] // pix | 838 mov ecx, [esp + 12] // pix |
| 594 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f | 839 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
| 595 psrld xmm3, 27 | 840 psrld xmm3, 27 |
| 596 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 | 841 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
| 597 psrld xmm4, 26 | 842 psrld xmm4, 26 |
| 598 pslld xmm4, 5 | 843 pslld xmm4, 5 |
| (...skipping 16 matching lines...) Expand all Loading... |
| 615 packssdw xmm0, xmm0 | 860 packssdw xmm0, xmm0 |
| 616 lea eax, [eax + 16] | 861 lea eax, [eax + 16] |
| 617 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 | 862 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
| 618 lea edx, [edx + 8] | 863 lea edx, [edx + 8] |
| 619 sub ecx, 4 | 864 sub ecx, 4 |
| 620 jg convertloop | 865 jg convertloop |
| 621 ret | 866 ret |
| 622 } | 867 } |
| 623 } | 868 } |
| 624 | 869 |
| 870 // 8 pixels |
| 871 __declspec(naked) |
| 872 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, |
| 873 const uint32 dither4, int pix) { |
| 874 __asm { |
| 875 |
| 876 mov eax, [esp + 4] // src_argb |
| 877 mov edx, [esp + 8] // dst_rgb |
| 878 movd xmm6, [esp + 12] // dither4 |
| 879 mov ecx, [esp + 16] // pix |
| 880 punpcklbw xmm6, xmm6 // make dither 16 bytes |
| 881 movdqa xmm7, xmm6 |
| 882 punpcklwd xmm6, xmm6 |
| 883 punpckhwd xmm7, xmm7 |
| 884 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
| 885 psrld xmm3, 27 |
| 886 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
| 887 psrld xmm4, 26 |
| 888 pslld xmm4, 5 |
| 889 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
| 890 pslld xmm5, 11 |
| 891 |
| 892 convertloop: |
| 893 movdqu xmm0, [eax] // fetch 4 pixels of argb |
| 894 paddusb xmm0, xmm6 // add dither |
| 895 movdqa xmm1, xmm0 // B |
| 896 movdqa xmm2, xmm0 // G |
| 897 pslld xmm0, 8 // R |
| 898 psrld xmm1, 3 // B |
| 899 psrld xmm2, 5 // G |
| 900 psrad xmm0, 16 // R |
| 901 pand xmm1, xmm3 // B |
| 902 pand xmm2, xmm4 // G |
| 903 pand xmm0, xmm5 // R |
| 904 por xmm1, xmm2 // BG |
| 905 por xmm0, xmm1 // BGR |
| 906 packssdw xmm0, xmm0 |
| 907 lea eax, [eax + 16] |
| 908 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
| 909 lea edx, [edx + 8] |
| 910 sub ecx, 4 |
| 911 jg convertloop |
| 912 ret |
| 913 } |
| 914 } |
| 915 |
| 916 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 |
| 917 __declspec(naked) |
| 918 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, |
| 919 const uint32 dither4, int pix) { |
| 920 __asm { |
| 921 mov eax, [esp + 4] // src_argb |
| 922 mov edx, [esp + 8] // dst_rgb |
| 923 vbroadcastss xmm6, [esp + 12] // dither4 |
| 924 mov ecx, [esp + 16] // pix |
| 925 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes |
| 926 vpermq ymm6, ymm6, 0xd8 |
| 927 vpunpcklwd ymm6, ymm6, ymm6 |
| 928 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
| 929 vpsrld ymm3, ymm3, 27 |
| 930 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
| 931 vpsrld ymm4, ymm4, 26 |
| 932 vpslld ymm4, ymm4, 5 |
| 933 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
| 934 |
| 935 convertloop: |
| 936 vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| 937 vpaddusb ymm0, ymm0, ymm6 // add dither |
| 938 vpsrld ymm2, ymm0, 5 // G |
| 939 vpsrld ymm1, ymm0, 3 // B |
| 940 vpsrld ymm0, ymm0, 8 // R |
| 941 vpand ymm2, ymm2, ymm4 // G |
| 942 vpand ymm1, ymm1, ymm3 // B |
| 943 vpand ymm0, ymm0, ymm5 // R |
| 944 vpor ymm1, ymm1, ymm2 // BG |
| 945 vpor ymm0, ymm0, ymm1 // BGR |
| 946 vpackusdw ymm0, ymm0, ymm0 |
| 947 vpermq ymm0, ymm0, 0xd8 |
| 948 lea eax, [eax + 32] |
| 949 vmovdqu [edx], xmm0 // store 8 pixels of RGB565 |
| 950 lea edx, [edx + 16] |
| 951 sub ecx, 8 |
| 952 jg convertloop |
| 953 vzeroupper |
| 954 ret |
| 955 } |
| 956 } |
| 957 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 |
| 958 |
| 625 // TODO(fbarchard): Improve sign extension/packing. | 959 // TODO(fbarchard): Improve sign extension/packing. |
| 626 __declspec(naked) __declspec(align(16)) | 960 __declspec(naked) |
| 627 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 961 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 628 __asm { | 962 __asm { |
| 629 mov eax, [esp + 4] // src_argb | 963 mov eax, [esp + 4] // src_argb |
| 630 mov edx, [esp + 8] // dst_rgb | 964 mov edx, [esp + 8] // dst_rgb |
| 631 mov ecx, [esp + 12] // pix | 965 mov ecx, [esp + 12] // pix |
| 632 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f | 966 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f |
| 633 psrld xmm4, 27 | 967 psrld xmm4, 27 |
| 634 movdqa xmm5, xmm4 // generate mask 0x000003e0 | 968 movdqa xmm5, xmm4 // generate mask 0x000003e0 |
| 635 pslld xmm5, 5 | 969 pslld xmm5, 5 |
| 636 movdqa xmm6, xmm4 // generate mask 0x00007c00 | 970 movdqa xmm6, xmm4 // generate mask 0x00007c00 |
| (...skipping 20 matching lines...) Expand all Loading... |
| 657 packssdw xmm0, xmm0 | 991 packssdw xmm0, xmm0 |
| 658 lea eax, [eax + 16] | 992 lea eax, [eax + 16] |
| 659 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 | 993 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 |
| 660 lea edx, [edx + 8] | 994 lea edx, [edx + 8] |
| 661 sub ecx, 4 | 995 sub ecx, 4 |
| 662 jg convertloop | 996 jg convertloop |
| 663 ret | 997 ret |
| 664 } | 998 } |
| 665 } | 999 } |
| 666 | 1000 |
| 667 __declspec(naked) __declspec(align(16)) | 1001 __declspec(naked) |
| 668 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1002 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 669 __asm { | 1003 __asm { |
| 670 mov eax, [esp + 4] // src_argb | 1004 mov eax, [esp + 4] // src_argb |
| 671 mov edx, [esp + 8] // dst_rgb | 1005 mov edx, [esp + 8] // dst_rgb |
| 672 mov ecx, [esp + 12] // pix | 1006 mov ecx, [esp + 12] // pix |
| 673 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 | 1007 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 |
| 674 psllw xmm4, 12 | 1008 psllw xmm4, 12 |
| 675 movdqa xmm3, xmm4 // generate mask 0x00f000f0 | 1009 movdqa xmm3, xmm4 // generate mask 0x00f000f0 |
| 676 psrlw xmm3, 8 | 1010 psrlw xmm3, 8 |
| 677 | 1011 |
| 678 convertloop: | 1012 convertloop: |
| 679 movdqu xmm0, [eax] // fetch 4 pixels of argb | 1013 movdqu xmm0, [eax] // fetch 4 pixels of argb |
| 680 movdqa xmm1, xmm0 | 1014 movdqa xmm1, xmm0 |
| 681 pand xmm0, xmm3 // low nibble | 1015 pand xmm0, xmm3 // low nibble |
| 682 pand xmm1, xmm4 // high nibble | 1016 pand xmm1, xmm4 // high nibble |
| 683 psrld xmm0, 4 | 1017 psrld xmm0, 4 |
| 684 psrld xmm1, 8 | 1018 psrld xmm1, 8 |
| 685 por xmm0, xmm1 | 1019 por xmm0, xmm1 |
| 686 packuswb xmm0, xmm0 | 1020 packuswb xmm0, xmm0 |
| 687 lea eax, [eax + 16] | 1021 lea eax, [eax + 16] |
| 688 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 | 1022 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 |
| 689 lea edx, [edx + 8] | 1023 lea edx, [edx + 8] |
| 690 sub ecx, 4 | 1024 sub ecx, 4 |
| 691 jg convertloop | 1025 jg convertloop |
| 692 ret | 1026 ret |
| 693 } | 1027 } |
| 694 } | 1028 } |
| 695 | 1029 |
| 696 #ifdef HAS_ARGBTORGB565ROW_AVX2 | 1030 #ifdef HAS_ARGBTORGB565ROW_AVX2 |
| 697 __declspec(naked) __declspec(align(16)) | 1031 __declspec(naked) |
| 698 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1032 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 699 __asm { | 1033 __asm { |
| 700 mov eax, [esp + 4] // src_argb | 1034 mov eax, [esp + 4] // src_argb |
| 701 mov edx, [esp + 8] // dst_rgb | 1035 mov edx, [esp + 8] // dst_rgb |
| 702 mov ecx, [esp + 12] // pix | 1036 mov ecx, [esp + 12] // pix |
| 703 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f | 1037 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
| 704 vpsrld ymm3, ymm3, 27 | 1038 vpsrld ymm3, ymm3, 27 |
| 705 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 | 1039 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
| 706 vpsrld ymm4, ymm4, 26 | 1040 vpsrld ymm4, ymm4, 26 |
| 707 vpslld ymm4, ymm4, 5 | 1041 vpslld ymm4, ymm4, 5 |
| 708 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800 | 1042 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
| 709 vpslld ymm5, ymm5, 11 | |
| 710 | 1043 |
| 711 convertloop: | 1044 convertloop: |
| 712 vmovdqu ymm0, [eax] // fetch 8 pixels of argb | 1045 vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| 713 vpsrld ymm2, ymm0, 5 // G | 1046 vpsrld ymm2, ymm0, 5 // G |
| 714 vpsrld ymm1, ymm0, 3 // B | 1047 vpsrld ymm1, ymm0, 3 // B |
| 715 vpslld ymm0, ymm0, 8 // R | 1048 vpsrld ymm0, ymm0, 8 // R |
| 716 vpand ymm2, ymm2, ymm4 // G | 1049 vpand ymm2, ymm2, ymm4 // G |
| 717 vpand ymm1, ymm1, ymm3 // B | 1050 vpand ymm1, ymm1, ymm3 // B |
| 718 vpsrad ymm0, ymm0, 16 // R | |
| 719 vpand ymm0, ymm0, ymm5 // R | 1051 vpand ymm0, ymm0, ymm5 // R |
| 720 vpor ymm1, ymm1, ymm2 // BG | 1052 vpor ymm1, ymm1, ymm2 // BG |
| 721 vpor ymm0, ymm0, ymm1 // BGR | 1053 vpor ymm0, ymm0, ymm1 // BGR |
| 722 vpackssdw ymm0, ymm0, ymm0 | 1054 vpackusdw ymm0, ymm0, ymm0 |
| 723 vpermq ymm0, ymm0, 0xd8 | 1055 vpermq ymm0, ymm0, 0xd8 |
| 724 lea eax, [eax + 32] | 1056 lea eax, [eax + 32] |
| 725 vmovdqu [edx], xmm0 // store 8 pixels of RGB565 | 1057 vmovdqu [edx], xmm0 // store 8 pixels of RGB565 |
| 726 lea edx, [edx + 16] | 1058 lea edx, [edx + 16] |
| 727 sub ecx, 8 | 1059 sub ecx, 8 |
| 728 jg convertloop | 1060 jg convertloop |
| 729 vzeroupper | 1061 vzeroupper |
| 730 ret | 1062 ret |
| 731 } | 1063 } |
| 732 } | 1064 } |
| 733 #endif // HAS_ARGBTORGB565ROW_AVX2 | 1065 #endif // HAS_ARGBTORGB565ROW_AVX2 |
| 734 | 1066 |
| 735 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 | 1067 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 |
| 736 __declspec(naked) __declspec(align(16)) | 1068 __declspec(naked) |
| 737 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1069 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 738 __asm { | 1070 __asm { |
| 739 mov eax, [esp + 4] // src_argb | 1071 mov eax, [esp + 4] // src_argb |
| 740 mov edx, [esp + 8] // dst_rgb | 1072 mov edx, [esp + 8] // dst_rgb |
| 741 mov ecx, [esp + 12] // pix | 1073 mov ecx, [esp + 12] // pix |
| 742 vpcmpeqb ymm4, ymm4, ymm4 | 1074 vpcmpeqb ymm4, ymm4, ymm4 |
| 743 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f | 1075 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f |
| 744 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 | 1076 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 |
| 745 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 | 1077 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 |
| 746 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 | 1078 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 |
| (...skipping 19 matching lines...) Expand all Loading... |
| 766 lea edx, [edx + 16] | 1098 lea edx, [edx + 16] |
| 767 sub ecx, 8 | 1099 sub ecx, 8 |
| 768 jg convertloop | 1100 jg convertloop |
| 769 vzeroupper | 1101 vzeroupper |
| 770 ret | 1102 ret |
| 771 } | 1103 } |
| 772 } | 1104 } |
| 773 #endif // HAS_ARGBTOARGB1555ROW_AVX2 | 1105 #endif // HAS_ARGBTOARGB1555ROW_AVX2 |
| 774 | 1106 |
| 775 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 | 1107 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 |
| 776 __declspec(naked) __declspec(align(16)) | 1108 __declspec(naked) |
| 777 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1109 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 778 __asm { | 1110 __asm { |
| 779 mov eax, [esp + 4] // src_argb | 1111 mov eax, [esp + 4] // src_argb |
| 780 mov edx, [esp + 8] // dst_rgb | 1112 mov edx, [esp + 8] // dst_rgb |
| 781 mov ecx, [esp + 12] // pix | 1113 mov ecx, [esp + 12] // pix |
| 782 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 | 1114 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 |
| 783 vpsllw ymm4, ymm4, 12 | 1115 vpsllw ymm4, ymm4, 12 |
| 784 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 | 1116 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 |
| 785 | 1117 |
| 786 convertloop: | 1118 convertloop: |
| (...skipping 10 matching lines...) Expand all Loading... |
| 797 lea edx, [edx + 16] | 1129 lea edx, [edx + 16] |
| 798 sub ecx, 8 | 1130 sub ecx, 8 |
| 799 jg convertloop | 1131 jg convertloop |
| 800 vzeroupper | 1132 vzeroupper |
| 801 ret | 1133 ret |
| 802 } | 1134 } |
| 803 } | 1135 } |
| 804 #endif // HAS_ARGBTOARGB4444ROW_AVX2 | 1136 #endif // HAS_ARGBTOARGB4444ROW_AVX2 |
| 805 | 1137 |
| 806 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. | 1138 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. |
| 807 __declspec(naked) __declspec(align(16)) | 1139 __declspec(naked) |
| 808 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1140 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| 809 __asm { | 1141 __asm { |
| 810 mov eax, [esp + 4] /* src_argb */ | 1142 mov eax, [esp + 4] /* src_argb */ |
| 811 mov edx, [esp + 8] /* dst_y */ | 1143 mov edx, [esp + 8] /* dst_y */ |
| 812 mov ecx, [esp + 12] /* pix */ | 1144 mov ecx, [esp + 12] /* pix */ |
| 813 movdqa xmm4, kARGBToY | 1145 movdqa xmm4, kARGBToY |
| 814 movdqa xmm5, kAddY16 | 1146 movdqa xmm5, kAddY16 |
| 815 | 1147 |
| 816 convertloop: | 1148 convertloop: |
| 817 movdqu xmm0, [eax] | 1149 movdqu xmm0, [eax] |
| (...skipping 14 matching lines...) Expand all Loading... |
| 832 movdqu [edx], xmm0 | 1164 movdqu [edx], xmm0 |
| 833 lea edx, [edx + 16] | 1165 lea edx, [edx + 16] |
| 834 sub ecx, 16 | 1166 sub ecx, 16 |
| 835 jg convertloop | 1167 jg convertloop |
| 836 ret | 1168 ret |
| 837 } | 1169 } |
| 838 } | 1170 } |
| 839 | 1171 |
| 840 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. | 1172 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. |
| 841 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. | 1173 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. |
| 842 __declspec(naked) __declspec(align(16)) | 1174 __declspec(naked) |
| 843 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1175 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| 844 __asm { | 1176 __asm { |
| 845 mov eax, [esp + 4] /* src_argb */ | 1177 mov eax, [esp + 4] /* src_argb */ |
| 846 mov edx, [esp + 8] /* dst_y */ | 1178 mov edx, [esp + 8] /* dst_y */ |
| 847 mov ecx, [esp + 12] /* pix */ | 1179 mov ecx, [esp + 12] /* pix */ |
| 848 movdqa xmm4, kARGBToYJ | 1180 movdqa xmm4, kARGBToYJ |
| 849 movdqa xmm5, kAddYJ64 | 1181 movdqa xmm5, kAddYJ64 |
| 850 | 1182 |
| 851 convertloop: | 1183 convertloop: |
| 852 movdqu xmm0, [eax] | 1184 movdqu xmm0, [eax] |
| (...skipping 20 matching lines...) Expand all Loading... |
| 873 } | 1205 } |
| 874 } | 1206 } |
| 875 | 1207 |
| 876 #ifdef HAS_ARGBTOYROW_AVX2 | 1208 #ifdef HAS_ARGBTOYROW_AVX2 |
| 877 // vpermd for vphaddw + vpackuswb vpermd. | 1209 // vpermd for vphaddw + vpackuswb vpermd. |
| 878 static const lvec32 kPermdARGBToY_AVX = { | 1210 static const lvec32 kPermdARGBToY_AVX = { |
| 879 0, 4, 1, 5, 2, 6, 3, 7 | 1211 0, 4, 1, 5, 2, 6, 3, 7 |
| 880 }; | 1212 }; |
| 881 | 1213 |
| 882 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 1214 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
| 883 __declspec(naked) __declspec(align(32)) | 1215 __declspec(naked) |
| 884 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 1216 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { |
| 885 __asm { | 1217 __asm { |
| 886 mov eax, [esp + 4] /* src_argb */ | 1218 mov eax, [esp + 4] /* src_argb */ |
| 887 mov edx, [esp + 8] /* dst_y */ | 1219 mov edx, [esp + 8] /* dst_y */ |
| 888 mov ecx, [esp + 12] /* pix */ | 1220 mov ecx, [esp + 12] /* pix */ |
| 889 vbroadcastf128 ymm4, kARGBToY | 1221 vbroadcastf128 ymm4, kARGBToY |
| 890 vbroadcastf128 ymm5, kAddY16 | 1222 vbroadcastf128 ymm5, kAddY16 |
| 891 vmovdqu ymm6, kPermdARGBToY_AVX | 1223 vmovdqu ymm6, kPermdARGBToY_AVX |
| 892 | 1224 |
| 893 convertloop: | 1225 convertloop: |
| (...skipping 16 matching lines...) Expand all Loading... |
| 910 vmovdqu [edx], ymm0 | 1242 vmovdqu [edx], ymm0 |
| 911 lea edx, [edx + 32] | 1243 lea edx, [edx + 32] |
| 912 sub ecx, 32 | 1244 sub ecx, 32 |
| 913 jg convertloop | 1245 jg convertloop |
| 914 vzeroupper | 1246 vzeroupper |
| 915 ret | 1247 ret |
| 916 } | 1248 } |
| 917 } | 1249 } |
| 918 #endif // HAS_ARGBTOYROW_AVX2 | 1250 #endif // HAS_ARGBTOYROW_AVX2 |
| 919 | 1251 |
| 920 #ifdef HAS_ARGBTOYROW_AVX2 | 1252 #ifdef HAS_ARGBTOYJROW_AVX2 |
| 921 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 1253 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
| 922 __declspec(naked) __declspec(align(32)) | 1254 __declspec(naked) |
| 923 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 1255 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { |
| 924 __asm { | 1256 __asm { |
| 925 mov eax, [esp + 4] /* src_argb */ | 1257 mov eax, [esp + 4] /* src_argb */ |
| 926 mov edx, [esp + 8] /* dst_y */ | 1258 mov edx, [esp + 8] /* dst_y */ |
| 927 mov ecx, [esp + 12] /* pix */ | 1259 mov ecx, [esp + 12] /* pix */ |
| 928 vbroadcastf128 ymm4, kARGBToYJ | 1260 vbroadcastf128 ymm4, kARGBToYJ |
| 929 vbroadcastf128 ymm5, kAddYJ64 | 1261 vbroadcastf128 ymm5, kAddYJ64 |
| 930 vmovdqu ymm6, kPermdARGBToY_AVX | 1262 vmovdqu ymm6, kPermdARGBToY_AVX |
| 931 | 1263 |
| 932 convertloop: | 1264 convertloop: |
| (...skipping 18 matching lines...) Expand all Loading... |
| 951 lea edx, [edx + 32] | 1283 lea edx, [edx + 32] |
| 952 sub ecx, 32 | 1284 sub ecx, 32 |
| 953 jg convertloop | 1285 jg convertloop |
| 954 | 1286 |
| 955 vzeroupper | 1287 vzeroupper |
| 956 ret | 1288 ret |
| 957 } | 1289 } |
| 958 } | 1290 } |
| 959 #endif // HAS_ARGBTOYJROW_AVX2 | 1291 #endif // HAS_ARGBTOYJROW_AVX2 |
| 960 | 1292 |
| 961 __declspec(naked) __declspec(align(16)) | 1293 __declspec(naked) |
| 962 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1294 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| 963 __asm { | 1295 __asm { |
| 964 mov eax, [esp + 4] /* src_argb */ | 1296 mov eax, [esp + 4] /* src_argb */ |
| 965 mov edx, [esp + 8] /* dst_y */ | 1297 mov edx, [esp + 8] /* dst_y */ |
| 966 mov ecx, [esp + 12] /* pix */ | 1298 mov ecx, [esp + 12] /* pix */ |
| 967 movdqa xmm4, kBGRAToY | 1299 movdqa xmm4, kBGRAToY |
| 968 movdqa xmm5, kAddY16 | 1300 movdqa xmm5, kAddY16 |
| 969 | 1301 |
| 970 convertloop: | 1302 convertloop: |
| 971 movdqu xmm0, [eax] | 1303 movdqu xmm0, [eax] |
| (...skipping 12 matching lines...) Expand all Loading... |
| 984 packuswb xmm0, xmm2 | 1316 packuswb xmm0, xmm2 |
| 985 paddb xmm0, xmm5 | 1317 paddb xmm0, xmm5 |
| 986 movdqu [edx], xmm0 | 1318 movdqu [edx], xmm0 |
| 987 lea edx, [edx + 16] | 1319 lea edx, [edx + 16] |
| 988 sub ecx, 16 | 1320 sub ecx, 16 |
| 989 jg convertloop | 1321 jg convertloop |
| 990 ret | 1322 ret |
| 991 } | 1323 } |
| 992 } | 1324 } |
| 993 | 1325 |
| 994 __declspec(naked) __declspec(align(16)) | 1326 __declspec(naked) |
| 995 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1327 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| 996 __asm { | 1328 __asm { |
| 997 mov eax, [esp + 4] /* src_argb */ | 1329 mov eax, [esp + 4] /* src_argb */ |
| 998 mov edx, [esp + 8] /* dst_y */ | 1330 mov edx, [esp + 8] /* dst_y */ |
| 999 mov ecx, [esp + 12] /* pix */ | 1331 mov ecx, [esp + 12] /* pix */ |
| 1000 movdqa xmm4, kABGRToY | 1332 movdqa xmm4, kABGRToY |
| 1001 movdqa xmm5, kAddY16 | 1333 movdqa xmm5, kAddY16 |
| 1002 | 1334 |
| 1003 convertloop: | 1335 convertloop: |
| 1004 movdqu xmm0, [eax] | 1336 movdqu xmm0, [eax] |
| (...skipping 12 matching lines...) Expand all Loading... |
| 1017 packuswb xmm0, xmm2 | 1349 packuswb xmm0, xmm2 |
| 1018 paddb xmm0, xmm5 | 1350 paddb xmm0, xmm5 |
| 1019 movdqu [edx], xmm0 | 1351 movdqu [edx], xmm0 |
| 1020 lea edx, [edx + 16] | 1352 lea edx, [edx + 16] |
| 1021 sub ecx, 16 | 1353 sub ecx, 16 |
| 1022 jg convertloop | 1354 jg convertloop |
| 1023 ret | 1355 ret |
| 1024 } | 1356 } |
| 1025 } | 1357 } |
| 1026 | 1358 |
| 1027 __declspec(naked) __declspec(align(16)) | 1359 __declspec(naked) |
| 1028 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1360 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| 1029 __asm { | 1361 __asm { |
| 1030 mov eax, [esp + 4] /* src_argb */ | 1362 mov eax, [esp + 4] /* src_argb */ |
| 1031 mov edx, [esp + 8] /* dst_y */ | 1363 mov edx, [esp + 8] /* dst_y */ |
| 1032 mov ecx, [esp + 12] /* pix */ | 1364 mov ecx, [esp + 12] /* pix */ |
| 1033 movdqa xmm4, kRGBAToY | 1365 movdqa xmm4, kRGBAToY |
| 1034 movdqa xmm5, kAddY16 | 1366 movdqa xmm5, kAddY16 |
| 1035 | 1367 |
| 1036 convertloop: | 1368 convertloop: |
| 1037 movdqu xmm0, [eax] | 1369 movdqu xmm0, [eax] |
| (...skipping 12 matching lines...) Expand all Loading... |
| 1050 packuswb xmm0, xmm2 | 1382 packuswb xmm0, xmm2 |
| 1051 paddb xmm0, xmm5 | 1383 paddb xmm0, xmm5 |
| 1052 movdqu [edx], xmm0 | 1384 movdqu [edx], xmm0 |
| 1053 lea edx, [edx + 16] | 1385 lea edx, [edx + 16] |
| 1054 sub ecx, 16 | 1386 sub ecx, 16 |
| 1055 jg convertloop | 1387 jg convertloop |
| 1056 ret | 1388 ret |
| 1057 } | 1389 } |
| 1058 } | 1390 } |
| 1059 | 1391 |
| 1060 __declspec(naked) __declspec(align(16)) | 1392 __declspec(naked) |
| 1061 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1393 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1062 uint8* dst_u, uint8* dst_v, int width) { | 1394 uint8* dst_u, uint8* dst_v, int width) { |
| 1063 __asm { | 1395 __asm { |
| 1064 push esi | 1396 push esi |
| 1065 push edi | 1397 push edi |
| 1066 mov eax, [esp + 8 + 4] // src_argb | 1398 mov eax, [esp + 8 + 4] // src_argb |
| 1067 mov esi, [esp + 8 + 8] // src_stride_argb | 1399 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1068 mov edx, [esp + 8 + 12] // dst_u | 1400 mov edx, [esp + 8 + 12] // dst_u |
| 1069 mov edi, [esp + 8 + 16] // dst_v | 1401 mov edi, [esp + 8 + 16] // dst_v |
| 1070 mov ecx, [esp + 8 + 20] // pix | 1402 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1120 lea edx, [edx + 8] | 1452 lea edx, [edx + 8] |
| 1121 sub ecx, 16 | 1453 sub ecx, 16 |
| 1122 jg convertloop | 1454 jg convertloop |
| 1123 | 1455 |
| 1124 pop edi | 1456 pop edi |
| 1125 pop esi | 1457 pop esi |
| 1126 ret | 1458 ret |
| 1127 } | 1459 } |
| 1128 } | 1460 } |
| 1129 | 1461 |
| 1130 __declspec(naked) __declspec(align(16)) | 1462 __declspec(naked) |
| 1131 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1463 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1132 uint8* dst_u, uint8* dst_v, int width) { | 1464 uint8* dst_u, uint8* dst_v, int width) { |
| 1133 __asm { | 1465 __asm { |
| 1134 push esi | 1466 push esi |
| 1135 push edi | 1467 push edi |
| 1136 mov eax, [esp + 8 + 4] // src_argb | 1468 mov eax, [esp + 8 + 4] // src_argb |
| 1137 mov esi, [esp + 8 + 8] // src_stride_argb | 1469 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1138 mov edx, [esp + 8 + 12] // dst_u | 1470 mov edx, [esp + 8 + 12] // dst_u |
| 1139 mov edi, [esp + 8 + 16] // dst_v | 1471 mov edi, [esp + 8 + 16] // dst_v |
| 1140 mov ecx, [esp + 8 + 20] // pix | 1472 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1192 sub ecx, 16 | 1524 sub ecx, 16 |
| 1193 jg convertloop | 1525 jg convertloop |
| 1194 | 1526 |
| 1195 pop edi | 1527 pop edi |
| 1196 pop esi | 1528 pop esi |
| 1197 ret | 1529 ret |
| 1198 } | 1530 } |
| 1199 } | 1531 } |
| 1200 | 1532 |
| 1201 #ifdef HAS_ARGBTOUVROW_AVX2 | 1533 #ifdef HAS_ARGBTOUVROW_AVX2 |
| 1202 __declspec(naked) __declspec(align(32)) | 1534 __declspec(naked) |
| 1203 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, | 1535 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
| 1204 uint8* dst_u, uint8* dst_v, int width) { | 1536 uint8* dst_u, uint8* dst_v, int width) { |
| 1205 __asm { | 1537 __asm { |
| 1206 push esi | 1538 push esi |
| 1207 push edi | 1539 push edi |
| 1208 mov eax, [esp + 8 + 4] // src_argb | 1540 mov eax, [esp + 8 + 4] // src_argb |
| 1209 mov esi, [esp + 8 + 8] // src_stride_argb | 1541 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1210 mov edx, [esp + 8 + 12] // dst_u | 1542 mov edx, [esp + 8 + 12] // dst_u |
| 1211 mov edi, [esp + 8 + 16] // dst_v | 1543 mov edi, [esp + 8 + 16] // dst_v |
| 1212 mov ecx, [esp + 8 + 20] // pix | 1544 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1257 jg convertloop | 1589 jg convertloop |
| 1258 | 1590 |
| 1259 pop edi | 1591 pop edi |
| 1260 pop esi | 1592 pop esi |
| 1261 vzeroupper | 1593 vzeroupper |
| 1262 ret | 1594 ret |
| 1263 } | 1595 } |
| 1264 } | 1596 } |
| 1265 #endif // HAS_ARGBTOUVROW_AVX2 | 1597 #endif // HAS_ARGBTOUVROW_AVX2 |
| 1266 | 1598 |
| 1267 __declspec(naked) __declspec(align(16)) | 1599 __declspec(naked) |
| 1268 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, | 1600 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, |
| 1269 uint8* dst_u, uint8* dst_v, int width) { | 1601 uint8* dst_u, uint8* dst_v, int width) { |
| 1270 __asm { | 1602 __asm { |
| 1271 push edi | 1603 push edi |
| 1272 mov eax, [esp + 4 + 4] // src_argb | 1604 mov eax, [esp + 4 + 4] // src_argb |
| 1273 mov edx, [esp + 4 + 8] // dst_u | 1605 mov edx, [esp + 4 + 8] // dst_u |
| 1274 mov edi, [esp + 4 + 12] // dst_v | 1606 mov edi, [esp + 4 + 12] // dst_v |
| 1275 mov ecx, [esp + 4 + 16] // pix | 1607 mov ecx, [esp + 4 + 16] // pix |
| 1276 movdqa xmm5, kAddUV128 | 1608 movdqa xmm5, kAddUV128 |
| 1277 movdqa xmm6, kARGBToV | 1609 movdqa xmm6, kARGBToV |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1314 movdqu [edx + edi], xmm0 | 1646 movdqu [edx + edi], xmm0 |
| 1315 lea edx, [edx + 16] | 1647 lea edx, [edx + 16] |
| 1316 sub ecx, 16 | 1648 sub ecx, 16 |
| 1317 jg convertloop | 1649 jg convertloop |
| 1318 | 1650 |
| 1319 pop edi | 1651 pop edi |
| 1320 ret | 1652 ret |
| 1321 } | 1653 } |
| 1322 } | 1654 } |
| 1323 | 1655 |
| 1324 __declspec(naked) __declspec(align(16)) | 1656 __declspec(naked) |
| 1325 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, | 1657 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, |
| 1326 uint8* dst_u, uint8* dst_v, int width) { | 1658 uint8* dst_u, uint8* dst_v, int width) { |
| 1327 __asm { | 1659 __asm { |
| 1328 push edi | 1660 push edi |
| 1329 mov eax, [esp + 4 + 4] // src_argb | 1661 mov eax, [esp + 4 + 4] // src_argb |
| 1330 mov edx, [esp + 4 + 8] // dst_u | 1662 mov edx, [esp + 4 + 8] // dst_u |
| 1331 mov edi, [esp + 4 + 12] // dst_v | 1663 mov edi, [esp + 4 + 12] // dst_v |
| 1332 mov ecx, [esp + 4 + 16] // pix | 1664 mov ecx, [esp + 4 + 16] // pix |
| 1333 movdqa xmm5, kAddUV128 | 1665 movdqa xmm5, kAddUV128 |
| 1334 movdqa xmm6, kARGBToV | 1666 movdqa xmm6, kARGBToV |
| (...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1372 movhps qword ptr [edx + edi], xmm0 // V | 1704 movhps qword ptr [edx + edi], xmm0 // V |
| 1373 lea edx, [edx + 8] | 1705 lea edx, [edx + 8] |
| 1374 sub ecx, 16 | 1706 sub ecx, 16 |
| 1375 jg convertloop | 1707 jg convertloop |
| 1376 | 1708 |
| 1377 pop edi | 1709 pop edi |
| 1378 ret | 1710 ret |
| 1379 } | 1711 } |
| 1380 } | 1712 } |
| 1381 | 1713 |
| 1382 __declspec(naked) __declspec(align(16)) | 1714 __declspec(naked) |
| 1383 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1715 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1384 uint8* dst_u, uint8* dst_v, int width) { | 1716 uint8* dst_u, uint8* dst_v, int width) { |
| 1385 __asm { | 1717 __asm { |
| 1386 push esi | 1718 push esi |
| 1387 push edi | 1719 push edi |
| 1388 mov eax, [esp + 8 + 4] // src_argb | 1720 mov eax, [esp + 8 + 4] // src_argb |
| 1389 mov esi, [esp + 8 + 8] // src_stride_argb | 1721 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1390 mov edx, [esp + 8 + 12] // dst_u | 1722 mov edx, [esp + 8 + 12] // dst_u |
| 1391 mov edi, [esp + 8 + 16] // dst_v | 1723 mov edi, [esp + 8 + 16] // dst_v |
| 1392 mov ecx, [esp + 8 + 20] // pix | 1724 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1442 lea edx, [edx + 8] | 1774 lea edx, [edx + 8] |
| 1443 sub ecx, 16 | 1775 sub ecx, 16 |
| 1444 jg convertloop | 1776 jg convertloop |
| 1445 | 1777 |
| 1446 pop edi | 1778 pop edi |
| 1447 pop esi | 1779 pop esi |
| 1448 ret | 1780 ret |
| 1449 } | 1781 } |
| 1450 } | 1782 } |
| 1451 | 1783 |
| 1452 __declspec(naked) __declspec(align(16)) | 1784 __declspec(naked) |
| 1453 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1785 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1454 uint8* dst_u, uint8* dst_v, int width) { | 1786 uint8* dst_u, uint8* dst_v, int width) { |
| 1455 __asm { | 1787 __asm { |
| 1456 push esi | 1788 push esi |
| 1457 push edi | 1789 push edi |
| 1458 mov eax, [esp + 8 + 4] // src_argb | 1790 mov eax, [esp + 8 + 4] // src_argb |
| 1459 mov esi, [esp + 8 + 8] // src_stride_argb | 1791 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1460 mov edx, [esp + 8 + 12] // dst_u | 1792 mov edx, [esp + 8 + 12] // dst_u |
| 1461 mov edi, [esp + 8 + 16] // dst_v | 1793 mov edi, [esp + 8 + 16] // dst_v |
| 1462 mov ecx, [esp + 8 + 20] // pix | 1794 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1512 lea edx, [edx + 8] | 1844 lea edx, [edx + 8] |
| 1513 sub ecx, 16 | 1845 sub ecx, 16 |
| 1514 jg convertloop | 1846 jg convertloop |
| 1515 | 1847 |
| 1516 pop edi | 1848 pop edi |
| 1517 pop esi | 1849 pop esi |
| 1518 ret | 1850 ret |
| 1519 } | 1851 } |
| 1520 } | 1852 } |
| 1521 | 1853 |
| 1522 __declspec(naked) __declspec(align(16)) | 1854 __declspec(naked) |
| 1523 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1855 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1524 uint8* dst_u, uint8* dst_v, int width) { | 1856 uint8* dst_u, uint8* dst_v, int width) { |
| 1525 __asm { | 1857 __asm { |
| 1526 push esi | 1858 push esi |
| 1527 push edi | 1859 push edi |
| 1528 mov eax, [esp + 8 + 4] // src_argb | 1860 mov eax, [esp + 8 + 4] // src_argb |
| 1529 mov esi, [esp + 8 + 8] // src_stride_argb | 1861 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1530 mov edx, [esp + 8 + 12] // dst_u | 1862 mov edx, [esp + 8 + 12] // dst_u |
| 1531 mov edi, [esp + 8 + 16] // dst_v | 1863 mov edi, [esp + 8 + 16] // dst_v |
| 1532 mov ecx, [esp + 8 + 20] // pix | 1864 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1583 sub ecx, 16 | 1915 sub ecx, 16 |
| 1584 jg convertloop | 1916 jg convertloop |
| 1585 | 1917 |
| 1586 pop edi | 1918 pop edi |
| 1587 pop esi | 1919 pop esi |
| 1588 ret | 1920 ret |
| 1589 } | 1921 } |
| 1590 } | 1922 } |
| 1591 #endif // HAS_ARGBTOYROW_SSSE3 | 1923 #endif // HAS_ARGBTOYROW_SSSE3 |
| 1592 | 1924 |
| 1925 // Read 16 UV from 444 |
| 1926 #define READYUV444_AVX2 __asm { \ |
| 1927 __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \ |
| 1928 __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \ |
| 1929 __asm lea esi, [esi + 16] \ |
| 1930 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 1931 __asm vpermq ymm1, ymm1, 0xd8 \ |
| 1932 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| 1933 } |
| 1934 |
| 1593 // Read 8 UV from 422, upsample to 16 UV. | 1935 // Read 8 UV from 422, upsample to 16 UV. |
| 1594 #define READYUV422_AVX2 __asm { \ | 1936 #define READYUV422_AVX2 __asm { \ |
| 1595 __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ | 1937 __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ |
| 1596 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ | 1938 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ |
| 1597 __asm lea esi, [esi + 8] \ | 1939 __asm lea esi, [esi + 8] \ |
| 1598 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ | 1940 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| 1599 __asm vpermq ymm0, ymm0, 0xd8 \ | 1941 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 1600 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1942 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1601 } | 1943 } |
| 1602 | 1944 |
| 1945 // Read 4 UV from 411, upsample to 16 UV. |
| 1946 #define READYUV411_AVX2 __asm { \ |
| 1947 __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \ |
| 1948 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \ |
| 1949 __asm lea esi, [esi + 4] \ |
| 1950 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| 1951 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1952 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 1953 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ |
| 1954 } |
| 1955 |
// Read 8 UV from NV12, upsample to 16 UV.
// Input:   esi = interleaved UV plane pointer (bytes already in UVUV order).
// Output:  ymm0 = 16 UV byte pairs (each source UV doubled horizontally).
//          Advances esi by 16.
#define READNV12_AVX2 __asm {                                                  \
    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm0, ymm0, 0xd8             /* split UV across lanes */  \
    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
  }
| 1610 | 1963 |
| 1611 // Convert 16 pixels: 16 UV and 16 Y. | 1964 // Convert 16 pixels: 16 UV and 16 Y. |
| 1612 #define YUVTORGB_AVX2(YuvConstants) __asm { \ | 1965 #define YUVTORGB_AVX2(YuvConstants) __asm { \ |
| (...skipping 26 matching lines...) Expand all Loading... |
| 1639 | 1992 |
// Store 16 ARGB values.
// Inputs:  ymm0 = 16 B bytes, ymm1 = 16 G bytes, ymm2 = 16 R bytes,
//          ymm5 = alpha bytes (0xff), edx = destination pointer.
//          (Register roles follow from the BG/RA interleaves below.)
// Writes 64 bytes of interleaved B,G,R,A and advances edx by 64.
// Clobbers ymm0/ymm1.
#define STOREARGB_AVX2 __asm {                                                 \
    /* Step 3: Weave into ARGB */                                              \
    __asm vpunpcklbw ymm0, ymm0, ymm1             /* BG */                     \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklbw ymm2, ymm2, ymm5             /* RA */                     \
    __asm vpermq     ymm2, ymm2, 0xd8                                          \
    __asm vpunpcklwd ymm1, ymm0, ymm2             /* BGRA first 8 pixels */    \
    __asm vpunpckhwd ymm0, ymm0, ymm2             /* BGRA next 8 pixels */     \
    __asm vmovdqu    0[edx], ymm1                                              \
    __asm vmovdqu    32[edx], ymm0                                             \
    __asm lea        edx,  [edx + 64]                                          \
  }
| 1653 | 2006 |
| 1654 #ifdef HAS_I422TOARGBROW_AVX2 | 2007 #ifdef HAS_I422TOARGBROW_AVX2 |
| 1655 // 16 pixels | 2008 // 16 pixels |
| 1656 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2009 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 1657 __declspec(naked) __declspec(align(16)) | 2010 __declspec(naked) |
| 1658 void I422ToARGBRow_AVX2(const uint8* y_buf, | 2011 void I422ToARGBRow_AVX2(const uint8* y_buf, |
| 1659 const uint8* u_buf, | 2012 const uint8* u_buf, |
| 1660 const uint8* v_buf, | 2013 const uint8* v_buf, |
| 1661 uint8* dst_argb, | 2014 uint8* dst_argb, |
| 1662 int width) { | 2015 int width) { |
| 1663 __asm { | 2016 __asm { |
| 1664 push esi | 2017 push esi |
| 1665 push edi | 2018 push edi |
| 1666 mov eax, [esp + 8 + 4] // Y | 2019 mov eax, [esp + 8 + 4] // Y |
| 1667 mov esi, [esp + 8 + 8] // U | 2020 mov esi, [esp + 8 + 8] // U |
| (...skipping 12 matching lines...) Expand all Loading... |
| 1680 jg convertloop | 2033 jg convertloop |
| 1681 | 2034 |
| 1682 pop edi | 2035 pop edi |
| 1683 pop esi | 2036 pop esi |
| 1684 vzeroupper | 2037 vzeroupper |
| 1685 ret | 2038 ret |
| 1686 } | 2039 } |
| 1687 } | 2040 } |
| 1688 #endif // HAS_I422TOARGBROW_AVX2 | 2041 #endif // HAS_I422TOARGBROW_AVX2 |
| 1689 | 2042 |
#ifdef HAS_J422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// JPeg (full-range) color space variant: identical to I422ToARGBRow_AVX2
// except it converts with kYuvJConstants.
// Processes 16 pixels per iteration; the loop runs ceil(width/16) times and
// writes 64 bytes each pass, so width is effectively rounded up to a
// multiple of 16 pixels.
__declspec(naked)
void J422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi             // edi = V - U, used by READYUV422_AVX2
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(kYuvJConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper                      // clear upper YMM state before returning
    ret
  }
}
#endif  // HAS_J422TOARGBROW_AVX2
| 2078 |
#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// 444 has one chroma sample per luma sample, so no upsampling is needed
// (READYUV444_AVX2 reads 16 U and 16 V directly).
// The loop runs ceil(width/16) times, writing 64 bytes each pass.
__declspec(naked)
void I444ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi             // edi = V - U offset for the read macro
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV444_AVX2
    YUVTORGB_AVX2(kYuvConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper                      // clear upper YMM state before returning
    ret
  }
}
#endif  // HAS_I444TOARGBROW_AVX2
| 2114 |
#ifdef HAS_I411TOARGBROW_AVX2
// 16 pixels
// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// 411 has one chroma sample per 4 luma samples; READYUV411_AVX2 replicates
// each UV pair 4x.  The loop runs ceil(width/16) times, writing 64 bytes
// each pass.
__declspec(naked)
void I411ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi             // edi = V - U offset for the read macro
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV411_AVX2
    YUVTORGB_AVX2(kYuvConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper                      // clear upper YMM state before returning
    ret
  }
}
#endif  // HAS_I411TOARGBROW_AVX2
| 2150 |
#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV12 supplies chroma as a single interleaved UV plane, so only two source
// pointers are needed.  The loop runs ceil(width/16) times, writing 64 bytes
// each pass.
__declspec(naked)
void NV12ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // UV
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(kYuvConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        esi
    vzeroupper                      // clear upper YMM state before returning
    ret
  }
}
#endif  // HAS_NV12TOARGBROW_AVX2
| 1719 | 2181 |
#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV21 stores chroma as interleaved VU; this reuses READNV12_AVX2 and
// compensates for the swapped ordering by converting with kYvuConstants.
// The loop runs ceil(width/16) times, writing 64 bytes each pass.
__declspec(naked)
void NV21ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // UV
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(kYvuConstants)    // VU order handled by swapped constants
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        esi
    vzeroupper                      // clear upper YMM state before returning
    ret
  }
}
#endif  // HAS_NV21TOARGBROW_AVX2
| 1749 | 2212 |
| 1750 #ifdef HAS_I422TOBGRAROW_AVX2 | 2213 #ifdef HAS_I422TOBGRAROW_AVX2 |
| 1751 // 16 pixels | 2214 // 16 pixels |
| 1752 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 2215 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
| 1753 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2216 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
| 1754 __declspec(naked) __declspec(align(16)) | 2217 __declspec(naked) |
| 1755 void I422ToBGRARow_AVX2(const uint8* y_buf, | 2218 void I422ToBGRARow_AVX2(const uint8* y_buf, |
| 1756 const uint8* u_buf, | 2219 const uint8* u_buf, |
| 1757 const uint8* v_buf, | 2220 const uint8* v_buf, |
| 1758 uint8* dst_argb, | 2221 uint8* dst_argb, |
| 1759 int width) { | 2222 int width) { |
| 1760 __asm { | 2223 __asm { |
| 1761 push esi | 2224 push esi |
| 1762 push edi | 2225 push edi |
| 1763 mov eax, [esp + 8 + 4] // Y | 2226 mov eax, [esp + 8 + 4] // Y |
| 1764 mov esi, [esp + 8 + 8] // U | 2227 mov esi, [esp + 8 + 8] // U |
| (...skipping 25 matching lines...) Expand all Loading... |
| 1790 vzeroupper | 2253 vzeroupper |
| 1791 ret | 2254 ret |
| 1792 } | 2255 } |
| 1793 } | 2256 } |
| 1794 #endif // HAS_I422TOBGRAROW_AVX2 | 2257 #endif // HAS_I422TOBGRAROW_AVX2 |
| 1795 | 2258 |
| 1796 #ifdef HAS_I422TORGBAROW_AVX2 | 2259 #ifdef HAS_I422TORGBAROW_AVX2 |
| 1797 // 16 pixels | 2260 // 16 pixels |
| 1798 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 2261 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
| 1799 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2262 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
| 1800 __declspec(naked) __declspec(align(16)) | 2263 __declspec(naked) |
| 1801 void I422ToRGBARow_AVX2(const uint8* y_buf, | 2264 void I422ToRGBARow_AVX2(const uint8* y_buf, |
| 1802 const uint8* u_buf, | 2265 const uint8* u_buf, |
| 1803 const uint8* v_buf, | 2266 const uint8* v_buf, |
| 1804 uint8* dst_argb, | 2267 uint8* dst_argb, |
| 1805 int width) { | 2268 int width) { |
| 1806 __asm { | 2269 __asm { |
| 1807 push esi | 2270 push esi |
| 1808 push edi | 2271 push edi |
| 1809 mov eax, [esp + 8 + 4] // Y | 2272 mov eax, [esp + 8 + 4] // Y |
| 1810 mov esi, [esp + 8 + 8] // U | 2273 mov esi, [esp + 8 + 8] // U |
| (...skipping 25 matching lines...) Expand all Loading... |
| 1836 vzeroupper | 2299 vzeroupper |
| 1837 ret | 2300 ret |
| 1838 } | 2301 } |
| 1839 } | 2302 } |
| 1840 #endif // HAS_I422TORGBAROW_AVX2 | 2303 #endif // HAS_I422TORGBAROW_AVX2 |
| 1841 | 2304 |
| 1842 #ifdef HAS_I422TOABGRROW_AVX2 | 2305 #ifdef HAS_I422TOABGRROW_AVX2 |
| 1843 // 16 pixels | 2306 // 16 pixels |
| 1844 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2307 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
| 1845 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2308 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
| 1846 __declspec(naked) __declspec(align(16)) | 2309 __declspec(naked) |
| 1847 void I422ToABGRRow_AVX2(const uint8* y_buf, | 2310 void I422ToABGRRow_AVX2(const uint8* y_buf, |
| 1848 const uint8* u_buf, | 2311 const uint8* u_buf, |
| 1849 const uint8* v_buf, | 2312 const uint8* v_buf, |
| 1850 uint8* dst_argb, | 2313 uint8* dst_argb, |
| 1851 int width) { | 2314 int width) { |
| 1852 __asm { | 2315 __asm { |
| 1853 push esi | 2316 push esi |
| 1854 push edi | 2317 push edi |
| 1855 mov eax, [esp + 8 + 4] // Y | 2318 mov eax, [esp + 8 + 4] // Y |
| 1856 mov esi, [esp + 8 + 8] // U | 2319 mov esi, [esp + 8 + 8] // U |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1907 | 2370 |
// Read 2 UV from 411, upsample to 8 UV.
// Inputs:  esi = U plane pointer, edi = (V plane - U plane) byte offset.
// Output:  xmm0 = 8 UV byte pairs, each source UV pair repeated 4x.
//          Advances esi by 2.  Clobbers ebx and xmm1; callers must
//          preserve ebx (see I411ToARGBRow_SSSE3, which pushes it).
#define READYUV411 __asm {                                                     \
    __asm movzx      ebx, word ptr [esi]          /* U */         /* NOLINT */ \
    __asm movd       xmm0, ebx                                                 \
    __asm movzx      ebx, word ptr [esi + edi]    /* V */         /* NOLINT */ \
    __asm movd       xmm1, ebx                                                 \
    __asm lea        esi,  [esi + 2]                                           \
    __asm punpcklbw  xmm0, xmm1                   /* UV */                     \
    __asm punpcklwd  xmm0, xmm0                   /* UVUV (upsample) */        \
    __asm punpckldq  xmm0, xmm0                   /* UVUVUVUV (upsample) */    \
  }
| 1919 | 2382 |
// Read 4 UV from NV12, upsample to 8 UV.
// Input:   esi = interleaved UV plane pointer.
// Output:  xmm0 = 8 UV byte pairs (each source UV doubled horizontally).
//          Advances esi by 8.
#define READNV12 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi]        /* UV */        /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm0, xmm0                   /* UVUV (upsample) */        \
  }
| 1926 | 2389 |
| 1927 // Convert 8 pixels: 8 UV and 8 Y. | 2390 // Convert 8 pixels: 8 UV and 8 Y. |
| (...skipping 28 matching lines...) Expand all Loading... |
| 1956 } | 2419 } |
| 1957 | 2420 |
// Store 8 ARGB values.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          xmm5 = alpha bytes, edx = destination pointer.
// Writes 32 bytes of interleaved B,G,R,A and advances edx by 32.
// Clobbers xmm0/xmm1.
#define STOREARGB __asm {                                                      \
    /* Step 3: Weave into ARGB */                                              \
    __asm punpcklbw  xmm0, xmm1                   /* BG */                     \
    __asm punpcklbw  xmm2, xmm5                   /* RA */                     \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2                   /* BGRA first 4 pixels */    \
    __asm punpckhwd  xmm1, xmm2                   /* BGRA next 4 pixels */     \
    __asm movdqu     0[edx], xmm0                                              \
    __asm movdqu     16[edx], xmm1                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
| 1970 | 2433 |
// Store 8 BGRA values.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          edx = destination pointer.  Regenerates alpha in xmm5.
// Writes 32 bytes of interleaved A,R,G,B (BGRA pixel order) and advances
// edx by 32.  Clobbers xmm0/xmm5.
#define STOREBGRA __asm {                                                      \
    /* Step 3: Weave into BGRA */                                              \
    __asm pcmpeqb    xmm5, xmm5            /* generate 0xffffffff for alpha */ \
    __asm punpcklbw  xmm1, xmm0                   /* GB */                     \
    __asm punpcklbw  xmm5, xmm2                   /* AR */                     \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1                   /* BGRA first 4 pixels */    \
    __asm punpckhwd  xmm0, xmm1                   /* BGRA next 4 pixels */     \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
| 1984 | 2447 |
// Store 8 ABGR values.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          xmm5 = alpha bytes, edx = destination pointer.
// Writes 32 bytes of interleaved R,G,B,A (ABGR pixel order) and advances
// edx by 32.  Clobbers xmm1/xmm2.
#define STOREABGR __asm {                                                      \
    /* Step 3: Weave into ABGR */                                              \
    __asm punpcklbw  xmm2, xmm1                   /* RG */                     \
    __asm punpcklbw  xmm0, xmm5                   /* BA */                     \
    __asm movdqa     xmm1, xmm2                                                \
    __asm punpcklwd  xmm2, xmm0                   /* RGBA first 4 pixels */    \
    __asm punpckhwd  xmm1, xmm0                   /* RGBA next 4 pixels */     \
    __asm movdqu     0[edx], xmm2                                              \
    __asm movdqu     16[edx], xmm1                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
| 1997 | 2460 |
// Store 8 RGBA values.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          edx = destination pointer.  Regenerates alpha in xmm5.
// Writes 32 bytes of interleaved A,B,G,R (RGBA pixel order) and advances
// edx by 32.  Clobbers xmm0/xmm5.
#define STORERGBA __asm {                                                      \
    /* Step 3: Weave into RGBA */                                              \
    __asm pcmpeqb    xmm5, xmm5            /* generate 0xffffffff for alpha */ \
    __asm punpcklbw  xmm1, xmm2                   /* GR */                     \
    __asm punpcklbw  xmm5, xmm0                   /* AB */                     \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1                   /* RGBA first 4 pixels */    \
    __asm punpckhwd  xmm0, xmm1                   /* RGBA next 4 pixels */     \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
| 2011 | 2474 |
// Store 8 RGB24 values.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          xmm5/xmm6 = pshufb pack masks (loaded by the caller),
//          edx = destination pointer.
// Packs BGRR words down to 24 bytes of B,G,R and advances edx by 24.
// Note the second store writes 16 bytes at offset 8, i.e. [8, 24).
// Clobbers xmm0/xmm1.
#define STORERGB24 __asm {                                                     \
    /* Step 3: Weave into RRGB */                                              \
    __asm punpcklbw  xmm0, xmm1                   /* BG */                     \
    __asm punpcklbw  xmm2, xmm2                   /* RR */                     \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2                   /* BGRR first 4 pixels */    \
    __asm punpckhwd  xmm1, xmm2                   /* BGRR next 4 pixels */     \
    /* Step 4: RRGB -> RGB24 */                                                \
    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
    __asm movq       qword ptr 0[edx], xmm0       /* First 8 bytes */          \
    __asm movdqu     8[edx], xmm1                 /* Last 16 bytes */          \
    __asm lea        edx,  [edx + 24]                                          \
  }
| 2028 | 2491 |
// Store 8 RAW values.
// Same packing scheme as STORERGB24 but the caller loads R,G,B-order
// pshufb masks into xmm5/xmm6 to emit RAW byte order.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          edx = destination pointer.
// Writes 24 bytes and advances edx by 24.  Clobbers xmm0/xmm1.
#define STORERAW __asm {                                                       \
    /* Step 3: Weave into RRGB */                                              \
    __asm punpcklbw  xmm0, xmm1                   /* BG */                     \
    __asm punpcklbw  xmm2, xmm2                   /* RR */                     \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2                   /* BGRR first 4 pixels */    \
    __asm punpckhwd  xmm1, xmm2                   /* BGRR next 4 pixels */     \
    /* Step 4: RRGB -> RAW */                                                  \
    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
    __asm movq       qword ptr 0[edx], xmm0       /* First 8 bytes */          \
    __asm movdqu     8[edx], xmm1                 /* Last 16 bytes */          \
    __asm lea        edx,  [edx + 24]                                          \
  }
| 2045 | 2508 |
| 2046 // Store 8 RGB565 values. | 2509 // Store 8 RGB565 values. |
| 2047 #define STORERGB565 __asm { \ | 2510 #define STORERGB565 __asm { \ |
| 2048 /* Step 3: Weave into RRGB */ \ | 2511 /* Step 3: Weave into RRGB */ \ |
| 2049 __asm punpcklbw xmm0, xmm1 /* BG */ \ | 2512 __asm punpcklbw xmm0, xmm1 /* BG */ \ |
| 2050 __asm punpcklbw xmm2, xmm2 /* RR */ \ | 2513 __asm punpcklbw xmm2, xmm2 /* RR */ \ |
| 2051 __asm movdqa xmm1, xmm0 \ | 2514 __asm movdqa xmm1, xmm0 \ |
| 2052 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ | 2515 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
| (...skipping 15 matching lines...) Expand all Loading... |
| 2068 __asm pslld xmm1, 8 /* R */ \ | 2531 __asm pslld xmm1, 8 /* R */ \ |
| 2069 __asm psrld xmm3, 3 /* B */ \ | 2532 __asm psrld xmm3, 3 /* B */ \ |
| 2070 __asm psrld xmm2, 5 /* G */ \ | 2533 __asm psrld xmm2, 5 /* G */ \ |
| 2071 __asm psrad xmm1, 16 /* R */ \ | 2534 __asm psrad xmm1, 16 /* R */ \ |
| 2072 __asm pand xmm3, xmm5 /* B */ \ | 2535 __asm pand xmm3, xmm5 /* B */ \ |
| 2073 __asm pand xmm2, xmm6 /* G */ \ | 2536 __asm pand xmm2, xmm6 /* G */ \ |
| 2074 __asm pand xmm1, xmm7 /* R */ \ | 2537 __asm pand xmm1, xmm7 /* R */ \ |
| 2075 __asm por xmm3, xmm2 /* BG */ \ | 2538 __asm por xmm3, xmm2 /* BG */ \ |
| 2076 __asm por xmm1, xmm3 /* BGR */ \ | 2539 __asm por xmm1, xmm3 /* BGR */ \ |
| 2077 __asm packssdw xmm0, xmm1 \ | 2540 __asm packssdw xmm0, xmm1 \ |
| 2078 __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \ | 2541 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ |
| 2079 __asm lea edx, [edx + 16] \ | 2542 __asm lea edx, [edx + 16] \ |
| 2080 } | 2543 } |
| 2081 | 2544 |
| 2082 // 8 pixels. | 2545 // 8 pixels. |
| 2083 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). | 2546 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2084 __declspec(naked) __declspec(align(16)) | 2547 __declspec(naked) |
| 2085 void I444ToARGBRow_SSSE3(const uint8* y_buf, | 2548 void I444ToARGBRow_SSSE3(const uint8* y_buf, |
| 2086 const uint8* u_buf, | 2549 const uint8* u_buf, |
| 2087 const uint8* v_buf, | 2550 const uint8* v_buf, |
| 2088 uint8* dst_argb, | 2551 uint8* dst_argb, |
| 2089 int width) { | 2552 int width) { |
| 2090 __asm { | 2553 __asm { |
| 2091 push esi | 2554 push esi |
| 2092 push edi | 2555 push edi |
| 2093 mov eax, [esp + 8 + 4] // Y | 2556 mov eax, [esp + 8 + 4] // Y |
| 2094 mov esi, [esp + 8 + 8] // U | 2557 mov esi, [esp + 8 + 8] // U |
| (...skipping 12 matching lines...) Expand all Loading... |
| 2107 jg convertloop | 2570 jg convertloop |
| 2108 | 2571 |
| 2109 pop edi | 2572 pop edi |
| 2110 pop esi | 2573 pop esi |
| 2111 ret | 2574 ret |
| 2112 } | 2575 } |
| 2113 } | 2576 } |
| 2114 | 2577 |
| 2115 // 8 pixels. | 2578 // 8 pixels. |
| 2116 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). | 2579 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). |
| 2117 __declspec(naked) __declspec(align(16)) | 2580 __declspec(naked) |
| 2118 void I422ToRGB24Row_SSSE3(const uint8* y_buf, | 2581 void I422ToRGB24Row_SSSE3(const uint8* y_buf, |
| 2119 const uint8* u_buf, | 2582 const uint8* u_buf, |
| 2120 const uint8* v_buf, | 2583 const uint8* v_buf, |
| 2121 uint8* dst_rgb24, | 2584 uint8* dst_rgb24, |
| 2122 int width) { | 2585 int width) { |
| 2123 __asm { | 2586 __asm { |
| 2124 push esi | 2587 push esi |
| 2125 push edi | 2588 push edi |
| 2126 mov eax, [esp + 8 + 4] // Y | 2589 mov eax, [esp + 8 + 4] // Y |
| 2127 mov esi, [esp + 8 + 8] // U | 2590 mov esi, [esp + 8 + 8] // U |
| (...skipping 13 matching lines...) Expand all Loading... |
| 2141 jg convertloop | 2604 jg convertloop |
| 2142 | 2605 |
| 2143 pop edi | 2606 pop edi |
| 2144 pop esi | 2607 pop esi |
| 2145 ret | 2608 ret |
| 2146 } | 2609 } |
| 2147 } | 2610 } |
| 2148 | 2611 |
| 2149 // 8 pixels. | 2612 // 8 pixels. |
| 2150 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). | 2613 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). |
| 2151 __declspec(naked) __declspec(align(16)) | 2614 __declspec(naked) |
| 2152 void I422ToRAWRow_SSSE3(const uint8* y_buf, | 2615 void I422ToRAWRow_SSSE3(const uint8* y_buf, |
| 2153 const uint8* u_buf, | 2616 const uint8* u_buf, |
| 2154 const uint8* v_buf, | 2617 const uint8* v_buf, |
| 2155 uint8* dst_raw, | 2618 uint8* dst_raw, |
| 2156 int width) { | 2619 int width) { |
| 2157 __asm { | 2620 __asm { |
| 2158 push esi | 2621 push esi |
| 2159 push edi | 2622 push edi |
| 2160 mov eax, [esp + 8 + 4] // Y | 2623 mov eax, [esp + 8 + 4] // Y |
| 2161 mov esi, [esp + 8 + 8] // U | 2624 mov esi, [esp + 8 + 8] // U |
| (...skipping 13 matching lines...) Expand all Loading... |
| 2175 jg convertloop | 2638 jg convertloop |
| 2176 | 2639 |
| 2177 pop edi | 2640 pop edi |
| 2178 pop esi | 2641 pop esi |
| 2179 ret | 2642 ret |
| 2180 } | 2643 } |
| 2181 } | 2644 } |
| 2182 | 2645 |
| 2183 // 8 pixels | 2646 // 8 pixels |
| 2184 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). | 2647 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). |
| 2185 __declspec(naked) __declspec(align(16)) | 2648 __declspec(naked) |
| 2186 void I422ToRGB565Row_SSSE3(const uint8* y_buf, | 2649 void I422ToRGB565Row_SSSE3(const uint8* y_buf, |
| 2187 const uint8* u_buf, | 2650 const uint8* u_buf, |
| 2188 const uint8* v_buf, | 2651 const uint8* v_buf, |
| 2189 uint8* rgb565_buf, | 2652 uint8* rgb565_buf, |
| 2190 int width) { | 2653 int width) { |
| 2191 __asm { | 2654 __asm { |
| 2192 push esi | 2655 push esi |
| 2193 push edi | 2656 push edi |
| 2194 mov eax, [esp + 8 + 4] // Y | 2657 mov eax, [esp + 8 + 4] // Y |
| 2195 mov esi, [esp + 8 + 8] // U | 2658 mov esi, [esp + 8 + 8] // U |
| (...skipping 18 matching lines...) Expand all Loading... |
| 2214 jg convertloop | 2677 jg convertloop |
| 2215 | 2678 |
| 2216 pop edi | 2679 pop edi |
| 2217 pop esi | 2680 pop esi |
| 2218 ret | 2681 ret |
| 2219 } | 2682 } |
| 2220 } | 2683 } |
| 2221 | 2684 |
| 2222 // 8 pixels. | 2685 // 8 pixels. |
| 2223 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2686 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2224 __declspec(naked) __declspec(align(16)) | 2687 __declspec(naked) |
| 2225 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 2688 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
| 2226 const uint8* u_buf, | 2689 const uint8* u_buf, |
| 2227 const uint8* v_buf, | 2690 const uint8* v_buf, |
| 2228 uint8* dst_argb, | 2691 uint8* dst_argb, |
| 2229 int width) { | 2692 int width) { |
| 2230 __asm { | 2693 __asm { |
| 2231 push esi | 2694 push esi |
| 2232 push edi | 2695 push edi |
| 2233 mov eax, [esp + 8 + 4] // Y | 2696 mov eax, [esp + 8 + 4] // Y |
| 2234 mov esi, [esp + 8 + 8] // U | 2697 mov esi, [esp + 8 + 8] // U |
| (...skipping 11 matching lines...) Expand all Loading... |
| 2246 sub ecx, 8 | 2709 sub ecx, 8 |
| 2247 jg convertloop | 2710 jg convertloop |
| 2248 | 2711 |
| 2249 pop edi | 2712 pop edi |
| 2250 pop esi | 2713 pop esi |
| 2251 ret | 2714 ret |
| 2252 } | 2715 } |
| 2253 } | 2716 } |
| 2254 | 2717 |
| 2255 // 8 pixels. | 2718 // 8 pixels. |
// JPeg color space version of I422ToARGB
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Identical control flow to I422ToARGBRow_SSSE3 but converts with
// kYuvJConstants (full-range JPeg YUV).  The loop runs ceil(width/8)
// times, writing 32 bytes each pass.
__declspec(naked)
void J422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi             // edi = V - U, used by READYUV422
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READYUV422
    YUVTORGB(kYuvJConstants)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 2751 |
| 2752 // 8 pixels. |
| 2256 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2753 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2257 // Similar to I420 but duplicate UV once more. | 2754 // Similar to I420 but duplicate UV once more. |
| 2258 __declspec(naked) __declspec(align(16)) | 2755 __declspec(naked) |
| 2259 void I411ToARGBRow_SSSE3(const uint8* y_buf, | 2756 void I411ToARGBRow_SSSE3(const uint8* y_buf, |
| 2260 const uint8* u_buf, | 2757 const uint8* u_buf, |
| 2261 const uint8* v_buf, | 2758 const uint8* v_buf, |
| 2262 uint8* dst_argb, | 2759 uint8* dst_argb, |
| 2263 int width) { | 2760 int width) { |
| 2264 __asm { | 2761 __asm { |
| 2265 push ebx | 2762 push ebx |
| 2266 push esi | 2763 push esi |
| 2267 push edi | 2764 push edi |
| 2268 mov eax, [esp + 12 + 4] // Y | 2765 mov eax, [esp + 12 + 4] // Y |
| (...skipping 14 matching lines...) Expand all Loading... |
| 2283 | 2780 |
| 2284 pop edi | 2781 pop edi |
| 2285 pop esi | 2782 pop esi |
| 2286 pop ebx | 2783 pop ebx |
| 2287 ret | 2784 ret |
| 2288 } | 2785 } |
| 2289 } | 2786 } |
| 2290 | 2787 |
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// NV12 supplies chroma as a single interleaved UV plane, so only two source
// pointers are needed.  The loop runs ceil(width/8) times, writing 32 bytes
// each pass.  SSE-only code, so no vzeroupper is required.
__declspec(naked)
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // UV
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READNV12
    YUVTORGB(kYuvConstants)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}
| 2318 | 2815 |
| 2319 // 8 pixels. | 2816 // 8 pixels. |
| 2320 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). | 2817 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2321 __declspec(naked) __declspec(align(16)) | 2818 __declspec(naked) |
| 2322 void NV21ToARGBRow_SSSE3(const uint8* y_buf, | 2819 void NV21ToARGBRow_SSSE3(const uint8* y_buf, |
| 2323 const uint8* uv_buf, | 2820 const uint8* uv_buf, |
| 2324 uint8* dst_argb, | 2821 uint8* dst_argb, |
| 2325 int width) { | 2822 int width) { |
| 2326 __asm { | 2823 __asm { |
| 2327 push esi | 2824 push esi |
| 2328 mov eax, [esp + 4 + 4] // Y | 2825 mov eax, [esp + 4 + 4] // Y |
| 2329 mov esi, [esp + 4 + 8] // UV | 2826 mov esi, [esp + 4 + 8] // UV |
| 2330 mov edx, [esp + 4 + 12] // argb | 2827 mov edx, [esp + 4 + 12] // argb |
| 2331 mov ecx, [esp + 4 + 16] // width | 2828 mov ecx, [esp + 4 + 16] // width |
| 2332 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2829 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2333 | 2830 |
| 2334 convertloop: | 2831 convertloop: |
| 2335 READNV12 | 2832 READNV12 |
| 2336 YUVTORGB(kYvuConstants) | 2833 YUVTORGB(kYvuConstants) |
| 2337 STOREARGB | 2834 STOREARGB |
| 2338 | 2835 |
| 2339 sub ecx, 8 | 2836 sub ecx, 8 |
| 2340 jg convertloop | 2837 jg convertloop |
| 2341 | 2838 |
| 2342 pop esi | 2839 pop esi |
| 2343 ret | 2840 ret |
| 2344 } | 2841 } |
| 2345 } | 2842 } |
| 2346 | 2843 |
| 2347 __declspec(naked) __declspec(align(16)) | 2844 __declspec(naked) |
| 2348 void I422ToBGRARow_SSSE3(const uint8* y_buf, | 2845 void I422ToBGRARow_SSSE3(const uint8* y_buf, |
| 2349 const uint8* u_buf, | 2846 const uint8* u_buf, |
| 2350 const uint8* v_buf, | 2847 const uint8* v_buf, |
| 2351 uint8* dst_bgra, | 2848 uint8* dst_bgra, |
| 2352 int width) { | 2849 int width) { |
| 2353 __asm { | 2850 __asm { |
| 2354 push esi | 2851 push esi |
| 2355 push edi | 2852 push edi |
| 2356 mov eax, [esp + 8 + 4] // Y | 2853 mov eax, [esp + 8 + 4] // Y |
| 2357 mov esi, [esp + 8 + 8] // U | 2854 mov esi, [esp + 8 + 8] // U |
| 2358 mov edi, [esp + 8 + 12] // V | 2855 mov edi, [esp + 8 + 12] // V |
| 2359 mov edx, [esp + 8 + 16] // bgra | 2856 mov edx, [esp + 8 + 16] // bgra |
| 2360 mov ecx, [esp + 8 + 20] // width | 2857 mov ecx, [esp + 8 + 20] // width |
| 2361 sub edi, esi | 2858 sub edi, esi |
| 2362 | 2859 |
| 2363 convertloop: | 2860 convertloop: |
| 2364 READYUV422 | 2861 READYUV422 |
| 2365 YUVTORGB(kYuvConstants) | 2862 YUVTORGB(kYuvConstants) |
| 2366 STOREBGRA | 2863 STOREBGRA |
| 2367 | 2864 |
| 2368 sub ecx, 8 | 2865 sub ecx, 8 |
| 2369 jg convertloop | 2866 jg convertloop |
| 2370 | 2867 |
| 2371 pop edi | 2868 pop edi |
| 2372 pop esi | 2869 pop esi |
| 2373 ret | 2870 ret |
| 2374 } | 2871 } |
| 2375 } | 2872 } |
| 2376 | 2873 |
| 2377 __declspec(naked) __declspec(align(16)) | 2874 __declspec(naked) |
| 2378 void I422ToABGRRow_SSSE3(const uint8* y_buf, | 2875 void I422ToABGRRow_SSSE3(const uint8* y_buf, |
| 2379 const uint8* u_buf, | 2876 const uint8* u_buf, |
| 2380 const uint8* v_buf, | 2877 const uint8* v_buf, |
| 2381 uint8* dst_abgr, | 2878 uint8* dst_abgr, |
| 2382 int width) { | 2879 int width) { |
| 2383 __asm { | 2880 __asm { |
| 2384 push esi | 2881 push esi |
| 2385 push edi | 2882 push edi |
| 2386 mov eax, [esp + 8 + 4] // Y | 2883 mov eax, [esp + 8 + 4] // Y |
| 2387 mov esi, [esp + 8 + 8] // U | 2884 mov esi, [esp + 8 + 8] // U |
| (...skipping 10 matching lines...) Expand all Loading... |
| 2398 | 2895 |
| 2399 sub ecx, 8 | 2896 sub ecx, 8 |
| 2400 jg convertloop | 2897 jg convertloop |
| 2401 | 2898 |
| 2402 pop edi | 2899 pop edi |
| 2403 pop esi | 2900 pop esi |
| 2404 ret | 2901 ret |
| 2405 } | 2902 } |
| 2406 } | 2903 } |
| 2407 | 2904 |
| 2408 __declspec(naked) __declspec(align(16)) | 2905 __declspec(naked) |
| 2409 void I422ToRGBARow_SSSE3(const uint8* y_buf, | 2906 void I422ToRGBARow_SSSE3(const uint8* y_buf, |
| 2410 const uint8* u_buf, | 2907 const uint8* u_buf, |
| 2411 const uint8* v_buf, | 2908 const uint8* v_buf, |
| 2412 uint8* dst_rgba, | 2909 uint8* dst_rgba, |
| 2413 int width) { | 2910 int width) { |
| 2414 __asm { | 2911 __asm { |
| 2415 push esi | 2912 push esi |
| 2416 push edi | 2913 push edi |
| 2417 mov eax, [esp + 8 + 4] // Y | 2914 mov eax, [esp + 8 + 4] // Y |
| 2418 mov esi, [esp + 8 + 8] // U | 2915 mov esi, [esp + 8 + 8] // U |
| (...skipping 11 matching lines...) Expand all Loading... |
| 2430 jg convertloop | 2927 jg convertloop |
| 2431 | 2928 |
| 2432 pop edi | 2929 pop edi |
| 2433 pop esi | 2930 pop esi |
| 2434 ret | 2931 ret |
| 2435 } | 2932 } |
| 2436 } | 2933 } |
| 2437 | 2934 |
| 2438 #endif // HAS_I422TOARGBROW_SSSE3 | 2935 #endif // HAS_I422TOARGBROW_SSSE3 |
| 2439 | 2936 |
| 2440 #ifdef HAS_YTOARGBROW_SSE2 | 2937 #ifdef HAS_I400TOARGBROW_SSE2 |
| 2441 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). | 2938 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). |
| 2442 __declspec(naked) __declspec(align(16)) | 2939 __declspec(naked) |
| 2443 void YToARGBRow_SSE2(const uint8* y_buf, | 2940 void I400ToARGBRow_SSE2(const uint8* y_buf, |
| 2444 uint8* rgb_buf, | 2941 uint8* rgb_buf, |
| 2445 int width) { | 2942 int width) { |
| 2446 __asm { | 2943 __asm { |
| 2447 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) | 2944 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) |
| 2448 movd xmm2, eax | 2945 movd xmm2, eax |
| 2449 pshufd xmm2, xmm2,0 | 2946 pshufd xmm2, xmm2,0 |
| 2450 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) | 2947 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) |
| 2451 movd xmm3, eax | 2948 movd xmm3, eax |
| 2452 pshufd xmm3, xmm3, 0 | 2949 pshufd xmm3, xmm3, 0 |
| 2453 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 2950 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
| 2454 pslld xmm4, 24 | 2951 pslld xmm4, 24 |
| 2455 | 2952 |
| (...skipping 19 matching lines...) Expand all Loading... |
| 2475 por xmm0, xmm4 | 2972 por xmm0, xmm4 |
| 2476 por xmm1, xmm4 | 2973 por xmm1, xmm4 |
| 2477 movdqu [edx], xmm0 | 2974 movdqu [edx], xmm0 |
| 2478 movdqu [edx + 16], xmm1 | 2975 movdqu [edx + 16], xmm1 |
| 2479 lea edx, [edx + 32] | 2976 lea edx, [edx + 32] |
| 2480 sub ecx, 8 | 2977 sub ecx, 8 |
| 2481 jg convertloop | 2978 jg convertloop |
| 2482 ret | 2979 ret |
| 2483 } | 2980 } |
| 2484 } | 2981 } |
| 2485 #endif // HAS_YTOARGBROW_SSE2 | 2982 #endif // HAS_I400TOARGBROW_SSE2 |
| 2486 | 2983 |
| 2487 #ifdef HAS_YTOARGBROW_AVX2 | 2984 #ifdef HAS_I400TOARGBROW_AVX2 |
| 2488 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). | 2985 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). |
| 2489 // note: vpunpcklbw mutates and vpackuswb unmutates. | 2986 // note: vpunpcklbw mutates and vpackuswb unmutates. |
| 2490 __declspec(naked) __declspec(align(16)) | 2987 __declspec(naked) |
| 2491 void YToARGBRow_AVX2(const uint8* y_buf, | 2988 void I400ToARGBRow_AVX2(const uint8* y_buf, |
| 2492 uint8* rgb_buf, | 2989 uint8* rgb_buf, |
| 2493 int width) { | 2990 int width) { |
| 2494 __asm { | 2991 __asm { |
| 2495 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) | 2992 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) |
| 2496 vmovd xmm2, eax | 2993 vmovd xmm2, eax |
| 2497 vbroadcastss ymm2, xmm2 | 2994 vbroadcastss ymm2, xmm2 |
| 2498 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) | 2995 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) |
| 2499 vmovd xmm3, eax | 2996 vmovd xmm3, eax |
| 2500 vbroadcastss ymm3, xmm3 | 2997 vbroadcastss ymm3, xmm3 |
| 2501 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 | 2998 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 |
| 2502 vpslld ymm4, ymm4, 24 | 2999 vpslld ymm4, ymm4, 24 |
| 2503 | 3000 |
| 2504 mov eax, [esp + 4] // Y | 3001 mov eax, [esp + 4] // Y |
| 2505 mov edx, [esp + 8] // rgb | 3002 mov edx, [esp + 8] // rgb |
| 2506 mov ecx, [esp + 12] // width | 3003 mov ecx, [esp + 12] // width |
| 2507 | 3004 |
| 2508 convertloop: | 3005 convertloop: |
| 2509 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 | 3006 // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 |
| 2510 vmovdqu xmm0, [eax] | 3007 vmovdqu xmm0, [eax] |
| 2511 lea eax, [eax + 16] | 3008 lea eax, [eax + 16] |
| 2512 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates | 3009 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates |
| 2513 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y | 3010 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y |
| 2514 vpmulhuw ymm0, ymm0, ymm2 | 3011 vpmulhuw ymm0, ymm0, ymm2 |
| 2515 vpsubusw ymm0, ymm0, ymm3 | 3012 vpsubusw ymm0, ymm0, ymm3 |
| 2516 vpsrlw ymm0, ymm0, 6 | 3013 vpsrlw ymm0, ymm0, 6 |
| 2517 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 | 3014 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 |
| 2518 | 3015 |
| 2519 // TODO(fbarchard): Weave alpha with unpack. | 3016 // TODO(fbarchard): Weave alpha with unpack. |
| 2520 // Step 2: Weave into ARGB | 3017 // Step 2: Weave into ARGB |
| 2521 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates | 3018 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates |
| 2522 vpermq ymm1, ymm1, 0xd8 | 3019 vpermq ymm1, ymm1, 0xd8 |
| 2523 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels | 3020 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels |
| 2524 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels | 3021 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels |
| 2525 vpor ymm0, ymm0, ymm4 | 3022 vpor ymm0, ymm0, ymm4 |
| 2526 vpor ymm1, ymm1, ymm4 | 3023 vpor ymm1, ymm1, ymm4 |
| 2527 vmovdqu [edx], ymm0 | 3024 vmovdqu [edx], ymm0 |
| 2528 vmovdqu [edx + 32], ymm1 | 3025 vmovdqu [edx + 32], ymm1 |
| 2529 lea edx, [edx + 64] | 3026 lea edx, [edx + 64] |
| 2530 sub ecx, 16 | 3027 sub ecx, 16 |
| 2531 jg convertloop | 3028 jg convertloop |
| 2532 vzeroupper | 3029 vzeroupper |
| 2533 ret | 3030 ret |
| 2534 } | 3031 } |
| 2535 } | 3032 } |
| 2536 #endif // HAS_YTOARGBROW_AVX2 | 3033 #endif // HAS_I400TOARGBROW_AVX2 |
| 2537 | 3034 |
| 2538 #ifdef HAS_MIRRORROW_SSSE3 | 3035 #ifdef HAS_MIRRORROW_SSSE3 |
| 2539 // Shuffle table for reversing the bytes. | 3036 // Shuffle table for reversing the bytes. |
| 2540 static const uvec8 kShuffleMirror = { | 3037 static const uvec8 kShuffleMirror = { |
| 2541 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 3038 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
| 2542 }; | 3039 }; |
| 2543 | 3040 |
| 2544 // TODO(fbarchard): Replace lea with -16 offset. | 3041 // TODO(fbarchard): Replace lea with -16 offset. |
| 2545 __declspec(naked) __declspec(align(16)) | 3042 __declspec(naked) |
| 2546 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { | 3043 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
| 2547 __asm { | 3044 __asm { |
| 2548 mov eax, [esp + 4] // src | 3045 mov eax, [esp + 4] // src |
| 2549 mov edx, [esp + 8] // dst | 3046 mov edx, [esp + 8] // dst |
| 2550 mov ecx, [esp + 12] // width | 3047 mov ecx, [esp + 12] // width |
| 2551 movdqa xmm5, kShuffleMirror | 3048 movdqa xmm5, kShuffleMirror |
| 2552 | 3049 |
| 2553 convertloop: | 3050 convertloop: |
| 2554 movdqu xmm0, [eax - 16 + ecx] | 3051 movdqu xmm0, [eax - 16 + ecx] |
| 2555 pshufb xmm0, xmm5 | 3052 pshufb xmm0, xmm5 |
| 2556 movdqu [edx], xmm0 | 3053 movdqu [edx], xmm0 |
| 2557 lea edx, [edx + 16] | 3054 lea edx, [edx + 16] |
| 2558 sub ecx, 16 | 3055 sub ecx, 16 |
| 2559 jg convertloop | 3056 jg convertloop |
| 2560 ret | 3057 ret |
| 2561 } | 3058 } |
| 2562 } | 3059 } |
| 2563 #endif // HAS_MIRRORROW_SSSE3 | 3060 #endif // HAS_MIRRORROW_SSSE3 |
| 2564 | 3061 |
| 2565 #ifdef HAS_MIRRORROW_AVX2 | 3062 #ifdef HAS_MIRRORROW_AVX2 |
| 2566 __declspec(naked) __declspec(align(16)) | 3063 __declspec(naked) |
| 2567 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | 3064 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 2568 __asm { | 3065 __asm { |
| 2569 mov eax, [esp + 4] // src | 3066 mov eax, [esp + 4] // src |
| 2570 mov edx, [esp + 8] // dst | 3067 mov edx, [esp + 8] // dst |
| 2571 mov ecx, [esp + 12] // width | 3068 mov ecx, [esp + 12] // width |
| 2572 vbroadcastf128 ymm5, kShuffleMirror | 3069 vbroadcastf128 ymm5, kShuffleMirror |
| 2573 | 3070 |
| 2574 convertloop: | 3071 convertloop: |
| 2575 vmovdqu ymm0, [eax - 32 + ecx] | 3072 vmovdqu ymm0, [eax - 32 + ecx] |
| 2576 vpshufb ymm0, ymm0, ymm5 | 3073 vpshufb ymm0, ymm0, ymm5 |
| 2577 vpermq ymm0, ymm0, 0x4e // swap high and low halfs | 3074 vpermq ymm0, ymm0, 0x4e // swap high and low halfs |
| 2578 vmovdqu [edx], ymm0 | 3075 vmovdqu [edx], ymm0 |
| 2579 lea edx, [edx + 32] | 3076 lea edx, [edx + 32] |
| 2580 sub ecx, 32 | 3077 sub ecx, 32 |
| 2581 jg convertloop | 3078 jg convertloop |
| 2582 vzeroupper | 3079 vzeroupper |
| 2583 ret | 3080 ret |
| 2584 } | 3081 } |
| 2585 } | 3082 } |
| 2586 #endif // HAS_MIRRORROW_AVX2 | 3083 #endif // HAS_MIRRORROW_AVX2 |
| 2587 | 3084 |
| 2588 #ifdef HAS_MIRRORROW_SSE2 | 3085 #ifdef HAS_MIRRORROW_SSE2 |
| 2589 __declspec(naked) __declspec(align(16)) | 3086 __declspec(naked) |
| 2590 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | 3087 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 2591 __asm { | 3088 __asm { |
| 2592 mov eax, [esp + 4] // src | 3089 mov eax, [esp + 4] // src |
| 2593 mov edx, [esp + 8] // dst | 3090 mov edx, [esp + 8] // dst |
| 2594 mov ecx, [esp + 12] // width | 3091 mov ecx, [esp + 12] // width |
| 2595 | 3092 |
| 2596 convertloop: | 3093 convertloop: |
| 2597 movdqu xmm0, [eax - 16 + ecx] | 3094 movdqu xmm0, [eax - 16 + ecx] |
| 2598 movdqa xmm1, xmm0 // swap bytes | 3095 movdqa xmm1, xmm0 // swap bytes |
| 2599 psllw xmm0, 8 | 3096 psllw xmm0, 8 |
| (...skipping 10 matching lines...) Expand all Loading... |
| 2610 } | 3107 } |
| 2611 } | 3108 } |
| 2612 #endif // HAS_MIRRORROW_SSE2 | 3109 #endif // HAS_MIRRORROW_SSE2 |
| 2613 | 3110 |
| 2614 #ifdef HAS_MIRRORROW_UV_SSSE3 | 3111 #ifdef HAS_MIRRORROW_UV_SSSE3 |
| 2615 // Shuffle table for reversing the bytes of UV channels. | 3112 // Shuffle table for reversing the bytes of UV channels. |
| 2616 static const uvec8 kShuffleMirrorUV = { | 3113 static const uvec8 kShuffleMirrorUV = { |
| 2617 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | 3114 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u |
| 2618 }; | 3115 }; |
| 2619 | 3116 |
| 2620 __declspec(naked) __declspec(align(16)) | 3117 __declspec(naked) |
| 2621 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | 3118 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, |
| 2622 int width) { | 3119 int width) { |
| 2623 __asm { | 3120 __asm { |
| 2624 push edi | 3121 push edi |
| 2625 mov eax, [esp + 4 + 4] // src | 3122 mov eax, [esp + 4 + 4] // src |
| 2626 mov edx, [esp + 4 + 8] // dst_u | 3123 mov edx, [esp + 4 + 8] // dst_u |
| 2627 mov edi, [esp + 4 + 12] // dst_v | 3124 mov edi, [esp + 4 + 12] // dst_v |
| 2628 mov ecx, [esp + 4 + 16] // width | 3125 mov ecx, [esp + 4 + 16] // width |
| 2629 movdqa xmm1, kShuffleMirrorUV | 3126 movdqa xmm1, kShuffleMirrorUV |
| 2630 lea eax, [eax + ecx * 2 - 16] | 3127 lea eax, [eax + ecx * 2 - 16] |
| 2631 sub edi, edx | 3128 sub edi, edx |
| 2632 | 3129 |
| 2633 convertloop: | 3130 convertloop: |
| 2634 movdqu xmm0, [eax] | 3131 movdqu xmm0, [eax] |
| 2635 lea eax, [eax - 16] | 3132 lea eax, [eax - 16] |
| 2636 pshufb xmm0, xmm1 | 3133 pshufb xmm0, xmm1 |
| 2637 movlpd qword ptr [edx], xmm0 | 3134 movlpd qword ptr [edx], xmm0 |
| 2638 movhpd qword ptr [edx + edi], xmm0 | 3135 movhpd qword ptr [edx + edi], xmm0 |
| 2639 lea edx, [edx + 8] | 3136 lea edx, [edx + 8] |
| 2640 sub ecx, 8 | 3137 sub ecx, 8 |
| 2641 jg convertloop | 3138 jg convertloop |
| 2642 | 3139 |
| 2643 pop edi | 3140 pop edi |
| 2644 ret | 3141 ret |
| 2645 } | 3142 } |
| 2646 } | 3143 } |
| 2647 #endif // HAS_MIRRORROW_UV_SSSE3 | 3144 #endif // HAS_MIRRORROW_UV_SSSE3 |
| 2648 | 3145 |
| 2649 #ifdef HAS_ARGBMIRRORROW_SSE2 | 3146 #ifdef HAS_ARGBMIRRORROW_SSE2 |
| 2650 __declspec(naked) __declspec(align(16)) | 3147 __declspec(naked) |
| 2651 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | 3148 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 2652 __asm { | 3149 __asm { |
| 2653 mov eax, [esp + 4] // src | 3150 mov eax, [esp + 4] // src |
| 2654 mov edx, [esp + 8] // dst | 3151 mov edx, [esp + 8] // dst |
| 2655 mov ecx, [esp + 12] // width | 3152 mov ecx, [esp + 12] // width |
| 2656 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. | 3153 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. |
| 2657 | 3154 |
| 2658 convertloop: | 3155 convertloop: |
| 2659 movdqu xmm0, [eax] | 3156 movdqu xmm0, [eax] |
| 2660 lea eax, [eax - 16] | 3157 lea eax, [eax - 16] |
| 2661 pshufd xmm0, xmm0, 0x1b | 3158 pshufd xmm0, xmm0, 0x1b |
| 2662 movdqu [edx], xmm0 | 3159 movdqu [edx], xmm0 |
| 2663 lea edx, [edx + 16] | 3160 lea edx, [edx + 16] |
| 2664 sub ecx, 4 | 3161 sub ecx, 4 |
| 2665 jg convertloop | 3162 jg convertloop |
| 2666 ret | 3163 ret |
| 2667 } | 3164 } |
| 2668 } | 3165 } |
| 2669 #endif // HAS_ARGBMIRRORROW_SSE2 | 3166 #endif // HAS_ARGBMIRRORROW_SSE2 |
| 2670 | 3167 |
| 2671 #ifdef HAS_ARGBMIRRORROW_AVX2 | 3168 #ifdef HAS_ARGBMIRRORROW_AVX2 |
| 2672 // Shuffle table for reversing the bytes. | 3169 // Shuffle table for reversing the bytes. |
| 2673 static const ulvec32 kARGBShuffleMirror_AVX2 = { | 3170 static const ulvec32 kARGBShuffleMirror_AVX2 = { |
| 2674 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 3171 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
| 2675 }; | 3172 }; |
| 2676 | 3173 |
| 2677 __declspec(naked) __declspec(align(16)) | 3174 __declspec(naked) |
| 2678 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | 3175 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 2679 __asm { | 3176 __asm { |
| 2680 mov eax, [esp + 4] // src | 3177 mov eax, [esp + 4] // src |
| 2681 mov edx, [esp + 8] // dst | 3178 mov edx, [esp + 8] // dst |
| 2682 mov ecx, [esp + 12] // width | 3179 mov ecx, [esp + 12] // width |
| 2683 vmovdqu ymm5, kARGBShuffleMirror_AVX2 | 3180 vmovdqu ymm5, kARGBShuffleMirror_AVX2 |
| 2684 | 3181 |
| 2685 convertloop: | 3182 convertloop: |
| 2686 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order | 3183 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order |
| 2687 vmovdqu [edx], ymm0 | 3184 vmovdqu [edx], ymm0 |
| 2688 lea edx, [edx + 32] | 3185 lea edx, [edx + 32] |
| 2689 sub ecx, 8 | 3186 sub ecx, 8 |
| 2690 jg convertloop | 3187 jg convertloop |
| 2691 vzeroupper | 3188 vzeroupper |
| 2692 ret | 3189 ret |
| 2693 } | 3190 } |
| 2694 } | 3191 } |
| 2695 #endif // HAS_ARGBMIRRORROW_AVX2 | 3192 #endif // HAS_ARGBMIRRORROW_AVX2 |
| 2696 | 3193 |
| 2697 #ifdef HAS_SPLITUVROW_SSE2 | 3194 #ifdef HAS_SPLITUVROW_SSE2 |
| 2698 __declspec(naked) __declspec(align(16)) | 3195 __declspec(naked) |
| 2699 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 3196 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
| 2700 __asm { | 3197 __asm { |
| 2701 push edi | 3198 push edi |
| 2702 mov eax, [esp + 4 + 4] // src_uv | 3199 mov eax, [esp + 4 + 4] // src_uv |
| 2703 mov edx, [esp + 4 + 8] // dst_u | 3200 mov edx, [esp + 4 + 8] // dst_u |
| 2704 mov edi, [esp + 4 + 12] // dst_v | 3201 mov edi, [esp + 4 + 12] // dst_v |
| 2705 mov ecx, [esp + 4 + 16] // pix | 3202 mov ecx, [esp + 4 + 16] // pix |
| 2706 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3203 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 2707 psrlw xmm5, 8 | 3204 psrlw xmm5, 8 |
| 2708 sub edi, edx | 3205 sub edi, edx |
| (...skipping 17 matching lines...) Expand all Loading... |
| 2726 jg convertloop | 3223 jg convertloop |
| 2727 | 3224 |
| 2728 pop edi | 3225 pop edi |
| 2729 ret | 3226 ret |
| 2730 } | 3227 } |
| 2731 } | 3228 } |
| 2732 | 3229 |
| 2733 #endif // HAS_SPLITUVROW_SSE2 | 3230 #endif // HAS_SPLITUVROW_SSE2 |
| 2734 | 3231 |
| 2735 #ifdef HAS_SPLITUVROW_AVX2 | 3232 #ifdef HAS_SPLITUVROW_AVX2 |
| 2736 __declspec(naked) __declspec(align(16)) | 3233 __declspec(naked) |
| 2737 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 3234 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
| 2738 __asm { | 3235 __asm { |
| 2739 push edi | 3236 push edi |
| 2740 mov eax, [esp + 4 + 4] // src_uv | 3237 mov eax, [esp + 4 + 4] // src_uv |
| 2741 mov edx, [esp + 4 + 8] // dst_u | 3238 mov edx, [esp + 4 + 8] // dst_u |
| 2742 mov edi, [esp + 4 + 12] // dst_v | 3239 mov edi, [esp + 4 + 12] // dst_v |
| 2743 mov ecx, [esp + 4 + 16] // pix | 3240 mov ecx, [esp + 4 + 16] // pix |
| 2744 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3241 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 2745 vpsrlw ymm5, ymm5, 8 | 3242 vpsrlw ymm5, ymm5, 8 |
| 2746 sub edi, edx | 3243 sub edi, edx |
| (...skipping 17 matching lines...) Expand all Loading... |
| 2764 jg convertloop | 3261 jg convertloop |
| 2765 | 3262 |
| 2766 pop edi | 3263 pop edi |
| 2767 vzeroupper | 3264 vzeroupper |
| 2768 ret | 3265 ret |
| 2769 } | 3266 } |
| 2770 } | 3267 } |
| 2771 #endif // HAS_SPLITUVROW_AVX2 | 3268 #endif // HAS_SPLITUVROW_AVX2 |
| 2772 | 3269 |
| 2773 #ifdef HAS_MERGEUVROW_SSE2 | 3270 #ifdef HAS_MERGEUVROW_SSE2 |
| 2774 __declspec(naked) __declspec(align(16)) | 3271 __declspec(naked) |
| 2775 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 3272 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
| 2776 int width) { | 3273 int width) { |
| 2777 __asm { | 3274 __asm { |
| 2778 push edi | 3275 push edi |
| 2779 mov eax, [esp + 4 + 4] // src_u | 3276 mov eax, [esp + 4 + 4] // src_u |
| 2780 mov edx, [esp + 4 + 8] // src_v | 3277 mov edx, [esp + 4 + 8] // src_v |
| 2781 mov edi, [esp + 4 + 12] // dst_uv | 3278 mov edi, [esp + 4 + 12] // dst_uv |
| 2782 mov ecx, [esp + 4 + 16] // width | 3279 mov ecx, [esp + 4 + 16] // width |
| 2783 sub edx, eax | 3280 sub edx, eax |
| 2784 | 3281 |
| (...skipping 10 matching lines...) Expand all Loading... |
| 2795 sub ecx, 16 | 3292 sub ecx, 16 |
| 2796 jg convertloop | 3293 jg convertloop |
| 2797 | 3294 |
| 2798 pop edi | 3295 pop edi |
| 2799 ret | 3296 ret |
| 2800 } | 3297 } |
| 2801 } | 3298 } |
| 2802 #endif // HAS_MERGEUVROW_SSE2 | 3299 #endif // HAS_MERGEUVROW_SSE2 |
| 2803 | 3300 |
| 2804 #ifdef HAS_MERGEUVROW_AVX2 | 3301 #ifdef HAS_MERGEUVROW_AVX2 |
| 2805 __declspec(naked) __declspec(align(16)) | 3302 __declspec(naked) |
| 2806 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 3303 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
| 2807 int width) { | 3304 int width) { |
| 2808 __asm { | 3305 __asm { |
| 2809 push edi | 3306 push edi |
| 2810 mov eax, [esp + 4 + 4] // src_u | 3307 mov eax, [esp + 4 + 4] // src_u |
| 2811 mov edx, [esp + 4 + 8] // src_v | 3308 mov edx, [esp + 4 + 8] // src_v |
| 2812 mov edi, [esp + 4 + 12] // dst_uv | 3309 mov edi, [esp + 4 + 12] // dst_uv |
| 2813 mov ecx, [esp + 4 + 16] // width | 3310 mov ecx, [esp + 4 + 16] // width |
| 2814 sub edx, eax | 3311 sub edx, eax |
| 2815 | 3312 |
| (...skipping 13 matching lines...) Expand all Loading... |
| 2829 | 3326 |
| 2830 pop edi | 3327 pop edi |
| 2831 vzeroupper | 3328 vzeroupper |
| 2832 ret | 3329 ret |
| 2833 } | 3330 } |
| 2834 } | 3331 } |
| 2835 #endif // HAS_MERGEUVROW_AVX2 | 3332 #endif // HAS_MERGEUVROW_AVX2 |
| 2836 | 3333 |
| 2837 #ifdef HAS_COPYROW_SSE2 | 3334 #ifdef HAS_COPYROW_SSE2 |
| 2838 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. | 3335 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. |
| 2839 __declspec(naked) __declspec(align(16)) | 3336 __declspec(naked) |
| 2840 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { | 3337 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { |
| 2841 __asm { | 3338 __asm { |
| 2842 mov eax, [esp + 4] // src | 3339 mov eax, [esp + 4] // src |
| 2843 mov edx, [esp + 8] // dst | 3340 mov edx, [esp + 8] // dst |
| 2844 mov ecx, [esp + 12] // count | 3341 mov ecx, [esp + 12] // count |
| 2845 | 3342 |
| 2846 convertloop: | 3343 convertloop: |
| 2847 movdqu xmm0, [eax] | 3344 movdqu xmm0, [eax] |
| 2848 movdqu xmm1, [eax + 16] | 3345 movdqu xmm1, [eax + 16] |
| 2849 lea eax, [eax + 32] | 3346 lea eax, [eax + 32] |
| 2850 movdqu [edx], xmm0 | 3347 movdqu [edx], xmm0 |
| 2851 movdqu [edx + 16], xmm1 | 3348 movdqu [edx + 16], xmm1 |
| 2852 lea edx, [edx + 32] | 3349 lea edx, [edx + 32] |
| 2853 sub ecx, 32 | 3350 sub ecx, 32 |
| 2854 jg convertloop | 3351 jg convertloop |
| 2855 ret | 3352 ret |
| 2856 } | 3353 } |
| 2857 } | 3354 } |
| 2858 #endif // HAS_COPYROW_SSE2 | 3355 #endif // HAS_COPYROW_SSE2 |
| 2859 | 3356 |
| 2860 #ifdef HAS_COPYROW_AVX | 3357 #ifdef HAS_COPYROW_AVX |
| 2861 // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. | 3358 // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. |
| 2862 __declspec(naked) __declspec(align(16)) | 3359 __declspec(naked) |
| 2863 void CopyRow_AVX(const uint8* src, uint8* dst, int count) { | 3360 void CopyRow_AVX(const uint8* src, uint8* dst, int count) { |
| 2864 __asm { | 3361 __asm { |
| 2865 mov eax, [esp + 4] // src | 3362 mov eax, [esp + 4] // src |
| 2866 mov edx, [esp + 8] // dst | 3363 mov edx, [esp + 8] // dst |
| 2867 mov ecx, [esp + 12] // count | 3364 mov ecx, [esp + 12] // count |
| 2868 | 3365 |
| 2869 convertloop: | 3366 convertloop: |
| 2870 vmovdqu ymm0, [eax] | 3367 vmovdqu ymm0, [eax] |
| 2871 vmovdqu ymm1, [eax + 32] | 3368 vmovdqu ymm1, [eax + 32] |
| 2872 lea eax, [eax + 64] | 3369 lea eax, [eax + 64] |
| 2873 vmovdqu [edx], ymm0 | 3370 vmovdqu [edx], ymm0 |
| 2874 vmovdqu [edx + 32], ymm1 | 3371 vmovdqu [edx + 32], ymm1 |
| 2875 lea edx, [edx + 64] | 3372 lea edx, [edx + 64] |
| 2876 sub ecx, 64 | 3373 sub ecx, 64 |
| 2877 jg convertloop | 3374 jg convertloop |
| 2878 | 3375 |
| 2879 vzeroupper | 3376 vzeroupper |
| 2880 ret | 3377 ret |
| 2881 } | 3378 } |
| 2882 } | 3379 } |
| 2883 #endif // HAS_COPYROW_AVX | 3380 #endif // HAS_COPYROW_AVX |
| 2884 | 3381 |
| 2885 // Multiple of 1. | 3382 // Multiple of 1. |
| 2886 __declspec(naked) __declspec(align(16)) | 3383 __declspec(naked) |
| 2887 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { | 3384 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { |
| 2888 __asm { | 3385 __asm { |
| 2889 mov eax, esi | 3386 mov eax, esi |
| 2890 mov edx, edi | 3387 mov edx, edi |
| 2891 mov esi, [esp + 4] // src | 3388 mov esi, [esp + 4] // src |
| 2892 mov edi, [esp + 8] // dst | 3389 mov edi, [esp + 8] // dst |
| 2893 mov ecx, [esp + 12] // count | 3390 mov ecx, [esp + 12] // count |
| 2894 rep movsb | 3391 rep movsb |
| 2895 mov edi, edx | 3392 mov edi, edx |
| 2896 mov esi, eax | 3393 mov esi, eax |
| 2897 ret | 3394 ret |
| 2898 } | 3395 } |
| 2899 } | 3396 } |
| 2900 | 3397 |
| 2901 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 | 3398 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
| 2902 // width in pixels | 3399 // width in pixels |
| 2903 __declspec(naked) __declspec(align(16)) | 3400 __declspec(naked) |
| 2904 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | 3401 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 2905 __asm { | 3402 __asm { |
| 2906 mov eax, [esp + 4] // src | 3403 mov eax, [esp + 4] // src |
| 2907 mov edx, [esp + 8] // dst | 3404 mov edx, [esp + 8] // dst |
| 2908 mov ecx, [esp + 12] // count | 3405 mov ecx, [esp + 12] // count |
| 2909 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 | 3406 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
| 2910 pslld xmm0, 24 | 3407 pslld xmm0, 24 |
| 2911 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff | 3408 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
| 2912 psrld xmm1, 8 | 3409 psrld xmm1, 8 |
| 2913 | 3410 |
| (...skipping 15 matching lines...) Expand all Loading... |
| 2929 sub ecx, 8 | 3426 sub ecx, 8 |
| 2930 jg convertloop | 3427 jg convertloop |
| 2931 | 3428 |
| 2932 ret | 3429 ret |
| 2933 } | 3430 } |
| 2934 } | 3431 } |
| 2935 #endif // HAS_ARGBCOPYALPHAROW_SSE2 | 3432 #endif // HAS_ARGBCOPYALPHAROW_SSE2 |
| 2936 | 3433 |
| 2937 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 | 3434 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 |
| 2938 // width in pixels | 3435 // width in pixels |
| 2939 __declspec(naked) __declspec(align(16)) | 3436 __declspec(naked) |
| 2940 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | 3437 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 2941 __asm { | 3438 __asm { |
| 2942 mov eax, [esp + 4] // src | 3439 mov eax, [esp + 4] // src |
| 2943 mov edx, [esp + 8] // dst | 3440 mov edx, [esp + 8] // dst |
| 2944 mov ecx, [esp + 12] // count | 3441 mov ecx, [esp + 12] // count |
| 2945 vpcmpeqb ymm0, ymm0, ymm0 | 3442 vpcmpeqb ymm0, ymm0, ymm0 |
| 2946 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff | 3443 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
| 2947 | 3444 |
| 2948 convertloop: | 3445 convertloop: |
| 2949 vmovdqu ymm1, [eax] | 3446 vmovdqu ymm1, [eax] |
| 2950 vmovdqu ymm2, [eax + 32] | 3447 vmovdqu ymm2, [eax + 32] |
| 2951 lea eax, [eax + 64] | 3448 lea eax, [eax + 64] |
| 2952 vpblendvb ymm1, ymm1, [edx], ymm0 | 3449 vpblendvb ymm1, ymm1, [edx], ymm0 |
| 2953 vpblendvb ymm2, ymm2, [edx + 32], ymm0 | 3450 vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
| 2954 vmovdqu [edx], ymm1 | 3451 vmovdqu [edx], ymm1 |
| 2955 vmovdqu [edx + 32], ymm2 | 3452 vmovdqu [edx + 32], ymm2 |
| 2956 lea edx, [edx + 64] | 3453 lea edx, [edx + 64] |
| 2957 sub ecx, 16 | 3454 sub ecx, 16 |
| 2958 jg convertloop | 3455 jg convertloop |
| 2959 | 3456 |
| 2960 vzeroupper | 3457 vzeroupper |
| 2961 ret | 3458 ret |
| 2962 } | 3459 } |
| 2963 } | 3460 } |
| 2964 #endif // HAS_ARGBCOPYALPHAROW_AVX2 | 3461 #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
| 2965 | 3462 |
| 2966 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 | 3463 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 |
| 2967 // width in pixels | 3464 // width in pixels |
| 2968 __declspec(naked) __declspec(align(16)) | 3465 __declspec(naked) |
| 2969 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | 3466 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 2970 __asm { | 3467 __asm { |
| 2971 mov eax, [esp + 4] // src | 3468 mov eax, [esp + 4] // src |
| 2972 mov edx, [esp + 8] // dst | 3469 mov edx, [esp + 8] // dst |
| 2973 mov ecx, [esp + 12] // count | 3470 mov ecx, [esp + 12] // count |
| 2974 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 | 3471 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
| 2975 pslld xmm0, 24 | 3472 pslld xmm0, 24 |
| 2976 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff | 3473 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
| 2977 psrld xmm1, 8 | 3474 psrld xmm1, 8 |
| 2978 | 3475 |
| (...skipping 17 matching lines...) Expand all Loading... |
| 2996 sub ecx, 8 | 3493 sub ecx, 8 |
| 2997 jg convertloop | 3494 jg convertloop |
| 2998 | 3495 |
| 2999 ret | 3496 ret |
| 3000 } | 3497 } |
| 3001 } | 3498 } |
| 3002 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 | 3499 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 |
| 3003 | 3500 |
| 3004 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 | 3501 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 |
| 3005 // width in pixels | 3502 // width in pixels |
| 3006 __declspec(naked) __declspec(align(16)) | 3503 __declspec(naked) |
| 3007 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | 3504 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 3008 __asm { | 3505 __asm { |
| 3009 mov eax, [esp + 4] // src | 3506 mov eax, [esp + 4] // src |
| 3010 mov edx, [esp + 8] // dst | 3507 mov edx, [esp + 8] // dst |
| 3011 mov ecx, [esp + 12] // count | 3508 mov ecx, [esp + 12] // count |
| 3012 vpcmpeqb ymm0, ymm0, ymm0 | 3509 vpcmpeqb ymm0, ymm0, ymm0 |
| 3013 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff | 3510 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
| 3014 | 3511 |
| 3015 convertloop: | 3512 convertloop: |
| 3016 vpmovzxbd ymm1, qword ptr [eax] | 3513 vpmovzxbd ymm1, qword ptr [eax] |
| (...skipping 11 matching lines...) Expand all Loading... |
| 3028 | 3525 |
| 3029 vzeroupper | 3526 vzeroupper |
| 3030 ret | 3527 ret |
| 3031 } | 3528 } |
| 3032 } | 3529 } |
| 3033 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 | 3530 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
| 3034 | 3531 |
| 3035 #ifdef HAS_SETROW_X86 | 3532 #ifdef HAS_SETROW_X86 |
| 3036 // Write 'count' bytes using an 8 bit value repeated. | 3533 // Write 'count' bytes using an 8 bit value repeated. |
| 3037 // Count should be multiple of 4. | 3534 // Count should be multiple of 4. |
| 3038 __declspec(naked) __declspec(align(16)) | 3535 __declspec(naked) |
| 3039 void SetRow_X86(uint8* dst, uint8 v8, int count) { | 3536 void SetRow_X86(uint8* dst, uint8 v8, int count) { |
| 3040 __asm { | 3537 __asm { |
| 3041 movzx eax, byte ptr [esp + 8] // v8 | 3538 movzx eax, byte ptr [esp + 8] // v8 |
| 3042 mov edx, 0x01010101 // Duplicate byte to all bytes. | 3539 mov edx, 0x01010101 // Duplicate byte to all bytes. |
| 3043 mul edx // overwrites edx with upper part of result. | 3540 mul edx // overwrites edx with upper part of result. |
| 3044 mov edx, edi | 3541 mov edx, edi |
| 3045 mov edi, [esp + 4] // dst | 3542 mov edi, [esp + 4] // dst |
| 3046 mov ecx, [esp + 12] // count | 3543 mov ecx, [esp + 12] // count |
| 3047 shr ecx, 2 | 3544 shr ecx, 2 |
| 3048 rep stosd | 3545 rep stosd |
| 3049 mov edi, edx | 3546 mov edi, edx |
| 3050 ret | 3547 ret |
| 3051 } | 3548 } |
| 3052 } | 3549 } |
| 3053 | 3550 |
| 3054 // Write 'count' bytes using an 8 bit value repeated. | 3551 // Write 'count' bytes using an 8 bit value repeated. |
| 3055 __declspec(naked) __declspec(align(16)) | 3552 __declspec(naked) |
| 3056 void SetRow_ERMS(uint8* dst, uint8 v8, int count) { | 3553 void SetRow_ERMS(uint8* dst, uint8 v8, int count) { |
| 3057 __asm { | 3554 __asm { |
| 3058 mov edx, edi | 3555 mov edx, edi |
| 3059 mov edi, [esp + 4] // dst | 3556 mov edi, [esp + 4] // dst |
| 3060 mov eax, [esp + 8] // v8 | 3557 mov eax, [esp + 8] // v8 |
| 3061 mov ecx, [esp + 12] // count | 3558 mov ecx, [esp + 12] // count |
| 3062 rep stosb | 3559 rep stosb |
| 3063 mov edi, edx | 3560 mov edi, edx |
| 3064 ret | 3561 ret |
| 3065 } | 3562 } |
| 3066 } | 3563 } |
| 3067 | 3564 |
| 3068 // Write 'count' 32 bit values. | 3565 // Write 'count' 32 bit values. |
| 3069 __declspec(naked) __declspec(align(16)) | 3566 __declspec(naked) |
| 3070 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { | 3567 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { |
| 3071 __asm { | 3568 __asm { |
| 3072 mov edx, edi | 3569 mov edx, edi |
| 3073 mov edi, [esp + 4] // dst | 3570 mov edi, [esp + 4] // dst |
| 3074 mov eax, [esp + 8] // v32 | 3571 mov eax, [esp + 8] // v32 |
| 3075 mov ecx, [esp + 12] // count | 3572 mov ecx, [esp + 12] // count |
| 3076 rep stosd | 3573 rep stosd |
| 3077 mov edi, edx | 3574 mov edi, edx |
| 3078 ret | 3575 ret |
| 3079 } | 3576 } |
| 3080 } | 3577 } |
| 3081 #endif // HAS_SETROW_X86 | 3578 #endif // HAS_SETROW_X86 |
| 3082 | 3579 |
| 3083 #ifdef HAS_YUY2TOYROW_AVX2 | 3580 #ifdef HAS_YUY2TOYROW_AVX2 |
| 3084 __declspec(naked) __declspec(align(16)) | 3581 __declspec(naked) |
| 3085 void YUY2ToYRow_AVX2(const uint8* src_yuy2, | 3582 void YUY2ToYRow_AVX2(const uint8* src_yuy2, |
| 3086 uint8* dst_y, int pix) { | 3583 uint8* dst_y, int pix) { |
| 3087 __asm { | 3584 __asm { |
| 3088 mov eax, [esp + 4] // src_yuy2 | 3585 mov eax, [esp + 4] // src_yuy2 |
| 3089 mov edx, [esp + 8] // dst_y | 3586 mov edx, [esp + 8] // dst_y |
| 3090 mov ecx, [esp + 12] // pix | 3587 mov ecx, [esp + 12] // pix |
| 3091 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3588 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 3092 vpsrlw ymm5, ymm5, 8 | 3589 vpsrlw ymm5, ymm5, 8 |
| 3093 | 3590 |
| 3094 convertloop: | 3591 convertloop: |
| 3095 vmovdqu ymm0, [eax] | 3592 vmovdqu ymm0, [eax] |
| 3096 vmovdqu ymm1, [eax + 32] | 3593 vmovdqu ymm1, [eax + 32] |
| 3097 lea eax, [eax + 64] | 3594 lea eax, [eax + 64] |
| 3098 vpand ymm0, ymm0, ymm5 // even bytes are Y | 3595 vpand ymm0, ymm0, ymm5 // even bytes are Y |
| 3099 vpand ymm1, ymm1, ymm5 | 3596 vpand ymm1, ymm1, ymm5 |
| 3100 vpackuswb ymm0, ymm0, ymm1 // mutates. | 3597 vpackuswb ymm0, ymm0, ymm1 // mutates. |
| 3101 vpermq ymm0, ymm0, 0xd8 | 3598 vpermq ymm0, ymm0, 0xd8 |
| 3102 vmovdqu [edx], ymm0 | 3599 vmovdqu [edx], ymm0 |
| 3103 lea edx, [edx + 32] | 3600 lea edx, [edx + 32] |
| 3104 sub ecx, 32 | 3601 sub ecx, 32 |
| 3105 jg convertloop | 3602 jg convertloop |
| 3106 vzeroupper | 3603 vzeroupper |
| 3107 ret | 3604 ret |
| 3108 } | 3605 } |
| 3109 } | 3606 } |
| 3110 | 3607 |
| 3111 __declspec(naked) __declspec(align(16)) | 3608 __declspec(naked) |
| 3112 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | 3609 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, |
| 3113 uint8* dst_u, uint8* dst_v, int pix) { | 3610 uint8* dst_u, uint8* dst_v, int pix) { |
| 3114 __asm { | 3611 __asm { |
| 3115 push esi | 3612 push esi |
| 3116 push edi | 3613 push edi |
| 3117 mov eax, [esp + 8 + 4] // src_yuy2 | 3614 mov eax, [esp + 8 + 4] // src_yuy2 |
| 3118 mov esi, [esp + 8 + 8] // stride_yuy2 | 3615 mov esi, [esp + 8 + 8] // stride_yuy2 |
| 3119 mov edx, [esp + 8 + 12] // dst_u | 3616 mov edx, [esp + 8 + 12] // dst_u |
| 3120 mov edi, [esp + 8 + 16] // dst_v | 3617 mov edi, [esp + 8 + 16] // dst_v |
| 3121 mov ecx, [esp + 8 + 20] // pix | 3618 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 23 matching lines...) Expand all Loading... |
| 3145 sub ecx, 32 | 3642 sub ecx, 32 |
| 3146 jg convertloop | 3643 jg convertloop |
| 3147 | 3644 |
| 3148 pop edi | 3645 pop edi |
| 3149 pop esi | 3646 pop esi |
| 3150 vzeroupper | 3647 vzeroupper |
| 3151 ret | 3648 ret |
| 3152 } | 3649 } |
| 3153 } | 3650 } |
| 3154 | 3651 |
| 3155 __declspec(naked) __declspec(align(16)) | 3652 __declspec(naked) |
| 3156 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | 3653 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, |
| 3157 uint8* dst_u, uint8* dst_v, int pix) { | 3654 uint8* dst_u, uint8* dst_v, int pix) { |
| 3158 __asm { | 3655 __asm { |
| 3159 push edi | 3656 push edi |
| 3160 mov eax, [esp + 4 + 4] // src_yuy2 | 3657 mov eax, [esp + 4 + 4] // src_yuy2 |
| 3161 mov edx, [esp + 4 + 8] // dst_u | 3658 mov edx, [esp + 4 + 8] // dst_u |
| 3162 mov edi, [esp + 4 + 12] // dst_v | 3659 mov edi, [esp + 4 + 12] // dst_v |
| 3163 mov ecx, [esp + 4 + 16] // pix | 3660 mov ecx, [esp + 4 + 16] // pix |
| 3164 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3661 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 3165 vpsrlw ymm5, ymm5, 8 | 3662 vpsrlw ymm5, ymm5, 8 |
| (...skipping 18 matching lines...) Expand all Loading... |
| 3184 lea edx, [edx + 16] | 3681 lea edx, [edx + 16] |
| 3185 sub ecx, 32 | 3682 sub ecx, 32 |
| 3186 jg convertloop | 3683 jg convertloop |
| 3187 | 3684 |
| 3188 pop edi | 3685 pop edi |
| 3189 vzeroupper | 3686 vzeroupper |
| 3190 ret | 3687 ret |
| 3191 } | 3688 } |
| 3192 } | 3689 } |
| 3193 | 3690 |
| 3194 __declspec(naked) __declspec(align(16)) | 3691 __declspec(naked) |
| 3195 void UYVYToYRow_AVX2(const uint8* src_uyvy, | 3692 void UYVYToYRow_AVX2(const uint8* src_uyvy, |
| 3196 uint8* dst_y, int pix) { | 3693 uint8* dst_y, int pix) { |
| 3197 __asm { | 3694 __asm { |
| 3198 mov eax, [esp + 4] // src_uyvy | 3695 mov eax, [esp + 4] // src_uyvy |
| 3199 mov edx, [esp + 8] // dst_y | 3696 mov edx, [esp + 8] // dst_y |
| 3200 mov ecx, [esp + 12] // pix | 3697 mov ecx, [esp + 12] // pix |
| 3201 | 3698 |
| 3202 convertloop: | 3699 convertloop: |
| 3203 vmovdqu ymm0, [eax] | 3700 vmovdqu ymm0, [eax] |
| 3204 vmovdqu ymm1, [eax + 32] | 3701 vmovdqu ymm1, [eax + 32] |
| 3205 lea eax, [eax + 64] | 3702 lea eax, [eax + 64] |
| 3206 vpsrlw ymm0, ymm0, 8 // odd bytes are Y | 3703 vpsrlw ymm0, ymm0, 8 // odd bytes are Y |
| 3207 vpsrlw ymm1, ymm1, 8 | 3704 vpsrlw ymm1, ymm1, 8 |
| 3208 vpackuswb ymm0, ymm0, ymm1 // mutates. | 3705 vpackuswb ymm0, ymm0, ymm1 // mutates. |
| 3209 vpermq ymm0, ymm0, 0xd8 | 3706 vpermq ymm0, ymm0, 0xd8 |
| 3210 vmovdqu [edx], ymm0 | 3707 vmovdqu [edx], ymm0 |
| 3211 lea edx, [edx + 32] | 3708 lea edx, [edx + 32] |
| 3212 sub ecx, 32 | 3709 sub ecx, 32 |
| 3213 jg convertloop | 3710 jg convertloop |
| 3214 vzeroupper | 3711 vzeroupper |
| 3215 ret | 3712 ret |
| 3216 } | 3713 } |
| 3217 } | 3714 } |
| 3218 | 3715 |
| 3219 __declspec(naked) __declspec(align(16)) | 3716 __declspec(naked) |
| 3220 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | 3717 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, |
| 3221 uint8* dst_u, uint8* dst_v, int pix) { | 3718 uint8* dst_u, uint8* dst_v, int pix) { |
| 3222 __asm { | 3719 __asm { |
| 3223 push esi | 3720 push esi |
| 3224 push edi | 3721 push edi |
| 3225 mov eax, [esp + 8 + 4] // src_yuy2 | 3722 mov eax, [esp + 8 + 4] // src_yuy2 |
| 3226 mov esi, [esp + 8 + 8] // stride_yuy2 | 3723 mov esi, [esp + 8 + 8] // stride_yuy2 |
| 3227 mov edx, [esp + 8 + 12] // dst_u | 3724 mov edx, [esp + 8 + 12] // dst_u |
| 3228 mov edi, [esp + 8 + 16] // dst_v | 3725 mov edi, [esp + 8 + 16] // dst_v |
| 3229 mov ecx, [esp + 8 + 20] // pix | 3726 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 23 matching lines...) Expand all Loading... |
| 3253 sub ecx, 32 | 3750 sub ecx, 32 |
| 3254 jg convertloop | 3751 jg convertloop |
| 3255 | 3752 |
| 3256 pop edi | 3753 pop edi |
| 3257 pop esi | 3754 pop esi |
| 3258 vzeroupper | 3755 vzeroupper |
| 3259 ret | 3756 ret |
| 3260 } | 3757 } |
| 3261 } | 3758 } |
| 3262 | 3759 |
| 3263 __declspec(naked) __declspec(align(16)) | 3760 __declspec(naked) |
| 3264 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | 3761 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, |
| 3265 uint8* dst_u, uint8* dst_v, int pix) { | 3762 uint8* dst_u, uint8* dst_v, int pix) { |
| 3266 __asm { | 3763 __asm { |
| 3267 push edi | 3764 push edi |
| 3268 mov eax, [esp + 4 + 4] // src_yuy2 | 3765 mov eax, [esp + 4 + 4] // src_yuy2 |
| 3269 mov edx, [esp + 4 + 8] // dst_u | 3766 mov edx, [esp + 4 + 8] // dst_u |
| 3270 mov edi, [esp + 4 + 12] // dst_v | 3767 mov edi, [esp + 4 + 12] // dst_v |
| 3271 mov ecx, [esp + 4 + 16] // pix | 3768 mov ecx, [esp + 4 + 16] // pix |
| 3272 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3769 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 3273 vpsrlw ymm5, ymm5, 8 | 3770 vpsrlw ymm5, ymm5, 8 |
| (...skipping 20 matching lines...) Expand all Loading... |
| 3294 jg convertloop | 3791 jg convertloop |
| 3295 | 3792 |
| 3296 pop edi | 3793 pop edi |
| 3297 vzeroupper | 3794 vzeroupper |
| 3298 ret | 3795 ret |
| 3299 } | 3796 } |
| 3300 } | 3797 } |
| 3301 #endif // HAS_YUY2TOYROW_AVX2 | 3798 #endif // HAS_YUY2TOYROW_AVX2 |
| 3302 | 3799 |
| 3303 #ifdef HAS_YUY2TOYROW_SSE2 | 3800 #ifdef HAS_YUY2TOYROW_SSE2 |
| 3304 __declspec(naked) __declspec(align(16)) | 3801 __declspec(naked) |
| 3305 void YUY2ToYRow_SSE2(const uint8* src_yuy2, | 3802 void YUY2ToYRow_SSE2(const uint8* src_yuy2, |
| 3306 uint8* dst_y, int pix) { | 3803 uint8* dst_y, int pix) { |
| 3307 __asm { | 3804 __asm { |
| 3308 mov eax, [esp + 4] // src_yuy2 | 3805 mov eax, [esp + 4] // src_yuy2 |
| 3309 mov edx, [esp + 8] // dst_y | 3806 mov edx, [esp + 8] // dst_y |
| 3310 mov ecx, [esp + 12] // pix | 3807 mov ecx, [esp + 12] // pix |
| 3311 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3808 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 3312 psrlw xmm5, 8 | 3809 psrlw xmm5, 8 |
| 3313 | 3810 |
| 3314 convertloop: | 3811 convertloop: |
| 3315 movdqu xmm0, [eax] | 3812 movdqu xmm0, [eax] |
| 3316 movdqu xmm1, [eax + 16] | 3813 movdqu xmm1, [eax + 16] |
| 3317 lea eax, [eax + 32] | 3814 lea eax, [eax + 32] |
| 3318 pand xmm0, xmm5 // even bytes are Y | 3815 pand xmm0, xmm5 // even bytes are Y |
| 3319 pand xmm1, xmm5 | 3816 pand xmm1, xmm5 |
| 3320 packuswb xmm0, xmm1 | 3817 packuswb xmm0, xmm1 |
| 3321 movdqu [edx], xmm0 | 3818 movdqu [edx], xmm0 |
| 3322 lea edx, [edx + 16] | 3819 lea edx, [edx + 16] |
| 3323 sub ecx, 16 | 3820 sub ecx, 16 |
| 3324 jg convertloop | 3821 jg convertloop |
| 3325 ret | 3822 ret |
| 3326 } | 3823 } |
| 3327 } | 3824 } |
| 3328 | 3825 |
| 3329 __declspec(naked) __declspec(align(16)) | 3826 __declspec(naked) |
| 3330 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | 3827 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, |
| 3331 uint8* dst_u, uint8* dst_v, int pix) { | 3828 uint8* dst_u, uint8* dst_v, int pix) { |
| 3332 __asm { | 3829 __asm { |
| 3333 push esi | 3830 push esi |
| 3334 push edi | 3831 push edi |
| 3335 mov eax, [esp + 8 + 4] // src_yuy2 | 3832 mov eax, [esp + 8 + 4] // src_yuy2 |
| 3336 mov esi, [esp + 8 + 8] // stride_yuy2 | 3833 mov esi, [esp + 8 + 8] // stride_yuy2 |
| 3337 mov edx, [esp + 8 + 12] // dst_u | 3834 mov edx, [esp + 8 + 12] // dst_u |
| 3338 mov edi, [esp + 8 + 16] // dst_v | 3835 mov edi, [esp + 8 + 16] // dst_v |
| 3339 mov ecx, [esp + 8 + 20] // pix | 3836 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 22 matching lines...) Expand all Loading... |
| 3362 lea edx, [edx + 8] | 3859 lea edx, [edx + 8] |
| 3363 sub ecx, 16 | 3860 sub ecx, 16 |
| 3364 jg convertloop | 3861 jg convertloop |
| 3365 | 3862 |
| 3366 pop edi | 3863 pop edi |
| 3367 pop esi | 3864 pop esi |
| 3368 ret | 3865 ret |
| 3369 } | 3866 } |
| 3370 } | 3867 } |
| 3371 | 3868 |
| 3372 __declspec(naked) __declspec(align(16)) | 3869 __declspec(naked) |
| 3373 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | 3870 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
| 3374 uint8* dst_u, uint8* dst_v, int pix) { | 3871 uint8* dst_u, uint8* dst_v, int pix) { |
| 3375 __asm { | 3872 __asm { |
| 3376 push edi | 3873 push edi |
| 3377 mov eax, [esp + 4 + 4] // src_yuy2 | 3874 mov eax, [esp + 4 + 4] // src_yuy2 |
| 3378 mov edx, [esp + 4 + 8] // dst_u | 3875 mov edx, [esp + 4 + 8] // dst_u |
| 3379 mov edi, [esp + 4 + 12] // dst_v | 3876 mov edi, [esp + 4 + 12] // dst_v |
| 3380 mov ecx, [esp + 4 + 16] // pix | 3877 mov ecx, [esp + 4 + 16] // pix |
| 3381 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3878 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 3382 psrlw xmm5, 8 | 3879 psrlw xmm5, 8 |
| (...skipping 15 matching lines...) Expand all Loading... |
| 3398 movq qword ptr [edx + edi], xmm1 | 3895 movq qword ptr [edx + edi], xmm1 |
| 3399 lea edx, [edx + 8] | 3896 lea edx, [edx + 8] |
| 3400 sub ecx, 16 | 3897 sub ecx, 16 |
| 3401 jg convertloop | 3898 jg convertloop |
| 3402 | 3899 |
| 3403 pop edi | 3900 pop edi |
| 3404 ret | 3901 ret |
| 3405 } | 3902 } |
| 3406 } | 3903 } |
| 3407 | 3904 |
| 3408 __declspec(naked) __declspec(align(16)) | 3905 __declspec(naked) |
| 3409 void UYVYToYRow_SSE2(const uint8* src_uyvy, | 3906 void UYVYToYRow_SSE2(const uint8* src_uyvy, |
| 3410 uint8* dst_y, int pix) { | 3907 uint8* dst_y, int pix) { |
| 3411 __asm { | 3908 __asm { |
| 3412 mov eax, [esp + 4] // src_uyvy | 3909 mov eax, [esp + 4] // src_uyvy |
| 3413 mov edx, [esp + 8] // dst_y | 3910 mov edx, [esp + 8] // dst_y |
| 3414 mov ecx, [esp + 12] // pix | 3911 mov ecx, [esp + 12] // pix |
| 3415 | 3912 |
| 3416 convertloop: | 3913 convertloop: |
| 3417 movdqu xmm0, [eax] | 3914 movdqu xmm0, [eax] |
| 3418 movdqu xmm1, [eax + 16] | 3915 movdqu xmm1, [eax + 16] |
| 3419 lea eax, [eax + 32] | 3916 lea eax, [eax + 32] |
| 3420 psrlw xmm0, 8 // odd bytes are Y | 3917 psrlw xmm0, 8 // odd bytes are Y |
| 3421 psrlw xmm1, 8 | 3918 psrlw xmm1, 8 |
| 3422 packuswb xmm0, xmm1 | 3919 packuswb xmm0, xmm1 |
| 3423 movdqu [edx], xmm0 | 3920 movdqu [edx], xmm0 |
| 3424 lea edx, [edx + 16] | 3921 lea edx, [edx + 16] |
| 3425 sub ecx, 16 | 3922 sub ecx, 16 |
| 3426 jg convertloop | 3923 jg convertloop |
| 3427 ret | 3924 ret |
| 3428 } | 3925 } |
| 3429 } | 3926 } |
| 3430 | 3927 |
| 3431 __declspec(naked) __declspec(align(16)) | 3928 __declspec(naked) |
| 3432 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | 3929 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, |
| 3433 uint8* dst_u, uint8* dst_v, int pix) { | 3930 uint8* dst_u, uint8* dst_v, int pix) { |
| 3434 __asm { | 3931 __asm { |
| 3435 push esi | 3932 push esi |
| 3436 push edi | 3933 push edi |
| 3437 mov eax, [esp + 8 + 4] // src_yuy2 | 3934 mov eax, [esp + 8 + 4] // src_yuy2 |
| 3438 mov esi, [esp + 8 + 8] // stride_yuy2 | 3935 mov esi, [esp + 8 + 8] // stride_yuy2 |
| 3439 mov edx, [esp + 8 + 12] // dst_u | 3936 mov edx, [esp + 8 + 12] // dst_u |
| 3440 mov edi, [esp + 8 + 16] // dst_v | 3937 mov edi, [esp + 8 + 16] // dst_v |
| 3441 mov ecx, [esp + 8 + 20] // pix | 3938 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 22 matching lines...) Expand all Loading... |
| 3464 lea edx, [edx + 8] | 3961 lea edx, [edx + 8] |
| 3465 sub ecx, 16 | 3962 sub ecx, 16 |
| 3466 jg convertloop | 3963 jg convertloop |
| 3467 | 3964 |
| 3468 pop edi | 3965 pop edi |
| 3469 pop esi | 3966 pop esi |
| 3470 ret | 3967 ret |
| 3471 } | 3968 } |
| 3472 } | 3969 } |
| 3473 | 3970 |
| 3474 __declspec(naked) __declspec(align(16)) | 3971 __declspec(naked) |
| 3475 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | 3972 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
| 3476 uint8* dst_u, uint8* dst_v, int pix) { | 3973 uint8* dst_u, uint8* dst_v, int pix) { |
| 3477 __asm { | 3974 __asm { |
| 3478 push edi | 3975 push edi |
| 3479 mov eax, [esp + 4 + 4] // src_yuy2 | 3976 mov eax, [esp + 4 + 4] // src_yuy2 |
| 3480 mov edx, [esp + 4 + 8] // dst_u | 3977 mov edx, [esp + 4 + 8] // dst_u |
| 3481 mov edi, [esp + 4 + 12] // dst_v | 3978 mov edi, [esp + 4 + 12] // dst_v |
| 3482 mov ecx, [esp + 4 + 16] // pix | 3979 mov ecx, [esp + 4 + 16] // pix |
| 3483 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3980 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 3484 psrlw xmm5, 8 | 3981 psrlw xmm5, 8 |
| (...skipping 18 matching lines...) Expand all Loading... |
| 3503 jg convertloop | 4000 jg convertloop |
| 3504 | 4001 |
| 3505 pop edi | 4002 pop edi |
| 3506 ret | 4003 ret |
| 3507 } | 4004 } |
| 3508 } | 4005 } |
| 3509 #endif // HAS_YUY2TOYROW_SSE2 | 4006 #endif // HAS_YUY2TOYROW_SSE2 |
| 3510 | 4007 |
| 3511 #ifdef HAS_ARGBBLENDROW_SSE2 | 4008 #ifdef HAS_ARGBBLENDROW_SSE2 |
| 3512 // Blend 8 pixels at a time. | 4009 // Blend 8 pixels at a time. |
| 3513 __declspec(naked) __declspec(align(16)) | 4010 __declspec(naked) |
| 3514 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4011 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
| 3515 uint8* dst_argb, int width) { | 4012 uint8* dst_argb, int width) { |
| 3516 __asm { | 4013 __asm { |
| 3517 push esi | 4014 push esi |
| 3518 mov eax, [esp + 4 + 4] // src_argb0 | 4015 mov eax, [esp + 4 + 4] // src_argb0 |
| 3519 mov esi, [esp + 4 + 8] // src_argb1 | 4016 mov esi, [esp + 4 + 8] // src_argb1 |
| 3520 mov edx, [esp + 4 + 12] // dst_argb | 4017 mov edx, [esp + 4 + 12] // dst_argb |
| 3521 mov ecx, [esp + 4 + 16] // width | 4018 mov ecx, [esp + 4 + 16] // width |
| 3522 pcmpeqb xmm7, xmm7 // generate constant 1 | 4019 pcmpeqb xmm7, xmm7 // generate constant 1 |
| 3523 psrlw xmm7, 15 | 4020 psrlw xmm7, 15 |
| 3524 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff | 4021 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
| 3525 psrlw xmm6, 8 | 4022 psrlw xmm6, 8 |
| 3526 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 | 4023 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
| 3527 psllw xmm5, 8 | 4024 psllw xmm5, 8 |
| 3528 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 4025 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
| 3529 pslld xmm4, 24 | 4026 pslld xmm4, 24 |
| 3530 | 4027 sub ecx, 4 |
| 3531 sub ecx, 1 | 4028 jl convertloop4b // less than 4 pixels? |
| 3532 je convertloop1 // only 1 pixel? | |
| 3533 jl convertloop1b | |
| 3534 | |
| 3535 // 1 pixel loop until destination pointer is aligned. | |
| 3536 alignloop1: | |
| 3537 test edx, 15 // aligned? | |
| 3538 je alignloop1b | |
| 3539 movd xmm3, [eax] | |
| 3540 lea eax, [eax + 4] | |
| 3541 movdqa xmm0, xmm3 // src argb | |
| 3542 pxor xmm3, xmm4 // ~alpha | |
| 3543 movd xmm2, [esi] // _r_b | |
| 3544 psrlw xmm3, 8 // alpha | |
| 3545 pshufhw xmm3, xmm3, 0F5h // 8 alpha words | |
| 3546 pshuflw xmm3, xmm3, 0F5h | |
| 3547 pand xmm2, xmm6 // _r_b | |
| 3548 paddw xmm3, xmm7 // 256 - alpha | |
| 3549 pmullw xmm2, xmm3 // _r_b * alpha | |
| 3550 movd xmm1, [esi] // _a_g | |
| 3551 lea esi, [esi + 4] | |
| 3552 psrlw xmm1, 8 // _a_g | |
| 3553 por xmm0, xmm4 // set alpha to 255 | |
| 3554 pmullw xmm1, xmm3 // _a_g * alpha | |
| 3555 psrlw xmm2, 8 // _r_b convert to 8 bits again | |
| 3556 paddusb xmm0, xmm2 // + src argb | |
| 3557 pand xmm1, xmm5 // a_g_ convert to 8 bits again | |
| 3558 paddusb xmm0, xmm1 // + src argb | |
| 3559 movd [edx], xmm0 | |
| 3560 lea edx, [edx + 4] | |
| 3561 sub ecx, 1 | |
| 3562 jge alignloop1 | |
| 3563 | |
| 3564 alignloop1b: | |
| 3565 add ecx, 1 - 4 | |
| 3566 jl convertloop4b | |
| 3567 | 4029 |
| 3568 // 4 pixel loop. | 4030 // 4 pixel loop. |
| 3569 convertloop4: | 4031 convertloop4: |
| 3570 movdqu xmm3, [eax] // src argb | 4032 movdqu xmm3, [eax] // src argb |
| 3571 lea eax, [eax + 16] | 4033 lea eax, [eax + 16] |
| 3572 movdqa xmm0, xmm3 // src argb | 4034 movdqa xmm0, xmm3 // src argb |
| 3573 pxor xmm3, xmm4 // ~alpha | 4035 pxor xmm3, xmm4 // ~alpha |
| 3574 movdqu xmm2, [esi] // _r_b | 4036 movdqu xmm2, [esi] // _r_b |
| 3575 psrlw xmm3, 8 // alpha | 4037 psrlw xmm3, 8 // alpha |
| 3576 pshufhw xmm3, xmm3, 0F5h // 8 alpha words | 4038 pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
| (...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3637 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | 4099 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
| 3638 }; | 4100 }; |
| 3639 // Same as SSE2, but replaces: | 4101 // Same as SSE2, but replaces: |
| 3640 // psrlw xmm3, 8 // alpha | 4102 // psrlw xmm3, 8 // alpha |
| 3641 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words | 4103 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
| 3642 // pshuflw xmm3, xmm3, 0F5h | 4104 // pshuflw xmm3, xmm3, 0F5h |
| 3643 // with.. | 4105 // with.. |
| 3644 // pshufb xmm3, kShuffleAlpha // alpha | 4106 // pshufb xmm3, kShuffleAlpha // alpha |
| 3645 // Blend 8 pixels at a time. | 4107 // Blend 8 pixels at a time. |
| 3646 | 4108 |
| 3647 __declspec(naked) __declspec(align(16)) | 4109 __declspec(naked) |
| 3648 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 4110 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
| 3649 uint8* dst_argb, int width) { | 4111 uint8* dst_argb, int width) { |
| 3650 __asm { | 4112 __asm { |
| 3651 push esi | 4113 push esi |
| 3652 mov eax, [esp + 4 + 4] // src_argb0 | 4114 mov eax, [esp + 4 + 4] // src_argb0 |
| 3653 mov esi, [esp + 4 + 8] // src_argb1 | 4115 mov esi, [esp + 4 + 8] // src_argb1 |
| 3654 mov edx, [esp + 4 + 12] // dst_argb | 4116 mov edx, [esp + 4 + 12] // dst_argb |
| 3655 mov ecx, [esp + 4 + 16] // width | 4117 mov ecx, [esp + 4 + 16] // width |
| 3656 pcmpeqb xmm7, xmm7 // generate constant 0x0001 | 4118 pcmpeqb xmm7, xmm7 // generate constant 0x0001 |
| 3657 psrlw xmm7, 15 | 4119 psrlw xmm7, 15 |
| 3658 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff | 4120 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
| 3659 psrlw xmm6, 8 | 4121 psrlw xmm6, 8 |
| 3660 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 | 4122 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
| 3661 psllw xmm5, 8 | 4123 psllw xmm5, 8 |
| 3662 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 4124 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
| 3663 pslld xmm4, 24 | 4125 pslld xmm4, 24 |
| 3664 | 4126 sub ecx, 4 |
| 3665 sub ecx, 1 | 4127 jl convertloop4b // less than 4 pixels? |
| 3666 je convertloop1 // only 1 pixel? | |
| 3667 jl convertloop1b | |
| 3668 | |
| 3669 // 1 pixel loop until destination pointer is aligned. | |
| 3670 alignloop1: | |
| 3671 test edx, 15 // aligned? | |
| 3672 je alignloop1b | |
| 3673 movd xmm3, [eax] | |
| 3674 lea eax, [eax + 4] | |
| 3675 movdqa xmm0, xmm3 // src argb | |
| 3676 pxor xmm3, xmm4 // ~alpha | |
| 3677 movd xmm2, [esi] // _r_b | |
| 3678 pshufb xmm3, kShuffleAlpha // alpha | |
| 3679 pand xmm2, xmm6 // _r_b | |
| 3680 paddw xmm3, xmm7 // 256 - alpha | |
| 3681 pmullw xmm2, xmm3 // _r_b * alpha | |
| 3682 movd xmm1, [esi] // _a_g | |
| 3683 lea esi, [esi + 4] | |
| 3684 psrlw xmm1, 8 // _a_g | |
| 3685 por xmm0, xmm4 // set alpha to 255 | |
| 3686 pmullw xmm1, xmm3 // _a_g * alpha | |
| 3687 psrlw xmm2, 8 // _r_b convert to 8 bits again | |
| 3688 paddusb xmm0, xmm2 // + src argb | |
| 3689 pand xmm1, xmm5 // a_g_ convert to 8 bits again | |
| 3690 paddusb xmm0, xmm1 // + src argb | |
| 3691 movd [edx], xmm0 | |
| 3692 lea edx, [edx + 4] | |
| 3693 sub ecx, 1 | |
| 3694 jge alignloop1 | |
| 3695 | |
| 3696 alignloop1b: | |
| 3697 add ecx, 1 - 4 | |
| 3698 jl convertloop4b | |
| 3699 | 4128 |
| 3700 // 4 pixel loop. | 4129 // 4 pixel loop. |
| 3701 convertloop4: | 4130 convertloop4: |
| 3702 movdqu xmm3, [eax] // src argb | 4131 movdqu xmm3, [eax] // src argb |
| 3703 lea eax, [eax + 16] | 4132 lea eax, [eax + 16] |
| 3704 movdqa xmm0, xmm3 // src argb | 4133 movdqa xmm0, xmm3 // src argb |
| 3705 pxor xmm3, xmm4 // ~alpha | 4134 pxor xmm3, xmm4 // ~alpha |
| 3706 movdqu xmm2, [esi] // _r_b | 4135 movdqu xmm2, [esi] // _r_b |
| 3707 pshufb xmm3, kShuffleAlpha // alpha | 4136 pshufb xmm3, kShuffleAlpha // alpha |
| 3708 pand xmm2, xmm6 // _r_b | 4137 pand xmm2, xmm6 // _r_b |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3753 | 4182 |
| 3754 convertloop1b: | 4183 convertloop1b: |
| 3755 pop esi | 4184 pop esi |
| 3756 ret | 4185 ret |
| 3757 } | 4186 } |
| 3758 } | 4187 } |
| 3759 #endif // HAS_ARGBBLENDROW_SSSE3 | 4188 #endif // HAS_ARGBBLENDROW_SSSE3 |
| 3760 | 4189 |
| 3761 #ifdef HAS_ARGBATTENUATEROW_SSE2 | 4190 #ifdef HAS_ARGBATTENUATEROW_SSE2 |
| 3762 // Attenuate 4 pixels at a time. | 4191 // Attenuate 4 pixels at a time. |
| 3763 __declspec(naked) __declspec(align(16)) | 4192 __declspec(naked) |
| 3764 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { | 4193 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { |
| 3765 __asm { | 4194 __asm { |
| 3766 mov eax, [esp + 4] // src_argb0 | 4195 mov eax, [esp + 4] // src_argb0 |
| 3767 mov edx, [esp + 8] // dst_argb | 4196 mov edx, [esp + 8] // dst_argb |
| 3768 mov ecx, [esp + 12] // width | 4197 mov ecx, [esp + 12] // width |
| 3769 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 4198 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
| 3770 pslld xmm4, 24 | 4199 pslld xmm4, 24 |
| 3771 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff | 4200 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff |
| 3772 psrld xmm5, 8 | 4201 psrld xmm5, 8 |
| 3773 | 4202 |
| (...skipping 28 matching lines...) Expand all Loading... |
| 3802 | 4231 |
| 3803 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 4232 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
| 3804 // Shuffle table duplicating alpha. | 4233 // Shuffle table duplicating alpha. |
| 3805 static const uvec8 kShuffleAlpha0 = { | 4234 static const uvec8 kShuffleAlpha0 = { |
| 3806 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, | 4235 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, |
| 3807 }; | 4236 }; |
| 3808 static const uvec8 kShuffleAlpha1 = { | 4237 static const uvec8 kShuffleAlpha1 = { |
| 3809 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | 4238 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
| 3810 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, | 4239 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, |
| 3811 }; | 4240 }; |
| 3812 __declspec(naked) __declspec(align(16)) | 4241 __declspec(naked) |
| 3813 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | 4242 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
| 3814 __asm { | 4243 __asm { |
| 3815 mov eax, [esp + 4] // src_argb0 | 4244 mov eax, [esp + 4] // src_argb0 |
| 3816 mov edx, [esp + 8] // dst_argb | 4245 mov edx, [esp + 8] // dst_argb |
| 3817 mov ecx, [esp + 12] // width | 4246 mov ecx, [esp + 12] // width |
| 3818 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 | 4247 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 |
| 3819 pslld xmm3, 24 | 4248 pslld xmm3, 24 |
| 3820 movdqa xmm4, kShuffleAlpha0 | 4249 movdqa xmm4, kShuffleAlpha0 |
| 3821 movdqa xmm5, kShuffleAlpha1 | 4250 movdqa xmm5, kShuffleAlpha1 |
| 3822 | 4251 |
| (...skipping 23 matching lines...) Expand all Loading... |
| 3846 ret | 4275 ret |
| 3847 } | 4276 } |
| 3848 } | 4277 } |
| 3849 #endif // HAS_ARGBATTENUATEROW_SSSE3 | 4278 #endif // HAS_ARGBATTENUATEROW_SSSE3 |
| 3850 | 4279 |
| 3851 #ifdef HAS_ARGBATTENUATEROW_AVX2 | 4280 #ifdef HAS_ARGBATTENUATEROW_AVX2 |
| 3852 // Shuffle table duplicating alpha. | 4281 // Shuffle table duplicating alpha. |
| 3853 static const uvec8 kShuffleAlpha_AVX2 = { | 4282 static const uvec8 kShuffleAlpha_AVX2 = { |
| 3854 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u | 4283 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u |
| 3855 }; | 4284 }; |
| 3856 __declspec(naked) __declspec(align(16)) | 4285 __declspec(naked) |
| 3857 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { | 4286 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { |
| 3858 __asm { | 4287 __asm { |
| 3859 mov eax, [esp + 4] // src_argb0 | 4288 mov eax, [esp + 4] // src_argb0 |
| 3860 mov edx, [esp + 8] // dst_argb | 4289 mov edx, [esp + 8] // dst_argb |
| 3861 mov ecx, [esp + 12] // width | 4290 mov ecx, [esp + 12] // width |
| 3862 sub edx, eax | 4291 sub edx, eax |
| 3863 vbroadcastf128 ymm4,kShuffleAlpha_AVX2 | 4292 vbroadcastf128 ymm4,kShuffleAlpha_AVX2 |
| 3864 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 | 4293 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
| 3865 vpslld ymm5, ymm5, 24 | 4294 vpslld ymm5, ymm5, 24 |
| 3866 | 4295 |
| (...skipping 16 matching lines...) Expand all Loading... |
| 3883 jg convertloop | 4312 jg convertloop |
| 3884 | 4313 |
| 3885 vzeroupper | 4314 vzeroupper |
| 3886 ret | 4315 ret |
| 3887 } | 4316 } |
| 3888 } | 4317 } |
| 3889 #endif // HAS_ARGBATTENUATEROW_AVX2 | 4318 #endif // HAS_ARGBATTENUATEROW_AVX2 |
| 3890 | 4319 |
| 3891 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 | 4320 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
| 3892 // Unattenuate 4 pixels at a time. | 4321 // Unattenuate 4 pixels at a time. |
| 3893 __declspec(naked) __declspec(align(16)) | 4322 __declspec(naked) |
| 3894 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 4323 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
| 3895 int width) { | 4324 int width) { |
| 3896 __asm { | 4325 __asm { |
| 3897 push esi | 4326 push esi |
| 3898 push edi | 4327 push edi |
| 3899 mov eax, [esp + 8 + 4] // src_argb0 | 4328 mov eax, [esp + 8 + 4] // src_argb0 |
| 3900 mov edx, [esp + 8 + 8] // dst_argb | 4329 mov edx, [esp + 8 + 8] // dst_argb |
| 3901 mov ecx, [esp + 8 + 12] // width | 4330 mov ecx, [esp + 8 + 12] // width |
| 3902 | 4331 |
| 3903 convertloop: | 4332 convertloop: |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3937 #endif // HAS_ARGBUNATTENUATEROW_SSE2 | 4366 #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
| 3938 | 4367 |
| 3939 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 | 4368 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 |
| 3940 // Shuffle table duplicating alpha. | 4369 // Shuffle table duplicating alpha. |
| 3941 static const uvec8 kUnattenShuffleAlpha_AVX2 = { | 4370 static const uvec8 kUnattenShuffleAlpha_AVX2 = { |
| 3942 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u | 4371 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u |
| 3943 }; | 4372 }; |
| 3944 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. | 4373 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. |
| 3945 // USE_GATHER is not on by default, due to being a slow instruction. | 4374 // USE_GATHER is not on by default, due to being a slow instruction. |
| 3946 #ifdef USE_GATHER | 4375 #ifdef USE_GATHER |
| 3947 __declspec(naked) __declspec(align(16)) | 4376 __declspec(naked) |
| 3948 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 4377 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
| 3949 int width) { | 4378 int width) { |
| 3950 __asm { | 4379 __asm { |
| 3951 mov eax, [esp + 4] // src_argb0 | 4380 mov eax, [esp + 4] // src_argb0 |
| 3952 mov edx, [esp + 8] // dst_argb | 4381 mov edx, [esp + 8] // dst_argb |
| 3953 mov ecx, [esp + 12] // width | 4382 mov ecx, [esp + 12] // width |
| 3954 sub edx, eax | 4383 sub edx, eax |
| 3955 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 | 4384 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 |
| 3956 | 4385 |
| 3957 convertloop: | 4386 convertloop: |
| (...skipping 13 matching lines...) Expand all Loading... |
| 3971 vmovdqu [eax + edx], ymm0 | 4400 vmovdqu [eax + edx], ymm0 |
| 3972 lea eax, [eax + 32] | 4401 lea eax, [eax + 32] |
| 3973 sub ecx, 8 | 4402 sub ecx, 8 |
| 3974 jg convertloop | 4403 jg convertloop |
| 3975 | 4404 |
| 3976 vzeroupper | 4405 vzeroupper |
| 3977 ret | 4406 ret |
| 3978 } | 4407 } |
| 3979 } | 4408 } |
| 3980 #else // USE_GATHER | 4409 #else // USE_GATHER |
| 3981 __declspec(naked) __declspec(align(16)) | 4410 __declspec(naked) |
| 3982 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 4411 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
| 3983 int width) { | 4412 int width) { |
| 3984 __asm { | 4413 __asm { |
| 3985 | 4414 |
| 3986 mov eax, [esp + 4] // src_argb0 | 4415 mov eax, [esp + 4] // src_argb0 |
| 3987 mov edx, [esp + 8] // dst_argb | 4416 mov edx, [esp + 8] // dst_argb |
| 3988 mov ecx, [esp + 12] // width | 4417 mov ecx, [esp + 12] // width |
| 3989 sub edx, eax | 4418 sub edx, eax |
| 3990 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 | 4419 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 |
| 3991 | 4420 |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4038 pop esi | 4467 pop esi |
| 4039 vzeroupper | 4468 vzeroupper |
| 4040 ret | 4469 ret |
| 4041 } | 4470 } |
| 4042 } | 4471 } |
| 4043 #endif // USE_GATHER | 4472 #endif // USE_GATHER |
| 4044 #endif // HAS_ARGBATTENUATEROW_AVX2 | 4473 #endif // HAS_ARGBATTENUATEROW_AVX2 |
| 4045 | 4474 |
| 4046 #ifdef HAS_ARGBGRAYROW_SSSE3 | 4475 #ifdef HAS_ARGBGRAYROW_SSSE3 |
| 4047 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. | 4476 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. |
| 4048 __declspec(naked) __declspec(align(16)) | 4477 __declspec(naked) |
| 4049 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | 4478 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
| 4050 __asm { | 4479 __asm { |
| 4051 mov eax, [esp + 4] /* src_argb */ | 4480 mov eax, [esp + 4] /* src_argb */ |
| 4052 mov edx, [esp + 8] /* dst_argb */ | 4481 mov edx, [esp + 8] /* dst_argb */ |
| 4053 mov ecx, [esp + 12] /* width */ | 4482 mov ecx, [esp + 12] /* width */ |
| 4054 movdqa xmm4, kARGBToYJ | 4483 movdqa xmm4, kARGBToYJ |
| 4055 movdqa xmm5, kAddYJ64 | 4484 movdqa xmm5, kAddYJ64 |
| 4056 | 4485 |
| 4057 convertloop: | 4486 convertloop: |
| 4058 movdqu xmm0, [eax] // G | 4487 movdqu xmm0, [eax] // G |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4097 | 4526 |
| 4098 static const vec8 kARGBToSepiaG = { | 4527 static const vec8 kARGBToSepiaG = { |
| 4099 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 | 4528 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 |
| 4100 }; | 4529 }; |
| 4101 | 4530 |
| 4102 static const vec8 kARGBToSepiaR = { | 4531 static const vec8 kARGBToSepiaR = { |
| 4103 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 | 4532 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 |
| 4104 }; | 4533 }; |
| 4105 | 4534 |
| 4106 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | 4535 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
| 4107 __declspec(naked) __declspec(align(16)) | 4536 __declspec(naked) |
| 4108 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { | 4537 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { |
| 4109 __asm { | 4538 __asm { |
| 4110 mov eax, [esp + 4] /* dst_argb */ | 4539 mov eax, [esp + 4] /* dst_argb */ |
| 4111 mov ecx, [esp + 8] /* width */ | 4540 mov ecx, [esp + 8] /* width */ |
| 4112 movdqa xmm2, kARGBToSepiaB | 4541 movdqa xmm2, kARGBToSepiaB |
| 4113 movdqa xmm3, kARGBToSepiaG | 4542 movdqa xmm3, kARGBToSepiaG |
| 4114 movdqa xmm4, kARGBToSepiaR | 4543 movdqa xmm4, kARGBToSepiaR |
| 4115 | 4544 |
| 4116 convertloop: | 4545 convertloop: |
| 4117 movdqu xmm0, [eax] // B | 4546 movdqu xmm0, [eax] // B |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4154 ret | 4583 ret |
| 4155 } | 4584 } |
| 4156 } | 4585 } |
| 4157 #endif // HAS_ARGBSEPIAROW_SSSE3 | 4586 #endif // HAS_ARGBSEPIAROW_SSSE3 |
| 4158 | 4587 |
| 4159 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 | 4588 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
| 4160 // Tranform 8 ARGB pixels (32 bytes) with color matrix. | 4589 // Tranform 8 ARGB pixels (32 bytes) with color matrix. |
| 4161 // Same as Sepia except matrix is provided. | 4590 // Same as Sepia except matrix is provided. |
| 4162 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R | 4591 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R |
| 4163 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. | 4592 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. |
| 4164 __declspec(naked) __declspec(align(16)) | 4593 __declspec(naked) |
| 4165 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 4594 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
| 4166 const int8* matrix_argb, int width) { | 4595 const int8* matrix_argb, int width) { |
| 4167 __asm { | 4596 __asm { |
| 4168 mov eax, [esp + 4] /* src_argb */ | 4597 mov eax, [esp + 4] /* src_argb */ |
| 4169 mov edx, [esp + 8] /* dst_argb */ | 4598 mov edx, [esp + 8] /* dst_argb */ |
| 4170 mov ecx, [esp + 12] /* matrix_argb */ | 4599 mov ecx, [esp + 12] /* matrix_argb */ |
| 4171 movdqu xmm5, [ecx] | 4600 movdqu xmm5, [ecx] |
| 4172 pshufd xmm2, xmm5, 0x00 | 4601 pshufd xmm2, xmm5, 0x00 |
| 4173 pshufd xmm3, xmm5, 0x55 | 4602 pshufd xmm3, xmm5, 0x55 |
| 4174 pshufd xmm4, xmm5, 0xaa | 4603 pshufd xmm4, xmm5, 0xaa |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4215 lea edx, [edx + 32] | 4644 lea edx, [edx + 32] |
| 4216 sub ecx, 8 | 4645 sub ecx, 8 |
| 4217 jg convertloop | 4646 jg convertloop |
| 4218 ret | 4647 ret |
| 4219 } | 4648 } |
| 4220 } | 4649 } |
| 4221 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 | 4650 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
| 4222 | 4651 |
| 4223 #ifdef HAS_ARGBQUANTIZEROW_SSE2 | 4652 #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
| 4224 // Quantize 4 ARGB pixels (16 bytes). | 4653 // Quantize 4 ARGB pixels (16 bytes). |
| 4225 __declspec(naked) __declspec(align(16)) | 4654 __declspec(naked) |
| 4226 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, | 4655 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, |
| 4227 int interval_offset, int width) { | 4656 int interval_offset, int width) { |
| 4228 __asm { | 4657 __asm { |
| 4229 mov eax, [esp + 4] /* dst_argb */ | 4658 mov eax, [esp + 4] /* dst_argb */ |
| 4230 movd xmm2, [esp + 8] /* scale */ | 4659 movd xmm2, [esp + 8] /* scale */ |
| 4231 movd xmm3, [esp + 12] /* interval_size */ | 4660 movd xmm3, [esp + 12] /* interval_size */ |
| 4232 movd xmm4, [esp + 16] /* interval_offset */ | 4661 movd xmm4, [esp + 16] /* interval_offset */ |
| 4233 mov ecx, [esp + 20] /* width */ | 4662 mov ecx, [esp + 20] /* width */ |
| 4234 pshuflw xmm2, xmm2, 040h | 4663 pshuflw xmm2, xmm2, 040h |
| 4235 pshufd xmm2, xmm2, 044h | 4664 pshufd xmm2, xmm2, 044h |
| (...skipping 24 matching lines...) Expand all Loading... |
| 4260 lea eax, [eax + 16] | 4689 lea eax, [eax + 16] |
| 4261 sub ecx, 4 | 4690 sub ecx, 4 |
| 4262 jg convertloop | 4691 jg convertloop |
| 4263 ret | 4692 ret |
| 4264 } | 4693 } |
| 4265 } | 4694 } |
| 4266 #endif // HAS_ARGBQUANTIZEROW_SSE2 | 4695 #endif // HAS_ARGBQUANTIZEROW_SSE2 |
| 4267 | 4696 |
| 4268 #ifdef HAS_ARGBSHADEROW_SSE2 | 4697 #ifdef HAS_ARGBSHADEROW_SSE2 |
| 4269 // Shade 4 pixels at a time by specified value. | 4698 // Shade 4 pixels at a time by specified value. |
| 4270 __declspec(naked) __declspec(align(16)) | 4699 __declspec(naked) |
| 4271 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, | 4700 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, |
| 4272 uint32 value) { | 4701 uint32 value) { |
| 4273 __asm { | 4702 __asm { |
| 4274 mov eax, [esp + 4] // src_argb | 4703 mov eax, [esp + 4] // src_argb |
| 4275 mov edx, [esp + 8] // dst_argb | 4704 mov edx, [esp + 8] // dst_argb |
| 4276 mov ecx, [esp + 12] // width | 4705 mov ecx, [esp + 12] // width |
| 4277 movd xmm2, [esp + 16] // value | 4706 movd xmm2, [esp + 16] // value |
| 4278 punpcklbw xmm2, xmm2 | 4707 punpcklbw xmm2, xmm2 |
| 4279 punpcklqdq xmm2, xmm2 | 4708 punpcklqdq xmm2, xmm2 |
| 4280 | 4709 |
| (...skipping 13 matching lines...) Expand all Loading... |
| 4294 sub ecx, 4 | 4723 sub ecx, 4 |
| 4295 jg convertloop | 4724 jg convertloop |
| 4296 | 4725 |
| 4297 ret | 4726 ret |
| 4298 } | 4727 } |
| 4299 } | 4728 } |
| 4300 #endif // HAS_ARGBSHADEROW_SSE2 | 4729 #endif // HAS_ARGBSHADEROW_SSE2 |
| 4301 | 4730 |
| 4302 #ifdef HAS_ARGBMULTIPLYROW_SSE2 | 4731 #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
| 4303 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. | 4732 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. |
| 4304 __declspec(naked) __declspec(align(16)) | 4733 __declspec(naked) |
| 4305 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4734 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
| 4306 uint8* dst_argb, int width) { | 4735 uint8* dst_argb, int width) { |
| 4307 __asm { | 4736 __asm { |
| 4308 push esi | 4737 push esi |
| 4309 mov eax, [esp + 4 + 4] // src_argb0 | 4738 mov eax, [esp + 4 + 4] // src_argb0 |
| 4310 mov esi, [esp + 4 + 8] // src_argb1 | 4739 mov esi, [esp + 4 + 8] // src_argb1 |
| 4311 mov edx, [esp + 4 + 12] // dst_argb | 4740 mov edx, [esp + 4 + 12] // dst_argb |
| 4312 mov ecx, [esp + 4 + 16] // width | 4741 mov ecx, [esp + 4 + 16] // width |
| 4313 pxor xmm5, xmm5 // constant 0 | 4742 pxor xmm5, xmm5 // constant 0 |
| 4314 | 4743 |
| (...skipping 18 matching lines...) Expand all Loading... |
| 4333 | 4762 |
| 4334 pop esi | 4763 pop esi |
| 4335 ret | 4764 ret |
| 4336 } | 4765 } |
| 4337 } | 4766 } |
| 4338 #endif // HAS_ARGBMULTIPLYROW_SSE2 | 4767 #endif // HAS_ARGBMULTIPLYROW_SSE2 |
| 4339 | 4768 |
| 4340 #ifdef HAS_ARGBADDROW_SSE2 | 4769 #ifdef HAS_ARGBADDROW_SSE2 |
| 4341 // Add 2 rows of ARGB pixels together, 4 pixels at a time. | 4770 // Add 2 rows of ARGB pixels together, 4 pixels at a time. |
| 4342 // TODO(fbarchard): Port this to posix, neon and other math functions. | 4771 // TODO(fbarchard): Port this to posix, neon and other math functions. |
| 4343 __declspec(naked) __declspec(align(16)) | 4772 __declspec(naked) |
| 4344 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4773 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
| 4345 uint8* dst_argb, int width) { | 4774 uint8* dst_argb, int width) { |
| 4346 __asm { | 4775 __asm { |
| 4347 push esi | 4776 push esi |
| 4348 mov eax, [esp + 4 + 4] // src_argb0 | 4777 mov eax, [esp + 4 + 4] // src_argb0 |
| 4349 mov esi, [esp + 4 + 8] // src_argb1 | 4778 mov esi, [esp + 4 + 8] // src_argb1 |
| 4350 mov edx, [esp + 4 + 12] // dst_argb | 4779 mov edx, [esp + 4 + 12] // dst_argb |
| 4351 mov ecx, [esp + 4 + 16] // width | 4780 mov ecx, [esp + 4 + 16] // width |
| 4352 | 4781 |
| 4353 sub ecx, 4 | 4782 sub ecx, 4 |
| (...skipping 27 matching lines...) Expand all Loading... |
| 4381 | 4810 |
| 4382 convertloop19: | 4811 convertloop19: |
| 4383 pop esi | 4812 pop esi |
| 4384 ret | 4813 ret |
| 4385 } | 4814 } |
| 4386 } | 4815 } |
| 4387 #endif // HAS_ARGBADDROW_SSE2 | 4816 #endif // HAS_ARGBADDROW_SSE2 |
| 4388 | 4817 |
| 4389 #ifdef HAS_ARGBSUBTRACTROW_SSE2 | 4818 #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
| 4390 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. | 4819 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. |
| 4391 __declspec(naked) __declspec(align(16)) | 4820 __declspec(naked) |
| 4392 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4821 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
| 4393 uint8* dst_argb, int width) { | 4822 uint8* dst_argb, int width) { |
| 4394 __asm { | 4823 __asm { |
| 4395 push esi | 4824 push esi |
| 4396 mov eax, [esp + 4 + 4] // src_argb0 | 4825 mov eax, [esp + 4 + 4] // src_argb0 |
| 4397 mov esi, [esp + 4 + 8] // src_argb1 | 4826 mov esi, [esp + 4 + 8] // src_argb1 |
| 4398 mov edx, [esp + 4 + 12] // dst_argb | 4827 mov edx, [esp + 4 + 12] // dst_argb |
| 4399 mov ecx, [esp + 4 + 16] // width | 4828 mov ecx, [esp + 4 + 16] // width |
| 4400 | 4829 |
| 4401 convertloop: | 4830 convertloop: |
| 4402 movdqu xmm0, [eax] // read 4 pixels from src_argb0 | 4831 movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
| 4403 lea eax, [eax + 16] | 4832 lea eax, [eax + 16] |
| 4404 movdqu xmm1, [esi] // read 4 pixels from src_argb1 | 4833 movdqu xmm1, [esi] // read 4 pixels from src_argb1 |
| 4405 lea esi, [esi + 16] | 4834 lea esi, [esi + 16] |
| 4406 psubusb xmm0, xmm1 // src_argb0 - src_argb1 | 4835 psubusb xmm0, xmm1 // src_argb0 - src_argb1 |
| 4407 movdqu [edx], xmm0 | 4836 movdqu [edx], xmm0 |
| 4408 lea edx, [edx + 16] | 4837 lea edx, [edx + 16] |
| 4409 sub ecx, 4 | 4838 sub ecx, 4 |
| 4410 jg convertloop | 4839 jg convertloop |
| 4411 | 4840 |
| 4412 pop esi | 4841 pop esi |
| 4413 ret | 4842 ret |
| 4414 } | 4843 } |
| 4415 } | 4844 } |
| 4416 #endif // HAS_ARGBSUBTRACTROW_SSE2 | 4845 #endif // HAS_ARGBSUBTRACTROW_SSE2 |
| 4417 | 4846 |
| 4418 #ifdef HAS_ARGBMULTIPLYROW_AVX2 | 4847 #ifdef HAS_ARGBMULTIPLYROW_AVX2 |
| 4419 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 4848 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
| 4420 __declspec(naked) __declspec(align(16)) | 4849 __declspec(naked) |
| 4421 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4850 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
| 4422 uint8* dst_argb, int width) { | 4851 uint8* dst_argb, int width) { |
| 4423 __asm { | 4852 __asm { |
| 4424 push esi | 4853 push esi |
| 4425 mov eax, [esp + 4 + 4] // src_argb0 | 4854 mov eax, [esp + 4 + 4] // src_argb0 |
| 4426 mov esi, [esp + 4 + 8] // src_argb1 | 4855 mov esi, [esp + 4 + 8] // src_argb1 |
| 4427 mov edx, [esp + 4 + 12] // dst_argb | 4856 mov edx, [esp + 4 + 12] // dst_argb |
| 4428 mov ecx, [esp + 4 + 16] // width | 4857 mov ecx, [esp + 4 + 16] // width |
| 4429 vpxor ymm5, ymm5, ymm5 // constant 0 | 4858 vpxor ymm5, ymm5, ymm5 // constant 0 |
| 4430 | 4859 |
| (...skipping 16 matching lines...) Expand all Loading... |
| 4447 | 4876 |
| 4448 pop esi | 4877 pop esi |
| 4449 vzeroupper | 4878 vzeroupper |
| 4450 ret | 4879 ret |
| 4451 } | 4880 } |
| 4452 } | 4881 } |
| 4453 #endif // HAS_ARGBMULTIPLYROW_AVX2 | 4882 #endif // HAS_ARGBMULTIPLYROW_AVX2 |
| 4454 | 4883 |
| 4455 #ifdef HAS_ARGBADDROW_AVX2 | 4884 #ifdef HAS_ARGBADDROW_AVX2 |
| 4456 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | 4885 // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
| 4457 __declspec(naked) __declspec(align(16)) | 4886 __declspec(naked) |
| 4458 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4887 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
| 4459 uint8* dst_argb, int width) { | 4888 uint8* dst_argb, int width) { |
| 4460 __asm { | 4889 __asm { |
| 4461 push esi | 4890 push esi |
| 4462 mov eax, [esp + 4 + 4] // src_argb0 | 4891 mov eax, [esp + 4 + 4] // src_argb0 |
| 4463 mov esi, [esp + 4 + 8] // src_argb1 | 4892 mov esi, [esp + 4 + 8] // src_argb1 |
| 4464 mov edx, [esp + 4 + 12] // dst_argb | 4893 mov edx, [esp + 4 + 12] // dst_argb |
| 4465 mov ecx, [esp + 4 + 16] // width | 4894 mov ecx, [esp + 4 + 16] // width |
| 4466 | 4895 |
| 4467 convertloop: | 4896 convertloop: |
| 4468 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 | 4897 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 |
| 4469 lea eax, [eax + 32] | 4898 lea eax, [eax + 32] |
| 4470 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 | 4899 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 |
| 4471 lea esi, [esi + 32] | 4900 lea esi, [esi + 32] |
| 4472 vmovdqu [edx], ymm0 | 4901 vmovdqu [edx], ymm0 |
| 4473 lea edx, [edx + 32] | 4902 lea edx, [edx + 32] |
| 4474 sub ecx, 8 | 4903 sub ecx, 8 |
| 4475 jg convertloop | 4904 jg convertloop |
| 4476 | 4905 |
| 4477 pop esi | 4906 pop esi |
| 4478 vzeroupper | 4907 vzeroupper |
| 4479 ret | 4908 ret |
| 4480 } | 4909 } |
| 4481 } | 4910 } |
| 4482 #endif // HAS_ARGBADDROW_AVX2 | 4911 #endif // HAS_ARGBADDROW_AVX2 |
| 4483 | 4912 |
| 4484 #ifdef HAS_ARGBSUBTRACTROW_AVX2 | 4913 #ifdef HAS_ARGBSUBTRACTROW_AVX2 |
| 4485 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. | 4914 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. |
| 4486 __declspec(naked) __declspec(align(16)) | 4915 __declspec(naked) |
| 4487 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4916 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
| 4488 uint8* dst_argb, int width) { | 4917 uint8* dst_argb, int width) { |
| 4489 __asm { | 4918 __asm { |
| 4490 push esi | 4919 push esi |
| 4491 mov eax, [esp + 4 + 4] // src_argb0 | 4920 mov eax, [esp + 4 + 4] // src_argb0 |
| 4492 mov esi, [esp + 4 + 8] // src_argb1 | 4921 mov esi, [esp + 4 + 8] // src_argb1 |
| 4493 mov edx, [esp + 4 + 12] // dst_argb | 4922 mov edx, [esp + 4 + 12] // dst_argb |
| 4494 mov ecx, [esp + 4 + 16] // width | 4923 mov ecx, [esp + 4 + 16] // width |
| 4495 | 4924 |
| 4496 convertloop: | 4925 convertloop: |
| (...skipping 11 matching lines...) Expand all Loading... |
| 4508 ret | 4937 ret |
| 4509 } | 4938 } |
| 4510 } | 4939 } |
| 4511 #endif // HAS_ARGBSUBTRACTROW_AVX2 | 4940 #endif // HAS_ARGBSUBTRACTROW_AVX2 |
| 4512 | 4941 |
| 4513 #ifdef HAS_SOBELXROW_SSE2 | 4942 #ifdef HAS_SOBELXROW_SSE2 |
| 4514 // SobelX as a matrix is | 4943 // SobelX as a matrix is |
| 4515 // -1 0 1 | 4944 // -1 0 1 |
| 4516 // -2 0 2 | 4945 // -2 0 2 |
| 4517 // -1 0 1 | 4946 // -1 0 1 |
| 4518 __declspec(naked) __declspec(align(16)) | 4947 __declspec(naked) |
| 4519 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 4948 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
| 4520 const uint8* src_y2, uint8* dst_sobelx, int width) { | 4949 const uint8* src_y2, uint8* dst_sobelx, int width) { |
| 4521 __asm { | 4950 __asm { |
| 4522 push esi | 4951 push esi |
| 4523 push edi | 4952 push edi |
| 4524 mov eax, [esp + 8 + 4] // src_y0 | 4953 mov eax, [esp + 8 + 4] // src_y0 |
| 4525 mov esi, [esp + 8 + 8] // src_y1 | 4954 mov esi, [esp + 8 + 8] // src_y1 |
| 4526 mov edi, [esp + 8 + 12] // src_y2 | 4955 mov edi, [esp + 8 + 12] // src_y2 |
| 4527 mov edx, [esp + 8 + 16] // dst_sobelx | 4956 mov edx, [esp + 8 + 16] // dst_sobelx |
| 4528 mov ecx, [esp + 8 + 20] // width | 4957 mov ecx, [esp + 8 + 20] // width |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4564 ret | 4993 ret |
| 4565 } | 4994 } |
| 4566 } | 4995 } |
| 4567 #endif // HAS_SOBELXROW_SSE2 | 4996 #endif // HAS_SOBELXROW_SSE2 |
| 4568 | 4997 |
| 4569 #ifdef HAS_SOBELYROW_SSE2 | 4998 #ifdef HAS_SOBELYROW_SSE2 |
| 4570 // SobelY as a matrix is | 4999 // SobelY as a matrix is |
| 4571 // -1 -2 -1 | 5000 // -1 -2 -1 |
| 4572 // 0 0 0 | 5001 // 0 0 0 |
| 4573 // 1 2 1 | 5002 // 1 2 1 |
| 4574 __declspec(naked) __declspec(align(16)) | 5003 __declspec(naked) |
| 4575 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 5004 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
| 4576 uint8* dst_sobely, int width) { | 5005 uint8* dst_sobely, int width) { |
| 4577 __asm { | 5006 __asm { |
| 4578 push esi | 5007 push esi |
| 4579 mov eax, [esp + 4 + 4] // src_y0 | 5008 mov eax, [esp + 4 + 4] // src_y0 |
| 4580 mov esi, [esp + 4 + 8] // src_y1 | 5009 mov esi, [esp + 4 + 8] // src_y1 |
| 4581 mov edx, [esp + 4 + 12] // dst_sobely | 5010 mov edx, [esp + 4 + 12] // dst_sobely |
| 4582 mov ecx, [esp + 4 + 16] // width | 5011 mov ecx, [esp + 4 + 16] // width |
| 4583 sub esi, eax | 5012 sub esi, eax |
| 4584 sub edx, eax | 5013 sub edx, eax |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4617 } | 5046 } |
| 4618 } | 5047 } |
| 4619 #endif // HAS_SOBELYROW_SSE2 | 5048 #endif // HAS_SOBELYROW_SSE2 |
| 4620 | 5049 |
| 4621 #ifdef HAS_SOBELROW_SSE2 | 5050 #ifdef HAS_SOBELROW_SSE2 |
| 4622 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | 5051 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
| 4623 // A = 255 | 5052 // A = 255 |
| 4624 // R = Sobel | 5053 // R = Sobel |
| 4625 // G = Sobel | 5054 // G = Sobel |
| 4626 // B = Sobel | 5055 // B = Sobel |
| 4627 __declspec(naked) __declspec(align(16)) | 5056 __declspec(naked) |
| 4628 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 5057 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
| 4629 uint8* dst_argb, int width) { | 5058 uint8* dst_argb, int width) { |
| 4630 __asm { | 5059 __asm { |
| 4631 push esi | 5060 push esi |
| 4632 mov eax, [esp + 4 + 4] // src_sobelx | 5061 mov eax, [esp + 4 + 4] // src_sobelx |
| 4633 mov esi, [esp + 4 + 8] // src_sobely | 5062 mov esi, [esp + 4 + 8] // src_sobely |
| 4634 mov edx, [esp + 4 + 12] // dst_argb | 5063 mov edx, [esp + 4 + 12] // dst_argb |
| 4635 mov ecx, [esp + 4 + 16] // width | 5064 mov ecx, [esp + 4 + 16] // width |
| 4636 sub esi, eax | 5065 sub esi, eax |
| 4637 pcmpeqb xmm5, xmm5 // alpha 255 | 5066 pcmpeqb xmm5, xmm5 // alpha 255 |
| (...skipping 26 matching lines...) Expand all Loading... |
| 4664 jg convertloop | 5093 jg convertloop |
| 4665 | 5094 |
| 4666 pop esi | 5095 pop esi |
| 4667 ret | 5096 ret |
| 4668 } | 5097 } |
| 4669 } | 5098 } |
| 4670 #endif // HAS_SOBELROW_SSE2 | 5099 #endif // HAS_SOBELROW_SSE2 |
| 4671 | 5100 |
| 4672 #ifdef HAS_SOBELTOPLANEROW_SSE2 | 5101 #ifdef HAS_SOBELTOPLANEROW_SSE2 |
| 4673 // Adds Sobel X and Sobel Y and stores Sobel into a plane. | 5102 // Adds Sobel X and Sobel Y and stores Sobel into a plane. |
| 4674 __declspec(naked) __declspec(align(16)) | 5103 __declspec(naked) |
| 4675 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 5104 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
| 4676 uint8* dst_y, int width) { | 5105 uint8* dst_y, int width) { |
| 4677 __asm { | 5106 __asm { |
| 4678 push esi | 5107 push esi |
| 4679 mov eax, [esp + 4 + 4] // src_sobelx | 5108 mov eax, [esp + 4 + 4] // src_sobelx |
| 4680 mov esi, [esp + 4 + 8] // src_sobely | 5109 mov esi, [esp + 4 + 8] // src_sobely |
| 4681 mov edx, [esp + 4 + 12] // dst_argb | 5110 mov edx, [esp + 4 + 12] // dst_argb |
| 4682 mov ecx, [esp + 4 + 16] // width | 5111 mov ecx, [esp + 4 + 16] // width |
| 4683 sub esi, eax | 5112 sub esi, eax |
| 4684 | 5113 |
| (...skipping 12 matching lines...) Expand all Loading... |
| 4697 } | 5126 } |
| 4698 } | 5127 } |
| 4699 #endif // HAS_SOBELTOPLANEROW_SSE2 | 5128 #endif // HAS_SOBELTOPLANEROW_SSE2 |
| 4700 | 5129 |
| 4701 #ifdef HAS_SOBELXYROW_SSE2 | 5130 #ifdef HAS_SOBELXYROW_SSE2 |
| 4702 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | 5131 // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
| 4703 // A = 255 | 5132 // A = 255 |
| 4704 // R = Sobel X | 5133 // R = Sobel X |
| 4705 // G = Sobel | 5134 // G = Sobel |
| 4706 // B = Sobel Y | 5135 // B = Sobel Y |
| 4707 __declspec(naked) __declspec(align(16)) | 5136 __declspec(naked) |
| 4708 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 5137 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
| 4709 uint8* dst_argb, int width) { | 5138 uint8* dst_argb, int width) { |
| 4710 __asm { | 5139 __asm { |
| 4711 push esi | 5140 push esi |
| 4712 mov eax, [esp + 4 + 4] // src_sobelx | 5141 mov eax, [esp + 4 + 4] // src_sobelx |
| 4713 mov esi, [esp + 4 + 8] // src_sobely | 5142 mov esi, [esp + 4 + 8] // src_sobely |
| 4714 mov edx, [esp + 4 + 12] // dst_argb | 5143 mov edx, [esp + 4 + 12] // dst_argb |
| 4715 mov ecx, [esp + 4 + 16] // width | 5144 mov ecx, [esp + 4 + 16] // width |
| 4716 sub esi, eax | 5145 sub esi, eax |
| 4717 pcmpeqb xmm5, xmm5 // alpha 255 | 5146 pcmpeqb xmm5, xmm5 // alpha 255 |
| (...skipping 266 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4984 sub ecx, 1 | 5413 sub ecx, 1 |
| 4985 jge l1 | 5414 jge l1 |
| 4986 | 5415 |
| 4987 l1b: | 5416 l1b: |
| 4988 } | 5417 } |
| 4989 } | 5418 } |
| 4990 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 | 5419 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
| 4991 | 5420 |
| 4992 #ifdef HAS_ARGBAFFINEROW_SSE2 | 5421 #ifdef HAS_ARGBAFFINEROW_SSE2 |
| 4993 // Copy ARGB pixels from source image with slope to a row of destination. | 5422 // Copy ARGB pixels from source image with slope to a row of destination. |
| 4994 __declspec(naked) __declspec(align(16)) | 5423 __declspec(naked) |
| 4995 LIBYUV_API | 5424 LIBYUV_API |
| 4996 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, | 5425 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, |
| 4997 uint8* dst_argb, const float* uv_dudv, int width) { | 5426 uint8* dst_argb, const float* uv_dudv, int width) { |
| 4998 __asm { | 5427 __asm { |
| 4999 push esi | 5428 push esi |
| 5000 push edi | 5429 push edi |
| 5001 mov eax, [esp + 12] // src_argb | 5430 mov eax, [esp + 12] // src_argb |
| 5002 mov esi, [esp + 16] // stride | 5431 mov esi, [esp + 16] // stride |
| 5003 mov edx, [esp + 20] // dst_argb | 5432 mov edx, [esp + 20] // dst_argb |
| 5004 mov ecx, [esp + 24] // pointer to uv_dudv | 5433 mov ecx, [esp + 24] // pointer to uv_dudv |
| (...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5069 l1b: | 5498 l1b: |
| 5070 pop edi | 5499 pop edi |
| 5071 pop esi | 5500 pop esi |
| 5072 ret | 5501 ret |
| 5073 } | 5502 } |
| 5074 } | 5503 } |
| 5075 #endif // HAS_ARGBAFFINEROW_SSE2 | 5504 #endif // HAS_ARGBAFFINEROW_SSE2 |
| 5076 | 5505 |
| 5077 #ifdef HAS_INTERPOLATEROW_AVX2 | 5506 #ifdef HAS_INTERPOLATEROW_AVX2 |
| 5078 // Bilinear filter 32x2 -> 32x1 | 5507 // Bilinear filter 32x2 -> 32x1 |
| 5079 __declspec(naked) __declspec(align(16)) | 5508 __declspec(naked) |
| 5080 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, | 5509 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, |
| 5081 ptrdiff_t src_stride, int dst_width, | 5510 ptrdiff_t src_stride, int dst_width, |
| 5082 int source_y_fraction) { | 5511 int source_y_fraction) { |
| 5083 __asm { | 5512 __asm { |
| 5084 push esi | 5513 push esi |
| 5085 push edi | 5514 push edi |
| 5086 mov edi, [esp + 8 + 4] // dst_ptr | 5515 mov edi, [esp + 8 + 4] // dst_ptr |
| 5087 mov esi, [esp + 8 + 8] // src_ptr | 5516 mov esi, [esp + 8 + 8] // src_ptr |
| 5088 mov edx, [esp + 8 + 12] // src_stride | 5517 mov edx, [esp + 8 + 12] // src_stride |
| 5089 mov ecx, [esp + 8 + 16] // dst_width | 5518 mov ecx, [esp + 8 + 16] // dst_width |
| (...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5166 xloop99: | 5595 xloop99: |
| 5167 pop edi | 5596 pop edi |
| 5168 pop esi | 5597 pop esi |
| 5169 vzeroupper | 5598 vzeroupper |
| 5170 ret | 5599 ret |
| 5171 } | 5600 } |
| 5172 } | 5601 } |
| 5173 #endif // HAS_INTERPOLATEROW_AVX2 | 5602 #endif // HAS_INTERPOLATEROW_AVX2 |
| 5174 | 5603 |
| 5175 // Bilinear filter 16x2 -> 16x1 | 5604 // Bilinear filter 16x2 -> 16x1 |
| 5176 __declspec(naked) __declspec(align(16)) | 5605 __declspec(naked) |
| 5177 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 5606 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
| 5178 ptrdiff_t src_stride, int dst_width, | 5607 ptrdiff_t src_stride, int dst_width, |
| 5179 int source_y_fraction) { | 5608 int source_y_fraction) { |
| 5180 __asm { | 5609 __asm { |
| 5181 push esi | 5610 push esi |
| 5182 push edi | 5611 push edi |
| 5183 mov edi, [esp + 8 + 4] // dst_ptr | 5612 mov edi, [esp + 8 + 4] // dst_ptr |
| 5184 mov esi, [esp + 8 + 8] // src_ptr | 5613 mov esi, [esp + 8 + 8] // src_ptr |
| 5185 mov edx, [esp + 8 + 12] // src_stride | 5614 mov edx, [esp + 8 + 12] // src_stride |
| 5186 mov ecx, [esp + 8 + 16] // dst_width | 5615 mov ecx, [esp + 8 + 16] // dst_width |
| (...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5267 | 5696 |
| 5268 xloop99: | 5697 xloop99: |
| 5269 pop edi | 5698 pop edi |
| 5270 pop esi | 5699 pop esi |
| 5271 ret | 5700 ret |
| 5272 } | 5701 } |
| 5273 } | 5702 } |
| 5274 | 5703 |
| 5275 #ifdef HAS_INTERPOLATEROW_SSE2 | 5704 #ifdef HAS_INTERPOLATEROW_SSE2 |
| 5276 // Bilinear filter 16x2 -> 16x1 | 5705 // Bilinear filter 16x2 -> 16x1 |
| 5277 __declspec(naked) __declspec(align(16)) | 5706 __declspec(naked) |
| 5278 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, | 5707 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
| 5279 ptrdiff_t src_stride, int dst_width, | 5708 ptrdiff_t src_stride, int dst_width, |
| 5280 int source_y_fraction) { | 5709 int source_y_fraction) { |
| 5281 __asm { | 5710 __asm { |
| 5282 push esi | 5711 push esi |
| 5283 push edi | 5712 push edi |
| 5284 mov edi, [esp + 8 + 4] // dst_ptr | 5713 mov edi, [esp + 8 + 4] // dst_ptr |
| 5285 mov esi, [esp + 8 + 8] // src_ptr | 5714 mov esi, [esp + 8 + 8] // src_ptr |
| 5286 mov edx, [esp + 8 + 12] // src_stride | 5715 mov edx, [esp + 8 + 12] // src_stride |
| 5287 mov ecx, [esp + 8 + 16] // dst_width | 5716 mov ecx, [esp + 8 + 16] // dst_width |
| (...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5373 jg xloop100 | 5802 jg xloop100 |
| 5374 | 5803 |
| 5375 xloop99: | 5804 xloop99: |
| 5376 pop edi | 5805 pop edi |
| 5377 pop esi | 5806 pop esi |
| 5378 ret | 5807 ret |
| 5379 } | 5808 } |
| 5380 } | 5809 } |
| 5381 #endif // HAS_INTERPOLATEROW_SSE2 | 5810 #endif // HAS_INTERPOLATEROW_SSE2 |
| 5382 | 5811 |
| 5383 // Specialized ARGB to Bayer that just isolates G channel. | |
| 5384 __declspec(naked) __declspec(align(16)) | |
| 5385 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, | |
| 5386 uint32 selector, int pix) { | |
| 5387 __asm { | |
| 5388 mov eax, [esp + 4] // src_argb | |
| 5389 mov edx, [esp + 8] // dst_bayer | |
| 5390 // selector | |
| 5391 mov ecx, [esp + 16] // pix | |
| 5392 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff | |
| 5393 psrld xmm5, 24 | |
| 5394 | |
| 5395 wloop: | |
| 5396 movdqu xmm0, [eax] | |
| 5397 movdqu xmm1, [eax + 16] | |
| 5398 lea eax, [eax + 32] | |
| 5399 psrld xmm0, 8 // Move green to bottom. | |
| 5400 psrld xmm1, 8 | |
| 5401 pand xmm0, xmm5 | |
| 5402 pand xmm1, xmm5 | |
| 5403 packssdw xmm0, xmm1 | |
| 5404 packuswb xmm0, xmm1 | |
| 5405 movq qword ptr [edx], xmm0 | |
| 5406 lea edx, [edx + 8] | |
| 5407 sub ecx, 8 | |
| 5408 jg wloop | |
| 5409 ret | |
| 5410 } | |
| 5411 } | |
| 5412 | |
| 5413 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5812 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
| 5414 __declspec(naked) __declspec(align(16)) | 5813 __declspec(naked) |
| 5415 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 5814 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
| 5416 const uint8* shuffler, int pix) { | 5815 const uint8* shuffler, int pix) { |
| 5417 __asm { | 5816 __asm { |
| 5418 mov eax, [esp + 4] // src_argb | 5817 mov eax, [esp + 4] // src_argb |
| 5419 mov edx, [esp + 8] // dst_argb | 5818 mov edx, [esp + 8] // dst_argb |
| 5420 mov ecx, [esp + 12] // shuffler | 5819 mov ecx, [esp + 12] // shuffler |
| 5421 movdqu xmm5, [ecx] | 5820 movdqu xmm5, [ecx] |
| 5422 mov ecx, [esp + 16] // pix | 5821 mov ecx, [esp + 16] // pix |
| 5423 | 5822 |
| 5424 wloop: | 5823 wloop: |
| 5425 movdqu xmm0, [eax] | 5824 movdqu xmm0, [eax] |
| 5426 movdqu xmm1, [eax + 16] | 5825 movdqu xmm1, [eax + 16] |
| 5427 lea eax, [eax + 32] | 5826 lea eax, [eax + 32] |
| 5428 pshufb xmm0, xmm5 | 5827 pshufb xmm0, xmm5 |
| 5429 pshufb xmm1, xmm5 | 5828 pshufb xmm1, xmm5 |
| 5430 movdqu [edx], xmm0 | 5829 movdqu [edx], xmm0 |
| 5431 movdqu [edx + 16], xmm1 | 5830 movdqu [edx + 16], xmm1 |
| 5432 lea edx, [edx + 32] | 5831 lea edx, [edx + 32] |
| 5433 sub ecx, 8 | 5832 sub ecx, 8 |
| 5434 jg wloop | 5833 jg wloop |
| 5435 ret | 5834 ret |
| 5436 } | 5835 } |
| 5437 } | 5836 } |
| 5438 | 5837 |
| 5439 #ifdef HAS_ARGBSHUFFLEROW_AVX2 | 5838 #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
| 5440 __declspec(naked) __declspec(align(16)) | 5839 __declspec(naked) |
| 5441 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 5840 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
| 5442 const uint8* shuffler, int pix) { | 5841 const uint8* shuffler, int pix) { |
| 5443 __asm { | 5842 __asm { |
| 5444 mov eax, [esp + 4] // src_argb | 5843 mov eax, [esp + 4] // src_argb |
| 5445 mov edx, [esp + 8] // dst_argb | 5844 mov edx, [esp + 8] // dst_argb |
| 5446 mov ecx, [esp + 12] // shuffler | 5845 mov ecx, [esp + 12] // shuffler |
| 5447 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. | 5846 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. |
| 5448 mov ecx, [esp + 16] // pix | 5847 mov ecx, [esp + 16] // pix |
| 5449 | 5848 |
| 5450 wloop: | 5849 wloop: |
| 5451 vmovdqu ymm0, [eax] | 5850 vmovdqu ymm0, [eax] |
| 5452 vmovdqu ymm1, [eax + 32] | 5851 vmovdqu ymm1, [eax + 32] |
| 5453 lea eax, [eax + 64] | 5852 lea eax, [eax + 64] |
| 5454 vpshufb ymm0, ymm0, ymm5 | 5853 vpshufb ymm0, ymm0, ymm5 |
| 5455 vpshufb ymm1, ymm1, ymm5 | 5854 vpshufb ymm1, ymm1, ymm5 |
| 5456 vmovdqu [edx], ymm0 | 5855 vmovdqu [edx], ymm0 |
| 5457 vmovdqu [edx + 32], ymm1 | 5856 vmovdqu [edx + 32], ymm1 |
| 5458 lea edx, [edx + 64] | 5857 lea edx, [edx + 64] |
| 5459 sub ecx, 16 | 5858 sub ecx, 16 |
| 5460 jg wloop | 5859 jg wloop |
| 5461 | 5860 |
| 5462 vzeroupper | 5861 vzeroupper |
| 5463 ret | 5862 ret |
| 5464 } | 5863 } |
| 5465 } | 5864 } |
| 5466 #endif // HAS_ARGBSHUFFLEROW_AVX2 | 5865 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
| 5467 | 5866 |
| 5468 __declspec(naked) __declspec(align(16)) | 5867 __declspec(naked) |
| 5469 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 5868 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
| 5470 const uint8* shuffler, int pix) { | 5869 const uint8* shuffler, int pix) { |
| 5471 __asm { | 5870 __asm { |
| 5472 push ebx | 5871 push ebx |
| 5473 push esi | 5872 push esi |
| 5474 mov eax, [esp + 8 + 4] // src_argb | 5873 mov eax, [esp + 8 + 4] // src_argb |
| 5475 mov edx, [esp + 8 + 8] // dst_argb | 5874 mov edx, [esp + 8 + 8] // dst_argb |
| 5476 mov esi, [esp + 8 + 12] // shuffler | 5875 mov esi, [esp + 8 + 12] // shuffler |
| 5477 mov ecx, [esp + 8 + 16] // pix | 5876 mov ecx, [esp + 8 + 16] // pix |
| 5478 pxor xmm5, xmm5 | 5877 pxor xmm5, xmm5 |
| (...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5580 ret | 5979 ret |
| 5581 } | 5980 } |
| 5582 } | 5981 } |
| 5583 | 5982 |
| 5584 // YUY2 - Macro-pixel = 2 image pixels | 5983 // YUY2 - Macro-pixel = 2 image pixels |
| 5585 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... | 5984 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... |
| 5586 | 5985 |
| 5587 // UYVY - Macro-pixel = 2 image pixels | 5986 // UYVY - Macro-pixel = 2 image pixels |
| 5588 // U0Y0V0Y1 | 5987 // U0Y0V0Y1 |
| 5589 | 5988 |
| 5590 __declspec(naked) __declspec(align(16)) | 5989 __declspec(naked) |
| 5591 void I422ToYUY2Row_SSE2(const uint8* src_y, | 5990 void I422ToYUY2Row_SSE2(const uint8* src_y, |
| 5592 const uint8* src_u, | 5991 const uint8* src_u, |
| 5593 const uint8* src_v, | 5992 const uint8* src_v, |
| 5594 uint8* dst_frame, int width) { | 5993 uint8* dst_frame, int width) { |
| 5595 __asm { | 5994 __asm { |
| 5596 push esi | 5995 push esi |
| 5597 push edi | 5996 push edi |
| 5598 mov eax, [esp + 8 + 4] // src_y | 5997 mov eax, [esp + 8 + 4] // src_y |
| 5599 mov esi, [esp + 8 + 8] // src_u | 5998 mov esi, [esp + 8 + 8] // src_u |
| 5600 mov edx, [esp + 8 + 12] // src_v | 5999 mov edx, [esp + 8 + 12] // src_v |
| (...skipping 16 matching lines...) Expand all Loading... |
| 5617 lea edi, [edi + 32] | 6016 lea edi, [edi + 32] |
| 5618 sub ecx, 16 | 6017 sub ecx, 16 |
| 5619 jg convertloop | 6018 jg convertloop |
| 5620 | 6019 |
| 5621 pop edi | 6020 pop edi |
| 5622 pop esi | 6021 pop esi |
| 5623 ret | 6022 ret |
| 5624 } | 6023 } |
| 5625 } | 6024 } |
| 5626 | 6025 |
| 5627 __declspec(naked) __declspec(align(16)) | 6026 __declspec(naked) |
| 5628 void I422ToUYVYRow_SSE2(const uint8* src_y, | 6027 void I422ToUYVYRow_SSE2(const uint8* src_y, |
| 5629 const uint8* src_u, | 6028 const uint8* src_u, |
| 5630 const uint8* src_v, | 6029 const uint8* src_v, |
| 5631 uint8* dst_frame, int width) { | 6030 uint8* dst_frame, int width) { |
| 5632 __asm { | 6031 __asm { |
| 5633 push esi | 6032 push esi |
| 5634 push edi | 6033 push edi |
| 5635 mov eax, [esp + 8 + 4] // src_y | 6034 mov eax, [esp + 8 + 4] // src_y |
| 5636 mov esi, [esp + 8 + 8] // src_u | 6035 mov esi, [esp + 8 + 8] // src_u |
| 5637 mov edx, [esp + 8 + 12] // src_v | 6036 mov edx, [esp + 8 + 12] // src_v |
| (...skipping 17 matching lines...) Expand all Loading... |
| 5655 sub ecx, 16 | 6054 sub ecx, 16 |
| 5656 jg convertloop | 6055 jg convertloop |
| 5657 | 6056 |
| 5658 pop edi | 6057 pop edi |
| 5659 pop esi | 6058 pop esi |
| 5660 ret | 6059 ret |
| 5661 } | 6060 } |
| 5662 } | 6061 } |
| 5663 | 6062 |
| 5664 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 | 6063 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 |
| 5665 __declspec(naked) __declspec(align(16)) | 6064 __declspec(naked) |
| 5666 void ARGBPolynomialRow_SSE2(const uint8* src_argb, | 6065 void ARGBPolynomialRow_SSE2(const uint8* src_argb, |
| 5667 uint8* dst_argb, const float* poly, | 6066 uint8* dst_argb, const float* poly, |
| 5668 int width) { | 6067 int width) { |
| 5669 __asm { | 6068 __asm { |
| 5670 push esi | 6069 push esi |
| 5671 mov eax, [esp + 4 + 4] /* src_argb */ | 6070 mov eax, [esp + 4 + 4] /* src_argb */ |
| 5672 mov edx, [esp + 4 + 8] /* dst_argb */ | 6071 mov edx, [esp + 4 + 8] /* dst_argb */ |
| 5673 mov esi, [esp + 4 + 12] /* poly */ | 6072 mov esi, [esp + 4 + 12] /* poly */ |
| 5674 mov ecx, [esp + 4 + 16] /* width */ | 6073 mov ecx, [esp + 4 + 16] /* width */ |
| 5675 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. | 6074 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5714 lea edx, [edx + 8] | 6113 lea edx, [edx + 8] |
| 5715 sub ecx, 2 | 6114 sub ecx, 2 |
| 5716 jg convertloop | 6115 jg convertloop |
| 5717 pop esi | 6116 pop esi |
| 5718 ret | 6117 ret |
| 5719 } | 6118 } |
| 5720 } | 6119 } |
| 5721 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 | 6120 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 |
| 5722 | 6121 |
| 5723 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 | 6122 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 |
| 5724 __declspec(naked) __declspec(align(16)) | 6123 __declspec(naked) |
| 5725 void ARGBPolynomialRow_AVX2(const uint8* src_argb, | 6124 void ARGBPolynomialRow_AVX2(const uint8* src_argb, |
| 5726 uint8* dst_argb, const float* poly, | 6125 uint8* dst_argb, const float* poly, |
| 5727 int width) { | 6126 int width) { |
| 5728 __asm { | 6127 __asm { |
| 5729 mov eax, [esp + 4] /* src_argb */ | 6128 mov eax, [esp + 4] /* src_argb */ |
| 5730 mov edx, [esp + 8] /* dst_argb */ | 6129 mov edx, [esp + 8] /* dst_argb */ |
| 5731 mov ecx, [esp + 12] /* poly */ | 6130 mov ecx, [esp + 12] /* poly */ |
| 5732 vbroadcastf128 ymm4, [ecx] // C0 | 6131 vbroadcastf128 ymm4, [ecx] // C0 |
| 5733 vbroadcastf128 ymm5, [ecx + 16] // C1 | 6132 vbroadcastf128 ymm5, [ecx + 16] // C1 |
| 5734 vbroadcastf128 ymm6, [ecx + 32] // C2 | 6133 vbroadcastf128 ymm6, [ecx + 32] // C2 |
| (...skipping 19 matching lines...) Expand all Loading... |
| 5754 sub ecx, 2 | 6153 sub ecx, 2 |
| 5755 jg convertloop | 6154 jg convertloop |
| 5756 vzeroupper | 6155 vzeroupper |
| 5757 ret | 6156 ret |
| 5758 } | 6157 } |
| 5759 } | 6158 } |
| 5760 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 | 6159 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 |
| 5761 | 6160 |
| 5762 #ifdef HAS_ARGBCOLORTABLEROW_X86 | 6161 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
| 5763 // Tranform ARGB pixels with color table. | 6162 // Tranform ARGB pixels with color table. |
| 5764 __declspec(naked) __declspec(align(16)) | 6163 __declspec(naked) |
| 5765 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 6164 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, |
| 5766 int width) { | 6165 int width) { |
| 5767 __asm { | 6166 __asm { |
| 5768 push esi | 6167 push esi |
| 5769 mov eax, [esp + 4 + 4] /* dst_argb */ | 6168 mov eax, [esp + 4 + 4] /* dst_argb */ |
| 5770 mov esi, [esp + 4 + 8] /* table_argb */ | 6169 mov esi, [esp + 4 + 8] /* table_argb */ |
| 5771 mov ecx, [esp + 4 + 12] /* width */ | 6170 mov ecx, [esp + 4 + 12] /* width */ |
| 5772 | 6171 |
| 5773 // 1 pixel loop. | 6172 // 1 pixel loop. |
| 5774 convertloop: | 6173 convertloop: |
| (...skipping 13 matching lines...) Expand all Loading... |
| 5788 dec ecx | 6187 dec ecx |
| 5789 jg convertloop | 6188 jg convertloop |
| 5790 pop esi | 6189 pop esi |
| 5791 ret | 6190 ret |
| 5792 } | 6191 } |
| 5793 } | 6192 } |
| 5794 #endif // HAS_ARGBCOLORTABLEROW_X86 | 6193 #endif // HAS_ARGBCOLORTABLEROW_X86 |
| 5795 | 6194 |
| 5796 #ifdef HAS_RGBCOLORTABLEROW_X86 | 6195 #ifdef HAS_RGBCOLORTABLEROW_X86 |
| 5797 // Tranform RGB pixels with color table. | 6196 // Tranform RGB pixels with color table. |
| 5798 __declspec(naked) __declspec(align(16)) | 6197 __declspec(naked) |
| 5799 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { | 6198 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { |
| 5800 __asm { | 6199 __asm { |
| 5801 push esi | 6200 push esi |
| 5802 mov eax, [esp + 4 + 4] /* dst_argb */ | 6201 mov eax, [esp + 4 + 4] /* dst_argb */ |
| 5803 mov esi, [esp + 4 + 8] /* table_argb */ | 6202 mov esi, [esp + 4 + 8] /* table_argb */ |
| 5804 mov ecx, [esp + 4 + 12] /* width */ | 6203 mov ecx, [esp + 4 + 12] /* width */ |
| 5805 | 6204 |
| 5806 // 1 pixel loop. | 6205 // 1 pixel loop. |
| 5807 convertloop: | 6206 convertloop: |
| 5808 movzx edx, byte ptr [eax] | 6207 movzx edx, byte ptr [eax] |
| (...skipping 10 matching lines...) Expand all Loading... |
| 5819 jg convertloop | 6218 jg convertloop |
| 5820 | 6219 |
| 5821 pop esi | 6220 pop esi |
| 5822 ret | 6221 ret |
| 5823 } | 6222 } |
| 5824 } | 6223 } |
| 5825 #endif // HAS_RGBCOLORTABLEROW_X86 | 6224 #endif // HAS_RGBCOLORTABLEROW_X86 |
| 5826 | 6225 |
| 5827 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6226 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 5828 // Tranform RGB pixels with luma table. | 6227 // Tranform RGB pixels with luma table. |
| 5829 __declspec(naked) __declspec(align(16)) | 6228 __declspec(naked) |
| 5830 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 6229 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
| 5831 int width, | 6230 int width, |
| 5832 const uint8* luma, uint32 lumacoeff) { | 6231 const uint8* luma, uint32 lumacoeff) { |
| 5833 __asm { | 6232 __asm { |
| 5834 push esi | 6233 push esi |
| 5835 push edi | 6234 push edi |
| 5836 mov eax, [esp + 8 + 4] /* src_argb */ | 6235 mov eax, [esp + 8 + 4] /* src_argb */ |
| 5837 mov edi, [esp + 8 + 8] /* dst_argb */ | 6236 mov edi, [esp + 8 + 8] /* dst_argb */ |
| 5838 mov ecx, [esp + 8 + 12] /* width */ | 6237 mov ecx, [esp + 8 + 12] /* width */ |
| 5839 movd xmm2, dword ptr [esp + 8 + 16] // luma table | 6238 movd xmm2, dword ptr [esp + 8 + 16] // luma table |
| (...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5917 jg convertloop | 6316 jg convertloop |
| 5918 | 6317 |
| 5919 pop edi | 6318 pop edi |
| 5920 pop esi | 6319 pop esi |
| 5921 ret | 6320 ret |
| 5922 } | 6321 } |
| 5923 } | 6322 } |
| 5924 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6323 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 5925 | 6324 |
| 5926 #endif // defined(_M_X64) | 6325 #endif // defined(_M_X64) |
| 5927 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) | 6326 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
| 5928 | 6327 |
| 5929 #ifdef __cplusplus | 6328 #ifdef __cplusplus |
| 5930 } // extern "C" | 6329 } // extern "C" |
| 5931 } // namespace libyuv | 6330 } // namespace libyuv |
| 5932 #endif | 6331 #endif |
| OLD | NEW |