Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/row_win.cc

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
OLD | NEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include "libyuv/row.h" 11 #include "libyuv/row.h"
12 12
13 #if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) 13 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
14 defined(_MSC_VER) && !defined(__clang__)
14 #include <emmintrin.h> 15 #include <emmintrin.h>
15 #include <tmmintrin.h> // For _mm_maddubs_epi16 16 #include <tmmintrin.h> // For _mm_maddubs_epi16
16 #endif 17 #endif
17 18
18 #ifdef __cplusplus 19 #ifdef __cplusplus
19 namespace libyuv { 20 namespace libyuv {
20 extern "C" { 21 extern "C" {
21 #endif 22 #endif
22 23
23 // This module is for Visual C. 24 // This module is for Visual C.
24 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ 25 #if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
25 (defined(_M_IX86) || defined(_M_X64)) 26 defined(_MSC_VER) && !defined(__clang__)
26
27 // YUV to RGB conversion constants.
28 // Y contribution to R,G,B. Scale and bias.
29 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
30 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
31
32 // U and V contributions to R,G,B.
33 #define UB -128 /* -min(128, round(2.018 * 64)) */
34 #define UG 25 /* -round(-0.391 * 64) */
35 #define VG 52 /* -round(-0.813 * 64) */
36 #define VR -102 /* -round(1.596 * 64) */
37
38 // Bias values to subtract 16 from Y and 128 from U and V.
39 #define BB (UB * 128 - YGB)
40 #define BG (UG * 128 + VG * 128 - YGB)
41 #define BR (VR * 128 - YGB)
42 27
43 struct YuvConstants { 28 struct YuvConstants {
44 lvec8 kUVToB; // 0 29 lvec8 kUVToB; // 0
45 lvec8 kUVToG; // 32 30 lvec8 kUVToG; // 32
46 lvec8 kUVToR; // 64 31 lvec8 kUVToR; // 64
47 lvec16 kUVBiasB; // 96 32 lvec16 kUVBiasB; // 96
48 lvec16 kUVBiasG; // 128 33 lvec16 kUVBiasG; // 128
49 lvec16 kUVBiasR; // 160 34 lvec16 kUVBiasR; // 160
50 lvec16 kYToRgb; // 192 35 lvec16 kYToRgb; // 192
51 }; 36 };
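(The trailing 0/32/64... comments record each member's byte offset; the assembly below addresses these constants by fixed displacement, so the field order and alignment must not change.)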
52 37
38 // BT.601 YUV to RGB reference
39 // R = (Y - 16) * 1.164 - V * -1.596
40 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
41 // B = (Y - 16) * 1.164 - U * -2.018
42
43 // Y contribution to R,G,B. Scale and bias.
44 // TODO(fbarchard): Consider moving constants into a common header.
45 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
46 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
47
48 // U and V contributions to R,G,B.
49 #define UB -128 /* max(-128, round(-2.018 * 64)) */
50 #define UG 25 /* round(0.391 * 64) */
51 #define VG 52 /* round(0.813 * 64) */
52 #define VR -102 /* round(-1.596 * 64) */
53
54 // Bias values to subtract 16 from Y and 128 from U and V.
55 #define BB (UB * 128 + YGB)
56 #define BG (UG * 128 + VG * 128 + YGB)
57 #define BR (VR * 128 + YGB)
58
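For reference, the fixed-point scheme these macros encode, written out as scalar C (a sketch of libyuv's C reference path, not code in this CL; Clamp and YuvPixel here are illustrative, and uint8/int32 etc. are libyuv's fixed-width typedefs):

static __inline int32 Clamp(int32 v) {
  return (v < 0) ? 0 : ((v > 255) ? 255 : v);
}

static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
                              uint8* b, uint8* g, uint8* r) {
  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;  // ~= 1.164 * 64 * y
  *b = (uint8)Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
  *g = (uint8)Clamp((int32)(-(u * UG + v * VG) + y1 + BG) >> 6);
  *r = (uint8)Clamp((int32)(-(v * VR) + y1 + BR) >> 6);
}

E.g. y = 235, u = v = 128 gives r = (13056 + 17507 - 14216) >> 6 = 255.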
53 // BT601 constants for YUV to RGB. 59 // BT601 constants for YUV to RGB.
54 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { 60 static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
55 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, 61 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
56 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, 62 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
57 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, 63 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
58 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, 64 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
59 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 65 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
60 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, 66 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
61 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, 67 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
62 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, 68 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
63 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, 69 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
64 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } 70 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
65 }; 71 };
66 72
67 // BT601 constants for NV21 where chroma plane is VU instead of UV. 73 // BT601 constants for NV21 where chroma plane is VU instead of UV.
68 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { 74 static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
69 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 75 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
70 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, 76 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
71 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, 77 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
72 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, 78 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
73 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, 79 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
74 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, 80 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
75 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, 81 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
76 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, 82 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
77 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, 83 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
78 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } 84 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
79 }; 85 };
80 86
87 #undef YG
88 #undef YGB
89 #undef UB
90 #undef UG
91 #undef VG
92 #undef VR
93 #undef BB
94 #undef BG
95 #undef BR
96
97 // JPEG YUV to RGB reference
98 // * R = Y - V * -1.40200
99 // * G = Y - U * 0.34414 - V * 0.71414
100 // * B = Y - U * -1.77200
101
102 // Y contribution to R,G,B. Scale and bias.
103 // TODO(fbarchard): Consider moving constants into a common header.
104 #define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
105 #define YGBJ 32 /* 64 / 2 */
106
107 // U and V contributions to R,G,B.
108 #define UBJ -113 /* round(-1.77200 * 64) */
109 #define UGJ 22 /* round(0.34414 * 64) */
110 #define VGJ 46 /* round(0.71414 * 64) */
111 #define VRJ -90 /* round(-1.40200 * 64) */
112
113 // Bias values to subtract 16 from Y and 128 from U and V.
114 #define BBJ (UBJ * 128 + YGBJ)
115 #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
116 #define BRJ (VRJ * 128 + YGBJ)
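Worked check of the JPEG variant (same scalar form as the BT.601 sketch above): for full-range white, y = 255 and u = v = 128:
  y1 = (255 * 0x0101 * YGJ) >> 16 = 16320
  B  = (-(128 * UBJ) + y1 + BBJ) >> 6 = (14464 + 16320 - 14432) >> 6 = 255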
117
118 // JPEG constants for YUV to RGB.
119 static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
120 { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
121 UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
122 { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
123 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
124 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
125 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
126 { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
127 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
128 { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
129 BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
130 { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
131 BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
132 { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
133 BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
134 { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
135 YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
136 };
137
138 #undef YGJ
139 #undef YGBJ
140 #undef UBJ
141 #undef UGJ
142 #undef VGJ
143 #undef VRJ
144 #undef BBJ
145 #undef BGJ
146 #undef BRJ
147
81 // 64 bit 148 // 64 bit
82 #if defined(_M_X64) 149 #if defined(_M_X64)
83 150 #if defined(HAS_I422TOARGBROW_SSSE3)
84 __declspec(align(16))
85 void I422ToARGBRow_SSSE3(const uint8* y_buf, 151 void I422ToARGBRow_SSSE3(const uint8* y_buf,
86 const uint8* u_buf, 152 const uint8* u_buf,
87 const uint8* v_buf, 153 const uint8* v_buf,
88 uint8* dst_argb, 154 uint8* dst_argb,
89 int width) { 155 int width) {
90 __m128i xmm0, xmm1, xmm2, xmm3; 156 __m128i xmm0, xmm1, xmm2, xmm3;
91 const __m128i xmm5 = _mm_set1_epi8(-1); 157 const __m128i xmm5 = _mm_set1_epi8(-1);
92 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; 158 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
93 159
94 while (width > 0) { 160 while (width > 0) {
(...skipping 29 matching lines...)
124 190
125 _mm_storeu_si128((__m128i *)dst_argb, xmm0); 191 _mm_storeu_si128((__m128i *)dst_argb, xmm0);
126 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); 192 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
127 193
128 y_buf += 8; 194 y_buf += 8;
129 u_buf += 4; 195 u_buf += 4;
130 dst_argb += 32; 196 dst_argb += 32;
131 width -= 8; 197 width -= 8;
132 } 198 }
133 } 199 }
134 200 #endif
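(In the intrinsics row above, offset is the constant distance from the U plane to the V plane, so the loop advances u_buf alone and reads V at u_buf + offset; each pass consumes 8 Y values and 4 U/V pairs and writes 32 bytes of ARGB.)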
135 // 32 bit 201 // 32 bit
136 #else // defined(_M_X64) 202 #else // defined(_M_X64)
137
138 #ifdef HAS_ARGBTOYROW_SSSE3 203 #ifdef HAS_ARGBTOYROW_SSSE3
139 204
140 // Constants for ARGB. 205 // Constants for ARGB.
141 static const vec8 kARGBToY = { 206 static const vec8 kARGBToY = {
142 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 207 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
143 }; 208 };
144 209
145 // JPEG full range. 210 // JPEG full range.
146 static const vec8 kARGBToYJ = { 211 static const vec8 kARGBToYJ = {
147 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 212 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
(...skipping 102 matching lines...)
250 static const uvec8 kShuffleMaskARGBToRGB24_0 = { 315 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
251 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u 316 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
252 }; 317 };
253 318
254 // Shuffle table for converting ARGB to RAW. 319 // Shuffle table for converting ARGB to RAW.
255 static const uvec8 kShuffleMaskARGBToRAW_0 = { 320 static const uvec8 kShuffleMaskARGBToRAW_0 = {
256 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u 321 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
257 }; 322 };
258 323
259 // Duplicates gray value 3 times and fills in alpha opaque. 324 // Duplicates gray value 3 times and fills in alpha opaque.
260 __declspec(naked) __declspec(align(16)) 325 __declspec(naked)
261 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { 326 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
262 __asm { 327 __asm {
263 mov eax, [esp + 4] // src_y 328 mov eax, [esp + 4] // src_y
264 mov edx, [esp + 8] // dst_argb 329 mov edx, [esp + 8] // dst_argb
265 mov ecx, [esp + 12] // pix 330 mov ecx, [esp + 12] // pix
266 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 331 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
267 pslld xmm5, 24 332 pslld xmm5, 24
268 333
269 convertloop: 334 convertloop:
270 movq xmm0, qword ptr [eax] 335 movq xmm0, qword ptr [eax]
271 lea eax, [eax + 8] 336 lea eax, [eax + 8]
272 punpcklbw xmm0, xmm0 337 punpcklbw xmm0, xmm0
273 movdqa xmm1, xmm0 338 movdqa xmm1, xmm0
274 punpcklwd xmm0, xmm0 339 punpcklwd xmm0, xmm0
275 punpckhwd xmm1, xmm1 340 punpckhwd xmm1, xmm1
276 por xmm0, xmm5 341 por xmm0, xmm5
277 por xmm1, xmm5 342 por xmm1, xmm5
278 movdqu [edx], xmm0 343 movdqu [edx], xmm0
279 movdqu [edx + 16], xmm1 344 movdqu [edx + 16], xmm1
280 lea edx, [edx + 32] 345 lea edx, [edx + 32]
281 sub ecx, 8 346 sub ecx, 8
282 jg convertloop 347 jg convertloop
283 ret 348 ret
284 } 349 }
285 } 350 }
286 351
287 __declspec(naked) __declspec(align(16)) 352 #ifdef HAS_J400TOARGBROW_AVX2
353 // Duplicates gray value 3 times and fills in alpha opaque.
354 __declspec(naked)
355 void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
356 __asm {
357 mov eax, [esp + 4] // src_y
358 mov edx, [esp + 8] // dst_argb
359 mov ecx, [esp + 12] // pix
360 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
361 vpslld ymm5, ymm5, 24
362
363 convertloop:
364 vmovdqu xmm0, [eax]
365 lea eax, [eax + 16]
366 vpermq ymm0, ymm0, 0xd8
367 vpunpcklbw ymm0, ymm0, ymm0
368 vpermq ymm0, ymm0, 0xd8
369 vpunpckhwd ymm1, ymm0, ymm0
370 vpunpcklwd ymm0, ymm0, ymm0
371 vpor ymm0, ymm0, ymm5
372 vpor ymm1, ymm1, ymm5
373 vmovdqu [edx], ymm0
374 vmovdqu [edx + 32], ymm1
375 lea edx, [edx + 64]
376 sub ecx, 16
377 jg convertloop
378 vzeroupper
379 ret
380 }
381 }
382 #endif // HAS_J400TOARGBROW_AVX2
383
384 __declspec(naked)
288 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { 385 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
289 __asm { 386 __asm {
290 mov eax, [esp + 4] // src_rgb24 387 mov eax, [esp + 4] // src_rgb24
291 mov edx, [esp + 8] // dst_argb 388 mov edx, [esp + 8] // dst_argb
292 mov ecx, [esp + 12] // pix 389 mov ecx, [esp + 12] // pix
293 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 390 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
294 pslld xmm5, 24 391 pslld xmm5, 24
295 movdqa xmm4, kShuffleMaskRGB24ToARGB 392 movdqa xmm4, kShuffleMaskRGB24ToARGB
296 393
297 convertloop: 394 convertloop:
(...skipping 17 matching lines...)
315 movdqu [edx + 16], xmm1 412 movdqu [edx + 16], xmm1
316 por xmm3, xmm5 413 por xmm3, xmm5
317 movdqu [edx + 48], xmm3 414 movdqu [edx + 48], xmm3
318 lea edx, [edx + 64] 415 lea edx, [edx + 64]
319 sub ecx, 16 416 sub ecx, 16
320 jg convertloop 417 jg convertloop
321 ret 418 ret
322 } 419 }
323 } 420 }
324 421
325 __declspec(naked) __declspec(align(16)) 422 __declspec(naked)
326 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, 423 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
327 int pix) { 424 int pix) {
328 __asm { 425 __asm {
329 mov eax, [esp + 4] // src_raw 426 mov eax, [esp + 4] // src_raw
330 mov edx, [esp + 8] // dst_argb 427 mov edx, [esp + 8] // dst_argb
331 mov ecx, [esp + 12] // pix 428 mov ecx, [esp + 12] // pix
332 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 429 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
333 pslld xmm5, 24 430 pslld xmm5, 24
334 movdqa xmm4, kShuffleMaskRAWToARGB 431 movdqa xmm4, kShuffleMaskRAWToARGB
335 432
(...skipping 25 matching lines...)
361 } 458 }
362 } 459 }
363 460
364 // pmul method to replicate bits. 461 // pmul method to replicate bits.
365 // Math to replicate bits: 462 // Math to replicate bits:
366 // (v << 8) | (v << 3) 463 // (v << 8) | (v << 3)
367 // v * 256 + v * 8 464 // v * 256 + v * 8
368 // v * (256 + 8) 465 // v * (256 + 8)
369 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 466 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
370 // 20 instructions. 467 // 20 instructions.
371 __declspec(naked) __declspec(align(16)) 468 __declspec(naked)
372 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, 469 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
373 int pix) { 470 int pix) {
374 __asm { 471 __asm {
375 mov eax, 0x01080108 // generate multiplier to repeat 5 bits 472 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
376 movd xmm5, eax 473 movd xmm5, eax
377 pshufd xmm5, xmm5, 0 474 pshufd xmm5, xmm5, 0
378 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits 475 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
379 movd xmm6, eax 476 movd xmm6, eax
380 pshufd xmm6, xmm6, 0 477 pshufd xmm6, xmm6, 0
381 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red 478 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
(...skipping 28 matching lines...)
410 punpckhbw xmm2, xmm0 507 punpckhbw xmm2, xmm0
411 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB 508 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
412 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB 509 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
413 lea eax, [eax + 16] 510 lea eax, [eax + 16]
414 sub ecx, 8 511 sub ecx, 8
415 jg convertloop 512 jg convertloop
416 ret 513 ret
417 } 514 }
418 } 515 }
419 516
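The multiply trick described in the comment above, as standalone scalar C (Replicate5/Replicate6 are illustrative helpers, not libyuv functions):

static __inline uint8 Replicate5(uint32 v) {  // 5-bit field, v in [0, 31]
  return (uint8)((v * (256 + 8)) >> 5);       // == (v << 3) | (v >> 2)
}
static __inline uint8 Replicate6(uint32 v) {  // 6-bit field, v in [0, 63]
  return (uint8)((v * (256 + 4)) >> 6);       // == (v << 2) | (v >> 4)
}

In the asm the field shifts are folded into the multipliers: pmulhuw by 0x0108 = 256 + 8 acts on a field already shifted to the top 5 bits, and 0x2080 = (256 + 4) << 5 handles the 6-bit green field in one step.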
517 #ifdef HAS_RGB565TOARGBROW_AVX2
518 // pmul method to replicate bits.
519 // Math to replicate bits:
520 // (v << 8) | (v << 3)
521 // v * 256 + v * 8
522 // v * (256 + 8)
523 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
524 __declspec(naked)
525 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
526 int pix) {
527 __asm {
528 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
529 vmovd xmm5, eax
530 vbroadcastss ymm5, xmm5
531 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
532 movd xmm6, eax
533 vbroadcastss ymm6, xmm6
534 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
535 vpsllw ymm3, ymm3, 11
536 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
537 vpsllw ymm4, ymm4, 10
538 vpsrlw ymm4, ymm4, 5
539 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
540 vpsllw ymm7, ymm7, 8
541
542 mov eax, [esp + 4] // src_rgb565
543 mov edx, [esp + 8] // dst_argb
544 mov ecx, [esp + 12] // pix
545 sub edx, eax
546 sub edx, eax
547
548 convertloop:
549 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
550 vpand ymm1, ymm0, ymm3 // R in upper 5 bits
551 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
552 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
553 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
554 vpsllw ymm1, ymm1, 8
555 vpor ymm1, ymm1, ymm2 // RB
556 vpand ymm0, ymm0, ymm4 // G in middle 6 bits
557 vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
558 vpor ymm0, ymm0, ymm7 // AG
559 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
560 vpermq ymm1, ymm1, 0xd8
561 vpunpckhbw ymm2, ymm1, ymm0
562 vpunpcklbw ymm1, ymm1, ymm0
563 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
564 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
565 lea eax, [eax + 32]
566 sub ecx, 16
567 jg convertloop
568 vzeroupper
569 ret
570 }
571 }
572 #endif // HAS_RGB565TOARGBROW_AVX2
573
574 #ifdef HAS_ARGB1555TOARGBROW_AVX2
575 __declspec(naked)
576 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
577 int pix) {
578 __asm {
579 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
580 vmovd xmm5, eax
581 vbroadcastss ymm5, xmm5
582 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
583 movd xmm6, eax
584 vbroadcastss ymm6, xmm6
585 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
586 vpsllw ymm3, ymm3, 11
587 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
588 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
589 vpsllw ymm7, ymm7, 8
590
591 mov eax, [esp + 4] // src_argb1555
592 mov edx, [esp + 8] // dst_argb
593 mov ecx, [esp + 12] // pix
594 sub edx, eax
595 sub edx, eax
596
597 convertloop:
598 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
599 vpsllw ymm1, ymm0, 1 // R in upper 5 bits
600 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
601 vpand ymm1, ymm1, ymm3
602 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
603 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
604 vpsllw ymm1, ymm1, 8
605 vpor ymm1, ymm1, ymm2 // RB
606 vpsraw ymm2, ymm0, 8 // A
607 vpand ymm0, ymm0, ymm4 // G in middle 5 bits
608 vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
609 vpand ymm2, ymm2, ymm7
610 vpor ymm0, ymm0, ymm2 // AG
611 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
612 vpermq ymm1, ymm1, 0xd8
613 vpunpckhbw ymm2, ymm1, ymm0
614 vpunpcklbw ymm1, ymm1, ymm0
615 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
616 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
617 lea eax, [eax + 32]
618 sub ecx, 16
619 jg convertloop
620 vzeroupper
621 ret
622 }
623 }
624 #endif // HAS_ARGB1555TOARGBROW_AVX2
625
626 #ifdef HAS_ARGB4444TOARGBROW_AVX2
627 __declspec(naked)
628 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
629 int pix) {
630 __asm {
631 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
632 vmovd xmm4, eax
633 vbroadcastss ymm4, xmm4
634 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
635 mov eax, [esp + 4] // src_argb4444
636 mov edx, [esp + 8] // dst_argb
637 mov ecx, [esp + 12] // pix
638 sub edx, eax
639 sub edx, eax
640
641 convertloop:
642 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
643 vpand ymm2, ymm0, ymm5 // mask high nibbles
644 vpand ymm0, ymm0, ymm4 // mask low nibbles
645 vpsrlw ymm3, ymm2, 4
646 vpsllw ymm1, ymm0, 4
647 vpor ymm2, ymm2, ymm3
648 vpor ymm0, ymm0, ymm1
649 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
650 vpermq ymm2, ymm2, 0xd8
651 vpunpckhbw ymm1, ymm0, ymm2
652 vpunpcklbw ymm0, ymm0, ymm2
653 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
654 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
655 lea eax, [eax + 32]
656 sub ecx, 16
657 jg convertloop
658 vzeroupper
659 ret
660 }
661 }
662 #endif // HAS_ARGB4444TOARGBROW_AVX2
663
420 // 24 instructions 664 // 24 instructions
421 __declspec(naked) __declspec(align(16)) 665 __declspec(naked)
422 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, 666 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
423 int pix) { 667 int pix) {
424 __asm { 668 __asm {
425 mov eax, 0x01080108 // generate multiplier to repeat 5 bits 669 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
426 movd xmm5, eax 670 movd xmm5, eax
427 pshufd xmm5, xmm5, 0 671 pshufd xmm5, xmm5, 0
428 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits 672 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
429 movd xmm6, eax 673 movd xmm6, eax
430 pshufd xmm6, xmm6, 0 674 pshufd xmm6, xmm6, 0
431 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red 675 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
(...skipping 32 matching lines...)
464 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB 708 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
465 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB 709 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
466 lea eax, [eax + 16] 710 lea eax, [eax + 16]
467 sub ecx, 8 711 sub ecx, 8
468 jg convertloop 712 jg convertloop
469 ret 713 ret
470 } 714 }
471 } 715 }
472 716
473 // 18 instructions. 717 // 18 instructions.
474 __declspec(naked) __declspec(align(16)) 718 __declspec(naked)
475 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, 719 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
476 int pix) { 720 int pix) {
477 __asm { 721 __asm {
478 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f 722 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
479 movd xmm4, eax 723 movd xmm4, eax
480 pshufd xmm4, xmm4, 0 724 pshufd xmm4, xmm4, 0
481 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles 725 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
482 pslld xmm5, 4 726 pslld xmm5, 4
483 mov eax, [esp + 4] // src_argb4444 727 mov eax, [esp + 4] // src_argb4444
484 mov edx, [esp + 8] // dst_argb 728 mov edx, [esp + 8] // dst_argb
(...skipping 17 matching lines...)
502 punpckhbw xmm1, xmm2 746 punpckhbw xmm1, xmm2
503 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB 747 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
504 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB 748 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
505 lea eax, [eax + 16] 749 lea eax, [eax + 16]
506 sub ecx, 8 750 sub ecx, 8
507 jg convertloop 751 jg convertloop
508 ret 752 ret
509 } 753 }
510 } 754 }
511 755
512 __declspec(naked) __declspec(align(16)) 756 __declspec(naked)
513 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { 757 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
514 __asm { 758 __asm {
515 mov eax, [esp + 4] // src_argb 759 mov eax, [esp + 4] // src_argb
516 mov edx, [esp + 8] // dst_rgb 760 mov edx, [esp + 8] // dst_rgb
517 mov ecx, [esp + 12] // pix 761 mov ecx, [esp + 12] // pix
518 movdqa xmm6, kShuffleMaskARGBToRGB24 762 movdqa xmm6, kShuffleMaskARGBToRGB24
519 763
520 convertloop: 764 convertloop:
521 movdqu xmm0, [eax] // fetch 16 pixels of argb 765 movdqu xmm0, [eax] // fetch 16 pixels of argb
522 movdqu xmm1, [eax + 16] 766 movdqu xmm1, [eax + 16]
(...skipping 17 matching lines...)
540 por xmm2, xmm3 // 12 bytes from 3 for 2 784 por xmm2, xmm3 // 12 bytes from 3 for 2
541 movdqu [edx + 16], xmm1 // store 1 785 movdqu [edx + 16], xmm1 // store 1
542 movdqu [edx + 32], xmm2 // store 2 786 movdqu [edx + 32], xmm2 // store 2
543 lea edx, [edx + 48] 787 lea edx, [edx + 48]
544 sub ecx, 16 788 sub ecx, 16
545 jg convertloop 789 jg convertloop
546 ret 790 ret
547 } 791 }
548 } 792 }
549 793
550 __declspec(naked) __declspec(align(16)) 794 __declspec(naked)
551 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { 795 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
552 __asm { 796 __asm {
553 mov eax, [esp + 4] // src_argb 797 mov eax, [esp + 4] // src_argb
554 mov edx, [esp + 8] // dst_rgb 798 mov edx, [esp + 8] // dst_rgb
555 mov ecx, [esp + 12] // pix 799 mov ecx, [esp + 12] // pix
556 movdqa xmm6, kShuffleMaskARGBToRAW 800 movdqa xmm6, kShuffleMaskARGBToRAW
557 801
558 convertloop: 802 convertloop:
559 movdqu xmm0, [eax] // fetch 16 pixels of argb 803 movdqu xmm0, [eax] // fetch 16 pixels of argb
560 movdqu xmm1, [eax + 16] 804 movdqu xmm1, [eax + 16]
(...skipping 17 matching lines...)
578 por xmm2, xmm3 // 12 bytes from 3 for 2 822 por xmm2, xmm3 // 12 bytes from 3 for 2
579 movdqu [edx + 16], xmm1 // store 1 823 movdqu [edx + 16], xmm1 // store 1
580 movdqu [edx + 32], xmm2 // store 2 824 movdqu [edx + 32], xmm2 // store 2
581 lea edx, [edx + 48] 825 lea edx, [edx + 48]
582 sub ecx, 16 826 sub ecx, 16
583 jg convertloop 827 jg convertloop
584 ret 828 ret
585 } 829 }
586 } 830 }
587 831
588 __declspec(naked) __declspec(align(16)) 832 // 4 pixels
833 __declspec(naked)
589 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { 834 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
590 __asm { 835 __asm {
591 mov eax, [esp + 4] // src_argb 836 mov eax, [esp + 4] // src_argb
592 mov edx, [esp + 8] // dst_rgb 837 mov edx, [esp + 8] // dst_rgb
593 mov ecx, [esp + 12] // pix 838 mov ecx, [esp + 12] // pix
594 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f 839 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
595 psrld xmm3, 27 840 psrld xmm3, 27
596 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 841 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
597 psrld xmm4, 26 842 psrld xmm4, 26
598 pslld xmm4, 5 843 pslld xmm4, 5
(...skipping 16 matching lines...) Expand all
615 packssdw xmm0, xmm0 860 packssdw xmm0, xmm0
616 lea eax, [eax + 16] 861 lea eax, [eax + 16]
617 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 862 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
618 lea edx, [edx + 8] 863 lea edx, [edx + 8]
619 sub ecx, 4 864 sub ecx, 4
620 jg convertloop 865 jg convertloop
621 ret 866 ret
622 } 867 }
623 } 868 }
624 869
870 // 4 pixels
871 __declspec(naked)
872 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
873 const uint32 dither4, int pix) {
874 __asm {
875
876 mov eax, [esp + 4] // src_argb
877 mov edx, [esp + 8] // dst_rgb
878 movd xmm6, [esp + 12] // dither4
879 mov ecx, [esp + 16] // pix
880 punpcklbw xmm6, xmm6 // make dither 16 bytes
881 movdqa xmm7, xmm6
882 punpcklwd xmm6, xmm6
883 punpckhwd xmm7, xmm7
884 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
885 psrld xmm3, 27
886 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
887 psrld xmm4, 26
888 pslld xmm4, 5
889 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
890 pslld xmm5, 11
891
892 convertloop:
893 movdqu xmm0, [eax] // fetch 4 pixels of argb
894 paddusb xmm0, xmm6 // add dither
895 movdqa xmm1, xmm0 // B
896 movdqa xmm2, xmm0 // G
897 pslld xmm0, 8 // R
898 psrld xmm1, 3 // B
899 psrld xmm2, 5 // G
900 psrad xmm0, 16 // R
901 pand xmm1, xmm3 // B
902 pand xmm2, xmm4 // G
903 pand xmm0, xmm5 // R
904 por xmm1, xmm2 // BG
905 por xmm0, xmm1 // BGR
906 packssdw xmm0, xmm0
907 lea eax, [eax + 16]
908 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
909 lea edx, [edx + 8]
910 sub ecx, 4
911 jg convertloop
912 ret
913 }
914 }
915
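Per-pixel model of the dithered truncation above (a sketch; it assumes dither4 packs four per-pixel byte offsets, which the punpck sequence replicates across the row):

static __inline uint16 DitherPixelTo565(uint8 b, uint8 g, uint8 r, uint8 d) {
  uint32 b8 = (b + d > 255) ? 255 : (uint32)(b + d);  // paddusb: saturating add
  uint32 g8 = (g + d > 255) ? 255 : (uint32)(g + d);
  uint32 r8 = (r + d > 255) ? 255 : (uint32)(r + d);
  return (uint16)((b8 >> 3) | ((g8 >> 2) << 5) | ((r8 >> 3) << 11));
}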
916 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
917 __declspec(naked)
918 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
919 const uint32 dither4, int pix) {
920 __asm {
921 mov eax, [esp + 4] // src_argb
922 mov edx, [esp + 8] // dst_rgb
923 vbroadcastss xmm6, [esp + 12] // dither4
924 mov ecx, [esp + 16] // pix
925 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
926 vpermq ymm6, ymm6, 0xd8
927 vpunpcklwd ymm6, ymm6, ymm6
928 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
929 vpsrld ymm3, ymm3, 27
930 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
931 vpsrld ymm4, ymm4, 26
932 vpslld ymm4, ymm4, 5
933 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
934
935 convertloop:
936 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
937 vpaddusb ymm0, ymm0, ymm6 // add dither
938 vpsrld ymm2, ymm0, 5 // G
939 vpsrld ymm1, ymm0, 3 // B
940 vpsrld ymm0, ymm0, 8 // R
941 vpand ymm2, ymm2, ymm4 // G
942 vpand ymm1, ymm1, ymm3 // B
943 vpand ymm0, ymm0, ymm5 // R
944 vpor ymm1, ymm1, ymm2 // BG
945 vpor ymm0, ymm0, ymm1 // BGR
946 vpackusdw ymm0, ymm0, ymm0
947 vpermq ymm0, ymm0, 0xd8
948 lea eax, [eax + 32]
949 vmovdqu [edx], xmm0 // store 8 pixels of RGB565
950 lea edx, [edx + 16]
951 sub ecx, 8
952 jg convertloop
953 vzeroupper
954 ret
955 }
956 }
957 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
958
625 // TODO(fbarchard): Improve sign extension/packing. 959 // TODO(fbarchard): Improve sign extension/packing.
626 __declspec(naked) __declspec(align(16)) 960 __declspec(naked)
627 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { 961 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
628 __asm { 962 __asm {
629 mov eax, [esp + 4] // src_argb 963 mov eax, [esp + 4] // src_argb
630 mov edx, [esp + 8] // dst_rgb 964 mov edx, [esp + 8] // dst_rgb
631 mov ecx, [esp + 12] // pix 965 mov ecx, [esp + 12] // pix
632 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f 966 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
633 psrld xmm4, 27 967 psrld xmm4, 27
634 movdqa xmm5, xmm4 // generate mask 0x000003e0 968 movdqa xmm5, xmm4 // generate mask 0x000003e0
635 pslld xmm5, 5 969 pslld xmm5, 5
636 movdqa xmm6, xmm4 // generate mask 0x00007c00 970 movdqa xmm6, xmm4 // generate mask 0x00007c00
(...skipping 20 matching lines...)
657 packssdw xmm0, xmm0 991 packssdw xmm0, xmm0
658 lea eax, [eax + 16] 992 lea eax, [eax + 16]
659 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 993 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
660 lea edx, [edx + 8] 994 lea edx, [edx + 8]
661 sub ecx, 4 995 sub ecx, 4
662 jg convertloop 996 jg convertloop
663 ret 997 ret
664 } 998 }
665 } 999 }
666 1000
667 __declspec(naked) __declspec(align(16)) 1001 __declspec(naked)
668 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { 1002 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
669 __asm { 1003 __asm {
670 mov eax, [esp + 4] // src_argb 1004 mov eax, [esp + 4] // src_argb
671 mov edx, [esp + 8] // dst_rgb 1005 mov edx, [esp + 8] // dst_rgb
672 mov ecx, [esp + 12] // pix 1006 mov ecx, [esp + 12] // pix
673 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 1007 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
674 psllw xmm4, 12 1008 psllw xmm4, 12
675 movdqa xmm3, xmm4 // generate mask 0x00f000f0 1009 movdqa xmm3, xmm4 // generate mask 0x00f000f0
676 psrlw xmm3, 8 1010 psrlw xmm3, 8
677 1011
678 convertloop: 1012 convertloop:
679 movdqu xmm0, [eax] // fetch 4 pixels of argb 1013 movdqu xmm0, [eax] // fetch 4 pixels of argb
680 movdqa xmm1, xmm0 1014 movdqa xmm1, xmm0
681 pand xmm0, xmm3 // low nibble 1015 pand xmm0, xmm3 // low nibble
682 pand xmm1, xmm4 // high nibble 1016 pand xmm1, xmm4 // high nibble
683 psrld xmm0, 4 1017 psrld xmm0, 4
684 psrld xmm1, 8 1018 psrld xmm1, 8
685 por xmm0, xmm1 1019 por xmm0, xmm1
686 packuswb xmm0, xmm0 1020 packuswb xmm0, xmm0
687 lea eax, [eax + 16] 1021 lea eax, [eax + 16]
688 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 1022 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
689 lea edx, [edx + 8] 1023 lea edx, [edx + 8]
690 sub ecx, 4 1024 sub ecx, 4
691 jg convertloop 1025 jg convertloop
692 ret 1026 ret
693 } 1027 }
694 } 1028 }
695 1029
696 #ifdef HAS_ARGBTORGB565ROW_AVX2 1030 #ifdef HAS_ARGBTORGB565ROW_AVX2
697 __declspec(naked) __declspec(align(16)) 1031 __declspec(naked)
698 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { 1032 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
699 __asm { 1033 __asm {
700 mov eax, [esp + 4] // src_argb 1034 mov eax, [esp + 4] // src_argb
701 mov edx, [esp + 8] // dst_rgb 1035 mov edx, [esp + 8] // dst_rgb
702 mov ecx, [esp + 12] // pix 1036 mov ecx, [esp + 12] // pix
703 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f 1037 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
704 vpsrld ymm3, ymm3, 27 1038 vpsrld ymm3, ymm3, 27
705 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 1039 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
706 vpsrld ymm4, ymm4, 26 1040 vpsrld ymm4, ymm4, 26
707 vpslld ymm4, ymm4, 5 1041 vpslld ymm4, ymm4, 5
708 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800 1042 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
709 vpslld ymm5, ymm5, 11
710 1043
711 convertloop: 1044 convertloop:
712 vmovdqu ymm0, [eax] // fetch 8 pixels of argb 1045 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
713 vpsrld ymm2, ymm0, 5 // G 1046 vpsrld ymm2, ymm0, 5 // G
714 vpsrld ymm1, ymm0, 3 // B 1047 vpsrld ymm1, ymm0, 3 // B
715 vpslld ymm0, ymm0, 8 // R 1048 vpsrld ymm0, ymm0, 8 // R
716 vpand ymm2, ymm2, ymm4 // G 1049 vpand ymm2, ymm2, ymm4 // G
717 vpand ymm1, ymm1, ymm3 // B 1050 vpand ymm1, ymm1, ymm3 // B
718 vpsrad ymm0, ymm0, 16 // R
719 vpand ymm0, ymm0, ymm5 // R 1051 vpand ymm0, ymm0, ymm5 // R
720 vpor ymm1, ymm1, ymm2 // BG 1052 vpor ymm1, ymm1, ymm2 // BG
721 vpor ymm0, ymm0, ymm1 // BGR 1053 vpor ymm0, ymm0, ymm1 // BGR
722 vpackssdw ymm0, ymm0, ymm0 1054 vpackusdw ymm0, ymm0, ymm0
723 vpermq ymm0, ymm0, 0xd8 1055 vpermq ymm0, ymm0, 0xd8
724 lea eax, [eax + 32] 1056 lea eax, [eax + 32]
725 vmovdqu [edx], xmm0 // store 8 pixels of RGB565 1057 vmovdqu [edx], xmm0 // store 8 pixels of RGB565
726 lea edx, [edx + 16] 1058 lea edx, [edx + 16]
727 sub ecx, 8 1059 sub ecx, 8
728 jg convertloop 1060 jg convertloop
729 vzeroupper 1061 vzeroupper
730 ret 1062 ret
731 } 1063 }
732 } 1064 }
733 #endif // HAS_ARGBTORGB565ROW_AVX2 1065 #endif // HAS_ARGBTORGB565ROW_AVX2
734 1066
735 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 1067 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
736 __declspec(naked) __declspec(align(16)) 1068 __declspec(naked)
737 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { 1069 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
738 __asm { 1070 __asm {
739 mov eax, [esp + 4] // src_argb 1071 mov eax, [esp + 4] // src_argb
740 mov edx, [esp + 8] // dst_rgb 1072 mov edx, [esp + 8] // dst_rgb
741 mov ecx, [esp + 12] // pix 1073 mov ecx, [esp + 12] // pix
742 vpcmpeqb ymm4, ymm4, ymm4 1074 vpcmpeqb ymm4, ymm4, ymm4
743 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f 1075 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
744 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 1076 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
745 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 1077 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
746 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 1078 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
(...skipping 19 matching lines...) Expand all
766 lea edx, [edx + 16] 1098 lea edx, [edx + 16]
767 sub ecx, 8 1099 sub ecx, 8
768 jg convertloop 1100 jg convertloop
769 vzeroupper 1101 vzeroupper
770 ret 1102 ret
771 } 1103 }
772 } 1104 }
773 #endif // HAS_ARGBTOARGB1555ROW_AVX2 1105 #endif // HAS_ARGBTOARGB1555ROW_AVX2
774 1106
775 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 1107 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
776 __declspec(naked) __declspec(align(16)) 1108 __declspec(naked)
777 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { 1109 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
778 __asm { 1110 __asm {
779 mov eax, [esp + 4] // src_argb 1111 mov eax, [esp + 4] // src_argb
780 mov edx, [esp + 8] // dst_rgb 1112 mov edx, [esp + 8] // dst_rgb
781 mov ecx, [esp + 12] // pix 1113 mov ecx, [esp + 12] // pix
782 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 1114 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
783 vpsllw ymm4, ymm4, 12 1115 vpsllw ymm4, ymm4, 12
784 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 1116 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
785 1117
786 convertloop: 1118 convertloop:
(...skipping 10 matching lines...)
797 lea edx, [edx + 16] 1129 lea edx, [edx + 16]
798 sub ecx, 8 1130 sub ecx, 8
799 jg convertloop 1131 jg convertloop
800 vzeroupper 1132 vzeroupper
801 ret 1133 ret
802 } 1134 }
803 } 1135 }
804 #endif // HAS_ARGBTOARGB4444ROW_AVX2 1136 #endif // HAS_ARGBTOARGB4444ROW_AVX2
805 1137
806 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. 1138 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
807 __declspec(naked) __declspec(align(16)) 1139 __declspec(naked)
808 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1140 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
809 __asm { 1141 __asm {
810 mov eax, [esp + 4] /* src_argb */ 1142 mov eax, [esp + 4] /* src_argb */
811 mov edx, [esp + 8] /* dst_y */ 1143 mov edx, [esp + 8] /* dst_y */
812 mov ecx, [esp + 12] /* pix */ 1144 mov ecx, [esp + 12] /* pix */
813 movdqa xmm4, kARGBToY 1145 movdqa xmm4, kARGBToY
814 movdqa xmm5, kAddY16 1146 movdqa xmm5, kAddY16
815 1147
816 convertloop: 1148 convertloop:
817 movdqu xmm0, [eax] 1149 movdqu xmm0, [eax]
(...skipping 14 matching lines...)
832 movdqu [edx], xmm0 1164 movdqu [edx], xmm0
833 lea edx, [edx + 16] 1165 lea edx, [edx + 16]
834 sub ecx, 16 1166 sub ecx, 16
835 jg convertloop 1167 jg convertloop
836 ret 1168 ret
837 } 1169 }
838 } 1170 }
839 1171
840 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. 1172 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
841 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 1173 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
842 __declspec(naked) __declspec(align(16)) 1174 __declspec(naked)
843 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1175 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
844 __asm { 1176 __asm {
845 mov eax, [esp + 4] /* src_argb */ 1177 mov eax, [esp + 4] /* src_argb */
846 mov edx, [esp + 8] /* dst_y */ 1178 mov edx, [esp + 8] /* dst_y */
847 mov ecx, [esp + 12] /* pix */ 1179 mov ecx, [esp + 12] /* pix */
848 movdqa xmm4, kARGBToYJ 1180 movdqa xmm4, kARGBToYJ
849 movdqa xmm5, kAddYJ64 1181 movdqa xmm5, kAddYJ64
850 1182
851 convertloop: 1183 convertloop:
852 movdqu xmm0, [eax] 1184 movdqu xmm0, [eax]
(...skipping 20 matching lines...)
873 } 1205 }
874 } 1206 }
875 1207
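Per-pixel math of the two row functions above, as a scalar sketch (coefficients read off kARGBToY/kARGBToYJ; the function names are illustrative):

static __inline uint8 ARGBPixelToY(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);  // BT.601, studio swing
}
static __inline uint8 ARGBPixelToYJ(uint8 b, uint8 g, uint8 r) {
  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);    // JPEG full range; +64 rounds
}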
876 #ifdef HAS_ARGBTOYROW_AVX2 1208 #ifdef HAS_ARGBTOYROW_AVX2
877 // vpermd for vphaddw + vpackuswb vpermd. 1209 // vpermd for vphaddw + vpackuswb vpermd.
878 static const lvec32 kPermdARGBToY_AVX = { 1210 static const lvec32 kPermdARGBToY_AVX = {
879 0, 4, 1, 5, 2, 6, 3, 7 1211 0, 4, 1, 5, 2, 6, 3, 7
880 }; 1212 };
881 1213
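(vphaddw and vpackuswb operate within each 128-bit lane of a ymm register, so without a fixup the 32 Y bytes would come out lane-interleaved; the { 0, 4, 1, 5, 2, 6, 3, 7 } dword permute restores sequential pixel order.)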
882 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 1214 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
883 __declspec(naked) __declspec(align(32)) 1215 __declspec(naked)
884 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { 1216 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
885 __asm { 1217 __asm {
886 mov eax, [esp + 4] /* src_argb */ 1218 mov eax, [esp + 4] /* src_argb */
887 mov edx, [esp + 8] /* dst_y */ 1219 mov edx, [esp + 8] /* dst_y */
888 mov ecx, [esp + 12] /* pix */ 1220 mov ecx, [esp + 12] /* pix */
889 vbroadcastf128 ymm4, kARGBToY 1221 vbroadcastf128 ymm4, kARGBToY
890 vbroadcastf128 ymm5, kAddY16 1222 vbroadcastf128 ymm5, kAddY16
891 vmovdqu ymm6, kPermdARGBToY_AVX 1223 vmovdqu ymm6, kPermdARGBToY_AVX
892 1224
893 convertloop: 1225 convertloop:
(...skipping 16 matching lines...)
910 vmovdqu [edx], ymm0 1242 vmovdqu [edx], ymm0
911 lea edx, [edx + 32] 1243 lea edx, [edx + 32]
912 sub ecx, 32 1244 sub ecx, 32
913 jg convertloop 1245 jg convertloop
914 vzeroupper 1246 vzeroupper
915 ret 1247 ret
916 } 1248 }
917 } 1249 }
918 #endif // HAS_ARGBTOYROW_AVX2 1250 #endif // HAS_ARGBTOYROW_AVX2
919 1251
920 #ifdef HAS_ARGBTOYROW_AVX2 1252 #ifdef HAS_ARGBTOYJROW_AVX2
921 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 1253 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
922 __declspec(naked) __declspec(align(32)) 1254 __declspec(naked)
923 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { 1255 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
924 __asm { 1256 __asm {
925 mov eax, [esp + 4] /* src_argb */ 1257 mov eax, [esp + 4] /* src_argb */
926 mov edx, [esp + 8] /* dst_y */ 1258 mov edx, [esp + 8] /* dst_y */
927 mov ecx, [esp + 12] /* pix */ 1259 mov ecx, [esp + 12] /* pix */
928 vbroadcastf128 ymm4, kARGBToYJ 1260 vbroadcastf128 ymm4, kARGBToYJ
929 vbroadcastf128 ymm5, kAddYJ64 1261 vbroadcastf128 ymm5, kAddYJ64
930 vmovdqu ymm6, kPermdARGBToY_AVX 1262 vmovdqu ymm6, kPermdARGBToY_AVX
931 1263
932 convertloop: 1264 convertloop:
(...skipping 18 matching lines...)
951 lea edx, [edx + 32] 1283 lea edx, [edx + 32]
952 sub ecx, 32 1284 sub ecx, 32
953 jg convertloop 1285 jg convertloop
954 1286
955 vzeroupper 1287 vzeroupper
956 ret 1288 ret
957 } 1289 }
958 } 1290 }
959 #endif // HAS_ARGBTOYJROW_AVX2 1291 #endif // HAS_ARGBTOYJROW_AVX2
960 1292
961 __declspec(naked) __declspec(align(16)) 1293 __declspec(naked)
962 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1294 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
963 __asm { 1295 __asm {
964 mov eax, [esp + 4] /* src_argb */ 1296 mov eax, [esp + 4] /* src_argb */
965 mov edx, [esp + 8] /* dst_y */ 1297 mov edx, [esp + 8] /* dst_y */
966 mov ecx, [esp + 12] /* pix */ 1298 mov ecx, [esp + 12] /* pix */
967 movdqa xmm4, kBGRAToY 1299 movdqa xmm4, kBGRAToY
968 movdqa xmm5, kAddY16 1300 movdqa xmm5, kAddY16
969 1301
970 convertloop: 1302 convertloop:
971 movdqu xmm0, [eax] 1303 movdqu xmm0, [eax]
(...skipping 12 matching lines...)
984 packuswb xmm0, xmm2 1316 packuswb xmm0, xmm2
985 paddb xmm0, xmm5 1317 paddb xmm0, xmm5
986 movdqu [edx], xmm0 1318 movdqu [edx], xmm0
987 lea edx, [edx + 16] 1319 lea edx, [edx + 16]
988 sub ecx, 16 1320 sub ecx, 16
989 jg convertloop 1321 jg convertloop
990 ret 1322 ret
991 } 1323 }
992 } 1324 }
993 1325
994 __declspec(naked) __declspec(align(16)) 1326 __declspec(naked)
995 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1327 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
996 __asm { 1328 __asm {
997 mov eax, [esp + 4] /* src_argb */ 1329 mov eax, [esp + 4] /* src_argb */
998 mov edx, [esp + 8] /* dst_y */ 1330 mov edx, [esp + 8] /* dst_y */
999 mov ecx, [esp + 12] /* pix */ 1331 mov ecx, [esp + 12] /* pix */
1000 movdqa xmm4, kABGRToY 1332 movdqa xmm4, kABGRToY
1001 movdqa xmm5, kAddY16 1333 movdqa xmm5, kAddY16
1002 1334
1003 convertloop: 1335 convertloop:
1004 movdqu xmm0, [eax] 1336 movdqu xmm0, [eax]
(...skipping 12 matching lines...)
1017 packuswb xmm0, xmm2 1349 packuswb xmm0, xmm2
1018 paddb xmm0, xmm5 1350 paddb xmm0, xmm5
1019 movdqu [edx], xmm0 1351 movdqu [edx], xmm0
1020 lea edx, [edx + 16] 1352 lea edx, [edx + 16]
1021 sub ecx, 16 1353 sub ecx, 16
1022 jg convertloop 1354 jg convertloop
1023 ret 1355 ret
1024 } 1356 }
1025 } 1357 }
1026 1358
1027 __declspec(naked) __declspec(align(16)) 1359 __declspec(naked)
1028 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1360 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1029 __asm { 1361 __asm {
1030 mov eax, [esp + 4] /* src_argb */ 1362 mov eax, [esp + 4] /* src_argb */
1031 mov edx, [esp + 8] /* dst_y */ 1363 mov edx, [esp + 8] /* dst_y */
1032 mov ecx, [esp + 12] /* pix */ 1364 mov ecx, [esp + 12] /* pix */
1033 movdqa xmm4, kRGBAToY 1365 movdqa xmm4, kRGBAToY
1034 movdqa xmm5, kAddY16 1366 movdqa xmm5, kAddY16
1035 1367
1036 convertloop: 1368 convertloop:
1037 movdqu xmm0, [eax] 1369 movdqu xmm0, [eax]
(...skipping 12 matching lines...)
1050 packuswb xmm0, xmm2 1382 packuswb xmm0, xmm2
1051 paddb xmm0, xmm5 1383 paddb xmm0, xmm5
1052 movdqu [edx], xmm0 1384 movdqu [edx], xmm0
1053 lea edx, [edx + 16] 1385 lea edx, [edx + 16]
1054 sub ecx, 16 1386 sub ecx, 16
1055 jg convertloop 1387 jg convertloop
1056 ret 1388 ret
1057 } 1389 }
1058 } 1390 }
1059 1391
1060 __declspec(naked) __declspec(align(16)) 1392 __declspec(naked)
1061 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1393 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1062 uint8* dst_u, uint8* dst_v, int width) { 1394 uint8* dst_u, uint8* dst_v, int width) {
1063 __asm { 1395 __asm {
1064 push esi 1396 push esi
1065 push edi 1397 push edi
1066 mov eax, [esp + 8 + 4] // src_argb 1398 mov eax, [esp + 8 + 4] // src_argb
1067 mov esi, [esp + 8 + 8] // src_stride_argb 1399 mov esi, [esp + 8 + 8] // src_stride_argb
1068 mov edx, [esp + 8 + 12] // dst_u 1400 mov edx, [esp + 8 + 12] // dst_u
1069 mov edi, [esp + 8 + 16] // dst_v 1401 mov edi, [esp + 8 + 16] // dst_v
1070 mov ecx, [esp + 8 + 20] // pix 1402 mov ecx, [esp + 8 + 20] // pix
(...skipping 49 matching lines...)
1120 lea edx, [edx + 8] 1452 lea edx, [edx + 8]
1121 sub ecx, 16 1453 sub ecx, 16
1122 jg convertloop 1454 jg convertloop
1123 1455
1124 pop edi 1456 pop edi
1125 pop esi 1457 pop esi
1126 ret 1458 ret
1127 } 1459 }
1128 } 1460 }
1129 1461
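Scalar model of the subsampled U/V computed above (a sketch assuming libyuv's C reference coefficients, which live in the skipped constants block; ar/ag/ab are the 2x2-averaged channels):

static __inline uint8 AveragedRGBToU(int ar, int ag, int ab) {
  return (uint8)((112 * ab - 74 * ag - 38 * ar + 0x8080) >> 8);  // +128 bias, rounded
}
static __inline uint8 AveragedRGBToV(int ar, int ag, int ab) {
  return (uint8)((112 * ar - 94 * ag - 18 * ab + 0x8080) >> 8);
}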
1130 __declspec(naked) __declspec(align(16)) 1462 __declspec(naked)
1131 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1463 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1132 uint8* dst_u, uint8* dst_v, int width) { 1464 uint8* dst_u, uint8* dst_v, int width) {
1133 __asm { 1465 __asm {
1134 push esi 1466 push esi
1135 push edi 1467 push edi
1136 mov eax, [esp + 8 + 4] // src_argb 1468 mov eax, [esp + 8 + 4] // src_argb
1137 mov esi, [esp + 8 + 8] // src_stride_argb 1469 mov esi, [esp + 8 + 8] // src_stride_argb
1138 mov edx, [esp + 8 + 12] // dst_u 1470 mov edx, [esp + 8 + 12] // dst_u
1139 mov edi, [esp + 8 + 16] // dst_v 1471 mov edi, [esp + 8 + 16] // dst_v
1140 mov ecx, [esp + 8 + 20] // pix 1472 mov ecx, [esp + 8 + 20] // pix
(...skipping 51 matching lines...)
1192 sub ecx, 16 1524 sub ecx, 16
1193 jg convertloop 1525 jg convertloop
1194 1526
1195 pop edi 1527 pop edi
1196 pop esi 1528 pop esi
1197 ret 1529 ret
1198 } 1530 }
1199 } 1531 }
1200 1532
1201 #ifdef HAS_ARGBTOUVROW_AVX2 1533 #ifdef HAS_ARGBTOUVROW_AVX2
1202 __declspec(naked) __declspec(align(32)) 1534 __declspec(naked)
1203 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, 1535 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1204 uint8* dst_u, uint8* dst_v, int width) { 1536 uint8* dst_u, uint8* dst_v, int width) {
1205 __asm { 1537 __asm {
1206 push esi 1538 push esi
1207 push edi 1539 push edi
1208 mov eax, [esp + 8 + 4] // src_argb 1540 mov eax, [esp + 8 + 4] // src_argb
1209 mov esi, [esp + 8 + 8] // src_stride_argb 1541 mov esi, [esp + 8 + 8] // src_stride_argb
1210 mov edx, [esp + 8 + 12] // dst_u 1542 mov edx, [esp + 8 + 12] // dst_u
1211 mov edi, [esp + 8 + 16] // dst_v 1543 mov edi, [esp + 8 + 16] // dst_v
1212 mov ecx, [esp + 8 + 20] // pix 1544 mov ecx, [esp + 8 + 20] // pix
(...skipping 44 matching lines...)
1257 jg convertloop 1589 jg convertloop
1258 1590
1259 pop edi 1591 pop edi
1260 pop esi 1592 pop esi
1261 vzeroupper 1593 vzeroupper
1262 ret 1594 ret
1263 } 1595 }
1264 } 1596 }
1265 #endif // HAS_ARGBTOUVROW_AVX2 1597 #endif // HAS_ARGBTOUVROW_AVX2
1266 1598
1267 __declspec(naked) __declspec(align(16)) 1599 __declspec(naked)
1268 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, 1600 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
1269 uint8* dst_u, uint8* dst_v, int width) { 1601 uint8* dst_u, uint8* dst_v, int width) {
1270 __asm { 1602 __asm {
1271 push edi 1603 push edi
1272 mov eax, [esp + 4 + 4] // src_argb 1604 mov eax, [esp + 4 + 4] // src_argb
1273 mov edx, [esp + 4 + 8] // dst_u 1605 mov edx, [esp + 4 + 8] // dst_u
1274 mov edi, [esp + 4 + 12] // dst_v 1606 mov edi, [esp + 4 + 12] // dst_v
1275 mov ecx, [esp + 4 + 16] // pix 1607 mov ecx, [esp + 4 + 16] // pix
1276 movdqa xmm5, kAddUV128 1608 movdqa xmm5, kAddUV128
1277 movdqa xmm6, kARGBToV 1609 movdqa xmm6, kARGBToV
(...skipping 36 matching lines...)
1314 movdqu [edx + edi], xmm0 1646 movdqu [edx + edi], xmm0
1315 lea edx, [edx + 16] 1647 lea edx, [edx + 16]
1316 sub ecx, 16 1648 sub ecx, 16
1317 jg convertloop 1649 jg convertloop
1318 1650
1319 pop edi 1651 pop edi
1320 ret 1652 ret
1321 } 1653 }
1322 } 1654 }
1323 1655
1324 __declspec(naked) __declspec(align(16)) 1656 __declspec(naked)
1325 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, 1657 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1326 uint8* dst_u, uint8* dst_v, int width) { 1658 uint8* dst_u, uint8* dst_v, int width) {
1327 __asm { 1659 __asm {
1328 push edi 1660 push edi
1329 mov eax, [esp + 4 + 4] // src_argb 1661 mov eax, [esp + 4 + 4] // src_argb
1330 mov edx, [esp + 4 + 8] // dst_u 1662 mov edx, [esp + 4 + 8] // dst_u
1331 mov edi, [esp + 4 + 12] // dst_v 1663 mov edi, [esp + 4 + 12] // dst_v
1332 mov ecx, [esp + 4 + 16] // pix 1664 mov ecx, [esp + 4 + 16] // pix
1333 movdqa xmm5, kAddUV128 1665 movdqa xmm5, kAddUV128
1334 movdqa xmm6, kARGBToV 1666 movdqa xmm6, kARGBToV
(...skipping 37 matching lines...)
1372 movhps qword ptr [edx + edi], xmm0 // V 1704 movhps qword ptr [edx + edi], xmm0 // V
1373 lea edx, [edx + 8] 1705 lea edx, [edx + 8]
1374 sub ecx, 16 1706 sub ecx, 16
1375 jg convertloop 1707 jg convertloop
1376 1708
1377 pop edi 1709 pop edi
1378 ret 1710 ret
1379 } 1711 }
1380 } 1712 }
1381 1713
1382 __declspec(naked) __declspec(align(16)) 1714 __declspec(naked)
1383 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1715 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1384 uint8* dst_u, uint8* dst_v, int width) { 1716 uint8* dst_u, uint8* dst_v, int width) {
1385 __asm { 1717 __asm {
1386 push esi 1718 push esi
1387 push edi 1719 push edi
1388 mov eax, [esp + 8 + 4] // src_argb 1720 mov eax, [esp + 8 + 4] // src_argb
1389 mov esi, [esp + 8 + 8] // src_stride_argb 1721 mov esi, [esp + 8 + 8] // src_stride_argb
1390 mov edx, [esp + 8 + 12] // dst_u 1722 mov edx, [esp + 8 + 12] // dst_u
1391 mov edi, [esp + 8 + 16] // dst_v 1723 mov edi, [esp + 8 + 16] // dst_v
1392 mov ecx, [esp + 8 + 20] // pix 1724 mov ecx, [esp + 8 + 20] // pix
(...skipping 49 matching lines...)
1442 lea edx, [edx + 8] 1774 lea edx, [edx + 8]
1443 sub ecx, 16 1775 sub ecx, 16
1444 jg convertloop 1776 jg convertloop
1445 1777
1446 pop edi 1778 pop edi
1447 pop esi 1779 pop esi
1448 ret 1780 ret
1449 } 1781 }
1450 } 1782 }
1451 1783
1452 __declspec(naked) __declspec(align(16)) 1784 __declspec(naked)
1453 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1785 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1454 uint8* dst_u, uint8* dst_v, int width) { 1786 uint8* dst_u, uint8* dst_v, int width) {
1455 __asm { 1787 __asm {
1456 push esi 1788 push esi
1457 push edi 1789 push edi
1458 mov eax, [esp + 8 + 4] // src_argb 1790 mov eax, [esp + 8 + 4] // src_argb
1459 mov esi, [esp + 8 + 8] // src_stride_argb 1791 mov esi, [esp + 8 + 8] // src_stride_argb
1460 mov edx, [esp + 8 + 12] // dst_u 1792 mov edx, [esp + 8 + 12] // dst_u
1461 mov edi, [esp + 8 + 16] // dst_v 1793 mov edi, [esp + 8 + 16] // dst_v
1462 mov ecx, [esp + 8 + 20] // pix 1794 mov ecx, [esp + 8 + 20] // pix
(...skipping 49 matching lines...)
1512 lea edx, [edx + 8] 1844 lea edx, [edx + 8]
1513 sub ecx, 16 1845 sub ecx, 16
1514 jg convertloop 1846 jg convertloop
1515 1847
1516 pop edi 1848 pop edi
1517 pop esi 1849 pop esi
1518 ret 1850 ret
1519 } 1851 }
1520 } 1852 }
1521 1853
1522 __declspec(naked) __declspec(align(16)) 1854 __declspec(naked)
1523 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1855 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1524 uint8* dst_u, uint8* dst_v, int width) { 1856 uint8* dst_u, uint8* dst_v, int width) {
1525 __asm { 1857 __asm {
1526 push esi 1858 push esi
1527 push edi 1859 push edi
1528 mov eax, [esp + 8 + 4] // src_argb 1860 mov eax, [esp + 8 + 4] // src_argb
1529 mov esi, [esp + 8 + 8] // src_stride_argb 1861 mov esi, [esp + 8 + 8] // src_stride_argb
1530 mov edx, [esp + 8 + 12] // dst_u 1862 mov edx, [esp + 8 + 12] // dst_u
1531 mov edi, [esp + 8 + 16] // dst_v 1863 mov edi, [esp + 8 + 16] // dst_v
1532 mov ecx, [esp + 8 + 20] // pix 1864 mov ecx, [esp + 8 + 20] // pix
(...skipping 50 matching lines...)
1583 sub ecx, 16 1915 sub ecx, 16
1584 jg convertloop 1916 jg convertloop
1585 1917
1586 pop edi 1918 pop edi
1587 pop esi 1919 pop esi
1588 ret 1920 ret
1589 } 1921 }
1590 } 1922 }
1591 #endif // HAS_ARGBTOYROW_SSSE3 1923 #endif // HAS_ARGBTOYROW_SSSE3
1592 1924
1925 // Read 16 UV from 444
1926 #define READYUV444_AVX2 __asm { \
1927 __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \
1928 __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \
1929 __asm lea esi, [esi + 16] \
1930 __asm vpermq ymm0, ymm0, 0xd8 \
1931 __asm vpermq ymm1, ymm1, 0xd8 \
1932 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1933 }
1934
1593 // Read 8 UV from 422, upsample to 16 UV. 1935 // Read 8 UV from 422, upsample to 16 UV.
1594 #define READYUV422_AVX2 __asm { \ 1936 #define READYUV422_AVX2 __asm { \
1595 __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ 1937 __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
1596 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ 1938 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
1597 __asm lea esi, [esi + 8] \ 1939 __asm lea esi, [esi + 8] \
1598 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 1940 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1599 __asm vpermq ymm0, ymm0, 0xd8 \ 1941 __asm vpermq ymm0, ymm0, 0xd8 \
1600 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 1942 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1601 } 1943 }
1602 1944
1945 // Read 4 UV from 411, upsample to 16 UV.
1946 #define READYUV411_AVX2 __asm { \
1947 __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \
1948 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \
1949 __asm lea esi, [esi + 4] \
1950 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1951 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1952 __asm vpermq ymm0, ymm0, 0xd8 \
1953 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
1954 }
1955
1603 // Read 8 UV from NV12, upsample to 16 UV. 1956 // Read 8 UV from NV12, upsample to 16 UV.
1604 #define READNV12_AVX2 __asm { \ 1957 #define READNV12_AVX2 __asm { \
1605 __asm vmovdqu xmm0, [esi] /* UV */ \ 1958 __asm vmovdqu xmm0, [esi] /* UV */ \
1606 __asm lea esi, [esi + 16] \ 1959 __asm lea esi, [esi + 16] \
1607 __asm vpermq ymm0, ymm0, 0xd8 \ 1960 __asm vpermq ymm0, ymm0, 0xd8 \
1608 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 1961 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1609 } 1962 }
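For reference, the four READ* macros above differ only in chroma layout and horizontal subsampling. A minimal scalar sketch of the addressing they implement (illustrative only, not part of this patch; helper names are hypothetical and uint8_t stands in for libyuv's uint8):

#include <stdint.h>

// Scalar view of READYUV444/422/411_AVX2: one (U,V) pair per pixel x, with
// each chroma sample reused for 'subsample' consecutive pixels.
static inline void ReadUVScalar(const uint8_t* u_buf, const uint8_t* v_buf,
                                int subsample,  // 1 = 444, 2 = 422, 4 = 411
                                int x, uint8_t* u, uint8_t* v) {
  *u = u_buf[x / subsample];
  *v = v_buf[x / subsample];
}

// Scalar view of READNV12_AVX2: U and V interleaved in a single plane,
// subsampled by 2 horizontally.
static inline void ReadNV12Scalar(const uint8_t* uv_buf, int x,
                                  uint8_t* u, uint8_t* v) {
  *u = uv_buf[(x / 2) * 2 + 0];
  *v = uv_buf[(x / 2) * 2 + 1];
}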
1610 1963
1611 // Convert 16 pixels: 16 UV and 16 Y. 1964 // Convert 16 pixels: 16 UV and 16 Y.
1612 #define YUVTORGB_AVX2(YuvConstants) __asm { \ 1965 #define YUVTORGB_AVX2(YuvConstants) __asm { \
(...skipping 26 matching lines...)
1639 1992
1640 // Store 16 ARGB values. 1993 // Store 16 ARGB values.
1641 #define STOREARGB_AVX2 __asm { \ 1994 #define STOREARGB_AVX2 __asm { \
1642 /* Step 3: Weave into ARGB */ \ 1995 /* Step 3: Weave into ARGB */ \
1643 __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ 1996 __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
1644 __asm vpermq ymm0, ymm0, 0xd8 \ 1997 __asm vpermq ymm0, ymm0, 0xd8 \
1645 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ 1998 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
1646 __asm vpermq ymm2, ymm2, 0xd8 \ 1999 __asm vpermq ymm2, ymm2, 0xd8 \
1647 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ 2000 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
1648 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ 2001 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
1649 __asm vmovdqu [edx], ymm1 \ 2002 __asm vmovdqu 0[edx], ymm1 \
1650 __asm vmovdqu [edx + 32], ymm0 \ 2003 __asm vmovdqu 32[edx], ymm0 \
1651 __asm lea edx, [edx + 64] \ 2004 __asm lea edx, [edx + 64] \
1652 } 2005 }
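STOREARGB_AVX2 weaves the planar B, G, R results and the all-ones alpha register into interleaved pixels. Per pixel that reduces to the following scalar sketch (illustrative only; ARGB here means little-endian B, G, R, A byte order):

#include <stdint.h>

// Scalar view of STOREARGB_AVX2: libyuv ARGB is stored little-endian,
// i.e. bytes B, G, R, A in memory.
static inline void StoreARGBScalar(uint8_t* dst_argb, int x,
                                   uint8_t b, uint8_t g, uint8_t r) {
  dst_argb[x * 4 + 0] = b;
  dst_argb[x * 4 + 1] = g;
  dst_argb[x * 4 + 2] = r;
  dst_argb[x * 4 + 3] = 0xff;  // alpha from the vpcmpeqb-generated ones
}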
1653 2006
1654 #ifdef HAS_I422TOARGBROW_AVX2 2007 #ifdef HAS_I422TOARGBROW_AVX2
1655 // 16 pixels 2008 // 16 pixels
1656 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2009 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
1657 __declspec(naked) __declspec(align(16)) 2010 __declspec(naked)
1658 void I422ToARGBRow_AVX2(const uint8* y_buf, 2011 void I422ToARGBRow_AVX2(const uint8* y_buf,
1659 const uint8* u_buf, 2012 const uint8* u_buf,
1660 const uint8* v_buf, 2013 const uint8* v_buf,
1661 uint8* dst_argb, 2014 uint8* dst_argb,
1662 int width) { 2015 int width) {
1663 __asm { 2016 __asm {
1664 push esi 2017 push esi
1665 push edi 2018 push edi
1666 mov eax, [esp + 8 + 4] // Y 2019 mov eax, [esp + 8 + 4] // Y
1667 mov esi, [esp + 8 + 8] // U 2020 mov esi, [esp + 8 + 8] // U
(...skipping 12 matching lines...)
1680 jg convertloop 2033 jg convertloop
1681 2034
1682 pop edi 2035 pop edi
1683 pop esi 2036 pop esi
1684 vzeroupper 2037 vzeroupper
1685 ret 2038 ret
1686 } 2039 }
1687 } 2040 }
1688 #endif // HAS_I422TOARGBROW_AVX2 2041 #endif // HAS_I422TOARGBROW_AVX2
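Row functions such as this convert one row at a time; the image-level loop lives in the caller. A hypothetical driver sketch (the real dispatch is elsewhere in libyuv; the function and parameter names here are illustrative, and widths that are not a multiple of 16 are assumed to be handled by the _Any variants):

#include <stdint.h>

// Hypothetical caller: apply an I422 row converter to a whole image.
// I422 has one chroma row per luma row, subsampled 2x horizontally.
typedef void (*I422ToARGBRowFn)(const uint8_t* y_buf, const uint8_t* u_buf,
                                const uint8_t* v_buf, uint8_t* dst_argb,
                                int width);

static void ConvertI422Image(const uint8_t* src_y, int src_stride_y,
                             const uint8_t* src_u, int src_stride_u,
                             const uint8_t* src_v, int src_stride_v,
                             uint8_t* dst_argb, int dst_stride_argb,
                             int width, int height, I422ToARGBRowFn row) {
  for (int y = 0; y < height; ++y) {
    row(src_y, src_u, src_v, dst_argb, width);
    src_y += src_stride_y;
    src_u += src_stride_u;  // one chroma row per luma row in I422
    src_v += src_stride_v;
    dst_argb += dst_stride_argb;
  }
}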
1689 2042
2043 #ifdef HAS_J422TOARGBROW_AVX2
2044 // 16 pixels
2045 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2046 __declspec(naked)
2047 void J422ToARGBRow_AVX2(const uint8* y_buf,
2048 const uint8* u_buf,
2049 const uint8* v_buf,
2050 uint8* dst_argb,
2051 int width) {
2052 __asm {
2053 push esi
2054 push edi
2055 mov eax, [esp + 8 + 4] // Y
2056 mov esi, [esp + 8 + 8] // U
2057 mov edi, [esp + 8 + 12] // V
2058 mov edx, [esp + 8 + 16] // argb
2059 mov ecx, [esp + 8 + 20] // width
2060 sub edi, esi
2061 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2062
2063 convertloop:
2064 READYUV422_AVX2
2065 YUVTORGB_AVX2(kYuvJConstants)
2066 STOREARGB_AVX2
2067
2068 sub ecx, 16
2069 jg convertloop
2070
2071 pop edi
2072 pop esi
2073 vzeroupper
2074 ret
2075 }
2076 }
2077 #endif // HAS_J422TOARGBROW_AVX2
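J422 uses the same READ/STORE path and differs from I422 only through kYuvJConstants: JPEG YCbCr is full range, so there is no 16 offset or 1.164 gain on Y. As a hedged floating-point reference (the standard JFIF equations, not the fixed-point constants actually used here):

#include <stdint.h>

static inline uint8_t ClampToByte(float v) {
  return (uint8_t)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
}

// Full-range (JPEG) YUV to RGB for one pixel; what kYuvJConstants
// approximates in fixed point.
static inline void J422PixelScalar(uint8_t y, uint8_t u, uint8_t v,
                                   uint8_t* b, uint8_t* g, uint8_t* r) {
  float fy = (float)y;            // no -16 bias, unit gain on Y
  float fu = (float)u - 128.f;
  float fv = (float)v - 128.f;
  *r = ClampToByte(fy + 1.402f * fv);
  *g = ClampToByte(fy - 0.344f * fu - 0.714f * fv);
  *b = ClampToByte(fy + 1.772f * fu);
}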
2078
2079 #ifdef HAS_I444TOARGBROW_AVX2
2080 // 16 pixels
2081 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2082 __declspec(naked)
2083 void I444ToARGBRow_AVX2(const uint8* y_buf,
2084 const uint8* u_buf,
2085 const uint8* v_buf,
2086 uint8* dst_argb,
2087 int width) {
2088 __asm {
2089 push esi
2090 push edi
2091 mov eax, [esp + 8 + 4] // Y
2092 mov esi, [esp + 8 + 8] // U
2093 mov edi, [esp + 8 + 12] // V
2094 mov edx, [esp + 8 + 16] // argb
2095 mov ecx, [esp + 8 + 20] // width
2096 sub edi, esi
2097 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2098
2099 convertloop:
2100 READYUV444_AVX2
2101 YUVTORGB_AVX2(kYuvConstants)
2102 STOREARGB_AVX2
2103
2104 sub ecx, 16
2105 jg convertloop
2106
2107 pop edi
2108 pop esi
2109 vzeroupper
2110 ret
2111 }
2112 }
2113 #endif // HAS_I444TOARGBROW_AVX2
2114
2115 #ifdef HAS_I411TOARGBROW_AVX2
2116 // 16 pixels
2117 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2118 __declspec(naked)
2119 void I411ToARGBRow_AVX2(const uint8* y_buf,
2120 const uint8* u_buf,
2121 const uint8* v_buf,
2122 uint8* dst_argb,
2123 int width) {
2124 __asm {
2125 push esi
2126 push edi
2127 mov eax, [esp + 8 + 4] // Y
2128 mov esi, [esp + 8 + 8] // U
2129 mov edi, [esp + 8 + 12] // V
2130 mov edx, [esp + 8 + 16] // argb
2131 mov ecx, [esp + 8 + 20] // width
2132 sub edi, esi
2133 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2134
2135 convertloop:
2136 READYUV411_AVX2
2137 YUVTORGB_AVX2(kYuvConstants)
2138 STOREARGB_AVX2
2139
2140 sub ecx, 16
2141 jg convertloop
2142
2143 pop edi
2144 pop esi
2145 vzeroupper
2146 ret
2147 }
2148 }
2149 #endif // HAS_I411TOARGBROW_AVX2
2150
1690 #ifdef HAS_NV12TOARGBROW_AVX2 2151 #ifdef HAS_NV12TOARGBROW_AVX2
1691 // 16 pixels. 2152 // 16 pixels.
1692 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2153 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
1693 __declspec(naked) __declspec(align(16)) 2154 __declspec(naked)
1694 void NV12ToARGBRow_AVX2(const uint8* y_buf, 2155 void NV12ToARGBRow_AVX2(const uint8* y_buf,
1695 const uint8* uv_buf, 2156 const uint8* uv_buf,
1696 uint8* dst_argb, 2157 uint8* dst_argb,
1697 int width) { 2158 int width) {
1698 __asm { 2159 __asm {
1699 push esi 2160 push esi
1700 mov eax, [esp + 4 + 4] // Y 2161 mov eax, [esp + 4 + 4] // Y
1701 mov esi, [esp + 4 + 8] // UV 2162 mov esi, [esp + 4 + 8] // UV
1702 mov edx, [esp + 4 + 12] // argb 2163 mov edx, [esp + 4 + 12] // argb
1703 mov ecx, [esp + 4 + 16] // width 2164 mov ecx, [esp + 4 + 16] // width
1704 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2165 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
1705 2166
1706 convertloop: 2167 convertloop:
1707 READNV12_AVX2 2168 READNV12_AVX2
1708 YUVTORGB_AVX2(kYuvConstants) 2169 YUVTORGB_AVX2(kYuvConstants)
1709 STOREARGB_AVX2 2170 STOREARGB_AVX2
1710 2171
1711 sub ecx, 16 2172 sub ecx, 16
1712 jg convertloop 2173 jg convertloop
1713 2174
1714 pop esi 2175 pop esi
2176 vzeroupper
1715 ret 2177 ret
1716 } 2178 }
1717 } 2179 }
1718 #endif // HAS_NV12TOARGBROW_AVX2 2180 #endif // HAS_NV12TOARGBROW_AVX2
1719 2181
1720 #ifdef HAS_NV21TOARGBROW_AVX2 2182 #ifdef HAS_NV21TOARGBROW_AVX2
1721 // 16 pixels. 2183 // 16 pixels.
1722 // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). 2184 // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
1723 __declspec(naked) __declspec(align(16)) 2185 __declspec(naked)
1724 void NV21ToARGBRow_AVX2(const uint8* y_buf, 2186 void NV21ToARGBRow_AVX2(const uint8* y_buf,
1725 const uint8* uv_buf, 2187 const uint8* uv_buf,
1726 uint8* dst_argb, 2188 uint8* dst_argb,
1727 int width) { 2189 int width) {
1728 __asm { 2190 __asm {
1729 push esi 2191 push esi
1730 mov eax, [esp + 4 + 4] // Y 2192 mov eax, [esp + 4 + 4] // Y
1731 mov esi, [esp + 4 + 8] // UV 2193 mov esi, [esp + 4 + 8] // UV
1732 mov edx, [esp + 4 + 12] // argb 2194 mov edx, [esp + 4 + 12] // argb
1733 mov ecx, [esp + 4 + 16] // width 2195 mov ecx, [esp + 4 + 16] // width
1734 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2196 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
1735 2197
1736 convertloop: 2198 convertloop:
1737 READNV12_AVX2 2199 READNV12_AVX2
1738 YUVTORGB_AVX2(kYvuConstants) 2200 YUVTORGB_AVX2(kYvuConstants)
1739 STOREARGB_AVX2 2201 STOREARGB_AVX2
1740 2202
1741 sub ecx, 16 2203 sub ecx, 16
1742 jg convertloop 2204 jg convertloop
1743 2205
1744 pop esi 2206 pop esi
2207 vzeroupper
1745 ret 2208 ret
1746 } 2209 }
1747 } 2210 }
1748 #endif // HAS_NV21TOARGBROW_AVX2 2211 #endif // HAS_NV21TOARGBROW_AVX2
1749 2212
1750 #ifdef HAS_I422TOBGRAROW_AVX2 2213 #ifdef HAS_I422TOBGRAROW_AVX2
1751 // 16 pixels 2214 // 16 pixels
1752 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). 2215 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
1753 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 2216 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
1754 __declspec(naked) __declspec(align(16)) 2217 __declspec(naked)
1755 void I422ToBGRARow_AVX2(const uint8* y_buf, 2218 void I422ToBGRARow_AVX2(const uint8* y_buf,
1756 const uint8* u_buf, 2219 const uint8* u_buf,
1757 const uint8* v_buf, 2220 const uint8* v_buf,
1758 uint8* dst_argb, 2221 uint8* dst_argb,
1759 int width) { 2222 int width) {
1760 __asm { 2223 __asm {
1761 push esi 2224 push esi
1762 push edi 2225 push edi
1763 mov eax, [esp + 8 + 4] // Y 2226 mov eax, [esp + 8 + 4] // Y
1764 mov esi, [esp + 8 + 8] // U 2227 mov esi, [esp + 8 + 8] // U
(...skipping 25 matching lines...)
1790 vzeroupper 2253 vzeroupper
1791 ret 2254 ret
1792 } 2255 }
1793 } 2256 }
1794 #endif // HAS_I422TOBGRAROW_AVX2 2257 #endif // HAS_I422TOBGRAROW_AVX2
1795 2258
1796 #ifdef HAS_I422TORGBAROW_AVX2 2259 #ifdef HAS_I422TORGBAROW_AVX2
1797 // 16 pixels 2260 // 16 pixels
1798 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 2261 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
1799 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 2262 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
1800 __declspec(naked) __declspec(align(16)) 2263 __declspec(naked)
1801 void I422ToRGBARow_AVX2(const uint8* y_buf, 2264 void I422ToRGBARow_AVX2(const uint8* y_buf,
1802 const uint8* u_buf, 2265 const uint8* u_buf,
1803 const uint8* v_buf, 2266 const uint8* v_buf,
1804 uint8* dst_argb, 2267 uint8* dst_argb,
1805 int width) { 2268 int width) {
1806 __asm { 2269 __asm {
1807 push esi 2270 push esi
1808 push edi 2271 push edi
1809 mov eax, [esp + 8 + 4] // Y 2272 mov eax, [esp + 8 + 4] // Y
1810 mov esi, [esp + 8 + 8] // U 2273 mov esi, [esp + 8 + 8] // U
(...skipping 25 matching lines...)
1836 vzeroupper 2299 vzeroupper
1837 ret 2300 ret
1838 } 2301 }
1839 } 2302 }
1840 #endif // HAS_I422TORGBAROW_AVX2 2303 #endif // HAS_I422TORGBAROW_AVX2
1841 2304
1842 #ifdef HAS_I422TOABGRROW_AVX2 2305 #ifdef HAS_I422TOABGRROW_AVX2
1843 // 16 pixels 2306 // 16 pixels
1844 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). 2307 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
1845 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 2308 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
1846 __declspec(naked) __declspec(align(16)) 2309 __declspec(naked)
1847 void I422ToABGRRow_AVX2(const uint8* y_buf, 2310 void I422ToABGRRow_AVX2(const uint8* y_buf,
1848 const uint8* u_buf, 2311 const uint8* u_buf,
1849 const uint8* v_buf, 2312 const uint8* v_buf,
1850 uint8* dst_argb, 2313 uint8* dst_argb,
1851 int width) { 2314 int width) {
1852 __asm { 2315 __asm {
1853 push esi 2316 push esi
1854 push edi 2317 push edi
1855 mov eax, [esp + 8 + 4] // Y 2318 mov eax, [esp + 8 + 4] // Y
1856 mov esi, [esp + 8 + 8] // U 2319 mov esi, [esp + 8 + 8] // U
(...skipping 50 matching lines...)
1907 2370
1908 // Read 2 UV from 411, upsample to 8 UV. 2371 // Read 2 UV from 411, upsample to 8 UV.
1909 #define READYUV411 __asm { \ 2372 #define READYUV411 __asm { \
1910 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ 2373 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
1911 __asm movd xmm0, ebx \ 2374 __asm movd xmm0, ebx \
1912 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ 2375 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
1913 __asm movd xmm1, ebx \ 2376 __asm movd xmm1, ebx \
1914 __asm lea esi, [esi + 2] \ 2377 __asm lea esi, [esi + 2] \
1915 __asm punpcklbw xmm0, xmm1 /* UV */ \ 2378 __asm punpcklbw xmm0, xmm1 /* UV */ \
1916 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ 2379 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
1917 __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ 2380 __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
1918 } 2381 }
1919 2382
1920 // Read 4 UV from NV12, upsample to 8 UV. 2383 // Read 4 UV from NV12, upsample to 8 UV.
1921 #define READNV12 __asm { \ 2384 #define READNV12 __asm { \
1922 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ 2385 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
1923 __asm lea esi, [esi + 8] \ 2386 __asm lea esi, [esi + 8] \
1924 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ 2387 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
1925 } 2388 }
1926 2389
1927 // Convert 8 pixels: 8 UV and 8 Y. 2390 // Convert 8 pixels: 8 UV and 8 Y.
(...skipping 28 matching lines...)
1956 } 2419 }
1957 2420
1958 // Store 8 ARGB values. 2421 // Store 8 ARGB values.
1959 #define STOREARGB __asm { \ 2422 #define STOREARGB __asm { \
1960 /* Step 3: Weave into ARGB */ \ 2423 /* Step 3: Weave into ARGB */ \
1961 __asm punpcklbw xmm0, xmm1 /* BG */ \ 2424 __asm punpcklbw xmm0, xmm1 /* BG */ \
1962 __asm punpcklbw xmm2, xmm5 /* RA */ \ 2425 __asm punpcklbw xmm2, xmm5 /* RA */ \
1963 __asm movdqa xmm1, xmm0 \ 2426 __asm movdqa xmm1, xmm0 \
1964 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ 2427 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
1965 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ 2428 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
1966 __asm movdqu [edx], xmm0 \ 2429 __asm movdqu 0[edx], xmm0 \
1967 __asm movdqu [edx + 16], xmm1 \ 2430 __asm movdqu 16[edx], xmm1 \
1968 __asm lea edx, [edx + 32] \ 2431 __asm lea edx, [edx + 32] \
1969 } 2432 }
1970 2433
1971 // Store 8 BGRA values. 2434 // Store 8 BGRA values.
1972 #define STOREBGRA __asm { \ 2435 #define STOREBGRA __asm { \
1973 /* Step 3: Weave into BGRA */ \ 2436 /* Step 3: Weave into BGRA */ \
1974 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ 2437 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
1975 __asm punpcklbw xmm1, xmm0 /* GB */ \ 2438 __asm punpcklbw xmm1, xmm0 /* GB */ \
1976 __asm punpcklbw xmm5, xmm2 /* AR */ \ 2439 __asm punpcklbw xmm5, xmm2 /* AR */ \
1977 __asm movdqa xmm0, xmm5 \ 2440 __asm movdqa xmm0, xmm5 \
1978 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ 2441 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
1979 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ 2442 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
1980 __asm movdqu [edx], xmm5 \ 2443 __asm movdqu 0[edx], xmm5 \
1981 __asm movdqu [edx + 16], xmm0 \ 2444 __asm movdqu 16[edx], xmm0 \
1982 __asm lea edx, [edx + 32] \ 2445 __asm lea edx, [edx + 32] \
1983 } 2446 }
1984 2447
1985 // Store 8 ABGR values. 2448 // Store 8 ABGR values.
1986 #define STOREABGR __asm { \ 2449 #define STOREABGR __asm { \
1987 /* Step 3: Weave into ABGR */ \ 2450 /* Step 3: Weave into ABGR */ \
1988 __asm punpcklbw xmm2, xmm1 /* RG */ \ 2451 __asm punpcklbw xmm2, xmm1 /* RG */ \
1989 __asm punpcklbw xmm0, xmm5 /* BA */ \ 2452 __asm punpcklbw xmm0, xmm5 /* BA */ \
1990 __asm movdqa xmm1, xmm2 \ 2453 __asm movdqa xmm1, xmm2 \
1991 __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ 2454 __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
1992 __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ 2455 __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
1993 __asm movdqu [edx], xmm2 \ 2456 __asm movdqu 0[edx], xmm2 \
1994 __asm movdqu [edx + 16], xmm1 \ 2457 __asm movdqu 16[edx], xmm1 \
1995 __asm lea edx, [edx + 32] \ 2458 __asm lea edx, [edx + 32] \
1996 } 2459 }
1997 2460
1998 // Store 8 RGBA values. 2461 // Store 8 RGBA values.
1999 #define STORERGBA __asm { \ 2462 #define STORERGBA __asm { \
2000 /* Step 3: Weave into RGBA */ \ 2463 /* Step 3: Weave into RGBA */ \
2001 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ 2464 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2002 __asm punpcklbw xmm1, xmm2 /* GR */ \ 2465 __asm punpcklbw xmm1, xmm2 /* GR */ \
2003 __asm punpcklbw xmm5, xmm0 /* AB */ \ 2466 __asm punpcklbw xmm5, xmm0 /* AB */ \
2004 __asm movdqa xmm0, xmm5 \ 2467 __asm movdqa xmm0, xmm5 \
2005 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ 2468 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
2006 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ 2469 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
2007 __asm movdqu [edx], xmm5 \ 2470 __asm movdqu 0[edx], xmm5 \
2008 __asm movdqu [edx + 16], xmm0 \ 2471 __asm movdqu 16[edx], xmm0 \
2009 __asm lea edx, [edx + 32] \ 2472 __asm lea edx, [edx + 32] \
2010 } 2473 }
2011 2474
2012 // Store 8 RGB24 values. 2475 // Store 8 RGB24 values.
2013 #define STORERGB24 __asm { \ 2476 #define STORERGB24 __asm { \
2014 /* Step 3: Weave into RRGB */ \ 2477 /* Step 3: Weave into RRGB */ \
2015 __asm punpcklbw xmm0, xmm1 /* BG */ \ 2478 __asm punpcklbw xmm0, xmm1 /* BG */ \
2016 __asm punpcklbw xmm2, xmm2 /* RR */ \ 2479 __asm punpcklbw xmm2, xmm2 /* RR */ \
2017 __asm movdqa xmm1, xmm0 \ 2480 __asm movdqa xmm1, xmm0 \
2018 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ 2481 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2019 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ 2482 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
2020 /* Step 4: RRGB -> RGB24 */ \ 2483 /* Step 4: RRGB -> RGB24 */ \
2021 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ 2484 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2022 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ 2485 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
2023 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ 2486 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2024 __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ 2487 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
2025 __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ 2488 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
2026 __asm lea edx, [edx + 24] \ 2489 __asm lea edx, [edx + 24] \
2027 } 2490 }
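STORERGB24 packs the weave results down to 3 bytes per pixel, dropping alpha. A scalar sketch (illustrative only; libyuv's RGB24 is B, G, R byte order, and the RAW variant below is the same with R and B swapped):

#include <stdint.h>

// Scalar view of STORERGB24: tightly packed 3-byte pixels, no alpha.
static inline void StoreRGB24Scalar(uint8_t* dst_rgb24, int x,
                                    uint8_t b, uint8_t g, uint8_t r) {
  dst_rgb24[x * 3 + 0] = b;
  dst_rgb24[x * 3 + 1] = g;
  dst_rgb24[x * 3 + 2] = r;
}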
2028 2491
2029 // Store 8 RAW values. 2492 // Store 8 RAW values.
2030 #define STORERAW __asm { \ 2493 #define STORERAW __asm { \
2031 /* Step 3: Weave into RRGB */ \ 2494 /* Step 3: Weave into RRGB */ \
2032 __asm punpcklbw xmm0, xmm1 /* BG */ \ 2495 __asm punpcklbw xmm0, xmm1 /* BG */ \
2033 __asm punpcklbw xmm2, xmm2 /* RR */ \ 2496 __asm punpcklbw xmm2, xmm2 /* RR */ \
2034 __asm movdqa xmm1, xmm0 \ 2497 __asm movdqa xmm1, xmm0 \
2035 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ 2498 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2036 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ 2499 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
2037 /* Step 4: RRGB -> RAW */ \ 2500 /* Step 4: RRGB -> RAW */ \
2038 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ 2501 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2039 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ 2502 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
2040 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ 2503 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2041 __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ 2504 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
2042 __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ 2505 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
2043 __asm lea edx, [edx + 24] \ 2506 __asm lea edx, [edx + 24] \
2044 } 2507 }
2045 2508
2046 // Store 8 RGB565 values. 2509 // Store 8 RGB565 values.
2047 #define STORERGB565 __asm { \ 2510 #define STORERGB565 __asm { \
2048 /* Step 3: Weave into RRGB */ \ 2511 /* Step 3: Weave into RRGB */ \
2049 __asm punpcklbw xmm0, xmm1 /* BG */ \ 2512 __asm punpcklbw xmm0, xmm1 /* BG */ \
2050 __asm punpcklbw xmm2, xmm2 /* RR */ \ 2513 __asm punpcklbw xmm2, xmm2 /* RR */ \
2051 __asm movdqa xmm1, xmm0 \ 2514 __asm movdqa xmm1, xmm0 \
2052 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ 2515 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
(...skipping 15 matching lines...)
2068 __asm pslld xmm1, 8 /* R */ \ 2531 __asm pslld xmm1, 8 /* R */ \
2069 __asm psrld xmm3, 3 /* B */ \ 2532 __asm psrld xmm3, 3 /* B */ \
2070 __asm psrld xmm2, 5 /* G */ \ 2533 __asm psrld xmm2, 5 /* G */ \
2071 __asm psrad xmm1, 16 /* R */ \ 2534 __asm psrad xmm1, 16 /* R */ \
2072 __asm pand xmm3, xmm5 /* B */ \ 2535 __asm pand xmm3, xmm5 /* B */ \
2073 __asm pand xmm2, xmm6 /* G */ \ 2536 __asm pand xmm2, xmm6 /* G */ \
2074 __asm pand xmm1, xmm7 /* R */ \ 2537 __asm pand xmm1, xmm7 /* R */ \
2075 __asm por xmm3, xmm2 /* BG */ \ 2538 __asm por xmm3, xmm2 /* BG */ \
2076 __asm por xmm1, xmm3 /* BGR */ \ 2539 __asm por xmm1, xmm3 /* BGR */ \
2077 __asm packssdw xmm0, xmm1 \ 2540 __asm packssdw xmm0, xmm1 \
2078 __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \ 2541 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
2079 __asm lea edx, [edx + 16] \ 2542 __asm lea edx, [edx + 16] \
2080 } 2543 }
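The shift-and-mask sequence above implements the usual 8:8:8 to 5:6:5 truncation; per pixel it reduces to this scalar sketch (illustrative only):

#include <stdint.h>

// Scalar view of STORERGB565: blue in the low 5 bits, green in the middle
// 6, red in the high 5, stored as little-endian 16-bit words.
static inline uint16_t PackRGB565Scalar(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)((b >> 3) | ((uint16_t)(g >> 2) << 5) |
                    ((uint16_t)(r >> 3) << 11));
}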
2081 2544
2082 // 8 pixels. 2545 // 8 pixels.
2083 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 2546 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2084 __declspec(naked) __declspec(align(16)) 2547 __declspec(naked)
2085 void I444ToARGBRow_SSSE3(const uint8* y_buf, 2548 void I444ToARGBRow_SSSE3(const uint8* y_buf,
2086 const uint8* u_buf, 2549 const uint8* u_buf,
2087 const uint8* v_buf, 2550 const uint8* v_buf,
2088 uint8* dst_argb, 2551 uint8* dst_argb,
2089 int width) { 2552 int width) {
2090 __asm { 2553 __asm {
2091 push esi 2554 push esi
2092 push edi 2555 push edi
2093 mov eax, [esp + 8 + 4] // Y 2556 mov eax, [esp + 8 + 4] // Y
2094 mov esi, [esp + 8 + 8] // U 2557 mov esi, [esp + 8 + 8] // U
(...skipping 12 matching lines...)
2107 jg convertloop 2570 jg convertloop
2108 2571
2109 pop edi 2572 pop edi
2110 pop esi 2573 pop esi
2111 ret 2574 ret
2112 } 2575 }
2113 } 2576 }
2114 2577
2115 // 8 pixels. 2578 // 8 pixels.
2116 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 2579 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2117 __declspec(naked) __declspec(align(16)) 2580 __declspec(naked)
2118 void I422ToRGB24Row_SSSE3(const uint8* y_buf, 2581 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
2119 const uint8* u_buf, 2582 const uint8* u_buf,
2120 const uint8* v_buf, 2583 const uint8* v_buf,
2121 uint8* dst_rgb24, 2584 uint8* dst_rgb24,
2122 int width) { 2585 int width) {
2123 __asm { 2586 __asm {
2124 push esi 2587 push esi
2125 push edi 2588 push edi
2126 mov eax, [esp + 8 + 4] // Y 2589 mov eax, [esp + 8 + 4] // Y
2127 mov esi, [esp + 8 + 8] // U 2590 mov esi, [esp + 8 + 8] // U
(...skipping 13 matching lines...)
2141 jg convertloop 2604 jg convertloop
2142 2605
2143 pop edi 2606 pop edi
2144 pop esi 2607 pop esi
2145 ret 2608 ret
2146 } 2609 }
2147 } 2610 }
2148 2611
2149 // 8 pixels. 2612 // 8 pixels.
2150 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). 2613 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
2151 __declspec(naked) __declspec(align(16)) 2614 __declspec(naked)
2152 void I422ToRAWRow_SSSE3(const uint8* y_buf, 2615 void I422ToRAWRow_SSSE3(const uint8* y_buf,
2153 const uint8* u_buf, 2616 const uint8* u_buf,
2154 const uint8* v_buf, 2617 const uint8* v_buf,
2155 uint8* dst_raw, 2618 uint8* dst_raw,
2156 int width) { 2619 int width) {
2157 __asm { 2620 __asm {
2158 push esi 2621 push esi
2159 push edi 2622 push edi
2160 mov eax, [esp + 8 + 4] // Y 2623 mov eax, [esp + 8 + 4] // Y
2161 mov esi, [esp + 8 + 8] // U 2624 mov esi, [esp + 8 + 8] // U
(...skipping 13 matching lines...)
2175 jg convertloop 2638 jg convertloop
2176 2639
2177 pop edi 2640 pop edi
2178 pop esi 2641 pop esi
2179 ret 2642 ret
2180 } 2643 }
2181 } 2644 }
2182 2645
2183 // 8 pixels 2646 // 8 pixels
2184 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). 2647 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2185 __declspec(naked) __declspec(align(16)) 2648 __declspec(naked)
2186 void I422ToRGB565Row_SSSE3(const uint8* y_buf, 2649 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
2187 const uint8* u_buf, 2650 const uint8* u_buf,
2188 const uint8* v_buf, 2651 const uint8* v_buf,
2189 uint8* rgb565_buf, 2652 uint8* rgb565_buf,
2190 int width) { 2653 int width) {
2191 __asm { 2654 __asm {
2192 push esi 2655 push esi
2193 push edi 2656 push edi
2194 mov eax, [esp + 8 + 4] // Y 2657 mov eax, [esp + 8 + 4] // Y
2195 mov esi, [esp + 8 + 8] // U 2658 mov esi, [esp + 8 + 8] // U
(...skipping 18 matching lines...)
2214 jg convertloop 2677 jg convertloop
2215 2678
2216 pop edi 2679 pop edi
2217 pop esi 2680 pop esi
2218 ret 2681 ret
2219 } 2682 }
2220 } 2683 }
2221 2684
2222 // 8 pixels. 2685 // 8 pixels.
2223 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2686 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2224 __declspec(naked) __declspec(align(16)) 2687 __declspec(naked)
2225 void I422ToARGBRow_SSSE3(const uint8* y_buf, 2688 void I422ToARGBRow_SSSE3(const uint8* y_buf,
2226 const uint8* u_buf, 2689 const uint8* u_buf,
2227 const uint8* v_buf, 2690 const uint8* v_buf,
2228 uint8* dst_argb, 2691 uint8* dst_argb,
2229 int width) { 2692 int width) {
2230 __asm { 2693 __asm {
2231 push esi 2694 push esi
2232 push edi 2695 push edi
2233 mov eax, [esp + 8 + 4] // Y 2696 mov eax, [esp + 8 + 4] // Y
2234 mov esi, [esp + 8 + 8] // U 2697 mov esi, [esp + 8 + 8] // U
(...skipping 11 matching lines...)
2246 sub ecx, 8 2709 sub ecx, 8
2247 jg convertloop 2710 jg convertloop
2248 2711
2249 pop edi 2712 pop edi
2250 pop esi 2713 pop esi
2251 ret 2714 ret
2252 } 2715 }
2253 } 2716 }
2254 2717
2255 // 8 pixels. 2718 // 8 pixels.
2719 // JPEG color space version of I422ToARGB
2720 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2721 __declspec(naked)
2722 void J422ToARGBRow_SSSE3(const uint8* y_buf,
2723 const uint8* u_buf,
2724 const uint8* v_buf,
2725 uint8* dst_argb,
2726 int width) {
2727 __asm {
2728 push esi
2729 push edi
2730 mov eax, [esp + 8 + 4] // Y
2731 mov esi, [esp + 8 + 8] // U
2732 mov edi, [esp + 8 + 12] // V
2733 mov edx, [esp + 8 + 16] // argb
2734 mov ecx, [esp + 8 + 20] // width
2735 sub edi, esi
2736 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2737
2738 convertloop:
2739 READYUV422
2740 YUVTORGB(kYuvJConstants)
2741 STOREARGB
2742
2743 sub ecx, 8
2744 jg convertloop
2745
2746 pop edi
2747 pop esi
2748 ret
2749 }
2750 }
2751
2752 // 8 pixels.
2256 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2753 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2257 // Similar to I420 but duplicate UV once more. 2754 // Similar to I420 but duplicate UV once more.
2258 __declspec(naked) __declspec(align(16)) 2755 __declspec(naked)
2259 void I411ToARGBRow_SSSE3(const uint8* y_buf, 2756 void I411ToARGBRow_SSSE3(const uint8* y_buf,
2260 const uint8* u_buf, 2757 const uint8* u_buf,
2261 const uint8* v_buf, 2758 const uint8* v_buf,
2262 uint8* dst_argb, 2759 uint8* dst_argb,
2263 int width) { 2760 int width) {
2264 __asm { 2761 __asm {
2265 push ebx 2762 push ebx
2266 push esi 2763 push esi
2267 push edi 2764 push edi
2268 mov eax, [esp + 12 + 4] // Y 2765 mov eax, [esp + 12 + 4] // Y
(...skipping 14 matching lines...)
2283 2780
2284 pop edi 2781 pop edi
2285 pop esi 2782 pop esi
2286 pop ebx 2783 pop ebx
2287 ret 2784 ret
2288 } 2785 }
2289 } 2786 }
2290 2787
2291 // 8 pixels. 2788 // 8 pixels.
2292 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2789 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2293 __declspec(naked) __declspec(align(16)) 2790 __declspec(naked)
2294 void NV12ToARGBRow_SSSE3(const uint8* y_buf, 2791 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
2295 const uint8* uv_buf, 2792 const uint8* uv_buf,
2296 uint8* dst_argb, 2793 uint8* dst_argb,
2297 int width) { 2794 int width) {
2298 __asm { 2795 __asm {
2299 push esi 2796 push esi
2300 mov eax, [esp + 4 + 4] // Y 2797 mov eax, [esp + 4 + 4] // Y
2301 mov esi, [esp + 4 + 8] // UV 2798 mov esi, [esp + 4 + 8] // UV
2302 mov edx, [esp + 4 + 12] // argb 2799 mov edx, [esp + 4 + 12] // argb
2303 mov ecx, [esp + 4 + 16] // width 2800 mov ecx, [esp + 4 + 16] // width
2304 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2801 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2305 2802
2306 convertloop: 2803 convertloop:
2307 READNV12 2804 READNV12
2308 YUVTORGB(kYuvConstants) 2805 YUVTORGB(kYuvConstants)
2309 STOREARGB 2806 STOREARGB
2310 2807
2311 sub ecx, 8 2808 sub ecx, 8
2312 jg convertloop 2809 jg convertloop
2313 2810
2314 pop esi 2811 pop esi
2315 ret 2812 ret
2316 } 2813 }
2317 } 2814 }
2318 2815
2319 // 8 pixels. 2816 // 8 pixels.
2320 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). 2817 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
2321 __declspec(naked) __declspec(align(16)) 2818 __declspec(naked)
2322 void NV21ToARGBRow_SSSE3(const uint8* y_buf, 2819 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
2323 const uint8* uv_buf, 2820 const uint8* uv_buf,
2324 uint8* dst_argb, 2821 uint8* dst_argb,
2325 int width) { 2822 int width) {
2326 __asm { 2823 __asm {
2327 push esi 2824 push esi
2328 mov eax, [esp + 4 + 4] // Y 2825 mov eax, [esp + 4 + 4] // Y
2329 mov esi, [esp + 4 + 8] // UV 2826 mov esi, [esp + 4 + 8] // UV
2330 mov edx, [esp + 4 + 12] // argb 2827 mov edx, [esp + 4 + 12] // argb
2331 mov ecx, [esp + 4 + 16] // width 2828 mov ecx, [esp + 4 + 16] // width
2332 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2829 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2333 2830
2334 convertloop: 2831 convertloop:
2335 READNV12 2832 READNV12
2336 YUVTORGB(kYvuConstants) 2833 YUVTORGB(kYvuConstants)
2337 STOREARGB 2834 STOREARGB
2338 2835
2339 sub ecx, 8 2836 sub ecx, 8
2340 jg convertloop 2837 jg convertloop
2341 2838
2342 pop esi 2839 pop esi
2343 ret 2840 ret
2344 } 2841 }
2345 } 2842 }
2346 2843
2347 __declspec(naked) __declspec(align(16)) 2844 __declspec(naked)
2348 void I422ToBGRARow_SSSE3(const uint8* y_buf, 2845 void I422ToBGRARow_SSSE3(const uint8* y_buf,
2349 const uint8* u_buf, 2846 const uint8* u_buf,
2350 const uint8* v_buf, 2847 const uint8* v_buf,
2351 uint8* dst_bgra, 2848 uint8* dst_bgra,
2352 int width) { 2849 int width) {
2353 __asm { 2850 __asm {
2354 push esi 2851 push esi
2355 push edi 2852 push edi
2356 mov eax, [esp + 8 + 4] // Y 2853 mov eax, [esp + 8 + 4] // Y
2357 mov esi, [esp + 8 + 8] // U 2854 mov esi, [esp + 8 + 8] // U
2358 mov edi, [esp + 8 + 12] // V 2855 mov edi, [esp + 8 + 12] // V
2359 mov edx, [esp + 8 + 16] // bgra 2856 mov edx, [esp + 8 + 16] // bgra
2360 mov ecx, [esp + 8 + 20] // width 2857 mov ecx, [esp + 8 + 20] // width
2361 sub edi, esi 2858 sub edi, esi
2362 2859
2363 convertloop: 2860 convertloop:
2364 READYUV422 2861 READYUV422
2365 YUVTORGB(kYuvConstants) 2862 YUVTORGB(kYuvConstants)
2366 STOREBGRA 2863 STOREBGRA
2367 2864
2368 sub ecx, 8 2865 sub ecx, 8
2369 jg convertloop 2866 jg convertloop
2370 2867
2371 pop edi 2868 pop edi
2372 pop esi 2869 pop esi
2373 ret 2870 ret
2374 } 2871 }
2375 } 2872 }
2376 2873
2377 __declspec(naked) __declspec(align(16)) 2874 __declspec(naked)
2378 void I422ToABGRRow_SSSE3(const uint8* y_buf, 2875 void I422ToABGRRow_SSSE3(const uint8* y_buf,
2379 const uint8* u_buf, 2876 const uint8* u_buf,
2380 const uint8* v_buf, 2877 const uint8* v_buf,
2381 uint8* dst_abgr, 2878 uint8* dst_abgr,
2382 int width) { 2879 int width) {
2383 __asm { 2880 __asm {
2384 push esi 2881 push esi
2385 push edi 2882 push edi
2386 mov eax, [esp + 8 + 4] // Y 2883 mov eax, [esp + 8 + 4] // Y
2387 mov esi, [esp + 8 + 8] // U 2884 mov esi, [esp + 8 + 8] // U
(...skipping 10 matching lines...)
2398 2895
2399 sub ecx, 8 2896 sub ecx, 8
2400 jg convertloop 2897 jg convertloop
2401 2898
2402 pop edi 2899 pop edi
2403 pop esi 2900 pop esi
2404 ret 2901 ret
2405 } 2902 }
2406 } 2903 }
2407 2904
2408 __declspec(naked) __declspec(align(16)) 2905 __declspec(naked)
2409 void I422ToRGBARow_SSSE3(const uint8* y_buf, 2906 void I422ToRGBARow_SSSE3(const uint8* y_buf,
2410 const uint8* u_buf, 2907 const uint8* u_buf,
2411 const uint8* v_buf, 2908 const uint8* v_buf,
2412 uint8* dst_rgba, 2909 uint8* dst_rgba,
2413 int width) { 2910 int width) {
2414 __asm { 2911 __asm {
2415 push esi 2912 push esi
2416 push edi 2913 push edi
2417 mov eax, [esp + 8 + 4] // Y 2914 mov eax, [esp + 8 + 4] // Y
2418 mov esi, [esp + 8 + 8] // U 2915 mov esi, [esp + 8 + 8] // U
(...skipping 11 matching lines...)
2430 jg convertloop 2927 jg convertloop
2431 2928
2432 pop edi 2929 pop edi
2433 pop esi 2930 pop esi
2434 ret 2931 ret
2435 } 2932 }
2436 } 2933 }
2437 2934
2438 #endif // HAS_I422TOARGBROW_SSSE3 2935 #endif // HAS_I422TOARGBROW_SSSE3
2439 2936
2440 #ifdef HAS_YTOARGBROW_SSE2 2937 #ifdef HAS_I400TOARGBROW_SSE2
2441 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). 2938 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
2442 __declspec(naked) __declspec(align(16)) 2939 __declspec(naked)
2443 void YToARGBRow_SSE2(const uint8* y_buf, 2940 void I400ToARGBRow_SSE2(const uint8* y_buf,
2444 uint8* rgb_buf, 2941 uint8* rgb_buf,
2445 int width) { 2942 int width) {
2446 __asm { 2943 __asm {
2447 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) 2944 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
2448 movd xmm2, eax 2945 movd xmm2, eax
2449 pshufd xmm2, xmm2, 0 2946 pshufd xmm2, xmm2, 0
2450 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) 2947 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
2451 movd xmm3, eax 2948 movd xmm3, eax
2452 pshufd xmm3, xmm3, 0 2949 pshufd xmm3, xmm3, 0
2453 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 2950 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
2454 pslld xmm4, 24 2951 pslld xmm4, 24
2455 2952
(...skipping 19 matching lines...)
2475 por xmm0, xmm4 2972 por xmm0, xmm4
2476 por xmm1, xmm4 2973 por xmm1, xmm4
2477 movdqu [edx], xmm0 2974 movdqu [edx], xmm0
2478 movdqu [edx + 16], xmm1 2975 movdqu [edx + 16], xmm1
2479 lea edx, [edx + 32] 2976 lea edx, [edx + 32]
2480 sub ecx, 8 2977 sub ecx, 8
2481 jg convertloop 2978 jg convertloop
2482 ret 2979 ret
2483 } 2980 }
2484 } 2981 }
2485 #endif // HAS_YTOARGBROW_SSE2 2982 #endif // HAS_I400TOARGBROW_SSE2
2486 2983
2487 #ifdef HAS_YTOARGBROW_AVX2 2984 #ifdef HAS_I400TOARGBROW_AVX2
2488 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). 2985 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2489 // note: vpunpcklbw mutates and vpackuswb unmutates. 2986 // note: vpunpcklbw mutates and vpackuswb unmutates.
2490 __declspec(naked) __declspec(align(16)) 2987 __declspec(naked)
2491 void YToARGBRow_AVX2(const uint8* y_buf, 2988 void I400ToARGBRow_AVX2(const uint8* y_buf,
2492 uint8* rgb_buf, 2989 uint8* rgb_buf,
2493 int width) { 2990 int width) {
2494 __asm { 2991 __asm {
2495 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) 2992 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
2496 vmovd xmm2, eax 2993 vmovd xmm2, eax
2497 vbroadcastss ymm2, xmm2 2994 vbroadcastss ymm2, xmm2
2498 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) 2995 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
2499 vmovd xmm3, eax 2996 vmovd xmm3, eax
2500 vbroadcastss ymm3, xmm3 2997 vbroadcastss ymm3, xmm3
2501 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 2998 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
2502 vpslld ymm4, ymm4, 24 2999 vpslld ymm4, ymm4, 24
2503 3000
2504 mov eax, [esp + 4] // Y 3001 mov eax, [esp + 4] // Y
2505 mov edx, [esp + 8] // rgb 3002 mov edx, [esp + 8] // rgb
2506 mov ecx, [esp + 12] // width 3003 mov ecx, [esp + 12] // width
2507 3004
2508 convertloop: 3005 convertloop:
2509 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 3006 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2510 vmovdqu xmm0, [eax] 3007 vmovdqu xmm0, [eax]
2511 lea eax, [eax + 16] 3008 lea eax, [eax + 16]
2512 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates 3009 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
2513 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y 3010 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
2514 vpmulhuw ymm0, ymm0, ymm2 3011 vpmulhuw ymm0, ymm0, ymm2
2515 vpsubusw ymm0, ymm0, ymm3 3012 vpsubusw ymm0, ymm0, ymm3
2516 vpsrlw ymm0, ymm0, 6 3013 vpsrlw ymm0, ymm0, 6
2517 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 3014 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
2518 3015
2519 // TODO(fbarchard): Weave alpha with unpack. 3016 // TODO(fbarchard): Weave alpha with unpack.
2520 // Step 2: Weave into ARGB 3017 // Step 2: Weave into ARGB
2521 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates 3018 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
2522 vpermq ymm1, ymm1, 0xd8 3019 vpermq ymm1, ymm1, 0xd8
2523 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels 3020 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
2524 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels 3021 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
2525 vpor ymm0, ymm0, ymm4 3022 vpor ymm0, ymm0, ymm4
2526 vpor ymm1, ymm1, ymm4 3023 vpor ymm1, ymm1, ymm4
2527 vmovdqu [edx], ymm0 3024 vmovdqu [edx], ymm0
2528 vmovdqu [edx + 32], ymm1 3025 vmovdqu [edx + 32], ymm1
2529 lea edx, [edx + 64] 3026 lea edx, [edx + 64]
2530 sub ecx, 16 3027 sub ecx, 16
2531 jg convertloop 3028 jg convertloop
2532 vzeroupper 3029 vzeroupper
2533 ret 3030 ret
2534 } 3031 }
2535 } 3032 }
2536 #endif // HAS_YTOARGBROW_AVX2 3033 #endif // HAS_I400TOARGBROW_AVX2
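Both I400ToARGBRow versions compute the same fixed-point expansion using the constants loaded above (18997 and 1160, with a final shift by 6). A scalar sketch of one pixel, matching those steps (illustrative only):

#include <stdint.h>

// Scalar view of I400ToARGBRow: gray ARGB from Y alone.
// punpcklbw y,y makes y*257; pmulhuw 18997 takes the high 16 bits of the
// product; psubusw 1160 saturates at 0; psrlw 6 scales; packuswb clamps.
static inline void I400ToARGBPixelScalar(uint8_t y, uint8_t* dst_argb) {
  uint32_t yy = (uint32_t)y * 0x0101u;    // duplicate byte: y * 257
  uint32_t g16 = (yy * 18997u) >> 16;     // high half of the multiply
  uint32_t g = (g16 > 1160u) ? (g16 - 1160u) >> 6 : 0u;
  if (g > 255u) g = 255u;
  dst_argb[0] = dst_argb[1] = dst_argb[2] = (uint8_t)g;
  dst_argb[3] = 0xff;                     // mask 0xff000000
}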
2537 3034
2538 #ifdef HAS_MIRRORROW_SSSE3 3035 #ifdef HAS_MIRRORROW_SSSE3
2539 // Shuffle table for reversing the bytes. 3036 // Shuffle table for reversing the bytes.
2540 static const uvec8 kShuffleMirror = { 3037 static const uvec8 kShuffleMirror = {
2541 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 3038 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2542 }; 3039 };
2543 3040
2544 // TODO(fbarchard): Replace lea with -16 offset. 3041 // TODO(fbarchard): Replace lea with -16 offset.
2545 __declspec(naked) __declspec(align(16)) 3042 __declspec(naked)
2546 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 3043 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2547 __asm { 3044 __asm {
2548 mov eax, [esp + 4] // src 3045 mov eax, [esp + 4] // src
2549 mov edx, [esp + 8] // dst 3046 mov edx, [esp + 8] // dst
2550 mov ecx, [esp + 12] // width 3047 mov ecx, [esp + 12] // width
2551 movdqa xmm5, kShuffleMirror 3048 movdqa xmm5, kShuffleMirror
2552 3049
2553 convertloop: 3050 convertloop:
2554 movdqu xmm0, [eax - 16 + ecx] 3051 movdqu xmm0, [eax - 16 + ecx]
2555 pshufb xmm0, xmm5 3052 pshufb xmm0, xmm5
2556 movdqu [edx], xmm0 3053 movdqu [edx], xmm0
2557 lea edx, [edx + 16] 3054 lea edx, [edx + 16]
2558 sub ecx, 16 3055 sub ecx, 16
2559 jg convertloop 3056 jg convertloop
2560 ret 3057 ret
2561 } 3058 }
2562 } 3059 }
2563 #endif // HAS_MIRRORROW_SSSE3 3060 #endif // HAS_MIRRORROW_SSSE3
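The pshufb table reverses bytes within each 16-byte load; combined with reading from the end of the row, the output comes out mirrored. The scalar equivalent is simply the following sketch (ARGBMirrorRow further down is the same idea at 4-byte pixel granularity):

#include <stdint.h>

// Scalar view of MirrorRow: reverse the bytes of a row.
static void MirrorRowScalar(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}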
2564 3061
2565 #ifdef HAS_MIRRORROW_AVX2 3062 #ifdef HAS_MIRRORROW_AVX2
2566 __declspec(naked) __declspec(align(16)) 3063 __declspec(naked)
2567 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 3064 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2568 __asm { 3065 __asm {
2569 mov eax, [esp + 4] // src 3066 mov eax, [esp + 4] // src
2570 mov edx, [esp + 8] // dst 3067 mov edx, [esp + 8] // dst
2571 mov ecx, [esp + 12] // width 3068 mov ecx, [esp + 12] // width
2572 vbroadcastf128 ymm5, kShuffleMirror 3069 vbroadcastf128 ymm5, kShuffleMirror
2573 3070
2574 convertloop: 3071 convertloop:
2575 vmovdqu ymm0, [eax - 32 + ecx] 3072 vmovdqu ymm0, [eax - 32 + ecx]
2576 vpshufb ymm0, ymm0, ymm5 3073 vpshufb ymm0, ymm0, ymm5
2577 vpermq ymm0, ymm0, 0x4e // swap high and low halves 3074 vpermq ymm0, ymm0, 0x4e // swap high and low halves
2578 vmovdqu [edx], ymm0 3075 vmovdqu [edx], ymm0
2579 lea edx, [edx + 32] 3076 lea edx, [edx + 32]
2580 sub ecx, 32 3077 sub ecx, 32
2581 jg convertloop 3078 jg convertloop
2582 vzeroupper 3079 vzeroupper
2583 ret 3080 ret
2584 } 3081 }
2585 } 3082 }
2586 #endif // HAS_MIRRORROW_AVX2 3083 #endif // HAS_MIRRORROW_AVX2
2587 3084
2588 #ifdef HAS_MIRRORROW_SSE2 3085 #ifdef HAS_MIRRORROW_SSE2
2589 __declspec(naked) __declspec(align(16)) 3086 __declspec(naked)
2590 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { 3087 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2591 __asm { 3088 __asm {
2592 mov eax, [esp + 4] // src 3089 mov eax, [esp + 4] // src
2593 mov edx, [esp + 8] // dst 3090 mov edx, [esp + 8] // dst
2594 mov ecx, [esp + 12] // width 3091 mov ecx, [esp + 12] // width
2595 3092
2596 convertloop: 3093 convertloop:
2597 movdqu xmm0, [eax - 16 + ecx] 3094 movdqu xmm0, [eax - 16 + ecx]
2598 movdqa xmm1, xmm0 // swap bytes 3095 movdqa xmm1, xmm0 // swap bytes
2599 psllw xmm0, 8 3096 psllw xmm0, 8
(...skipping 10 matching lines...)
2610 } 3107 }
2611 } 3108 }
2612 #endif // HAS_MIRRORROW_SSE2 3109 #endif // HAS_MIRRORROW_SSE2
2613 3110
2614 #ifdef HAS_MIRRORROW_UV_SSSE3 3111 #ifdef HAS_MIRRORROW_UV_SSSE3
2615 // Shuffle table for reversing the bytes of UV channels. 3112 // Shuffle table for reversing the bytes of UV channels.
2616 static const uvec8 kShuffleMirrorUV = { 3113 static const uvec8 kShuffleMirrorUV = {
2617 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 3114 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2618 }; 3115 };
2619 3116
2620 __declspec(naked) __declspec(align(16)) 3117 __declspec(naked)
2621 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, 3118 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2622 int width) { 3119 int width) {
2623 __asm { 3120 __asm {
2624 push edi 3121 push edi
2625 mov eax, [esp + 4 + 4] // src 3122 mov eax, [esp + 4 + 4] // src
2626 mov edx, [esp + 4 + 8] // dst_u 3123 mov edx, [esp + 4 + 8] // dst_u
2627 mov edi, [esp + 4 + 12] // dst_v 3124 mov edi, [esp + 4 + 12] // dst_v
2628 mov ecx, [esp + 4 + 16] // width 3125 mov ecx, [esp + 4 + 16] // width
2629 movdqa xmm1, kShuffleMirrorUV 3126 movdqa xmm1, kShuffleMirrorUV
2630 lea eax, [eax + ecx * 2 - 16] 3127 lea eax, [eax + ecx * 2 - 16]
2631 sub edi, edx 3128 sub edi, edx
2632 3129
2633 convertloop: 3130 convertloop:
2634 movdqu xmm0, [eax] 3131 movdqu xmm0, [eax]
2635 lea eax, [eax - 16] 3132 lea eax, [eax - 16]
2636 pshufb xmm0, xmm1 3133 pshufb xmm0, xmm1
2637 movlpd qword ptr [edx], xmm0 3134 movlpd qword ptr [edx], xmm0
2638 movhpd qword ptr [edx + edi], xmm0 3135 movhpd qword ptr [edx + edi], xmm0
2639 lea edx, [edx + 8] 3136 lea edx, [edx + 8]
2640 sub ecx, 8 3137 sub ecx, 8
2641 jg convertloop 3138 jg convertloop
2642 3139
2643 pop edi 3140 pop edi
2644 ret 3141 ret
2645 } 3142 }
2646 } 3143 }
2647 #endif // HAS_MIRRORROW_UV_SSSE3 3144 #endif // HAS_MIRRORROW_UV_SSSE3
2648 3145
2649 #ifdef HAS_ARGBMIRRORROW_SSE2 3146 #ifdef HAS_ARGBMIRRORROW_SSE2
2650 __declspec(naked) __declspec(align(16)) 3147 __declspec(naked)
2651 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { 3148 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2652 __asm { 3149 __asm {
2653 mov eax, [esp + 4] // src 3150 mov eax, [esp + 4] // src
2654 mov edx, [esp + 8] // dst 3151 mov edx, [esp + 8] // dst
2655 mov ecx, [esp + 12] // width 3152 mov ecx, [esp + 12] // width
2656 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. 3153 lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
2657 3154
2658 convertloop: 3155 convertloop:
2659 movdqu xmm0, [eax] 3156 movdqu xmm0, [eax]
2660 lea eax, [eax - 16] 3157 lea eax, [eax - 16]
2661 pshufd xmm0, xmm0, 0x1b 3158 pshufd xmm0, xmm0, 0x1b
2662 movdqu [edx], xmm0 3159 movdqu [edx], xmm0
2663 lea edx, [edx + 16] 3160 lea edx, [edx + 16]
2664 sub ecx, 4 3161 sub ecx, 4
2665 jg convertloop 3162 jg convertloop
2666 ret 3163 ret
2667 } 3164 }
2668 } 3165 }
2669 #endif // HAS_ARGBMIRRORROW_SSE2 3166 #endif // HAS_ARGBMIRRORROW_SSE2
2670 3167
2671 #ifdef HAS_ARGBMIRRORROW_AVX2 3168 #ifdef HAS_ARGBMIRRORROW_AVX2
2672 // Shuffle table for reversing the bytes. 3169 // Shuffle table for reversing the bytes.
2673 static const ulvec32 kARGBShuffleMirror_AVX2 = { 3170 static const ulvec32 kARGBShuffleMirror_AVX2 = {
2674 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 3171 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2675 }; 3172 };
2676 3173
2677 __declspec(naked) __declspec(align(16)) 3174 __declspec(naked)
2678 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 3175 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2679 __asm { 3176 __asm {
2680 mov eax, [esp + 4] // src 3177 mov eax, [esp + 4] // src
2681 mov edx, [esp + 8] // dst 3178 mov edx, [esp + 8] // dst
2682 mov ecx, [esp + 12] // width 3179 mov ecx, [esp + 12] // width
2683 vmovdqu ymm5, kARGBShuffleMirror_AVX2 3180 vmovdqu ymm5, kARGBShuffleMirror_AVX2
2684 3181
2685 convertloop: 3182 convertloop:
2686 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order 3183 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
2687 vmovdqu [edx], ymm0 3184 vmovdqu [edx], ymm0
2688 lea edx, [edx + 32] 3185 lea edx, [edx + 32]
2689 sub ecx, 8 3186 sub ecx, 8
2690 jg convertloop 3187 jg convertloop
2691 vzeroupper 3188 vzeroupper
2692 ret 3189 ret
2693 } 3190 }
2694 } 3191 }
2695 #endif // HAS_ARGBMIRRORROW_AVX2 3192 #endif // HAS_ARGBMIRRORROW_AVX2
2696 3193
2697 #ifdef HAS_SPLITUVROW_SSE2 3194 #ifdef HAS_SPLITUVROW_SSE2
2698 __declspec(naked) __declspec(align(16)) 3195 __declspec(naked)
2699 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 3196 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2700 __asm { 3197 __asm {
2701 push edi 3198 push edi
2702 mov eax, [esp + 4 + 4] // src_uv 3199 mov eax, [esp + 4 + 4] // src_uv
2703 mov edx, [esp + 4 + 8] // dst_u 3200 mov edx, [esp + 4 + 8] // dst_u
2704 mov edi, [esp + 4 + 12] // dst_v 3201 mov edi, [esp + 4 + 12] // dst_v
2705 mov ecx, [esp + 4 + 16] // pix 3202 mov ecx, [esp + 4 + 16] // pix
2706 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3203 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
2707 psrlw xmm5, 8 3204 psrlw xmm5, 8
2708 sub edi, edx 3205 sub edi, edx
(...skipping 17 matching lines...)
2726 jg convertloop 3223 jg convertloop
2727 3224
2728 pop edi 3225 pop edi
2729 ret 3226 ret
2730 } 3227 }
2731 } 3228 }
2732 3229
2733 #endif // HAS_SPLITUVROW_SSE2 3230 #endif // HAS_SPLITUVROW_SSE2
2734 3231
2735 #ifdef HAS_SPLITUVROW_AVX2 3232 #ifdef HAS_SPLITUVROW_AVX2
2736 __declspec(naked) __declspec(align(16)) 3233 __declspec(naked)
2737 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 3234 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2738 __asm { 3235 __asm {
2739 push edi 3236 push edi
2740 mov eax, [esp + 4 + 4] // src_uv 3237 mov eax, [esp + 4 + 4] // src_uv
2741 mov edx, [esp + 4 + 8] // dst_u 3238 mov edx, [esp + 4 + 8] // dst_u
2742 mov edi, [esp + 4 + 12] // dst_v 3239 mov edi, [esp + 4 + 12] // dst_v
2743 mov ecx, [esp + 4 + 16] // pix 3240 mov ecx, [esp + 4 + 16] // pix
2744 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3241 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
2745 vpsrlw ymm5, ymm5, 8 3242 vpsrlw ymm5, ymm5, 8
2746 sub edi, edx 3243 sub edi, edx
(...skipping 17 matching lines...)
2764 jg convertloop 3261 jg convertloop
2765 3262
2766 pop edi 3263 pop edi
2767 vzeroupper 3264 vzeroupper
2768 ret 3265 ret
2769 } 3266 }
2770 } 3267 }
2771 #endif // HAS_SPLITUVROW_AVX2 3268 #endif // HAS_SPLITUVROW_AVX2
2772 3269
2773 #ifdef HAS_MERGEUVROW_SSE2 3270 #ifdef HAS_MERGEUVROW_SSE2
2774 __declspec(naked) __declspec(align(16)) 3271 __declspec(naked)
2775 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 3272 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2776 int width) { 3273 int width) {
2777 __asm { 3274 __asm {
2778 push edi 3275 push edi
2779 mov eax, [esp + 4 + 4] // src_u 3276 mov eax, [esp + 4 + 4] // src_u
2780 mov edx, [esp + 4 + 8] // src_v 3277 mov edx, [esp + 4 + 8] // src_v
2781 mov edi, [esp + 4 + 12] // dst_uv 3278 mov edi, [esp + 4 + 12] // dst_uv
2782 mov ecx, [esp + 4 + 16] // width 3279 mov ecx, [esp + 4 + 16] // width
2783 sub edx, eax 3280 sub edx, eax
2784 3281
(...skipping 10 matching lines...)
2795 sub ecx, 16 3292 sub ecx, 16
2796 jg convertloop 3293 jg convertloop
2797 3294
2798 pop edi 3295 pop edi
2799 ret 3296 ret
2800 } 3297 }
2801 } 3298 }
2802 #endif // HAS_MERGEUVROW_SSE2 3299 #endif // HAS_MERGEUVROW_SSE2
2803 3300
2804 #ifdef HAS_MERGEUVROW_AVX2 3301 #ifdef HAS_MERGEUVROW_AVX2
2805 __declspec(naked) __declspec(align(16)) 3302 __declspec(naked)
2806 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 3303 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2807 int width) { 3304 int width) {
2808 __asm { 3305 __asm {
2809 push edi 3306 push edi
2810 mov eax, [esp + 4 + 4] // src_u 3307 mov eax, [esp + 4 + 4] // src_u
2811 mov edx, [esp + 4 + 8] // src_v 3308 mov edx, [esp + 4 + 8] // src_v
2812 mov edi, [esp + 4 + 12] // dst_uv 3309 mov edi, [esp + 4 + 12] // dst_uv
2813 mov ecx, [esp + 4 + 16] // width 3310 mov ecx, [esp + 4 + 16] // width
2814 sub edx, eax 3311 sub edx, eax
2815 3312
(...skipping 13 matching lines...)
2829 3326
2830 pop edi 3327 pop edi
2831 vzeroupper 3328 vzeroupper
2832 ret 3329 ret
2833 } 3330 }
2834 } 3331 }
2835 #endif // HAS_MERGEUVROW_AVX2 3332 #endif // HAS_MERGEUVROW_AVX2
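SplitUVRow and MergeUVRow are inverses: one deinterleaves an NV12-style UV plane into separate U and V planes, the other reinterleaves. Scalar sketches (illustrative names, not the library's C fallbacks):

#include <stdint.h>

// Scalar view of SplitUVRow: UVUV... -> UU... + VV...
static void SplitUVRowScalar(const uint8_t* src_uv, uint8_t* dst_u,
                             uint8_t* dst_v, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[x * 2 + 0];
    dst_v[x] = src_uv[x * 2 + 1];
  }
}

// Scalar view of MergeUVRow: UU... + VV... -> UVUV...
static void MergeUVRowScalar(const uint8_t* src_u, const uint8_t* src_v,
                             uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[x * 2 + 0] = src_u[x];
    dst_uv[x * 2 + 1] = src_v[x];
  }
}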
2836 3333
2837 #ifdef HAS_COPYROW_SSE2 3334 #ifdef HAS_COPYROW_SSE2
2838 // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time. 3335 // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
2839 __declspec(naked) __declspec(align(16)) 3336 __declspec(naked)
2840 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { 3337 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2841 __asm { 3338 __asm {
2842 mov eax, [esp + 4] // src 3339 mov eax, [esp + 4] // src
2843 mov edx, [esp + 8] // dst 3340 mov edx, [esp + 8] // dst
2844 mov ecx, [esp + 12] // count 3341 mov ecx, [esp + 12] // count
2845 3342
2846 convertloop: 3343 convertloop:
2847 movdqu xmm0, [eax] 3344 movdqu xmm0, [eax]
2848 movdqu xmm1, [eax + 16] 3345 movdqu xmm1, [eax + 16]
2849 lea eax, [eax + 32] 3346 lea eax, [eax + 32]
2850 movdqu [edx], xmm0 3347 movdqu [edx], xmm0
2851 movdqu [edx + 16], xmm1 3348 movdqu [edx + 16], xmm1
2852 lea edx, [edx + 32] 3349 lea edx, [edx + 32]
2853 sub ecx, 32 3350 sub ecx, 32
2854 jg convertloop 3351 jg convertloop
2855 ret 3352 ret
2856 } 3353 }
2857 } 3354 }
2858 #endif // HAS_COPYROW_SSE2 3355 #endif // HAS_COPYROW_SSE2
2859 3356
2860 #ifdef HAS_COPYROW_AVX 3357 #ifdef HAS_COPYROW_AVX
2861 // CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time. 3358 // CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time.
2862 __declspec(naked) __declspec(align(16)) 3359 __declspec(naked)
2863 void CopyRow_AVX(const uint8* src, uint8* dst, int count) { 3360 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2864 __asm { 3361 __asm {
2865 mov eax, [esp + 4] // src 3362 mov eax, [esp + 4] // src
2866 mov edx, [esp + 8] // dst 3363 mov edx, [esp + 8] // dst
2867 mov ecx, [esp + 12] // count 3364 mov ecx, [esp + 12] // count
2868 3365
2869 convertloop: 3366 convertloop:
2870 vmovdqu ymm0, [eax] 3367 vmovdqu ymm0, [eax]
2871 vmovdqu ymm1, [eax + 32] 3368 vmovdqu ymm1, [eax + 32]
2872 lea eax, [eax + 64] 3369 lea eax, [eax + 64]
2873 vmovdqu [edx], ymm0 3370 vmovdqu [edx], ymm0
2874 vmovdqu [edx + 32], ymm1 3371 vmovdqu [edx + 32], ymm1
2875 lea edx, [edx + 64] 3372 lea edx, [edx + 64]
2876 sub ecx, 64 3373 sub ecx, 64
2877 jg convertloop 3374 jg convertloop
2878 3375
2879 vzeroupper 3376 vzeroupper
2880 ret 3377 ret
2881 } 3378 }
2882 } 3379 }
2883 #endif // HAS_COPYROW_AVX 3380 #endif // HAS_COPYROW_AVX
2884 3381
2885 // Multiple of 1. 3382 // Multiple of 1.
2886 __declspec(naked) __declspec(align(16)) 3383 __declspec(naked)
2887 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { 3384 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
2888 __asm { 3385 __asm {
2889 mov eax, esi 3386 mov eax, esi
2890 mov edx, edi 3387 mov edx, edi
2891 mov esi, [esp + 4] // src 3388 mov esi, [esp + 4] // src
2892 mov edi, [esp + 8] // dst 3389 mov edi, [esp + 8] // dst
2893 mov ecx, [esp + 12] // count 3390 mov ecx, [esp + 12] // count
2894 rep movsb 3391 rep movsb
2895 mov edi, edx 3392 mov edi, edx
2896 mov esi, eax 3393 mov esi, eax
2897 ret 3394 ret
2898 } 3395 }
2899 } 3396 }
2900 3397
2901 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 3398 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2902 // width in pixels 3399 // width in pixels
2903 __declspec(naked) __declspec(align(16)) 3400 __declspec(naked)
2904 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { 3401 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2905 __asm { 3402 __asm {
2906 mov eax, [esp + 4] // src 3403 mov eax, [esp + 4] // src
2907 mov edx, [esp + 8] // dst 3404 mov edx, [esp + 8] // dst
2908 mov ecx, [esp + 12] // count 3405 mov ecx, [esp + 12] // count
2909 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 3406 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
2910 pslld xmm0, 24 3407 pslld xmm0, 24
2911 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff 3408 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
2912 psrld xmm1, 8 3409 psrld xmm1, 8
2913 3410
(...skipping 15 matching lines...)
2929 sub ecx, 8 3426 sub ecx, 8
2930 jg convertloop 3427 jg convertloop
2931 3428
2932 ret 3429 ret
2933 } 3430 }
2934 } 3431 }
2935 #endif // HAS_ARGBCOPYALPHAROW_SSE2 3432 #endif // HAS_ARGBCOPYALPHAROW_SSE2
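The 0xff000000 / 0x00ffffff mask pair above selects just the alpha lane of each pixel. Per pixel this is simply the following scalar sketch (illustrative; ARGBCopyYToAlphaRow below is the same store fed from a Y plane instead of the source alpha):

#include <stdint.h>

// Scalar view of ARGBCopyAlphaRow: overwrite only the alpha byte of each
// destination pixel, leaving B, G, R untouched.
static void ARGBCopyAlphaRowScalar(const uint8_t* src, uint8_t* dst,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x * 4 + 3];  // alpha at byte offset 3
  }
}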
2936 3433
2937 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 3434 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
2938 // width in pixels 3435 // width in pixels
2939 __declspec(naked) __declspec(align(16)) 3436 __declspec(naked)
2940 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { 3437 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2941 __asm { 3438 __asm {
2942 mov eax, [esp + 4] // src 3439 mov eax, [esp + 4] // src
2943 mov edx, [esp + 8] // dst 3440 mov edx, [esp + 8] // dst
2944 mov ecx, [esp + 12] // count 3441 mov ecx, [esp + 12] // count
2945 vpcmpeqb ymm0, ymm0, ymm0 3442 vpcmpeqb ymm0, ymm0, ymm0
2946 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff 3443 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
2947 3444
2948 convertloop: 3445 convertloop:
2949 vmovdqu ymm1, [eax] 3446 vmovdqu ymm1, [eax]
2950 vmovdqu ymm2, [eax + 32] 3447 vmovdqu ymm2, [eax + 32]
2951 lea eax, [eax + 64] 3448 lea eax, [eax + 64]
2952 vpblendvb ymm1, ymm1, [edx], ymm0 3449 vpblendvb ymm1, ymm1, [edx], ymm0
2953 vpblendvb ymm2, ymm2, [edx + 32], ymm0 3450 vpblendvb ymm2, ymm2, [edx + 32], ymm0
2954 vmovdqu [edx], ymm1 3451 vmovdqu [edx], ymm1
2955 vmovdqu [edx + 32], ymm2 3452 vmovdqu [edx + 32], ymm2
2956 lea edx, [edx + 64] 3453 lea edx, [edx + 64]
2957 sub ecx, 16 3454 sub ecx, 16
2958 jg convertloop 3455 jg convertloop
2959 3456
2960 vzeroupper 3457 vzeroupper
2961 ret 3458 ret
2962 } 3459 }
2963 } 3460 }
2964 #endif // HAS_ARGBCOPYALPHAROW_AVX2 3461 #endif // HAS_ARGBCOPYALPHAROW_AVX2
2965 3462
2966 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 3463 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2967 // width in pixels 3464 // width in pixels
2968 __declspec(naked) __declspec(align(16)) 3465 __declspec(naked)
2969 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { 3466 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2970 __asm { 3467 __asm {
2971 mov eax, [esp + 4] // src 3468 mov eax, [esp + 4] // src
2972 mov edx, [esp + 8] // dst 3469 mov edx, [esp + 8] // dst
2973 mov ecx, [esp + 12] // count 3470 mov ecx, [esp + 12] // count
2974 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 3471 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
2975 pslld xmm0, 24 3472 pslld xmm0, 24
2976 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff 3473 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
2977 psrld xmm1, 8 3474 psrld xmm1, 8
2978 3475
(...skipping 17 matching lines...)
2996 sub ecx, 8 3493 sub ecx, 8
2997 jg convertloop 3494 jg convertloop
2998 3495
2999 ret 3496 ret
3000 } 3497 }
3001 } 3498 }
3002 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 3499 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3003 3500
3004 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 3501 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3005 // width in pixels 3502 // width in pixels
3006 __declspec(naked) __declspec(align(16)) 3503 __declspec(naked)
3007 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { 3504 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3008 __asm { 3505 __asm {
3009 mov eax, [esp + 4] // src 3506 mov eax, [esp + 4] // src
3010 mov edx, [esp + 8] // dst 3507 mov edx, [esp + 8] // dst
3011 mov ecx, [esp + 12] // count 3508 mov ecx, [esp + 12] // count
3012 vpcmpeqb ymm0, ymm0, ymm0 3509 vpcmpeqb ymm0, ymm0, ymm0
3013 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff 3510 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3014 3511
3015 convertloop: 3512 convertloop:
3016 vpmovzxbd ymm1, qword ptr [eax] 3513 vpmovzxbd ymm1, qword ptr [eax]
(...skipping 11 matching lines...)
3028 3525
3029 vzeroupper 3526 vzeroupper
3030 ret 3527 ret
3031 } 3528 }
3032 } 3529 }
3033 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 3530 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3034 3531
3035 #ifdef HAS_SETROW_X86 3532 #ifdef HAS_SETROW_X86
3036 // Write 'count' bytes using an 8 bit value repeated. 3533 // Write 'count' bytes using an 8 bit value repeated.
3037 // Count should be multiple of 4. 3534 // Count should be multiple of 4.
3038 __declspec(naked) __declspec(align(16)) 3535 __declspec(naked)
3039 void SetRow_X86(uint8* dst, uint8 v8, int count) { 3536 void SetRow_X86(uint8* dst, uint8 v8, int count) {
3040 __asm { 3537 __asm {
3041 movzx eax, byte ptr [esp + 8] // v8 3538 movzx eax, byte ptr [esp + 8] // v8
3042 mov edx, 0x01010101 // Duplicate byte to all bytes. 3539 mov edx, 0x01010101 // Duplicate byte to all bytes.
3043 mul edx // overwrites edx with upper part of result. 3540 mul edx // overwrites edx with upper part of result.
3044 mov edx, edi 3541 mov edx, edi
3045 mov edi, [esp + 4] // dst 3542 mov edi, [esp + 4] // dst
3046 mov ecx, [esp + 12] // count 3543 mov ecx, [esp + 12] // count
3047 shr ecx, 2 3544 shr ecx, 2
3048 rep stosd 3545 rep stosd
3049 mov edi, edx 3546 mov edi, edx
3050 ret 3547 ret
3051 } 3548 }
3052 } 3549 }
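
The mul edx above is the classic byte-splat: v8 * 0x01010101 replicates the byte into all four lanes of eax, and rep stosd then stores a dword per iteration. A scalar sketch of the same idea (assuming, as the comment says, that count is a multiple of 4):

// Scalar sketch: splat one byte across a dword, then store dwords.
static void SetRow_C_sketch(uint8* dst, uint8 v8, int count) {
  const uint32 v32 = (uint32)v8 * 0x01010101u;  // byte in all 4 lanes
  uint32* d = (uint32*)dst;
  int i;
  for (i = 0; i < count / 4; ++i) {
    d[i] = v32;
  }
}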
3053 3550
3054 // Write 'count' bytes using an 8 bit value repeated. 3551 // Write 'count' bytes using an 8 bit value repeated.
3055 __declspec(naked) __declspec(align(16)) 3552 __declspec(naked)
3056 void SetRow_ERMS(uint8* dst, uint8 v8, int count) { 3553 void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
3057 __asm { 3554 __asm {
3058 mov edx, edi 3555 mov edx, edi
3059 mov edi, [esp + 4] // dst 3556 mov edi, [esp + 4] // dst
3060 mov eax, [esp + 8] // v8 3557 mov eax, [esp + 8] // v8
3061 mov ecx, [esp + 12] // count 3558 mov ecx, [esp + 12] // count
3062 rep stosb 3559 rep stosb
3063 mov edi, edx 3560 mov edi, edx
3064 ret 3561 ret
3065 } 3562 }
3066 } 3563 }
3067 3564
3068 // Write 'count' 32 bit values. 3565 // Write 'count' 32 bit values.
3069 __declspec(naked) __declspec(align(16)) 3566 __declspec(naked)
3070 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { 3567 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
3071 __asm { 3568 __asm {
3072 mov edx, edi 3569 mov edx, edi
3073 mov edi, [esp + 4] // dst 3570 mov edi, [esp + 4] // dst
3074 mov eax, [esp + 8] // v32 3571 mov eax, [esp + 8] // v32
3075 mov ecx, [esp + 12] // count 3572 mov ecx, [esp + 12] // count
3076 rep stosd 3573 rep stosd
3077 mov edi, edx 3574 mov edi, edx
3078 ret 3575 ret
3079 } 3576 }
3080 } 3577 }
3081 #endif // HAS_SETROW_X86 3578 #endif // HAS_SETROW_X86
3082 3579
3083 #ifdef HAS_YUY2TOYROW_AVX2 3580 #ifdef HAS_YUY2TOYROW_AVX2
3084 __declspec(naked) __declspec(align(16)) 3581 __declspec(naked)
3085 void YUY2ToYRow_AVX2(const uint8* src_yuy2, 3582 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
3086 uint8* dst_y, int pix) { 3583 uint8* dst_y, int pix) {
3087 __asm { 3584 __asm {
3088 mov eax, [esp + 4] // src_yuy2 3585 mov eax, [esp + 4] // src_yuy2
3089 mov edx, [esp + 8] // dst_y 3586 mov edx, [esp + 8] // dst_y
3090 mov ecx, [esp + 12] // pix 3587 mov ecx, [esp + 12] // pix
3091 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3588 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3092 vpsrlw ymm5, ymm5, 8 3589 vpsrlw ymm5, ymm5, 8
3093 3590
3094 convertloop: 3591 convertloop:
3095 vmovdqu ymm0, [eax] 3592 vmovdqu ymm0, [eax]
3096 vmovdqu ymm1, [eax + 32] 3593 vmovdqu ymm1, [eax + 32]
3097 lea eax, [eax + 64] 3594 lea eax, [eax + 64]
3098 vpand ymm0, ymm0, ymm5 // even bytes are Y 3595 vpand ymm0, ymm0, ymm5 // even bytes are Y
3099 vpand ymm1, ymm1, ymm5 3596 vpand ymm1, ymm1, ymm5
3100 vpackuswb ymm0, ymm0, ymm1 // mutates. 3597 vpackuswb ymm0, ymm0, ymm1 // mutates.
3101 vpermq ymm0, ymm0, 0xd8 3598 vpermq ymm0, ymm0, 0xd8
3102 vmovdqu [edx], ymm0 3599 vmovdqu [edx], ymm0
3103 lea edx, [edx + 32] 3600 lea edx, [edx + 32]
3104 sub ecx, 32 3601 sub ecx, 32
3105 jg convertloop 3602 jg convertloop
3106 vzeroupper 3603 vzeroupper
3107 ret 3604 ret
3108 } 3605 }
3109 } 3606 }
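
As the even-bytes comment indicates, YUY2 stores pixels as Y0,U,Y1,V,... so luma is simply every even byte (the UYVY kernels further down take the odd bytes instead). A scalar sketch:

// Scalar sketch: Y is every even byte of a YUY2 row.
static void YUY2ToYRow_C_sketch(const uint8* src_yuy2, uint8* dst_y,
                                int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_y[i] = src_yuy2[i * 2];
  }
}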
3110 3607
3111 __declspec(naked) __declspec(align(16)) 3608 __declspec(naked)
3112 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, 3609 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3113 uint8* dst_u, uint8* dst_v, int pix) { 3610 uint8* dst_u, uint8* dst_v, int pix) {
3114 __asm { 3611 __asm {
3115 push esi 3612 push esi
3116 push edi 3613 push edi
3117 mov eax, [esp + 8 + 4] // src_yuy2 3614 mov eax, [esp + 8 + 4] // src_yuy2
3118 mov esi, [esp + 8 + 8] // stride_yuy2 3615 mov esi, [esp + 8 + 8] // stride_yuy2
3119 mov edx, [esp + 8 + 12] // dst_u 3616 mov edx, [esp + 8 + 12] // dst_u
3120 mov edi, [esp + 8 + 16] // dst_v 3617 mov edi, [esp + 8 + 16] // dst_v
3121 mov ecx, [esp + 8 + 20] // pix 3618 mov ecx, [esp + 8 + 20] // pix
(...skipping 23 matching lines...)
3145 sub ecx, 32 3642 sub ecx, 32
3146 jg convertloop 3643 jg convertloop
3147 3644
3148 pop edi 3645 pop edi
3149 pop esi 3646 pop esi
3150 vzeroupper 3647 vzeroupper
3151 ret 3648 ret
3152 } 3649 }
3153 } 3650 }
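
YUY2ToUVRow reads the row at src_yuy2 and the one below it (via stride_yuy2), averages the two rows' shared chroma samples, and deinterleaves them into planar U and V. A scalar sketch of that logic; the rounding average matches pavgb, though the averaging itself happens in the elided body:

// Scalar sketch: rounding-average chroma of two YUY2 rows, split U/V.
static void YUY2ToUVRow_C_sketch(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* next = src_yuy2 + stride_yuy2;
  int i;
  for (i = 0; i < pix; i += 2) {  // one U,V pair per two pixels
    *dst_u++ = (uint8)((src_yuy2[1] + next[1] + 1) >> 1);
    *dst_v++ = (uint8)((src_yuy2[3] + next[3] + 1) >> 1);
    src_yuy2 += 4;
    next += 4;
  }
}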
3154 3651
3155 __declspec(naked) __declspec(align(16)) 3652 __declspec(naked)
3156 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, 3653 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3157 uint8* dst_u, uint8* dst_v, int pix) { 3654 uint8* dst_u, uint8* dst_v, int pix) {
3158 __asm { 3655 __asm {
3159 push edi 3656 push edi
3160 mov eax, [esp + 4 + 4] // src_yuy2 3657 mov eax, [esp + 4 + 4] // src_yuy2
3161 mov edx, [esp + 4 + 8] // dst_u 3658 mov edx, [esp + 4 + 8] // dst_u
3162 mov edi, [esp + 4 + 12] // dst_v 3659 mov edi, [esp + 4 + 12] // dst_v
3163 mov ecx, [esp + 4 + 16] // pix 3660 mov ecx, [esp + 4 + 16] // pix
3164 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3661 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3165 vpsrlw ymm5, ymm5, 8 3662 vpsrlw ymm5, ymm5, 8
(...skipping 18 matching lines...)
3184 lea edx, [edx + 16] 3681 lea edx, [edx + 16]
3185 sub ecx, 32 3682 sub ecx, 32
3186 jg convertloop 3683 jg convertloop
3187 3684
3188 pop edi 3685 pop edi
3189 vzeroupper 3686 vzeroupper
3190 ret 3687 ret
3191 } 3688 }
3192 } 3689 }
3193 3690
3194 __declspec(naked) __declspec(align(16)) 3691 __declspec(naked)
3195 void UYVYToYRow_AVX2(const uint8* src_uyvy, 3692 void UYVYToYRow_AVX2(const uint8* src_uyvy,
3196 uint8* dst_y, int pix) { 3693 uint8* dst_y, int pix) {
3197 __asm { 3694 __asm {
3198 mov eax, [esp + 4] // src_uyvy 3695 mov eax, [esp + 4] // src_uyvy
3199 mov edx, [esp + 8] // dst_y 3696 mov edx, [esp + 8] // dst_y
3200 mov ecx, [esp + 12] // pix 3697 mov ecx, [esp + 12] // pix
3201 3698
3202 convertloop: 3699 convertloop:
3203 vmovdqu ymm0, [eax] 3700 vmovdqu ymm0, [eax]
3204 vmovdqu ymm1, [eax + 32] 3701 vmovdqu ymm1, [eax + 32]
3205 lea eax, [eax + 64] 3702 lea eax, [eax + 64]
3206 vpsrlw ymm0, ymm0, 8 // odd bytes are Y 3703 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
3207 vpsrlw ymm1, ymm1, 8 3704 vpsrlw ymm1, ymm1, 8
3208 vpackuswb ymm0, ymm0, ymm1 // mutates. 3705 vpackuswb ymm0, ymm0, ymm1 // mutates.
3209 vpermq ymm0, ymm0, 0xd8 3706 vpermq ymm0, ymm0, 0xd8
3210 vmovdqu [edx], ymm0 3707 vmovdqu [edx], ymm0
3211 lea edx, [edx + 32] 3708 lea edx, [edx + 32]
3212 sub ecx, 32 3709 sub ecx, 32
3213 jg convertloop 3710 jg convertloop
3214 vzeroupper 3711 vzeroupper
3215 ret 3712 ret
3216 } 3713 }
3217 } 3714 }
3218 3715
3219 __declspec(naked) __declspec(align(16)) 3716 __declspec(naked)
3220 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, 3717 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3221 uint8* dst_u, uint8* dst_v, int pix) { 3718 uint8* dst_u, uint8* dst_v, int pix) {
3222 __asm { 3719 __asm {
3223 push esi 3720 push esi
3224 push edi 3721 push edi
3225 mov eax, [esp + 8 + 4] // src_uyvy 3722 mov eax, [esp + 8 + 4] // src_uyvy
3226 mov esi, [esp + 8 + 8] // stride_uyvy 3723 mov esi, [esp + 8 + 8] // stride_uyvy
3227 mov edx, [esp + 8 + 12] // dst_u 3724 mov edx, [esp + 8 + 12] // dst_u
3228 mov edi, [esp + 8 + 16] // dst_v 3725 mov edi, [esp + 8 + 16] // dst_v
3229 mov ecx, [esp + 8 + 20] // pix 3726 mov ecx, [esp + 8 + 20] // pix
(...skipping 23 matching lines...)
3253 sub ecx, 32 3750 sub ecx, 32
3254 jg convertloop 3751 jg convertloop
3255 3752
3256 pop edi 3753 pop edi
3257 pop esi 3754 pop esi
3258 vzeroupper 3755 vzeroupper
3259 ret 3756 ret
3260 } 3757 }
3261 } 3758 }
3262 3759
3263 __declspec(naked) __declspec(align(16)) 3760 __declspec(naked)
3264 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, 3761 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3265 uint8* dst_u, uint8* dst_v, int pix) { 3762 uint8* dst_u, uint8* dst_v, int pix) {
3266 __asm { 3763 __asm {
3267 push edi 3764 push edi
3268 mov eax, [esp + 4 + 4] // src_uyvy 3765 mov eax, [esp + 4 + 4] // src_uyvy
3269 mov edx, [esp + 4 + 8] // dst_u 3766 mov edx, [esp + 4 + 8] // dst_u
3270 mov edi, [esp + 4 + 12] // dst_v 3767 mov edi, [esp + 4 + 12] // dst_v
3271 mov ecx, [esp + 4 + 16] // pix 3768 mov ecx, [esp + 4 + 16] // pix
3272 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3769 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3273 vpsrlw ymm5, ymm5, 8 3770 vpsrlw ymm5, ymm5, 8
(...skipping 20 matching lines...) Expand all
3294 jg convertloop 3791 jg convertloop
3295 3792
3296 pop edi 3793 pop edi
3297 vzeroupper 3794 vzeroupper
3298 ret 3795 ret
3299 } 3796 }
3300 } 3797 }
3301 #endif // HAS_YUY2TOYROW_AVX2 3798 #endif // HAS_YUY2TOYROW_AVX2
3302 3799
3303 #ifdef HAS_YUY2TOYROW_SSE2 3800 #ifdef HAS_YUY2TOYROW_SSE2
3304 __declspec(naked) __declspec(align(16)) 3801 __declspec(naked)
3305 void YUY2ToYRow_SSE2(const uint8* src_yuy2, 3802 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
3306 uint8* dst_y, int pix) { 3803 uint8* dst_y, int pix) {
3307 __asm { 3804 __asm {
3308 mov eax, [esp + 4] // src_yuy2 3805 mov eax, [esp + 4] // src_yuy2
3309 mov edx, [esp + 8] // dst_y 3806 mov edx, [esp + 8] // dst_y
3310 mov ecx, [esp + 12] // pix 3807 mov ecx, [esp + 12] // pix
3311 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3808 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3312 psrlw xmm5, 8 3809 psrlw xmm5, 8
3313 3810
3314 convertloop: 3811 convertloop:
3315 movdqu xmm0, [eax] 3812 movdqu xmm0, [eax]
3316 movdqu xmm1, [eax + 16] 3813 movdqu xmm1, [eax + 16]
3317 lea eax, [eax + 32] 3814 lea eax, [eax + 32]
3318 pand xmm0, xmm5 // even bytes are Y 3815 pand xmm0, xmm5 // even bytes are Y
3319 pand xmm1, xmm5 3816 pand xmm1, xmm5
3320 packuswb xmm0, xmm1 3817 packuswb xmm0, xmm1
3321 movdqu [edx], xmm0 3818 movdqu [edx], xmm0
3322 lea edx, [edx + 16] 3819 lea edx, [edx + 16]
3323 sub ecx, 16 3820 sub ecx, 16
3324 jg convertloop 3821 jg convertloop
3325 ret 3822 ret
3326 } 3823 }
3327 } 3824 }
3328 3825
3329 __declspec(naked) __declspec(align(16)) 3826 __declspec(naked)
3330 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 3827 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3331 uint8* dst_u, uint8* dst_v, int pix) { 3828 uint8* dst_u, uint8* dst_v, int pix) {
3332 __asm { 3829 __asm {
3333 push esi 3830 push esi
3334 push edi 3831 push edi
3335 mov eax, [esp + 8 + 4] // src_yuy2 3832 mov eax, [esp + 8 + 4] // src_yuy2
3336 mov esi, [esp + 8 + 8] // stride_yuy2 3833 mov esi, [esp + 8 + 8] // stride_yuy2
3337 mov edx, [esp + 8 + 12] // dst_u 3834 mov edx, [esp + 8 + 12] // dst_u
3338 mov edi, [esp + 8 + 16] // dst_v 3835 mov edi, [esp + 8 + 16] // dst_v
3339 mov ecx, [esp + 8 + 20] // pix 3836 mov ecx, [esp + 8 + 20] // pix
(...skipping 22 matching lines...)
3362 lea edx, [edx + 8] 3859 lea edx, [edx + 8]
3363 sub ecx, 16 3860 sub ecx, 16
3364 jg convertloop 3861 jg convertloop
3365 3862
3366 pop edi 3863 pop edi
3367 pop esi 3864 pop esi
3368 ret 3865 ret
3369 } 3866 }
3370 } 3867 }
3371 3868
3372 __declspec(naked) __declspec(align(16)) 3869 __declspec(naked)
3373 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 3870 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3374 uint8* dst_u, uint8* dst_v, int pix) { 3871 uint8* dst_u, uint8* dst_v, int pix) {
3375 __asm { 3872 __asm {
3376 push edi 3873 push edi
3377 mov eax, [esp + 4 + 4] // src_yuy2 3874 mov eax, [esp + 4 + 4] // src_yuy2
3378 mov edx, [esp + 4 + 8] // dst_u 3875 mov edx, [esp + 4 + 8] // dst_u
3379 mov edi, [esp + 4 + 12] // dst_v 3876 mov edi, [esp + 4 + 12] // dst_v
3380 mov ecx, [esp + 4 + 16] // pix 3877 mov ecx, [esp + 4 + 16] // pix
3381 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3878 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3382 psrlw xmm5, 8 3879 psrlw xmm5, 8
(...skipping 15 matching lines...)
3398 movq qword ptr [edx + edi], xmm1 3895 movq qword ptr [edx + edi], xmm1
3399 lea edx, [edx + 8] 3896 lea edx, [edx + 8]
3400 sub ecx, 16 3897 sub ecx, 16
3401 jg convertloop 3898 jg convertloop
3402 3899
3403 pop edi 3900 pop edi
3404 ret 3901 ret
3405 } 3902 }
3406 } 3903 }
3407 3904
3408 __declspec(naked) __declspec(align(16)) 3905 __declspec(naked)
3409 void UYVYToYRow_SSE2(const uint8* src_uyvy, 3906 void UYVYToYRow_SSE2(const uint8* src_uyvy,
3410 uint8* dst_y, int pix) { 3907 uint8* dst_y, int pix) {
3411 __asm { 3908 __asm {
3412 mov eax, [esp + 4] // src_uyvy 3909 mov eax, [esp + 4] // src_uyvy
3413 mov edx, [esp + 8] // dst_y 3910 mov edx, [esp + 8] // dst_y
3414 mov ecx, [esp + 12] // pix 3911 mov ecx, [esp + 12] // pix
3415 3912
3416 convertloop: 3913 convertloop:
3417 movdqu xmm0, [eax] 3914 movdqu xmm0, [eax]
3418 movdqu xmm1, [eax + 16] 3915 movdqu xmm1, [eax + 16]
3419 lea eax, [eax + 32] 3916 lea eax, [eax + 32]
3420 psrlw xmm0, 8 // odd bytes are Y 3917 psrlw xmm0, 8 // odd bytes are Y
3421 psrlw xmm1, 8 3918 psrlw xmm1, 8
3422 packuswb xmm0, xmm1 3919 packuswb xmm0, xmm1
3423 movdqu [edx], xmm0 3920 movdqu [edx], xmm0
3424 lea edx, [edx + 16] 3921 lea edx, [edx + 16]
3425 sub ecx, 16 3922 sub ecx, 16
3426 jg convertloop 3923 jg convertloop
3427 ret 3924 ret
3428 } 3925 }
3429 } 3926 }
3430 3927
3431 __declspec(naked) __declspec(align(16)) 3928 __declspec(naked)
3432 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 3929 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
3433 uint8* dst_u, uint8* dst_v, int pix) { 3930 uint8* dst_u, uint8* dst_v, int pix) {
3434 __asm { 3931 __asm {
3435 push esi 3932 push esi
3436 push edi 3933 push edi
3437 mov eax, [esp + 8 + 4] // src_uyvy 3934 mov eax, [esp + 8 + 4] // src_uyvy
3438 mov esi, [esp + 8 + 8] // stride_uyvy 3935 mov esi, [esp + 8 + 8] // stride_uyvy
3439 mov edx, [esp + 8 + 12] // dst_u 3936 mov edx, [esp + 8 + 12] // dst_u
3440 mov edi, [esp + 8 + 16] // dst_v 3937 mov edi, [esp + 8 + 16] // dst_v
3441 mov ecx, [esp + 8 + 20] // pix 3938 mov ecx, [esp + 8 + 20] // pix
(...skipping 22 matching lines...)
3464 lea edx, [edx + 8] 3961 lea edx, [edx + 8]
3465 sub ecx, 16 3962 sub ecx, 16
3466 jg convertloop 3963 jg convertloop
3467 3964
3468 pop edi 3965 pop edi
3469 pop esi 3966 pop esi
3470 ret 3967 ret
3471 } 3968 }
3472 } 3969 }
3473 3970
3474 __declspec(naked) __declspec(align(16)) 3971 __declspec(naked)
3475 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, 3972 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3476 uint8* dst_u, uint8* dst_v, int pix) { 3973 uint8* dst_u, uint8* dst_v, int pix) {
3477 __asm { 3974 __asm {
3478 push edi 3975 push edi
3479 mov eax, [esp + 4 + 4] // src_uyvy 3976 mov eax, [esp + 4 + 4] // src_uyvy
3480 mov edx, [esp + 4 + 8] // dst_u 3977 mov edx, [esp + 4 + 8] // dst_u
3481 mov edi, [esp + 4 + 12] // dst_v 3978 mov edi, [esp + 4 + 12] // dst_v
3482 mov ecx, [esp + 4 + 16] // pix 3979 mov ecx, [esp + 4 + 16] // pix
3483 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3980 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3484 psrlw xmm5, 8 3981 psrlw xmm5, 8
(...skipping 18 matching lines...)
3503 jg convertloop 4000 jg convertloop
3504 4001
3505 pop edi 4002 pop edi
3506 ret 4003 ret
3507 } 4004 }
3508 } 4005 }
3509 #endif // HAS_YUY2TOYROW_SSE2 4006 #endif // HAS_YUY2TOYROW_SSE2
3510 4007
3511 #ifdef HAS_ARGBBLENDROW_SSE2 4008 #ifdef HAS_ARGBBLENDROW_SSE2
3512 // Blend 8 pixels at a time. 4009 // Blend 8 pixels at a time.
3513 __declspec(naked) __declspec(align(16)) 4010 __declspec(naked)
3514 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4011 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3515 uint8* dst_argb, int width) { 4012 uint8* dst_argb, int width) {
3516 __asm { 4013 __asm {
3517 push esi 4014 push esi
3518 mov eax, [esp + 4 + 4] // src_argb0 4015 mov eax, [esp + 4 + 4] // src_argb0
3519 mov esi, [esp + 4 + 8] // src_argb1 4016 mov esi, [esp + 4 + 8] // src_argb1
3520 mov edx, [esp + 4 + 12] // dst_argb 4017 mov edx, [esp + 4 + 12] // dst_argb
3521 mov ecx, [esp + 4 + 16] // width 4018 mov ecx, [esp + 4 + 16] // width
3522 pcmpeqb xmm7, xmm7 // generate constant 1 4019 pcmpeqb xmm7, xmm7 // generate constant 1
3523 psrlw xmm7, 15 4020 psrlw xmm7, 15
3524 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 4021 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
3525 psrlw xmm6, 8 4022 psrlw xmm6, 8
3526 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 4023 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
3527 psllw xmm5, 8 4024 psllw xmm5, 8
3528 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4025 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
3529 pslld xmm4, 24 4026 pslld xmm4, 24
3530 4027 sub ecx, 4
3531 sub ecx, 1 4028 jl convertloop4b // less than 4 pixels?
3532 je convertloop1 // only 1 pixel?
3533 jl convertloop1b
3534
3535 // 1 pixel loop until destination pointer is aligned.
3536 alignloop1:
3537 test edx, 15 // aligned?
3538 je alignloop1b
3539 movd xmm3, [eax]
3540 lea eax, [eax + 4]
3541 movdqa xmm0, xmm3 // src argb
3542 pxor xmm3, xmm4 // ~alpha
3543 movd xmm2, [esi] // _r_b
3544 psrlw xmm3, 8 // alpha
3545 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
3546 pshuflw xmm3, xmm3, 0F5h
3547 pand xmm2, xmm6 // _r_b
3548 paddw xmm3, xmm7 // 256 - alpha
3549 pmullw xmm2, xmm3 // _r_b * alpha
3550 movd xmm1, [esi] // _a_g
3551 lea esi, [esi + 4]
3552 psrlw xmm1, 8 // _a_g
3553 por xmm0, xmm4 // set alpha to 255
3554 pmullw xmm1, xmm3 // _a_g * alpha
3555 psrlw xmm2, 8 // _r_b convert to 8 bits again
3556 paddusb xmm0, xmm2 // + src argb
3557 pand xmm1, xmm5 // a_g_ convert to 8 bits again
3558 paddusb xmm0, xmm1 // + src argb
3559 movd [edx], xmm0
3560 lea edx, [edx + 4]
3561 sub ecx, 1
3562 jge alignloop1
3563
3564 alignloop1b:
3565 add ecx, 1 - 4
3566 jl convertloop4b
3567 4029
3568 // 4 pixel loop. 4030 // 4 pixel loop.
3569 convertloop4: 4031 convertloop4:
3570 movdqu xmm3, [eax] // src argb 4032 movdqu xmm3, [eax] // src argb
3571 lea eax, [eax + 16] 4033 lea eax, [eax + 16]
3572 movdqa xmm0, xmm3 // src argb 4034 movdqa xmm0, xmm3 // src argb
3573 pxor xmm3, xmm4 // ~alpha 4035 pxor xmm3, xmm4 // ~alpha
3574 movdqu xmm2, [esi] // _r_b 4036 movdqu xmm2, [esi] // _r_b
3575 psrlw xmm3, 8 // alpha 4037 psrlw xmm3, 8 // alpha
3576 pshufhw xmm3, xmm3, 0F5h // 8 alpha words 4038 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
(...skipping 60 matching lines...)
3637 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 4099 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3638 }; 4100 };
3639 // Same as SSE2, but replaces: 4101 // Same as SSE2, but replaces:
3640 // psrlw xmm3, 8 // alpha 4102 // psrlw xmm3, 8 // alpha
3641 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words 4103 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words
3642 // pshuflw xmm3, xmm3, 0F5h 4104 // pshuflw xmm3, xmm3, 0F5h
3643 // with.. 4105 // with..
3644 // pshufb xmm3, kShuffleAlpha // alpha 4106 // pshufb xmm3, kShuffleAlpha // alpha
3645 // Blend 8 pixels at a time. 4107 // Blend 8 pixels at a time.
3646 4108
3647 __declspec(naked) __declspec(align(16)) 4109 __declspec(naked)
3648 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 4110 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3649 uint8* dst_argb, int width) { 4111 uint8* dst_argb, int width) {
3650 __asm { 4112 __asm {
3651 push esi 4113 push esi
3652 mov eax, [esp + 4 + 4] // src_argb0 4114 mov eax, [esp + 4 + 4] // src_argb0
3653 mov esi, [esp + 4 + 8] // src_argb1 4115 mov esi, [esp + 4 + 8] // src_argb1
3654 mov edx, [esp + 4 + 12] // dst_argb 4116 mov edx, [esp + 4 + 12] // dst_argb
3655 mov ecx, [esp + 4 + 16] // width 4117 mov ecx, [esp + 4 + 16] // width
3656 pcmpeqb xmm7, xmm7 // generate constant 0x0001 4118 pcmpeqb xmm7, xmm7 // generate constant 0x0001
3657 psrlw xmm7, 15 4119 psrlw xmm7, 15
3658 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 4120 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
3659 psrlw xmm6, 8 4121 psrlw xmm6, 8
3660 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 4122 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
3661 psllw xmm5, 8 4123 psllw xmm5, 8
3662 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4124 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
3663 pslld xmm4, 24 4125 pslld xmm4, 24
3664 4126 sub ecx, 4
3665 sub ecx, 1 4127 jl convertloop4b // less than 4 pixels?
3666 je convertloop1 // only 1 pixel?
3667 jl convertloop1b
3668
3669 // 1 pixel loop until destination pointer is aligned.
3670 alignloop1:
3671 test edx, 15 // aligned?
3672 je alignloop1b
3673 movd xmm3, [eax]
3674 lea eax, [eax + 4]
3675 movdqa xmm0, xmm3 // src argb
3676 pxor xmm3, xmm4 // ~alpha
3677 movd xmm2, [esi] // _r_b
3678 pshufb xmm3, kShuffleAlpha // alpha
3679 pand xmm2, xmm6 // _r_b
3680 paddw xmm3, xmm7 // 256 - alpha
3681 pmullw xmm2, xmm3 // _r_b * alpha
3682 movd xmm1, [esi] // _a_g
3683 lea esi, [esi + 4]
3684 psrlw xmm1, 8 // _a_g
3685 por xmm0, xmm4 // set alpha to 255
3686 pmullw xmm1, xmm3 // _a_g * alpha
3687 psrlw xmm2, 8 // _r_b convert to 8 bits again
3688 paddusb xmm0, xmm2 // + src argb
3689 pand xmm1, xmm5 // a_g_ convert to 8 bits again
3690 paddusb xmm0, xmm1 // + src argb
3691 movd [edx], xmm0
3692 lea edx, [edx + 4]
3693 sub ecx, 1
3694 jge alignloop1
3695
3696 alignloop1b:
3697 add ecx, 1 - 4
3698 jl convertloop4b
3699 4128
3700 // 4 pixel loop. 4129 // 4 pixel loop.
3701 convertloop4: 4130 convertloop4:
3702 movdqu xmm3, [eax] // src argb 4131 movdqu xmm3, [eax] // src argb
3703 lea eax, [eax + 16] 4132 lea eax, [eax + 16]
3704 movdqa xmm0, xmm3 // src argb 4133 movdqa xmm0, xmm3 // src argb
3705 pxor xmm3, xmm4 // ~alpha 4134 pxor xmm3, xmm4 // ~alpha
3706 movdqu xmm2, [esi] // _r_b 4135 movdqu xmm2, [esi] // _r_b
3707 pshufb xmm3, kShuffleAlpha // alpha 4136 pshufb xmm3, kShuffleAlpha // alpha
3708 pand xmm2, xmm6 // _r_b 4137 pand xmm2, xmm6 // _r_b
(...skipping 44 matching lines...)
3753 4182
3754 convertloop1b: 4183 convertloop1b:
3755 pop esi 4184 pop esi
3756 ret 4185 ret
3757 } 4186 }
3758 } 4187 }
3759 #endif // HAS_ARGBBLENDROW_SSSE3 4188 #endif // HAS_ARGBBLENDROW_SSSE3
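
A note on the math both blend kernels share: with src already attenuated, each output color channel is src + dst * (256 - src_alpha) / 256 with unsigned saturation, and the output alpha is forced to 255 (the por with the 0xff000000 mask). A scalar sketch of one pixel (helper names illustrative):

static uint8 BlendClamp255(int v) { return (uint8)(v > 255 ? 255 : v); }

// Scalar sketch: out = sat(src + dst * (256 - a) / 256) per channel, A = 255.
static uint32 BlendPixel_sketch(uint32 src, uint32 dst) {
  const int ia = 256 - (int)(src >> 24);  // (255 - a) + 1, as in the asm
  const int b = BlendClamp255((int)(src & 0xff) +
                              (((int)(dst & 0xff) * ia) >> 8));
  const int g = BlendClamp255((int)((src >> 8) & 0xff) +
                              (((int)((dst >> 8) & 0xff) * ia) >> 8));
  const int r = BlendClamp255((int)((src >> 16) & 0xff) +
                              (((int)((dst >> 16) & 0xff) * ia) >> 8));
  return 0xff000000u | ((uint32)r << 16) | ((uint32)g << 8) | (uint32)b;
}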
3760 4189
3761 #ifdef HAS_ARGBATTENUATEROW_SSE2 4190 #ifdef HAS_ARGBATTENUATEROW_SSE2
3762 // Attenuate 4 pixels at a time. 4191 // Attenuate 4 pixels at a time.
3763 __declspec(naked) __declspec(align(16)) 4192 __declspec(naked)
3764 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { 4193 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3765 __asm { 4194 __asm {
3766 mov eax, [esp + 4] // src_argb0 4195 mov eax, [esp + 4] // src_argb0
3767 mov edx, [esp + 8] // dst_argb 4196 mov edx, [esp + 8] // dst_argb
3768 mov ecx, [esp + 12] // width 4197 mov ecx, [esp + 12] // width
3769 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4198 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
3770 pslld xmm4, 24 4199 pslld xmm4, 24
3771 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff 4200 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
3772 psrld xmm5, 8 4201 psrld xmm5, 8
3773 4202
(...skipping 28 matching lines...)
3802 4231
3803 #ifdef HAS_ARGBATTENUATEROW_SSSE3 4232 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3804 // Shuffle table duplicating alpha. 4233 // Shuffle table duplicating alpha.
3805 static const uvec8 kShuffleAlpha0 = { 4234 static const uvec8 kShuffleAlpha0 = {
3806 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 4235 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3807 }; 4236 };
3808 static const uvec8 kShuffleAlpha1 = { 4237 static const uvec8 kShuffleAlpha1 = {
3809 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 4238 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3810 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 4239 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3811 }; 4240 };
3812 __declspec(naked) __declspec(align(16)) 4241 __declspec(naked)
3813 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 4242 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3814 __asm { 4243 __asm {
3815 mov eax, [esp + 4] // src_argb0 4244 mov eax, [esp + 4] // src_argb0
3816 mov edx, [esp + 8] // dst_argb 4245 mov edx, [esp + 8] // dst_argb
3817 mov ecx, [esp + 12] // width 4246 mov ecx, [esp + 12] // width
3818 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 4247 pcmpeqb xmm3, xmm3 // generate mask 0xff000000
3819 pslld xmm3, 24 4248 pslld xmm3, 24
3820 movdqa xmm4, kShuffleAlpha0 4249 movdqa xmm4, kShuffleAlpha0
3821 movdqa xmm5, kShuffleAlpha1 4250 movdqa xmm5, kShuffleAlpha1
3822 4251
(...skipping 23 matching lines...)
3846 ret 4275 ret
3847 } 4276 }
3848 } 4277 }
3849 #endif // HAS_ARGBATTENUATEROW_SSSE3 4278 #endif // HAS_ARGBATTENUATEROW_SSSE3
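
For reference, attenuation premultiplies color by alpha. The byte-duplicating unpack/shuffle turns each operand x into the 16-bit value x * 257, so pmulhuw yields ((c * 257) * (a * 257)) >> 16, a close approximation of c * a / 255. A one-channel scalar sketch:

// Scalar sketch: attenuate one channel, emulating the pmulhuw trick.
static uint8 Attenuate_sketch(uint8 c, uint8 a) {
  return (uint8)(((uint32)c * 257 * ((uint32)a * 257)) >> 16);
}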
3850 4279
3851 #ifdef HAS_ARGBATTENUATEROW_AVX2 4280 #ifdef HAS_ARGBATTENUATEROW_AVX2
3852 // Shuffle table duplicating alpha. 4281 // Shuffle table duplicating alpha.
3853 static const uvec8 kShuffleAlpha_AVX2 = { 4282 static const uvec8 kShuffleAlpha_AVX2 = {
3854 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u 4283 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
3855 }; 4284 };
3856 __declspec(naked) __declspec(align(16)) 4285 __declspec(naked)
3857 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { 4286 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3858 __asm { 4287 __asm {
3859 mov eax, [esp + 4] // src_argb0 4288 mov eax, [esp + 4] // src_argb0
3860 mov edx, [esp + 8] // dst_argb 4289 mov edx, [esp + 8] // dst_argb
3861 mov ecx, [esp + 12] // width 4290 mov ecx, [esp + 12] // width
3862 sub edx, eax 4291 sub edx, eax
3863 vbroadcastf128 ymm4, kShuffleAlpha_AVX2 4292 vbroadcastf128 ymm4, kShuffleAlpha_AVX2
3864 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 4293 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
3865 vpslld ymm5, ymm5, 24 4294 vpslld ymm5, ymm5, 24
3866 4295
(...skipping 16 matching lines...)
3883 jg convertloop 4312 jg convertloop
3884 4313
3885 vzeroupper 4314 vzeroupper
3886 ret 4315 ret
3887 } 4316 }
3888 } 4317 }
3889 #endif // HAS_ARGBATTENUATEROW_AVX2 4318 #endif // HAS_ARGBATTENUATEROW_AVX2
3890 4319
3891 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 4320 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3892 // Unattenuate 4 pixels at a time. 4321 // Unattenuate 4 pixels at a time.
3893 __declspec(naked) __declspec(align(16)) 4322 __declspec(naked)
3894 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, 4323 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3895 int width) { 4324 int width) {
3896 __asm { 4325 __asm {
3897 push esi 4326 push esi
3898 push edi 4327 push edi
3899 mov eax, [esp + 8 + 4] // src_argb0 4328 mov eax, [esp + 8 + 4] // src_argb0
3900 mov edx, [esp + 8 + 8] // dst_argb 4329 mov edx, [esp + 8 + 8] // dst_argb
3901 mov ecx, [esp + 8 + 12] // width 4330 mov ecx, [esp + 8 + 12] // width
3902 4331
3903 convertloop: 4332 convertloop:
(...skipping 33 matching lines...)
3937 #endif // HAS_ARGBUNATTENUATEROW_SSE2 4366 #endif // HAS_ARGBUNATTENUATEROW_SSE2
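
Unattenuation is the inverse: scale each channel back up by 255 / alpha (the elided SSE2 body does this with a reciprocal lookup rather than a divide). A scalar sketch; clamping at 255 follows from the final pack, while passing alpha-0 pixels through unchanged is an assumption of this sketch:

static uint8 UnattClamp255(uint32 v) { return (uint8)(v > 255 ? 255 : v); }

// Scalar sketch: c_out = min(255, c * 255 / a), alpha preserved.
static uint32 Unattenuate_sketch(uint32 argb) {
  const uint32 a = argb >> 24;
  uint32 b, g, r;
  if (a == 0) return argb;  // assumption: leave fully transparent pixels
  b = UnattClamp255((argb & 0xff) * 255 / a);
  g = UnattClamp255(((argb >> 8) & 0xff) * 255 / a);
  r = UnattClamp255(((argb >> 16) & 0xff) * 255 / a);
  return (a << 24) | (r << 16) | (g << 8) | b;
}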
3938 4367
3939 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 4368 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
3940 // Shuffle table duplicating alpha. 4369 // Shuffle table duplicating alpha.
3941 static const uvec8 kUnattenShuffleAlpha_AVX2 = { 4370 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
3942 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u 4371 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
3943 }; 4372 };
3944 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. 4373 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
3945 // USE_GATHER is not on by default, due to being a slow instruction. 4374 // USE_GATHER is not on by default, due to being a slow instruction.
3946 #ifdef USE_GATHER 4375 #ifdef USE_GATHER
3947 __declspec(naked) __declspec(align(16)) 4376 __declspec(naked)
3948 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, 4377 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
3949 int width) { 4378 int width) {
3950 __asm { 4379 __asm {
3951 mov eax, [esp + 4] // src_argb0 4380 mov eax, [esp + 4] // src_argb0
3952 mov edx, [esp + 8] // dst_argb 4381 mov edx, [esp + 8] // dst_argb
3953 mov ecx, [esp + 12] // width 4382 mov ecx, [esp + 12] // width
3954 sub edx, eax 4383 sub edx, eax
3955 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 4384 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
3956 4385
3957 convertloop: 4386 convertloop:
(...skipping 13 matching lines...)
3971 vmovdqu [eax + edx], ymm0 4400 vmovdqu [eax + edx], ymm0
3972 lea eax, [eax + 32] 4401 lea eax, [eax + 32]
3973 sub ecx, 8 4402 sub ecx, 8
3974 jg convertloop 4403 jg convertloop
3975 4404
3976 vzeroupper 4405 vzeroupper
3977 ret 4406 ret
3978 } 4407 }
3979 } 4408 }
3980 #else // USE_GATHER 4409 #else // USE_GATHER
3981 __declspec(naked) __declspec(align(16)) 4410 __declspec(naked)
3982 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, 4411 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
3983 int width) { 4412 int width) {
3984 __asm { 4413 __asm {
3985 4414
3986 mov eax, [esp + 4] // src_argb0 4415 mov eax, [esp + 4] // src_argb0
3987 mov edx, [esp + 8] // dst_argb 4416 mov edx, [esp + 8] // dst_argb
3988 mov ecx, [esp + 12] // width 4417 mov ecx, [esp + 12] // width
3989 sub edx, eax 4418 sub edx, eax
3990 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 4419 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
3991 4420
(...skipping 46 matching lines...)
4038 pop esi 4467 pop esi
4039 vzeroupper 4468 vzeroupper
4040 ret 4469 ret
4041 } 4470 }
4042 } 4471 }
4043 #endif // USE_GATHER 4472 #endif // USE_GATHER
4044 #endif // HAS_ARGBATTENUATEROW_AVX2 4473 #endif // HAS_ARGBATTENUATEROW_AVX2
4045 4474
4046 #ifdef HAS_ARGBGRAYROW_SSSE3 4475 #ifdef HAS_ARGBGRAYROW_SSSE3
4047 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. 4476 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
4048 __declspec(naked) __declspec(align(16)) 4477 __declspec(naked)
4049 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 4478 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4050 __asm { 4479 __asm {
4051 mov eax, [esp + 4] /* src_argb */ 4480 mov eax, [esp + 4] /* src_argb */
4052 mov edx, [esp + 8] /* dst_argb */ 4481 mov edx, [esp + 8] /* dst_argb */
4053 mov ecx, [esp + 12] /* width */ 4482 mov ecx, [esp + 12] /* width */
4054 movdqa xmm4, kARGBToYJ 4483 movdqa xmm4, kARGBToYJ
4055 movdqa xmm5, kAddYJ64 4484 movdqa xmm5, kAddYJ64
4056 4485
4057 convertloop: 4486 convertloop:
4058 movdqu xmm0, [eax] // G 4487 movdqu xmm0, [eax] // G
(...skipping 38 matching lines...)
4097 4526
4098 static const vec8 kARGBToSepiaG = { 4527 static const vec8 kARGBToSepiaG = {
4099 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 4528 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
4100 }; 4529 };
4101 4530
4102 static const vec8 kARGBToSepiaR = { 4531 static const vec8 kARGBToSepiaR = {
4103 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 4532 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
4104 }; 4533 };
4105 4534
4106 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 4535 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4107 __declspec(naked) __declspec(align(16)) 4536 __declspec(naked)
4108 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { 4537 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
4109 __asm { 4538 __asm {
4110 mov eax, [esp + 4] /* dst_argb */ 4539 mov eax, [esp + 4] /* dst_argb */
4111 mov ecx, [esp + 8] /* width */ 4540 mov ecx, [esp + 8] /* width */
4112 movdqa xmm2, kARGBToSepiaB 4541 movdqa xmm2, kARGBToSepiaB
4113 movdqa xmm3, kARGBToSepiaG 4542 movdqa xmm3, kARGBToSepiaG
4114 movdqa xmm4, kARGBToSepiaR 4543 movdqa xmm4, kARGBToSepiaR
4115 4544
4116 convertloop: 4545 convertloop:
4117 movdqu xmm0, [eax] // B 4546 movdqu xmm0, [eax] // B
(...skipping 36 matching lines...)
4154 ret 4583 ret
4155 } 4584 }
4156 } 4585 }
4157 #endif // HAS_ARGBSEPIAROW_SSSE3 4586 #endif // HAS_ARGBSEPIAROW_SSSE3
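
The sepia weights above are the usual sepia matrix scaled by 128, applied per channel with a >> 7 in the elided loop. A scalar sketch; the kARGBToSepiaB row is elided in this diff, so its weights {17, 68, 35} are assumed, and alpha is assumed preserved:

// Scalar sketch of the sepia transform (B-row weights assumed).
static uint32 Sepia_sketch(uint32 argb) {
  const uint32 b = argb & 0xff;
  const uint32 g = (argb >> 8) & 0xff;
  const uint32 r = (argb >> 16) & 0xff;
  const uint32 sb = (b * 17 + g * 68 + r * 35) >> 7;
  uint32 sg = (b * 22 + g * 88 + r * 45) >> 7;
  uint32 sr = (b * 24 + g * 98 + r * 50) >> 7;
  if (sg > 255) sg = 255;
  if (sr > 255) sr = 255;
  return (argb & 0xff000000) | (sr << 16) | (sg << 8) | sb;
}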
4158 4587
4159 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 4588 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4160 // Transform 8 ARGB pixels (32 bytes) with color matrix. 4589 // Transform 8 ARGB pixels (32 bytes) with color matrix.
4161 // Same as Sepia except matrix is provided. 4590 // Same as Sepia except matrix is provided.
4162 // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R 4591 // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
4163 // and B into a high and low, then G/A, punpckl/hbw and then punpckl/hwd. 4592 // and B into a high and low, then G/A, punpckl/hbw and then punpckl/hwd.
4164 __declspec(naked) __declspec(align(16)) 4593 __declspec(naked)
4165 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 4594 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4166 const int8* matrix_argb, int width) { 4595 const int8* matrix_argb, int width) {
4167 __asm { 4596 __asm {
4168 mov eax, [esp + 4] /* src_argb */ 4597 mov eax, [esp + 4] /* src_argb */
4169 mov edx, [esp + 8] /* dst_argb */ 4598 mov edx, [esp + 8] /* dst_argb */
4170 mov ecx, [esp + 12] /* matrix_argb */ 4599 mov ecx, [esp + 12] /* matrix_argb */
4171 movdqu xmm5, [ecx] 4600 movdqu xmm5, [ecx]
4172 pshufd xmm2, xmm5, 0x00 4601 pshufd xmm2, xmm5, 0x00
4173 pshufd xmm3, xmm5, 0x55 4602 pshufd xmm3, xmm5, 0x55
4174 pshufd xmm4, xmm5, 0xaa 4603 pshufd xmm4, xmm5, 0xaa
(...skipping 40 matching lines...)
4215 lea edx, [edx + 32] 4644 lea edx, [edx + 32]
4216 sub ecx, 8 4645 sub ecx, 8
4217 jg convertloop 4646 jg convertloop
4218 ret 4647 ret
4219 } 4648 }
4220 } 4649 }
4221 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 4650 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
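
For context, each output channel is a signed dot product of (B, G, R, A) with one row of matrix_argb. The coefficients are signed 8-bit values scaled by 64, so the sketch below normalizes with >> 6; that shift lives in the elided loop body and is an inference, not something visible in this hunk:

static uint8 MatClampByte(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Scalar sketch: one channel of the color-matrix transform.
static uint8 MatrixChannel_sketch(int b, int g, int r, int a,
                                  const int8* row) {
  return MatClampByte((b * row[0] + g * row[1] + r * row[2] + a * row[3])
                      >> 6);
}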
4222 4651
4223 #ifdef HAS_ARGBQUANTIZEROW_SSE2 4652 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4224 // Quantize 4 ARGB pixels (16 bytes). 4653 // Quantize 4 ARGB pixels (16 bytes).
4225 __declspec(naked) __declspec(align(16)) 4654 __declspec(naked)
4226 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, 4655 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
4227 int interval_offset, int width) { 4656 int interval_offset, int width) {
4228 __asm { 4657 __asm {
4229 mov eax, [esp + 4] /* dst_argb */ 4658 mov eax, [esp + 4] /* dst_argb */
4230 movd xmm2, [esp + 8] /* scale */ 4659 movd xmm2, [esp + 8] /* scale */
4231 movd xmm3, [esp + 12] /* interval_size */ 4660 movd xmm3, [esp + 12] /* interval_size */
4232 movd xmm4, [esp + 16] /* interval_offset */ 4661 movd xmm4, [esp + 16] /* interval_offset */
4233 mov ecx, [esp + 20] /* width */ 4662 mov ecx, [esp + 20] /* width */
4234 pshuflw xmm2, xmm2, 040h 4663 pshuflw xmm2, xmm2, 040h
4235 pshufd xmm2, xmm2, 044h 4664 pshufd xmm2, xmm2, 044h
(...skipping 24 matching lines...)
4260 lea eax, [eax + 16] 4689 lea eax, [eax + 16]
4261 sub ecx, 4 4690 sub ecx, 4
4262 jg convertloop 4691 jg convertloop
4263 ret 4692 ret
4264 } 4693 }
4265 } 4694 }
4266 #endif // HAS_ARGBQUANTIZEROW_SSE2 4695 #endif // HAS_ARGBQUANTIZEROW_SSE2
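
Reading the register setup, quantization snaps each color channel onto a lattice: (c * scale) >> 16 picks the interval index, then interval_size and interval_offset rebuild the output level. A scalar per-channel sketch (alpha is assumed untouched):

// Scalar sketch: dst = (c * scale >> 16) * interval_size + interval_offset.
static uint8 Quantize_sketch(uint8 c, int scale, int interval_size,
                             int interval_offset) {
  return (uint8)(((c * scale) >> 16) * interval_size + interval_offset);
}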
4267 4696
4268 #ifdef HAS_ARGBSHADEROW_SSE2 4697 #ifdef HAS_ARGBSHADEROW_SSE2
4269 // Shade 4 pixels at a time by specified value. 4698 // Shade 4 pixels at a time by specified value.
4270 __declspec(naked) __declspec(align(16)) 4699 __declspec(naked)
4271 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, 4700 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4272 uint32 value) { 4701 uint32 value) {
4273 __asm { 4702 __asm {
4274 mov eax, [esp + 4] // src_argb 4703 mov eax, [esp + 4] // src_argb
4275 mov edx, [esp + 8] // dst_argb 4704 mov edx, [esp + 8] // dst_argb
4276 mov ecx, [esp + 12] // width 4705 mov ecx, [esp + 12] // width
4277 movd xmm2, [esp + 16] // value 4706 movd xmm2, [esp + 16] // value
4278 punpcklbw xmm2, xmm2 4707 punpcklbw xmm2, xmm2
4279 punpcklqdq xmm2, xmm2 4708 punpcklqdq xmm2, xmm2
4280 4709
(...skipping 13 matching lines...)
4294 sub ecx, 4 4723 sub ecx, 4
4295 jg convertloop 4724 jg convertloop
4296 4725
4297 ret 4726 ret
4298 } 4727 }
4299 } 4728 }
4300 #endif // HAS_ARGBSHADEROW_SSE2 4729 #endif // HAS_ARGBSHADEROW_SSE2
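
Shading multiplies every channel by the matching byte of value. As with attenuate, punpcklbw against itself turns a byte x into x * 257 and pmulhuw keeps the high word, approximating c * v / 255; the loop body is elided here, so this reading is inferred from the setup. Sketch:

// Scalar sketch: shade one channel by one byte of 'value'.
static uint8 Shade_sketch(uint8 c, uint8 v) {
  return (uint8)(((uint32)c * 257 * ((uint32)v * 257)) >> 16);
}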
4301 4730
4302 #ifdef HAS_ARGBMULTIPLYROW_SSE2 4731 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4303 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 4732 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4304 __declspec(naked) __declspec(align(16)) 4733 __declspec(naked)
4305 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4734 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4306 uint8* dst_argb, int width) { 4735 uint8* dst_argb, int width) {
4307 __asm { 4736 __asm {
4308 push esi 4737 push esi
4309 mov eax, [esp + 4 + 4] // src_argb0 4738 mov eax, [esp + 4 + 4] // src_argb0
4310 mov esi, [esp + 4 + 8] // src_argb1 4739 mov esi, [esp + 4 + 8] // src_argb1
4311 mov edx, [esp + 4 + 12] // dst_argb 4740 mov edx, [esp + 4 + 12] // dst_argb
4312 mov ecx, [esp + 4 + 16] // width 4741 mov ecx, [esp + 4 + 16] // width
4313 pxor xmm5, xmm5 // constant 0 4742 pxor xmm5, xmm5 // constant 0
4314 4743
(...skipping 18 matching lines...)
4333 4762
4334 pop esi 4763 pop esi
4335 ret 4764 ret
4336 } 4765 }
4337 } 4766 }
4338 #endif // HAS_ARGBMULTIPLYROW_SSE2 4767 #endif // HAS_ARGBMULTIPLYROW_SSE2
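
The multiply kernel pairs a byte-duplicated operand (x * 257, via punpcklbw with itself) with a zero-extended one (unpacked against the xmm5 zero above), so pmulhuw computes ((x * 257) * y) >> 16, close to x * y / 255. One-channel sketch:

// Scalar sketch: per-channel multiply of two ARGB rows.
static uint8 Multiply_sketch(uint8 x, uint8 y) {
  return (uint8)(((uint32)x * 257 * (uint32)y) >> 16);
}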
4339 4768
4340 #ifdef HAS_ARGBADDROW_SSE2 4769 #ifdef HAS_ARGBADDROW_SSE2
4341 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 4770 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4342 // TODO(fbarchard): Port this to posix, neon and other math functions. 4771 // TODO(fbarchard): Port this to posix, neon and other math functions.
4343 __declspec(naked) __declspec(align(16)) 4772 __declspec(naked)
4344 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4773 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4345 uint8* dst_argb, int width) { 4774 uint8* dst_argb, int width) {
4346 __asm { 4775 __asm {
4347 push esi 4776 push esi
4348 mov eax, [esp + 4 + 4] // src_argb0 4777 mov eax, [esp + 4 + 4] // src_argb0
4349 mov esi, [esp + 4 + 8] // src_argb1 4778 mov esi, [esp + 4 + 8] // src_argb1
4350 mov edx, [esp + 4 + 12] // dst_argb 4779 mov edx, [esp + 4 + 12] // dst_argb
4351 mov ecx, [esp + 4 + 16] // width 4780 mov ecx, [esp + 4 + 16] // width
4352 4781
4353 sub ecx, 4 4782 sub ecx, 4
(...skipping 27 matching lines...)
4381 4810
4382 convertloop19: 4811 convertloop19:
4383 pop esi 4812 pop esi
4384 ret 4813 ret
4385 } 4814 }
4386 } 4815 }
4387 #endif // HAS_ARGBADDROW_SSE2 4816 #endif // HAS_ARGBADDROW_SSE2
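
The add kernels are plain paddusb, i.e. per-byte addition with unsigned saturation; the subtract kernels below are the psubusb mirror, clamping at 0. Sketch:

// Scalar sketch of paddusb: per-byte add, saturating at 255.
static uint8 AddSat_sketch(uint8 x, uint8 y) {
  const int s = x + y;
  return (uint8)(s > 255 ? 255 : s);
}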
4388 4817
4389 #ifdef HAS_ARGBSUBTRACTROW_SSE2 4818 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4390 // Subtract one row of ARGB pixels from another, 4 pixels at a time. 4819 // Subtract one row of ARGB pixels from another, 4 pixels at a time.
4391 __declspec(naked) __declspec(align(16)) 4820 __declspec(naked)
4392 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4821 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4393 uint8* dst_argb, int width) { 4822 uint8* dst_argb, int width) {
4394 __asm { 4823 __asm {
4395 push esi 4824 push esi
4396 mov eax, [esp + 4 + 4] // src_argb0 4825 mov eax, [esp + 4 + 4] // src_argb0
4397 mov esi, [esp + 4 + 8] // src_argb1 4826 mov esi, [esp + 4 + 8] // src_argb1
4398 mov edx, [esp + 4 + 12] // dst_argb 4827 mov edx, [esp + 4 + 12] // dst_argb
4399 mov ecx, [esp + 4 + 16] // width 4828 mov ecx, [esp + 4 + 16] // width
4400 4829
4401 convertloop: 4830 convertloop:
4402 movdqu xmm0, [eax] // read 4 pixels from src_argb0 4831 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4403 lea eax, [eax + 16] 4832 lea eax, [eax + 16]
4404 movdqu xmm1, [esi] // read 4 pixels from src_argb1 4833 movdqu xmm1, [esi] // read 4 pixels from src_argb1
4405 lea esi, [esi + 16] 4834 lea esi, [esi + 16]
4406 psubusb xmm0, xmm1 // src_argb0 - src_argb1 4835 psubusb xmm0, xmm1 // src_argb0 - src_argb1
4407 movdqu [edx], xmm0 4836 movdqu [edx], xmm0
4408 lea edx, [edx + 16] 4837 lea edx, [edx + 16]
4409 sub ecx, 4 4838 sub ecx, 4
4410 jg convertloop 4839 jg convertloop
4411 4840
4412 pop esi 4841 pop esi
4413 ret 4842 ret
4414 } 4843 }
4415 } 4844 }
4416 #endif // HAS_ARGBSUBTRACTROW_SSE2 4845 #endif // HAS_ARGBSUBTRACTROW_SSE2
4417 4846
4418 #ifdef HAS_ARGBMULTIPLYROW_AVX2 4847 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4419 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 4848 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
4420 __declspec(naked) __declspec(align(16)) 4849 __declspec(naked)
4421 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4850 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4422 uint8* dst_argb, int width) { 4851 uint8* dst_argb, int width) {
4423 __asm { 4852 __asm {
4424 push esi 4853 push esi
4425 mov eax, [esp + 4 + 4] // src_argb0 4854 mov eax, [esp + 4 + 4] // src_argb0
4426 mov esi, [esp + 4 + 8] // src_argb1 4855 mov esi, [esp + 4 + 8] // src_argb1
4427 mov edx, [esp + 4 + 12] // dst_argb 4856 mov edx, [esp + 4 + 12] // dst_argb
4428 mov ecx, [esp + 4 + 16] // width 4857 mov ecx, [esp + 4 + 16] // width
4429 vpxor ymm5, ymm5, ymm5 // constant 0 4858 vpxor ymm5, ymm5, ymm5 // constant 0
4430 4859
(...skipping 16 matching lines...) Expand all
4447 4876
4448 pop esi 4877 pop esi
4449 vzeroupper 4878 vzeroupper
4450 ret 4879 ret
4451 } 4880 }
4452 } 4881 }
4453 #endif // HAS_ARGBMULTIPLYROW_AVX2 4882 #endif // HAS_ARGBMULTIPLYROW_AVX2
4454 4883
4455 #ifdef HAS_ARGBADDROW_AVX2 4884 #ifdef HAS_ARGBADDROW_AVX2
4456 // Add 2 rows of ARGB pixels together, 8 pixels at a time. 4885 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
4457 __declspec(naked) __declspec(align(16)) 4886 __declspec(naked)
4458 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4887 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4459 uint8* dst_argb, int width) { 4888 uint8* dst_argb, int width) {
4460 __asm { 4889 __asm {
4461 push esi 4890 push esi
4462 mov eax, [esp + 4 + 4] // src_argb0 4891 mov eax, [esp + 4 + 4] // src_argb0
4463 mov esi, [esp + 4 + 8] // src_argb1 4892 mov esi, [esp + 4 + 8] // src_argb1
4464 mov edx, [esp + 4 + 12] // dst_argb 4893 mov edx, [esp + 4 + 12] // dst_argb
4465 mov ecx, [esp + 4 + 16] // width 4894 mov ecx, [esp + 4 + 16] // width
4466 4895
4467 convertloop: 4896 convertloop:
4468 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 4897 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
4469 lea eax, [eax + 32] 4898 lea eax, [eax + 32]
4470 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 4899 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
4471 lea esi, [esi + 32] 4900 lea esi, [esi + 32]
4472 vmovdqu [edx], ymm0 4901 vmovdqu [edx], ymm0
4473 lea edx, [edx + 32] 4902 lea edx, [edx + 32]
4474 sub ecx, 8 4903 sub ecx, 8
4475 jg convertloop 4904 jg convertloop
4476 4905
4477 pop esi 4906 pop esi
4478 vzeroupper 4907 vzeroupper
4479 ret 4908 ret
4480 } 4909 }
4481 } 4910 }
4482 #endif // HAS_ARGBADDROW_AVX2 4911 #endif // HAS_ARGBADDROW_AVX2
4483 4912
4484 #ifdef HAS_ARGBSUBTRACTROW_AVX2 4913 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4485 // Subtract one row of ARGB pixels from another, 8 pixels at a time. 4914 // Subtract one row of ARGB pixels from another, 8 pixels at a time.
4486 __declspec(naked) __declspec(align(16)) 4915 __declspec(naked)
4487 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4916 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4488 uint8* dst_argb, int width) { 4917 uint8* dst_argb, int width) {
4489 __asm { 4918 __asm {
4490 push esi 4919 push esi
4491 mov eax, [esp + 4 + 4] // src_argb0 4920 mov eax, [esp + 4 + 4] // src_argb0
4492 mov esi, [esp + 4 + 8] // src_argb1 4921 mov esi, [esp + 4 + 8] // src_argb1
4493 mov edx, [esp + 4 + 12] // dst_argb 4922 mov edx, [esp + 4 + 12] // dst_argb
4494 mov ecx, [esp + 4 + 16] // width 4923 mov ecx, [esp + 4 + 16] // width
4495 4924
4496 convertloop: 4925 convertloop:
(...skipping 11 matching lines...) Expand all
4508 ret 4937 ret
4509 } 4938 }
4510 } 4939 }
4511 #endif // HAS_ARGBSUBTRACTROW_AVX2 4940 #endif // HAS_ARGBSUBTRACTROW_AVX2
4512 4941
4513 #ifdef HAS_SOBELXROW_SSE2 4942 #ifdef HAS_SOBELXROW_SSE2
4514 // SobelX as a matrix is 4943 // SobelX as a matrix is
4515 // -1 0 1 4944 // -1 0 1
4516 // -2 0 2 4945 // -2 0 2
4517 // -1 0 1 4946 // -1 0 1
4518 __declspec(naked) __declspec(align(16)) 4947 __declspec(naked)
4519 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4948 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4520 const uint8* src_y2, uint8* dst_sobelx, int width) { 4949 const uint8* src_y2, uint8* dst_sobelx, int width) {
4521 __asm { 4950 __asm {
4522 push esi 4951 push esi
4523 push edi 4952 push edi
4524 mov eax, [esp + 8 + 4] // src_y0 4953 mov eax, [esp + 8 + 4] // src_y0
4525 mov esi, [esp + 8 + 8] // src_y1 4954 mov esi, [esp + 8 + 8] // src_y1
4526 mov edi, [esp + 8 + 12] // src_y2 4955 mov edi, [esp + 8 + 12] // src_y2
4527 mov edx, [esp + 8 + 16] // dst_sobelx 4956 mov edx, [esp + 8 + 16] // dst_sobelx
4528 mov ecx, [esp + 8 + 20] // width 4957 mov ecx, [esp + 8 + 20] // width
(...skipping 35 matching lines...)
4564 ret 4993 ret
4565 } 4994 }
4566 } 4995 }
4567 #endif // HAS_SOBELXROW_SSE2 4996 #endif // HAS_SOBELXROW_SSE2
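
Applied per output byte i, the matrix above reduces to column differences between the pixel two to the left and two to the right, weighted 1-2-1 across the three input rows, with the absolute value clamped to 255. A scalar sketch (row-edge handling elided):

// Scalar sketch of one SobelX output byte.
static uint8 SobelX_sketch(const uint8* y0, const uint8* y1,
                           const uint8* y2, int i) {
  int sobel = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
              (y2[i] - y2[i + 2]);
  if (sobel < 0) sobel = -sobel;
  return (uint8)(sobel > 255 ? 255 : sobel);
}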
4568 4997
4569 #ifdef HAS_SOBELYROW_SSE2 4998 #ifdef HAS_SOBELYROW_SSE2
4570 // SobelY as a matrix is 4999 // SobelY as a matrix is
4571 // -1 -2 -1 5000 // -1 -2 -1
4572 // 0 0 0 5001 // 0 0 0
4573 // 1 2 1 5002 // 1 2 1
4574 __declspec(naked) __declspec(align(16)) 5003 __declspec(naked)
4575 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, 5004 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4576 uint8* dst_sobely, int width) { 5005 uint8* dst_sobely, int width) {
4577 __asm { 5006 __asm {
4578 push esi 5007 push esi
4579 mov eax, [esp + 4 + 4] // src_y0 5008 mov eax, [esp + 4 + 4] // src_y0
4580 mov esi, [esp + 4 + 8] // src_y1 5009 mov esi, [esp + 4 + 8] // src_y1
4581 mov edx, [esp + 4 + 12] // dst_sobely 5010 mov edx, [esp + 4 + 12] // dst_sobely
4582 mov ecx, [esp + 4 + 16] // width 5011 mov ecx, [esp + 4 + 16] // width
4583 sub esi, eax 5012 sub esi, eax
4584 sub edx, eax 5013 sub edx, eax
(...skipping 32 matching lines...)
4617 } 5046 }
4618 } 5047 }
4619 #endif // HAS_SOBELYROW_SSE2 5048 #endif // HAS_SOBELYROW_SSE2
4620 5049
4621 #ifdef HAS_SOBELROW_SSE2 5050 #ifdef HAS_SOBELROW_SSE2
4622 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 5051 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
4623 // A = 255 5052 // A = 255
4624 // R = Sobel 5053 // R = Sobel
4625 // G = Sobel 5054 // G = Sobel
4626 // B = Sobel 5055 // B = Sobel
4627 __declspec(naked) __declspec(align(16)) 5056 __declspec(naked)
4628 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 5057 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4629 uint8* dst_argb, int width) { 5058 uint8* dst_argb, int width) {
4630 __asm { 5059 __asm {
4631 push esi 5060 push esi
4632 mov eax, [esp + 4 + 4] // src_sobelx 5061 mov eax, [esp + 4 + 4] // src_sobelx
4633 mov esi, [esp + 4 + 8] // src_sobely 5062 mov esi, [esp + 4 + 8] // src_sobely
4634 mov edx, [esp + 4 + 12] // dst_argb 5063 mov edx, [esp + 4 + 12] // dst_argb
4635 mov ecx, [esp + 4 + 16] // width 5064 mov ecx, [esp + 4 + 16] // width
4636 sub esi, eax 5065 sub esi, eax
4637 pcmpeqb xmm5, xmm5 // alpha 255 5066 pcmpeqb xmm5, xmm5 // alpha 255
(...skipping 26 matching lines...)
4664 jg convertloop 5093 jg convertloop
4665 5094
4666 pop esi 5095 pop esi
4667 ret 5096 ret
4668 } 5097 }
4669 } 5098 }
4670 #endif // HAS_SOBELROW_SSE2 5099 #endif // HAS_SOBELROW_SSE2
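
Per the layout comment, the combined kernel saturate-adds the two gradients and broadcasts the result to R, G and B with alpha 255. Sketch:

// Scalar sketch: s = sat(sobelx + sobely); pixel = (A=255, R=G=B=s).
static uint32 SobelPixel_sketch(uint8 sx, uint8 sy) {
  int s = sx + sy;
  if (s > 255) s = 255;
  return 0xff000000u | ((uint32)s << 16) | ((uint32)s << 8) | (uint32)s;
}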
4671 5100
4672 #ifdef HAS_SOBELTOPLANEROW_SSE2 5101 #ifdef HAS_SOBELTOPLANEROW_SSE2
4673 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 5102 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
4674 __declspec(naked) __declspec(align(16)) 5103 __declspec(naked)
4675 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 5104 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4676 uint8* dst_y, int width) { 5105 uint8* dst_y, int width) {
4677 __asm { 5106 __asm {
4678 push esi 5107 push esi
4679 mov eax, [esp + 4 + 4] // src_sobelx 5108 mov eax, [esp + 4 + 4] // src_sobelx
4680 mov esi, [esp + 4 + 8] // src_sobely 5109 mov esi, [esp + 4 + 8] // src_sobely
4681 mov edx, [esp + 4 + 12] // dst_y 5110 mov edx, [esp + 4 + 12] // dst_y
4682 mov ecx, [esp + 4 + 16] // width 5111 mov ecx, [esp + 4 + 16] // width
4683 sub esi, eax 5112 sub esi, eax
4684 5113
(...skipping 12 matching lines...)
4697 } 5126 }
4698 } 5127 }
4699 #endif // HAS_SOBELTOPLANEROW_SSE2 5128 #endif // HAS_SOBELTOPLANEROW_SSE2
4700 5129
4701 #ifdef HAS_SOBELXYROW_SSE2 5130 #ifdef HAS_SOBELXYROW_SSE2
4702 // Mixes Sobel X, Sobel Y and Sobel into ARGB. 5131 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
4703 // A = 255 5132 // A = 255
4704 // R = Sobel X 5133 // R = Sobel X
4705 // G = Sobel 5134 // G = Sobel
4706 // B = Sobel Y 5135 // B = Sobel Y
4707 __declspec(naked) __declspec(align(16)) 5136 __declspec(naked)
4708 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 5137 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4709 uint8* dst_argb, int width) { 5138 uint8* dst_argb, int width) {
4710 __asm { 5139 __asm {
4711 push esi 5140 push esi
4712 mov eax, [esp + 4 + 4] // src_sobelx 5141 mov eax, [esp + 4 + 4] // src_sobelx
4713 mov esi, [esp + 4 + 8] // src_sobely 5142 mov esi, [esp + 4 + 8] // src_sobely
4714 mov edx, [esp + 4 + 12] // dst_argb 5143 mov edx, [esp + 4 + 12] // dst_argb
4715 mov ecx, [esp + 4 + 16] // width 5144 mov ecx, [esp + 4 + 16] // width
4716 sub esi, eax 5145 sub esi, eax
4717 pcmpeqb xmm5, xmm5 // alpha 255 5146 pcmpeqb xmm5, xmm5 // alpha 255
(...skipping 266 matching lines...)
4984 sub ecx, 1 5413 sub ecx, 1
4985 jge l1 5414 jge l1
4986 5415
4987 l1b: 5416 l1b:
4988 } 5417 }
4989 } 5418 }
4990 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 5419 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
4991 5420
4992 #ifdef HAS_ARGBAFFINEROW_SSE2 5421 #ifdef HAS_ARGBAFFINEROW_SSE2
4993 // Copy ARGB pixels from source image with slope to a row of destination. 5422 // Copy ARGB pixels from source image with slope to a row of destination.
4994 __declspec(naked) __declspec(align(16)) 5423 __declspec(naked)
4995 LIBYUV_API 5424 LIBYUV_API
4996 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 5425 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
4997 uint8* dst_argb, const float* uv_dudv, int width) { 5426 uint8* dst_argb, const float* uv_dudv, int width) {
4998 __asm { 5427 __asm {
4999 push esi 5428 push esi
5000 push edi 5429 push edi
5001 mov eax, [esp + 12] // src_argb 5430 mov eax, [esp + 12] // src_argb
5002 mov esi, [esp + 16] // stride 5431 mov esi, [esp + 16] // stride
5003 mov edx, [esp + 20] // dst_argb 5432 mov edx, [esp + 20] // dst_argb
5004 mov ecx, [esp + 24] // pointer to uv_dudv 5433 mov ecx, [esp + 24] // pointer to uv_dudv
(...skipping 64 matching lines...)
5069 l1b: 5498 l1b:
5070 pop edi 5499 pop edi
5071 pop esi 5500 pop esi
5072 ret 5501 ret
5073 } 5502 }
5074 } 5503 }
5075 #endif // HAS_ARGBAFFINEROW_SSE2 5504 #endif // HAS_ARGBAFFINEROW_SSE2
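
For orientation: uv_dudv points at four floats {u, v, du, dv} (loaded by the movq pair in the elided setup), and each destination pixel fetches the source pixel at the current (u, v) before stepping. A scalar sketch under that layout reading; truncation toward zero would match cvttps2dq in the elided body:

// Scalar sketch: walk (u, v) by (du, dv), copying one ARGB pixel per step.
static void ARGBAffineRow_sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    const uint8* p = src_argb + (int)v * src_argb_stride + (int)u * 4;
    *(uint32*)(dst_argb + i * 4) = *(const uint32*)p;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}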
5076 5505
5077 #ifdef HAS_INTERPOLATEROW_AVX2 5506 #ifdef HAS_INTERPOLATEROW_AVX2
5078 // Bilinear filter 32x2 -> 32x1 5507 // Bilinear filter 32x2 -> 32x1
5079 __declspec(naked) __declspec(align(16)) 5508 __declspec(naked)
5080 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, 5509 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
5081 ptrdiff_t src_stride, int dst_width, 5510 ptrdiff_t src_stride, int dst_width,
5082 int source_y_fraction) { 5511 int source_y_fraction) {
5083 __asm { 5512 __asm {
5084 push esi 5513 push esi
5085 push edi 5514 push edi
5086 mov edi, [esp + 8 + 4] // dst_ptr 5515 mov edi, [esp + 8 + 4] // dst_ptr
5087 mov esi, [esp + 8 + 8] // src_ptr 5516 mov esi, [esp + 8 + 8] // src_ptr
5088 mov edx, [esp + 8 + 12] // src_stride 5517 mov edx, [esp + 8 + 12] // src_stride
5089 mov ecx, [esp + 8 + 16] // dst_width 5518 mov ecx, [esp + 8 + 16] // dst_width
(...skipping 76 matching lines...)
5166 xloop99: 5595 xloop99:
5167 pop edi 5596 pop edi
5168 pop esi 5597 pop esi
5169 vzeroupper 5598 vzeroupper
5170 ret 5599 ret
5171 } 5600 }
5172 } 5601 }
5173 #endif // HAS_INTERPOLATEROW_AVX2 5602 #endif // HAS_INTERPOLATEROW_AVX2
5174 5603
5175 // Bilinear filter 16x2 -> 16x1 5604 // Bilinear filter 16x2 -> 16x1
5176 __declspec(naked) __declspec(align(16)) 5605 __declspec(naked)
5177 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 5606 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
5178 ptrdiff_t src_stride, int dst_width, 5607 ptrdiff_t src_stride, int dst_width,
5179 int source_y_fraction) { 5608 int source_y_fraction) {
5180 __asm { 5609 __asm {
5181 push esi 5610 push esi
5182 push edi 5611 push edi
5183 mov edi, [esp + 8 + 4] // dst_ptr 5612 mov edi, [esp + 8 + 4] // dst_ptr
5184 mov esi, [esp + 8 + 8] // src_ptr 5613 mov esi, [esp + 8 + 8] // src_ptr
5185 mov edx, [esp + 8 + 12] // src_stride 5614 mov edx, [esp + 8 + 12] // src_stride
5186 mov ecx, [esp + 8 + 16] // dst_width 5615 mov ecx, [esp + 8 + 16] // dst_width
(...skipping 80 matching lines...)
5267 5696
5268 xloop99: 5697 xloop99:
5269 pop edi 5698 pop edi
5270 pop esi 5699 pop esi
5271 ret 5700 ret
5272 } 5701 }
5273 } 5702 }
5274 5703
5275 #ifdef HAS_INTERPOLATEROW_SSE2 5704 #ifdef HAS_INTERPOLATEROW_SSE2
5276 // Bilinear filter 16x2 -> 16x1 5705 // Bilinear filter 16x2 -> 16x1
5277 __declspec(naked) __declspec(align(16)) 5706 __declspec(naked)
5278 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, 5707 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
5279 ptrdiff_t src_stride, int dst_width, 5708 ptrdiff_t src_stride, int dst_width,
5280 int source_y_fraction) { 5709 int source_y_fraction) {
5281 __asm { 5710 __asm {
5282 push esi 5711 push esi
5283 push edi 5712 push edi
5284 mov edi, [esp + 8 + 4] // dst_ptr 5713 mov edi, [esp + 8 + 4] // dst_ptr
5285 mov esi, [esp + 8 + 8] // src_ptr 5714 mov esi, [esp + 8 + 8] // src_ptr
5286 mov edx, [esp + 8 + 12] // src_stride 5715 mov edx, [esp + 8 + 12] // src_stride
5287 mov ecx, [esp + 8 + 16] // dst_width 5716 mov ecx, [esp + 8 + 16] // dst_width
(...skipping 85 matching lines...)
5373 jg xloop100 5802 jg xloop100
5374 5803
5375 xloop99: 5804 xloop99:
5376 pop edi 5805 pop edi
5377 pop esi 5806 pop esi
5378 ret 5807 ret
5379 } 5808 }
5380 } 5809 }
5381 #endif // HAS_INTERPOLATEROW_SSE2 5810 #endif // HAS_INTERPOLATEROW_SSE2
5382 5811
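All three InterpolateRow variants above implement the same per-byte blend of two rows; a scalar sketch of the general case follows (the SIMD code also special-cases certain fractions for speed, elided here except for the zero-fraction copy). Names are illustrative.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Scalar sketch of the bilinear row filter: each output byte is a
// weighted average of the same byte in two adjacent source rows,
// weighted by source_y_fraction/256.
static void InterpolateRow_C_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  if (y1_fraction == 0) {  // 100% from the top row: plain copy
    memcpy(dst_ptr, src_ptr, dst_width);
    return;
  }
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] =
        (uint8_t)((src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction) >> 8);
  }
}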
5383 // Specialized ARGB to Bayer that just isolates G channel.
5384 __declspec(naked) __declspec(align(16))
5385 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
5386 uint32 selector, int pix) {
5387 __asm {
5388 mov eax, [esp + 4] // src_argb
5389 mov edx, [esp + 8] // dst_bayer
5390 // selector
5391 mov ecx, [esp + 16] // pix
5392 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
5393 psrld xmm5, 24
5394
5395 wloop:
5396 movdqu xmm0, [eax]
5397 movdqu xmm1, [eax + 16]
5398 lea eax, [eax + 32]
5399 psrld xmm0, 8 // Move green to bottom.
5400 psrld xmm1, 8
5401 pand xmm0, xmm5
5402 pand xmm1, xmm5
5403 packssdw xmm0, xmm1
5404 packuswb xmm0, xmm1
5405 movq qword ptr [edx], xmm0
5406 lea edx, [edx + 8]
5407 sub ecx, 8
5408 jg wloop
5409 ret
5410 }
5411 }
5412
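The deleted ARGBToBayerGGRow_SSE2 above reduces, per pixel, to copying the G byte (offset 1 of each little-endian BGRA-ordered ARGB pixel, which is why the asm shifts each dword right by 8 and masks). A scalar sketch with an illustrative name:

#include <stdint.h>

// Scalar sketch: isolate the green channel of each ARGB pixel.
static void ARGBToBayerGGRow_C_sketch(const uint8_t* src_argb,
                                      uint8_t* dst_bayer, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_bayer[x] = src_argb[x * 4 + 1];  // G is byte 1 in memory order
  }
}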
5413 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5812 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5414 __declspec(naked) __declspec(align(16)) 5813 __declspec(naked)
5415 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 5814 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5416 const uint8* shuffler, int pix) { 5815 const uint8* shuffler, int pix) {
5417 __asm { 5816 __asm {
5418 mov eax, [esp + 4] // src_argb 5817 mov eax, [esp + 4] // src_argb
5419 mov edx, [esp + 8] // dst_argb 5818 mov edx, [esp + 8] // dst_argb
5420 mov ecx, [esp + 12] // shuffler 5819 mov ecx, [esp + 12] // shuffler
5421 movdqu xmm5, [ecx] 5820 movdqu xmm5, [ecx]
5422 mov ecx, [esp + 16] // pix 5821 mov ecx, [esp + 16] // pix
5423 5822
5424 wloop: 5823 wloop:
5425 movdqu xmm0, [eax] 5824 movdqu xmm0, [eax]
5426 movdqu xmm1, [eax + 16] 5825 movdqu xmm1, [eax + 16]
5427 lea eax, [eax + 32] 5826 lea eax, [eax + 32]
5428 pshufb xmm0, xmm5 5827 pshufb xmm0, xmm5
5429 pshufb xmm1, xmm5 5828 pshufb xmm1, xmm5
5430 movdqu [edx], xmm0 5829 movdqu [edx], xmm0
5431 movdqu [edx + 16], xmm1 5830 movdqu [edx + 16], xmm1
5432 lea edx, [edx + 32] 5831 lea edx, [edx + 32]
5433 sub ecx, 8 5832 sub ecx, 8
5434 jg wloop 5833 jg wloop
5435 ret 5834 ret
5436 } 5835 }
5437 } 5836 }
5438 5837
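In scalar terms, the pshufb above performs the same 4-byte permutation on every pixel: shuffler[i] names which source byte supplies output byte i. A minimal sketch, assuming pix is the pixel count:

#include <stdint.h>

// Scalar sketch of the channel shuffle used for BGRA/ABGR/RGBA <-> ARGB.
static void ARGBShuffleRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                    const uint8_t* shuffler, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_argb[x * 4 + 0] = src_argb[x * 4 + shuffler[0]];
    dst_argb[x * 4 + 1] = src_argb[x * 4 + shuffler[1]];
    dst_argb[x * 4 + 2] = src_argb[x * 4 + shuffler[2]];
    dst_argb[x * 4 + 3] = src_argb[x * 4 + shuffler[3]];
  }
}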
5439 #ifdef HAS_ARGBSHUFFLEROW_AVX2 5838 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5440 __declspec(naked) __declspec(align(16)) 5839 __declspec(naked)
5441 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, 5840 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5442 const uint8* shuffler, int pix) { 5841 const uint8* shuffler, int pix) {
5443 __asm { 5842 __asm {
5444 mov eax, [esp + 4] // src_argb 5843 mov eax, [esp + 4] // src_argb
5445 mov edx, [esp + 8] // dst_argb 5844 mov edx, [esp + 8] // dst_argb
5446 mov ecx, [esp + 12] // shuffler 5845 mov ecx, [esp + 12] // shuffler
5447 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 5846 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
5448 mov ecx, [esp + 16] // pix 5847 mov ecx, [esp + 16] // pix
5449 5848
5450 wloop: 5849 wloop:
5451 vmovdqu ymm0, [eax] 5850 vmovdqu ymm0, [eax]
5452 vmovdqu ymm1, [eax + 32] 5851 vmovdqu ymm1, [eax + 32]
5453 lea eax, [eax + 64] 5852 lea eax, [eax + 64]
5454 vpshufb ymm0, ymm0, ymm5 5853 vpshufb ymm0, ymm0, ymm5
5455 vpshufb ymm1, ymm1, ymm5 5854 vpshufb ymm1, ymm1, ymm5
5456 vmovdqu [edx], ymm0 5855 vmovdqu [edx], ymm0
5457 vmovdqu [edx + 32], ymm1 5856 vmovdqu [edx + 32], ymm1
5458 lea edx, [edx + 64] 5857 lea edx, [edx + 64]
5459 sub ecx, 16 5858 sub ecx, 16
5460 jg wloop 5859 jg wloop
5461 5860
5462 vzeroupper 5861 vzeroupper
5463 ret 5862 ret
5464 } 5863 }
5465 } 5864 }
5466 #endif // HAS_ARGBSHUFFLEROW_AVX2 5865 #endif // HAS_ARGBSHUFFLEROW_AVX2
5467 5866
5468 __declspec(naked) __declspec(align(16)) 5867 __declspec(naked)
5469 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, 5868 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5470 const uint8* shuffler, int pix) { 5869 const uint8* shuffler, int pix) {
5471 __asm { 5870 __asm {
5472 push ebx 5871 push ebx
5473 push esi 5872 push esi
5474 mov eax, [esp + 8 + 4] // src_argb 5873 mov eax, [esp + 8 + 4] // src_argb
5475 mov edx, [esp + 8 + 8] // dst_argb 5874 mov edx, [esp + 8 + 8] // dst_argb
5476 mov esi, [esp + 8 + 12] // shuffler 5875 mov esi, [esp + 8 + 12] // shuffler
5477 mov ecx, [esp + 8 + 16] // pix 5876 mov ecx, [esp + 8 + 16] // pix
5478 pxor xmm5, xmm5 5877 pxor xmm5, xmm5
(...skipping 101 matching lines...)
5580 ret 5979 ret
5581 } 5980 }
5582 } 5981 }
5583 5982
5584 // YUY2 - Macro-pixel = 2 image pixels 5983 // YUY2 - Macro-pixel = 2 image pixels
5585 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 5984 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5586 5985
5587 // UYVY - Macro-pixel = 2 image pixels 5986 // UYVY - Macro-pixel = 2 image pixels
5588 // U0Y0V0Y1 5987 // U0Y0V0Y1
5589 5988
5590 __declspec(naked) __declspec(align(16)) 5989 __declspec(naked)
5591 void I422ToYUY2Row_SSE2(const uint8* src_y, 5990 void I422ToYUY2Row_SSE2(const uint8* src_y,
5592 const uint8* src_u, 5991 const uint8* src_u,
5593 const uint8* src_v, 5992 const uint8* src_v,
5594 uint8* dst_frame, int width) { 5993 uint8* dst_frame, int width) {
5595 __asm { 5994 __asm {
5596 push esi 5995 push esi
5597 push edi 5996 push edi
5598 mov eax, [esp + 8 + 4] // src_y 5997 mov eax, [esp + 8 + 4] // src_y
5599 mov esi, [esp + 8 + 8] // src_u 5998 mov esi, [esp + 8 + 8] // src_u
5600 mov edx, [esp + 8 + 12] // src_v 5999 mov edx, [esp + 8 + 12] // src_v
(...skipping 16 matching lines...)
5617 lea edi, [edi + 32] 6016 lea edi, [edi + 32]
5618 sub ecx, 16 6017 sub ecx, 16
5619 jg convertloop 6018 jg convertloop
5620 6019
5621 pop edi 6020 pop edi
5622 pop esi 6021 pop esi
5623 ret 6022 ret
5624 } 6023 }
5625 } 6024 }
5626 6025
5627 __declspec(naked) __declspec(align(16)) 6026 __declspec(naked)
5628 void I422ToUYVYRow_SSE2(const uint8* src_y, 6027 void I422ToUYVYRow_SSE2(const uint8* src_y,
5629 const uint8* src_u, 6028 const uint8* src_u,
5630 const uint8* src_v, 6029 const uint8* src_v,
5631 uint8* dst_frame, int width) { 6030 uint8* dst_frame, int width) {
5632 __asm { 6031 __asm {
5633 push esi 6032 push esi
5634 push edi 6033 push edi
5635 mov eax, [esp + 8 + 4] // src_y 6034 mov eax, [esp + 8 + 4] // src_y
5636 mov esi, [esp + 8 + 8] // src_u 6035 mov esi, [esp + 8 + 8] // src_u
5637 mov edx, [esp + 8 + 12] // src_v 6036 mov edx, [esp + 8 + 12] // src_v
(...skipping 17 matching lines...)
5655 sub ecx, 16 6054 sub ecx, 16
5656 jg convertloop 6055 jg convertloop
5657 6056
5658 pop edi 6057 pop edi
5659 pop esi 6058 pop esi
5660 ret 6059 ret
5661 } 6060 }
5662 } 6061 }
5663 6062
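Both packers above interleave two Y samples with one shared U/V pair per 4-byte macro-pixel. A scalar sketch of the YUY2 ordering (UYVY is the same loop with bytes emitted as U, Y0, V, Y1); width is assumed even, as the macro-pixel comments imply:

#include <stdint.h>

// Scalar sketch of I422 -> YUY2 packing: macro-pixel is Y0 U Y1 V.
static void I422ToYUY2Row_C_sketch(const uint8_t* src_y, const uint8_t* src_u,
                                   const uint8_t* src_v, uint8_t* dst_frame,
                                   int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}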
5664 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 6063 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
5665 __declspec(naked) __declspec(align(16)) 6064 __declspec(naked)
5666 void ARGBPolynomialRow_SSE2(const uint8* src_argb, 6065 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5667 uint8* dst_argb, const float* poly, 6066 uint8* dst_argb, const float* poly,
5668 int width) { 6067 int width) {
5669 __asm { 6068 __asm {
5670 push esi 6069 push esi
5671 mov eax, [esp + 4 + 4] /* src_argb */ 6070 mov eax, [esp + 4 + 4] /* src_argb */
5672 mov edx, [esp + 4 + 8] /* dst_argb */ 6071 mov edx, [esp + 4 + 8] /* dst_argb */
5673 mov esi, [esp + 4 + 12] /* poly */ 6072 mov esi, [esp + 4 + 12] /* poly */
5674 mov ecx, [esp + 4 + 16] /* width */ 6073 mov ecx, [esp + 4 + 16] /* width */
5675 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. 6074 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
(...skipping 38 matching lines...)
5714 lea edx, [edx + 8] 6113 lea edx, [edx + 8]
5715 sub ecx, 2 6114 sub ecx, 2
5716 jg convertloop 6115 jg convertloop
5717 pop esi 6116 pop esi
5718 ret 6117 ret
5719 } 6118 }
5720 } 6119 }
5721 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 6120 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
5722 6121
5723 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 6122 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
5724 __declspec(naked) __declspec(align(16)) 6123 __declspec(naked)
5725 void ARGBPolynomialRow_AVX2(const uint8* src_argb, 6124 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
5726 uint8* dst_argb, const float* poly, 6125 uint8* dst_argb, const float* poly,
5727 int width) { 6126 int width) {
5728 __asm { 6127 __asm {
5729 mov eax, [esp + 4] /* src_argb */ 6128 mov eax, [esp + 4] /* src_argb */
5730 mov edx, [esp + 8] /* dst_argb */ 6129 mov edx, [esp + 8] /* dst_argb */
5731 mov ecx, [esp + 12] /* poly */ 6130 mov ecx, [esp + 12] /* poly */
5732 vbroadcastf128 ymm4, [ecx] // C0 6131 vbroadcastf128 ymm4, [ecx] // C0
5733 vbroadcastf128 ymm5, [ecx + 16] // C1 6132 vbroadcastf128 ymm5, [ecx + 16] // C1
5734 vbroadcastf128 ymm6, [ecx + 32] // C2 6133 vbroadcastf128 ymm6, [ecx + 32] // C2
(...skipping 19 matching lines...)
5754 sub ecx, 2 6153 sub ecx, 2
5755 jg convertloop 6154 jg convertloop
5756 vzeroupper 6155 vzeroupper
5757 ret 6156 ret
5758 } 6157 }
5759 } 6158 }
5760 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 6159 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
5761 6160
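Reading the coefficient loads above (C0 at poly+0, C1 at +16, C2 at +32, C3 at +48, one float per channel), the polynomial rows evaluate a per-channel cubic and clamp the result back to a byte. A scalar sketch under that reading, with an illustrative name:

#include <stdint.h>

// Scalar sketch: dst = clamp(C0 + C1*x + C2*x^2 + C3*x^3) per channel,
// where poly holds 4 sets of 4 per-channel float coefficients.
static void ARGBPolynomialRow_C_sketch(const uint8_t* src_argb,
                                       uint8_t* dst_argb, const float* poly,
                                       int width) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      float x = (float)src_argb[i * 4 + ch];
      float v = poly[ch] + poly[ch + 4] * x + poly[ch + 8] * x * x +
                poly[ch + 12] * x * x * x;
      if (v < 0.0f) v = 0.0f;      // clamp as the packus step does
      if (v > 255.0f) v = 255.0f;
      dst_argb[i * 4 + ch] = (uint8_t)v;
    }
  }
}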
5762 #ifdef HAS_ARGBCOLORTABLEROW_X86 6161 #ifdef HAS_ARGBCOLORTABLEROW_X86
5763 // Transform ARGB pixels with color table. 6162 // Transform ARGB pixels with color table.
5764 __declspec(naked) __declspec(align(16)) 6163 __declspec(naked)
5765 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, 6164 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
5766 int width) { 6165 int width) {
5767 __asm { 6166 __asm {
5768 push esi 6167 push esi
5769 mov eax, [esp + 4 + 4] /* dst_argb */ 6168 mov eax, [esp + 4 + 4] /* dst_argb */
5770 mov esi, [esp + 4 + 8] /* table_argb */ 6169 mov esi, [esp + 4 + 8] /* table_argb */
5771 mov ecx, [esp + 4 + 12] /* width */ 6170 mov ecx, [esp + 4 + 12] /* width */
5772 6171
5773 // 1 pixel loop. 6172 // 1 pixel loop.
5774 convertloop: 6173 convertloop:
(...skipping 13 matching lines...)
5788 dec ecx 6187 dec ecx
5789 jg convertloop 6188 jg convertloop
5790 pop esi 6189 pop esi
5791 ret 6190 ret
5792 } 6191 }
5793 } 6192 }
5794 #endif // HAS_ARGBCOLORTABLEROW_X86 6193 #endif // HAS_ARGBCOLORTABLEROW_X86
5795 6194
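Per the movzx/mov pairs above, each channel indexes its own column of a 256-entry, 4-byte-stride table, in place (the RGB variant below is identical except it leaves alpha alone). A scalar sketch:

#include <stdint.h>

// Scalar sketch of the in-place ARGB color table: channel c of each
// pixel becomes table_argb[value * 4 + c].
static void ARGBColorTableRow_C_sketch(uint8_t* dst_argb,
                                       const uint8_t* table_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t b = dst_argb[0], g = dst_argb[1], r = dst_argb[2], a = dst_argb[3];
    dst_argb[0] = table_argb[b * 4 + 0];
    dst_argb[1] = table_argb[g * 4 + 1];
    dst_argb[2] = table_argb[r * 4 + 2];
    dst_argb[3] = table_argb[a * 4 + 3];
    dst_argb += 4;
  }
}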
5796 #ifdef HAS_RGBCOLORTABLEROW_X86 6195 #ifdef HAS_RGBCOLORTABLEROW_X86
5797 // Transform RGB pixels with color table. 6196 // Transform RGB pixels with color table.
5798 __declspec(naked) __declspec(align(16)) 6197 __declspec(naked)
5799 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { 6198 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
5800 __asm { 6199 __asm {
5801 push esi 6200 push esi
5802 mov eax, [esp + 4 + 4] /* dst_argb */ 6201 mov eax, [esp + 4 + 4] /* dst_argb */
5803 mov esi, [esp + 4 + 8] /* table_argb */ 6202 mov esi, [esp + 4 + 8] /* table_argb */
5804 mov ecx, [esp + 4 + 12] /* width */ 6203 mov ecx, [esp + 4 + 12] /* width */
5805 6204
5806 // 1 pixel loop. 6205 // 1 pixel loop.
5807 convertloop: 6206 convertloop:
5808 movzx edx, byte ptr [eax] 6207 movzx edx, byte ptr [eax]
(...skipping 10 matching lines...)
5819 jg convertloop 6218 jg convertloop
5820 6219
5821 pop esi 6220 pop esi
5822 ret 6221 ret
5823 } 6222 }
5824 } 6223 }
5825 #endif // HAS_RGBCOLORTABLEROW_X86 6224 #endif // HAS_RGBCOLORTABLEROW_X86
5826 6225
5827 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 6226 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
5828 // Transform RGB pixels with luma table. 6227 // Transform RGB pixels with luma table.
5829 __declspec(naked) __declspec(align(16)) 6228 __declspec(naked)
5830 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 6229 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5831 int width, 6230 int width,
5832 const uint8* luma, uint32 lumacoeff) { 6231 const uint8* luma, uint32 lumacoeff) {
5833 __asm { 6232 __asm {
5834 push esi 6233 push esi
5835 push edi 6234 push edi
5836 mov eax, [esp + 8 + 4] /* src_argb */ 6235 mov eax, [esp + 8 + 4] /* src_argb */
5837 mov edi, [esp + 8 + 8] /* dst_argb */ 6236 mov edi, [esp + 8 + 8] /* dst_argb */
5838 mov ecx, [esp + 8 + 12] /* width */ 6237 mov ecx, [esp + 8 + 12] /* width */
5839 movd xmm2, dword ptr [esp + 8 + 16] // luma table 6238 movd xmm2, dword ptr [esp + 8 + 16] // luma table
(...skipping 77 matching lines...)
5917 jg convertloop 6316 jg convertloop
5918 6317
5919 pop edi 6318 pop edi
5920 pop esi 6319 pop esi
5921 ret 6320 ret
5922 } 6321 }
5923 } 6322 }
5924 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6323 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5925 6324
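The luma variant first forms a weighted sum of B, G, R (weights packed into the low three bytes of lumacoeff), masks it to a multiple of 256 to pick one row of a 128 x 256 luma table, then looks each color channel up in that row while alpha passes through. A hedged scalar sketch modeled on libyuv's C reference for this row, which may differ from the asm in rounding details:

#include <stdint.h>

// Scalar sketch of the luma-dependent color table. 'luma' is assumed
// to be a 32 KiB table of 128 rows x 256 entries.
static void ARGBLumaColorTableRow_C_sketch(const uint8_t* src_argb,
                                           uint8_t* dst_argb, int width,
                                           const uint8_t* luma,
                                           uint32_t lumacoeff) {
  uint32_t bc = lumacoeff & 0xff;          // blue weight
  uint32_t gc = (lumacoeff >> 8) & 0xff;   // green weight
  uint32_t rc = (lumacoeff >> 16) & 0xff;  // red weight
  for (int i = 0; i < width; ++i) {
    const uint8_t* row = luma +
        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u);
    dst_argb[0] = row[src_argb[0]];
    dst_argb[1] = row[src_argb[1]];
    dst_argb[2] = row[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}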
5926 #endif // defined(_M_X64) 6325 #endif // defined(_M_X64)
5927 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) 6326 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
5928 6327
5929 #ifdef __cplusplus 6328 #ifdef __cplusplus
5930 } // extern "C" 6329 } // extern "C"
5931 } // namespace libyuv 6330 } // namespace libyuv
5932 #endif 6331 #endif