OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "libyuv/row.h" | 11 #include "libyuv/row.h" |
12 | 12 |
13 #if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) | 13 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \ |
| 14 defined(_MSC_VER) && !defined(__clang__) |
14 #include <emmintrin.h> | 15 #include <emmintrin.h> |
15 #include <tmmintrin.h> // For _mm_maddubs_epi16 | 16 #include <tmmintrin.h> // For _mm_maddubs_epi16 |
16 #endif | 17 #endif |
17 | 18 |
18 #ifdef __cplusplus | 19 #ifdef __cplusplus |
19 namespace libyuv { | 20 namespace libyuv { |
20 extern "C" { | 21 extern "C" { |
21 #endif | 22 #endif |
22 | 23 |
23 // This module is for Visual C. | 24 // This module is for Visual C. |
24 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ | 25 #if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \ |
25 (defined(_M_IX86) || defined(_M_X64)) | 26 defined(_MSC_VER) && !defined(__clang__) |
26 | |
27 // YUV to RGB conversion constants. | |
28 // Y contribution to R,G,B. Scale and bias. | |
29 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ | |
30 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ | |
31 | |
32 // U and V contributions to R,G,B. | |
33 #define UB -128 /* -min(128, round(2.018 * 64)) */ | |
34 #define UG 25 /* -round(-0.391 * 64) */ | |
35 #define VG 52 /* -round(-0.813 * 64) */ | |
36 #define VR -102 /* -round(1.596 * 64) */ | |
37 | |
38 // Bias values to subtract 16 from Y and 128 from U and V. | |
39 #define BB (UB * 128 - YGB) | |
40 #define BG (UG * 128 + VG * 128 - YGB) | |
41 #define BR (VR * 128 - YGB) | |
42 | 27 |
43 struct YuvConstants { | 28 struct YuvConstants { |
44 lvec8 kUVToB; // 0 | 29 lvec8 kUVToB; // 0 |
45 lvec8 kUVToG; // 32 | 30 lvec8 kUVToG; // 32 |
46 lvec8 kUVToR; // 64 | 31 lvec8 kUVToR; // 64 |
47 lvec16 kUVBiasB; // 96 | 32 lvec16 kUVBiasB; // 96 |
48 lvec16 kUVBiasG; // 128 | 33 lvec16 kUVBiasG; // 128 |
49 lvec16 kUVBiasR; // 160 | 34 lvec16 kUVBiasR; // 160 |
50 lvec16 kYToRgb; // 192 | 35 lvec16 kYToRgb; // 192 |
51 }; | 36 }; |
52 | 37 |
| 38 // BT.601 YUV to RGB reference |
| 39 // R = (Y - 16) * 1.164 - V * -1.596 |
| 40 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 |
| 41 // B = (Y - 16) * 1.164 - U * -2.018 |
| 42 |
| 43 // Y contribution to R,G,B. Scale and bias. |
| 44 // TODO(fbarchard): Consider moving constants into a common header. |
| 45 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ |
| 46 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ |
| 47 |
| 48 // U and V contributions to R,G,B. |
| 49 #define UB -128 /* max(-128, round(-2.018 * 64)) */ |
| 50 #define UG 25 /* round(0.391 * 64) */ |
| 51 #define VG 52 /* round(0.813 * 64) */ |
| 52 #define VR -102 /* round(-1.596 * 64) */ |
| 53 |
| 54 // Bias values to subtract 16 from Y and 128 from U and V. |
| 55 #define BB (UB * 128 + YGB) |
| 56 #define BG (UG * 128 + VG * 128 + YGB) |
| 57 #define BR (VR * 128 + YGB) |
| 58 |
53 // BT601 constants for YUV to RGB. | 59 // BT601 constants for YUV to RGB. |
54 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { | 60 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { |
55 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, | 61 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, |
56 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, | 62 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, |
57 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, | 63 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, |
58 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, | 64 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, |
59 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, | 65 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, |
60 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, | 66 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, |
61 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 67 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
62 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 68 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
63 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 69 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
64 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 70 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
65 }; | 71 }; |
66 | 72 |
67 // BT601 constants for NV21 where chroma plane is VU instead of UV. | 73 // BT601 constants for NV21 where chroma plane is VU instead of UV. |
68 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { | 74 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
69 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, | 75 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, |
70 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, | 76 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, |
71 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, | 77 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, |
72 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, | 78 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, |
73 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, | 79 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, |
74 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, | 80 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, |
75 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 81 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
76 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 82 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
77 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 83 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
78 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 84 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
79 }; | 85 }; |
80 | 86 |
| 87 #undef YG |
| 88 #undef YGB |
| 89 #undef UB |
| 90 #undef UG |
| 91 #undef VG |
| 92 #undef VR |
| 93 #undef BB |
| 94 #undef BG |
| 95 #undef BR |
| 96 |
| 97 // JPEG YUV to RGB reference |
| 98 // * R = Y - V * -1.40200 |
| 99 // * G = Y - U * 0.34414 - V * 0.71414 |
| 100 // * B = Y - U * -1.77200 |
| 101 |
| 102 // Y contribution to R,G,B. Scale and bias. |
| 103 // TODO(fbarchard): Consider moving constants into a common header. |
| 104 #define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ |
| 105 #define YGBJ 32 /* 64 / 2 */ |
| 106 |
| 107 // U and V contributions to R,G,B. |
| 108 #define UBJ -113 /* round(-1.77200 * 64) */ |
| 109 #define UGJ 22 /* round(0.34414 * 64) */ |
| 110 #define VGJ 46 /* round(0.71414 * 64) */ |
| 111 #define VRJ -90 /* round(-1.40200 * 64) */ |
| 112 |
| 113 // Bias values to subtract 16 from Y and 128 from U and V. |
| 114 #define BBJ (UBJ * 128 + YGBJ) |
| 115 #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) |
| 116 #define BRJ (VRJ * 128 + YGBJ) |
| 117 |
| 118 // JPEG constants for YUV to RGB. |
| 119 static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { |
| 120 { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, |
| 121 UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, |
| 122 { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 123 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 124 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 125 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, |
| 126 { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, |
| 127 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, |
| 128 { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, |
| 129 BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, |
| 130 { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, |
| 131 BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, |
| 132 { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, |
| 133 BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, |
| 134 { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, |
| 135 YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } |
| 136 }; |
| 137 |
| 138 #undef YGJ |
| 139 #undef YGBJ |
| 140 #undef UBJ |
| 141 #undef UGJ |
| 142 #undef VGJ |
| 143 #undef VRJ |
| 144 #undef BBJ |
| 145 #undef BGJ |
| 146 #undef BRJ |
| 147 |
81 // 64 bit | 148 // 64 bit |
82 #if defined(_M_X64) | 149 #if defined(_M_X64) |
83 | 150 #if defined(HAS_I422TOARGBROW_SSSE3) |
84 __declspec(align(16)) | |
85 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 151 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
86 const uint8* u_buf, | 152 const uint8* u_buf, |
87 const uint8* v_buf, | 153 const uint8* v_buf, |
88 uint8* dst_argb, | 154 uint8* dst_argb, |
89 int width) { | 155 int width) { |
90 __m128i xmm0, xmm1, xmm2, xmm3; | 156 __m128i xmm0, xmm1, xmm2, xmm3; |
91 const __m128i xmm5 = _mm_set1_epi8(-1); | 157 const __m128i xmm5 = _mm_set1_epi8(-1); |
92 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; | 158 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; |
93 | 159 |
94 while (width > 0) { | 160 while (width > 0) { |
(...skipping 29 matching lines...) Expand all Loading... |
124 | 190 |
125 _mm_storeu_si128((__m128i *)dst_argb, xmm0); | 191 _mm_storeu_si128((__m128i *)dst_argb, xmm0); |
126 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); | 192 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); |
127 | 193 |
128 y_buf += 8; | 194 y_buf += 8; |
129 u_buf += 4; | 195 u_buf += 4; |
130 dst_argb += 32; | 196 dst_argb += 32; |
131 width -= 8; | 197 width -= 8; |
132 } | 198 } |
133 } | 199 } |
134 | 200 #endif |
135 // 32 bit | 201 // 32 bit |
136 #else // defined(_M_X64) | 202 #else // defined(_M_X64) |
137 | |
138 #ifdef HAS_ARGBTOYROW_SSSE3 | 203 #ifdef HAS_ARGBTOYROW_SSSE3 |
139 | 204 |
140 // Constants for ARGB. | 205 // Constants for ARGB. |
141 static const vec8 kARGBToY = { | 206 static const vec8 kARGBToY = { |
142 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 | 207 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 |
143 }; | 208 }; |
144 | 209 |
145 // JPeg full range. | 210 // JPeg full range. |
146 static const vec8 kARGBToYJ = { | 211 static const vec8 kARGBToYJ = { |
147 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 | 212 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 |
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
250 static const uvec8 kShuffleMaskARGBToRGB24_0 = { | 315 static const uvec8 kShuffleMaskARGBToRGB24_0 = { |
251 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u | 316 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u |
252 }; | 317 }; |
253 | 318 |
254 // Shuffle table for converting ARGB to RAW. | 319 // Shuffle table for converting ARGB to RAW. |
255 static const uvec8 kShuffleMaskARGBToRAW_0 = { | 320 static const uvec8 kShuffleMaskARGBToRAW_0 = { |
256 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u | 321 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u |
257 }; | 322 }; |
258 | 323 |
259 // Duplicates gray value 3 times and fills in alpha opaque. | 324 // Duplicates gray value 3 times and fills in alpha opaque. |
260 __declspec(naked) __declspec(align(16)) | 325 __declspec(naked) |
261 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 326 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
262 __asm { | 327 __asm { |
263 mov eax, [esp + 4] // src_y | 328 mov eax, [esp + 4] // src_y |
264 mov edx, [esp + 8] // dst_argb | 329 mov edx, [esp + 8] // dst_argb |
265 mov ecx, [esp + 12] // pix | 330 mov ecx, [esp + 12] // pix |
266 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 331 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
267 pslld xmm5, 24 | 332 pslld xmm5, 24 |
268 | 333 |
269 convertloop: | 334 convertloop: |
270 movq xmm0, qword ptr [eax] | 335 movq xmm0, qword ptr [eax] |
271 lea eax, [eax + 8] | 336 lea eax, [eax + 8] |
272 punpcklbw xmm0, xmm0 | 337 punpcklbw xmm0, xmm0 |
273 movdqa xmm1, xmm0 | 338 movdqa xmm1, xmm0 |
274 punpcklwd xmm0, xmm0 | 339 punpcklwd xmm0, xmm0 |
275 punpckhwd xmm1, xmm1 | 340 punpckhwd xmm1, xmm1 |
276 por xmm0, xmm5 | 341 por xmm0, xmm5 |
277 por xmm1, xmm5 | 342 por xmm1, xmm5 |
278 movdqu [edx], xmm0 | 343 movdqu [edx], xmm0 |
279 movdqu [edx + 16], xmm1 | 344 movdqu [edx + 16], xmm1 |
280 lea edx, [edx + 32] | 345 lea edx, [edx + 32] |
281 sub ecx, 8 | 346 sub ecx, 8 |
282 jg convertloop | 347 jg convertloop |
283 ret | 348 ret |
284 } | 349 } |
285 } | 350 } |
286 | 351 |
287 __declspec(naked) __declspec(align(16)) | 352 #ifdef HAS_J400TOARGBROW_AVX2 |
| 353 // Duplicates gray value 3 times and fills in alpha opaque. |
| 354 __declspec(naked) |
| 355 void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { |
| 356 __asm { |
| 357 mov eax, [esp + 4] // src_y |
| 358 mov edx, [esp + 8] // dst_argb |
| 359 mov ecx, [esp + 12] // pix |
| 360 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
| 361 vpslld ymm5, ymm5, 24 |
| 362 |
| 363 convertloop: |
| 364 vmovdqu xmm0, [eax] |
| 365 lea eax, [eax + 16] |
| 366 vpermq ymm0, ymm0, 0xd8 |
| 367 vpunpcklbw ymm0, ymm0, ymm0 |
| 368 vpermq ymm0, ymm0, 0xd8 |
| 369 vpunpckhwd ymm1, ymm0, ymm0 |
| 370 vpunpcklwd ymm0, ymm0, ymm0 |
| 371 vpor ymm0, ymm0, ymm5 |
| 372 vpor ymm1, ymm1, ymm5 |
| 373 vmovdqu [edx], ymm0 |
| 374 vmovdqu [edx + 32], ymm1 |
| 375 lea edx, [edx + 64] |
| 376 sub ecx, 16 |
| 377 jg convertloop |
| 378 vzeroupper |
| 379 ret |
| 380 } |
| 381 } |
| 382 #endif // HAS_J400TOARGBROW_AVX2 |
| 383 |
| 384 __declspec(naked) |
288 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 385 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
289 __asm { | 386 __asm { |
290 mov eax, [esp + 4] // src_rgb24 | 387 mov eax, [esp + 4] // src_rgb24 |
291 mov edx, [esp + 8] // dst_argb | 388 mov edx, [esp + 8] // dst_argb |
292 mov ecx, [esp + 12] // pix | 389 mov ecx, [esp + 12] // pix |
293 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 390 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
294 pslld xmm5, 24 | 391 pslld xmm5, 24 |
295 movdqa xmm4, kShuffleMaskRGB24ToARGB | 392 movdqa xmm4, kShuffleMaskRGB24ToARGB |
296 | 393 |
297 convertloop: | 394 convertloop: |
(...skipping 17 matching lines...) Expand all Loading... |
315 movdqu [edx + 16], xmm1 | 412 movdqu [edx + 16], xmm1 |
316 por xmm3, xmm5 | 413 por xmm3, xmm5 |
317 movdqu [edx + 48], xmm3 | 414 movdqu [edx + 48], xmm3 |
318 lea edx, [edx + 64] | 415 lea edx, [edx + 64] |
319 sub ecx, 16 | 416 sub ecx, 16 |
320 jg convertloop | 417 jg convertloop |
321 ret | 418 ret |
322 } | 419 } |
323 } | 420 } |
324 | 421 |
325 __declspec(naked) __declspec(align(16)) | 422 __declspec(naked) |
326 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, | 423 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, |
327 int pix) { | 424 int pix) { |
328 __asm { | 425 __asm { |
329 mov eax, [esp + 4] // src_raw | 426 mov eax, [esp + 4] // src_raw |
330 mov edx, [esp + 8] // dst_argb | 427 mov edx, [esp + 8] // dst_argb |
331 mov ecx, [esp + 12] // pix | 428 mov ecx, [esp + 12] // pix |
332 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 429 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
333 pslld xmm5, 24 | 430 pslld xmm5, 24 |
334 movdqa xmm4, kShuffleMaskRAWToARGB | 431 movdqa xmm4, kShuffleMaskRAWToARGB |
335 | 432 |
(...skipping 25 matching lines...) Expand all Loading... |
361 } | 458 } |
362 } | 459 } |
363 | 460 |
364 // pmul method to replicate bits. | 461 // pmul method to replicate bits. |
365 // Math to replicate bits: | 462 // Math to replicate bits: |
366 // (v << 8) | (v << 3) | 463 // (v << 8) | (v << 3) |
367 // v * 256 + v * 8 | 464 // v * 256 + v * 8 |
368 // v * (256 + 8) | 465 // v * (256 + 8) |
369 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 | 466 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
370 // 20 instructions. | 467 // 20 instructions. |
371 __declspec(naked) __declspec(align(16)) | 468 __declspec(naked) |
372 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, | 469 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, |
373 int pix) { | 470 int pix) { |
374 __asm { | 471 __asm { |
375 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 472 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
376 movd xmm5, eax | 473 movd xmm5, eax |
377 pshufd xmm5, xmm5, 0 | 474 pshufd xmm5, xmm5, 0 |
378 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits | 475 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
379 movd xmm6, eax | 476 movd xmm6, eax |
380 pshufd xmm6, xmm6, 0 | 477 pshufd xmm6, xmm6, 0 |
381 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red | 478 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
(...skipping 28 matching lines...) Expand all Loading... |
410 punpckhbw xmm2, xmm0 | 507 punpckhbw xmm2, xmm0 |
411 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB | 508 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
412 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB | 509 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
413 lea eax, [eax + 16] | 510 lea eax, [eax + 16] |
414 sub ecx, 8 | 511 sub ecx, 8 |
415 jg convertloop | 512 jg convertloop |
416 ret | 513 ret |
417 } | 514 } |
418 } | 515 } |
419 | 516 |
| 517 #ifdef HAS_RGB565TOARGBROW_AVX2 |
| 518 // pmul method to replicate bits. |
| 519 // Math to replicate bits: |
| 520 // (v << 8) | (v << 3) |
| 521 // v * 256 + v * 8 |
| 522 // v * (256 + 8) |
| 523 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
| 524 __declspec(naked) |
| 525 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, |
| 526 int pix) { |
| 527 __asm { |
| 528 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 529 vmovd xmm5, eax |
| 530 vbroadcastss ymm5, xmm5 |
| 531 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| 532 movd xmm6, eax |
| 533 vbroadcastss ymm6, xmm6 |
| 534 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| 535 vpsllw ymm3, ymm3, 11 |
| 536 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green |
| 537 vpsllw ymm4, ymm4, 10 |
| 538 vpsrlw ymm4, ymm4, 5 |
| 539 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| 540 vpsllw ymm7, ymm7, 8 |
| 541 |
| 542 mov eax, [esp + 4] // src_rgb565 |
| 543 mov edx, [esp + 8] // dst_argb |
| 544 mov ecx, [esp + 12] // pix |
| 545 sub edx, eax |
| 546 sub edx, eax |
| 547 |
| 548 convertloop: |
| 549 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 |
| 550 vpand ymm1, ymm0, ymm3 // R in upper 5 bits |
| 551 vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| 552 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| 553 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| 554 vpsllw ymm1, ymm1, 8 |
| 555 vpor ymm1, ymm1, ymm2 // RB |
| 556 vpand ymm0, ymm0, ymm4 // G in middle 6 bits |
| 557 vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) |
| 558 vpor ymm0, ymm0, ymm7 // AG |
| 559 vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| 560 vpermq ymm1, ymm1, 0xd8 |
| 561 vpunpckhbw ymm2, ymm1, ymm0 |
| 562 vpunpcklbw ymm1, ymm1, ymm0 |
| 563 vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB |
| 564 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB |
| 565 lea eax, [eax + 32] |
| 566 sub ecx, 16 |
| 567 jg convertloop |
| 568 vzeroupper |
| 569 ret |
| 570 } |
| 571 } |
| 572 #endif // HAS_RGB565TOARGBROW_AVX2 |
| 573 |
| 574 #ifdef HAS_ARGB1555TOARGBROW_AVX2 |
| 575 __declspec(naked) |
| 576 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, |
| 577 int pix) { |
| 578 __asm { |
| 579 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 580 vmovd xmm5, eax |
| 581 vbroadcastss ymm5, xmm5 |
| 582 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| 583 movd xmm6, eax |
| 584 vbroadcastss ymm6, xmm6 |
| 585 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| 586 vpsllw ymm3, ymm3, 11 |
| 587 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green |
| 588 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| 589 vpsllw ymm7, ymm7, 8 |
| 590 |
| 591 mov eax, [esp + 4] // src_argb1555 |
| 592 mov edx, [esp + 8] // dst_argb |
| 593 mov ecx, [esp + 12] // pix |
| 594 sub edx, eax |
| 595 sub edx, eax |
| 596 |
| 597 convertloop: |
| 598 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 |
| 599 vpsllw ymm1, ymm0, 1 // R in upper 5 bits |
| 600 vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| 601 vpand ymm1, ymm1, ymm3 |
| 602 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| 603 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| 604 vpsllw ymm1, ymm1, 8 |
| 605 vpor ymm1, ymm1, ymm2 // RB |
| 606 vpsraw ymm2, ymm0, 8 // A |
| 607 vpand ymm0, ymm0, ymm4 // G in middle 5 bits |
| 608 vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) |
| 609 vpand ymm2, ymm2, ymm7 |
| 610 vpor ymm0, ymm0, ymm2 // AG |
| 611 vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| 612 vpermq ymm1, ymm1, 0xd8 |
| 613 vpunpckhbw ymm2, ymm1, ymm0 |
| 614 vpunpcklbw ymm1, ymm1, ymm0 |
| 615 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB |
| 616 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB |
| 617 lea eax, [eax + 32] |
| 618 sub ecx, 16 |
| 619 jg convertloop |
| 620 vzeroupper |
| 621 ret |
| 622 } |
| 623 } |
| 624 #endif // HAS_ARGB1555TOARGBROW_AVX2 |
| 625 |
| 626 #ifdef HAS_ARGB4444TOARGBROW_AVX2 |
| 627 __declspec(naked) |
| 628 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, |
| 629 int pix) { |
| 630 __asm { |
| 631 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
| 632 vmovd xmm4, eax |
| 633 vbroadcastss ymm4, xmm4 |
| 634 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles |
| 635 mov eax, [esp + 4] // src_argb4444 |
| 636 mov edx, [esp + 8] // dst_argb |
| 637 mov ecx, [esp + 12] // pix |
| 638 sub edx, eax |
| 639 sub edx, eax |
| 640 |
| 641 convertloop: |
| 642 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 |
| 643 vpand ymm2, ymm0, ymm5 // mask high nibbles |
| 644 vpand ymm0, ymm0, ymm4 // mask low nibbles |
| 645 vpsrlw ymm3, ymm2, 4 |
| 646 vpsllw ymm1, ymm0, 4 |
| 647 vpor ymm2, ymm2, ymm3 |
| 648 vpor ymm0, ymm0, ymm1 |
| 649 vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| 650 vpermq ymm2, ymm2, 0xd8 |
| 651 vpunpckhbw ymm1, ymm0, ymm2 |
| 652 vpunpcklbw ymm0, ymm0, ymm2 |
| 653 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB |
| 654 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB |
| 655 lea eax, [eax + 32] |
| 656 sub ecx, 16 |
| 657 jg convertloop |
| 658 vzeroupper |
| 659 ret |
| 660 } |
| 661 } |
| 662 #endif // HAS_ARGB4444TOARGBROW_AVX2 |
| 663 |
420 // 24 instructions | 664 // 24 instructions |
421 __declspec(naked) __declspec(align(16)) | 665 __declspec(naked) |
422 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, | 666 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, |
423 int pix) { | 667 int pix) { |
424 __asm { | 668 __asm { |
425 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 669 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
426 movd xmm5, eax | 670 movd xmm5, eax |
427 pshufd xmm5, xmm5, 0 | 671 pshufd xmm5, xmm5, 0 |
428 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits | 672 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
429 movd xmm6, eax | 673 movd xmm6, eax |
430 pshufd xmm6, xmm6, 0 | 674 pshufd xmm6, xmm6, 0 |
431 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red | 675 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
464 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB | 708 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
465 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB | 709 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
466 lea eax, [eax + 16] | 710 lea eax, [eax + 16] |
467 sub ecx, 8 | 711 sub ecx, 8 |
468 jg convertloop | 712 jg convertloop |
469 ret | 713 ret |
470 } | 714 } |
471 } | 715 } |
472 | 716 |
473 // 18 instructions. | 717 // 18 instructions. |
474 __declspec(naked) __declspec(align(16)) | 718 __declspec(naked) |
475 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, | 719 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, |
476 int pix) { | 720 int pix) { |
477 __asm { | 721 __asm { |
478 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f | 722 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
479 movd xmm4, eax | 723 movd xmm4, eax |
480 pshufd xmm4, xmm4, 0 | 724 pshufd xmm4, xmm4, 0 |
481 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles | 725 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles |
482 pslld xmm5, 4 | 726 pslld xmm5, 4 |
483 mov eax, [esp + 4] // src_argb4444 | 727 mov eax, [esp + 4] // src_argb4444 |
484 mov edx, [esp + 8] // dst_argb | 728 mov edx, [esp + 8] // dst_argb |
(...skipping 17 matching lines...) Expand all Loading... |
502 punpckhbw xmm1, xmm2 | 746 punpckhbw xmm1, xmm2 |
503 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB | 747 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB |
504 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB | 748 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB |
505 lea eax, [eax + 16] | 749 lea eax, [eax + 16] |
506 sub ecx, 8 | 750 sub ecx, 8 |
507 jg convertloop | 751 jg convertloop |
508 ret | 752 ret |
509 } | 753 } |
510 } | 754 } |
511 | 755 |
512 __declspec(naked) __declspec(align(16)) | 756 __declspec(naked) |
513 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 757 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { |
514 __asm { | 758 __asm { |
515 mov eax, [esp + 4] // src_argb | 759 mov eax, [esp + 4] // src_argb |
516 mov edx, [esp + 8] // dst_rgb | 760 mov edx, [esp + 8] // dst_rgb |
517 mov ecx, [esp + 12] // pix | 761 mov ecx, [esp + 12] // pix |
518 movdqa xmm6, kShuffleMaskARGBToRGB24 | 762 movdqa xmm6, kShuffleMaskARGBToRGB24 |
519 | 763 |
520 convertloop: | 764 convertloop: |
521 movdqu xmm0, [eax] // fetch 16 pixels of argb | 765 movdqu xmm0, [eax] // fetch 16 pixels of argb |
522 movdqu xmm1, [eax + 16] | 766 movdqu xmm1, [eax + 16] |
(...skipping 17 matching lines...) Expand all Loading... |
540 por xmm2, xmm3 // 12 bytes from 3 for 2 | 784 por xmm2, xmm3 // 12 bytes from 3 for 2 |
541 movdqu [edx + 16], xmm1 // store 1 | 785 movdqu [edx + 16], xmm1 // store 1 |
542 movdqu [edx + 32], xmm2 // store 2 | 786 movdqu [edx + 32], xmm2 // store 2 |
543 lea edx, [edx + 48] | 787 lea edx, [edx + 48] |
544 sub ecx, 16 | 788 sub ecx, 16 |
545 jg convertloop | 789 jg convertloop |
546 ret | 790 ret |
547 } | 791 } |
548 } | 792 } |
549 | 793 |
550 __declspec(naked) __declspec(align(16)) | 794 __declspec(naked) |
551 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 795 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { |
552 __asm { | 796 __asm { |
553 mov eax, [esp + 4] // src_argb | 797 mov eax, [esp + 4] // src_argb |
554 mov edx, [esp + 8] // dst_rgb | 798 mov edx, [esp + 8] // dst_rgb |
555 mov ecx, [esp + 12] // pix | 799 mov ecx, [esp + 12] // pix |
556 movdqa xmm6, kShuffleMaskARGBToRAW | 800 movdqa xmm6, kShuffleMaskARGBToRAW |
557 | 801 |
558 convertloop: | 802 convertloop: |
559 movdqu xmm0, [eax] // fetch 16 pixels of argb | 803 movdqu xmm0, [eax] // fetch 16 pixels of argb |
560 movdqu xmm1, [eax + 16] | 804 movdqu xmm1, [eax + 16] |
(...skipping 17 matching lines...) Expand all Loading... |
578 por xmm2, xmm3 // 12 bytes from 3 for 2 | 822 por xmm2, xmm3 // 12 bytes from 3 for 2 |
579 movdqu [edx + 16], xmm1 // store 1 | 823 movdqu [edx + 16], xmm1 // store 1 |
580 movdqu [edx + 32], xmm2 // store 2 | 824 movdqu [edx + 32], xmm2 // store 2 |
581 lea edx, [edx + 48] | 825 lea edx, [edx + 48] |
582 sub ecx, 16 | 826 sub ecx, 16 |
583 jg convertloop | 827 jg convertloop |
584 ret | 828 ret |
585 } | 829 } |
586 } | 830 } |
587 | 831 |
588 __declspec(naked) __declspec(align(16)) | 832 // 4 pixels |
| 833 __declspec(naked) |
589 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 834 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
590 __asm { | 835 __asm { |
591 mov eax, [esp + 4] // src_argb | 836 mov eax, [esp + 4] // src_argb |
592 mov edx, [esp + 8] // dst_rgb | 837 mov edx, [esp + 8] // dst_rgb |
593 mov ecx, [esp + 12] // pix | 838 mov ecx, [esp + 12] // pix |
594 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f | 839 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
595 psrld xmm3, 27 | 840 psrld xmm3, 27 |
596 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 | 841 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
597 psrld xmm4, 26 | 842 psrld xmm4, 26 |
598 pslld xmm4, 5 | 843 pslld xmm4, 5 |
(...skipping 16 matching lines...) Expand all Loading... |
615 packssdw xmm0, xmm0 | 860 packssdw xmm0, xmm0 |
616 lea eax, [eax + 16] | 861 lea eax, [eax + 16] |
617 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 | 862 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
618 lea edx, [edx + 8] | 863 lea edx, [edx + 8] |
619 sub ecx, 4 | 864 sub ecx, 4 |
620 jg convertloop | 865 jg convertloop |
621 ret | 866 ret |
622 } | 867 } |
623 } | 868 } |
624 | 869 |
// Convert 4 ARGB pixels (16 bytes) per iteration to RGB565 (8 bytes), adding
// an ordered dither before packing.  dither4 holds 4 dither bytes, one per
// pixel column; it is replicated so each pixel's B/G/R bytes get the same
// dither value (saturating add).  pix is the pixel count, a multiple of 4.
__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int pix) {
  __asm {

    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_rgb
    movd       xmm6, [esp + 12]  // dither4: 4 bytes, one per pixel
    mov        ecx, [esp + 16]   // pix
    punpcklbw  xmm6, xmm6        // make dither 16 bytes: d0d0 d1d1 d2d2 d3d3
    movdqa     xmm7, xmm6        // NOTE(review): xmm7 is prepared here but
    punpcklwd  xmm6, xmm6        // never used in the loop below - confirm
    punpckhwd  xmm7, xmm7        // whether it is dead code upstream.
    pcmpeqb    xmm3, xmm3        // generate mask 0x0000001f for 5-bit B
    psrld      xmm3, 27
    pcmpeqb    xmm4, xmm4        // generate mask 0x000007e0 for 6-bit G
    psrld      xmm4, 26
    pslld      xmm4, 5
    pcmpeqb    xmm5, xmm5        // generate mask 0xfffff800 for R field
    pslld      xmm5, 11

 convertloop:
    movdqu     xmm0, [eax]   // fetch 4 pixels of argb
    paddusb    xmm0, xmm6    // add dither (saturating, per byte)
    movdqa     xmm1, xmm0    // B
    movdqa     xmm2, xmm0    // G
    pslld      xmm0, 8       // R
    psrld      xmm1, 3       // B >> 3 into bits 4:0
    psrld      xmm2, 5       // G >> 5 (8-bit G at bits 15:8) into bits 10:5
    psrad      xmm0, 16      // arithmetic shift keeps R in bits 15:11
    pand       xmm1, xmm3    // B
    pand       xmm2, xmm4    // G
    pand       xmm0, xmm5    // R
    por        xmm1, xmm2    // BG
    por        xmm0, xmm1    // BGR - one RGB565 value per 32-bit lane
    packssdw   xmm0, xmm0    // narrow 4 dwords to 4 words
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
| 915 |
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// Convert 8 ARGB pixels (32 bytes) per iteration to RGB565 (16 bytes), adding
// an ordered dither before packing.  dither4 holds 4 dither bytes (one per
// pixel column), broadcast and expanded so every pixel's B/G/R bytes receive
// the same dither value.  pix is the pixel count, a multiple of 8.
__declspec(naked)
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int pix) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4 replicated to all 4 dwords
    mov        ecx, [esp + 16]     // pix
    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
    vpermq     ymm6, ymm6, 0xd8    // spread doubled bytes across both lanes
    vpunpcklwd ymm6, ymm6, ymm6    // each dither byte now covers one pixel
    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f for 5-bit B
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0 for 6-bit G
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800 for 5-bit R

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpaddusb   ymm0, ymm0, ymm6    // add dither (saturating, per byte)
    vpsrld     ymm2, ymm0, 5       // G into bits 10:5
    vpsrld     ymm1, ymm0, 3       // B into bits 4:0
    vpsrld     ymm0, ymm0, 8       // R into bits 15:11
    vpand      ymm2, ymm2, ymm4    // G
    vpand      ymm1, ymm1, ymm3    // B
    vpand      ymm0, ymm0, ymm5    // R
    vpor       ymm1, ymm1, ymm2    // BG
    vpor       ymm0, ymm0, ymm1    // BGR - one RGB565 value per 32-bit lane
    vpackusdw  ymm0, ymm0, ymm0    // narrow dwords to words, per 128-bit lane
    vpermq     ymm0, ymm0, 0xd8    // gather low qwords of both lanes
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
| 958 |
625 // TODO(fbarchard): Improve sign extension/packing. | 959 // TODO(fbarchard): Improve sign extension/packing. |
626 __declspec(naked) __declspec(align(16)) | 960 __declspec(naked) |
627 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 961 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
628 __asm { | 962 __asm { |
629 mov eax, [esp + 4] // src_argb | 963 mov eax, [esp + 4] // src_argb |
630 mov edx, [esp + 8] // dst_rgb | 964 mov edx, [esp + 8] // dst_rgb |
631 mov ecx, [esp + 12] // pix | 965 mov ecx, [esp + 12] // pix |
632 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f | 966 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f |
633 psrld xmm4, 27 | 967 psrld xmm4, 27 |
634 movdqa xmm5, xmm4 // generate mask 0x000003e0 | 968 movdqa xmm5, xmm4 // generate mask 0x000003e0 |
635 pslld xmm5, 5 | 969 pslld xmm5, 5 |
636 movdqa xmm6, xmm4 // generate mask 0x00007c00 | 970 movdqa xmm6, xmm4 // generate mask 0x00007c00 |
(...skipping 20 matching lines...) Expand all Loading... |
657 packssdw xmm0, xmm0 | 991 packssdw xmm0, xmm0 |
658 lea eax, [eax + 16] | 992 lea eax, [eax + 16] |
659 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 | 993 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 |
660 lea edx, [edx + 8] | 994 lea edx, [edx + 8] |
661 sub ecx, 4 | 995 sub ecx, 4 |
662 jg convertloop | 996 jg convertloop |
663 ret | 997 ret |
664 } | 998 } |
665 } | 999 } |
666 | 1000 |
// Convert 4 ARGB pixels (16 bytes) per iteration to ARGB4444 (8 bytes).
// The top 4 bits of each 8-bit channel become the 4-bit channel.
// pix is the pixel count, a multiple of 4.
__declspec(naked)
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm4, xmm4       // generate mask 0xf000f000
    psllw      xmm4, 12
    movdqa     xmm3, xmm4       // generate mask 0x00f000f0
    psrlw      xmm3, 8

 convertloop:
    movdqu     xmm0, [eax]   // fetch 4 pixels of argb
    movdqa     xmm1, xmm0
    pand       xmm0, xmm3    // low nibble:  top 4 bits of B and R
    pand       xmm1, xmm4    // high nibble: top 4 bits of G and A
    psrld      xmm0, 4       // align B/R nibbles
    psrld      xmm1, 8       // align G/A nibbles
    por        xmm0, xmm1    // one ARGB4444 word per 32-bit lane
    packuswb   xmm0, xmm0    // narrow words to bytes
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
695 | 1029 |
#ifdef HAS_ARGBTORGB565ROW_AVX2
// Convert 8 ARGB pixels (32 bytes) per iteration to RGB565 (16 bytes).
// pix is the pixel count, a multiple of 8.
__declspec(naked)
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    mov        ecx, [esp + 12]     // pix
    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f for 5-bit B
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0 for 6-bit G
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800 for 5-bit R

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpsrld     ymm2, ymm0, 5       // G into bits 10:5
    vpsrld     ymm1, ymm0, 3       // B into bits 4:0
    vpsrld     ymm0, ymm0, 8       // R into bits 15:11
    vpand      ymm2, ymm2, ymm4    // G
    vpand      ymm1, ymm1, ymm3    // B
    vpand      ymm0, ymm0, ymm5    // R
    vpor       ymm1, ymm1, ymm2    // BG
    vpor       ymm0, ymm0, ymm1    // BGR - one RGB565 value per 32-bit lane
    vpackusdw  ymm0, ymm0, ymm0    // narrow dwords to words, per 128-bit lane
    vpermq     ymm0, ymm0, 0xd8    // gather low qwords of both lanes
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2
734 | 1066 |
735 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 | 1067 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 |
736 __declspec(naked) __declspec(align(16)) | 1068 __declspec(naked) |
737 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1069 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
738 __asm { | 1070 __asm { |
739 mov eax, [esp + 4] // src_argb | 1071 mov eax, [esp + 4] // src_argb |
740 mov edx, [esp + 8] // dst_rgb | 1072 mov edx, [esp + 8] // dst_rgb |
741 mov ecx, [esp + 12] // pix | 1073 mov ecx, [esp + 12] // pix |
742 vpcmpeqb ymm4, ymm4, ymm4 | 1074 vpcmpeqb ymm4, ymm4, ymm4 |
743 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f | 1075 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f |
744 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 | 1076 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 |
745 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 | 1077 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 |
746 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 | 1078 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 |
(...skipping 19 matching lines...) Expand all Loading... |
766 lea edx, [edx + 16] | 1098 lea edx, [edx + 16] |
767 sub ecx, 8 | 1099 sub ecx, 8 |
768 jg convertloop | 1100 jg convertloop |
769 vzeroupper | 1101 vzeroupper |
770 ret | 1102 ret |
771 } | 1103 } |
772 } | 1104 } |
773 #endif // HAS_ARGBTOARGB1555ROW_AVX2 | 1105 #endif // HAS_ARGBTOARGB1555ROW_AVX2 |
774 | 1106 |
775 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 | 1107 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 |
776 __declspec(naked) __declspec(align(16)) | 1108 __declspec(naked) |
777 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1109 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
778 __asm { | 1110 __asm { |
779 mov eax, [esp + 4] // src_argb | 1111 mov eax, [esp + 4] // src_argb |
780 mov edx, [esp + 8] // dst_rgb | 1112 mov edx, [esp + 8] // dst_rgb |
781 mov ecx, [esp + 12] // pix | 1113 mov ecx, [esp + 12] // pix |
782 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 | 1114 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 |
783 vpsllw ymm4, ymm4, 12 | 1115 vpsllw ymm4, ymm4, 12 |
784 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 | 1116 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 |
785 | 1117 |
786 convertloop: | 1118 convertloop: |
(...skipping 10 matching lines...) Expand all Loading... |
797 lea edx, [edx + 16] | 1129 lea edx, [edx + 16] |
798 sub ecx, 8 | 1130 sub ecx, 8 |
799 jg convertloop | 1131 jg convertloop |
800 vzeroupper | 1132 vzeroupper |
801 ret | 1133 ret |
802 } | 1134 } |
803 } | 1135 } |
804 #endif // HAS_ARGBTOARGB4444ROW_AVX2 | 1136 #endif // HAS_ARGBTOARGB4444ROW_AVX2 |
805 | 1137 |
806 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. | 1138 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. |
807 __declspec(naked) __declspec(align(16)) | 1139 __declspec(naked) |
808 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1140 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
809 __asm { | 1141 __asm { |
810 mov eax, [esp + 4] /* src_argb */ | 1142 mov eax, [esp + 4] /* src_argb */ |
811 mov edx, [esp + 8] /* dst_y */ | 1143 mov edx, [esp + 8] /* dst_y */ |
812 mov ecx, [esp + 12] /* pix */ | 1144 mov ecx, [esp + 12] /* pix */ |
813 movdqa xmm4, kARGBToY | 1145 movdqa xmm4, kARGBToY |
814 movdqa xmm5, kAddY16 | 1146 movdqa xmm5, kAddY16 |
815 | 1147 |
816 convertloop: | 1148 convertloop: |
817 movdqu xmm0, [eax] | 1149 movdqu xmm0, [eax] |
(...skipping 14 matching lines...) Expand all Loading... |
832 movdqu [edx], xmm0 | 1164 movdqu [edx], xmm0 |
833 lea edx, [edx + 16] | 1165 lea edx, [edx + 16] |
834 sub ecx, 16 | 1166 sub ecx, 16 |
835 jg convertloop | 1167 jg convertloop |
836 ret | 1168 ret |
837 } | 1169 } |
838 } | 1170 } |
839 | 1171 |
840 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. | 1172 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. |
841 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. | 1173 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. |
842 __declspec(naked) __declspec(align(16)) | 1174 __declspec(naked) |
843 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1175 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
844 __asm { | 1176 __asm { |
845 mov eax, [esp + 4] /* src_argb */ | 1177 mov eax, [esp + 4] /* src_argb */ |
846 mov edx, [esp + 8] /* dst_y */ | 1178 mov edx, [esp + 8] /* dst_y */ |
847 mov ecx, [esp + 12] /* pix */ | 1179 mov ecx, [esp + 12] /* pix */ |
848 movdqa xmm4, kARGBToYJ | 1180 movdqa xmm4, kARGBToYJ |
849 movdqa xmm5, kAddYJ64 | 1181 movdqa xmm5, kAddYJ64 |
850 | 1182 |
851 convertloop: | 1183 convertloop: |
852 movdqu xmm0, [eax] | 1184 movdqu xmm0, [eax] |
(...skipping 20 matching lines...) Expand all Loading... |
873 } | 1205 } |
874 } | 1206 } |
875 | 1207 |
876 #ifdef HAS_ARGBTOYROW_AVX2 | 1208 #ifdef HAS_ARGBTOYROW_AVX2 |
// Dword shuffle indices for vpermd: interleave the low and high 128-bit
// lanes (0,4,1,5,2,6,3,7) to restore linear pixel order after the per-lane
// vphaddw + vpackuswb sequence.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};
881 | 1213 |
882 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 1214 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
883 __declspec(naked) __declspec(align(32)) | 1215 __declspec(naked) |
884 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 1216 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { |
885 __asm { | 1217 __asm { |
886 mov eax, [esp + 4] /* src_argb */ | 1218 mov eax, [esp + 4] /* src_argb */ |
887 mov edx, [esp + 8] /* dst_y */ | 1219 mov edx, [esp + 8] /* dst_y */ |
888 mov ecx, [esp + 12] /* pix */ | 1220 mov ecx, [esp + 12] /* pix */ |
889 vbroadcastf128 ymm4, kARGBToY | 1221 vbroadcastf128 ymm4, kARGBToY |
890 vbroadcastf128 ymm5, kAddY16 | 1222 vbroadcastf128 ymm5, kAddY16 |
891 vmovdqu ymm6, kPermdARGBToY_AVX | 1223 vmovdqu ymm6, kPermdARGBToY_AVX |
892 | 1224 |
893 convertloop: | 1225 convertloop: |
(...skipping 16 matching lines...) Expand all Loading... |
910 vmovdqu [edx], ymm0 | 1242 vmovdqu [edx], ymm0 |
911 lea edx, [edx + 32] | 1243 lea edx, [edx + 32] |
912 sub ecx, 32 | 1244 sub ecx, 32 |
913 jg convertloop | 1245 jg convertloop |
914 vzeroupper | 1246 vzeroupper |
915 ret | 1247 ret |
916 } | 1248 } |
917 } | 1249 } |
918 #endif // HAS_ARGBTOYROW_AVX2 | 1250 #endif // HAS_ARGBTOYROW_AVX2 |
919 | 1251 |
920 #ifdef HAS_ARGBTOYROW_AVX2 | 1252 #ifdef HAS_ARGBTOYJROW_AVX2 |
921 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 1253 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
922 __declspec(naked) __declspec(align(32)) | 1254 __declspec(naked) |
923 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 1255 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { |
924 __asm { | 1256 __asm { |
925 mov eax, [esp + 4] /* src_argb */ | 1257 mov eax, [esp + 4] /* src_argb */ |
926 mov edx, [esp + 8] /* dst_y */ | 1258 mov edx, [esp + 8] /* dst_y */ |
927 mov ecx, [esp + 12] /* pix */ | 1259 mov ecx, [esp + 12] /* pix */ |
928 vbroadcastf128 ymm4, kARGBToYJ | 1260 vbroadcastf128 ymm4, kARGBToYJ |
929 vbroadcastf128 ymm5, kAddYJ64 | 1261 vbroadcastf128 ymm5, kAddYJ64 |
930 vmovdqu ymm6, kPermdARGBToY_AVX | 1262 vmovdqu ymm6, kPermdARGBToY_AVX |
931 | 1263 |
932 convertloop: | 1264 convertloop: |
(...skipping 18 matching lines...) Expand all Loading... |
951 lea edx, [edx + 32] | 1283 lea edx, [edx + 32] |
952 sub ecx, 32 | 1284 sub ecx, 32 |
953 jg convertloop | 1285 jg convertloop |
954 | 1286 |
955 vzeroupper | 1287 vzeroupper |
956 ret | 1288 ret |
957 } | 1289 } |
958 } | 1290 } |
959 #endif // HAS_ARGBTOYJROW_AVX2 | 1291 #endif // HAS_ARGBTOYJROW_AVX2 |
960 | 1292 |
961 __declspec(naked) __declspec(align(16)) | 1293 __declspec(naked) |
962 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1294 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
963 __asm { | 1295 __asm { |
964 mov eax, [esp + 4] /* src_argb */ | 1296 mov eax, [esp + 4] /* src_argb */ |
965 mov edx, [esp + 8] /* dst_y */ | 1297 mov edx, [esp + 8] /* dst_y */ |
966 mov ecx, [esp + 12] /* pix */ | 1298 mov ecx, [esp + 12] /* pix */ |
967 movdqa xmm4, kBGRAToY | 1299 movdqa xmm4, kBGRAToY |
968 movdqa xmm5, kAddY16 | 1300 movdqa xmm5, kAddY16 |
969 | 1301 |
970 convertloop: | 1302 convertloop: |
971 movdqu xmm0, [eax] | 1303 movdqu xmm0, [eax] |
(...skipping 12 matching lines...) Expand all Loading... |
984 packuswb xmm0, xmm2 | 1316 packuswb xmm0, xmm2 |
985 paddb xmm0, xmm5 | 1317 paddb xmm0, xmm5 |
986 movdqu [edx], xmm0 | 1318 movdqu [edx], xmm0 |
987 lea edx, [edx + 16] | 1319 lea edx, [edx + 16] |
988 sub ecx, 16 | 1320 sub ecx, 16 |
989 jg convertloop | 1321 jg convertloop |
990 ret | 1322 ret |
991 } | 1323 } |
992 } | 1324 } |
993 | 1325 |
994 __declspec(naked) __declspec(align(16)) | 1326 __declspec(naked) |
995 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1327 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
996 __asm { | 1328 __asm { |
997 mov eax, [esp + 4] /* src_argb */ | 1329 mov eax, [esp + 4] /* src_argb */ |
998 mov edx, [esp + 8] /* dst_y */ | 1330 mov edx, [esp + 8] /* dst_y */ |
999 mov ecx, [esp + 12] /* pix */ | 1331 mov ecx, [esp + 12] /* pix */ |
1000 movdqa xmm4, kABGRToY | 1332 movdqa xmm4, kABGRToY |
1001 movdqa xmm5, kAddY16 | 1333 movdqa xmm5, kAddY16 |
1002 | 1334 |
1003 convertloop: | 1335 convertloop: |
1004 movdqu xmm0, [eax] | 1336 movdqu xmm0, [eax] |
(...skipping 12 matching lines...) Expand all Loading... |
1017 packuswb xmm0, xmm2 | 1349 packuswb xmm0, xmm2 |
1018 paddb xmm0, xmm5 | 1350 paddb xmm0, xmm5 |
1019 movdqu [edx], xmm0 | 1351 movdqu [edx], xmm0 |
1020 lea edx, [edx + 16] | 1352 lea edx, [edx + 16] |
1021 sub ecx, 16 | 1353 sub ecx, 16 |
1022 jg convertloop | 1354 jg convertloop |
1023 ret | 1355 ret |
1024 } | 1356 } |
1025 } | 1357 } |
1026 | 1358 |
1027 __declspec(naked) __declspec(align(16)) | 1359 __declspec(naked) |
1028 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1360 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
1029 __asm { | 1361 __asm { |
1030 mov eax, [esp + 4] /* src_argb */ | 1362 mov eax, [esp + 4] /* src_argb */ |
1031 mov edx, [esp + 8] /* dst_y */ | 1363 mov edx, [esp + 8] /* dst_y */ |
1032 mov ecx, [esp + 12] /* pix */ | 1364 mov ecx, [esp + 12] /* pix */ |
1033 movdqa xmm4, kRGBAToY | 1365 movdqa xmm4, kRGBAToY |
1034 movdqa xmm5, kAddY16 | 1366 movdqa xmm5, kAddY16 |
1035 | 1367 |
1036 convertloop: | 1368 convertloop: |
1037 movdqu xmm0, [eax] | 1369 movdqu xmm0, [eax] |
(...skipping 12 matching lines...) Expand all Loading... |
1050 packuswb xmm0, xmm2 | 1382 packuswb xmm0, xmm2 |
1051 paddb xmm0, xmm5 | 1383 paddb xmm0, xmm5 |
1052 movdqu [edx], xmm0 | 1384 movdqu [edx], xmm0 |
1053 lea edx, [edx + 16] | 1385 lea edx, [edx + 16] |
1054 sub ecx, 16 | 1386 sub ecx, 16 |
1055 jg convertloop | 1387 jg convertloop |
1056 ret | 1388 ret |
1057 } | 1389 } |
1058 } | 1390 } |
1059 | 1391 |
1060 __declspec(naked) __declspec(align(16)) | 1392 __declspec(naked) |
1061 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1393 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1062 uint8* dst_u, uint8* dst_v, int width) { | 1394 uint8* dst_u, uint8* dst_v, int width) { |
1063 __asm { | 1395 __asm { |
1064 push esi | 1396 push esi |
1065 push edi | 1397 push edi |
1066 mov eax, [esp + 8 + 4] // src_argb | 1398 mov eax, [esp + 8 + 4] // src_argb |
1067 mov esi, [esp + 8 + 8] // src_stride_argb | 1399 mov esi, [esp + 8 + 8] // src_stride_argb |
1068 mov edx, [esp + 8 + 12] // dst_u | 1400 mov edx, [esp + 8 + 12] // dst_u |
1069 mov edi, [esp + 8 + 16] // dst_v | 1401 mov edi, [esp + 8 + 16] // dst_v |
1070 mov ecx, [esp + 8 + 20] // pix | 1402 mov ecx, [esp + 8 + 20] // pix |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1120 lea edx, [edx + 8] | 1452 lea edx, [edx + 8] |
1121 sub ecx, 16 | 1453 sub ecx, 16 |
1122 jg convertloop | 1454 jg convertloop |
1123 | 1455 |
1124 pop edi | 1456 pop edi |
1125 pop esi | 1457 pop esi |
1126 ret | 1458 ret |
1127 } | 1459 } |
1128 } | 1460 } |
1129 | 1461 |
1130 __declspec(naked) __declspec(align(16)) | 1462 __declspec(naked) |
1131 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1463 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1132 uint8* dst_u, uint8* dst_v, int width) { | 1464 uint8* dst_u, uint8* dst_v, int width) { |
1133 __asm { | 1465 __asm { |
1134 push esi | 1466 push esi |
1135 push edi | 1467 push edi |
1136 mov eax, [esp + 8 + 4] // src_argb | 1468 mov eax, [esp + 8 + 4] // src_argb |
1137 mov esi, [esp + 8 + 8] // src_stride_argb | 1469 mov esi, [esp + 8 + 8] // src_stride_argb |
1138 mov edx, [esp + 8 + 12] // dst_u | 1470 mov edx, [esp + 8 + 12] // dst_u |
1139 mov edi, [esp + 8 + 16] // dst_v | 1471 mov edi, [esp + 8 + 16] // dst_v |
1140 mov ecx, [esp + 8 + 20] // pix | 1472 mov ecx, [esp + 8 + 20] // pix |
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1192 sub ecx, 16 | 1524 sub ecx, 16 |
1193 jg convertloop | 1525 jg convertloop |
1194 | 1526 |
1195 pop edi | 1527 pop edi |
1196 pop esi | 1528 pop esi |
1197 ret | 1529 ret |
1198 } | 1530 } |
1199 } | 1531 } |
1200 | 1532 |
1201 #ifdef HAS_ARGBTOUVROW_AVX2 | 1533 #ifdef HAS_ARGBTOUVROW_AVX2 |
1202 __declspec(naked) __declspec(align(32)) | 1534 __declspec(naked) |
1203 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, | 1535 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
1204 uint8* dst_u, uint8* dst_v, int width) { | 1536 uint8* dst_u, uint8* dst_v, int width) { |
1205 __asm { | 1537 __asm { |
1206 push esi | 1538 push esi |
1207 push edi | 1539 push edi |
1208 mov eax, [esp + 8 + 4] // src_argb | 1540 mov eax, [esp + 8 + 4] // src_argb |
1209 mov esi, [esp + 8 + 8] // src_stride_argb | 1541 mov esi, [esp + 8 + 8] // src_stride_argb |
1210 mov edx, [esp + 8 + 12] // dst_u | 1542 mov edx, [esp + 8 + 12] // dst_u |
1211 mov edi, [esp + 8 + 16] // dst_v | 1543 mov edi, [esp + 8 + 16] // dst_v |
1212 mov ecx, [esp + 8 + 20] // pix | 1544 mov ecx, [esp + 8 + 20] // pix |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1257 jg convertloop | 1589 jg convertloop |
1258 | 1590 |
1259 pop edi | 1591 pop edi |
1260 pop esi | 1592 pop esi |
1261 vzeroupper | 1593 vzeroupper |
1262 ret | 1594 ret |
1263 } | 1595 } |
1264 } | 1596 } |
1265 #endif // HAS_ARGBTOUVROW_AVX2 | 1597 #endif // HAS_ARGBTOUVROW_AVX2 |
1266 | 1598 |
1267 __declspec(naked) __declspec(align(16)) | 1599 __declspec(naked) |
1268 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, | 1600 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, |
1269 uint8* dst_u, uint8* dst_v, int width) { | 1601 uint8* dst_u, uint8* dst_v, int width) { |
1270 __asm { | 1602 __asm { |
1271 push edi | 1603 push edi |
1272 mov eax, [esp + 4 + 4] // src_argb | 1604 mov eax, [esp + 4 + 4] // src_argb |
1273 mov edx, [esp + 4 + 8] // dst_u | 1605 mov edx, [esp + 4 + 8] // dst_u |
1274 mov edi, [esp + 4 + 12] // dst_v | 1606 mov edi, [esp + 4 + 12] // dst_v |
1275 mov ecx, [esp + 4 + 16] // pix | 1607 mov ecx, [esp + 4 + 16] // pix |
1276 movdqa xmm5, kAddUV128 | 1608 movdqa xmm5, kAddUV128 |
1277 movdqa xmm6, kARGBToV | 1609 movdqa xmm6, kARGBToV |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1314 movdqu [edx + edi], xmm0 | 1646 movdqu [edx + edi], xmm0 |
1315 lea edx, [edx + 16] | 1647 lea edx, [edx + 16] |
1316 sub ecx, 16 | 1648 sub ecx, 16 |
1317 jg convertloop | 1649 jg convertloop |
1318 | 1650 |
1319 pop edi | 1651 pop edi |
1320 ret | 1652 ret |
1321 } | 1653 } |
1322 } | 1654 } |
1323 | 1655 |
1324 __declspec(naked) __declspec(align(16)) | 1656 __declspec(naked) |
1325 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, | 1657 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, |
1326 uint8* dst_u, uint8* dst_v, int width) { | 1658 uint8* dst_u, uint8* dst_v, int width) { |
1327 __asm { | 1659 __asm { |
1328 push edi | 1660 push edi |
1329 mov eax, [esp + 4 + 4] // src_argb | 1661 mov eax, [esp + 4 + 4] // src_argb |
1330 mov edx, [esp + 4 + 8] // dst_u | 1662 mov edx, [esp + 4 + 8] // dst_u |
1331 mov edi, [esp + 4 + 12] // dst_v | 1663 mov edi, [esp + 4 + 12] // dst_v |
1332 mov ecx, [esp + 4 + 16] // pix | 1664 mov ecx, [esp + 4 + 16] // pix |
1333 movdqa xmm5, kAddUV128 | 1665 movdqa xmm5, kAddUV128 |
1334 movdqa xmm6, kARGBToV | 1666 movdqa xmm6, kARGBToV |
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1372 movhps qword ptr [edx + edi], xmm0 // V | 1704 movhps qword ptr [edx + edi], xmm0 // V |
1373 lea edx, [edx + 8] | 1705 lea edx, [edx + 8] |
1374 sub ecx, 16 | 1706 sub ecx, 16 |
1375 jg convertloop | 1707 jg convertloop |
1376 | 1708 |
1377 pop edi | 1709 pop edi |
1378 ret | 1710 ret |
1379 } | 1711 } |
1380 } | 1712 } |
1381 | 1713 |
1382 __declspec(naked) __declspec(align(16)) | 1714 __declspec(naked) |
1383 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1715 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1384 uint8* dst_u, uint8* dst_v, int width) { | 1716 uint8* dst_u, uint8* dst_v, int width) { |
1385 __asm { | 1717 __asm { |
1386 push esi | 1718 push esi |
1387 push edi | 1719 push edi |
1388 mov eax, [esp + 8 + 4] // src_argb | 1720 mov eax, [esp + 8 + 4] // src_argb |
1389 mov esi, [esp + 8 + 8] // src_stride_argb | 1721 mov esi, [esp + 8 + 8] // src_stride_argb |
1390 mov edx, [esp + 8 + 12] // dst_u | 1722 mov edx, [esp + 8 + 12] // dst_u |
1391 mov edi, [esp + 8 + 16] // dst_v | 1723 mov edi, [esp + 8 + 16] // dst_v |
1392 mov ecx, [esp + 8 + 20] // pix | 1724 mov ecx, [esp + 8 + 20] // pix |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1442 lea edx, [edx + 8] | 1774 lea edx, [edx + 8] |
1443 sub ecx, 16 | 1775 sub ecx, 16 |
1444 jg convertloop | 1776 jg convertloop |
1445 | 1777 |
1446 pop edi | 1778 pop edi |
1447 pop esi | 1779 pop esi |
1448 ret | 1780 ret |
1449 } | 1781 } |
1450 } | 1782 } |
1451 | 1783 |
1452 __declspec(naked) __declspec(align(16)) | 1784 __declspec(naked) |
1453 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1785 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1454 uint8* dst_u, uint8* dst_v, int width) { | 1786 uint8* dst_u, uint8* dst_v, int width) { |
1455 __asm { | 1787 __asm { |
1456 push esi | 1788 push esi |
1457 push edi | 1789 push edi |
1458 mov eax, [esp + 8 + 4] // src_argb | 1790 mov eax, [esp + 8 + 4] // src_argb |
1459 mov esi, [esp + 8 + 8] // src_stride_argb | 1791 mov esi, [esp + 8 + 8] // src_stride_argb |
1460 mov edx, [esp + 8 + 12] // dst_u | 1792 mov edx, [esp + 8 + 12] // dst_u |
1461 mov edi, [esp + 8 + 16] // dst_v | 1793 mov edi, [esp + 8 + 16] // dst_v |
1462 mov ecx, [esp + 8 + 20] // pix | 1794 mov ecx, [esp + 8 + 20] // pix |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1512 lea edx, [edx + 8] | 1844 lea edx, [edx + 8] |
1513 sub ecx, 16 | 1845 sub ecx, 16 |
1514 jg convertloop | 1846 jg convertloop |
1515 | 1847 |
1516 pop edi | 1848 pop edi |
1517 pop esi | 1849 pop esi |
1518 ret | 1850 ret |
1519 } | 1851 } |
1520 } | 1852 } |
1521 | 1853 |
1522 __declspec(naked) __declspec(align(16)) | 1854 __declspec(naked) |
1523 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1855 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1524 uint8* dst_u, uint8* dst_v, int width) { | 1856 uint8* dst_u, uint8* dst_v, int width) { |
1525 __asm { | 1857 __asm { |
1526 push esi | 1858 push esi |
1527 push edi | 1859 push edi |
1528 mov eax, [esp + 8 + 4] // src_argb | 1860 mov eax, [esp + 8 + 4] // src_argb |
1529 mov esi, [esp + 8 + 8] // src_stride_argb | 1861 mov esi, [esp + 8 + 8] // src_stride_argb |
1530 mov edx, [esp + 8 + 12] // dst_u | 1862 mov edx, [esp + 8 + 12] // dst_u |
1531 mov edi, [esp + 8 + 16] // dst_v | 1863 mov edi, [esp + 8 + 16] // dst_v |
1532 mov ecx, [esp + 8 + 20] // pix | 1864 mov ecx, [esp + 8 + 20] // pix |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1583 sub ecx, 16 | 1915 sub ecx, 16 |
1584 jg convertloop | 1916 jg convertloop |
1585 | 1917 |
1586 pop edi | 1918 pop edi |
1587 pop esi | 1919 pop esi |
1588 ret | 1920 ret |
1589 } | 1921 } |
1590 } | 1922 } |
1591 #endif // HAS_ARGBTOYROW_SSSE3 | 1923 #endif // HAS_ARGBTOYROW_SSSE3 |
1592 | 1924 |
// Read 16 U and 16 V bytes from planar 444 chroma (esi = U plane,
// esi + edi = V plane) and interleave them into ymm0 as 16 UV pairs.
// Advances esi by 16.
#define READYUV444_AVX2 __asm {                                                \
    __asm vmovdqu    xmm0, [esi]                  /* U */         /* NOLINT */ \
    __asm vmovdqu    xmm1, [esi + edi]            /* V */         /* NOLINT */ \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpermq     ymm1, ymm1, 0xd8                                          \
    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
  }
| 1934 |
// Read 8 U and 8 V bytes from planar 422 chroma (esi = U plane,
// esi + edi = V plane), interleave to 8 UV pairs and duplicate each pair,
// upsampling to 16 UV values in ymm0.  Advances esi by 8.
#define READYUV422_AVX2 __asm {                                                \
    __asm vmovq      xmm0, qword ptr [esi]        /* U */         /* NOLINT */ \
    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */         /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
  }
1602 | 1944 |
// Read 4 U and 4 V bytes from planar 411 chroma (esi = U plane,
// esi + edi = V plane), interleave and replicate each UV pair 4x,
// upsampling to 16 UV values in ymm0.  Advances esi by 4.
#define READYUV411_AVX2 __asm {                                                \
    __asm vmovd      xmm0, dword ptr [esi]        /* U */         /* NOLINT */ \
    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */         /* NOLINT */ \
    __asm lea        esi,  [esi + 4]                                           \
    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
  }
| 1955 |
// Read 8 interleaved UV pairs from NV12 chroma (esi) and duplicate each
// pair, upsampling to 16 UV values in ymm0.  Advances esi by 16.
#define READNV12_AVX2 __asm {                                                  \
    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
  }
1610 | 1963 |
1611 // Convert 16 pixels: 16 UV and 16 Y. | 1964 // Convert 16 pixels: 16 UV and 16 Y. |
1612 #define YUVTORGB_AVX2(YuvConstants) __asm { \ | 1965 #define YUVTORGB_AVX2(YuvConstants) __asm { \ |
(...skipping 26 matching lines...) Expand all Loading... |
1639 | 1992 |
// Store 16 ARGB values.  Weaves the B (ymm0), G (ymm1), R (ymm2) channels
// with alpha (ymm5) into 16 interleaved ARGB pixels (64 bytes) at edx.
// Advances edx by 64.
#define STOREARGB_AVX2 __asm {                                                 \
    /* Step 3: Weave into ARGB */                                              \
    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
    __asm vpermq     ymm2, ymm2, 0xd8                                          \
    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
    __asm vmovdqu    0[edx], ymm1                                              \
    __asm vmovdqu    32[edx], ymm0                                             \
    __asm lea        edx,  [edx + 64]                                          \
  }
1653 | 2006 |
1654 #ifdef HAS_I422TOARGBROW_AVX2 | 2007 #ifdef HAS_I422TOARGBROW_AVX2 |
1655 // 16 pixels | 2008 // 16 pixels |
1656 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2009 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
1657 __declspec(naked) __declspec(align(16)) | 2010 __declspec(naked) |
1658 void I422ToARGBRow_AVX2(const uint8* y_buf, | 2011 void I422ToARGBRow_AVX2(const uint8* y_buf, |
1659 const uint8* u_buf, | 2012 const uint8* u_buf, |
1660 const uint8* v_buf, | 2013 const uint8* v_buf, |
1661 uint8* dst_argb, | 2014 uint8* dst_argb, |
1662 int width) { | 2015 int width) { |
1663 __asm { | 2016 __asm { |
1664 push esi | 2017 push esi |
1665 push edi | 2018 push edi |
1666 mov eax, [esp + 8 + 4] // Y | 2019 mov eax, [esp + 8 + 4] // Y |
1667 mov esi, [esp + 8 + 8] // U | 2020 mov esi, [esp + 8 + 8] // U |
(...skipping 12 matching lines...) Expand all Loading... |
1680 jg convertloop | 2033 jg convertloop |
1681 | 2034 |
1682 pop edi | 2035 pop edi |
1683 pop esi | 2036 pop esi |
1684 vzeroupper | 2037 vzeroupper |
1685 ret | 2038 ret |
1686 } | 2039 } |
1687 } | 2040 } |
1688 #endif // HAS_I422TOARGBROW_AVX2 | 2041 #endif // HAS_I422TOARGBROW_AVX2 |
1689 | 2042 |
| 2043 #ifdef HAS_J422TOARGBROW_AVX2 |
| 2044 // 16 pixels |
| 2045 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2046 __declspec(naked) |
| 2047 void J422ToARGBRow_AVX2(const uint8* y_buf, |
| 2048 const uint8* u_buf, |
| 2049 const uint8* v_buf, |
| 2050 uint8* dst_argb, |
| 2051 int width) { |
| 2052 __asm { |
| 2053 push esi |
| 2054 push edi |
| 2055 mov eax, [esp + 8 + 4] // Y |
| 2056 mov esi, [esp + 8 + 8] // U |
| 2057 mov edi, [esp + 8 + 12] // V |
| 2058 mov edx, [esp + 8 + 16] // argb |
| 2059 mov ecx, [esp + 8 + 20] // width |
| 2060 sub edi, esi |
| 2061 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2062 |
| 2063 convertloop: |
| 2064 READYUV422_AVX2 |
| 2065 YUVTORGB_AVX2(kYuvJConstants) |
| 2066 STOREARGB_AVX2 |
| 2067 |
| 2068 sub ecx, 16 |
| 2069 jg convertloop |
| 2070 |
| 2071 pop edi |
| 2072 pop esi |
| 2073 vzeroupper |
| 2074 ret |
| 2075 } |
| 2076 } |
| 2077 #endif // HAS_J422TOARGBROW_AVX2 |
| 2078 |
| 2079 #ifdef HAS_I444TOARGBROW_AVX2 |
| 2080 // 16 pixels |
| 2081 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). |
| 2082 __declspec(naked) |
| 2083 void I444ToARGBRow_AVX2(const uint8* y_buf, |
| 2084 const uint8* u_buf, |
| 2085 const uint8* v_buf, |
| 2086 uint8* dst_argb, |
| 2087 int width) { |
| 2088 __asm { |
| 2089 push esi |
| 2090 push edi |
| 2091 mov eax, [esp + 8 + 4] // Y |
| 2092 mov esi, [esp + 8 + 8] // U |
| 2093 mov edi, [esp + 8 + 12] // V |
| 2094 mov edx, [esp + 8 + 16] // argb |
| 2095 mov ecx, [esp + 8 + 20] // width |
| 2096 sub edi, esi |
| 2097 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2098 |
| 2099 convertloop: |
| 2100 READYUV444_AVX2 |
| 2101 YUVTORGB_AVX2(kYuvConstants) |
| 2102 STOREARGB_AVX2 |
| 2103 |
| 2104 sub ecx, 16 |
| 2105 jg convertloop |
| 2106 |
| 2107 pop edi |
| 2108 pop esi |
| 2109 vzeroupper |
| 2110 ret |
| 2111 } |
| 2112 } |
| 2113 #endif // HAS_I444TOARGBROW_AVX2 |
| 2114 |
| 2115 #ifdef HAS_I411TOARGBROW_AVX2 |
| 2116 // 16 pixels |
| 2117 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2118 __declspec(naked) |
| 2119 void I411ToARGBRow_AVX2(const uint8* y_buf, |
| 2120 const uint8* u_buf, |
| 2121 const uint8* v_buf, |
| 2122 uint8* dst_argb, |
| 2123 int width) { |
| 2124 __asm { |
| 2125 push esi |
| 2126 push edi |
| 2127 mov eax, [esp + 8 + 4] // Y |
| 2128 mov esi, [esp + 8 + 8] // U |
| 2129 mov edi, [esp + 8 + 12] // V |
| 2130 mov edx, [esp + 8 + 16] // argb |
| 2131 mov ecx, [esp + 8 + 20] // width |
| 2132 sub edi, esi |
| 2133 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2134 |
| 2135 convertloop: |
| 2136 READYUV411_AVX2 |
| 2137 YUVTORGB_AVX2(kYuvConstants) |
| 2138 STOREARGB_AVX2 |
| 2139 |
| 2140 sub ecx, 16 |
| 2141 jg convertloop |
| 2142 |
| 2143 pop edi |
| 2144 pop esi |
| 2145 vzeroupper |
| 2146 ret |
| 2147 } |
| 2148 } |
| 2149 #endif // HAS_I411TOARGBROW_AVX2 |
| 2150 |
1690 #ifdef HAS_NV12TOARGBROW_AVX2 | 2151 #ifdef HAS_NV12TOARGBROW_AVX2 |
1691 // 16 pixels. | 2152 // 16 pixels. |
1692 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2153 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
1693 __declspec(naked) __declspec(align(16)) | 2154 __declspec(naked) |
1694 void NV12ToARGBRow_AVX2(const uint8* y_buf, | 2155 void NV12ToARGBRow_AVX2(const uint8* y_buf, |
1695 const uint8* uv_buf, | 2156 const uint8* uv_buf, |
1696 uint8* dst_argb, | 2157 uint8* dst_argb, |
1697 int width) { | 2158 int width) { |
1698 __asm { | 2159 __asm { |
1699 push esi | 2160 push esi |
1700 mov eax, [esp + 4 + 4] // Y | 2161 mov eax, [esp + 4 + 4] // Y |
1701 mov esi, [esp + 4 + 8] // UV | 2162 mov esi, [esp + 4 + 8] // UV |
1702 mov edx, [esp + 4 + 12] // argb | 2163 mov edx, [esp + 4 + 12] // argb |
1703 mov ecx, [esp + 4 + 16] // width | 2164 mov ecx, [esp + 4 + 16] // width |
1704 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2165 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
1705 | 2166 |
1706 convertloop: | 2167 convertloop: |
1707 READNV12_AVX2 | 2168 READNV12_AVX2 |
1708 YUVTORGB_AVX2(kYuvConstants) | 2169 YUVTORGB_AVX2(kYuvConstants) |
1709 STOREARGB_AVX2 | 2170 STOREARGB_AVX2 |
1710 | 2171 |
1711 sub ecx, 16 | 2172 sub ecx, 16 |
1712 jg convertloop | 2173 jg convertloop |
1713 | 2174 |
1714 pop esi | 2175 pop esi |
| 2176 vzeroupper |
1715 ret | 2177 ret |
1716 } | 2178 } |
1717 } | 2179 } |
1718 #endif // HAS_NV12TOARGBROW_AVX2 | 2180 #endif // HAS_NV12TOARGBROW_AVX2 |
1719 | 2181 |
1720 #ifdef HAS_NV21TOARGBROW_AVX2 | 2182 #ifdef HAS_NV21TOARGBROW_AVX2 |
1721 // 16 pixels. | 2183 // 16 pixels. |
1722 // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). | 2184 // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). |
1723 __declspec(naked) __declspec(align(16)) | 2185 __declspec(naked) |
1724 void NV21ToARGBRow_AVX2(const uint8* y_buf, | 2186 void NV21ToARGBRow_AVX2(const uint8* y_buf, |
1725 const uint8* uv_buf, | 2187 const uint8* uv_buf, |
1726 uint8* dst_argb, | 2188 uint8* dst_argb, |
1727 int width) { | 2189 int width) { |
1728 __asm { | 2190 __asm { |
1729 push esi | 2191 push esi |
1730 mov eax, [esp + 4 + 4] // Y | 2192 mov eax, [esp + 4 + 4] // Y |
1731 mov esi, [esp + 4 + 8] // UV | 2193 mov esi, [esp + 4 + 8] // UV |
1732 mov edx, [esp + 4 + 12] // argb | 2194 mov edx, [esp + 4 + 12] // argb |
1733 mov ecx, [esp + 4 + 16] // width | 2195 mov ecx, [esp + 4 + 16] // width |
1734 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2196 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
1735 | 2197 |
1736 convertloop: | 2198 convertloop: |
1737 READNV12_AVX2 | 2199 READNV12_AVX2 |
1738 YUVTORGB_AVX2(kYvuConstants) | 2200 YUVTORGB_AVX2(kYvuConstants) |
1739 STOREARGB_AVX2 | 2201 STOREARGB_AVX2 |
1740 | 2202 |
1741 sub ecx, 16 | 2203 sub ecx, 16 |
1742 jg convertloop | 2204 jg convertloop |
1743 | 2205 |
1744 pop esi | 2206 pop esi |
| 2207 vzeroupper |
1745 ret | 2208 ret |
1746 } | 2209 } |
1747 } | 2210 } |
1748 #endif // HAS_NV21TOARGBROW_AVX2 | 2211 #endif // HAS_NV21TOARGBROW_AVX2 |
1749 | 2212 |
1750 #ifdef HAS_I422TOBGRAROW_AVX2 | 2213 #ifdef HAS_I422TOBGRAROW_AVX2 |
1751 // 16 pixels | 2214 // 16 pixels |
1752 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 2215 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
1753 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2216 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
1754 __declspec(naked) __declspec(align(16)) | 2217 __declspec(naked) |
1755 void I422ToBGRARow_AVX2(const uint8* y_buf, | 2218 void I422ToBGRARow_AVX2(const uint8* y_buf, |
1756 const uint8* u_buf, | 2219 const uint8* u_buf, |
1757 const uint8* v_buf, | 2220 const uint8* v_buf, |
1758 uint8* dst_argb, | 2221 uint8* dst_argb, |
1759 int width) { | 2222 int width) { |
1760 __asm { | 2223 __asm { |
1761 push esi | 2224 push esi |
1762 push edi | 2225 push edi |
1763 mov eax, [esp + 8 + 4] // Y | 2226 mov eax, [esp + 8 + 4] // Y |
1764 mov esi, [esp + 8 + 8] // U | 2227 mov esi, [esp + 8 + 8] // U |
(...skipping 25 matching lines...) Expand all Loading... |
1790 vzeroupper | 2253 vzeroupper |
1791 ret | 2254 ret |
1792 } | 2255 } |
1793 } | 2256 } |
1794 #endif // HAS_I422TOBGRAROW_AVX2 | 2257 #endif // HAS_I422TOBGRAROW_AVX2 |
1795 | 2258 |
1796 #ifdef HAS_I422TORGBAROW_AVX2 | 2259 #ifdef HAS_I422TORGBAROW_AVX2 |
1797 // 16 pixels | 2260 // 16 pixels |
1798 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 2261 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
1799 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2262 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
1800 __declspec(naked) __declspec(align(16)) | 2263 __declspec(naked) |
1801 void I422ToRGBARow_AVX2(const uint8* y_buf, | 2264 void I422ToRGBARow_AVX2(const uint8* y_buf, |
1802 const uint8* u_buf, | 2265 const uint8* u_buf, |
1803 const uint8* v_buf, | 2266 const uint8* v_buf, |
1804 uint8* dst_argb, | 2267 uint8* dst_argb, |
1805 int width) { | 2268 int width) { |
1806 __asm { | 2269 __asm { |
1807 push esi | 2270 push esi |
1808 push edi | 2271 push edi |
1809 mov eax, [esp + 8 + 4] // Y | 2272 mov eax, [esp + 8 + 4] // Y |
1810 mov esi, [esp + 8 + 8] // U | 2273 mov esi, [esp + 8 + 8] // U |
(...skipping 25 matching lines...) Expand all Loading... |
1836 vzeroupper | 2299 vzeroupper |
1837 ret | 2300 ret |
1838 } | 2301 } |
1839 } | 2302 } |
1840 #endif // HAS_I422TORGBAROW_AVX2 | 2303 #endif // HAS_I422TORGBAROW_AVX2 |
1841 | 2304 |
1842 #ifdef HAS_I422TOABGRROW_AVX2 | 2305 #ifdef HAS_I422TOABGRROW_AVX2 |
1843 // 16 pixels | 2306 // 16 pixels |
1844 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2307 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
1845 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2308 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
1846 __declspec(naked) __declspec(align(16)) | 2309 __declspec(naked) |
1847 void I422ToABGRRow_AVX2(const uint8* y_buf, | 2310 void I422ToABGRRow_AVX2(const uint8* y_buf, |
1848 const uint8* u_buf, | 2311 const uint8* u_buf, |
1849 const uint8* v_buf, | 2312 const uint8* v_buf, |
1850 uint8* dst_argb, | 2313 uint8* dst_argb, |
1851 int width) { | 2314 int width) { |
1852 __asm { | 2315 __asm { |
1853 push esi | 2316 push esi |
1854 push edi | 2317 push edi |
1855 mov eax, [esp + 8 + 4] // Y | 2318 mov eax, [esp + 8 + 4] // Y |
1856 mov esi, [esp + 8 + 8] // U | 2319 mov esi, [esp + 8 + 8] // U |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1907 | 2370 |
1908 // Read 2 UV from 411, upsample to 8 UV. | 2371 // Read 2 UV from 411, upsample to 8 UV. |
1909 #define READYUV411 __asm { \ | 2372 #define READYUV411 __asm { \ |
1910 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ | 2373 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ |
1911 __asm movd xmm0, ebx \ | 2374 __asm movd xmm0, ebx \ |
1912 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ | 2375 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ |
1913 __asm movd xmm1, ebx \ | 2376 __asm movd xmm1, ebx \ |
1914 __asm lea esi, [esi + 2] \ | 2377 __asm lea esi, [esi + 2] \ |
1915 __asm punpcklbw xmm0, xmm1 /* UV */ \ | 2378 __asm punpcklbw xmm0, xmm1 /* UV */ \ |
1916 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ | 2379 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
1917 __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ | 2380 __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ |
1918 } | 2381 } |
1919 | 2382 |
1920 // Read 4 UV from NV12, upsample to 8 UV. | 2383 // Read 4 UV from NV12, upsample to 8 UV. |
1921 #define READNV12 __asm { \ | 2384 #define READNV12 __asm { \ |
1922 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ | 2385 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ |
1923 __asm lea esi, [esi + 8] \ | 2386 __asm lea esi, [esi + 8] \ |
1924 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ | 2387 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
1925 } | 2388 } |
1926 | 2389 |
1927 // Convert 8 pixels: 8 UV and 8 Y. | 2390 // Convert 8 pixels: 8 UV and 8 Y. |
(...skipping 28 matching lines...) Expand all Loading... |
1956 } | 2419 } |
1957 | 2420 |
1958 // Store 8 ARGB values. | 2421 // Store 8 ARGB values. |
1959 #define STOREARGB __asm { \ | 2422 #define STOREARGB __asm { \ |
1960 /* Step 3: Weave into ARGB */ \ | 2423 /* Step 3: Weave into ARGB */ \ |
1961 __asm punpcklbw xmm0, xmm1 /* BG */ \ | 2424 __asm punpcklbw xmm0, xmm1 /* BG */ \ |
1962 __asm punpcklbw xmm2, xmm5 /* RA */ \ | 2425 __asm punpcklbw xmm2, xmm5 /* RA */ \ |
1963 __asm movdqa xmm1, xmm0 \ | 2426 __asm movdqa xmm1, xmm0 \ |
1964 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ | 2427 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ |
1965 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ | 2428 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ |
1966 __asm movdqu [edx], xmm0 \ | 2429 __asm movdqu 0[edx], xmm0 \ |
1967 __asm movdqu [edx + 16], xmm1 \ | 2430 __asm movdqu 16[edx], xmm1 \ |
1968 __asm lea edx, [edx + 32] \ | 2431 __asm lea edx, [edx + 32] \ |
1969 } | 2432 } |
1970 | 2433 |
1971 // Store 8 BGRA values. | 2434 // Store 8 BGRA values. |
1972 #define STOREBGRA __asm { \ | 2435 #define STOREBGRA __asm { \ |
1973 /* Step 3: Weave into BGRA */ \ | 2436 /* Step 3: Weave into BGRA */ \ |
1974 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ | 2437 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ |
1975 __asm punpcklbw xmm1, xmm0 /* GB */ \ | 2438 __asm punpcklbw xmm1, xmm0 /* GB */ \ |
1976 __asm punpcklbw xmm5, xmm2 /* AR */ \ | 2439 __asm punpcklbw xmm5, xmm2 /* AR */ \ |
1977 __asm movdqa xmm0, xmm5 \ | 2440 __asm movdqa xmm0, xmm5 \ |
1978 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ | 2441 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ |
1979 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ | 2442 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ |
1980 __asm movdqu [edx], xmm5 \ | 2443 __asm movdqu 0[edx], xmm5 \ |
1981 __asm movdqu [edx + 16], xmm0 \ | 2444 __asm movdqu 16[edx], xmm0 \ |
1982 __asm lea edx, [edx + 32] \ | 2445 __asm lea edx, [edx + 32] \ |
1983 } | 2446 } |
1984 | 2447 |
1985 // Store 8 ABGR values. | 2448 // Store 8 ABGR values. |
1986 #define STOREABGR __asm { \ | 2449 #define STOREABGR __asm { \ |
1987 /* Step 3: Weave into ABGR */ \ | 2450 /* Step 3: Weave into ABGR */ \ |
1988 __asm punpcklbw xmm2, xmm1 /* RG */ \ | 2451 __asm punpcklbw xmm2, xmm1 /* RG */ \ |
1989 __asm punpcklbw xmm0, xmm5 /* BA */ \ | 2452 __asm punpcklbw xmm0, xmm5 /* BA */ \ |
1990 __asm movdqa xmm1, xmm2 \ | 2453 __asm movdqa xmm1, xmm2 \ |
1991 __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ | 2454 __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ |
1992 __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ | 2455 __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ |
1993 __asm movdqu [edx], xmm2 \ | 2456 __asm movdqu 0[edx], xmm2 \ |
1994 __asm movdqu [edx + 16], xmm1 \ | 2457 __asm movdqu 16[edx], xmm1 \ |
1995 __asm lea edx, [edx + 32] \ | 2458 __asm lea edx, [edx + 32] \ |
1996 } | 2459 } |
1997 | 2460 |
1998 // Store 8 RGBA values. | 2461 // Store 8 RGBA values. |
1999 #define STORERGBA __asm { \ | 2462 #define STORERGBA __asm { \ |
2000 /* Step 3: Weave into RGBA */ \ | 2463 /* Step 3: Weave into RGBA */ \ |
2001 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ | 2464 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ |
2002 __asm punpcklbw xmm1, xmm2 /* GR */ \ | 2465 __asm punpcklbw xmm1, xmm2 /* GR */ \ |
2003 __asm punpcklbw xmm5, xmm0 /* AB */ \ | 2466 __asm punpcklbw xmm5, xmm0 /* AB */ \ |
2004 __asm movdqa xmm0, xmm5 \ | 2467 __asm movdqa xmm0, xmm5 \ |
2005 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ | 2468 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ |
2006 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ | 2469 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ |
2007 __asm movdqu [edx], xmm5 \ | 2470 __asm movdqu 0[edx], xmm5 \ |
2008 __asm movdqu [edx + 16], xmm0 \ | 2471 __asm movdqu 16[edx], xmm0 \ |
2009 __asm lea edx, [edx + 32] \ | 2472 __asm lea edx, [edx + 32] \ |
2010 } | 2473 } |
2011 | 2474 |
2012 // Store 8 RGB24 values. | 2475 // Store 8 RGB24 values. |
2013 #define STORERGB24 __asm { \ | 2476 #define STORERGB24 __asm { \ |
2014 /* Step 3: Weave into RRGB */ \ | 2477 /* Step 3: Weave into RRGB */ \ |
2015 __asm punpcklbw xmm0, xmm1 /* BG */ \ | 2478 __asm punpcklbw xmm0, xmm1 /* BG */ \ |
2016 __asm punpcklbw xmm2, xmm2 /* RR */ \ | 2479 __asm punpcklbw xmm2, xmm2 /* RR */ \ |
2017 __asm movdqa xmm1, xmm0 \ | 2480 __asm movdqa xmm1, xmm0 \ |
2018 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ | 2481 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
2019 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ | 2482 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ |
2020 /* Step 4: RRGB -> RGB24 */ \ | 2483 /* Step 4: RRGB -> RGB24 */ \ |
2021 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ | 2484 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ |
2022 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ | 2485 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ |
2023 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ | 2486 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ |
2024 __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ | 2487 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ |
2025 __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ | 2488 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ |
2026 __asm lea edx, [edx + 24] \ | 2489 __asm lea edx, [edx + 24] \ |
2027 } | 2490 } |
2028 | 2491 |
2029 // Store 8 RAW values. | 2492 // Store 8 RAW values. |
2030 #define STORERAW __asm { \ | 2493 #define STORERAW __asm { \ |
2031 /* Step 3: Weave into RRGB */ \ | 2494 /* Step 3: Weave into RRGB */ \ |
2032 __asm punpcklbw xmm0, xmm1 /* BG */ \ | 2495 __asm punpcklbw xmm0, xmm1 /* BG */ \ |
2033 __asm punpcklbw xmm2, xmm2 /* RR */ \ | 2496 __asm punpcklbw xmm2, xmm2 /* RR */ \ |
2034 __asm movdqa xmm1, xmm0 \ | 2497 __asm movdqa xmm1, xmm0 \ |
2035 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ | 2498 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
2036 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ | 2499 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ |
2037 /* Step 4: RRGB -> RAW */ \ | 2500 /* Step 4: RRGB -> RAW */ \ |
2038 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ | 2501 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ |
2039 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ | 2502 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ |
2040 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ | 2503 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ |
2041 __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ | 2504 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ |
2042 __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ | 2505 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ |
2043 __asm lea edx, [edx + 24] \ | 2506 __asm lea edx, [edx + 24] \ |
2044 } | 2507 } |
2045 | 2508 |
2046 // Store 8 RGB565 values. | 2509 // Store 8 RGB565 values. |
2047 #define STORERGB565 __asm { \ | 2510 #define STORERGB565 __asm { \ |
2048 /* Step 3: Weave into RRGB */ \ | 2511 /* Step 3: Weave into RRGB */ \ |
2049 __asm punpcklbw xmm0, xmm1 /* BG */ \ | 2512 __asm punpcklbw xmm0, xmm1 /* BG */ \ |
2050 __asm punpcklbw xmm2, xmm2 /* RR */ \ | 2513 __asm punpcklbw xmm2, xmm2 /* RR */ \ |
2051 __asm movdqa xmm1, xmm0 \ | 2514 __asm movdqa xmm1, xmm0 \ |
2052 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ | 2515 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
(...skipping 15 matching lines...) Expand all Loading... |
2068 __asm pslld xmm1, 8 /* R */ \ | 2531 __asm pslld xmm1, 8 /* R */ \ |
2069 __asm psrld xmm3, 3 /* B */ \ | 2532 __asm psrld xmm3, 3 /* B */ \ |
2070 __asm psrld xmm2, 5 /* G */ \ | 2533 __asm psrld xmm2, 5 /* G */ \ |
2071 __asm psrad xmm1, 16 /* R */ \ | 2534 __asm psrad xmm1, 16 /* R */ \ |
2072 __asm pand xmm3, xmm5 /* B */ \ | 2535 __asm pand xmm3, xmm5 /* B */ \ |
2073 __asm pand xmm2, xmm6 /* G */ \ | 2536 __asm pand xmm2, xmm6 /* G */ \ |
2074 __asm pand xmm1, xmm7 /* R */ \ | 2537 __asm pand xmm1, xmm7 /* R */ \ |
2075 __asm por xmm3, xmm2 /* BG */ \ | 2538 __asm por xmm3, xmm2 /* BG */ \ |
2076 __asm por xmm1, xmm3 /* BGR */ \ | 2539 __asm por xmm1, xmm3 /* BGR */ \ |
2077 __asm packssdw xmm0, xmm1 \ | 2540 __asm packssdw xmm0, xmm1 \ |
2078 __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \ | 2541 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ |
2079 __asm lea edx, [edx + 16] \ | 2542 __asm lea edx, [edx + 16] \ |
2080 } | 2543 } |
2081 | 2544 |
2082 // 8 pixels. | 2545 // 8 pixels. |
2083 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). | 2546 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). |
2084 __declspec(naked) __declspec(align(16)) | 2547 __declspec(naked) |
2085 void I444ToARGBRow_SSSE3(const uint8* y_buf, | 2548 void I444ToARGBRow_SSSE3(const uint8* y_buf, |
2086 const uint8* u_buf, | 2549 const uint8* u_buf, |
2087 const uint8* v_buf, | 2550 const uint8* v_buf, |
2088 uint8* dst_argb, | 2551 uint8* dst_argb, |
2089 int width) { | 2552 int width) { |
2090 __asm { | 2553 __asm { |
2091 push esi | 2554 push esi |
2092 push edi | 2555 push edi |
2093 mov eax, [esp + 8 + 4] // Y | 2556 mov eax, [esp + 8 + 4] // Y |
2094 mov esi, [esp + 8 + 8] // U | 2557 mov esi, [esp + 8 + 8] // U |
(...skipping 12 matching lines...) Expand all Loading... |
2107 jg convertloop | 2570 jg convertloop |
2108 | 2571 |
2109 pop edi | 2572 pop edi |
2110 pop esi | 2573 pop esi |
2111 ret | 2574 ret |
2112 } | 2575 } |
2113 } | 2576 } |
2114 | 2577 |
2115 // 8 pixels. | 2578 // 8 pixels. |
2116 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). | 2579 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). |
2117 __declspec(naked) __declspec(align(16)) | 2580 __declspec(naked) |
2118 void I422ToRGB24Row_SSSE3(const uint8* y_buf, | 2581 void I422ToRGB24Row_SSSE3(const uint8* y_buf, |
2119 const uint8* u_buf, | 2582 const uint8* u_buf, |
2120 const uint8* v_buf, | 2583 const uint8* v_buf, |
2121 uint8* dst_rgb24, | 2584 uint8* dst_rgb24, |
2122 int width) { | 2585 int width) { |
2123 __asm { | 2586 __asm { |
2124 push esi | 2587 push esi |
2125 push edi | 2588 push edi |
2126 mov eax, [esp + 8 + 4] // Y | 2589 mov eax, [esp + 8 + 4] // Y |
2127 mov esi, [esp + 8 + 8] // U | 2590 mov esi, [esp + 8 + 8] // U |
(...skipping 13 matching lines...) Expand all Loading... |
2141 jg convertloop | 2604 jg convertloop |
2142 | 2605 |
2143 pop edi | 2606 pop edi |
2144 pop esi | 2607 pop esi |
2145 ret | 2608 ret |
2146 } | 2609 } |
2147 } | 2610 } |
2148 | 2611 |
2149 // 8 pixels. | 2612 // 8 pixels. |
2150 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). | 2613 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). |
2151 __declspec(naked) __declspec(align(16)) | 2614 __declspec(naked) |
2152 void I422ToRAWRow_SSSE3(const uint8* y_buf, | 2615 void I422ToRAWRow_SSSE3(const uint8* y_buf, |
2153 const uint8* u_buf, | 2616 const uint8* u_buf, |
2154 const uint8* v_buf, | 2617 const uint8* v_buf, |
2155 uint8* dst_raw, | 2618 uint8* dst_raw, |
2156 int width) { | 2619 int width) { |
2157 __asm { | 2620 __asm { |
2158 push esi | 2621 push esi |
2159 push edi | 2622 push edi |
2160 mov eax, [esp + 8 + 4] // Y | 2623 mov eax, [esp + 8 + 4] // Y |
2161 mov esi, [esp + 8 + 8] // U | 2624 mov esi, [esp + 8 + 8] // U |
(...skipping 13 matching lines...) Expand all Loading... |
2175 jg convertloop | 2638 jg convertloop |
2176 | 2639 |
2177 pop edi | 2640 pop edi |
2178 pop esi | 2641 pop esi |
2179 ret | 2642 ret |
2180 } | 2643 } |
2181 } | 2644 } |
2182 | 2645 |
2183 // 8 pixels | 2646 // 8 pixels |
2184 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). | 2647 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). |
2185 __declspec(naked) __declspec(align(16)) | 2648 __declspec(naked) |
2186 void I422ToRGB565Row_SSSE3(const uint8* y_buf, | 2649 void I422ToRGB565Row_SSSE3(const uint8* y_buf, |
2187 const uint8* u_buf, | 2650 const uint8* u_buf, |
2188 const uint8* v_buf, | 2651 const uint8* v_buf, |
2189 uint8* rgb565_buf, | 2652 uint8* rgb565_buf, |
2190 int width) { | 2653 int width) { |
2191 __asm { | 2654 __asm { |
2192 push esi | 2655 push esi |
2193 push edi | 2656 push edi |
2194 mov eax, [esp + 8 + 4] // Y | 2657 mov eax, [esp + 8 + 4] // Y |
2195 mov esi, [esp + 8 + 8] // U | 2658 mov esi, [esp + 8 + 8] // U |
(...skipping 18 matching lines...) Expand all Loading... |
2214 jg convertloop | 2677 jg convertloop |
2215 | 2678 |
2216 pop edi | 2679 pop edi |
2217 pop esi | 2680 pop esi |
2218 ret | 2681 ret |
2219 } | 2682 } |
2220 } | 2683 } |
2221 | 2684 |
2222 // 8 pixels. | 2685 // 8 pixels. |
2223 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2686 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
2224 __declspec(naked) __declspec(align(16)) | 2687 __declspec(naked) |
2225 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 2688 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
2226 const uint8* u_buf, | 2689 const uint8* u_buf, |
2227 const uint8* v_buf, | 2690 const uint8* v_buf, |
2228 uint8* dst_argb, | 2691 uint8* dst_argb, |
2229 int width) { | 2692 int width) { |
2230 __asm { | 2693 __asm { |
2231 push esi | 2694 push esi |
2232 push edi | 2695 push edi |
2233 mov eax, [esp + 8 + 4] // Y | 2696 mov eax, [esp + 8 + 4] // Y |
2234 mov esi, [esp + 8 + 8] // U | 2697 mov esi, [esp + 8 + 8] // U |
(...skipping 11 matching lines...) Expand all Loading... |
2246 sub ecx, 8 | 2709 sub ecx, 8 |
2247 jg convertloop | 2710 jg convertloop |
2248 | 2711 |
2249 pop edi | 2712 pop edi |
2250 pop esi | 2713 pop esi |
2251 ret | 2714 ret |
2252 } | 2715 } |
2253 } | 2716 } |
2254 | 2717 |
2255 // 8 pixels. | 2718 // 8 pixels. |
| 2719 // JPeg color space version of I422ToARGB |
| 2720 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2721 __declspec(naked) |
| 2722 void J422ToARGBRow_SSSE3(const uint8* y_buf, |
| 2723 const uint8* u_buf, |
| 2724 const uint8* v_buf, |
| 2725 uint8* dst_argb, |
| 2726 int width) { |
| 2727 __asm { |
| 2728 push esi |
| 2729 push edi |
| 2730 mov eax, [esp + 8 + 4] // Y |
| 2731 mov esi, [esp + 8 + 8] // U |
| 2732 mov edi, [esp + 8 + 12] // V |
| 2733 mov edx, [esp + 8 + 16] // argb |
| 2734 mov ecx, [esp + 8 + 20] // width |
| 2735 sub edi, esi |
| 2736 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2737 |
| 2738 convertloop: |
| 2739 READYUV422 |
| 2740 YUVTORGB(kYuvJConstants) |
| 2741 STOREARGB |
| 2742 |
| 2743 sub ecx, 8 |
| 2744 jg convertloop |
| 2745 |
| 2746 pop edi |
| 2747 pop esi |
| 2748 ret |
| 2749 } |
| 2750 } |
| 2751 |
| 2752 // 8 pixels. |
2256 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2753 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
2257 // Similar to I420 but duplicate UV once more. | 2754 // Similar to I420 but duplicate UV once more. |
2258 __declspec(naked) __declspec(align(16)) | 2755 __declspec(naked) |
2259 void I411ToARGBRow_SSSE3(const uint8* y_buf, | 2756 void I411ToARGBRow_SSSE3(const uint8* y_buf, |
2260 const uint8* u_buf, | 2757 const uint8* u_buf, |
2261 const uint8* v_buf, | 2758 const uint8* v_buf, |
2262 uint8* dst_argb, | 2759 uint8* dst_argb, |
2263 int width) { | 2760 int width) { |
2264 __asm { | 2761 __asm { |
2265 push ebx | 2762 push ebx |
2266 push esi | 2763 push esi |
2267 push edi | 2764 push edi |
2268 mov eax, [esp + 12 + 4] // Y | 2765 mov eax, [esp + 12 + 4] // Y |
(...skipping 14 matching lines...) Expand all Loading... |
2283 | 2780 |
2284 pop edi | 2781 pop edi |
2285 pop esi | 2782 pop esi |
2286 pop ebx | 2783 pop ebx |
2287 ret | 2784 ret |
2288 } | 2785 } |
2289 } | 2786 } |
2290 | 2787 |
2291 // 8 pixels. | 2788 // 8 pixels. |
2292 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2789 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
2293 __declspec(naked) __declspec(align(16)) | 2790 __declspec(naked) |
2294 void NV12ToARGBRow_SSSE3(const uint8* y_buf, | 2791 void NV12ToARGBRow_SSSE3(const uint8* y_buf, |
2295 const uint8* uv_buf, | 2792 const uint8* uv_buf, |
2296 uint8* dst_argb, | 2793 uint8* dst_argb, |
2297 int width) { | 2794 int width) { |
2298 __asm { | 2795 __asm { |
2299 push esi | 2796 push esi |
2300 mov eax, [esp + 4 + 4] // Y | 2797 mov eax, [esp + 4 + 4] // Y |
2301 mov esi, [esp + 4 + 8] // UV | 2798 mov esi, [esp + 4 + 8] // UV |
2302 mov edx, [esp + 4 + 12] // argb | 2799 mov edx, [esp + 4 + 12] // argb |
2303 mov ecx, [esp + 4 + 16] // width | 2800 mov ecx, [esp + 4 + 16] // width |
2304 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2801 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2305 | 2802 |
2306 convertloop: | 2803 convertloop: |
2307 READNV12 | 2804 READNV12 |
2308 YUVTORGB(kYuvConstants) | 2805 YUVTORGB(kYuvConstants) |
2309 STOREARGB | 2806 STOREARGB |
2310 | 2807 |
2311 sub ecx, 8 | 2808 sub ecx, 8 |
2312 jg convertloop | 2809 jg convertloop |
2313 | 2810 |
2314 pop esi | 2811 pop esi |
2315 ret | 2812 ret |
2316 } | 2813 } |
2317 } | 2814 } |
2318 | 2815 |
2319 // 8 pixels. | 2816 // 8 pixels. |
2320 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). | 2817 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). |
2321 __declspec(naked) __declspec(align(16)) | 2818 __declspec(naked) |
2322 void NV21ToARGBRow_SSSE3(const uint8* y_buf, | 2819 void NV21ToARGBRow_SSSE3(const uint8* y_buf, |
2323 const uint8* uv_buf, | 2820 const uint8* uv_buf, |
2324 uint8* dst_argb, | 2821 uint8* dst_argb, |
2325 int width) { | 2822 int width) { |
2326 __asm { | 2823 __asm { |
2327 push esi | 2824 push esi |
2328 mov eax, [esp + 4 + 4] // Y | 2825 mov eax, [esp + 4 + 4] // Y |
2329 mov esi, [esp + 4 + 8] // UV | 2826 mov esi, [esp + 4 + 8] // UV |
2330 mov edx, [esp + 4 + 12] // argb | 2827 mov edx, [esp + 4 + 12] // argb |
2331 mov ecx, [esp + 4 + 16] // width | 2828 mov ecx, [esp + 4 + 16] // width |
2332 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2829 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2333 | 2830 |
2334 convertloop: | 2831 convertloop: |
2335 READNV12 | 2832 READNV12 |
2336 YUVTORGB(kYvuConstants) | 2833 YUVTORGB(kYvuConstants) |
2337 STOREARGB | 2834 STOREARGB |
2338 | 2835 |
2339 sub ecx, 8 | 2836 sub ecx, 8 |
2340 jg convertloop | 2837 jg convertloop |
2341 | 2838 |
2342 pop esi | 2839 pop esi |
2343 ret | 2840 ret |
2344 } | 2841 } |
2345 } | 2842 } |
2346 | 2843 |
2347 __declspec(naked) __declspec(align(16)) | 2844 __declspec(naked) |
2348 void I422ToBGRARow_SSSE3(const uint8* y_buf, | 2845 void I422ToBGRARow_SSSE3(const uint8* y_buf, |
2349 const uint8* u_buf, | 2846 const uint8* u_buf, |
2350 const uint8* v_buf, | 2847 const uint8* v_buf, |
2351 uint8* dst_bgra, | 2848 uint8* dst_bgra, |
2352 int width) { | 2849 int width) { |
2353 __asm { | 2850 __asm { |
2354 push esi | 2851 push esi |
2355 push edi | 2852 push edi |
2356 mov eax, [esp + 8 + 4] // Y | 2853 mov eax, [esp + 8 + 4] // Y |
2357 mov esi, [esp + 8 + 8] // U | 2854 mov esi, [esp + 8 + 8] // U |
2358 mov edi, [esp + 8 + 12] // V | 2855 mov edi, [esp + 8 + 12] // V |
2359 mov edx, [esp + 8 + 16] // bgra | 2856 mov edx, [esp + 8 + 16] // bgra |
2360 mov ecx, [esp + 8 + 20] // width | 2857 mov ecx, [esp + 8 + 20] // width |
2361 sub edi, esi | 2858 sub edi, esi |
2362 | 2859 |
2363 convertloop: | 2860 convertloop: |
2364 READYUV422 | 2861 READYUV422 |
2365 YUVTORGB(kYuvConstants) | 2862 YUVTORGB(kYuvConstants) |
2366 STOREBGRA | 2863 STOREBGRA |
2367 | 2864 |
2368 sub ecx, 8 | 2865 sub ecx, 8 |
2369 jg convertloop | 2866 jg convertloop |
2370 | 2867 |
2371 pop edi | 2868 pop edi |
2372 pop esi | 2869 pop esi |
2373 ret | 2870 ret |
2374 } | 2871 } |
2375 } | 2872 } |
2376 | 2873 |
2377 __declspec(naked) __declspec(align(16)) | 2874 __declspec(naked) |
2378 void I422ToABGRRow_SSSE3(const uint8* y_buf, | 2875 void I422ToABGRRow_SSSE3(const uint8* y_buf, |
2379 const uint8* u_buf, | 2876 const uint8* u_buf, |
2380 const uint8* v_buf, | 2877 const uint8* v_buf, |
2381 uint8* dst_abgr, | 2878 uint8* dst_abgr, |
2382 int width) { | 2879 int width) { |
2383 __asm { | 2880 __asm { |
2384 push esi | 2881 push esi |
2385 push edi | 2882 push edi |
2386 mov eax, [esp + 8 + 4] // Y | 2883 mov eax, [esp + 8 + 4] // Y |
2387 mov esi, [esp + 8 + 8] // U | 2884 mov esi, [esp + 8 + 8] // U |
(...skipping 10 matching lines...) Expand all Loading... |
2398 | 2895 |
2399 sub ecx, 8 | 2896 sub ecx, 8 |
2400 jg convertloop | 2897 jg convertloop |
2401 | 2898 |
2402 pop edi | 2899 pop edi |
2403 pop esi | 2900 pop esi |
2404 ret | 2901 ret |
2405 } | 2902 } |
2406 } | 2903 } |
2407 | 2904 |
2408 __declspec(naked) __declspec(align(16)) | 2905 __declspec(naked) |
2409 void I422ToRGBARow_SSSE3(const uint8* y_buf, | 2906 void I422ToRGBARow_SSSE3(const uint8* y_buf, |
2410 const uint8* u_buf, | 2907 const uint8* u_buf, |
2411 const uint8* v_buf, | 2908 const uint8* v_buf, |
2412 uint8* dst_rgba, | 2909 uint8* dst_rgba, |
2413 int width) { | 2910 int width) { |
2414 __asm { | 2911 __asm { |
2415 push esi | 2912 push esi |
2416 push edi | 2913 push edi |
2417 mov eax, [esp + 8 + 4] // Y | 2914 mov eax, [esp + 8 + 4] // Y |
2418 mov esi, [esp + 8 + 8] // U | 2915 mov esi, [esp + 8 + 8] // U |
(...skipping 11 matching lines...) Expand all Loading... |
2430 jg convertloop | 2927 jg convertloop |
2431 | 2928 |
2432 pop edi | 2929 pop edi |
2433 pop esi | 2930 pop esi |
2434 ret | 2931 ret |
2435 } | 2932 } |
2436 } | 2933 } |
2437 | 2934 |
2438 #endif // HAS_I422TOARGBROW_SSSE3 | 2935 #endif // HAS_I422TOARGBROW_SSSE3 |
2439 | 2936 |
2440 #ifdef HAS_YTOARGBROW_SSE2 | 2937 #ifdef HAS_I400TOARGBROW_SSE2 |
2441 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). | 2938 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). |
2442 __declspec(naked) __declspec(align(16)) | 2939 __declspec(naked) |
2443 void YToARGBRow_SSE2(const uint8* y_buf, | 2940 void I400ToARGBRow_SSE2(const uint8* y_buf, |
2444 uint8* rgb_buf, | 2941 uint8* rgb_buf, |
2445 int width) { | 2942 int width) { |
2446 __asm { | 2943 __asm { |
2447 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) | 2944 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) |
2448 movd xmm2, eax | 2945 movd xmm2, eax |
2449 pshufd xmm2, xmm2,0 | 2946 pshufd xmm2, xmm2,0 |
2450 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) | 2947 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) |
2451 movd xmm3, eax | 2948 movd xmm3, eax |
2452 pshufd xmm3, xmm3, 0 | 2949 pshufd xmm3, xmm3, 0 |
2453 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 2950 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
2454 pslld xmm4, 24 | 2951 pslld xmm4, 24 |
2455 | 2952 |
(...skipping 19 matching lines...) Expand all Loading... |
2475 por xmm0, xmm4 | 2972 por xmm0, xmm4 |
2476 por xmm1, xmm4 | 2973 por xmm1, xmm4 |
2477 movdqu [edx], xmm0 | 2974 movdqu [edx], xmm0 |
2478 movdqu [edx + 16], xmm1 | 2975 movdqu [edx + 16], xmm1 |
2479 lea edx, [edx + 32] | 2976 lea edx, [edx + 32] |
2480 sub ecx, 8 | 2977 sub ecx, 8 |
2481 jg convertloop | 2978 jg convertloop |
2482 ret | 2979 ret |
2483 } | 2980 } |
2484 } | 2981 } |
2485 #endif // HAS_YTOARGBROW_SSE2 | 2982 #endif // HAS_I400TOARGBROW_SSE2 |
2486 | 2983 |
2487 #ifdef HAS_YTOARGBROW_AVX2 | 2984 #ifdef HAS_I400TOARGBROW_AVX2 |
2488 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). | 2985 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). |
2489 // note: vpunpcklbw mutates and vpackuswb unmutates. | 2986 // note: vpunpcklbw mutates and vpackuswb unmutates. |
2490 __declspec(naked) __declspec(align(16)) | 2987 __declspec(naked) |
2491 void YToARGBRow_AVX2(const uint8* y_buf, | 2988 void I400ToARGBRow_AVX2(const uint8* y_buf, |
2492 uint8* rgb_buf, | 2989 uint8* rgb_buf, |
2493 int width) { | 2990 int width) { |
2494 __asm { | 2991 __asm { |
2495 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) | 2992 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) |
2496 vmovd xmm2, eax | 2993 vmovd xmm2, eax |
2497 vbroadcastss ymm2, xmm2 | 2994 vbroadcastss ymm2, xmm2 |
2498 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) | 2995 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) |
2499 vmovd xmm3, eax | 2996 vmovd xmm3, eax |
2500 vbroadcastss ymm3, xmm3 | 2997 vbroadcastss ymm3, xmm3 |
2501 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 | 2998 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 |
2502 vpslld ymm4, ymm4, 24 | 2999 vpslld ymm4, ymm4, 24 |
2503 | 3000 |
2504 mov eax, [esp + 4] // Y | 3001 mov eax, [esp + 4] // Y |
2505 mov edx, [esp + 8] // rgb | 3002 mov edx, [esp + 8] // rgb |
2506 mov ecx, [esp + 12] // width | 3003 mov ecx, [esp + 12] // width |
2507 | 3004 |
2508 convertloop: | 3005 convertloop: |
2509 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 | 3006 // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 |
2510 vmovdqu xmm0, [eax] | 3007 vmovdqu xmm0, [eax] |
2511 lea eax, [eax + 16] | 3008 lea eax, [eax + 16] |
2512 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates | 3009 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates |
2513 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y | 3010 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y |
2514 vpmulhuw ymm0, ymm0, ymm2 | 3011 vpmulhuw ymm0, ymm0, ymm2 |
2515 vpsubusw ymm0, ymm0, ymm3 | 3012 vpsubusw ymm0, ymm0, ymm3 |
2516 vpsrlw ymm0, ymm0, 6 | 3013 vpsrlw ymm0, ymm0, 6 |
2517 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 | 3014 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 |
2518 | 3015 |
2519 // TODO(fbarchard): Weave alpha with unpack. | 3016 // TODO(fbarchard): Weave alpha with unpack. |
2520 // Step 2: Weave into ARGB | 3017 // Step 2: Weave into ARGB |
2521 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates | 3018 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates |
2522 vpermq ymm1, ymm1, 0xd8 | 3019 vpermq ymm1, ymm1, 0xd8 |
2523 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels | 3020 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels |
2524 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels | 3021 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels |
2525 vpor ymm0, ymm0, ymm4 | 3022 vpor ymm0, ymm0, ymm4 |
2526 vpor ymm1, ymm1, ymm4 | 3023 vpor ymm1, ymm1, ymm4 |
2527 vmovdqu [edx], ymm0 | 3024 vmovdqu [edx], ymm0 |
2528 vmovdqu [edx + 32], ymm1 | 3025 vmovdqu [edx + 32], ymm1 |
2529 lea edx, [edx + 64] | 3026 lea edx, [edx + 64] |
2530 sub ecx, 16 | 3027 sub ecx, 16 |
2531 jg convertloop | 3028 jg convertloop |
2532 vzeroupper | 3029 vzeroupper |
2533 ret | 3030 ret |
2534 } | 3031 } |
2535 } | 3032 } |
2536 #endif // HAS_YTOARGBROW_AVX2 | 3033 #endif // HAS_I400TOARGBROW_AVX2 |
2537 | 3034 |
2538 #ifdef HAS_MIRRORROW_SSSE3 | 3035 #ifdef HAS_MIRRORROW_SSSE3 |
2539 // Shuffle table for reversing the bytes. | 3036 // Shuffle table for reversing the bytes. |
2540 static const uvec8 kShuffleMirror = { | 3037 static const uvec8 kShuffleMirror = { |
2541 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 3038 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
2542 }; | 3039 }; |
2543 | 3040 |
2544 // TODO(fbarchard): Replace lea with -16 offset. | 3041 // TODO(fbarchard): Replace lea with -16 offset. |
2545 __declspec(naked) __declspec(align(16)) | 3042 __declspec(naked) |
2546 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { | 3043 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
2547 __asm { | 3044 __asm { |
2548 mov eax, [esp + 4] // src | 3045 mov eax, [esp + 4] // src |
2549 mov edx, [esp + 8] // dst | 3046 mov edx, [esp + 8] // dst |
2550 mov ecx, [esp + 12] // width | 3047 mov ecx, [esp + 12] // width |
2551 movdqa xmm5, kShuffleMirror | 3048 movdqa xmm5, kShuffleMirror |
2552 | 3049 |
2553 convertloop: | 3050 convertloop: |
2554 movdqu xmm0, [eax - 16 + ecx] | 3051 movdqu xmm0, [eax - 16 + ecx] |
2555 pshufb xmm0, xmm5 | 3052 pshufb xmm0, xmm5 |
2556 movdqu [edx], xmm0 | 3053 movdqu [edx], xmm0 |
2557 lea edx, [edx + 16] | 3054 lea edx, [edx + 16] |
2558 sub ecx, 16 | 3055 sub ecx, 16 |
2559 jg convertloop | 3056 jg convertloop |
2560 ret | 3057 ret |
2561 } | 3058 } |
2562 } | 3059 } |
2563 #endif // HAS_MIRRORROW_SSSE3 | 3060 #endif // HAS_MIRRORROW_SSSE3 |
2564 | 3061 |
2565 #ifdef HAS_MIRRORROW_AVX2 | 3062 #ifdef HAS_MIRRORROW_AVX2 |
2566 __declspec(naked) __declspec(align(16)) | 3063 __declspec(naked) |
2567 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | 3064 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
2568 __asm { | 3065 __asm { |
2569 mov eax, [esp + 4] // src | 3066 mov eax, [esp + 4] // src |
2570 mov edx, [esp + 8] // dst | 3067 mov edx, [esp + 8] // dst |
2571 mov ecx, [esp + 12] // width | 3068 mov ecx, [esp + 12] // width |
2572 vbroadcastf128 ymm5, kShuffleMirror | 3069 vbroadcastf128 ymm5, kShuffleMirror |
2573 | 3070 |
2574 convertloop: | 3071 convertloop: |
2575 vmovdqu ymm0, [eax - 32 + ecx] | 3072 vmovdqu ymm0, [eax - 32 + ecx] |
2576 vpshufb ymm0, ymm0, ymm5 | 3073 vpshufb ymm0, ymm0, ymm5 |
2577 vpermq ymm0, ymm0, 0x4e // swap high and low halfs | 3074 vpermq ymm0, ymm0, 0x4e // swap high and low halfs |
2578 vmovdqu [edx], ymm0 | 3075 vmovdqu [edx], ymm0 |
2579 lea edx, [edx + 32] | 3076 lea edx, [edx + 32] |
2580 sub ecx, 32 | 3077 sub ecx, 32 |
2581 jg convertloop | 3078 jg convertloop |
2582 vzeroupper | 3079 vzeroupper |
2583 ret | 3080 ret |
2584 } | 3081 } |
2585 } | 3082 } |
2586 #endif // HAS_MIRRORROW_AVX2 | 3083 #endif // HAS_MIRRORROW_AVX2 |
2587 | 3084 |
2588 #ifdef HAS_MIRRORROW_SSE2 | 3085 #ifdef HAS_MIRRORROW_SSE2 |
2589 __declspec(naked) __declspec(align(16)) | 3086 __declspec(naked) |
2590 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | 3087 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { |
2591 __asm { | 3088 __asm { |
2592 mov eax, [esp + 4] // src | 3089 mov eax, [esp + 4] // src |
2593 mov edx, [esp + 8] // dst | 3090 mov edx, [esp + 8] // dst |
2594 mov ecx, [esp + 12] // width | 3091 mov ecx, [esp + 12] // width |
2595 | 3092 |
2596 convertloop: | 3093 convertloop: |
2597 movdqu xmm0, [eax - 16 + ecx] | 3094 movdqu xmm0, [eax - 16 + ecx] |
2598 movdqa xmm1, xmm0 // swap bytes | 3095 movdqa xmm1, xmm0 // swap bytes |
2599 psllw xmm0, 8 | 3096 psllw xmm0, 8 |
(...skipping 10 matching lines...) Expand all Loading... |
2610 } | 3107 } |
2611 } | 3108 } |
2612 #endif // HAS_MIRRORROW_SSE2 | 3109 #endif // HAS_MIRRORROW_SSE2 |
2613 | 3110 |
2614 #ifdef HAS_MIRRORROW_UV_SSSE3 | 3111 #ifdef HAS_MIRRORROW_UV_SSSE3 |
2615 // Shuffle table for reversing the bytes of UV channels. | 3112 // Shuffle table for reversing the bytes of UV channels. |
2616 static const uvec8 kShuffleMirrorUV = { | 3113 static const uvec8 kShuffleMirrorUV = { |
2617 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | 3114 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u |
2618 }; | 3115 }; |
2619 | 3116 |
2620 __declspec(naked) __declspec(align(16)) | 3117 __declspec(naked) |
2621 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | 3118 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, |
2622 int width) { | 3119 int width) { |
2623 __asm { | 3120 __asm { |
2624 push edi | 3121 push edi |
2625 mov eax, [esp + 4 + 4] // src | 3122 mov eax, [esp + 4 + 4] // src |
2626 mov edx, [esp + 4 + 8] // dst_u | 3123 mov edx, [esp + 4 + 8] // dst_u |
2627 mov edi, [esp + 4 + 12] // dst_v | 3124 mov edi, [esp + 4 + 12] // dst_v |
2628 mov ecx, [esp + 4 + 16] // width | 3125 mov ecx, [esp + 4 + 16] // width |
2629 movdqa xmm1, kShuffleMirrorUV | 3126 movdqa xmm1, kShuffleMirrorUV |
2630 lea eax, [eax + ecx * 2 - 16] | 3127 lea eax, [eax + ecx * 2 - 16] |
2631 sub edi, edx | 3128 sub edi, edx |
2632 | 3129 |
2633 convertloop: | 3130 convertloop: |
2634 movdqu xmm0, [eax] | 3131 movdqu xmm0, [eax] |
2635 lea eax, [eax - 16] | 3132 lea eax, [eax - 16] |
2636 pshufb xmm0, xmm1 | 3133 pshufb xmm0, xmm1 |
2637 movlpd qword ptr [edx], xmm0 | 3134 movlpd qword ptr [edx], xmm0 |
2638 movhpd qword ptr [edx + edi], xmm0 | 3135 movhpd qword ptr [edx + edi], xmm0 |
2639 lea edx, [edx + 8] | 3136 lea edx, [edx + 8] |
2640 sub ecx, 8 | 3137 sub ecx, 8 |
2641 jg convertloop | 3138 jg convertloop |
2642 | 3139 |
2643 pop edi | 3140 pop edi |
2644 ret | 3141 ret |
2645 } | 3142 } |
2646 } | 3143 } |
2647 #endif // HAS_MIRRORROW_UV_SSSE3 | 3144 #endif // HAS_MIRRORROW_UV_SSSE3 |
2648 | 3145 |
2649 #ifdef HAS_ARGBMIRRORROW_SSE2 | 3146 #ifdef HAS_ARGBMIRRORROW_SSE2 |
2650 __declspec(naked) __declspec(align(16)) | 3147 __declspec(naked) |
2651 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | 3148 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { |
2652 __asm { | 3149 __asm { |
2653 mov eax, [esp + 4] // src | 3150 mov eax, [esp + 4] // src |
2654 mov edx, [esp + 8] // dst | 3151 mov edx, [esp + 8] // dst |
2655 mov ecx, [esp + 12] // width | 3152 mov ecx, [esp + 12] // width |
2656 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. | 3153 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. |
2657 | 3154 |
2658 convertloop: | 3155 convertloop: |
2659 movdqu xmm0, [eax] | 3156 movdqu xmm0, [eax] |
2660 lea eax, [eax - 16] | 3157 lea eax, [eax - 16] |
2661 pshufd xmm0, xmm0, 0x1b | 3158 pshufd xmm0, xmm0, 0x1b |
2662 movdqu [edx], xmm0 | 3159 movdqu [edx], xmm0 |
2663 lea edx, [edx + 16] | 3160 lea edx, [edx + 16] |
2664 sub ecx, 4 | 3161 sub ecx, 4 |
2665 jg convertloop | 3162 jg convertloop |
2666 ret | 3163 ret |
2667 } | 3164 } |
2668 } | 3165 } |
2669 #endif // HAS_ARGBMIRRORROW_SSE2 | 3166 #endif // HAS_ARGBMIRRORROW_SSE2 |
2670 | 3167 |
2671 #ifdef HAS_ARGBMIRRORROW_AVX2 | 3168 #ifdef HAS_ARGBMIRRORROW_AVX2 |
2672 // Shuffle table for reversing the bytes. | 3169 // Shuffle table for reversing the bytes. |
2673 static const ulvec32 kARGBShuffleMirror_AVX2 = { | 3170 static const ulvec32 kARGBShuffleMirror_AVX2 = { |
2674 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 3171 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
2675 }; | 3172 }; |
2676 | 3173 |
2677 __declspec(naked) __declspec(align(16)) | 3174 __declspec(naked) |
2678 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | 3175 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
2679 __asm { | 3176 __asm { |
2680 mov eax, [esp + 4] // src | 3177 mov eax, [esp + 4] // src |
2681 mov edx, [esp + 8] // dst | 3178 mov edx, [esp + 8] // dst |
2682 mov ecx, [esp + 12] // width | 3179 mov ecx, [esp + 12] // width |
2683 vmovdqu ymm5, kARGBShuffleMirror_AVX2 | 3180 vmovdqu ymm5, kARGBShuffleMirror_AVX2 |
2684 | 3181 |
2685 convertloop: | 3182 convertloop: |
2686 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order | 3183 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order |
2687 vmovdqu [edx], ymm0 | 3184 vmovdqu [edx], ymm0 |
2688 lea edx, [edx + 32] | 3185 lea edx, [edx + 32] |
2689 sub ecx, 8 | 3186 sub ecx, 8 |
2690 jg convertloop | 3187 jg convertloop |
2691 vzeroupper | 3188 vzeroupper |
2692 ret | 3189 ret |
2693 } | 3190 } |
2694 } | 3191 } |
2695 #endif // HAS_ARGBMIRRORROW_AVX2 | 3192 #endif // HAS_ARGBMIRRORROW_AVX2 |
2696 | 3193 |
2697 #ifdef HAS_SPLITUVROW_SSE2 | 3194 #ifdef HAS_SPLITUVROW_SSE2 |
2698 __declspec(naked) __declspec(align(16)) | 3195 __declspec(naked) |
2699 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 3196 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
2700 __asm { | 3197 __asm { |
2701 push edi | 3198 push edi |
2702 mov eax, [esp + 4 + 4] // src_uv | 3199 mov eax, [esp + 4 + 4] // src_uv |
2703 mov edx, [esp + 4 + 8] // dst_u | 3200 mov edx, [esp + 4 + 8] // dst_u |
2704 mov edi, [esp + 4 + 12] // dst_v | 3201 mov edi, [esp + 4 + 12] // dst_v |
2705 mov ecx, [esp + 4 + 16] // pix | 3202 mov ecx, [esp + 4 + 16] // pix |
2706 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3203 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
2707 psrlw xmm5, 8 | 3204 psrlw xmm5, 8 |
2708 sub edi, edx | 3205 sub edi, edx |
(...skipping 17 matching lines...) Expand all Loading... |
2726 jg convertloop | 3223 jg convertloop |
2727 | 3224 |
2728 pop edi | 3225 pop edi |
2729 ret | 3226 ret |
2730 } | 3227 } |
2731 } | 3228 } |
2732 | 3229 |
2733 #endif // HAS_SPLITUVROW_SSE2 | 3230 #endif // HAS_SPLITUVROW_SSE2 |
2734 | 3231 |
2735 #ifdef HAS_SPLITUVROW_AVX2 | 3232 #ifdef HAS_SPLITUVROW_AVX2 |
2736 __declspec(naked) __declspec(align(16)) | 3233 __declspec(naked) |
2737 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 3234 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
2738 __asm { | 3235 __asm { |
2739 push edi | 3236 push edi |
2740 mov eax, [esp + 4 + 4] // src_uv | 3237 mov eax, [esp + 4 + 4] // src_uv |
2741 mov edx, [esp + 4 + 8] // dst_u | 3238 mov edx, [esp + 4 + 8] // dst_u |
2742 mov edi, [esp + 4 + 12] // dst_v | 3239 mov edi, [esp + 4 + 12] // dst_v |
2743 mov ecx, [esp + 4 + 16] // pix | 3240 mov ecx, [esp + 4 + 16] // pix |
2744 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3241 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
2745 vpsrlw ymm5, ymm5, 8 | 3242 vpsrlw ymm5, ymm5, 8 |
2746 sub edi, edx | 3243 sub edi, edx |
(...skipping 17 matching lines...) Expand all Loading... |
2764 jg convertloop | 3261 jg convertloop |
2765 | 3262 |
2766 pop edi | 3263 pop edi |
2767 vzeroupper | 3264 vzeroupper |
2768 ret | 3265 ret |
2769 } | 3266 } |
2770 } | 3267 } |
2771 #endif // HAS_SPLITUVROW_AVX2 | 3268 #endif // HAS_SPLITUVROW_AVX2 |
2772 | 3269 |
2773 #ifdef HAS_MERGEUVROW_SSE2 | 3270 #ifdef HAS_MERGEUVROW_SSE2 |
2774 __declspec(naked) __declspec(align(16)) | 3271 __declspec(naked) |
2775 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 3272 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
2776 int width) { | 3273 int width) { |
2777 __asm { | 3274 __asm { |
2778 push edi | 3275 push edi |
2779 mov eax, [esp + 4 + 4] // src_u | 3276 mov eax, [esp + 4 + 4] // src_u |
2780 mov edx, [esp + 4 + 8] // src_v | 3277 mov edx, [esp + 4 + 8] // src_v |
2781 mov edi, [esp + 4 + 12] // dst_uv | 3278 mov edi, [esp + 4 + 12] // dst_uv |
2782 mov ecx, [esp + 4 + 16] // width | 3279 mov ecx, [esp + 4 + 16] // width |
2783 sub edx, eax | 3280 sub edx, eax |
2784 | 3281 |
(...skipping 10 matching lines...) Expand all Loading... |
2795 sub ecx, 16 | 3292 sub ecx, 16 |
2796 jg convertloop | 3293 jg convertloop |
2797 | 3294 |
2798 pop edi | 3295 pop edi |
2799 ret | 3296 ret |
2800 } | 3297 } |
2801 } | 3298 } |
2802 #endif // HAS_MERGEUVROW_SSE2 | 3299 #endif // HAS_MERGEUVROW_SSE2 |
2803 | 3300 |
2804 #ifdef HAS_MERGEUVROW_AVX2 | 3301 #ifdef HAS_MERGEUVROW_AVX2 |
2805 __declspec(naked) __declspec(align(16)) | 3302 __declspec(naked) |
2806 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 3303 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
2807 int width) { | 3304 int width) { |
2808 __asm { | 3305 __asm { |
2809 push edi | 3306 push edi |
2810 mov eax, [esp + 4 + 4] // src_u | 3307 mov eax, [esp + 4 + 4] // src_u |
2811 mov edx, [esp + 4 + 8] // src_v | 3308 mov edx, [esp + 4 + 8] // src_v |
2812 mov edi, [esp + 4 + 12] // dst_uv | 3309 mov edi, [esp + 4 + 12] // dst_uv |
2813 mov ecx, [esp + 4 + 16] // width | 3310 mov ecx, [esp + 4 + 16] // width |
2814 sub edx, eax | 3311 sub edx, eax |
2815 | 3312 |
(...skipping 13 matching lines...) Expand all Loading... |
2829 | 3326 |
2830 pop edi | 3327 pop edi |
2831 vzeroupper | 3328 vzeroupper |
2832 ret | 3329 ret |
2833 } | 3330 } |
2834 } | 3331 } |
2835 #endif // HAS_MERGEUVROW_AVX2 | 3332 #endif // HAS_MERGEUVROW_AVX2 |
2836 | 3333 |
2837 #ifdef HAS_COPYROW_SSE2 | 3334 #ifdef HAS_COPYROW_SSE2 |
2838 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. | 3335 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. |
2839 __declspec(naked) __declspec(align(16)) | 3336 __declspec(naked) |
2840 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { | 3337 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { |
2841 __asm { | 3338 __asm { |
2842 mov eax, [esp + 4] // src | 3339 mov eax, [esp + 4] // src |
2843 mov edx, [esp + 8] // dst | 3340 mov edx, [esp + 8] // dst |
2844 mov ecx, [esp + 12] // count | 3341 mov ecx, [esp + 12] // count |
2845 | 3342 |
2846 convertloop: | 3343 convertloop: |
2847 movdqu xmm0, [eax] | 3344 movdqu xmm0, [eax] |
2848 movdqu xmm1, [eax + 16] | 3345 movdqu xmm1, [eax + 16] |
2849 lea eax, [eax + 32] | 3346 lea eax, [eax + 32] |
2850 movdqu [edx], xmm0 | 3347 movdqu [edx], xmm0 |
2851 movdqu [edx + 16], xmm1 | 3348 movdqu [edx + 16], xmm1 |
2852 lea edx, [edx + 32] | 3349 lea edx, [edx + 32] |
2853 sub ecx, 32 | 3350 sub ecx, 32 |
2854 jg convertloop | 3351 jg convertloop |
2855 ret | 3352 ret |
2856 } | 3353 } |
2857 } | 3354 } |
2858 #endif // HAS_COPYROW_SSE2 | 3355 #endif // HAS_COPYROW_SSE2 |
2859 | 3356 |
2860 #ifdef HAS_COPYROW_AVX | 3357 #ifdef HAS_COPYROW_AVX |
2861 // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. | 3358 // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. |
2862 __declspec(naked) __declspec(align(16)) | 3359 __declspec(naked) |
2863 void CopyRow_AVX(const uint8* src, uint8* dst, int count) { | 3360 void CopyRow_AVX(const uint8* src, uint8* dst, int count) { |
2864 __asm { | 3361 __asm { |
2865 mov eax, [esp + 4] // src | 3362 mov eax, [esp + 4] // src |
2866 mov edx, [esp + 8] // dst | 3363 mov edx, [esp + 8] // dst |
2867 mov ecx, [esp + 12] // count | 3364 mov ecx, [esp + 12] // count |
2868 | 3365 |
2869 convertloop: | 3366 convertloop: |
2870 vmovdqu ymm0, [eax] | 3367 vmovdqu ymm0, [eax] |
2871 vmovdqu ymm1, [eax + 32] | 3368 vmovdqu ymm1, [eax + 32] |
2872 lea eax, [eax + 64] | 3369 lea eax, [eax + 64] |
2873 vmovdqu [edx], ymm0 | 3370 vmovdqu [edx], ymm0 |
2874 vmovdqu [edx + 32], ymm1 | 3371 vmovdqu [edx + 32], ymm1 |
2875 lea edx, [edx + 64] | 3372 lea edx, [edx + 64] |
2876 sub ecx, 64 | 3373 sub ecx, 64 |
2877 jg convertloop | 3374 jg convertloop |
2878 | 3375 |
2879 vzeroupper | 3376 vzeroupper |
2880 ret | 3377 ret |
2881 } | 3378 } |
2882 } | 3379 } |
2883 #endif // HAS_COPYROW_AVX | 3380 #endif // HAS_COPYROW_AVX |
2884 | 3381 |
2885 // Multiple of 1. | 3382 // Multiple of 1. |
2886 __declspec(naked) __declspec(align(16)) | 3383 __declspec(naked) |
2887 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { | 3384 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { |
2888 __asm { | 3385 __asm { |
2889 mov eax, esi | 3386 mov eax, esi |
2890 mov edx, edi | 3387 mov edx, edi |
2891 mov esi, [esp + 4] // src | 3388 mov esi, [esp + 4] // src |
2892 mov edi, [esp + 8] // dst | 3389 mov edi, [esp + 8] // dst |
2893 mov ecx, [esp + 12] // count | 3390 mov ecx, [esp + 12] // count |
2894 rep movsb | 3391 rep movsb |
2895 mov edi, edx | 3392 mov edi, edx |
2896 mov esi, eax | 3393 mov esi, eax |
2897 ret | 3394 ret |
2898 } | 3395 } |
2899 } | 3396 } |
2900 | 3397 |
2901 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 | 3398 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
2902 // width in pixels | 3399 // width in pixels |
2903 __declspec(naked) __declspec(align(16)) | 3400 __declspec(naked) |
2904 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | 3401 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
2905 __asm { | 3402 __asm { |
2906 mov eax, [esp + 4] // src | 3403 mov eax, [esp + 4] // src |
2907 mov edx, [esp + 8] // dst | 3404 mov edx, [esp + 8] // dst |
2908 mov ecx, [esp + 12] // count | 3405 mov ecx, [esp + 12] // count |
2909 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 | 3406 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
2910 pslld xmm0, 24 | 3407 pslld xmm0, 24 |
2911 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff | 3408 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
2912 psrld xmm1, 8 | 3409 psrld xmm1, 8 |
2913 | 3410 |
(...skipping 15 matching lines...) Expand all Loading... |
2929 sub ecx, 8 | 3426 sub ecx, 8 |
2930 jg convertloop | 3427 jg convertloop |
2931 | 3428 |
2932 ret | 3429 ret |
2933 } | 3430 } |
2934 } | 3431 } |
2935 #endif // HAS_ARGBCOPYALPHAROW_SSE2 | 3432 #endif // HAS_ARGBCOPYALPHAROW_SSE2 |
2936 | 3433 |
2937 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 | 3434 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 |
2938 // width in pixels | 3435 // width in pixels |
2939 __declspec(naked) __declspec(align(16)) | 3436 __declspec(naked) |
2940 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | 3437 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
2941 __asm { | 3438 __asm { |
2942 mov eax, [esp + 4] // src | 3439 mov eax, [esp + 4] // src |
2943 mov edx, [esp + 8] // dst | 3440 mov edx, [esp + 8] // dst |
2944 mov ecx, [esp + 12] // count | 3441 mov ecx, [esp + 12] // count |
2945 vpcmpeqb ymm0, ymm0, ymm0 | 3442 vpcmpeqb ymm0, ymm0, ymm0 |
2946 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff | 3443 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
2947 | 3444 |
2948 convertloop: | 3445 convertloop: |
2949 vmovdqu ymm1, [eax] | 3446 vmovdqu ymm1, [eax] |
2950 vmovdqu ymm2, [eax + 32] | 3447 vmovdqu ymm2, [eax + 32] |
2951 lea eax, [eax + 64] | 3448 lea eax, [eax + 64] |
2952 vpblendvb ymm1, ymm1, [edx], ymm0 | 3449 vpblendvb ymm1, ymm1, [edx], ymm0 |
2953 vpblendvb ymm2, ymm2, [edx + 32], ymm0 | 3450 vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
2954 vmovdqu [edx], ymm1 | 3451 vmovdqu [edx], ymm1 |
2955 vmovdqu [edx + 32], ymm2 | 3452 vmovdqu [edx + 32], ymm2 |
2956 lea edx, [edx + 64] | 3453 lea edx, [edx + 64] |
2957 sub ecx, 16 | 3454 sub ecx, 16 |
2958 jg convertloop | 3455 jg convertloop |
2959 | 3456 |
2960 vzeroupper | 3457 vzeroupper |
2961 ret | 3458 ret |
2962 } | 3459 } |
2963 } | 3460 } |
2964 #endif // HAS_ARGBCOPYALPHAROW_AVX2 | 3461 #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
2965 | 3462 |
2966 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 | 3463 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 |
2967 // width in pixels | 3464 // width in pixels |
2968 __declspec(naked) __declspec(align(16)) | 3465 __declspec(naked) |
2969 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | 3466 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
2970 __asm { | 3467 __asm { |
2971 mov eax, [esp + 4] // src | 3468 mov eax, [esp + 4] // src |
2972 mov edx, [esp + 8] // dst | 3469 mov edx, [esp + 8] // dst |
2973 mov ecx, [esp + 12] // count | 3470 mov ecx, [esp + 12] // count |
2974 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 | 3471 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
2975 pslld xmm0, 24 | 3472 pslld xmm0, 24 |
2976 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff | 3473 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
2977 psrld xmm1, 8 | 3474 psrld xmm1, 8 |
2978 | 3475 |
(...skipping 17 matching lines...) Expand all Loading... |
2996 sub ecx, 8 | 3493 sub ecx, 8 |
2997 jg convertloop | 3494 jg convertloop |
2998 | 3495 |
2999 ret | 3496 ret |
3000 } | 3497 } |
3001 } | 3498 } |
3002 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 | 3499 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 |
3003 | 3500 |
3004 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 | 3501 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 |
3005 // width in pixels | 3502 // width in pixels |
3006 __declspec(naked) __declspec(align(16)) | 3503 __declspec(naked) |
3007 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | 3504 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
3008 __asm { | 3505 __asm { |
3009 mov eax, [esp + 4] // src | 3506 mov eax, [esp + 4] // src |
3010 mov edx, [esp + 8] // dst | 3507 mov edx, [esp + 8] // dst |
3011 mov ecx, [esp + 12] // count | 3508 mov ecx, [esp + 12] // count |
3012 vpcmpeqb ymm0, ymm0, ymm0 | 3509 vpcmpeqb ymm0, ymm0, ymm0 |
3013 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff | 3510 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
3014 | 3511 |
3015 convertloop: | 3512 convertloop: |
3016 vpmovzxbd ymm1, qword ptr [eax] | 3513 vpmovzxbd ymm1, qword ptr [eax] |
(...skipping 11 matching lines...) Expand all Loading... |
3028 | 3525 |
3029 vzeroupper | 3526 vzeroupper |
3030 ret | 3527 ret |
3031 } | 3528 } |
3032 } | 3529 } |
3033 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 | 3530 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
3034 | 3531 |
3035 #ifdef HAS_SETROW_X86 | 3532 #ifdef HAS_SETROW_X86 |
3036 // Write 'count' bytes using an 8 bit value repeated. | 3533 // Write 'count' bytes using an 8 bit value repeated. |
3037 // Count should be multiple of 4. | 3534 // Count should be multiple of 4. |
3038 __declspec(naked) __declspec(align(16)) | 3535 __declspec(naked) |
3039 void SetRow_X86(uint8* dst, uint8 v8, int count) { | 3536 void SetRow_X86(uint8* dst, uint8 v8, int count) { |
3040 __asm { | 3537 __asm { |
3041 movzx eax, byte ptr [esp + 8] // v8 | 3538 movzx eax, byte ptr [esp + 8] // v8 |
3042 mov edx, 0x01010101 // Duplicate byte to all bytes. | 3539 mov edx, 0x01010101 // Duplicate byte to all bytes. |
3043 mul edx // overwrites edx with upper part of result. | 3540 mul edx // overwrites edx with upper part of result. |
3044 mov edx, edi | 3541 mov edx, edi |
3045 mov edi, [esp + 4] // dst | 3542 mov edi, [esp + 4] // dst |
3046 mov ecx, [esp + 12] // count | 3543 mov ecx, [esp + 12] // count |
3047 shr ecx, 2 | 3544 shr ecx, 2 |
3048 rep stosd | 3545 rep stosd |
3049 mov edi, edx | 3546 mov edi, edx |
3050 ret | 3547 ret |
3051 } | 3548 } |
3052 } | 3549 } |
3053 | 3550 |
3054 // Write 'count' bytes using an 8 bit value repeated. | 3551 // Write 'count' bytes using an 8 bit value repeated. |
3055 __declspec(naked) __declspec(align(16)) | 3552 __declspec(naked) |
3056 void SetRow_ERMS(uint8* dst, uint8 v8, int count) { | 3553 void SetRow_ERMS(uint8* dst, uint8 v8, int count) { |
3057 __asm { | 3554 __asm { |
3058 mov edx, edi | 3555 mov edx, edi |
3059 mov edi, [esp + 4] // dst | 3556 mov edi, [esp + 4] // dst |
3060 mov eax, [esp + 8] // v8 | 3557 mov eax, [esp + 8] // v8 |
3061 mov ecx, [esp + 12] // count | 3558 mov ecx, [esp + 12] // count |
3062 rep stosb | 3559 rep stosb |
3063 mov edi, edx | 3560 mov edi, edx |
3064 ret | 3561 ret |
3065 } | 3562 } |
3066 } | 3563 } |
3067 | 3564 |
3068 // Write 'count' 32 bit values. | 3565 // Write 'count' 32 bit values. |
3069 __declspec(naked) __declspec(align(16)) | 3566 __declspec(naked) |
3070 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { | 3567 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { |
3071 __asm { | 3568 __asm { |
3072 mov edx, edi | 3569 mov edx, edi |
3073 mov edi, [esp + 4] // dst | 3570 mov edi, [esp + 4] // dst |
3074 mov eax, [esp + 8] // v32 | 3571 mov eax, [esp + 8] // v32 |
3075 mov ecx, [esp + 12] // count | 3572 mov ecx, [esp + 12] // count |
3076 rep stosd | 3573 rep stosd |
3077 mov edi, edx | 3574 mov edi, edx |
3078 ret | 3575 ret |
3079 } | 3576 } |
3080 } | 3577 } |
3081 #endif // HAS_SETROW_X86 | 3578 #endif // HAS_SETROW_X86 |
3082 | 3579 |
3083 #ifdef HAS_YUY2TOYROW_AVX2 | 3580 #ifdef HAS_YUY2TOYROW_AVX2 |
3084 __declspec(naked) __declspec(align(16)) | 3581 __declspec(naked) |
3085 void YUY2ToYRow_AVX2(const uint8* src_yuy2, | 3582 void YUY2ToYRow_AVX2(const uint8* src_yuy2, |
3086 uint8* dst_y, int pix) { | 3583 uint8* dst_y, int pix) { |
3087 __asm { | 3584 __asm { |
3088 mov eax, [esp + 4] // src_yuy2 | 3585 mov eax, [esp + 4] // src_yuy2 |
3089 mov edx, [esp + 8] // dst_y | 3586 mov edx, [esp + 8] // dst_y |
3090 mov ecx, [esp + 12] // pix | 3587 mov ecx, [esp + 12] // pix |
3091 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3588 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
3092 vpsrlw ymm5, ymm5, 8 | 3589 vpsrlw ymm5, ymm5, 8 |
3093 | 3590 |
3094 convertloop: | 3591 convertloop: |
3095 vmovdqu ymm0, [eax] | 3592 vmovdqu ymm0, [eax] |
3096 vmovdqu ymm1, [eax + 32] | 3593 vmovdqu ymm1, [eax + 32] |
3097 lea eax, [eax + 64] | 3594 lea eax, [eax + 64] |
3098 vpand ymm0, ymm0, ymm5 // even bytes are Y | 3595 vpand ymm0, ymm0, ymm5 // even bytes are Y |
3099 vpand ymm1, ymm1, ymm5 | 3596 vpand ymm1, ymm1, ymm5 |
3100 vpackuswb ymm0, ymm0, ymm1 // mutates. | 3597 vpackuswb ymm0, ymm0, ymm1 // mutates. |
3101 vpermq ymm0, ymm0, 0xd8 | 3598 vpermq ymm0, ymm0, 0xd8 |
3102 vmovdqu [edx], ymm0 | 3599 vmovdqu [edx], ymm0 |
3103 lea edx, [edx + 32] | 3600 lea edx, [edx + 32] |
3104 sub ecx, 32 | 3601 sub ecx, 32 |
3105 jg convertloop | 3602 jg convertloop |
3106 vzeroupper | 3603 vzeroupper |
3107 ret | 3604 ret |
3108 } | 3605 } |
3109 } | 3606 } |
3110 | 3607 |
3111 __declspec(naked) __declspec(align(16)) | 3608 __declspec(naked) |
3112 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | 3609 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, |
3113 uint8* dst_u, uint8* dst_v, int pix) { | 3610 uint8* dst_u, uint8* dst_v, int pix) { |
3114 __asm { | 3611 __asm { |
3115 push esi | 3612 push esi |
3116 push edi | 3613 push edi |
3117 mov eax, [esp + 8 + 4] // src_yuy2 | 3614 mov eax, [esp + 8 + 4] // src_yuy2 |
3118 mov esi, [esp + 8 + 8] // stride_yuy2 | 3615 mov esi, [esp + 8 + 8] // stride_yuy2 |
3119 mov edx, [esp + 8 + 12] // dst_u | 3616 mov edx, [esp + 8 + 12] // dst_u |
3120 mov edi, [esp + 8 + 16] // dst_v | 3617 mov edi, [esp + 8 + 16] // dst_v |
3121 mov ecx, [esp + 8 + 20] // pix | 3618 mov ecx, [esp + 8 + 20] // pix |
(...skipping 23 matching lines...) Expand all Loading... |
3145 sub ecx, 32 | 3642 sub ecx, 32 |
3146 jg convertloop | 3643 jg convertloop |
3147 | 3644 |
3148 pop edi | 3645 pop edi |
3149 pop esi | 3646 pop esi |
3150 vzeroupper | 3647 vzeroupper |
3151 ret | 3648 ret |
3152 } | 3649 } |
3153 } | 3650 } |
3154 | 3651 |
3155 __declspec(naked) __declspec(align(16)) | 3652 __declspec(naked) |
3156 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | 3653 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, |
3157 uint8* dst_u, uint8* dst_v, int pix) { | 3654 uint8* dst_u, uint8* dst_v, int pix) { |
3158 __asm { | 3655 __asm { |
3159 push edi | 3656 push edi |
3160 mov eax, [esp + 4 + 4] // src_yuy2 | 3657 mov eax, [esp + 4 + 4] // src_yuy2 |
3161 mov edx, [esp + 4 + 8] // dst_u | 3658 mov edx, [esp + 4 + 8] // dst_u |
3162 mov edi, [esp + 4 + 12] // dst_v | 3659 mov edi, [esp + 4 + 12] // dst_v |
3163 mov ecx, [esp + 4 + 16] // pix | 3660 mov ecx, [esp + 4 + 16] // pix |
3164 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3661 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
3165 vpsrlw ymm5, ymm5, 8 | 3662 vpsrlw ymm5, ymm5, 8 |
(...skipping 18 matching lines...) Expand all Loading... |
3184 lea edx, [edx + 16] | 3681 lea edx, [edx + 16] |
3185 sub ecx, 32 | 3682 sub ecx, 32 |
3186 jg convertloop | 3683 jg convertloop |
3187 | 3684 |
3188 pop edi | 3685 pop edi |
3189 vzeroupper | 3686 vzeroupper |
3190 ret | 3687 ret |
3191 } | 3688 } |
3192 } | 3689 } |
3193 | 3690 |
3194 __declspec(naked) __declspec(align(16)) | 3691 __declspec(naked) |
3195 void UYVYToYRow_AVX2(const uint8* src_uyvy, | 3692 void UYVYToYRow_AVX2(const uint8* src_uyvy, |
3196 uint8* dst_y, int pix) { | 3693 uint8* dst_y, int pix) { |
3197 __asm { | 3694 __asm { |
3198 mov eax, [esp + 4] // src_uyvy | 3695 mov eax, [esp + 4] // src_uyvy |
3199 mov edx, [esp + 8] // dst_y | 3696 mov edx, [esp + 8] // dst_y |
3200 mov ecx, [esp + 12] // pix | 3697 mov ecx, [esp + 12] // pix |
3201 | 3698 |
3202 convertloop: | 3699 convertloop: |
3203 vmovdqu ymm0, [eax] | 3700 vmovdqu ymm0, [eax] |
3204 vmovdqu ymm1, [eax + 32] | 3701 vmovdqu ymm1, [eax + 32] |
3205 lea eax, [eax + 64] | 3702 lea eax, [eax + 64] |
3206 vpsrlw ymm0, ymm0, 8 // odd bytes are Y | 3703 vpsrlw ymm0, ymm0, 8 // odd bytes are Y |
3207 vpsrlw ymm1, ymm1, 8 | 3704 vpsrlw ymm1, ymm1, 8 |
3208 vpackuswb ymm0, ymm0, ymm1 // mutates. | 3705 vpackuswb ymm0, ymm0, ymm1 // mutates. |
3209 vpermq ymm0, ymm0, 0xd8 | 3706 vpermq ymm0, ymm0, 0xd8 |
3210 vmovdqu [edx], ymm0 | 3707 vmovdqu [edx], ymm0 |
3211 lea edx, [edx + 32] | 3708 lea edx, [edx + 32] |
3212 sub ecx, 32 | 3709 sub ecx, 32 |
3213 jg convertloop | 3710 jg convertloop |
3214 vzeroupper | 3711 vzeroupper |
3215 ret | 3712 ret |
3216 } | 3713 } |
3217 } | 3714 } |
3218 | 3715 |
3219 __declspec(naked) __declspec(align(16)) | 3716 __declspec(naked) |
3220 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | 3717 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, |
3221 uint8* dst_u, uint8* dst_v, int pix) { | 3718 uint8* dst_u, uint8* dst_v, int pix) { |
3222 __asm { | 3719 __asm { |
3223 push esi | 3720 push esi |
3224 push edi | 3721 push edi |
3225 mov eax, [esp + 8 + 4] // src_uyvy | 3722 mov eax, [esp + 8 + 4] // src_uyvy |
3226 mov esi, [esp + 8 + 8] // stride_uyvy | 3723 mov esi, [esp + 8 + 8] // stride_uyvy |
3227 mov edx, [esp + 8 + 12] // dst_u | 3724 mov edx, [esp + 8 + 12] // dst_u |
3228 mov edi, [esp + 8 + 16] // dst_v | 3725 mov edi, [esp + 8 + 16] // dst_v |
3229 mov ecx, [esp + 8 + 20] // pix | 3726 mov ecx, [esp + 8 + 20] // pix |
(...skipping 23 matching lines...) Expand all Loading... |
3253 sub ecx, 32 | 3750 sub ecx, 32 |
3254 jg convertloop | 3751 jg convertloop |
3255 | 3752 |
3256 pop edi | 3753 pop edi |
3257 pop esi | 3754 pop esi |
3258 vzeroupper | 3755 vzeroupper |
3259 ret | 3756 ret |
3260 } | 3757 } |
3261 } | 3758 } |
3262 | 3759 |
3263 __declspec(naked) __declspec(align(16)) | 3760 __declspec(naked) |
3264 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | 3761 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, |
3265 uint8* dst_u, uint8* dst_v, int pix) { | 3762 uint8* dst_u, uint8* dst_v, int pix) { |
3266 __asm { | 3763 __asm { |
3267 push edi | 3764 push edi |
3268 mov eax, [esp + 4 + 4] // src_uyvy | 3765 mov eax, [esp + 4 + 4] // src_uyvy |
3269 mov edx, [esp + 4 + 8] // dst_u | 3766 mov edx, [esp + 4 + 8] // dst_u |
3270 mov edi, [esp + 4 + 12] // dst_v | 3767 mov edi, [esp + 4 + 12] // dst_v |
3271 mov ecx, [esp + 4 + 16] // pix | 3768 mov ecx, [esp + 4 + 16] // pix |
3272 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3769 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
3273 vpsrlw ymm5, ymm5, 8 | 3770 vpsrlw ymm5, ymm5, 8 |
(...skipping 20 matching lines...) Expand all Loading... |
3294 jg convertloop | 3791 jg convertloop |
3295 | 3792 |
3296 pop edi | 3793 pop edi |
3297 vzeroupper | 3794 vzeroupper |
3298 ret | 3795 ret |
3299 } | 3796 } |
3300 } | 3797 } |
3301 #endif // HAS_YUY2TOYROW_AVX2 | 3798 #endif // HAS_YUY2TOYROW_AVX2 |
3302 | 3799 |
3303 #ifdef HAS_YUY2TOYROW_SSE2 | 3800 #ifdef HAS_YUY2TOYROW_SSE2 |
3304 __declspec(naked) __declspec(align(16)) | 3801 __declspec(naked) |
3305 void YUY2ToYRow_SSE2(const uint8* src_yuy2, | 3802 void YUY2ToYRow_SSE2(const uint8* src_yuy2, |
3306 uint8* dst_y, int pix) { | 3803 uint8* dst_y, int pix) { |
3307 __asm { | 3804 __asm { |
3308 mov eax, [esp + 4] // src_yuy2 | 3805 mov eax, [esp + 4] // src_yuy2 |
3309 mov edx, [esp + 8] // dst_y | 3806 mov edx, [esp + 8] // dst_y |
3310 mov ecx, [esp + 12] // pix | 3807 mov ecx, [esp + 12] // pix |
3311 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3808 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
3312 psrlw xmm5, 8 | 3809 psrlw xmm5, 8 |
3313 | 3810 |
3314 convertloop: | 3811 convertloop: |
3315 movdqu xmm0, [eax] | 3812 movdqu xmm0, [eax] |
3316 movdqu xmm1, [eax + 16] | 3813 movdqu xmm1, [eax + 16] |
3317 lea eax, [eax + 32] | 3814 lea eax, [eax + 32] |
3318 pand xmm0, xmm5 // even bytes are Y | 3815 pand xmm0, xmm5 // even bytes are Y |
3319 pand xmm1, xmm5 | 3816 pand xmm1, xmm5 |
3320 packuswb xmm0, xmm1 | 3817 packuswb xmm0, xmm1 |
3321 movdqu [edx], xmm0 | 3818 movdqu [edx], xmm0 |
3322 lea edx, [edx + 16] | 3819 lea edx, [edx + 16] |
3323 sub ecx, 16 | 3820 sub ecx, 16 |
3324 jg convertloop | 3821 jg convertloop |
3325 ret | 3822 ret |
3326 } | 3823 } |
3327 } | 3824 } |
3328 | 3825 |
3329 __declspec(naked) __declspec(align(16)) | 3826 __declspec(naked) |
3330 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | 3827 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, |
3331 uint8* dst_u, uint8* dst_v, int pix) { | 3828 uint8* dst_u, uint8* dst_v, int pix) { |
3332 __asm { | 3829 __asm { |
3333 push esi | 3830 push esi |
3334 push edi | 3831 push edi |
3335 mov eax, [esp + 8 + 4] // src_yuy2 | 3832 mov eax, [esp + 8 + 4] // src_yuy2 |
3336 mov esi, [esp + 8 + 8] // stride_yuy2 | 3833 mov esi, [esp + 8 + 8] // stride_yuy2 |
3337 mov edx, [esp + 8 + 12] // dst_u | 3834 mov edx, [esp + 8 + 12] // dst_u |
3338 mov edi, [esp + 8 + 16] // dst_v | 3835 mov edi, [esp + 8 + 16] // dst_v |
3339 mov ecx, [esp + 8 + 20] // pix | 3836 mov ecx, [esp + 8 + 20] // pix |
(...skipping 22 matching lines...) Expand all Loading... |
3362 lea edx, [edx + 8] | 3859 lea edx, [edx + 8] |
3363 sub ecx, 16 | 3860 sub ecx, 16 |
3364 jg convertloop | 3861 jg convertloop |
3365 | 3862 |
3366 pop edi | 3863 pop edi |
3367 pop esi | 3864 pop esi |
3368 ret | 3865 ret |
3369 } | 3866 } |
3370 } | 3867 } |
3371 | 3868 |
3372 __declspec(naked) __declspec(align(16)) | 3869 __declspec(naked) |
3373 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | 3870 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
3374 uint8* dst_u, uint8* dst_v, int pix) { | 3871 uint8* dst_u, uint8* dst_v, int pix) { |
3375 __asm { | 3872 __asm { |
3376 push edi | 3873 push edi |
3377 mov eax, [esp + 4 + 4] // src_yuy2 | 3874 mov eax, [esp + 4 + 4] // src_yuy2 |
3378 mov edx, [esp + 4 + 8] // dst_u | 3875 mov edx, [esp + 4 + 8] // dst_u |
3379 mov edi, [esp + 4 + 12] // dst_v | 3876 mov edi, [esp + 4 + 12] // dst_v |
3380 mov ecx, [esp + 4 + 16] // pix | 3877 mov ecx, [esp + 4 + 16] // pix |
3381 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3878 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
3382 psrlw xmm5, 8 | 3879 psrlw xmm5, 8 |
(...skipping 15 matching lines...) Expand all Loading... |
3398 movq qword ptr [edx + edi], xmm1 | 3895 movq qword ptr [edx + edi], xmm1 |
3399 lea edx, [edx + 8] | 3896 lea edx, [edx + 8] |
3400 sub ecx, 16 | 3897 sub ecx, 16 |
3401 jg convertloop | 3898 jg convertloop |
3402 | 3899 |
3403 pop edi | 3900 pop edi |
3404 ret | 3901 ret |
3405 } | 3902 } |
3406 } | 3903 } |
3407 | 3904 |
3408 __declspec(naked) __declspec(align(16)) | 3905 __declspec(naked) |
3409 void UYVYToYRow_SSE2(const uint8* src_uyvy, | 3906 void UYVYToYRow_SSE2(const uint8* src_uyvy, |
3410 uint8* dst_y, int pix) { | 3907 uint8* dst_y, int pix) { |
3411 __asm { | 3908 __asm { |
3412 mov eax, [esp + 4] // src_uyvy | 3909 mov eax, [esp + 4] // src_uyvy |
3413 mov edx, [esp + 8] // dst_y | 3910 mov edx, [esp + 8] // dst_y |
3414 mov ecx, [esp + 12] // pix | 3911 mov ecx, [esp + 12] // pix |
3415 | 3912 |
3416 convertloop: | 3913 convertloop: |
3417 movdqu xmm0, [eax] | 3914 movdqu xmm0, [eax] |
3418 movdqu xmm1, [eax + 16] | 3915 movdqu xmm1, [eax + 16] |
3419 lea eax, [eax + 32] | 3916 lea eax, [eax + 32] |
3420 psrlw xmm0, 8 // odd bytes are Y | 3917 psrlw xmm0, 8 // odd bytes are Y |
3421 psrlw xmm1, 8 | 3918 psrlw xmm1, 8 |
3422 packuswb xmm0, xmm1 | 3919 packuswb xmm0, xmm1 |
3423 movdqu [edx], xmm0 | 3920 movdqu [edx], xmm0 |
3424 lea edx, [edx + 16] | 3921 lea edx, [edx + 16] |
3425 sub ecx, 16 | 3922 sub ecx, 16 |
3426 jg convertloop | 3923 jg convertloop |
3427 ret | 3924 ret |
3428 } | 3925 } |
3429 } | 3926 } |
3430 | 3927 |
3431 __declspec(naked) __declspec(align(16)) | 3928 __declspec(naked) |
3432 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | 3929 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, |
3433 uint8* dst_u, uint8* dst_v, int pix) { | 3930 uint8* dst_u, uint8* dst_v, int pix) { |
3434 __asm { | 3931 __asm { |
3435 push esi | 3932 push esi |
3436 push edi | 3933 push edi |
3437 mov eax, [esp + 8 + 4] // src_uyvy | 3934 mov eax, [esp + 8 + 4] // src_uyvy |
3438 mov esi, [esp + 8 + 8] // stride_uyvy | 3935 mov esi, [esp + 8 + 8] // stride_uyvy |
3439 mov edx, [esp + 8 + 12] // dst_u | 3936 mov edx, [esp + 8 + 12] // dst_u |
3440 mov edi, [esp + 8 + 16] // dst_v | 3937 mov edi, [esp + 8 + 16] // dst_v |
3441 mov ecx, [esp + 8 + 20] // pix | 3938 mov ecx, [esp + 8 + 20] // pix |
(...skipping 22 matching lines...) Expand all Loading... |
3464 lea edx, [edx + 8] | 3961 lea edx, [edx + 8] |
3465 sub ecx, 16 | 3962 sub ecx, 16 |
3466 jg convertloop | 3963 jg convertloop |
3467 | 3964 |
3468 pop edi | 3965 pop edi |
3469 pop esi | 3966 pop esi |
3470 ret | 3967 ret |
3471 } | 3968 } |
3472 } | 3969 } |
3473 | 3970 |
3474 __declspec(naked) __declspec(align(16)) | 3971 __declspec(naked) |
3475 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | 3972 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
3476 uint8* dst_u, uint8* dst_v, int pix) { | 3973 uint8* dst_u, uint8* dst_v, int pix) { |
3477 __asm { | 3974 __asm { |
3478 push edi | 3975 push edi |
3479 mov eax, [esp + 4 + 4] // src_uyvy | 3976 mov eax, [esp + 4 + 4] // src_uyvy |
3480 mov edx, [esp + 4 + 8] // dst_u | 3977 mov edx, [esp + 4 + 8] // dst_u |
3481 mov edi, [esp + 4 + 12] // dst_v | 3978 mov edi, [esp + 4 + 12] // dst_v |
3482 mov ecx, [esp + 4 + 16] // pix | 3979 mov ecx, [esp + 4 + 16] // pix |
3483 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3980 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
3484 psrlw xmm5, 8 | 3981 psrlw xmm5, 8 |
(...skipping 18 matching lines...) Expand all Loading... |
3503 jg convertloop | 4000 jg convertloop |
3504 | 4001 |
3505 pop edi | 4002 pop edi |
3506 ret | 4003 ret |
3507 } | 4004 } |
3508 } | 4005 } |
3509 #endif // HAS_YUY2TOYROW_SSE2 | 4006 #endif // HAS_YUY2TOYROW_SSE2 |
3510 | 4007 |
3511 #ifdef HAS_ARGBBLENDROW_SSE2 | 4008 #ifdef HAS_ARGBBLENDROW_SSE2 |
3512 // Blend 8 pixels at a time. | 4009 // Blend 8 pixels at a time. |
3513 __declspec(naked) __declspec(align(16)) | 4010 __declspec(naked) |
3514 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4011 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
3515 uint8* dst_argb, int width) { | 4012 uint8* dst_argb, int width) { |
3516 __asm { | 4013 __asm { |
3517 push esi | 4014 push esi |
3518 mov eax, [esp + 4 + 4] // src_argb0 | 4015 mov eax, [esp + 4 + 4] // src_argb0 |
3519 mov esi, [esp + 4 + 8] // src_argb1 | 4016 mov esi, [esp + 4 + 8] // src_argb1 |
3520 mov edx, [esp + 4 + 12] // dst_argb | 4017 mov edx, [esp + 4 + 12] // dst_argb |
3521 mov ecx, [esp + 4 + 16] // width | 4018 mov ecx, [esp + 4 + 16] // width |
3522 pcmpeqb xmm7, xmm7 // generate constant 1 | 4019 pcmpeqb xmm7, xmm7 // generate constant 1 |
3523 psrlw xmm7, 15 | 4020 psrlw xmm7, 15 |
3524 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff | 4021 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
3525 psrlw xmm6, 8 | 4022 psrlw xmm6, 8 |
3526 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 | 4023 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
3527 psllw xmm5, 8 | 4024 psllw xmm5, 8 |
3528 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 4025 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
3529 pslld xmm4, 24 | 4026 pslld xmm4, 24 |
3530 | 4027 sub ecx, 4 |
3531 sub ecx, 1 | 4028 jl convertloop4b // less than 4 pixels? |
3532 je convertloop1 // only 1 pixel? | |
3533 jl convertloop1b | |
3534 | |
3535 // 1 pixel loop until destination pointer is aligned. | |
3536 alignloop1: | |
3537 test edx, 15 // aligned? | |
3538 je alignloop1b | |
3539 movd xmm3, [eax] | |
3540 lea eax, [eax + 4] | |
3541 movdqa xmm0, xmm3 // src argb | |
3542 pxor xmm3, xmm4 // ~alpha | |
3543 movd xmm2, [esi] // _r_b | |
3544 psrlw xmm3, 8 // alpha | |
3545 pshufhw xmm3, xmm3, 0F5h // 8 alpha words | |
3546 pshuflw xmm3, xmm3, 0F5h | |
3547 pand xmm2, xmm6 // _r_b | |
3548 paddw xmm3, xmm7 // 256 - alpha | |
3549 pmullw xmm2, xmm3 // _r_b * alpha | |
3550 movd xmm1, [esi] // _a_g | |
3551 lea esi, [esi + 4] | |
3552 psrlw xmm1, 8 // _a_g | |
3553 por xmm0, xmm4 // set alpha to 255 | |
3554 pmullw xmm1, xmm3 // _a_g * alpha | |
3555 psrlw xmm2, 8 // _r_b convert to 8 bits again | |
3556 paddusb xmm0, xmm2 // + src argb | |
3557 pand xmm1, xmm5 // a_g_ convert to 8 bits again | |
3558 paddusb xmm0, xmm1 // + src argb | |
3559 movd [edx], xmm0 | |
3560 lea edx, [edx + 4] | |
3561 sub ecx, 1 | |
3562 jge alignloop1 | |
3563 | |
3564 alignloop1b: | |
3565 add ecx, 1 - 4 | |
3566 jl convertloop4b | |
3567 | 4029 |
3568 // 4 pixel loop. | 4030 // 4 pixel loop. |
3569 convertloop4: | 4031 convertloop4: |
3570 movdqu xmm3, [eax] // src argb | 4032 movdqu xmm3, [eax] // src argb |
3571 lea eax, [eax + 16] | 4033 lea eax, [eax + 16] |
3572 movdqa xmm0, xmm3 // src argb | 4034 movdqa xmm0, xmm3 // src argb |
3573 pxor xmm3, xmm4 // ~alpha | 4035 pxor xmm3, xmm4 // ~alpha |
3574 movdqu xmm2, [esi] // _r_b | 4036 movdqu xmm2, [esi] // _r_b |
3575 psrlw xmm3, 8 // alpha | 4037 psrlw xmm3, 8 // alpha |
3576 pshufhw xmm3, xmm3, 0F5h // 8 alpha words | 4038 pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3637 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | 4099 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
3638 }; | 4100 }; |
3639 // Same as SSE2, but replaces: | 4101 // Same as SSE2, but replaces: |
3640 // psrlw xmm3, 8 // alpha | 4102 // psrlw xmm3, 8 // alpha |
3641 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words | 4103 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
3642 // pshuflw xmm3, xmm3, 0F5h | 4104 // pshuflw xmm3, xmm3, 0F5h |
3643 // with.. | 4105 // with.. |
3644 // pshufb xmm3, kShuffleAlpha // alpha | 4106 // pshufb xmm3, kShuffleAlpha // alpha |
3645 // Blend 8 pixels at a time. | 4107 // Blend 8 pixels at a time. |
3646 | 4108 |
3647 __declspec(naked) __declspec(align(16)) | 4109 __declspec(naked) |
3648 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 4110 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
3649 uint8* dst_argb, int width) { | 4111 uint8* dst_argb, int width) { |
3650 __asm { | 4112 __asm { |
3651 push esi | 4113 push esi |
3652 mov eax, [esp + 4 + 4] // src_argb0 | 4114 mov eax, [esp + 4 + 4] // src_argb0 |
3653 mov esi, [esp + 4 + 8] // src_argb1 | 4115 mov esi, [esp + 4 + 8] // src_argb1 |
3654 mov edx, [esp + 4 + 12] // dst_argb | 4116 mov edx, [esp + 4 + 12] // dst_argb |
3655 mov ecx, [esp + 4 + 16] // width | 4117 mov ecx, [esp + 4 + 16] // width |
3656 pcmpeqb xmm7, xmm7 // generate constant 0x0001 | 4118 pcmpeqb xmm7, xmm7 // generate constant 0x0001 |
3657 psrlw xmm7, 15 | 4119 psrlw xmm7, 15 |
3658 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff | 4120 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
3659 psrlw xmm6, 8 | 4121 psrlw xmm6, 8 |
3660 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 | 4122 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
3661 psllw xmm5, 8 | 4123 psllw xmm5, 8 |
3662 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 4124 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
3663 pslld xmm4, 24 | 4125 pslld xmm4, 24 |
3664 | 4126 sub ecx, 4 |
3665 sub ecx, 1 | 4127 jl convertloop4b // less than 4 pixels? |
3666 je convertloop1 // only 1 pixel? | |
3667 jl convertloop1b | |
3668 | |
3669 // 1 pixel loop until destination pointer is aligned. | |
3670 alignloop1: | |
3671 test edx, 15 // aligned? | |
3672 je alignloop1b | |
3673 movd xmm3, [eax] | |
3674 lea eax, [eax + 4] | |
3675 movdqa xmm0, xmm3 // src argb | |
3676 pxor xmm3, xmm4 // ~alpha | |
3677 movd xmm2, [esi] // _r_b | |
3678 pshufb xmm3, kShuffleAlpha // alpha | |
3679 pand xmm2, xmm6 // _r_b | |
3680 paddw xmm3, xmm7 // 256 - alpha | |
3681 pmullw xmm2, xmm3 // _r_b * alpha | |
3682 movd xmm1, [esi] // _a_g | |
3683 lea esi, [esi + 4] | |
3684 psrlw xmm1, 8 // _a_g | |
3685 por xmm0, xmm4 // set alpha to 255 | |
3686 pmullw xmm1, xmm3 // _a_g * alpha | |
3687 psrlw xmm2, 8 // _r_b convert to 8 bits again | |
3688 paddusb xmm0, xmm2 // + src argb | |
3689 pand xmm1, xmm5 // a_g_ convert to 8 bits again | |
3690 paddusb xmm0, xmm1 // + src argb | |
3691 movd [edx], xmm0 | |
3692 lea edx, [edx + 4] | |
3693 sub ecx, 1 | |
3694 jge alignloop1 | |
3695 | |
3696 alignloop1b: | |
3697 add ecx, 1 - 4 | |
3698 jl convertloop4b | |
3699 | 4128 |
3700 // 4 pixel loop. | 4129 // 4 pixel loop. |
3701 convertloop4: | 4130 convertloop4: |
3702 movdqu xmm3, [eax] // src argb | 4131 movdqu xmm3, [eax] // src argb |
3703 lea eax, [eax + 16] | 4132 lea eax, [eax + 16] |
3704 movdqa xmm0, xmm3 // src argb | 4133 movdqa xmm0, xmm3 // src argb |
3705 pxor xmm3, xmm4 // ~alpha | 4134 pxor xmm3, xmm4 // ~alpha |
3706 movdqu xmm2, [esi] // _r_b | 4135 movdqu xmm2, [esi] // _r_b |
3707 pshufb xmm3, kShuffleAlpha // alpha | 4136 pshufb xmm3, kShuffleAlpha // alpha |
3708 pand xmm2, xmm6 // _r_b | 4137 pand xmm2, xmm6 // _r_b |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3753 | 4182 |
3754 convertloop1b: | 4183 convertloop1b: |
3755 pop esi | 4184 pop esi |
3756 ret | 4185 ret |
3757 } | 4186 } |
3758 } | 4187 } |
3759 #endif // HAS_ARGBBLENDROW_SSSE3 | 4188 #endif // HAS_ARGBBLENDROW_SSSE3 |
3760 | 4189 |
3761 #ifdef HAS_ARGBATTENUATEROW_SSE2 | 4190 #ifdef HAS_ARGBATTENUATEROW_SSE2 |
3762 // Attenuate 4 pixels at a time. | 4191 // Attenuate 4 pixels at a time. |
3763 __declspec(naked) __declspec(align(16)) | 4192 __declspec(naked) |
3764 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { | 4193 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { |
3765 __asm { | 4194 __asm { |
3766 mov eax, [esp + 4] // src_argb0 | 4195 mov eax, [esp + 4] // src_argb0 |
3767 mov edx, [esp + 8] // dst_argb | 4196 mov edx, [esp + 8] // dst_argb |
3768 mov ecx, [esp + 12] // width | 4197 mov ecx, [esp + 12] // width |
3769 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 4198 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
3770 pslld xmm4, 24 | 4199 pslld xmm4, 24 |
3771 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff | 4200 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff |
3772 psrld xmm5, 8 | 4201 psrld xmm5, 8 |
3773 | 4202 |
(...skipping 28 matching lines...) Expand all Loading... |
3802 | 4231 |
3803 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 4232 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
3804 // Shuffle table duplicating alpha. | 4233 // Shuffle table duplicating alpha. |
3805 static const uvec8 kShuffleAlpha0 = { | 4234 static const uvec8 kShuffleAlpha0 = { |
3806 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, | 4235 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, |
3807 }; | 4236 }; |
3808 static const uvec8 kShuffleAlpha1 = { | 4237 static const uvec8 kShuffleAlpha1 = { |
3809 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | 4238 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
3810 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, | 4239 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, |
3811 }; | 4240 }; |
3812 __declspec(naked) __declspec(align(16)) | 4241 __declspec(naked) |
3813 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | 4242 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
3814 __asm { | 4243 __asm { |
3815 mov eax, [esp + 4] // src_argb0 | 4244 mov eax, [esp + 4] // src_argb0 |
3816 mov edx, [esp + 8] // dst_argb | 4245 mov edx, [esp + 8] // dst_argb |
3817 mov ecx, [esp + 12] // width | 4246 mov ecx, [esp + 12] // width |
3818 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 | 4247 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 |
3819 pslld xmm3, 24 | 4248 pslld xmm3, 24 |
3820 movdqa xmm4, kShuffleAlpha0 | 4249 movdqa xmm4, kShuffleAlpha0 |
3821 movdqa xmm5, kShuffleAlpha1 | 4250 movdqa xmm5, kShuffleAlpha1 |
3822 | 4251 |
(...skipping 23 matching lines...) Expand all Loading... |
3846 ret | 4275 ret |
3847 } | 4276 } |
3848 } | 4277 } |
3849 #endif // HAS_ARGBATTENUATEROW_SSSE3 | 4278 #endif // HAS_ARGBATTENUATEROW_SSSE3 |
3850 | 4279 |
3851 #ifdef HAS_ARGBATTENUATEROW_AVX2 | 4280 #ifdef HAS_ARGBATTENUATEROW_AVX2 |
3852 // Shuffle table duplicating alpha. | 4281 // Shuffle table duplicating alpha. |
3853 static const uvec8 kShuffleAlpha_AVX2 = { | 4282 static const uvec8 kShuffleAlpha_AVX2 = { |
3854 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u | 4283 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u |
3855 }; | 4284 }; |
3856 __declspec(naked) __declspec(align(16)) | 4285 __declspec(naked) |
3857 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { | 4286 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { |
3858 __asm { | 4287 __asm { |
3859 mov eax, [esp + 4] // src_argb0 | 4288 mov eax, [esp + 4] // src_argb0 |
3860 mov edx, [esp + 8] // dst_argb | 4289 mov edx, [esp + 8] // dst_argb |
3861 mov ecx, [esp + 12] // width | 4290 mov ecx, [esp + 12] // width |
3862 sub edx, eax | 4291 sub edx, eax |
3863 vbroadcastf128 ymm4,kShuffleAlpha_AVX2 | 4292 vbroadcastf128 ymm4,kShuffleAlpha_AVX2 |
3864 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 | 4293 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
3865 vpslld ymm5, ymm5, 24 | 4294 vpslld ymm5, ymm5, 24 |
3866 | 4295 |
(...skipping 16 matching lines...) Expand all Loading... |
3883 jg convertloop | 4312 jg convertloop |
3884 | 4313 |
3885 vzeroupper | 4314 vzeroupper |
3886 ret | 4315 ret |
3887 } | 4316 } |
3888 } | 4317 } |
3889 #endif // HAS_ARGBATTENUATEROW_AVX2 | 4318 #endif // HAS_ARGBATTENUATEROW_AVX2 |
3890 | 4319 |
3891 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 | 4320 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
3892 // Unattenuate 4 pixels at a time. | 4321 // Unattenuate 4 pixels at a time. |
3893 __declspec(naked) __declspec(align(16)) | 4322 __declspec(naked) |
3894 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 4323 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
3895 int width) { | 4324 int width) { |
3896 __asm { | 4325 __asm { |
3897 push esi | 4326 push esi |
3898 push edi | 4327 push edi |
3899 mov eax, [esp + 8 + 4] // src_argb0 | 4328 mov eax, [esp + 8 + 4] // src_argb0 |
3900 mov edx, [esp + 8 + 8] // dst_argb | 4329 mov edx, [esp + 8 + 8] // dst_argb |
3901 mov ecx, [esp + 8 + 12] // width | 4330 mov ecx, [esp + 8 + 12] // width |
3902 | 4331 |
3903 convertloop: | 4332 convertloop: |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3937 #endif // HAS_ARGBUNATTENUATEROW_SSE2 | 4366 #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
3938 | 4367 |
3939 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 | 4368 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 |
3940 // Shuffle table duplicating alpha. | 4369 // Shuffle table duplicating alpha. |
3941 static const uvec8 kUnattenShuffleAlpha_AVX2 = { | 4370 static const uvec8 kUnattenShuffleAlpha_AVX2 = { |
3942 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u | 4371 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u |
3943 }; | 4372 }; |
3944 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. | 4373 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. |
3945 // USE_GATHER is not on by default, due to being a slow instruction. | 4374 // USE_GATHER is not on by default, due to being a slow instruction. |
3946 #ifdef USE_GATHER | 4375 #ifdef USE_GATHER |
3947 __declspec(naked) __declspec(align(16)) | 4376 __declspec(naked) |
3948 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 4377 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
3949 int width) { | 4378 int width) { |
3950 __asm { | 4379 __asm { |
3951 mov eax, [esp + 4] // src_argb0 | 4380 mov eax, [esp + 4] // src_argb0 |
3952 mov edx, [esp + 8] // dst_argb | 4381 mov edx, [esp + 8] // dst_argb |
3953 mov ecx, [esp + 12] // width | 4382 mov ecx, [esp + 12] // width |
3954 sub edx, eax | 4383 sub edx, eax |
3955 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 | 4384 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 |
3956 | 4385 |
3957 convertloop: | 4386 convertloop: |
(...skipping 13 matching lines...) Expand all Loading... |
3971 vmovdqu [eax + edx], ymm0 | 4400 vmovdqu [eax + edx], ymm0 |
3972 lea eax, [eax + 32] | 4401 lea eax, [eax + 32] |
3973 sub ecx, 8 | 4402 sub ecx, 8 |
3974 jg convertloop | 4403 jg convertloop |
3975 | 4404 |
3976 vzeroupper | 4405 vzeroupper |
3977 ret | 4406 ret |
3978 } | 4407 } |
3979 } | 4408 } |
3980 #else // USE_GATHER | 4409 #else // USE_GATHER |
3981 __declspec(naked) __declspec(align(16)) | 4410 __declspec(naked) |
3982 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 4411 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
3983 int width) { | 4412 int width) { |
3984 __asm { | 4413 __asm { |
3985 | 4414 |
3986 mov eax, [esp + 4] // src_argb0 | 4415 mov eax, [esp + 4] // src_argb0 |
3987 mov edx, [esp + 8] // dst_argb | 4416 mov edx, [esp + 8] // dst_argb |
3988 mov ecx, [esp + 12] // width | 4417 mov ecx, [esp + 12] // width |
3989 sub edx, eax | 4418 sub edx, eax |
3990 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 | 4419 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 |
3991 | 4420 |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4038 pop esi | 4467 pop esi |
4039 vzeroupper | 4468 vzeroupper |
4040 ret | 4469 ret |
4041 } | 4470 } |
4042 } | 4471 } |
4043 #endif // USE_GATHER | 4472 #endif // USE_GATHER |
4044 #endif // HAS_ARGBUNATTENUATEROW_AVX2 | 4473 #endif // HAS_ARGBUNATTENUATEROW_AVX2 |
4045 | 4474 |
4046 #ifdef HAS_ARGBGRAYROW_SSSE3 | 4475 #ifdef HAS_ARGBGRAYROW_SSSE3 |
4047 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. | 4476 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. |
4048 __declspec(naked) __declspec(align(16)) | 4477 __declspec(naked) |
4049 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | 4478 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
4050 __asm { | 4479 __asm { |
4051 mov eax, [esp + 4] /* src_argb */ | 4480 mov eax, [esp + 4] /* src_argb */ |
4052 mov edx, [esp + 8] /* dst_argb */ | 4481 mov edx, [esp + 8] /* dst_argb */ |
4053 mov ecx, [esp + 12] /* width */ | 4482 mov ecx, [esp + 12] /* width */ |
4054 movdqa xmm4, kARGBToYJ | 4483 movdqa xmm4, kARGBToYJ |
4055 movdqa xmm5, kAddYJ64 | 4484 movdqa xmm5, kAddYJ64 |
4056 | 4485 |
4057 convertloop: | 4486 convertloop: |
4058 movdqu xmm0, [eax] // G | 4487 movdqu xmm0, [eax] // G |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4097 | 4526 |
4098 static const vec8 kARGBToSepiaG = { | 4527 static const vec8 kARGBToSepiaG = { |
4099 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 | 4528 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 |
4100 }; | 4529 }; |
4101 | 4530 |
4102 static const vec8 kARGBToSepiaR = { | 4531 static const vec8 kARGBToSepiaR = { |
4103 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 | 4532 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 |
4104 }; | 4533 }; |
4105 | 4534 |
4106 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | 4535 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
4107 __declspec(naked) __declspec(align(16)) | 4536 __declspec(naked) |
4108 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { | 4537 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { |
4109 __asm { | 4538 __asm { |
4110 mov eax, [esp + 4] /* dst_argb */ | 4539 mov eax, [esp + 4] /* dst_argb */ |
4111 mov ecx, [esp + 8] /* width */ | 4540 mov ecx, [esp + 8] /* width */ |
4112 movdqa xmm2, kARGBToSepiaB | 4541 movdqa xmm2, kARGBToSepiaB |
4113 movdqa xmm3, kARGBToSepiaG | 4542 movdqa xmm3, kARGBToSepiaG |
4114 movdqa xmm4, kARGBToSepiaR | 4543 movdqa xmm4, kARGBToSepiaR |
4115 | 4544 |
4116 convertloop: | 4545 convertloop: |
4117 movdqu xmm0, [eax] // B | 4546 movdqu xmm0, [eax] // B |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4154 ret | 4583 ret |
4155 } | 4584 } |
4156 } | 4585 } |
4157 #endif // HAS_ARGBSEPIAROW_SSSE3 | 4586 #endif // HAS_ARGBSEPIAROW_SSSE3 |
4158 | 4587 |
4159 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 | 4588 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
4160 // Tranform 8 ARGB pixels (32 bytes) with color matrix. | 4589 // Tranform 8 ARGB pixels (32 bytes) with color matrix. |
4161 // Same as Sepia except matrix is provided. | 4590 // Same as Sepia except matrix is provided. |
4162 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R | 4591 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R |
4163 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. | 4592 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. |
4164 __declspec(naked) __declspec(align(16)) | 4593 __declspec(naked) |
4165 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 4594 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
4166 const int8* matrix_argb, int width) { | 4595 const int8* matrix_argb, int width) { |
4167 __asm { | 4596 __asm { |
4168 mov eax, [esp + 4] /* src_argb */ | 4597 mov eax, [esp + 4] /* src_argb */ |
4169 mov edx, [esp + 8] /* dst_argb */ | 4598 mov edx, [esp + 8] /* dst_argb */ |
4170 mov ecx, [esp + 12] /* matrix_argb */ | 4599 mov ecx, [esp + 12] /* matrix_argb */ |
4171 movdqu xmm5, [ecx] | 4600 movdqu xmm5, [ecx] |
4172 pshufd xmm2, xmm5, 0x00 | 4601 pshufd xmm2, xmm5, 0x00 |
4173 pshufd xmm3, xmm5, 0x55 | 4602 pshufd xmm3, xmm5, 0x55 |
4174 pshufd xmm4, xmm5, 0xaa | 4603 pshufd xmm4, xmm5, 0xaa |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4215 lea edx, [edx + 32] | 4644 lea edx, [edx + 32] |
4216 sub ecx, 8 | 4645 sub ecx, 8 |
4217 jg convertloop | 4646 jg convertloop |
4218 ret | 4647 ret |
4219 } | 4648 } |
4220 } | 4649 } |
4221 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 | 4650 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
4222 | 4651 |
4223 #ifdef HAS_ARGBQUANTIZEROW_SSE2 | 4652 #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
4224 // Quantize 4 ARGB pixels (16 bytes). | 4653 // Quantize 4 ARGB pixels (16 bytes). |
4225 __declspec(naked) __declspec(align(16)) | 4654 __declspec(naked) |
4226 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, | 4655 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, |
4227 int interval_offset, int width) { | 4656 int interval_offset, int width) { |
4228 __asm { | 4657 __asm { |
4229 mov eax, [esp + 4] /* dst_argb */ | 4658 mov eax, [esp + 4] /* dst_argb */ |
4230 movd xmm2, [esp + 8] /* scale */ | 4659 movd xmm2, [esp + 8] /* scale */ |
4231 movd xmm3, [esp + 12] /* interval_size */ | 4660 movd xmm3, [esp + 12] /* interval_size */ |
4232 movd xmm4, [esp + 16] /* interval_offset */ | 4661 movd xmm4, [esp + 16] /* interval_offset */ |
4233 mov ecx, [esp + 20] /* width */ | 4662 mov ecx, [esp + 20] /* width */ |
4234 pshuflw xmm2, xmm2, 040h | 4663 pshuflw xmm2, xmm2, 040h |
4235 pshufd xmm2, xmm2, 044h | 4664 pshufd xmm2, xmm2, 044h |
(...skipping 24 matching lines...) Expand all Loading... |
4260 lea eax, [eax + 16] | 4689 lea eax, [eax + 16] |
4261 sub ecx, 4 | 4690 sub ecx, 4 |
4262 jg convertloop | 4691 jg convertloop |
4263 ret | 4692 ret |
4264 } | 4693 } |
4265 } | 4694 } |
4266 #endif // HAS_ARGBQUANTIZEROW_SSE2 | 4695 #endif // HAS_ARGBQUANTIZEROW_SSE2 |
4267 | 4696 |
4268 #ifdef HAS_ARGBSHADEROW_SSE2 | 4697 #ifdef HAS_ARGBSHADEROW_SSE2 |
4269 // Shade 4 pixels at a time by specified value. | 4698 // Shade 4 pixels at a time by specified value. |
4270 __declspec(naked) __declspec(align(16)) | 4699 __declspec(naked) |
4271 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, | 4700 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, |
4272 uint32 value) { | 4701 uint32 value) { |
4273 __asm { | 4702 __asm { |
4274 mov eax, [esp + 4] // src_argb | 4703 mov eax, [esp + 4] // src_argb |
4275 mov edx, [esp + 8] // dst_argb | 4704 mov edx, [esp + 8] // dst_argb |
4276 mov ecx, [esp + 12] // width | 4705 mov ecx, [esp + 12] // width |
4277 movd xmm2, [esp + 16] // value | 4706 movd xmm2, [esp + 16] // value |
4278 punpcklbw xmm2, xmm2 | 4707 punpcklbw xmm2, xmm2 |
4279 punpcklqdq xmm2, xmm2 | 4708 punpcklqdq xmm2, xmm2 |
4280 | 4709 |
(...skipping 13 matching lines...) Expand all Loading... |
4294 sub ecx, 4 | 4723 sub ecx, 4 |
4295 jg convertloop | 4724 jg convertloop |
4296 | 4725 |
4297 ret | 4726 ret |
4298 } | 4727 } |
4299 } | 4728 } |
4300 #endif // HAS_ARGBSHADEROW_SSE2 | 4729 #endif // HAS_ARGBSHADEROW_SSE2 |
4301 | 4730 |
4302 #ifdef HAS_ARGBMULTIPLYROW_SSE2 | 4731 #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
4303 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. | 4732 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. |
4304 __declspec(naked) __declspec(align(16)) | 4733 __declspec(naked) |
4305 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4734 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
4306 uint8* dst_argb, int width) { | 4735 uint8* dst_argb, int width) { |
4307 __asm { | 4736 __asm { |
4308 push esi | 4737 push esi |
4309 mov eax, [esp + 4 + 4] // src_argb0 | 4738 mov eax, [esp + 4 + 4] // src_argb0 |
4310 mov esi, [esp + 4 + 8] // src_argb1 | 4739 mov esi, [esp + 4 + 8] // src_argb1 |
4311 mov edx, [esp + 4 + 12] // dst_argb | 4740 mov edx, [esp + 4 + 12] // dst_argb |
4312 mov ecx, [esp + 4 + 16] // width | 4741 mov ecx, [esp + 4 + 16] // width |
4313 pxor xmm5, xmm5 // constant 0 | 4742 pxor xmm5, xmm5 // constant 0 |
4314 | 4743 |
(...skipping 18 matching lines...) Expand all Loading... |
4333 | 4762 |
4334 pop esi | 4763 pop esi |
4335 ret | 4764 ret |
4336 } | 4765 } |
4337 } | 4766 } |
4338 #endif // HAS_ARGBMULTIPLYROW_SSE2 | 4767 #endif // HAS_ARGBMULTIPLYROW_SSE2 |
4339 | 4768 |
4340 #ifdef HAS_ARGBADDROW_SSE2 | 4769 #ifdef HAS_ARGBADDROW_SSE2 |
4341 // Add 2 rows of ARGB pixels together, 4 pixels at a time. | 4770 // Add 2 rows of ARGB pixels together, 4 pixels at a time. |
4342 // TODO(fbarchard): Port this to posix, neon and other math functions. | 4771 // TODO(fbarchard): Port this to posix, neon and other math functions. |
4343 __declspec(naked) __declspec(align(16)) | 4772 __declspec(naked) |
4344 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4773 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
4345 uint8* dst_argb, int width) { | 4774 uint8* dst_argb, int width) { |
4346 __asm { | 4775 __asm { |
4347 push esi | 4776 push esi |
4348 mov eax, [esp + 4 + 4] // src_argb0 | 4777 mov eax, [esp + 4 + 4] // src_argb0 |
4349 mov esi, [esp + 4 + 8] // src_argb1 | 4778 mov esi, [esp + 4 + 8] // src_argb1 |
4350 mov edx, [esp + 4 + 12] // dst_argb | 4779 mov edx, [esp + 4 + 12] // dst_argb |
4351 mov ecx, [esp + 4 + 16] // width | 4780 mov ecx, [esp + 4 + 16] // width |
4352 | 4781 |
4353 sub ecx, 4 | 4782 sub ecx, 4 |
(...skipping 27 matching lines...) Expand all Loading... |
4381 | 4810 |
4382 convertloop19: | 4811 convertloop19: |
4383 pop esi | 4812 pop esi |
4384 ret | 4813 ret |
4385 } | 4814 } |
4386 } | 4815 } |
4387 #endif // HAS_ARGBADDROW_SSE2 | 4816 #endif // HAS_ARGBADDROW_SSE2 |
4388 | 4817 |
4389 #ifdef HAS_ARGBSUBTRACTROW_SSE2 | 4818 #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
4390 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. | 4819 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. |
4391 __declspec(naked) __declspec(align(16)) | 4820 __declspec(naked) |
4392 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4821 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
4393 uint8* dst_argb, int width) { | 4822 uint8* dst_argb, int width) { |
4394 __asm { | 4823 __asm { |
4395 push esi | 4824 push esi |
4396 mov eax, [esp + 4 + 4] // src_argb0 | 4825 mov eax, [esp + 4 + 4] // src_argb0 |
4397 mov esi, [esp + 4 + 8] // src_argb1 | 4826 mov esi, [esp + 4 + 8] // src_argb1 |
4398 mov edx, [esp + 4 + 12] // dst_argb | 4827 mov edx, [esp + 4 + 12] // dst_argb |
4399 mov ecx, [esp + 4 + 16] // width | 4828 mov ecx, [esp + 4 + 16] // width |
4400 | 4829 |
4401 convertloop: | 4830 convertloop: |
4402 movdqu xmm0, [eax] // read 4 pixels from src_argb0 | 4831 movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
4403 lea eax, [eax + 16] | 4832 lea eax, [eax + 16] |
4404 movdqu xmm1, [esi] // read 4 pixels from src_argb1 | 4833 movdqu xmm1, [esi] // read 4 pixels from src_argb1 |
4405 lea esi, [esi + 16] | 4834 lea esi, [esi + 16] |
4406 psubusb xmm0, xmm1 // src_argb0 - src_argb1 | 4835 psubusb xmm0, xmm1 // src_argb0 - src_argb1 |
4407 movdqu [edx], xmm0 | 4836 movdqu [edx], xmm0 |
4408 lea edx, [edx + 16] | 4837 lea edx, [edx + 16] |
4409 sub ecx, 4 | 4838 sub ecx, 4 |
4410 jg convertloop | 4839 jg convertloop |
4411 | 4840 |
4412 pop esi | 4841 pop esi |
4413 ret | 4842 ret |
4414 } | 4843 } |
4415 } | 4844 } |
4416 #endif // HAS_ARGBSUBTRACTROW_SSE2 | 4845 #endif // HAS_ARGBSUBTRACTROW_SSE2 |
4417 | 4846 |
4418 #ifdef HAS_ARGBMULTIPLYROW_AVX2 | 4847 #ifdef HAS_ARGBMULTIPLYROW_AVX2 |
4419 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 4848 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
4420 __declspec(naked) __declspec(align(16)) | 4849 __declspec(naked) |
4421 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4850 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
4422 uint8* dst_argb, int width) { | 4851 uint8* dst_argb, int width) { |
4423 __asm { | 4852 __asm { |
4424 push esi | 4853 push esi |
4425 mov eax, [esp + 4 + 4] // src_argb0 | 4854 mov eax, [esp + 4 + 4] // src_argb0 |
4426 mov esi, [esp + 4 + 8] // src_argb1 | 4855 mov esi, [esp + 4 + 8] // src_argb1 |
4427 mov edx, [esp + 4 + 12] // dst_argb | 4856 mov edx, [esp + 4 + 12] // dst_argb |
4428 mov ecx, [esp + 4 + 16] // width | 4857 mov ecx, [esp + 4 + 16] // width |
4429 vpxor ymm5, ymm5, ymm5 // constant 0 | 4858 vpxor ymm5, ymm5, ymm5 // constant 0 |
4430 | 4859 |
(...skipping 16 matching lines...) Expand all Loading... |
4447 | 4876 |
4448 pop esi | 4877 pop esi |
4449 vzeroupper | 4878 vzeroupper |
4450 ret | 4879 ret |
4451 } | 4880 } |
4452 } | 4881 } |
4453 #endif // HAS_ARGBMULTIPLYROW_AVX2 | 4882 #endif // HAS_ARGBMULTIPLYROW_AVX2 |
4454 | 4883 |
4455 #ifdef HAS_ARGBADDROW_AVX2 | 4884 #ifdef HAS_ARGBADDROW_AVX2 |
4456 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | 4885 // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
4457 __declspec(naked) __declspec(align(16)) | 4886 __declspec(naked) |
4458 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4887 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
4459 uint8* dst_argb, int width) { | 4888 uint8* dst_argb, int width) { |
4460 __asm { | 4889 __asm { |
4461 push esi | 4890 push esi |
4462 mov eax, [esp + 4 + 4] // src_argb0 | 4891 mov eax, [esp + 4 + 4] // src_argb0 |
4463 mov esi, [esp + 4 + 8] // src_argb1 | 4892 mov esi, [esp + 4 + 8] // src_argb1 |
4464 mov edx, [esp + 4 + 12] // dst_argb | 4893 mov edx, [esp + 4 + 12] // dst_argb |
4465 mov ecx, [esp + 4 + 16] // width | 4894 mov ecx, [esp + 4 + 16] // width |
4466 | 4895 |
4467 convertloop: | 4896 convertloop: |
4468 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 | 4897 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 |
4469 lea eax, [eax + 32] | 4898 lea eax, [eax + 32] |
4470 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 | 4899 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 |
4471 lea esi, [esi + 32] | 4900 lea esi, [esi + 32] |
4472 vmovdqu [edx], ymm0 | 4901 vmovdqu [edx], ymm0 |
4473 lea edx, [edx + 32] | 4902 lea edx, [edx + 32] |
4474 sub ecx, 8 | 4903 sub ecx, 8 |
4475 jg convertloop | 4904 jg convertloop |
4476 | 4905 |
4477 pop esi | 4906 pop esi |
4478 vzeroupper | 4907 vzeroupper |
4479 ret | 4908 ret |
4480 } | 4909 } |
4481 } | 4910 } |
4482 #endif // HAS_ARGBADDROW_AVX2 | 4911 #endif // HAS_ARGBADDROW_AVX2 |
4483 | 4912 |
4484 #ifdef HAS_ARGBSUBTRACTROW_AVX2 | 4913 #ifdef HAS_ARGBSUBTRACTROW_AVX2 |
4485 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. | 4914 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. |
4486 __declspec(naked) __declspec(align(16)) | 4915 __declspec(naked) |
4487 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4916 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
4488 uint8* dst_argb, int width) { | 4917 uint8* dst_argb, int width) { |
4489 __asm { | 4918 __asm { |
4490 push esi | 4919 push esi |
4491 mov eax, [esp + 4 + 4] // src_argb0 | 4920 mov eax, [esp + 4 + 4] // src_argb0 |
4492 mov esi, [esp + 4 + 8] // src_argb1 | 4921 mov esi, [esp + 4 + 8] // src_argb1 |
4493 mov edx, [esp + 4 + 12] // dst_argb | 4922 mov edx, [esp + 4 + 12] // dst_argb |
4494 mov ecx, [esp + 4 + 16] // width | 4923 mov ecx, [esp + 4 + 16] // width |
4495 | 4924 |
4496 convertloop: | 4925 convertloop: |
(...skipping 11 matching lines...) Expand all Loading... |
4508 ret | 4937 ret |
4509 } | 4938 } |
4510 } | 4939 } |
4511 #endif // HAS_ARGBSUBTRACTROW_AVX2 | 4940 #endif // HAS_ARGBSUBTRACTROW_AVX2 |
4512 | 4941 |
4513 #ifdef HAS_SOBELXROW_SSE2 | 4942 #ifdef HAS_SOBELXROW_SSE2 |
4514 // SobelX as a matrix is | 4943 // SobelX as a matrix is |
4515 // -1 0 1 | 4944 // -1 0 1 |
4516 // -2 0 2 | 4945 // -2 0 2 |
4517 // -1 0 1 | 4946 // -1 0 1 |
4518 __declspec(naked) __declspec(align(16)) | 4947 __declspec(naked) |
4519 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 4948 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
4520 const uint8* src_y2, uint8* dst_sobelx, int width) { | 4949 const uint8* src_y2, uint8* dst_sobelx, int width) { |
4521 __asm { | 4950 __asm { |
4522 push esi | 4951 push esi |
4523 push edi | 4952 push edi |
4524 mov eax, [esp + 8 + 4] // src_y0 | 4953 mov eax, [esp + 8 + 4] // src_y0 |
4525 mov esi, [esp + 8 + 8] // src_y1 | 4954 mov esi, [esp + 8 + 8] // src_y1 |
4526 mov edi, [esp + 8 + 12] // src_y2 | 4955 mov edi, [esp + 8 + 12] // src_y2 |
4527 mov edx, [esp + 8 + 16] // dst_sobelx | 4956 mov edx, [esp + 8 + 16] // dst_sobelx |
4528 mov ecx, [esp + 8 + 20] // width | 4957 mov ecx, [esp + 8 + 20] // width |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4564 ret | 4993 ret |
4565 } | 4994 } |
4566 } | 4995 } |
4567 #endif // HAS_SOBELXROW_SSE2 | 4996 #endif // HAS_SOBELXROW_SSE2 |
4568 | 4997 |
4569 #ifdef HAS_SOBELYROW_SSE2 | 4998 #ifdef HAS_SOBELYROW_SSE2 |
4570 // SobelY as a matrix is | 4999 // SobelY as a matrix is |
4571 // -1 -2 -1 | 5000 // -1 -2 -1 |
4572 // 0 0 0 | 5001 // 0 0 0 |
4573 // 1 2 1 | 5002 // 1 2 1 |
4574 __declspec(naked) __declspec(align(16)) | 5003 __declspec(naked) |
4575 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 5004 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
4576 uint8* dst_sobely, int width) { | 5005 uint8* dst_sobely, int width) { |
4577 __asm { | 5006 __asm { |
4578 push esi | 5007 push esi |
4579 mov eax, [esp + 4 + 4] // src_y0 | 5008 mov eax, [esp + 4 + 4] // src_y0 |
4580 mov esi, [esp + 4 + 8] // src_y1 | 5009 mov esi, [esp + 4 + 8] // src_y1 |
4581 mov edx, [esp + 4 + 12] // dst_sobely | 5010 mov edx, [esp + 4 + 12] // dst_sobely |
4582 mov ecx, [esp + 4 + 16] // width | 5011 mov ecx, [esp + 4 + 16] // width |
4583 sub esi, eax | 5012 sub esi, eax |
4584 sub edx, eax | 5013 sub edx, eax |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4617 } | 5046 } |
4618 } | 5047 } |
4619 #endif // HAS_SOBELYROW_SSE2 | 5048 #endif // HAS_SOBELYROW_SSE2 |
4620 | 5049 |
4621 #ifdef HAS_SOBELROW_SSE2 | 5050 #ifdef HAS_SOBELROW_SSE2 |
4622 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | 5051 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
4623 // A = 255 | 5052 // A = 255 |
4624 // R = Sobel | 5053 // R = Sobel |
4625 // G = Sobel | 5054 // G = Sobel |
4626 // B = Sobel | 5055 // B = Sobel |
4627 __declspec(naked) __declspec(align(16)) | 5056 __declspec(naked) |
4628 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 5057 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
4629 uint8* dst_argb, int width) { | 5058 uint8* dst_argb, int width) { |
4630 __asm { | 5059 __asm { |
4631 push esi | 5060 push esi |
4632 mov eax, [esp + 4 + 4] // src_sobelx | 5061 mov eax, [esp + 4 + 4] // src_sobelx |
4633 mov esi, [esp + 4 + 8] // src_sobely | 5062 mov esi, [esp + 4 + 8] // src_sobely |
4634 mov edx, [esp + 4 + 12] // dst_argb | 5063 mov edx, [esp + 4 + 12] // dst_argb |
4635 mov ecx, [esp + 4 + 16] // width | 5064 mov ecx, [esp + 4 + 16] // width |
4636 sub esi, eax | 5065 sub esi, eax |
4637 pcmpeqb xmm5, xmm5 // alpha 255 | 5066 pcmpeqb xmm5, xmm5 // alpha 255 |
(...skipping 26 matching lines...) Expand all Loading... |
4664 jg convertloop | 5093 jg convertloop |
4665 | 5094 |
4666 pop esi | 5095 pop esi |
4667 ret | 5096 ret |
4668 } | 5097 } |
4669 } | 5098 } |
4670 #endif // HAS_SOBELROW_SSE2 | 5099 #endif // HAS_SOBELROW_SSE2 |
4671 | 5100 |
4672 #ifdef HAS_SOBELTOPLANEROW_SSE2 | 5101 #ifdef HAS_SOBELTOPLANEROW_SSE2 |
4673 // Adds Sobel X and Sobel Y and stores Sobel into a plane. | 5102 // Adds Sobel X and Sobel Y and stores Sobel into a plane. |
4674 __declspec(naked) __declspec(align(16)) | 5103 __declspec(naked) |
4675 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 5104 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
4676 uint8* dst_y, int width) { | 5105 uint8* dst_y, int width) { |
4677 __asm { | 5106 __asm { |
4678 push esi | 5107 push esi |
4679 mov eax, [esp + 4 + 4] // src_sobelx | 5108 mov eax, [esp + 4 + 4] // src_sobelx |
4680 mov esi, [esp + 4 + 8] // src_sobely | 5109 mov esi, [esp + 4 + 8] // src_sobely |
4681 mov edx, [esp + 4 + 12] // dst_argb | 5110 mov edx, [esp + 4 + 12] // dst_argb |
4682 mov ecx, [esp + 4 + 16] // width | 5111 mov ecx, [esp + 4 + 16] // width |
4683 sub esi, eax | 5112 sub esi, eax |
4684 | 5113 |
(...skipping 12 matching lines...) Expand all Loading... |
4697 } | 5126 } |
4698 } | 5127 } |
4699 #endif // HAS_SOBELTOPLANEROW_SSE2 | 5128 #endif // HAS_SOBELTOPLANEROW_SSE2 |
4700 | 5129 |
4701 #ifdef HAS_SOBELXYROW_SSE2 | 5130 #ifdef HAS_SOBELXYROW_SSE2 |
4702 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | 5131 // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
4703 // A = 255 | 5132 // A = 255 |
4704 // R = Sobel X | 5133 // R = Sobel X |
4705 // G = Sobel | 5134 // G = Sobel |
4706 // B = Sobel Y | 5135 // B = Sobel Y |
4707 __declspec(naked) __declspec(align(16)) | 5136 __declspec(naked) |
4708 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 5137 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
4709 uint8* dst_argb, int width) { | 5138 uint8* dst_argb, int width) { |
4710 __asm { | 5139 __asm { |
4711 push esi | 5140 push esi |
4712 mov eax, [esp + 4 + 4] // src_sobelx | 5141 mov eax, [esp + 4 + 4] // src_sobelx |
4713 mov esi, [esp + 4 + 8] // src_sobely | 5142 mov esi, [esp + 4 + 8] // src_sobely |
4714 mov edx, [esp + 4 + 12] // dst_argb | 5143 mov edx, [esp + 4 + 12] // dst_argb |
4715 mov ecx, [esp + 4 + 16] // width | 5144 mov ecx, [esp + 4 + 16] // width |
4716 sub esi, eax | 5145 sub esi, eax |
4717 pcmpeqb xmm5, xmm5 // alpha 255 | 5146 pcmpeqb xmm5, xmm5 // alpha 255 |
(...skipping 266 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4984 sub ecx, 1 | 5413 sub ecx, 1 |
4985 jge l1 | 5414 jge l1 |
4986 | 5415 |
4987 l1b: | 5416 l1b: |
4988 } | 5417 } |
4989 } | 5418 } |
4990 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 | 5419 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
4991 | 5420 |
4992 #ifdef HAS_ARGBAFFINEROW_SSE2 | 5421 #ifdef HAS_ARGBAFFINEROW_SSE2 |
4993 // Copy ARGB pixels from source image with slope to a row of destination. | 5422 // Copy ARGB pixels from source image with slope to a row of destination. |
4994 __declspec(naked) __declspec(align(16)) | 5423 __declspec(naked) |
4995 LIBYUV_API | 5424 LIBYUV_API |
4996 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, | 5425 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, |
4997 uint8* dst_argb, const float* uv_dudv, int width) { | 5426 uint8* dst_argb, const float* uv_dudv, int width) { |
4998 __asm { | 5427 __asm { |
4999 push esi | 5428 push esi |
5000 push edi | 5429 push edi |
5001 mov eax, [esp + 12] // src_argb | 5430 mov eax, [esp + 12] // src_argb |
5002 mov esi, [esp + 16] // stride | 5431 mov esi, [esp + 16] // stride |
5003 mov edx, [esp + 20] // dst_argb | 5432 mov edx, [esp + 20] // dst_argb |
5004 mov ecx, [esp + 24] // pointer to uv_dudv | 5433 mov ecx, [esp + 24] // pointer to uv_dudv |
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5069 l1b: | 5498 l1b: |
5070 pop edi | 5499 pop edi |
5071 pop esi | 5500 pop esi |
5072 ret | 5501 ret |
5073 } | 5502 } |
5074 } | 5503 } |
5075 #endif // HAS_ARGBAFFINEROW_SSE2 | 5504 #endif // HAS_ARGBAFFINEROW_SSE2 |
5076 | 5505 |
5077 #ifdef HAS_INTERPOLATEROW_AVX2 | 5506 #ifdef HAS_INTERPOLATEROW_AVX2 |
5078 // Bilinear filter 32x2 -> 32x1 | 5507 // Bilinear filter 32x2 -> 32x1 |
5079 __declspec(naked) __declspec(align(16)) | 5508 __declspec(naked) |
5080 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, | 5509 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, |
5081 ptrdiff_t src_stride, int dst_width, | 5510 ptrdiff_t src_stride, int dst_width, |
5082 int source_y_fraction) { | 5511 int source_y_fraction) { |
5083 __asm { | 5512 __asm { |
5084 push esi | 5513 push esi |
5085 push edi | 5514 push edi |
5086 mov edi, [esp + 8 + 4] // dst_ptr | 5515 mov edi, [esp + 8 + 4] // dst_ptr |
5087 mov esi, [esp + 8 + 8] // src_ptr | 5516 mov esi, [esp + 8 + 8] // src_ptr |
5088 mov edx, [esp + 8 + 12] // src_stride | 5517 mov edx, [esp + 8 + 12] // src_stride |
5089 mov ecx, [esp + 8 + 16] // dst_width | 5518 mov ecx, [esp + 8 + 16] // dst_width |
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5166 xloop99: | 5595 xloop99: |
5167 pop edi | 5596 pop edi |
5168 pop esi | 5597 pop esi |
5169 vzeroupper | 5598 vzeroupper |
5170 ret | 5599 ret |
5171 } | 5600 } |
5172 } | 5601 } |
5173 #endif // HAS_INTERPOLATEROW_AVX2 | 5602 #endif // HAS_INTERPOLATEROW_AVX2 |
5174 | 5603 |
5175 // Bilinear filter 16x2 -> 16x1 | 5604 // Bilinear filter 16x2 -> 16x1 |
5176 __declspec(naked) __declspec(align(16)) | 5605 __declspec(naked) |
5177 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 5606 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
5178 ptrdiff_t src_stride, int dst_width, | 5607 ptrdiff_t src_stride, int dst_width, |
5179 int source_y_fraction) { | 5608 int source_y_fraction) { |
5180 __asm { | 5609 __asm { |
5181 push esi | 5610 push esi |
5182 push edi | 5611 push edi |
5183 mov edi, [esp + 8 + 4] // dst_ptr | 5612 mov edi, [esp + 8 + 4] // dst_ptr |
5184 mov esi, [esp + 8 + 8] // src_ptr | 5613 mov esi, [esp + 8 + 8] // src_ptr |
5185 mov edx, [esp + 8 + 12] // src_stride | 5614 mov edx, [esp + 8 + 12] // src_stride |
5186 mov ecx, [esp + 8 + 16] // dst_width | 5615 mov ecx, [esp + 8 + 16] // dst_width |
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5267 | 5696 |
5268 xloop99: | 5697 xloop99: |
5269 pop edi | 5698 pop edi |
5270 pop esi | 5699 pop esi |
5271 ret | 5700 ret |
5272 } | 5701 } |
5273 } | 5702 } |
5274 | 5703 |
5275 #ifdef HAS_INTERPOLATEROW_SSE2 | 5704 #ifdef HAS_INTERPOLATEROW_SSE2 |
5276 // Bilinear filter 16x2 -> 16x1 | 5705 // Bilinear filter 16x2 -> 16x1 |
5277 __declspec(naked) __declspec(align(16)) | 5706 __declspec(naked) |
5278 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, | 5707 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
5279 ptrdiff_t src_stride, int dst_width, | 5708 ptrdiff_t src_stride, int dst_width, |
5280 int source_y_fraction) { | 5709 int source_y_fraction) { |
5281 __asm { | 5710 __asm { |
5282 push esi | 5711 push esi |
5283 push edi | 5712 push edi |
5284 mov edi, [esp + 8 + 4] // dst_ptr | 5713 mov edi, [esp + 8 + 4] // dst_ptr |
5285 mov esi, [esp + 8 + 8] // src_ptr | 5714 mov esi, [esp + 8 + 8] // src_ptr |
5286 mov edx, [esp + 8 + 12] // src_stride | 5715 mov edx, [esp + 8 + 12] // src_stride |
5287 mov ecx, [esp + 8 + 16] // dst_width | 5716 mov ecx, [esp + 8 + 16] // dst_width |
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5373 jg xloop100 | 5802 jg xloop100 |
5374 | 5803 |
5375 xloop99: | 5804 xloop99: |
5376 pop edi | 5805 pop edi |
5377 pop esi | 5806 pop esi |
5378 ret | 5807 ret |
5379 } | 5808 } |
5380 } | 5809 } |
5381 #endif // HAS_INTERPOLATEROW_SSE2 | 5810 #endif // HAS_INTERPOLATEROW_SSE2 |
5382 | 5811 |
5383 // Specialized ARGB to Bayer that just isolates G channel. | |
5384 __declspec(naked) __declspec(align(16)) | |
5385 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, | |
5386 uint32 selector, int pix) { | |
5387 __asm { | |
5388 mov eax, [esp + 4] // src_argb | |
5389 mov edx, [esp + 8] // dst_bayer | |
5390 // selector | |
5391 mov ecx, [esp + 16] // pix | |
5392 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff | |
5393 psrld xmm5, 24 | |
5394 | |
5395 wloop: | |
5396 movdqu xmm0, [eax] | |
5397 movdqu xmm1, [eax + 16] | |
5398 lea eax, [eax + 32] | |
5399 psrld xmm0, 8 // Move green to bottom. | |
5400 psrld xmm1, 8 | |
5401 pand xmm0, xmm5 | |
5402 pand xmm1, xmm5 | |
5403 packssdw xmm0, xmm1 | |
5404 packuswb xmm0, xmm1 | |
5405 movq qword ptr [edx], xmm0 | |
5406 lea edx, [edx + 8] | |
5407 sub ecx, 8 | |
5408 jg wloop | |
5409 ret | |
5410 } | |
5411 } | |
5412 | |
5413 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5812 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
5414 __declspec(naked) __declspec(align(16)) | 5813 __declspec(naked) |
5415 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 5814 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
5416 const uint8* shuffler, int pix) { | 5815 const uint8* shuffler, int pix) { |
5417 __asm { | 5816 __asm { |
5418 mov eax, [esp + 4] // src_argb | 5817 mov eax, [esp + 4] // src_argb |
5419 mov edx, [esp + 8] // dst_argb | 5818 mov edx, [esp + 8] // dst_argb |
5420 mov ecx, [esp + 12] // shuffler | 5819 mov ecx, [esp + 12] // shuffler |
5421 movdqu xmm5, [ecx] | 5820 movdqu xmm5, [ecx] |
5422 mov ecx, [esp + 16] // pix | 5821 mov ecx, [esp + 16] // pix |
5423 | 5822 |
5424 wloop: | 5823 wloop: |
5425 movdqu xmm0, [eax] | 5824 movdqu xmm0, [eax] |
5426 movdqu xmm1, [eax + 16] | 5825 movdqu xmm1, [eax + 16] |
5427 lea eax, [eax + 32] | 5826 lea eax, [eax + 32] |
5428 pshufb xmm0, xmm5 | 5827 pshufb xmm0, xmm5 |
5429 pshufb xmm1, xmm5 | 5828 pshufb xmm1, xmm5 |
5430 movdqu [edx], xmm0 | 5829 movdqu [edx], xmm0 |
5431 movdqu [edx + 16], xmm1 | 5830 movdqu [edx + 16], xmm1 |
5432 lea edx, [edx + 32] | 5831 lea edx, [edx + 32] |
5433 sub ecx, 8 | 5832 sub ecx, 8 |
5434 jg wloop | 5833 jg wloop |
5435 ret | 5834 ret |
5436 } | 5835 } |
5437 } | 5836 } |
5438 | 5837 |
5439 #ifdef HAS_ARGBSHUFFLEROW_AVX2 | 5838 #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
5440 __declspec(naked) __declspec(align(16)) | 5839 __declspec(naked) |
5441 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 5840 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
5442 const uint8* shuffler, int pix) { | 5841 const uint8* shuffler, int pix) { |
5443 __asm { | 5842 __asm { |
5444 mov eax, [esp + 4] // src_argb | 5843 mov eax, [esp + 4] // src_argb |
5445 mov edx, [esp + 8] // dst_argb | 5844 mov edx, [esp + 8] // dst_argb |
5446 mov ecx, [esp + 12] // shuffler | 5845 mov ecx, [esp + 12] // shuffler |
5447 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. | 5846 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. |
5448 mov ecx, [esp + 16] // pix | 5847 mov ecx, [esp + 16] // pix |
5449 | 5848 |
5450 wloop: | 5849 wloop: |
5451 vmovdqu ymm0, [eax] | 5850 vmovdqu ymm0, [eax] |
5452 vmovdqu ymm1, [eax + 32] | 5851 vmovdqu ymm1, [eax + 32] |
5453 lea eax, [eax + 64] | 5852 lea eax, [eax + 64] |
5454 vpshufb ymm0, ymm0, ymm5 | 5853 vpshufb ymm0, ymm0, ymm5 |
5455 vpshufb ymm1, ymm1, ymm5 | 5854 vpshufb ymm1, ymm1, ymm5 |
5456 vmovdqu [edx], ymm0 | 5855 vmovdqu [edx], ymm0 |
5457 vmovdqu [edx + 32], ymm1 | 5856 vmovdqu [edx + 32], ymm1 |
5458 lea edx, [edx + 64] | 5857 lea edx, [edx + 64] |
5459 sub ecx, 16 | 5858 sub ecx, 16 |
5460 jg wloop | 5859 jg wloop |
5461 | 5860 |
5462 vzeroupper | 5861 vzeroupper |
5463 ret | 5862 ret |
5464 } | 5863 } |
5465 } | 5864 } |
5466 #endif // HAS_ARGBSHUFFLEROW_AVX2 | 5865 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
5467 | 5866 |
5468 __declspec(naked) __declspec(align(16)) | 5867 __declspec(naked) |
5469 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 5868 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
5470 const uint8* shuffler, int pix) { | 5869 const uint8* shuffler, int pix) { |
5471 __asm { | 5870 __asm { |
5472 push ebx | 5871 push ebx |
5473 push esi | 5872 push esi |
5474 mov eax, [esp + 8 + 4] // src_argb | 5873 mov eax, [esp + 8 + 4] // src_argb |
5475 mov edx, [esp + 8 + 8] // dst_argb | 5874 mov edx, [esp + 8 + 8] // dst_argb |
5476 mov esi, [esp + 8 + 12] // shuffler | 5875 mov esi, [esp + 8 + 12] // shuffler |
5477 mov ecx, [esp + 8 + 16] // pix | 5876 mov ecx, [esp + 8 + 16] // pix |
5478 pxor xmm5, xmm5 | 5877 pxor xmm5, xmm5 |
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5580 ret | 5979 ret |
5581 } | 5980 } |
5582 } | 5981 } |
5583 | 5982 |
5584 // YUY2 - Macro-pixel = 2 image pixels | 5983 // YUY2 - Macro-pixel = 2 image pixels |
5585 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... | 5984 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... |
5586 | 5985 |
5587 // UYVY - Macro-pixel = 2 image pixels | 5986 // UYVY - Macro-pixel = 2 image pixels |
5588 // U0Y0V0Y1 | 5987 // U0Y0V0Y1 |
5589 | 5988 |
5590 __declspec(naked) __declspec(align(16)) | 5989 __declspec(naked) |
5591 void I422ToYUY2Row_SSE2(const uint8* src_y, | 5990 void I422ToYUY2Row_SSE2(const uint8* src_y, |
5592 const uint8* src_u, | 5991 const uint8* src_u, |
5593 const uint8* src_v, | 5992 const uint8* src_v, |
5594 uint8* dst_frame, int width) { | 5993 uint8* dst_frame, int width) { |
5595 __asm { | 5994 __asm { |
5596 push esi | 5995 push esi |
5597 push edi | 5996 push edi |
5598 mov eax, [esp + 8 + 4] // src_y | 5997 mov eax, [esp + 8 + 4] // src_y |
5599 mov esi, [esp + 8 + 8] // src_u | 5998 mov esi, [esp + 8 + 8] // src_u |
5600 mov edx, [esp + 8 + 12] // src_v | 5999 mov edx, [esp + 8 + 12] // src_v |
(...skipping 16 matching lines...) Expand all Loading... |
5617 lea edi, [edi + 32] | 6016 lea edi, [edi + 32] |
5618 sub ecx, 16 | 6017 sub ecx, 16 |
5619 jg convertloop | 6018 jg convertloop |
5620 | 6019 |
5621 pop edi | 6020 pop edi |
5622 pop esi | 6021 pop esi |
5623 ret | 6022 ret |
5624 } | 6023 } |
5625 } | 6024 } |
5626 | 6025 |
5627 __declspec(naked) __declspec(align(16)) | 6026 __declspec(naked) |
5628 void I422ToUYVYRow_SSE2(const uint8* src_y, | 6027 void I422ToUYVYRow_SSE2(const uint8* src_y, |
5629 const uint8* src_u, | 6028 const uint8* src_u, |
5630 const uint8* src_v, | 6029 const uint8* src_v, |
5631 uint8* dst_frame, int width) { | 6030 uint8* dst_frame, int width) { |
5632 __asm { | 6031 __asm { |
5633 push esi | 6032 push esi |
5634 push edi | 6033 push edi |
5635 mov eax, [esp + 8 + 4] // src_y | 6034 mov eax, [esp + 8 + 4] // src_y |
5636 mov esi, [esp + 8 + 8] // src_u | 6035 mov esi, [esp + 8 + 8] // src_u |
5637 mov edx, [esp + 8 + 12] // src_v | 6036 mov edx, [esp + 8 + 12] // src_v |
(...skipping 17 matching lines...) Expand all Loading... |
5655 sub ecx, 16 | 6054 sub ecx, 16 |
5656 jg convertloop | 6055 jg convertloop |
5657 | 6056 |
5658 pop edi | 6057 pop edi |
5659 pop esi | 6058 pop esi |
5660 ret | 6059 ret |
5661 } | 6060 } |
5662 } | 6061 } |
5663 | 6062 |
5664 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 | 6063 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 |
5665 __declspec(naked) __declspec(align(16)) | 6064 __declspec(naked) |
5666 void ARGBPolynomialRow_SSE2(const uint8* src_argb, | 6065 void ARGBPolynomialRow_SSE2(const uint8* src_argb, |
5667 uint8* dst_argb, const float* poly, | 6066 uint8* dst_argb, const float* poly, |
5668 int width) { | 6067 int width) { |
5669 __asm { | 6068 __asm { |
5670 push esi | 6069 push esi |
5671 mov eax, [esp + 4 + 4] /* src_argb */ | 6070 mov eax, [esp + 4 + 4] /* src_argb */ |
5672 mov edx, [esp + 4 + 8] /* dst_argb */ | 6071 mov edx, [esp + 4 + 8] /* dst_argb */ |
5673 mov esi, [esp + 4 + 12] /* poly */ | 6072 mov esi, [esp + 4 + 12] /* poly */ |
5674 mov ecx, [esp + 4 + 16] /* width */ | 6073 mov ecx, [esp + 4 + 16] /* width */ |
5675 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. | 6074 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5714 lea edx, [edx + 8] | 6113 lea edx, [edx + 8] |
5715 sub ecx, 2 | 6114 sub ecx, 2 |
5716 jg convertloop | 6115 jg convertloop |
5717 pop esi | 6116 pop esi |
5718 ret | 6117 ret |
5719 } | 6118 } |
5720 } | 6119 } |
5721 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 | 6120 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 |
5722 | 6121 |
5723 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 | 6122 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 |
5724 __declspec(naked) __declspec(align(16)) | 6123 __declspec(naked) |
5725 void ARGBPolynomialRow_AVX2(const uint8* src_argb, | 6124 void ARGBPolynomialRow_AVX2(const uint8* src_argb, |
5726 uint8* dst_argb, const float* poly, | 6125 uint8* dst_argb, const float* poly, |
5727 int width) { | 6126 int width) { |
5728 __asm { | 6127 __asm { |
5729 mov eax, [esp + 4] /* src_argb */ | 6128 mov eax, [esp + 4] /* src_argb */ |
5730 mov edx, [esp + 8] /* dst_argb */ | 6129 mov edx, [esp + 8] /* dst_argb */ |
5731 mov ecx, [esp + 12] /* poly */ | 6130 mov ecx, [esp + 12] /* poly */ |
5732 vbroadcastf128 ymm4, [ecx] // C0 | 6131 vbroadcastf128 ymm4, [ecx] // C0 |
5733 vbroadcastf128 ymm5, [ecx + 16] // C1 | 6132 vbroadcastf128 ymm5, [ecx + 16] // C1 |
5734 vbroadcastf128 ymm6, [ecx + 32] // C2 | 6133 vbroadcastf128 ymm6, [ecx + 32] // C2 |
(...skipping 19 matching lines...) Expand all Loading... |
5754 sub ecx, 2 | 6153 sub ecx, 2 |
5755 jg convertloop | 6154 jg convertloop |
5756 vzeroupper | 6155 vzeroupper |
5757 ret | 6156 ret |
5758 } | 6157 } |
5759 } | 6158 } |
5760 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 | 6159 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 |
5761 | 6160 |
5762 #ifdef HAS_ARGBCOLORTABLEROW_X86 | 6161 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
5763 // Tranform ARGB pixels with color table. | 6162 // Tranform ARGB pixels with color table. |
5764 __declspec(naked) __declspec(align(16)) | 6163 __declspec(naked) |
5765 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 6164 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, |
5766 int width) { | 6165 int width) { |
5767 __asm { | 6166 __asm { |
5768 push esi | 6167 push esi |
5769 mov eax, [esp + 4 + 4] /* dst_argb */ | 6168 mov eax, [esp + 4 + 4] /* dst_argb */ |
5770 mov esi, [esp + 4 + 8] /* table_argb */ | 6169 mov esi, [esp + 4 + 8] /* table_argb */ |
5771 mov ecx, [esp + 4 + 12] /* width */ | 6170 mov ecx, [esp + 4 + 12] /* width */ |
5772 | 6171 |
5773 // 1 pixel loop. | 6172 // 1 pixel loop. |
5774 convertloop: | 6173 convertloop: |
(...skipping 13 matching lines...) Expand all Loading... |
5788 dec ecx | 6187 dec ecx |
5789 jg convertloop | 6188 jg convertloop |
5790 pop esi | 6189 pop esi |
5791 ret | 6190 ret |
5792 } | 6191 } |
5793 } | 6192 } |
5794 #endif // HAS_ARGBCOLORTABLEROW_X86 | 6193 #endif // HAS_ARGBCOLORTABLEROW_X86 |
5795 | 6194 |
5796 #ifdef HAS_RGBCOLORTABLEROW_X86 | 6195 #ifdef HAS_RGBCOLORTABLEROW_X86 |
5797 // Tranform RGB pixels with color table. | 6196 // Tranform RGB pixels with color table. |
5798 __declspec(naked) __declspec(align(16)) | 6197 __declspec(naked) |
5799 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { | 6198 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { |
5800 __asm { | 6199 __asm { |
5801 push esi | 6200 push esi |
5802 mov eax, [esp + 4 + 4] /* dst_argb */ | 6201 mov eax, [esp + 4 + 4] /* dst_argb */ |
5803 mov esi, [esp + 4 + 8] /* table_argb */ | 6202 mov esi, [esp + 4 + 8] /* table_argb */ |
5804 mov ecx, [esp + 4 + 12] /* width */ | 6203 mov ecx, [esp + 4 + 12] /* width */ |
5805 | 6204 |
5806 // 1 pixel loop. | 6205 // 1 pixel loop. |
5807 convertloop: | 6206 convertloop: |
5808 movzx edx, byte ptr [eax] | 6207 movzx edx, byte ptr [eax] |
(...skipping 10 matching lines...) Expand all Loading... |
5819 jg convertloop | 6218 jg convertloop |
5820 | 6219 |
5821 pop esi | 6220 pop esi |
5822 ret | 6221 ret |
5823 } | 6222 } |
5824 } | 6223 } |
5825 #endif // HAS_RGBCOLORTABLEROW_X86 | 6224 #endif // HAS_RGBCOLORTABLEROW_X86 |
5826 | 6225 |
5827 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6226 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5828 // Tranform RGB pixels with luma table. | 6227 // Tranform RGB pixels with luma table. |
5829 __declspec(naked) __declspec(align(16)) | 6228 __declspec(naked) |
5830 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 6229 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
5831 int width, | 6230 int width, |
5832 const uint8* luma, uint32 lumacoeff) { | 6231 const uint8* luma, uint32 lumacoeff) { |
5833 __asm { | 6232 __asm { |
5834 push esi | 6233 push esi |
5835 push edi | 6234 push edi |
5836 mov eax, [esp + 8 + 4] /* src_argb */ | 6235 mov eax, [esp + 8 + 4] /* src_argb */ |
5837 mov edi, [esp + 8 + 8] /* dst_argb */ | 6236 mov edi, [esp + 8 + 8] /* dst_argb */ |
5838 mov ecx, [esp + 8 + 12] /* width */ | 6237 mov ecx, [esp + 8 + 12] /* width */ |
5839 movd xmm2, dword ptr [esp + 8 + 16] // luma table | 6238 movd xmm2, dword ptr [esp + 8 + 16] // luma table |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5917 jg convertloop | 6316 jg convertloop |
5918 | 6317 |
5919 pop edi | 6318 pop edi |
5920 pop esi | 6319 pop esi |
5921 ret | 6320 ret |
5922 } | 6321 } |
5923 } | 6322 } |
5924 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6323 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5925 | 6324 |
5926 #endif // defined(_M_X64) | 6325 #endif // defined(_M_X64) |
5927 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) | 6326 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
5928 | 6327 |
5929 #ifdef __cplusplus | 6328 #ifdef __cplusplus |
5930 } // extern "C" | 6329 } // extern "C" |
5931 } // namespace libyuv | 6330 } // namespace libyuv |
5932 #endif | 6331 #endif |
OLD | NEW |