| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "libyuv/row.h" | 11 #include "libyuv/row.h" |
| 12 | 12 |
| 13 #if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) | 13 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \ |
| 14 defined(_MSC_VER) && !defined(__clang__) |
| 14 #include <emmintrin.h> | 15 #include <emmintrin.h> |
| 15 #include <tmmintrin.h> // For _mm_maddubs_epi16 | 16 #include <tmmintrin.h> // For _mm_maddubs_epi16 |
| 16 #endif | 17 #endif |
| 17 | 18 |
| 18 #ifdef __cplusplus | 19 #ifdef __cplusplus |
| 19 namespace libyuv { | 20 namespace libyuv { |
| 20 extern "C" { | 21 extern "C" { |
| 21 #endif | 22 #endif |
| 22 | 23 |
| 23 // This module is for Visual C. | 24 // This module is for Visual C. |
| 24 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ | 25 #if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \ |
| 25 (defined(_M_IX86) || defined(_M_X64)) | 26 defined(_MSC_VER) && !defined(__clang__) |
| 26 | |
| 27 // YUV to RGB conversion constants. | |
| 28 // Y contribution to R,G,B. Scale and bias. | |
| 29 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ | |
| 30 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ | |
| 31 | |
| 32 // U and V contributions to R,G,B. | |
| 33 #define UB -128 /* -min(128, round(2.018 * 64)) */ | |
| 34 #define UG 25 /* -round(-0.391 * 64) */ | |
| 35 #define VG 52 /* -round(-0.813 * 64) */ | |
| 36 #define VR -102 /* -round(1.596 * 64) */ | |
| 37 | |
| 38 // Bias values to subtract 16 from Y and 128 from U and V. | |
| 39 #define BB (UB * 128 - YGB) | |
| 40 #define BG (UG * 128 + VG * 128 - YGB) | |
| 41 #define BR (VR * 128 - YGB) | |
| 42 | 27 |
| 43 struct YuvConstants { | 28 struct YuvConstants { |
| 44 lvec8 kUVToB; // 0 | 29 lvec8 kUVToB; // 0 |
| 45 lvec8 kUVToG; // 32 | 30 lvec8 kUVToG; // 32 |
| 46 lvec8 kUVToR; // 64 | 31 lvec8 kUVToR; // 64 |
| 47 lvec16 kUVBiasB; // 96 | 32 lvec16 kUVBiasB; // 96 |
| 48 lvec16 kUVBiasG; // 128 | 33 lvec16 kUVBiasG; // 128 |
| 49 lvec16 kUVBiasR; // 160 | 34 lvec16 kUVBiasR; // 160 |
| 50 lvec16 kYToRgb; // 192 | 35 lvec16 kYToRgb; // 192 |
| 51 }; | 36 }; |
| 52 | 37 |
| 38 // BT.601 YUV to RGB reference |
| 39 // R = (Y - 16) * 1.164 - V * -1.596 |
| 40 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 |
| 41 // B = (Y - 16) * 1.164 - U * -2.018 |
| 42 |
| 43 // Y contribution to R,G,B. Scale and bias. |
| 44 // TODO(fbarchard): Consider moving constants into a common header. |
| 45 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ |
| 46 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ |
| 47 |
| 48 // U and V contributions to R,G,B. |
| 49 #define UB -128 /* max(-128, round(-2.018 * 64)) */ |
| 50 #define UG 25 /* round(0.391 * 64) */ |
| 51 #define VG 52 /* round(0.813 * 64) */ |
| 52 #define VR -102 /* round(-1.596 * 64) */ |
| 53 |
| 54 // Bias values to subtract 16 from Y and 128 from U and V. |
| 55 #define BB (UB * 128 + YGB) |
| 56 #define BG (UG * 128 + VG * 128 + YGB) |
| 57 #define BR (VR * 128 + YGB) |
| 58 |
| 53 // BT601 constants for YUV to RGB. | 59 // BT601 constants for YUV to RGB. |
| 54 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { | 60 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { |
| 55 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, | 61 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, |
| 56 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, | 62 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, |
| 57 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, | 63 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, |
| 58 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, | 64 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, |
| 59 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, | 65 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, |
| 60 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, | 66 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, |
| 61 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 67 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
| 62 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 68 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
| 63 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 69 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
| 64 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 70 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
| 65 }; | 71 }; |
| 66 | 72 |
| 67 // BT601 constants for NV21 where chroma plane is VU instead of UV. | 73 // BT601 constants for NV21 where chroma plane is VU instead of UV. |
| 68 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { | 74 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
| 69 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, | 75 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, |
| 70 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, | 76 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, |
| 71 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, | 77 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, |
| 72 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, | 78 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, |
| 73 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, | 79 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, |
| 74 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, | 80 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, |
| 75 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 81 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
| 76 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 82 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
| 77 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 83 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
| 78 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 84 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
| 79 }; | 85 }; |
| 80 | 86 |
| 87 #undef YG |
| 88 #undef YGB |
| 89 #undef UB |
| 90 #undef UG |
| 91 #undef VG |
| 92 #undef VR |
| 93 #undef BB |
| 94 #undef BG |
| 95 #undef BR |
| 96 |
| 97 // JPEG YUV to RGB reference |
| 98 // * R = Y - V * -1.40200 |
| 99 // * G = Y - U * 0.34414 - V * 0.71414 |
| 100 // * B = Y - U * -1.77200 |
| 101 |
| 102 // Y contribution to R,G,B. Scale and bias. |
| 103 // TODO(fbarchard): Consider moving constants into a common header. |
| 104 #define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ |
| 105 #define YGBJ 32 /* 64 / 2 */ |
| 106 |
| 107 // U and V contributions to R,G,B. |
| 108 #define UBJ -113 /* round(-1.77200 * 64) */ |
| 109 #define UGJ 22 /* round(0.34414 * 64) */ |
| 110 #define VGJ 46 /* round(0.71414 * 64) */ |
| 111 #define VRJ -90 /* round(-1.40200 * 64) */ |
| 112 |
| 113 // Bias values to subtract 16 from Y and 128 from U and V. |
| 114 #define BBJ (UBJ * 128 + YGBJ) |
| 115 #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) |
| 116 #define BRJ (VRJ * 128 + YGBJ) |
| 117 |
| 118 // JPEG constants for YUV to RGB. |
| 119 static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { |
| 120 { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, |
| 121 UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, |
| 122 { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 123 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 124 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 125 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, |
| 126 { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, |
| 127 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, |
| 128 { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, |
| 129 BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, |
| 130 { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, |
| 131 BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, |
| 132 { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, |
| 133 BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, |
| 134 { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, |
| 135 YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } |
| 136 }; |
| 137 |
| 138 #undef YGJ |
| 139 #undef YGBJ |
| 140 #undef UBJ |
| 141 #undef UGJ |
| 142 #undef VGJ |
| 143 #undef VRJ |
| 144 #undef BBJ |
| 145 #undef BGJ |
| 146 #undef BRJ |
| 147 |
| 81 // 64 bit | 148 // 64 bit |
| 82 #if defined(_M_X64) | 149 #if defined(_M_X64) |
| 83 | 150 #if defined(HAS_I422TOARGBROW_SSSE3) |
| 84 __declspec(align(16)) | |
| 85 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 151 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
| 86 const uint8* u_buf, | 152 const uint8* u_buf, |
| 87 const uint8* v_buf, | 153 const uint8* v_buf, |
| 88 uint8* dst_argb, | 154 uint8* dst_argb, |
| 89 int width) { | 155 int width) { |
| 90 __m128i xmm0, xmm1, xmm2, xmm3; | 156 __m128i xmm0, xmm1, xmm2, xmm3; |
| 91 const __m128i xmm5 = _mm_set1_epi8(-1); | 157 const __m128i xmm5 = _mm_set1_epi8(-1); |
| 92 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; | 158 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; |
| 93 | 159 |
| 94 while (width > 0) { | 160 while (width > 0) { |
| (...skipping 29 matching lines...) Expand all Loading... |
| 124 | 190 |
| 125 _mm_storeu_si128((__m128i *)dst_argb, xmm0); | 191 _mm_storeu_si128((__m128i *)dst_argb, xmm0); |
| 126 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); | 192 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); |
| 127 | 193 |
| 128 y_buf += 8; | 194 y_buf += 8; |
| 129 u_buf += 4; | 195 u_buf += 4; |
| 130 dst_argb += 32; | 196 dst_argb += 32; |
| 131 width -= 8; | 197 width -= 8; |
| 132 } | 198 } |
| 133 } | 199 } |
| 134 | 200 #endif |
| 135 // 32 bit | 201 // 32 bit |
| 136 #else // defined(_M_X64) | 202 #else // defined(_M_X64) |
| 137 | |
| 138 #ifdef HAS_ARGBTOYROW_SSSE3 | 203 #ifdef HAS_ARGBTOYROW_SSSE3 |
| 139 | 204 |
| 140 // Constants for ARGB. | 205 // Constants for ARGB. |
| 141 static const vec8 kARGBToY = { | 206 static const vec8 kARGBToY = { |
| 142 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 | 207 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 |
| 143 }; | 208 }; |
| 144 | 209 |
| 145 // JPeg full range. | 210 // JPeg full range. |
| 146 static const vec8 kARGBToYJ = { | 211 static const vec8 kARGBToYJ = { |
| 147 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 | 212 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 |
| (...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 250 static const uvec8 kShuffleMaskARGBToRGB24_0 = { | 315 static const uvec8 kShuffleMaskARGBToRGB24_0 = { |
| 251 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u | 316 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u |
| 252 }; | 317 }; |
| 253 | 318 |
| 254 // Shuffle table for converting ARGB to RAW. | 319 // Shuffle table for converting ARGB to RAW. |
| 255 static const uvec8 kShuffleMaskARGBToRAW_0 = { | 320 static const uvec8 kShuffleMaskARGBToRAW_0 = { |
| 256 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u | 321 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u |
| 257 }; | 322 }; |
| 258 | 323 |
| 259 // Duplicates gray value 3 times and fills in alpha opaque. | 324 // Duplicates gray value 3 times and fills in alpha opaque. |
| 260 __declspec(naked) __declspec(align(16)) | 325 __declspec(naked) |
| 261 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 326 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
| 262 __asm { | 327 __asm { |
| 263 mov eax, [esp + 4] // src_y | 328 mov eax, [esp + 4] // src_y |
| 264 mov edx, [esp + 8] // dst_argb | 329 mov edx, [esp + 8] // dst_argb |
| 265 mov ecx, [esp + 12] // pix | 330 mov ecx, [esp + 12] // pix |
| 266 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 331 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| 267 pslld xmm5, 24 | 332 pslld xmm5, 24 |
| 268 | 333 |
| 269 convertloop: | 334 convertloop: |
| 270 movq xmm0, qword ptr [eax] | 335 movq xmm0, qword ptr [eax] |
| 271 lea eax, [eax + 8] | 336 lea eax, [eax + 8] |
| 272 punpcklbw xmm0, xmm0 | 337 punpcklbw xmm0, xmm0 |
| 273 movdqa xmm1, xmm0 | 338 movdqa xmm1, xmm0 |
| 274 punpcklwd xmm0, xmm0 | 339 punpcklwd xmm0, xmm0 |
| 275 punpckhwd xmm1, xmm1 | 340 punpckhwd xmm1, xmm1 |
| 276 por xmm0, xmm5 | 341 por xmm0, xmm5 |
| 277 por xmm1, xmm5 | 342 por xmm1, xmm5 |
| 278 movdqu [edx], xmm0 | 343 movdqu [edx], xmm0 |
| 279 movdqu [edx + 16], xmm1 | 344 movdqu [edx + 16], xmm1 |
| 280 lea edx, [edx + 32] | 345 lea edx, [edx + 32] |
| 281 sub ecx, 8 | 346 sub ecx, 8 |
| 282 jg convertloop | 347 jg convertloop |
| 283 ret | 348 ret |
| 284 } | 349 } |
| 285 } | 350 } |
| 286 | 351 |
| 287 __declspec(naked) __declspec(align(16)) | 352 #ifdef HAS_J400TOARGBROW_AVX2 |
| 353 // Duplicates gray value 3 times and fills in alpha opaque. |
| 354 __declspec(naked) |
| 355 void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { |
| 356 __asm { |
| 357 mov eax, [esp + 4] // src_y |
| 358 mov edx, [esp + 8] // dst_argb |
| 359 mov ecx, [esp + 12] // pix |
| 360 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
| 361 vpslld ymm5, ymm5, 24 |
| 362 |
| 363 convertloop: |
| 364 vmovdqu xmm0, [eax] |
| 365 lea eax, [eax + 16] |
| 366 vpermq ymm0, ymm0, 0xd8 |
| 367 vpunpcklbw ymm0, ymm0, ymm0 |
| 368 vpermq ymm0, ymm0, 0xd8 |
| 369 vpunpckhwd ymm1, ymm0, ymm0 |
| 370 vpunpcklwd ymm0, ymm0, ymm0 |
| 371 vpor ymm0, ymm0, ymm5 |
| 372 vpor ymm1, ymm1, ymm5 |
| 373 vmovdqu [edx], ymm0 |
| 374 vmovdqu [edx + 32], ymm1 |
| 375 lea edx, [edx + 64] |
| 376 sub ecx, 16 |
| 377 jg convertloop |
| 378 vzeroupper |
| 379 ret |
| 380 } |
| 381 } |
| 382 #endif // HAS_J400TOARGBROW_AVX2 |
| 383 |
| 384 __declspec(naked) |
| 288 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 385 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
| 289 __asm { | 386 __asm { |
| 290 mov eax, [esp + 4] // src_rgb24 | 387 mov eax, [esp + 4] // src_rgb24 |
| 291 mov edx, [esp + 8] // dst_argb | 388 mov edx, [esp + 8] // dst_argb |
| 292 mov ecx, [esp + 12] // pix | 389 mov ecx, [esp + 12] // pix |
| 293 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 390 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| 294 pslld xmm5, 24 | 391 pslld xmm5, 24 |
| 295 movdqa xmm4, kShuffleMaskRGB24ToARGB | 392 movdqa xmm4, kShuffleMaskRGB24ToARGB |
| 296 | 393 |
| 297 convertloop: | 394 convertloop: |
| (...skipping 17 matching lines...) Expand all Loading... |
| 315 movdqu [edx + 16], xmm1 | 412 movdqu [edx + 16], xmm1 |
| 316 por xmm3, xmm5 | 413 por xmm3, xmm5 |
| 317 movdqu [edx + 48], xmm3 | 414 movdqu [edx + 48], xmm3 |
| 318 lea edx, [edx + 64] | 415 lea edx, [edx + 64] |
| 319 sub ecx, 16 | 416 sub ecx, 16 |
| 320 jg convertloop | 417 jg convertloop |
| 321 ret | 418 ret |
| 322 } | 419 } |
| 323 } | 420 } |
| 324 | 421 |
| 325 __declspec(naked) __declspec(align(16)) | 422 __declspec(naked) |
| 326 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, | 423 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, |
| 327 int pix) { | 424 int pix) { |
| 328 __asm { | 425 __asm { |
| 329 mov eax, [esp + 4] // src_raw | 426 mov eax, [esp + 4] // src_raw |
| 330 mov edx, [esp + 8] // dst_argb | 427 mov edx, [esp + 8] // dst_argb |
| 331 mov ecx, [esp + 12] // pix | 428 mov ecx, [esp + 12] // pix |
| 332 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 429 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| 333 pslld xmm5, 24 | 430 pslld xmm5, 24 |
| 334 movdqa xmm4, kShuffleMaskRAWToARGB | 431 movdqa xmm4, kShuffleMaskRAWToARGB |
| 335 | 432 |
| (...skipping 25 matching lines...) Expand all Loading... |
| 361 } | 458 } |
| 362 } | 459 } |
| 363 | 460 |
| 364 // pmul method to replicate bits. | 461 // pmul method to replicate bits. |
| 365 // Math to replicate bits: | 462 // Math to replicate bits: |
| 366 // (v << 8) | (v << 3) | 463 // (v << 8) | (v << 3) |
| 367 // v * 256 + v * 8 | 464 // v * 256 + v * 8 |
| 368 // v * (256 + 8) | 465 // v * (256 + 8) |
| 369 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 | 466 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
| 370 // 20 instructions. | 467 // 20 instructions. |
| 371 __declspec(naked) __declspec(align(16)) | 468 __declspec(naked) |
| 372 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, | 469 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, |
| 373 int pix) { | 470 int pix) { |
| 374 __asm { | 471 __asm { |
| 375 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 472 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 376 movd xmm5, eax | 473 movd xmm5, eax |
| 377 pshufd xmm5, xmm5, 0 | 474 pshufd xmm5, xmm5, 0 |
| 378 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits | 475 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| 379 movd xmm6, eax | 476 movd xmm6, eax |
| 380 pshufd xmm6, xmm6, 0 | 477 pshufd xmm6, xmm6, 0 |
| 381 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red | 478 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
| (...skipping 28 matching lines...) Expand all Loading... |
| 410 punpckhbw xmm2, xmm0 | 507 punpckhbw xmm2, xmm0 |
| 411 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB | 508 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
| 412 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB | 509 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
| 413 lea eax, [eax + 16] | 510 lea eax, [eax + 16] |
| 414 sub ecx, 8 | 511 sub ecx, 8 |
| 415 jg convertloop | 512 jg convertloop |
| 416 ret | 513 ret |
| 417 } | 514 } |
| 418 } | 515 } |
| 419 | 516 |
| 517 #ifdef HAS_RGB565TOARGBROW_AVX2 |
| 518 // pmul method to replicate bits. |
| 519 // Math to replicate bits: |
| 520 // (v << 8) | (v << 3) |
| 521 // v * 256 + v * 8 |
| 522 // v * (256 + 8) |
| 523 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
| 524 __declspec(naked) |
| 525 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, |
| 526 int pix) { |
| 527 __asm { |
| 528 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 529 vmovd xmm5, eax |
| 530 vbroadcastss ymm5, xmm5 |
| 531 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| 532 movd xmm6, eax |
| 533 vbroadcastss ymm6, xmm6 |
| 534 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| 535 vpsllw ymm3, ymm3, 11 |
| 536 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green |
| 537 vpsllw ymm4, ymm4, 10 |
| 538 vpsrlw ymm4, ymm4, 5 |
| 539 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| 540 vpsllw ymm7, ymm7, 8 |
| 541 |
| 542 mov eax, [esp + 4] // src_rgb565 |
| 543 mov edx, [esp + 8] // dst_argb |
| 544 mov ecx, [esp + 12] // pix |
| 545 sub edx, eax |
| 546 sub edx, eax |
| 547 |
| 548 convertloop: |
| 549 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 |
| 550 vpand ymm1, ymm0, ymm3 // R in upper 5 bits |
| 551 vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| 552 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| 553 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| 554 vpsllw ymm1, ymm1, 8 |
| 555 vpor ymm1, ymm1, ymm2 // RB |
| 556 vpand ymm0, ymm0, ymm4 // G in middle 6 bits |
| 557 vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) |
| 558 vpor ymm0, ymm0, ymm7 // AG |
| 559 vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| 560 vpermq ymm1, ymm1, 0xd8 |
| 561 vpunpckhbw ymm2, ymm1, ymm0 |
| 562 vpunpcklbw ymm1, ymm1, ymm0 |
| 563 vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB |
| 564 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB |
| 565 lea eax, [eax + 32] |
| 566 sub ecx, 16 |
| 567 jg convertloop |
| 568 vzeroupper |
| 569 ret |
| 570 } |
| 571 } |
| 572 #endif // HAS_RGB565TOARGBROW_AVX2 |
| 573 |
| 574 #ifdef HAS_ARGB1555TOARGBROW_AVX2 |
| 575 __declspec(naked) |
| 576 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, |
| 577 int pix) { |
| 578 __asm { |
| 579 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 580 vmovd xmm5, eax |
| 581 vbroadcastss ymm5, xmm5 |
| 582 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| 583 movd xmm6, eax |
| 584 vbroadcastss ymm6, xmm6 |
| 585 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| 586 vpsllw ymm3, ymm3, 11 |
| 587 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green |
| 588 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| 589 vpsllw ymm7, ymm7, 8 |
| 590 |
| 591 mov eax, [esp + 4] // src_argb1555 |
| 592 mov edx, [esp + 8] // dst_argb |
| 593 mov ecx, [esp + 12] // pix |
| 594 sub edx, eax |
| 595 sub edx, eax |
| 596 |
| 597 convertloop: |
| 598 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 |
| 599 vpsllw ymm1, ymm0, 1 // R in upper 5 bits |
| 600 vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| 601 vpand ymm1, ymm1, ymm3 |
| 602 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| 603 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| 604 vpsllw ymm1, ymm1, 8 |
| 605 vpor ymm1, ymm1, ymm2 // RB |
| 606 vpsraw ymm2, ymm0, 8 // A |
| 607 vpand ymm0, ymm0, ymm4 // G in middle 5 bits |
| 608 vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) |
| 609 vpand ymm2, ymm2, ymm7 |
| 610 vpor ymm0, ymm0, ymm2 // AG |
| 611 vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| 612 vpermq ymm1, ymm1, 0xd8 |
| 613 vpunpckhbw ymm2, ymm1, ymm0 |
| 614 vpunpcklbw ymm1, ymm1, ymm0 |
| 615 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB |
| 616 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB |
| 617 lea eax, [eax + 32] |
| 618 sub ecx, 16 |
| 619 jg convertloop |
| 620 vzeroupper |
| 621 ret |
| 622 } |
| 623 } |
| 624 #endif // HAS_ARGB1555TOARGBROW_AVX2 |
| 625 |
| 626 #ifdef HAS_ARGB4444TOARGBROW_AVX2 |
| 627 __declspec(naked) |
| 628 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, |
| 629 int pix) { |
| 630 __asm { |
| 631 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
| 632 vmovd xmm4, eax |
| 633 vbroadcastss ymm4, xmm4 |
| 634 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles |
| 635 mov eax, [esp + 4] // src_argb4444 |
| 636 mov edx, [esp + 8] // dst_argb |
| 637 mov ecx, [esp + 12] // pix |
| 638 sub edx, eax |
| 639 sub edx, eax |
| 640 |
| 641 convertloop: |
| 642 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 |
| 643 vpand ymm2, ymm0, ymm5 // mask high nibbles |
| 644 vpand ymm0, ymm0, ymm4 // mask low nibbles |
| 645 vpsrlw ymm3, ymm2, 4 |
| 646 vpsllw ymm1, ymm0, 4 |
| 647 vpor ymm2, ymm2, ymm3 |
| 648 vpor ymm0, ymm0, ymm1 |
| 649 vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
| 650 vpermq ymm2, ymm2, 0xd8 |
| 651 vpunpckhbw ymm1, ymm0, ymm2 |
| 652 vpunpcklbw ymm0, ymm0, ymm2 |
| 653 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB |
| 654 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB |
| 655 lea eax, [eax + 32] |
| 656 sub ecx, 16 |
| 657 jg convertloop |
| 658 vzeroupper |
| 659 ret |
| 660 } |
| 661 } |
| 662 #endif // HAS_ARGB4444TOARGBROW_AVX2 |
| 663 |
| 420 // 24 instructions | 664 // 24 instructions |
| 421 __declspec(naked) __declspec(align(16)) | 665 __declspec(naked) |
| 422 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, | 666 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, |
| 423 int pix) { | 667 int pix) { |
| 424 __asm { | 668 __asm { |
| 425 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 669 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 426 movd xmm5, eax | 670 movd xmm5, eax |
| 427 pshufd xmm5, xmm5, 0 | 671 pshufd xmm5, xmm5, 0 |
| 428 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits | 672 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| 429 movd xmm6, eax | 673 movd xmm6, eax |
| 430 pshufd xmm6, xmm6, 0 | 674 pshufd xmm6, xmm6, 0 |
| 431 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red | 675 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 464 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB | 708 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
| 465 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB | 709 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
| 466 lea eax, [eax + 16] | 710 lea eax, [eax + 16] |
| 467 sub ecx, 8 | 711 sub ecx, 8 |
| 468 jg convertloop | 712 jg convertloop |
| 469 ret | 713 ret |
| 470 } | 714 } |
| 471 } | 715 } |
| 472 | 716 |
| 473 // 18 instructions. | 717 // 18 instructions. |
| 474 __declspec(naked) __declspec(align(16)) | 718 __declspec(naked) |
| 475 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, | 719 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, |
| 476 int pix) { | 720 int pix) { |
| 477 __asm { | 721 __asm { |
| 478 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f | 722 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
| 479 movd xmm4, eax | 723 movd xmm4, eax |
| 480 pshufd xmm4, xmm4, 0 | 724 pshufd xmm4, xmm4, 0 |
| 481 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles | 725 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles |
| 482 pslld xmm5, 4 | 726 pslld xmm5, 4 |
| 483 mov eax, [esp + 4] // src_argb4444 | 727 mov eax, [esp + 4] // src_argb4444 |
| 484 mov edx, [esp + 8] // dst_argb | 728 mov edx, [esp + 8] // dst_argb |
| (...skipping 17 matching lines...) Expand all Loading... |
| 502 punpckhbw xmm1, xmm2 | 746 punpckhbw xmm1, xmm2 |
| 503 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB | 747 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB |
| 504 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB | 748 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB |
| 505 lea eax, [eax + 16] | 749 lea eax, [eax + 16] |
| 506 sub ecx, 8 | 750 sub ecx, 8 |
| 507 jg convertloop | 751 jg convertloop |
| 508 ret | 752 ret |
| 509 } | 753 } |
| 510 } | 754 } |
| 511 | 755 |
| 512 __declspec(naked) __declspec(align(16)) | 756 __declspec(naked) |
| 513 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 757 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 514 __asm { | 758 __asm { |
| 515 mov eax, [esp + 4] // src_argb | 759 mov eax, [esp + 4] // src_argb |
| 516 mov edx, [esp + 8] // dst_rgb | 760 mov edx, [esp + 8] // dst_rgb |
| 517 mov ecx, [esp + 12] // pix | 761 mov ecx, [esp + 12] // pix |
| 518 movdqa xmm6, kShuffleMaskARGBToRGB24 | 762 movdqa xmm6, kShuffleMaskARGBToRGB24 |
| 519 | 763 |
| 520 convertloop: | 764 convertloop: |
| 521 movdqu xmm0, [eax] // fetch 16 pixels of argb | 765 movdqu xmm0, [eax] // fetch 16 pixels of argb |
| 522 movdqu xmm1, [eax + 16] | 766 movdqu xmm1, [eax + 16] |
| (...skipping 17 matching lines...) Expand all Loading... |
| 540 por xmm2, xmm3 // 12 bytes from 3 for 2 | 784 por xmm2, xmm3 // 12 bytes from 3 for 2 |
| 541 movdqu [edx + 16], xmm1 // store 1 | 785 movdqu [edx + 16], xmm1 // store 1 |
| 542 movdqu [edx + 32], xmm2 // store 2 | 786 movdqu [edx + 32], xmm2 // store 2 |
| 543 lea edx, [edx + 48] | 787 lea edx, [edx + 48] |
| 544 sub ecx, 16 | 788 sub ecx, 16 |
| 545 jg convertloop | 789 jg convertloop |
| 546 ret | 790 ret |
| 547 } | 791 } |
| 548 } | 792 } |
| 549 | 793 |
| 550 __declspec(naked) __declspec(align(16)) | 794 __declspec(naked) |
| 551 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 795 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 552 __asm { | 796 __asm { |
| 553 mov eax, [esp + 4] // src_argb | 797 mov eax, [esp + 4] // src_argb |
| 554 mov edx, [esp + 8] // dst_rgb | 798 mov edx, [esp + 8] // dst_rgb |
| 555 mov ecx, [esp + 12] // pix | 799 mov ecx, [esp + 12] // pix |
| 556 movdqa xmm6, kShuffleMaskARGBToRAW | 800 movdqa xmm6, kShuffleMaskARGBToRAW |
| 557 | 801 |
| 558 convertloop: | 802 convertloop: |
| 559 movdqu xmm0, [eax] // fetch 16 pixels of argb | 803 movdqu xmm0, [eax] // fetch 16 pixels of argb |
| 560 movdqu xmm1, [eax + 16] | 804 movdqu xmm1, [eax + 16] |
| (...skipping 17 matching lines...) Expand all Loading... |
| 578 por xmm2, xmm3 // 12 bytes from 3 for 2 | 822 por xmm2, xmm3 // 12 bytes from 3 for 2 |
| 579 movdqu [edx + 16], xmm1 // store 1 | 823 movdqu [edx + 16], xmm1 // store 1 |
| 580 movdqu [edx + 32], xmm2 // store 2 | 824 movdqu [edx + 32], xmm2 // store 2 |
| 581 lea edx, [edx + 48] | 825 lea edx, [edx + 48] |
| 582 sub ecx, 16 | 826 sub ecx, 16 |
| 583 jg convertloop | 827 jg convertloop |
| 584 ret | 828 ret |
| 585 } | 829 } |
| 586 } | 830 } |
| 587 | 831 |
| 588 __declspec(naked) __declspec(align(16)) | 832 // 4 pixels |
| 833 __declspec(naked) |
| 589 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 834 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 590 __asm { | 835 __asm { |
| 591 mov eax, [esp + 4] // src_argb | 836 mov eax, [esp + 4] // src_argb |
| 592 mov edx, [esp + 8] // dst_rgb | 837 mov edx, [esp + 8] // dst_rgb |
| 593 mov ecx, [esp + 12] // pix | 838 mov ecx, [esp + 12] // pix |
| 594 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f | 839 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
| 595 psrld xmm3, 27 | 840 psrld xmm3, 27 |
| 596 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 | 841 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
| 597 psrld xmm4, 26 | 842 psrld xmm4, 26 |
| 598 pslld xmm4, 5 | 843 pslld xmm4, 5 |
| (...skipping 16 matching lines...) Expand all Loading... |
| 615 packssdw xmm0, xmm0 | 860 packssdw xmm0, xmm0 |
| 616 lea eax, [eax + 16] | 861 lea eax, [eax + 16] |
| 617 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 | 862 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
| 618 lea edx, [edx + 8] | 863 lea edx, [edx + 8] |
| 619 sub ecx, 4 | 864 sub ecx, 4 |
| 620 jg convertloop | 865 jg convertloop |
| 621 ret | 866 ret |
| 622 } | 867 } |
| 623 } | 868 } |
| 624 | 869 |
| 870 // 8 pixels |
| 871 __declspec(naked) |
| 872 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, |
| 873 const uint32 dither4, int pix) { |
| 874 __asm { |
| 875 |
| 876 mov eax, [esp + 4] // src_argb |
| 877 mov edx, [esp + 8] // dst_rgb |
| 878 movd xmm6, [esp + 12] // dither4 |
| 879 mov ecx, [esp + 16] // pix |
| 880 punpcklbw xmm6, xmm6 // make dither 16 bytes |
| 881 movdqa xmm7, xmm6 |
| 882 punpcklwd xmm6, xmm6 |
| 883 punpckhwd xmm7, xmm7 |
| 884 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
| 885 psrld xmm3, 27 |
| 886 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
| 887 psrld xmm4, 26 |
| 888 pslld xmm4, 5 |
| 889 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
| 890 pslld xmm5, 11 |
| 891 |
| 892 convertloop: |
| 893 movdqu xmm0, [eax] // fetch 4 pixels of argb |
| 894 paddusb xmm0, xmm6 // add dither |
| 895 movdqa xmm1, xmm0 // B |
| 896 movdqa xmm2, xmm0 // G |
| 897 pslld xmm0, 8 // R |
| 898 psrld xmm1, 3 // B |
| 899 psrld xmm2, 5 // G |
| 900 psrad xmm0, 16 // R |
| 901 pand xmm1, xmm3 // B |
| 902 pand xmm2, xmm4 // G |
| 903 pand xmm0, xmm5 // R |
| 904 por xmm1, xmm2 // BG |
| 905 por xmm0, xmm1 // BGR |
| 906 packssdw xmm0, xmm0 |
| 907 lea eax, [eax + 16] |
| 908 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
| 909 lea edx, [edx + 8] |
| 910 sub ecx, 4 |
| 911 jg convertloop |
| 912 ret |
| 913 } |
| 914 } |
| 915 |
| 916 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 |
| 917 __declspec(naked) |
| 918 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, |
| 919 const uint32 dither4, int pix) { |
| 920 __asm { |
| 921 mov eax, [esp + 4] // src_argb |
| 922 mov edx, [esp + 8] // dst_rgb |
| 923 vbroadcastss xmm6, [esp + 12] // dither4 |
| 924 mov ecx, [esp + 16] // pix |
| 925 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes |
| 926 vpermq ymm6, ymm6, 0xd8 |
| 927 vpunpcklwd ymm6, ymm6, ymm6 |
| 928 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
| 929 vpsrld ymm3, ymm3, 27 |
| 930 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
| 931 vpsrld ymm4, ymm4, 26 |
| 932 vpslld ymm4, ymm4, 5 |
| 933 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
| 934 |
| 935 convertloop: |
| 936 vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| 937 vpaddusb ymm0, ymm0, ymm6 // add dither |
| 938 vpsrld ymm2, ymm0, 5 // G |
| 939 vpsrld ymm1, ymm0, 3 // B |
| 940 vpsrld ymm0, ymm0, 8 // R |
| 941 vpand ymm2, ymm2, ymm4 // G |
| 942 vpand ymm1, ymm1, ymm3 // B |
| 943 vpand ymm0, ymm0, ymm5 // R |
| 944 vpor ymm1, ymm1, ymm2 // BG |
| 945 vpor ymm0, ymm0, ymm1 // BGR |
| 946 vpackusdw ymm0, ymm0, ymm0 |
| 947 vpermq ymm0, ymm0, 0xd8 |
| 948 lea eax, [eax + 32] |
| 949 vmovdqu [edx], xmm0 // store 8 pixels of RGB565 |
| 950 lea edx, [edx + 16] |
| 951 sub ecx, 8 |
| 952 jg convertloop |
| 953 vzeroupper |
| 954 ret |
| 955 } |
| 956 } |
| 957 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 |
| 958 |
| 625 // TODO(fbarchard): Improve sign extension/packing. | 959 // TODO(fbarchard): Improve sign extension/packing. |
| 626 __declspec(naked) __declspec(align(16)) | 960 __declspec(naked) |
| 627 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 961 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 628 __asm { | 962 __asm { |
| 629 mov eax, [esp + 4] // src_argb | 963 mov eax, [esp + 4] // src_argb |
| 630 mov edx, [esp + 8] // dst_rgb | 964 mov edx, [esp + 8] // dst_rgb |
| 631 mov ecx, [esp + 12] // pix | 965 mov ecx, [esp + 12] // pix |
| 632 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f | 966 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f |
| 633 psrld xmm4, 27 | 967 psrld xmm4, 27 |
| 634 movdqa xmm5, xmm4 // generate mask 0x000003e0 | 968 movdqa xmm5, xmm4 // generate mask 0x000003e0 |
| 635 pslld xmm5, 5 | 969 pslld xmm5, 5 |
| 636 movdqa xmm6, xmm4 // generate mask 0x00007c00 | 970 movdqa xmm6, xmm4 // generate mask 0x00007c00 |
| (...skipping 20 matching lines...) Expand all Loading... |
| 657 packssdw xmm0, xmm0 | 991 packssdw xmm0, xmm0 |
| 658 lea eax, [eax + 16] | 992 lea eax, [eax + 16] |
| 659 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 | 993 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 |
| 660 lea edx, [edx + 8] | 994 lea edx, [edx + 8] |
| 661 sub ecx, 4 | 995 sub ecx, 4 |
| 662 jg convertloop | 996 jg convertloop |
| 663 ret | 997 ret |
| 664 } | 998 } |
| 665 } | 999 } |
| 666 | 1000 |
| 667 __declspec(naked) __declspec(align(16)) | 1001 __declspec(naked) |
| 668 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1002 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 669 __asm { | 1003 __asm { |
| 670 mov eax, [esp + 4] // src_argb | 1004 mov eax, [esp + 4] // src_argb |
| 671 mov edx, [esp + 8] // dst_rgb | 1005 mov edx, [esp + 8] // dst_rgb |
| 672 mov ecx, [esp + 12] // pix | 1006 mov ecx, [esp + 12] // pix |
| 673 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 | 1007 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 |
| 674 psllw xmm4, 12 | 1008 psllw xmm4, 12 |
| 675 movdqa xmm3, xmm4 // generate mask 0x00f000f0 | 1009 movdqa xmm3, xmm4 // generate mask 0x00f000f0 |
| 676 psrlw xmm3, 8 | 1010 psrlw xmm3, 8 |
| 677 | 1011 |
| 678 convertloop: | 1012 convertloop: |
| 679 movdqu xmm0, [eax] // fetch 4 pixels of argb | 1013 movdqu xmm0, [eax] // fetch 4 pixels of argb |
| 680 movdqa xmm1, xmm0 | 1014 movdqa xmm1, xmm0 |
| 681 pand xmm0, xmm3 // low nibble | 1015 pand xmm0, xmm3 // low nibble |
| 682 pand xmm1, xmm4 // high nibble | 1016 pand xmm1, xmm4 // high nibble |
| 683 psrld xmm0, 4 | 1017 psrld xmm0, 4 |
| 684 psrld xmm1, 8 | 1018 psrld xmm1, 8 |
| 685 por xmm0, xmm1 | 1019 por xmm0, xmm1 |
| 686 packuswb xmm0, xmm0 | 1020 packuswb xmm0, xmm0 |
| 687 lea eax, [eax + 16] | 1021 lea eax, [eax + 16] |
| 688 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 | 1022 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 |
| 689 lea edx, [edx + 8] | 1023 lea edx, [edx + 8] |
| 690 sub ecx, 4 | 1024 sub ecx, 4 |
| 691 jg convertloop | 1025 jg convertloop |
| 692 ret | 1026 ret |
| 693 } | 1027 } |
| 694 } | 1028 } |
| 695 | 1029 |
| 696 #ifdef HAS_ARGBTORGB565ROW_AVX2 | 1030 #ifdef HAS_ARGBTORGB565ROW_AVX2 |
| 697 __declspec(naked) __declspec(align(16)) | 1031 __declspec(naked) |
| 698 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1032 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 699 __asm { | 1033 __asm { |
| 700 mov eax, [esp + 4] // src_argb | 1034 mov eax, [esp + 4] // src_argb |
| 701 mov edx, [esp + 8] // dst_rgb | 1035 mov edx, [esp + 8] // dst_rgb |
| 702 mov ecx, [esp + 12] // pix | 1036 mov ecx, [esp + 12] // pix |
| 703 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f | 1037 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
| 704 vpsrld ymm3, ymm3, 27 | 1038 vpsrld ymm3, ymm3, 27 |
| 705 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 | 1039 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
| 706 vpsrld ymm4, ymm4, 26 | 1040 vpsrld ymm4, ymm4, 26 |
| 707 vpslld ymm4, ymm4, 5 | 1041 vpslld ymm4, ymm4, 5 |
| 708 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800 | 1042 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
| 709 vpslld ymm5, ymm5, 11 | |
| 710 | 1043 |
| 711 convertloop: | 1044 convertloop: |
| 712 vmovdqu ymm0, [eax] // fetch 8 pixels of argb | 1045 vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| 713 vpsrld ymm2, ymm0, 5 // G | 1046 vpsrld ymm2, ymm0, 5 // G |
| 714 vpsrld ymm1, ymm0, 3 // B | 1047 vpsrld ymm1, ymm0, 3 // B |
| 715 vpslld ymm0, ymm0, 8 // R | 1048 vpsrld ymm0, ymm0, 8 // R |
| 716 vpand ymm2, ymm2, ymm4 // G | 1049 vpand ymm2, ymm2, ymm4 // G |
| 717 vpand ymm1, ymm1, ymm3 // B | 1050 vpand ymm1, ymm1, ymm3 // B |
| 718 vpsrad ymm0, ymm0, 16 // R | |
| 719 vpand ymm0, ymm0, ymm5 // R | 1051 vpand ymm0, ymm0, ymm5 // R |
| 720 vpor ymm1, ymm1, ymm2 // BG | 1052 vpor ymm1, ymm1, ymm2 // BG |
| 721 vpor ymm0, ymm0, ymm1 // BGR | 1053 vpor ymm0, ymm0, ymm1 // BGR |
| 722 vpackssdw ymm0, ymm0, ymm0 | 1054 vpackusdw ymm0, ymm0, ymm0 |
| 723 vpermq ymm0, ymm0, 0xd8 | 1055 vpermq ymm0, ymm0, 0xd8 |
| 724 lea eax, [eax + 32] | 1056 lea eax, [eax + 32] |
| 725 vmovdqu [edx], xmm0 // store 8 pixels of RGB565 | 1057 vmovdqu [edx], xmm0 // store 8 pixels of RGB565 |
| 726 lea edx, [edx + 16] | 1058 lea edx, [edx + 16] |
| 727 sub ecx, 8 | 1059 sub ecx, 8 |
| 728 jg convertloop | 1060 jg convertloop |
| 729 vzeroupper | 1061 vzeroupper |
| 730 ret | 1062 ret |
| 731 } | 1063 } |
| 732 } | 1064 } |
| 733 #endif // HAS_ARGBTORGB565ROW_AVX2 | 1065 #endif // HAS_ARGBTORGB565ROW_AVX2 |
| 734 | 1066 |
| 735 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 | 1067 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 |
| 736 __declspec(naked) __declspec(align(16)) | 1068 __declspec(naked) |
| 737 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1069 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 738 __asm { | 1070 __asm { |
| 739 mov eax, [esp + 4] // src_argb | 1071 mov eax, [esp + 4] // src_argb |
| 740 mov edx, [esp + 8] // dst_rgb | 1072 mov edx, [esp + 8] // dst_rgb |
| 741 mov ecx, [esp + 12] // pix | 1073 mov ecx, [esp + 12] // pix |
| 742 vpcmpeqb ymm4, ymm4, ymm4 | 1074 vpcmpeqb ymm4, ymm4, ymm4 |
| 743 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f | 1075 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f |
| 744 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 | 1076 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 |
| 745 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 | 1077 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 |
| 746 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 | 1078 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 |
| (...skipping 19 matching lines...) Expand all Loading... |
| 766 lea edx, [edx + 16] | 1098 lea edx, [edx + 16] |
| 767 sub ecx, 8 | 1099 sub ecx, 8 |
| 768 jg convertloop | 1100 jg convertloop |
| 769 vzeroupper | 1101 vzeroupper |
| 770 ret | 1102 ret |
| 771 } | 1103 } |
| 772 } | 1104 } |
| 773 #endif // HAS_ARGBTOARGB1555ROW_AVX2 | 1105 #endif // HAS_ARGBTOARGB1555ROW_AVX2 |
| 774 | 1106 |
| 775 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 | 1107 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 |
| 776 __declspec(naked) __declspec(align(16)) | 1108 __declspec(naked) |
| 777 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1109 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 778 __asm { | 1110 __asm { |
| 779 mov eax, [esp + 4] // src_argb | 1111 mov eax, [esp + 4] // src_argb |
| 780 mov edx, [esp + 8] // dst_rgb | 1112 mov edx, [esp + 8] // dst_rgb |
| 781 mov ecx, [esp + 12] // pix | 1113 mov ecx, [esp + 12] // pix |
| 782 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 | 1114 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 |
| 783 vpsllw ymm4, ymm4, 12 | 1115 vpsllw ymm4, ymm4, 12 |
| 784 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 | 1116 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 |
| 785 | 1117 |
| 786 convertloop: | 1118 convertloop: |
| (...skipping 10 matching lines...) Expand all Loading... |
| 797 lea edx, [edx + 16] | 1129 lea edx, [edx + 16] |
| 798 sub ecx, 8 | 1130 sub ecx, 8 |
| 799 jg convertloop | 1131 jg convertloop |
| 800 vzeroupper | 1132 vzeroupper |
| 801 ret | 1133 ret |
| 802 } | 1134 } |
| 803 } | 1135 } |
| 804 #endif // HAS_ARGBTOARGB4444ROW_AVX2 | 1136 #endif // HAS_ARGBTOARGB4444ROW_AVX2 |
| 805 | 1137 |
| 806 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. | 1138 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. |
| 807 __declspec(naked) __declspec(align(16)) | 1139 __declspec(naked) |
| 808 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1140 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| 809 __asm { | 1141 __asm { |
| 810 mov eax, [esp + 4] /* src_argb */ | 1142 mov eax, [esp + 4] /* src_argb */ |
| 811 mov edx, [esp + 8] /* dst_y */ | 1143 mov edx, [esp + 8] /* dst_y */ |
| 812 mov ecx, [esp + 12] /* pix */ | 1144 mov ecx, [esp + 12] /* pix */ |
| 813 movdqa xmm4, kARGBToY | 1145 movdqa xmm4, kARGBToY |
| 814 movdqa xmm5, kAddY16 | 1146 movdqa xmm5, kAddY16 |
| 815 | 1147 |
| 816 convertloop: | 1148 convertloop: |
| 817 movdqu xmm0, [eax] | 1149 movdqu xmm0, [eax] |
| (...skipping 14 matching lines...) Expand all Loading... |
| 832 movdqu [edx], xmm0 | 1164 movdqu [edx], xmm0 |
| 833 lea edx, [edx + 16] | 1165 lea edx, [edx + 16] |
| 834 sub ecx, 16 | 1166 sub ecx, 16 |
| 835 jg convertloop | 1167 jg convertloop |
| 836 ret | 1168 ret |
| 837 } | 1169 } |
| 838 } | 1170 } |
| 839 | 1171 |
| 840 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. | 1172 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. |
| 841 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. | 1173 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. |
| 842 __declspec(naked) __declspec(align(16)) | 1174 __declspec(naked) |
| 843 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1175 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| 844 __asm { | 1176 __asm { |
| 845 mov eax, [esp + 4] /* src_argb */ | 1177 mov eax, [esp + 4] /* src_argb */ |
| 846 mov edx, [esp + 8] /* dst_y */ | 1178 mov edx, [esp + 8] /* dst_y */ |
| 847 mov ecx, [esp + 12] /* pix */ | 1179 mov ecx, [esp + 12] /* pix */ |
| 848 movdqa xmm4, kARGBToYJ | 1180 movdqa xmm4, kARGBToYJ |
| 849 movdqa xmm5, kAddYJ64 | 1181 movdqa xmm5, kAddYJ64 |
| 850 | 1182 |
| 851 convertloop: | 1183 convertloop: |
| 852 movdqu xmm0, [eax] | 1184 movdqu xmm0, [eax] |
| (...skipping 20 matching lines...) Expand all Loading... |
| 873 } | 1205 } |
| 874 } | 1206 } |
| 875 | 1207 |
| 876 #ifdef HAS_ARGBTOYROW_AVX2 | 1208 #ifdef HAS_ARGBTOYROW_AVX2 |
| 877 // vpermd for vphaddw + vpackuswb vpermd. | 1209 // vpermd for vphaddw + vpackuswb vpermd. |
| 878 static const lvec32 kPermdARGBToY_AVX = { | 1210 static const lvec32 kPermdARGBToY_AVX = { |
| 879 0, 4, 1, 5, 2, 6, 3, 7 | 1211 0, 4, 1, 5, 2, 6, 3, 7 |
| 880 }; | 1212 }; |
| 881 | 1213 |
| 882 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 1214 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
| 883 __declspec(naked) __declspec(align(32)) | 1215 __declspec(naked) |
| 884 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 1216 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { |
| 885 __asm { | 1217 __asm { |
| 886 mov eax, [esp + 4] /* src_argb */ | 1218 mov eax, [esp + 4] /* src_argb */ |
| 887 mov edx, [esp + 8] /* dst_y */ | 1219 mov edx, [esp + 8] /* dst_y */ |
| 888 mov ecx, [esp + 12] /* pix */ | 1220 mov ecx, [esp + 12] /* pix */ |
| 889 vbroadcastf128 ymm4, kARGBToY | 1221 vbroadcastf128 ymm4, kARGBToY |
| 890 vbroadcastf128 ymm5, kAddY16 | 1222 vbroadcastf128 ymm5, kAddY16 |
| 891 vmovdqu ymm6, kPermdARGBToY_AVX | 1223 vmovdqu ymm6, kPermdARGBToY_AVX |
| 892 | 1224 |
| 893 convertloop: | 1225 convertloop: |
| (...skipping 16 matching lines...) Expand all Loading... |
| 910 vmovdqu [edx], ymm0 | 1242 vmovdqu [edx], ymm0 |
| 911 lea edx, [edx + 32] | 1243 lea edx, [edx + 32] |
| 912 sub ecx, 32 | 1244 sub ecx, 32 |
| 913 jg convertloop | 1245 jg convertloop |
| 914 vzeroupper | 1246 vzeroupper |
| 915 ret | 1247 ret |
| 916 } | 1248 } |
| 917 } | 1249 } |
| 918 #endif // HAS_ARGBTOYROW_AVX2 | 1250 #endif // HAS_ARGBTOYROW_AVX2 |
| 919 | 1251 |
| 920 #ifdef HAS_ARGBTOYROW_AVX2 | 1252 #ifdef HAS_ARGBTOYJROW_AVX2 |
| 921 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 1253 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
| 922 __declspec(naked) __declspec(align(32)) | 1254 __declspec(naked) |
| 923 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 1255 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { |
| 924 __asm { | 1256 __asm { |
| 925 mov eax, [esp + 4] /* src_argb */ | 1257 mov eax, [esp + 4] /* src_argb */ |
| 926 mov edx, [esp + 8] /* dst_y */ | 1258 mov edx, [esp + 8] /* dst_y */ |
| 927 mov ecx, [esp + 12] /* pix */ | 1259 mov ecx, [esp + 12] /* pix */ |
| 928 vbroadcastf128 ymm4, kARGBToYJ | 1260 vbroadcastf128 ymm4, kARGBToYJ |
| 929 vbroadcastf128 ymm5, kAddYJ64 | 1261 vbroadcastf128 ymm5, kAddYJ64 |
| 930 vmovdqu ymm6, kPermdARGBToY_AVX | 1262 vmovdqu ymm6, kPermdARGBToY_AVX |
| 931 | 1263 |
| 932 convertloop: | 1264 convertloop: |
| (...skipping 18 matching lines...) Expand all Loading... |
| 951 lea edx, [edx + 32] | 1283 lea edx, [edx + 32] |
| 952 sub ecx, 32 | 1284 sub ecx, 32 |
| 953 jg convertloop | 1285 jg convertloop |
| 954 | 1286 |
| 955 vzeroupper | 1287 vzeroupper |
| 956 ret | 1288 ret |
| 957 } | 1289 } |
| 958 } | 1290 } |
| 959 #endif // HAS_ARGBTOYJROW_AVX2 | 1291 #endif // HAS_ARGBTOYJROW_AVX2 |
| 960 | 1292 |
| 961 __declspec(naked) __declspec(align(16)) | 1293 __declspec(naked) |
| 962 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1294 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| 963 __asm { | 1295 __asm { |
| 964 mov eax, [esp + 4] /* src_argb */ | 1296 mov eax, [esp + 4] /* src_argb */ |
| 965 mov edx, [esp + 8] /* dst_y */ | 1297 mov edx, [esp + 8] /* dst_y */ |
| 966 mov ecx, [esp + 12] /* pix */ | 1298 mov ecx, [esp + 12] /* pix */ |
| 967 movdqa xmm4, kBGRAToY | 1299 movdqa xmm4, kBGRAToY |
| 968 movdqa xmm5, kAddY16 | 1300 movdqa xmm5, kAddY16 |
| 969 | 1301 |
| 970 convertloop: | 1302 convertloop: |
| 971 movdqu xmm0, [eax] | 1303 movdqu xmm0, [eax] |
| (...skipping 12 matching lines...) Expand all Loading... |
| 984 packuswb xmm0, xmm2 | 1316 packuswb xmm0, xmm2 |
| 985 paddb xmm0, xmm5 | 1317 paddb xmm0, xmm5 |
| 986 movdqu [edx], xmm0 | 1318 movdqu [edx], xmm0 |
| 987 lea edx, [edx + 16] | 1319 lea edx, [edx + 16] |
| 988 sub ecx, 16 | 1320 sub ecx, 16 |
| 989 jg convertloop | 1321 jg convertloop |
| 990 ret | 1322 ret |
| 991 } | 1323 } |
| 992 } | 1324 } |
| 993 | 1325 |
| 994 __declspec(naked) __declspec(align(16)) | 1326 __declspec(naked) |
| 995 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1327 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| 996 __asm { | 1328 __asm { |
| 997 mov eax, [esp + 4] /* src_argb */ | 1329 mov eax, [esp + 4] /* src_argb */ |
| 998 mov edx, [esp + 8] /* dst_y */ | 1330 mov edx, [esp + 8] /* dst_y */ |
| 999 mov ecx, [esp + 12] /* pix */ | 1331 mov ecx, [esp + 12] /* pix */ |
| 1000 movdqa xmm4, kABGRToY | 1332 movdqa xmm4, kABGRToY |
| 1001 movdqa xmm5, kAddY16 | 1333 movdqa xmm5, kAddY16 |
| 1002 | 1334 |
| 1003 convertloop: | 1335 convertloop: |
| 1004 movdqu xmm0, [eax] | 1336 movdqu xmm0, [eax] |
| (...skipping 12 matching lines...) Expand all Loading... |
| 1017 packuswb xmm0, xmm2 | 1349 packuswb xmm0, xmm2 |
| 1018 paddb xmm0, xmm5 | 1350 paddb xmm0, xmm5 |
| 1019 movdqu [edx], xmm0 | 1351 movdqu [edx], xmm0 |
| 1020 lea edx, [edx + 16] | 1352 lea edx, [edx + 16] |
| 1021 sub ecx, 16 | 1353 sub ecx, 16 |
| 1022 jg convertloop | 1354 jg convertloop |
| 1023 ret | 1355 ret |
| 1024 } | 1356 } |
| 1025 } | 1357 } |
| 1026 | 1358 |
| 1027 __declspec(naked) __declspec(align(16)) | 1359 __declspec(naked) |
| 1028 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1360 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
| 1029 __asm { | 1361 __asm { |
| 1030 mov eax, [esp + 4] /* src_argb */ | 1362 mov eax, [esp + 4] /* src_argb */ |
| 1031 mov edx, [esp + 8] /* dst_y */ | 1363 mov edx, [esp + 8] /* dst_y */ |
| 1032 mov ecx, [esp + 12] /* pix */ | 1364 mov ecx, [esp + 12] /* pix */ |
| 1033 movdqa xmm4, kRGBAToY | 1365 movdqa xmm4, kRGBAToY |
| 1034 movdqa xmm5, kAddY16 | 1366 movdqa xmm5, kAddY16 |
| 1035 | 1367 |
| 1036 convertloop: | 1368 convertloop: |
| 1037 movdqu xmm0, [eax] | 1369 movdqu xmm0, [eax] |
| (...skipping 12 matching lines...) Expand all Loading... |
| 1050 packuswb xmm0, xmm2 | 1382 packuswb xmm0, xmm2 |
| 1051 paddb xmm0, xmm5 | 1383 paddb xmm0, xmm5 |
| 1052 movdqu [edx], xmm0 | 1384 movdqu [edx], xmm0 |
| 1053 lea edx, [edx + 16] | 1385 lea edx, [edx + 16] |
| 1054 sub ecx, 16 | 1386 sub ecx, 16 |
| 1055 jg convertloop | 1387 jg convertloop |
| 1056 ret | 1388 ret |
| 1057 } | 1389 } |
| 1058 } | 1390 } |
| 1059 | 1391 |
| 1060 __declspec(naked) __declspec(align(16)) | 1392 __declspec(naked) |
| 1061 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1393 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1062 uint8* dst_u, uint8* dst_v, int width) { | 1394 uint8* dst_u, uint8* dst_v, int width) { |
| 1063 __asm { | 1395 __asm { |
| 1064 push esi | 1396 push esi |
| 1065 push edi | 1397 push edi |
| 1066 mov eax, [esp + 8 + 4] // src_argb | 1398 mov eax, [esp + 8 + 4] // src_argb |
| 1067 mov esi, [esp + 8 + 8] // src_stride_argb | 1399 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1068 mov edx, [esp + 8 + 12] // dst_u | 1400 mov edx, [esp + 8 + 12] // dst_u |
| 1069 mov edi, [esp + 8 + 16] // dst_v | 1401 mov edi, [esp + 8 + 16] // dst_v |
| 1070 mov ecx, [esp + 8 + 20] // pix | 1402 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1120 lea edx, [edx + 8] | 1452 lea edx, [edx + 8] |
| 1121 sub ecx, 16 | 1453 sub ecx, 16 |
| 1122 jg convertloop | 1454 jg convertloop |
| 1123 | 1455 |
| 1124 pop edi | 1456 pop edi |
| 1125 pop esi | 1457 pop esi |
| 1126 ret | 1458 ret |
| 1127 } | 1459 } |
| 1128 } | 1460 } |
| 1129 | 1461 |
| 1130 __declspec(naked) __declspec(align(16)) | 1462 __declspec(naked) |
| 1131 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1463 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1132 uint8* dst_u, uint8* dst_v, int width) { | 1464 uint8* dst_u, uint8* dst_v, int width) { |
| 1133 __asm { | 1465 __asm { |
| 1134 push esi | 1466 push esi |
| 1135 push edi | 1467 push edi |
| 1136 mov eax, [esp + 8 + 4] // src_argb | 1468 mov eax, [esp + 8 + 4] // src_argb |
| 1137 mov esi, [esp + 8 + 8] // src_stride_argb | 1469 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1138 mov edx, [esp + 8 + 12] // dst_u | 1470 mov edx, [esp + 8 + 12] // dst_u |
| 1139 mov edi, [esp + 8 + 16] // dst_v | 1471 mov edi, [esp + 8 + 16] // dst_v |
| 1140 mov ecx, [esp + 8 + 20] // pix | 1472 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1192 sub ecx, 16 | 1524 sub ecx, 16 |
| 1193 jg convertloop | 1525 jg convertloop |
| 1194 | 1526 |
| 1195 pop edi | 1527 pop edi |
| 1196 pop esi | 1528 pop esi |
| 1197 ret | 1529 ret |
| 1198 } | 1530 } |
| 1199 } | 1531 } |
| 1200 | 1532 |
| 1201 #ifdef HAS_ARGBTOUVROW_AVX2 | 1533 #ifdef HAS_ARGBTOUVROW_AVX2 |
| 1202 __declspec(naked) __declspec(align(32)) | 1534 __declspec(naked) |
| 1203 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, | 1535 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
| 1204 uint8* dst_u, uint8* dst_v, int width) { | 1536 uint8* dst_u, uint8* dst_v, int width) { |
| 1205 __asm { | 1537 __asm { |
| 1206 push esi | 1538 push esi |
| 1207 push edi | 1539 push edi |
| 1208 mov eax, [esp + 8 + 4] // src_argb | 1540 mov eax, [esp + 8 + 4] // src_argb |
| 1209 mov esi, [esp + 8 + 8] // src_stride_argb | 1541 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1210 mov edx, [esp + 8 + 12] // dst_u | 1542 mov edx, [esp + 8 + 12] // dst_u |
| 1211 mov edi, [esp + 8 + 16] // dst_v | 1543 mov edi, [esp + 8 + 16] // dst_v |
| 1212 mov ecx, [esp + 8 + 20] // pix | 1544 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1257 jg convertloop | 1589 jg convertloop |
| 1258 | 1590 |
| 1259 pop edi | 1591 pop edi |
| 1260 pop esi | 1592 pop esi |
| 1261 vzeroupper | 1593 vzeroupper |
| 1262 ret | 1594 ret |
| 1263 } | 1595 } |
| 1264 } | 1596 } |
| 1265 #endif // HAS_ARGBTOUVROW_AVX2 | 1597 #endif // HAS_ARGBTOUVROW_AVX2 |
| 1266 | 1598 |
| 1267 __declspec(naked) __declspec(align(16)) | 1599 __declspec(naked) |
| 1268 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, | 1600 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, |
| 1269 uint8* dst_u, uint8* dst_v, int width) { | 1601 uint8* dst_u, uint8* dst_v, int width) { |
| 1270 __asm { | 1602 __asm { |
| 1271 push edi | 1603 push edi |
| 1272 mov eax, [esp + 4 + 4] // src_argb | 1604 mov eax, [esp + 4 + 4] // src_argb |
| 1273 mov edx, [esp + 4 + 8] // dst_u | 1605 mov edx, [esp + 4 + 8] // dst_u |
| 1274 mov edi, [esp + 4 + 12] // dst_v | 1606 mov edi, [esp + 4 + 12] // dst_v |
| 1275 mov ecx, [esp + 4 + 16] // pix | 1607 mov ecx, [esp + 4 + 16] // pix |
| 1276 movdqa xmm5, kAddUV128 | 1608 movdqa xmm5, kAddUV128 |
| 1277 movdqa xmm6, kARGBToV | 1609 movdqa xmm6, kARGBToV |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1314 movdqu [edx + edi], xmm0 | 1646 movdqu [edx + edi], xmm0 |
| 1315 lea edx, [edx + 16] | 1647 lea edx, [edx + 16] |
| 1316 sub ecx, 16 | 1648 sub ecx, 16 |
| 1317 jg convertloop | 1649 jg convertloop |
| 1318 | 1650 |
| 1319 pop edi | 1651 pop edi |
| 1320 ret | 1652 ret |
| 1321 } | 1653 } |
| 1322 } | 1654 } |
| 1323 | 1655 |
| 1324 __declspec(naked) __declspec(align(16)) | 1656 __declspec(naked) |
| 1325 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, | 1657 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, |
| 1326 uint8* dst_u, uint8* dst_v, int width) { | 1658 uint8* dst_u, uint8* dst_v, int width) { |
| 1327 __asm { | 1659 __asm { |
| 1328 push edi | 1660 push edi |
| 1329 mov eax, [esp + 4 + 4] // src_argb | 1661 mov eax, [esp + 4 + 4] // src_argb |
| 1330 mov edx, [esp + 4 + 8] // dst_u | 1662 mov edx, [esp + 4 + 8] // dst_u |
| 1331 mov edi, [esp + 4 + 12] // dst_v | 1663 mov edi, [esp + 4 + 12] // dst_v |
| 1332 mov ecx, [esp + 4 + 16] // pix | 1664 mov ecx, [esp + 4 + 16] // pix |
| 1333 movdqa xmm5, kAddUV128 | 1665 movdqa xmm5, kAddUV128 |
| 1334 movdqa xmm6, kARGBToV | 1666 movdqa xmm6, kARGBToV |
| (...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1372 movhps qword ptr [edx + edi], xmm0 // V | 1704 movhps qword ptr [edx + edi], xmm0 // V |
| 1373 lea edx, [edx + 8] | 1705 lea edx, [edx + 8] |
| 1374 sub ecx, 16 | 1706 sub ecx, 16 |
| 1375 jg convertloop | 1707 jg convertloop |
| 1376 | 1708 |
| 1377 pop edi | 1709 pop edi |
| 1378 ret | 1710 ret |
| 1379 } | 1711 } |
| 1380 } | 1712 } |
| 1381 | 1713 |
| 1382 __declspec(naked) __declspec(align(16)) | 1714 __declspec(naked) |
| 1383 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1715 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1384 uint8* dst_u, uint8* dst_v, int width) { | 1716 uint8* dst_u, uint8* dst_v, int width) { |
| 1385 __asm { | 1717 __asm { |
| 1386 push esi | 1718 push esi |
| 1387 push edi | 1719 push edi |
| 1388 mov eax, [esp + 8 + 4] // src_argb | 1720 mov eax, [esp + 8 + 4] // src_argb |
| 1389 mov esi, [esp + 8 + 8] // src_stride_argb | 1721 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1390 mov edx, [esp + 8 + 12] // dst_u | 1722 mov edx, [esp + 8 + 12] // dst_u |
| 1391 mov edi, [esp + 8 + 16] // dst_v | 1723 mov edi, [esp + 8 + 16] // dst_v |
| 1392 mov ecx, [esp + 8 + 20] // pix | 1724 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1442 lea edx, [edx + 8] | 1774 lea edx, [edx + 8] |
| 1443 sub ecx, 16 | 1775 sub ecx, 16 |
| 1444 jg convertloop | 1776 jg convertloop |
| 1445 | 1777 |
| 1446 pop edi | 1778 pop edi |
| 1447 pop esi | 1779 pop esi |
| 1448 ret | 1780 ret |
| 1449 } | 1781 } |
| 1450 } | 1782 } |
| 1451 | 1783 |
| 1452 __declspec(naked) __declspec(align(16)) | 1784 __declspec(naked) |
| 1453 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1785 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1454 uint8* dst_u, uint8* dst_v, int width) { | 1786 uint8* dst_u, uint8* dst_v, int width) { |
| 1455 __asm { | 1787 __asm { |
| 1456 push esi | 1788 push esi |
| 1457 push edi | 1789 push edi |
| 1458 mov eax, [esp + 8 + 4] // src_argb | 1790 mov eax, [esp + 8 + 4] // src_argb |
| 1459 mov esi, [esp + 8 + 8] // src_stride_argb | 1791 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1460 mov edx, [esp + 8 + 12] // dst_u | 1792 mov edx, [esp + 8 + 12] // dst_u |
| 1461 mov edi, [esp + 8 + 16] // dst_v | 1793 mov edi, [esp + 8 + 16] // dst_v |
| 1462 mov ecx, [esp + 8 + 20] // pix | 1794 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1512 lea edx, [edx + 8] | 1844 lea edx, [edx + 8] |
| 1513 sub ecx, 16 | 1845 sub ecx, 16 |
| 1514 jg convertloop | 1846 jg convertloop |
| 1515 | 1847 |
| 1516 pop edi | 1848 pop edi |
| 1517 pop esi | 1849 pop esi |
| 1518 ret | 1850 ret |
| 1519 } | 1851 } |
| 1520 } | 1852 } |
| 1521 | 1853 |
| 1522 __declspec(naked) __declspec(align(16)) | 1854 __declspec(naked) |
| 1523 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1855 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1524 uint8* dst_u, uint8* dst_v, int width) { | 1856 uint8* dst_u, uint8* dst_v, int width) { |
| 1525 __asm { | 1857 __asm { |
| 1526 push esi | 1858 push esi |
| 1527 push edi | 1859 push edi |
| 1528 mov eax, [esp + 8 + 4] // src_argb | 1860 mov eax, [esp + 8 + 4] // src_argb |
| 1529 mov esi, [esp + 8 + 8] // src_stride_argb | 1861 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1530 mov edx, [esp + 8 + 12] // dst_u | 1862 mov edx, [esp + 8 + 12] // dst_u |
| 1531 mov edi, [esp + 8 + 16] // dst_v | 1863 mov edi, [esp + 8 + 16] // dst_v |
| 1532 mov ecx, [esp + 8 + 20] // pix | 1864 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1583 sub ecx, 16 | 1915 sub ecx, 16 |
| 1584 jg convertloop | 1916 jg convertloop |
| 1585 | 1917 |
| 1586 pop edi | 1918 pop edi |
| 1587 pop esi | 1919 pop esi |
| 1588 ret | 1920 ret |
| 1589 } | 1921 } |
| 1590 } | 1922 } |
| 1591 #endif // HAS_ARGBTOYROW_SSSE3 | 1923 #endif // HAS_ARGBTOYROW_SSSE3 |
| 1592 | 1924 |
| 1925 // Read 16 UV from 444 |
| 1926 #define READYUV444_AVX2 __asm { \ |
| 1927 __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \ |
| 1928 __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \ |
| 1929 __asm lea esi, [esi + 16] \ |
| 1930 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 1931 __asm vpermq ymm1, ymm1, 0xd8 \ |
| 1932 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| 1933 } |
| 1934 |
| 1593 // Read 8 UV from 422, upsample to 16 UV. | 1935 // Read 8 UV from 422, upsample to 16 UV. |
| 1594 #define READYUV422_AVX2 __asm { \ | 1936 #define READYUV422_AVX2 __asm { \ |
| 1595 __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ | 1937 __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ |
| 1596 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ | 1938 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ |
| 1597 __asm lea esi, [esi + 8] \ | 1939 __asm lea esi, [esi + 8] \ |
| 1598 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ | 1940 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| 1599 __asm vpermq ymm0, ymm0, 0xd8 \ | 1941 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 1600 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1942 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1601 } | 1943 } |
| 1602 | 1944 |
| 1945 // Read 4 UV from 411, upsample to 16 UV. |
| 1946 #define READYUV411_AVX2 __asm { \ |
| 1947 __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \ |
| 1948 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \ |
| 1949 __asm lea esi, [esi + 4] \ |
| 1950 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| 1951 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1952 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 1953 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ |
| 1954 } |
| 1955 |
// Read 8 UV from NV12, upsample to 16 UV.
// Input:   esi = interleaved UV plane pointer (bytes already in UVUV order).
// Output:  ymm0 = 16 UV byte pairs (each source UV doubled horizontally).
//          Advances esi by 16.
#define READNV12_AVX2 __asm {                                                  \
    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm0, ymm0, 0xd8             /* split UV across lanes */  \
    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
  }
| 1610 | 1963 |
| 1611 // Convert 16 pixels: 16 UV and 16 Y. | 1964 // Convert 16 pixels: 16 UV and 16 Y. |
| 1612 #define YUVTORGB_AVX2(YuvConstants) __asm { \ | 1965 #define YUVTORGB_AVX2(YuvConstants) __asm { \ |
| (...skipping 26 matching lines...) Expand all Loading... |
| 1639 | 1992 |
// Store 16 ARGB values.
// Inputs:  ymm0 = 16 B bytes, ymm1 = 16 G bytes, ymm2 = 16 R bytes,
//          ymm5 = alpha bytes (0xff), edx = destination pointer.
//          (Register roles follow from the BG/RA interleaves below.)
// Writes 64 bytes of interleaved B,G,R,A and advances edx by 64.
// Clobbers ymm0/ymm1.
#define STOREARGB_AVX2 __asm {                                                 \
    /* Step 3: Weave into ARGB */                                              \
    __asm vpunpcklbw ymm0, ymm0, ymm1             /* BG */                     \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklbw ymm2, ymm2, ymm5             /* RA */                     \
    __asm vpermq     ymm2, ymm2, 0xd8                                          \
    __asm vpunpcklwd ymm1, ymm0, ymm2             /* BGRA first 8 pixels */    \
    __asm vpunpckhwd ymm0, ymm0, ymm2             /* BGRA next 8 pixels */     \
    __asm vmovdqu    0[edx], ymm1                                              \
    __asm vmovdqu    32[edx], ymm0                                             \
    __asm lea        edx,  [edx + 64]                                          \
  }
| 1653 | 2006 |
| 1654 #ifdef HAS_I422TOARGBROW_AVX2 | 2007 #ifdef HAS_I422TOARGBROW_AVX2 |
| 1655 // 16 pixels | 2008 // 16 pixels |
| 1656 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2009 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 1657 __declspec(naked) __declspec(align(16)) | 2010 __declspec(naked) |
| 1658 void I422ToARGBRow_AVX2(const uint8* y_buf, | 2011 void I422ToARGBRow_AVX2(const uint8* y_buf, |
| 1659 const uint8* u_buf, | 2012 const uint8* u_buf, |
| 1660 const uint8* v_buf, | 2013 const uint8* v_buf, |
| 1661 uint8* dst_argb, | 2014 uint8* dst_argb, |
| 1662 int width) { | 2015 int width) { |
| 1663 __asm { | 2016 __asm { |
| 1664 push esi | 2017 push esi |
| 1665 push edi | 2018 push edi |
| 1666 mov eax, [esp + 8 + 4] // Y | 2019 mov eax, [esp + 8 + 4] // Y |
| 1667 mov esi, [esp + 8 + 8] // U | 2020 mov esi, [esp + 8 + 8] // U |
| (...skipping 12 matching lines...) Expand all Loading... |
| 1680 jg convertloop | 2033 jg convertloop |
| 1681 | 2034 |
| 1682 pop edi | 2035 pop edi |
| 1683 pop esi | 2036 pop esi |
| 1684 vzeroupper | 2037 vzeroupper |
| 1685 ret | 2038 ret |
| 1686 } | 2039 } |
| 1687 } | 2040 } |
| 1688 #endif // HAS_I422TOARGBROW_AVX2 | 2041 #endif // HAS_I422TOARGBROW_AVX2 |
| 1689 | 2042 |
#ifdef HAS_J422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// JPeg (full-range) color space variant: identical to I422ToARGBRow_AVX2
// except it converts with kYuvJConstants.
// Processes 16 pixels per iteration; the loop runs ceil(width/16) times and
// writes 64 bytes each pass, so width is effectively rounded up to a
// multiple of 16 pixels.
__declspec(naked)
void J422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi             // edi = V - U, used by READYUV422_AVX2
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(kYuvJConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper                      // clear upper YMM state before returning
    ret
  }
}
#endif  // HAS_J422TOARGBROW_AVX2
| 2078 |
#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// 444 has one chroma sample per luma sample, so no upsampling is needed
// (READYUV444_AVX2 reads 16 U and 16 V directly).
// The loop runs ceil(width/16) times, writing 64 bytes each pass.
__declspec(naked)
void I444ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi             // edi = V - U offset for the read macro
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV444_AVX2
    YUVTORGB_AVX2(kYuvConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper                      // clear upper YMM state before returning
    ret
  }
}
#endif  // HAS_I444TOARGBROW_AVX2
| 2114 |
#ifdef HAS_I411TOARGBROW_AVX2
// 16 pixels
// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// 411 has one chroma sample per 4 luma samples; READYUV411_AVX2 replicates
// each UV pair 4x.  The loop runs ceil(width/16) times, writing 64 bytes
// each pass.
__declspec(naked)
void I411ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi             // edi = V - U offset for the read macro
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV411_AVX2
    YUVTORGB_AVX2(kYuvConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper                      // clear upper YMM state before returning
    ret
  }
}
#endif  // HAS_I411TOARGBROW_AVX2
| 2150 |
#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV12 supplies chroma as a single interleaved UV plane, so only two source
// pointers are needed.  The loop runs ceil(width/16) times, writing 64 bytes
// each pass.
__declspec(naked)
void NV12ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // UV
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(kYuvConstants)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        esi
    vzeroupper                      // clear upper YMM state before returning
    ret
  }
}
#endif  // HAS_NV12TOARGBROW_AVX2
| 1719 | 2181 |
#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV21 stores chroma as interleaved VU; this reuses READNV12_AVX2 and
// compensates for the swapped ordering by converting with kYvuConstants.
// The loop runs ceil(width/16) times, writing 64 bytes each pass.
__declspec(naked)
void NV21ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // UV
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(kYvuConstants)    // VU order handled by swapped constants
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        esi
    vzeroupper                      // clear upper YMM state before returning
    ret
  }
}
#endif  // HAS_NV21TOARGBROW_AVX2
| 1749 | 2212 |
| 1750 #ifdef HAS_I422TOBGRAROW_AVX2 | 2213 #ifdef HAS_I422TOBGRAROW_AVX2 |
| 1751 // 16 pixels | 2214 // 16 pixels |
| 1752 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 2215 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
| 1753 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2216 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
| 1754 __declspec(naked) __declspec(align(16)) | 2217 __declspec(naked) |
| 1755 void I422ToBGRARow_AVX2(const uint8* y_buf, | 2218 void I422ToBGRARow_AVX2(const uint8* y_buf, |
| 1756 const uint8* u_buf, | 2219 const uint8* u_buf, |
| 1757 const uint8* v_buf, | 2220 const uint8* v_buf, |
| 1758 uint8* dst_argb, | 2221 uint8* dst_argb, |
| 1759 int width) { | 2222 int width) { |
| 1760 __asm { | 2223 __asm { |
| 1761 push esi | 2224 push esi |
| 1762 push edi | 2225 push edi |
| 1763 mov eax, [esp + 8 + 4] // Y | 2226 mov eax, [esp + 8 + 4] // Y |
| 1764 mov esi, [esp + 8 + 8] // U | 2227 mov esi, [esp + 8 + 8] // U |
| (...skipping 25 matching lines...) Expand all Loading... |
| 1790 vzeroupper | 2253 vzeroupper |
| 1791 ret | 2254 ret |
| 1792 } | 2255 } |
| 1793 } | 2256 } |
| 1794 #endif // HAS_I422TOBGRAROW_AVX2 | 2257 #endif // HAS_I422TOBGRAROW_AVX2 |
| 1795 | 2258 |
| 1796 #ifdef HAS_I422TORGBAROW_AVX2 | 2259 #ifdef HAS_I422TORGBAROW_AVX2 |
| 1797 // 16 pixels | 2260 // 16 pixels |
| 1798 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 2261 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
| 1799 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2262 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
| 1800 __declspec(naked) __declspec(align(16)) | 2263 __declspec(naked) |
| 1801 void I422ToRGBARow_AVX2(const uint8* y_buf, | 2264 void I422ToRGBARow_AVX2(const uint8* y_buf, |
| 1802 const uint8* u_buf, | 2265 const uint8* u_buf, |
| 1803 const uint8* v_buf, | 2266 const uint8* v_buf, |
| 1804 uint8* dst_argb, | 2267 uint8* dst_argb, |
| 1805 int width) { | 2268 int width) { |
| 1806 __asm { | 2269 __asm { |
| 1807 push esi | 2270 push esi |
| 1808 push edi | 2271 push edi |
| 1809 mov eax, [esp + 8 + 4] // Y | 2272 mov eax, [esp + 8 + 4] // Y |
| 1810 mov esi, [esp + 8 + 8] // U | 2273 mov esi, [esp + 8 + 8] // U |
| (...skipping 25 matching lines...) Expand all Loading... |
| 1836 vzeroupper | 2299 vzeroupper |
| 1837 ret | 2300 ret |
| 1838 } | 2301 } |
| 1839 } | 2302 } |
| 1840 #endif // HAS_I422TORGBAROW_AVX2 | 2303 #endif // HAS_I422TORGBAROW_AVX2 |
| 1841 | 2304 |
| 1842 #ifdef HAS_I422TOABGRROW_AVX2 | 2305 #ifdef HAS_I422TOABGRROW_AVX2 |
| 1843 // 16 pixels | 2306 // 16 pixels |
| 1844 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2307 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
| 1845 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2308 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
| 1846 __declspec(naked) __declspec(align(16)) | 2309 __declspec(naked) |
| 1847 void I422ToABGRRow_AVX2(const uint8* y_buf, | 2310 void I422ToABGRRow_AVX2(const uint8* y_buf, |
| 1848 const uint8* u_buf, | 2311 const uint8* u_buf, |
| 1849 const uint8* v_buf, | 2312 const uint8* v_buf, |
| 1850 uint8* dst_argb, | 2313 uint8* dst_argb, |
| 1851 int width) { | 2314 int width) { |
| 1852 __asm { | 2315 __asm { |
| 1853 push esi | 2316 push esi |
| 1854 push edi | 2317 push edi |
| 1855 mov eax, [esp + 8 + 4] // Y | 2318 mov eax, [esp + 8 + 4] // Y |
| 1856 mov esi, [esp + 8 + 8] // U | 2319 mov esi, [esp + 8 + 8] // U |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1907 | 2370 |
// Read 2 UV from 411, upsample to 8 UV.
// Inputs:  esi = U plane pointer, edi = (V plane - U plane) byte offset.
// Output:  xmm0 = 8 UV byte pairs, each source UV pair repeated 4x.
//          Advances esi by 2.  Clobbers ebx and xmm1; callers must
//          preserve ebx (see I411ToARGBRow_SSSE3, which pushes it).
#define READYUV411 __asm {                                                     \
    __asm movzx      ebx, word ptr [esi]          /* U */         /* NOLINT */ \
    __asm movd       xmm0, ebx                                                 \
    __asm movzx      ebx, word ptr [esi + edi]    /* V */         /* NOLINT */ \
    __asm movd       xmm1, ebx                                                 \
    __asm lea        esi,  [esi + 2]                                           \
    __asm punpcklbw  xmm0, xmm1                   /* UV */                     \
    __asm punpcklwd  xmm0, xmm0                   /* UVUV (upsample) */        \
    __asm punpckldq  xmm0, xmm0                   /* UVUVUVUV (upsample) */    \
  }
| 1919 | 2382 |
// Read 4 UV from NV12, upsample to 8 UV.
// Input:   esi = interleaved UV plane pointer.
// Output:  xmm0 = 8 UV byte pairs (each source UV doubled horizontally).
//          Advances esi by 8.
#define READNV12 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi]        /* UV */        /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm0, xmm0                   /* UVUV (upsample) */        \
  }
| 1926 | 2389 |
| 1927 // Convert 8 pixels: 8 UV and 8 Y. | 2390 // Convert 8 pixels: 8 UV and 8 Y. |
| (...skipping 28 matching lines...) Expand all Loading... |
| 1956 } | 2419 } |
| 1957 | 2420 |
// Store 8 ARGB values.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          xmm5 = alpha bytes, edx = destination pointer.
// Writes 32 bytes of interleaved B,G,R,A and advances edx by 32.
// Clobbers xmm0/xmm1.
#define STOREARGB __asm {                                                      \
    /* Step 3: Weave into ARGB */                                              \
    __asm punpcklbw  xmm0, xmm1                   /* BG */                     \
    __asm punpcklbw  xmm2, xmm5                   /* RA */                     \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2                   /* BGRA first 4 pixels */    \
    __asm punpckhwd  xmm1, xmm2                   /* BGRA next 4 pixels */     \
    __asm movdqu     0[edx], xmm0                                              \
    __asm movdqu     16[edx], xmm1                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
| 1970 | 2433 |
// Store 8 BGRA values.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          edx = destination pointer.  Regenerates alpha in xmm5.
// Writes 32 bytes of interleaved A,R,G,B (BGRA pixel order) and advances
// edx by 32.  Clobbers xmm0/xmm5.
#define STOREBGRA __asm {                                                      \
    /* Step 3: Weave into BGRA */                                              \
    __asm pcmpeqb    xmm5, xmm5            /* generate 0xffffffff for alpha */ \
    __asm punpcklbw  xmm1, xmm0                   /* GB */                     \
    __asm punpcklbw  xmm5, xmm2                   /* AR */                     \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1                   /* BGRA first 4 pixels */    \
    __asm punpckhwd  xmm0, xmm1                   /* BGRA next 4 pixels */     \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
| 1984 | 2447 |
// Store 8 ABGR values.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          xmm5 = alpha bytes, edx = destination pointer.
// Writes 32 bytes of interleaved R,G,B,A (ABGR pixel order) and advances
// edx by 32.  Clobbers xmm1/xmm2.
#define STOREABGR __asm {                                                      \
    /* Step 3: Weave into ABGR */                                              \
    __asm punpcklbw  xmm2, xmm1                   /* RG */                     \
    __asm punpcklbw  xmm0, xmm5                   /* BA */                     \
    __asm movdqa     xmm1, xmm2                                                \
    __asm punpcklwd  xmm2, xmm0                   /* RGBA first 4 pixels */    \
    __asm punpckhwd  xmm1, xmm0                   /* RGBA next 4 pixels */     \
    __asm movdqu     0[edx], xmm2                                              \
    __asm movdqu     16[edx], xmm1                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
| 1997 | 2460 |
// Store 8 RGBA values.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          edx = destination pointer.  Regenerates alpha in xmm5.
// Writes 32 bytes of interleaved A,B,G,R (RGBA pixel order) and advances
// edx by 32.  Clobbers xmm0/xmm5.
#define STORERGBA __asm {                                                      \
    /* Step 3: Weave into RGBA */                                              \
    __asm pcmpeqb    xmm5, xmm5            /* generate 0xffffffff for alpha */ \
    __asm punpcklbw  xmm1, xmm2                   /* GR */                     \
    __asm punpcklbw  xmm5, xmm0                   /* AB */                     \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1                   /* RGBA first 4 pixels */    \
    __asm punpckhwd  xmm0, xmm1                   /* RGBA next 4 pixels */     \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
| 2011 | 2474 |
// Store 8 RGB24 values.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          xmm5/xmm6 = pshufb pack masks (loaded by the caller),
//          edx = destination pointer.
// Packs BGRR words down to 24 bytes of B,G,R and advances edx by 24.
// Note the second store writes 16 bytes at offset 8, i.e. [8, 24).
// Clobbers xmm0/xmm1.
#define STORERGB24 __asm {                                                     \
    /* Step 3: Weave into RRGB */                                              \
    __asm punpcklbw  xmm0, xmm1                   /* BG */                     \
    __asm punpcklbw  xmm2, xmm2                   /* RR */                     \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2                   /* BGRR first 4 pixels */    \
    __asm punpckhwd  xmm1, xmm2                   /* BGRR next 4 pixels */     \
    /* Step 4: RRGB -> RGB24 */                                                \
    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
    __asm movq       qword ptr 0[edx], xmm0       /* First 8 bytes */          \
    __asm movdqu     8[edx], xmm1                 /* Last 16 bytes */          \
    __asm lea        edx,  [edx + 24]                                          \
  }
| 2028 | 2491 |
// Store 8 RAW values.
// Same packing scheme as STORERGB24 but the caller loads R,G,B-order
// pshufb masks into xmm5/xmm6 to emit RAW byte order.
// Inputs:  xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes,
//          edx = destination pointer.
// Writes 24 bytes and advances edx by 24.  Clobbers xmm0/xmm1.
#define STORERAW __asm {                                                       \
    /* Step 3: Weave into RRGB */                                              \
    __asm punpcklbw  xmm0, xmm1                   /* BG */                     \
    __asm punpcklbw  xmm2, xmm2                   /* RR */                     \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2                   /* BGRR first 4 pixels */    \
    __asm punpckhwd  xmm1, xmm2                   /* BGRR next 4 pixels */     \
    /* Step 4: RRGB -> RAW */                                                  \
    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
    __asm movq       qword ptr 0[edx], xmm0       /* First 8 bytes */          \
    __asm movdqu     8[edx], xmm1                 /* Last 16 bytes */          \
    __asm lea        edx,  [edx + 24]                                          \
  }
| 2045 | 2508 |
| 2046 // Store 8 RGB565 values. | 2509 // Store 8 RGB565 values. |
| 2047 #define STORERGB565 __asm { \ | 2510 #define STORERGB565 __asm { \ |
| 2048 /* Step 3: Weave into RRGB */ \ | 2511 /* Step 3: Weave into RRGB */ \ |
| 2049 __asm punpcklbw xmm0, xmm1 /* BG */ \ | 2512 __asm punpcklbw xmm0, xmm1 /* BG */ \ |
| 2050 __asm punpcklbw xmm2, xmm2 /* RR */ \ | 2513 __asm punpcklbw xmm2, xmm2 /* RR */ \ |
| 2051 __asm movdqa xmm1, xmm0 \ | 2514 __asm movdqa xmm1, xmm0 \ |
| 2052 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ | 2515 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
| (...skipping 15 matching lines...) Expand all Loading... |
| 2068 __asm pslld xmm1, 8 /* R */ \ | 2531 __asm pslld xmm1, 8 /* R */ \ |
| 2069 __asm psrld xmm3, 3 /* B */ \ | 2532 __asm psrld xmm3, 3 /* B */ \ |
| 2070 __asm psrld xmm2, 5 /* G */ \ | 2533 __asm psrld xmm2, 5 /* G */ \ |
| 2071 __asm psrad xmm1, 16 /* R */ \ | 2534 __asm psrad xmm1, 16 /* R */ \ |
| 2072 __asm pand xmm3, xmm5 /* B */ \ | 2535 __asm pand xmm3, xmm5 /* B */ \ |
| 2073 __asm pand xmm2, xmm6 /* G */ \ | 2536 __asm pand xmm2, xmm6 /* G */ \ |
| 2074 __asm pand xmm1, xmm7 /* R */ \ | 2537 __asm pand xmm1, xmm7 /* R */ \ |
| 2075 __asm por xmm3, xmm2 /* BG */ \ | 2538 __asm por xmm3, xmm2 /* BG */ \ |
| 2076 __asm por xmm1, xmm3 /* BGR */ \ | 2539 __asm por xmm1, xmm3 /* BGR */ \ |
| 2077 __asm packssdw xmm0, xmm1 \ | 2540 __asm packssdw xmm0, xmm1 \ |
| 2078 __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \ | 2541 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ |
| 2079 __asm lea edx, [edx + 16] \ | 2542 __asm lea edx, [edx + 16] \ |
| 2080 } | 2543 } |
| 2081 | 2544 |
| 2082 // 8 pixels. | 2545 // 8 pixels. |
| 2083 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). | 2546 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2084 __declspec(naked) __declspec(align(16)) | 2547 __declspec(naked) |
| 2085 void I444ToARGBRow_SSSE3(const uint8* y_buf, | 2548 void I444ToARGBRow_SSSE3(const uint8* y_buf, |
| 2086 const uint8* u_buf, | 2549 const uint8* u_buf, |
| 2087 const uint8* v_buf, | 2550 const uint8* v_buf, |
| 2088 uint8* dst_argb, | 2551 uint8* dst_argb, |
| 2089 int width) { | 2552 int width) { |
| 2090 __asm { | 2553 __asm { |
| 2091 push esi | 2554 push esi |
| 2092 push edi | 2555 push edi |
| 2093 mov eax, [esp + 8 + 4] // Y | 2556 mov eax, [esp + 8 + 4] // Y |
| 2094 mov esi, [esp + 8 + 8] // U | 2557 mov esi, [esp + 8 + 8] // U |
| (...skipping 12 matching lines...) Expand all Loading... |
| 2107 jg convertloop | 2570 jg convertloop |
| 2108 | 2571 |
| 2109 pop edi | 2572 pop edi |
| 2110 pop esi | 2573 pop esi |
| 2111 ret | 2574 ret |
| 2112 } | 2575 } |
| 2113 } | 2576 } |
| 2114 | 2577 |
| 2115 // 8 pixels. | 2578 // 8 pixels. |
| 2116 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). | 2579 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). |
| 2117 __declspec(naked) __declspec(align(16)) | 2580 __declspec(naked) |
| 2118 void I422ToRGB24Row_SSSE3(const uint8* y_buf, | 2581 void I422ToRGB24Row_SSSE3(const uint8* y_buf, |
| 2119 const uint8* u_buf, | 2582 const uint8* u_buf, |
| 2120 const uint8* v_buf, | 2583 const uint8* v_buf, |
| 2121 uint8* dst_rgb24, | 2584 uint8* dst_rgb24, |
| 2122 int width) { | 2585 int width) { |
| 2123 __asm { | 2586 __asm { |
| 2124 push esi | 2587 push esi |
| 2125 push edi | 2588 push edi |
| 2126 mov eax, [esp + 8 + 4] // Y | 2589 mov eax, [esp + 8 + 4] // Y |
| 2127 mov esi, [esp + 8 + 8] // U | 2590 mov esi, [esp + 8 + 8] // U |
| (...skipping 13 matching lines...) Expand all Loading... |
| 2141 jg convertloop | 2604 jg convertloop |
| 2142 | 2605 |
| 2143 pop edi | 2606 pop edi |
| 2144 pop esi | 2607 pop esi |
| 2145 ret | 2608 ret |
| 2146 } | 2609 } |
| 2147 } | 2610 } |
| 2148 | 2611 |
| 2149 // 8 pixels. | 2612 // 8 pixels. |
| 2150 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). | 2613 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). |
| 2151 __declspec(naked) __declspec(align(16)) | 2614 __declspec(naked) |
| 2152 void I422ToRAWRow_SSSE3(const uint8* y_buf, | 2615 void I422ToRAWRow_SSSE3(const uint8* y_buf, |
| 2153 const uint8* u_buf, | 2616 const uint8* u_buf, |
| 2154 const uint8* v_buf, | 2617 const uint8* v_buf, |
| 2155 uint8* dst_raw, | 2618 uint8* dst_raw, |
| 2156 int width) { | 2619 int width) { |
| 2157 __asm { | 2620 __asm { |
| 2158 push esi | 2621 push esi |
| 2159 push edi | 2622 push edi |
| 2160 mov eax, [esp + 8 + 4] // Y | 2623 mov eax, [esp + 8 + 4] // Y |
| 2161 mov esi, [esp + 8 + 8] // U | 2624 mov esi, [esp + 8 + 8] // U |
| (...skipping 13 matching lines...) Expand all Loading... |
| 2175 jg convertloop | 2638 jg convertloop |
| 2176 | 2639 |
| 2177 pop edi | 2640 pop edi |
| 2178 pop esi | 2641 pop esi |
| 2179 ret | 2642 ret |
| 2180 } | 2643 } |
| 2181 } | 2644 } |
| 2182 | 2645 |
| 2183 // 8 pixels | 2646 // 8 pixels |
| 2184 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). | 2647 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). |
| 2185 __declspec(naked) __declspec(align(16)) | 2648 __declspec(naked) |
| 2186 void I422ToRGB565Row_SSSE3(const uint8* y_buf, | 2649 void I422ToRGB565Row_SSSE3(const uint8* y_buf, |
| 2187 const uint8* u_buf, | 2650 const uint8* u_buf, |
| 2188 const uint8* v_buf, | 2651 const uint8* v_buf, |
| 2189 uint8* rgb565_buf, | 2652 uint8* rgb565_buf, |
| 2190 int width) { | 2653 int width) { |
| 2191 __asm { | 2654 __asm { |
| 2192 push esi | 2655 push esi |
| 2193 push edi | 2656 push edi |
| 2194 mov eax, [esp + 8 + 4] // Y | 2657 mov eax, [esp + 8 + 4] // Y |
| 2195 mov esi, [esp + 8 + 8] // U | 2658 mov esi, [esp + 8 + 8] // U |
| (...skipping 18 matching lines...) Expand all Loading... |
| 2214 jg convertloop | 2677 jg convertloop |
| 2215 | 2678 |
| 2216 pop edi | 2679 pop edi |
| 2217 pop esi | 2680 pop esi |
| 2218 ret | 2681 ret |
| 2219 } | 2682 } |
| 2220 } | 2683 } |
| 2221 | 2684 |
| 2222 // 8 pixels. | 2685 // 8 pixels. |
| 2223 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2686 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2224 __declspec(naked) __declspec(align(16)) | 2687 __declspec(naked) |
| 2225 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 2688 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
| 2226 const uint8* u_buf, | 2689 const uint8* u_buf, |
| 2227 const uint8* v_buf, | 2690 const uint8* v_buf, |
| 2228 uint8* dst_argb, | 2691 uint8* dst_argb, |
| 2229 int width) { | 2692 int width) { |
| 2230 __asm { | 2693 __asm { |
| 2231 push esi | 2694 push esi |
| 2232 push edi | 2695 push edi |
| 2233 mov eax, [esp + 8 + 4] // Y | 2696 mov eax, [esp + 8 + 4] // Y |
| 2234 mov esi, [esp + 8 + 8] // U | 2697 mov esi, [esp + 8 + 8] // U |
| (...skipping 11 matching lines...) Expand all Loading... |
| 2246 sub ecx, 8 | 2709 sub ecx, 8 |
| 2247 jg convertloop | 2710 jg convertloop |
| 2248 | 2711 |
| 2249 pop edi | 2712 pop edi |
| 2250 pop esi | 2713 pop esi |
| 2251 ret | 2714 ret |
| 2252 } | 2715 } |
| 2253 } | 2716 } |
| 2254 | 2717 |
| 2255 // 8 pixels. | 2718 // 8 pixels. |
// JPeg color space version of I422ToARGB
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Identical control flow to I422ToARGBRow_SSSE3 but converts with
// kYuvJConstants (full-range JPeg YUV).  The loop runs ceil(width/8)
// times, writing 32 bytes each pass.
__declspec(naked)
void J422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi             // edi = V - U, used by READYUV422
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READYUV422
    YUVTORGB(kYuvJConstants)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 2751 |
| 2752 // 8 pixels. |
| 2256 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2753 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2257 // Similar to I420 but duplicate UV once more. | 2754 // Similar to I420 but duplicate UV once more. |
| 2258 __declspec(naked) __declspec(align(16)) | 2755 __declspec(naked) |
| 2259 void I411ToARGBRow_SSSE3(const uint8* y_buf, | 2756 void I411ToARGBRow_SSSE3(const uint8* y_buf, |
| 2260 const uint8* u_buf, | 2757 const uint8* u_buf, |
| 2261 const uint8* v_buf, | 2758 const uint8* v_buf, |
| 2262 uint8* dst_argb, | 2759 uint8* dst_argb, |
| 2263 int width) { | 2760 int width) { |
| 2264 __asm { | 2761 __asm { |
| 2265 push ebx | 2762 push ebx |
| 2266 push esi | 2763 push esi |
| 2267 push edi | 2764 push edi |
| 2268 mov eax, [esp + 12 + 4] // Y | 2765 mov eax, [esp + 12 + 4] // Y |
| (...skipping 14 matching lines...) Expand all Loading... |
| 2283 | 2780 |
| 2284 pop edi | 2781 pop edi |
| 2285 pop esi | 2782 pop esi |
| 2286 pop ebx | 2783 pop ebx |
| 2287 ret | 2784 ret |
| 2288 } | 2785 } |
| 2289 } | 2786 } |
| 2290 | 2787 |
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// NV12 supplies chroma as a single interleaved UV plane, so only two source
// pointers are needed.  The loop runs ceil(width/8) times, writing 32 bytes
// each pass.  SSE-only code, so no vzeroupper is required.
__declspec(naked)
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // UV
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READNV12
    YUVTORGB(kYuvConstants)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}
| 2318 | 2815 |
| 2319 // 8 pixels. | 2816 // 8 pixels. |
| 2320 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). | 2817 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2321 __declspec(naked) __declspec(align(16)) | 2818 __declspec(naked) |
| 2322 void NV21ToARGBRow_SSSE3(const uint8* y_buf, | 2819 void NV21ToARGBRow_SSSE3(const uint8* y_buf, |
| 2323 const uint8* uv_buf, | 2820 const uint8* uv_buf, |
| 2324 uint8* dst_argb, | 2821 uint8* dst_argb, |
| 2325 int width) { | 2822 int width) { |
| 2326 __asm { | 2823 __asm { |
| 2327 push esi | 2824 push esi |
| 2328 mov eax, [esp + 4 + 4] // Y | 2825 mov eax, [esp + 4 + 4] // Y |
| 2329 mov esi, [esp + 4 + 8] // UV | 2826 mov esi, [esp + 4 + 8] // UV |
| 2330 mov edx, [esp + 4 + 12] // argb | 2827 mov edx, [esp + 4 + 12] // argb |
| 2331 mov ecx, [esp + 4 + 16] // width | 2828 mov ecx, [esp + 4 + 16] // width |
| 2332 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2829 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2333 | 2830 |
| 2334 convertloop: | 2831 convertloop: |
| 2335 READNV12 | 2832 READNV12 |
| 2336 YUVTORGB(kYvuConstants) | 2833 YUVTORGB(kYvuConstants) |
| 2337 STOREARGB | 2834 STOREARGB |
| 2338 | 2835 |
| 2339 sub ecx, 8 | 2836 sub ecx, 8 |
| 2340 jg convertloop | 2837 jg convertloop |
| 2341 | 2838 |
| 2342 pop esi | 2839 pop esi |
| 2343 ret | 2840 ret |
| 2344 } | 2841 } |
| 2345 } | 2842 } |
| 2346 | 2843 |
| 2347 __declspec(naked) __declspec(align(16)) | 2844 __declspec(naked) |
| 2348 void I422ToBGRARow_SSSE3(const uint8* y_buf, | 2845 void I422ToBGRARow_SSSE3(const uint8* y_buf, |
| 2349 const uint8* u_buf, | 2846 const uint8* u_buf, |
| 2350 const uint8* v_buf, | 2847 const uint8* v_buf, |
| 2351 uint8* dst_bgra, | 2848 uint8* dst_bgra, |
| 2352 int width) { | 2849 int width) { |
| 2353 __asm { | 2850 __asm { |
| 2354 push esi | 2851 push esi |
| 2355 push edi | 2852 push edi |
| 2356 mov eax, [esp + 8 + 4] // Y | 2853 mov eax, [esp + 8 + 4] // Y |
| 2357 mov esi, [esp + 8 + 8] // U | 2854 mov esi, [esp + 8 + 8] // U |
| 2358 mov edi, [esp + 8 + 12] // V | 2855 mov edi, [esp + 8 + 12] // V |
| 2359 mov edx, [esp + 8 + 16] // bgra | 2856 mov edx, [esp + 8 + 16] // bgra |
| 2360 mov ecx, [esp + 8 + 20] // width | 2857 mov ecx, [esp + 8 + 20] // width |
| 2361 sub edi, esi | 2858 sub edi, esi |
| 2362 | 2859 |
| 2363 convertloop: | 2860 convertloop: |
| 2364 READYUV422 | 2861 READYUV422 |
| 2365 YUVTORGB(kYuvConstants) | 2862 YUVTORGB(kYuvConstants) |
| 2366 STOREBGRA | 2863 STOREBGRA |
| 2367 | 2864 |
| 2368 sub ecx, 8 | 2865 sub ecx, 8 |
| 2369 jg convertloop | 2866 jg convertloop |
| 2370 | 2867 |
| 2371 pop edi | 2868 pop edi |
| 2372 pop esi | 2869 pop esi |
| 2373 ret | 2870 ret |
| 2374 } | 2871 } |
| 2375 } | 2872 } |
| 2376 | 2873 |
| 2377 __declspec(naked) __declspec(align(16)) | 2874 __declspec(naked) |
| 2378 void I422ToABGRRow_SSSE3(const uint8* y_buf, | 2875 void I422ToABGRRow_SSSE3(const uint8* y_buf, |
| 2379 const uint8* u_buf, | 2876 const uint8* u_buf, |
| 2380 const uint8* v_buf, | 2877 const uint8* v_buf, |
| 2381 uint8* dst_abgr, | 2878 uint8* dst_abgr, |
| 2382 int width) { | 2879 int width) { |
| 2383 __asm { | 2880 __asm { |
| 2384 push esi | 2881 push esi |
| 2385 push edi | 2882 push edi |
| 2386 mov eax, [esp + 8 + 4] // Y | 2883 mov eax, [esp + 8 + 4] // Y |
| 2387 mov esi, [esp + 8 + 8] // U | 2884 mov esi, [esp + 8 + 8] // U |
| (...skipping 10 matching lines...) Expand all Loading... |
| 2398 | 2895 |
| 2399 sub ecx, 8 | 2896 sub ecx, 8 |
| 2400 jg convertloop | 2897 jg convertloop |
| 2401 | 2898 |
| 2402 pop edi | 2899 pop edi |
| 2403 pop esi | 2900 pop esi |
| 2404 ret | 2901 ret |
| 2405 } | 2902 } |
| 2406 } | 2903 } |
| 2407 | 2904 |
| 2408 __declspec(naked) __declspec(align(16)) | 2905 __declspec(naked) |
| 2409 void I422ToRGBARow_SSSE3(const uint8* y_buf, | 2906 void I422ToRGBARow_SSSE3(const uint8* y_buf, |
| 2410 const uint8* u_buf, | 2907 const uint8* u_buf, |
| 2411 const uint8* v_buf, | 2908 const uint8* v_buf, |
| 2412 uint8* dst_rgba, | 2909 uint8* dst_rgba, |
| 2413 int width) { | 2910 int width) { |
| 2414 __asm { | 2911 __asm { |
| 2415 push esi | 2912 push esi |
| 2416 push edi | 2913 push edi |
| 2417 mov eax, [esp + 8 + 4] // Y | 2914 mov eax, [esp + 8 + 4] // Y |
| 2418 mov esi, [esp + 8 + 8] // U | 2915 mov esi, [esp + 8 + 8] // U |
| (...skipping 11 matching lines...) Expand all Loading... |
| 2430 jg convertloop | 2927 jg convertloop |
| 2431 | 2928 |
| 2432 pop edi | 2929 pop edi |
| 2433 pop esi | 2930 pop esi |
| 2434 ret | 2931 ret |
| 2435 } | 2932 } |
| 2436 } | 2933 } |
| 2437 | 2934 |
| 2438 #endif // HAS_I422TOARGBROW_SSSE3 | 2935 #endif // HAS_I422TOARGBROW_SSSE3 |
| 2439 | 2936 |
| 2440 #ifdef HAS_YTOARGBROW_SSE2 | 2937 #ifdef HAS_I400TOARGBROW_SSE2 |
| 2441 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). | 2938 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). |
| 2442 __declspec(naked) __declspec(align(16)) | 2939 __declspec(naked) |
| 2443 void YToARGBRow_SSE2(const uint8* y_buf, | 2940 void I400ToARGBRow_SSE2(const uint8* y_buf, |
| 2444 uint8* rgb_buf, | 2941 uint8* rgb_buf, |
| 2445 int width) { | 2942 int width) { |
| 2446 __asm { | 2943 __asm { |
| 2447 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) | 2944 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) |
| 2448 movd xmm2, eax | 2945 movd xmm2, eax |
| 2449 pshufd xmm2, xmm2,0 | 2946 pshufd xmm2, xmm2,0 |
| 2450 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) | 2947 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) |
| 2451 movd xmm3, eax | 2948 movd xmm3, eax |
| 2452 pshufd xmm3, xmm3, 0 | 2949 pshufd xmm3, xmm3, 0 |
| 2453 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 2950 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
| 2454 pslld xmm4, 24 | 2951 pslld xmm4, 24 |
| 2455 | 2952 |
| (...skipping 19 matching lines...) Expand all Loading... |
| 2475 por xmm0, xmm4 | 2972 por xmm0, xmm4 |
| 2476 por xmm1, xmm4 | 2973 por xmm1, xmm4 |
| 2477 movdqu [edx], xmm0 | 2974 movdqu [edx], xmm0 |
| 2478 movdqu [edx + 16], xmm1 | 2975 movdqu [edx + 16], xmm1 |
| 2479 lea edx, [edx + 32] | 2976 lea edx, [edx + 32] |
| 2480 sub ecx, 8 | 2977 sub ecx, 8 |
| 2481 jg convertloop | 2978 jg convertloop |
| 2482 ret | 2979 ret |
| 2483 } | 2980 } |
| 2484 } | 2981 } |
| 2485 #endif // HAS_YTOARGBROW_SSE2 | 2982 #endif // HAS_I400TOARGBROW_SSE2 |
| 2486 | 2983 |
| 2487 #ifdef HAS_YTOARGBROW_AVX2 | 2984 #ifdef HAS_I400TOARGBROW_AVX2 |
| 2488 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). | 2985 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). |
| 2489 // note: vpunpcklbw mutates and vpackuswb unmutates. | 2986 // note: vpunpcklbw mutates and vpackuswb unmutates. |
| 2490 __declspec(naked) __declspec(align(16)) | 2987 __declspec(naked) |
| 2491 void YToARGBRow_AVX2(const uint8* y_buf, | 2988 void I400ToARGBRow_AVX2(const uint8* y_buf, |
| 2492 uint8* rgb_buf, | 2989 uint8* rgb_buf, |
| 2493 int width) { | 2990 int width) { |
| 2494 __asm { | 2991 __asm { |
| 2495 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) | 2992 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) |
| 2496 vmovd xmm2, eax | 2993 vmovd xmm2, eax |
| 2497 vbroadcastss ymm2, xmm2 | 2994 vbroadcastss ymm2, xmm2 |
| 2498 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) | 2995 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) |
| 2499 vmovd xmm3, eax | 2996 vmovd xmm3, eax |
| 2500 vbroadcastss ymm3, xmm3 | 2997 vbroadcastss ymm3, xmm3 |
| 2501 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 | 2998 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 |
| 2502 vpslld ymm4, ymm4, 24 | 2999 vpslld ymm4, ymm4, 24 |
| 2503 | 3000 |
| 2504 mov eax, [esp + 4] // Y | 3001 mov eax, [esp + 4] // Y |
| 2505 mov edx, [esp + 8] // rgb | 3002 mov edx, [esp + 8] // rgb |
| 2506 mov ecx, [esp + 12] // width | 3003 mov ecx, [esp + 12] // width |
| 2507 | 3004 |
| 2508 convertloop: | 3005 convertloop: |
| 2509 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 | 3006 // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 |
| 2510 vmovdqu xmm0, [eax] | 3007 vmovdqu xmm0, [eax] |
| 2511 lea eax, [eax + 16] | 3008 lea eax, [eax + 16] |
| 2512 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates | 3009 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates |
| 2513 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y | 3010 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y |
| 2514 vpmulhuw ymm0, ymm0, ymm2 | 3011 vpmulhuw ymm0, ymm0, ymm2 |
| 2515 vpsubusw ymm0, ymm0, ymm3 | 3012 vpsubusw ymm0, ymm0, ymm3 |
| 2516 vpsrlw ymm0, ymm0, 6 | 3013 vpsrlw ymm0, ymm0, 6 |
| 2517 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 | 3014 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 |
| 2518 | 3015 |
| 2519 // TODO(fbarchard): Weave alpha with unpack. | 3016 // TODO(fbarchard): Weave alpha with unpack. |
| 2520 // Step 2: Weave into ARGB | 3017 // Step 2: Weave into ARGB |
| 2521 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates | 3018 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates |
| 2522 vpermq ymm1, ymm1, 0xd8 | 3019 vpermq ymm1, ymm1, 0xd8 |
| 2523 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels | 3020 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels |
| 2524 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels | 3021 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels |
| 2525 vpor ymm0, ymm0, ymm4 | 3022 vpor ymm0, ymm0, ymm4 |
| 2526 vpor ymm1, ymm1, ymm4 | 3023 vpor ymm1, ymm1, ymm4 |
| 2527 vmovdqu [edx], ymm0 | 3024 vmovdqu [edx], ymm0 |
| 2528 vmovdqu [edx + 32], ymm1 | 3025 vmovdqu [edx + 32], ymm1 |
| 2529 lea edx, [edx + 64] | 3026 lea edx, [edx + 64] |
| 2530 sub ecx, 16 | 3027 sub ecx, 16 |
| 2531 jg convertloop | 3028 jg convertloop |
| 2532 vzeroupper | 3029 vzeroupper |
| 2533 ret | 3030 ret |
| 2534 } | 3031 } |
| 2535 } | 3032 } |
| 2536 #endif // HAS_YTOARGBROW_AVX2 | 3033 #endif // HAS_I400TOARGBROW_AVX2 |
| 2537 | 3034 |
| 2538 #ifdef HAS_MIRRORROW_SSSE3 | 3035 #ifdef HAS_MIRRORROW_SSSE3 |
| 2539 // Shuffle table for reversing the bytes. | 3036 // Shuffle table for reversing the bytes. |
| 2540 static const uvec8 kShuffleMirror = { | 3037 static const uvec8 kShuffleMirror = { |
| 2541 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 3038 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
| 2542 }; | 3039 }; |
| 2543 | 3040 |
| 2544 // TODO(fbarchard): Replace lea with -16 offset. | 3041 // TODO(fbarchard): Replace lea with -16 offset. |
| 2545 __declspec(naked) __declspec(align(16)) | 3042 __declspec(naked) |
| 2546 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { | 3043 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
| 2547 __asm { | 3044 __asm { |
| 2548 mov eax, [esp + 4] // src | 3045 mov eax, [esp + 4] // src |
| 2549 mov edx, [esp + 8] // dst | 3046 mov edx, [esp + 8] // dst |
| 2550 mov ecx, [esp + 12] // width | 3047 mov ecx, [esp + 12] // width |
| 2551 movdqa xmm5, kShuffleMirror | 3048 movdqa xmm5, kShuffleMirror |
| 2552 | 3049 |
| 2553 convertloop: | 3050 convertloop: |
| 2554 movdqu xmm0, [eax - 16 + ecx] | 3051 movdqu xmm0, [eax - 16 + ecx] |
| 2555 pshufb xmm0, xmm5 | 3052 pshufb xmm0, xmm5 |
| 2556 movdqu [edx], xmm0 | 3053 movdqu [edx], xmm0 |
| 2557 lea edx, [edx + 16] | 3054 lea edx, [edx + 16] |
| 2558 sub ecx, 16 | 3055 sub ecx, 16 |
| 2559 jg convertloop | 3056 jg convertloop |
| 2560 ret | 3057 ret |
| 2561 } | 3058 } |
| 2562 } | 3059 } |
| 2563 #endif // HAS_MIRRORROW_SSSE3 | 3060 #endif // HAS_MIRRORROW_SSSE3 |
| 2564 | 3061 |
| 2565 #ifdef HAS_MIRRORROW_AVX2 | 3062 #ifdef HAS_MIRRORROW_AVX2 |
| 2566 __declspec(naked) __declspec(align(16)) | 3063 __declspec(naked) |
| 2567 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | 3064 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 2568 __asm { | 3065 __asm { |
| 2569 mov eax, [esp + 4] // src | 3066 mov eax, [esp + 4] // src |
| 2570 mov edx, [esp + 8] // dst | 3067 mov edx, [esp + 8] // dst |
| 2571 mov ecx, [esp + 12] // width | 3068 mov ecx, [esp + 12] // width |
| 2572 vbroadcastf128 ymm5, kShuffleMirror | 3069 vbroadcastf128 ymm5, kShuffleMirror |
| 2573 | 3070 |
| 2574 convertloop: | 3071 convertloop: |
| 2575 vmovdqu ymm0, [eax - 32 + ecx] | 3072 vmovdqu ymm0, [eax - 32 + ecx] |
| 2576 vpshufb ymm0, ymm0, ymm5 | 3073 vpshufb ymm0, ymm0, ymm5 |
| 2577 vpermq ymm0, ymm0, 0x4e // swap high and low halfs | 3074 vpermq ymm0, ymm0, 0x4e // swap high and low halfs |
| 2578 vmovdqu [edx], ymm0 | 3075 vmovdqu [edx], ymm0 |
| 2579 lea edx, [edx + 32] | 3076 lea edx, [edx + 32] |
| 2580 sub ecx, 32 | 3077 sub ecx, 32 |
| 2581 jg convertloop | 3078 jg convertloop |
| 2582 vzeroupper | 3079 vzeroupper |
| 2583 ret | 3080 ret |
| 2584 } | 3081 } |
| 2585 } | 3082 } |
| 2586 #endif // HAS_MIRRORROW_AVX2 | 3083 #endif // HAS_MIRRORROW_AVX2 |
| 2587 | 3084 |
| 2588 #ifdef HAS_MIRRORROW_SSE2 | 3085 #ifdef HAS_MIRRORROW_SSE2 |
| 2589 __declspec(naked) __declspec(align(16)) | 3086 __declspec(naked) |
| 2590 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | 3087 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 2591 __asm { | 3088 __asm { |
| 2592 mov eax, [esp + 4] // src | 3089 mov eax, [esp + 4] // src |
| 2593 mov edx, [esp + 8] // dst | 3090 mov edx, [esp + 8] // dst |
| 2594 mov ecx, [esp + 12] // width | 3091 mov ecx, [esp + 12] // width |
| 2595 | 3092 |
| 2596 convertloop: | 3093 convertloop: |
| 2597 movdqu xmm0, [eax - 16 + ecx] | 3094 movdqu xmm0, [eax - 16 + ecx] |
| 2598 movdqa xmm1, xmm0 // swap bytes | 3095 movdqa xmm1, xmm0 // swap bytes |
| 2599 psllw xmm0, 8 | 3096 psllw xmm0, 8 |
| (...skipping 10 matching lines...) Expand all Loading... |
| 2610 } | 3107 } |
| 2611 } | 3108 } |
| 2612 #endif // HAS_MIRRORROW_SSE2 | 3109 #endif // HAS_MIRRORROW_SSE2 |
| 2613 | 3110 |
| 2614 #ifdef HAS_MIRRORROW_UV_SSSE3 | 3111 #ifdef HAS_MIRRORROW_UV_SSSE3 |
| 2615 // Shuffle table for reversing the bytes of UV channels. | 3112 // Shuffle table for reversing the bytes of UV channels. |
| 2616 static const uvec8 kShuffleMirrorUV = { | 3113 static const uvec8 kShuffleMirrorUV = { |
| 2617 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | 3114 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u |
| 2618 }; | 3115 }; |
| 2619 | 3116 |
| 2620 __declspec(naked) __declspec(align(16)) | 3117 __declspec(naked) |
| 2621 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | 3118 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, |
| 2622 int width) { | 3119 int width) { |
| 2623 __asm { | 3120 __asm { |
| 2624 push edi | 3121 push edi |
| 2625 mov eax, [esp + 4 + 4] // src | 3122 mov eax, [esp + 4 + 4] // src |
| 2626 mov edx, [esp + 4 + 8] // dst_u | 3123 mov edx, [esp + 4 + 8] // dst_u |
| 2627 mov edi, [esp + 4 + 12] // dst_v | 3124 mov edi, [esp + 4 + 12] // dst_v |
| 2628 mov ecx, [esp + 4 + 16] // width | 3125 mov ecx, [esp + 4 + 16] // width |
| 2629 movdqa xmm1, kShuffleMirrorUV | 3126 movdqa xmm1, kShuffleMirrorUV |
| 2630 lea eax, [eax + ecx * 2 - 16] | 3127 lea eax, [eax + ecx * 2 - 16] |
| 2631 sub edi, edx | 3128 sub edi, edx |
| 2632 | 3129 |
| 2633 convertloop: | 3130 convertloop: |
| 2634 movdqu xmm0, [eax] | 3131 movdqu xmm0, [eax] |
| 2635 lea eax, [eax - 16] | 3132 lea eax, [eax - 16] |
| 2636 pshufb xmm0, xmm1 | 3133 pshufb xmm0, xmm1 |
| 2637 movlpd qword ptr [edx], xmm0 | 3134 movlpd qword ptr [edx], xmm0 |
| 2638 movhpd qword ptr [edx + edi], xmm0 | 3135 movhpd qword ptr [edx + edi], xmm0 |
| 2639 lea edx, [edx + 8] | 3136 lea edx, [edx + 8] |
| 2640 sub ecx, 8 | 3137 sub ecx, 8 |
| 2641 jg convertloop | 3138 jg convertloop |
| 2642 | 3139 |
| 2643 pop edi | 3140 pop edi |
| 2644 ret | 3141 ret |
| 2645 } | 3142 } |
| 2646 } | 3143 } |
| 2647 #endif // HAS_MIRRORROW_UV_SSSE3 | 3144 #endif // HAS_MIRRORROW_UV_SSSE3 |
| 2648 | 3145 |
| 2649 #ifdef HAS_ARGBMIRRORROW_SSE2 | 3146 #ifdef HAS_ARGBMIRRORROW_SSE2 |
| 2650 __declspec(naked) __declspec(align(16)) | 3147 __declspec(naked) |
| 2651 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | 3148 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 2652 __asm { | 3149 __asm { |
| 2653 mov eax, [esp + 4] // src | 3150 mov eax, [esp + 4] // src |
| 2654 mov edx, [esp + 8] // dst | 3151 mov edx, [esp + 8] // dst |
| 2655 mov ecx, [esp + 12] // width | 3152 mov ecx, [esp + 12] // width |
| 2656 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. | 3153 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. |
| 2657 | 3154 |
| 2658 convertloop: | 3155 convertloop: |
| 2659 movdqu xmm0, [eax] | 3156 movdqu xmm0, [eax] |
| 2660 lea eax, [eax - 16] | 3157 lea eax, [eax - 16] |
| 2661 pshufd xmm0, xmm0, 0x1b | 3158 pshufd xmm0, xmm0, 0x1b |
| 2662 movdqu [edx], xmm0 | 3159 movdqu [edx], xmm0 |
| 2663 lea edx, [edx + 16] | 3160 lea edx, [edx + 16] |
| 2664 sub ecx, 4 | 3161 sub ecx, 4 |
| 2665 jg convertloop | 3162 jg convertloop |
| 2666 ret | 3163 ret |
| 2667 } | 3164 } |
| 2668 } | 3165 } |
| 2669 #endif // HAS_ARGBMIRRORROW_SSE2 | 3166 #endif // HAS_ARGBMIRRORROW_SSE2 |
| 2670 | 3167 |
| 2671 #ifdef HAS_ARGBMIRRORROW_AVX2 | 3168 #ifdef HAS_ARGBMIRRORROW_AVX2 |
| 2672 // Shuffle table for reversing the bytes. | 3169 // Shuffle table for reversing the bytes. |
| 2673 static const ulvec32 kARGBShuffleMirror_AVX2 = { | 3170 static const ulvec32 kARGBShuffleMirror_AVX2 = { |
| 2674 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 3171 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
| 2675 }; | 3172 }; |
| 2676 | 3173 |
| 2677 __declspec(naked) __declspec(align(16)) | 3174 __declspec(naked) |
| 2678 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | 3175 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 2679 __asm { | 3176 __asm { |
| 2680 mov eax, [esp + 4] // src | 3177 mov eax, [esp + 4] // src |
| 2681 mov edx, [esp + 8] // dst | 3178 mov edx, [esp + 8] // dst |
| 2682 mov ecx, [esp + 12] // width | 3179 mov ecx, [esp + 12] // width |
| 2683 vmovdqu ymm5, kARGBShuffleMirror_AVX2 | 3180 vmovdqu ymm5, kARGBShuffleMirror_AVX2 |
| 2684 | 3181 |
| 2685 convertloop: | 3182 convertloop: |
| 2686 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order | 3183 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order |
| 2687 vmovdqu [edx], ymm0 | 3184 vmovdqu [edx], ymm0 |
| 2688 lea edx, [edx + 32] | 3185 lea edx, [edx + 32] |
| 2689 sub ecx, 8 | 3186 sub ecx, 8 |
| 2690 jg convertloop | 3187 jg convertloop |
| 2691 vzeroupper | 3188 vzeroupper |
| 2692 ret | 3189 ret |
| 2693 } | 3190 } |
| 2694 } | 3191 } |
| 2695 #endif // HAS_ARGBMIRRORROW_AVX2 | 3192 #endif // HAS_ARGBMIRRORROW_AVX2 |
| 2696 | 3193 |
| 2697 #ifdef HAS_SPLITUVROW_SSE2 | 3194 #ifdef HAS_SPLITUVROW_SSE2 |
| 2698 __declspec(naked) __declspec(align(16)) | 3195 __declspec(naked) |
| 2699 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 3196 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
| 2700 __asm { | 3197 __asm { |
| 2701 push edi | 3198 push edi |
| 2702 mov eax, [esp + 4 + 4] // src_uv | 3199 mov eax, [esp + 4 + 4] // src_uv |
| 2703 mov edx, [esp + 4 + 8] // dst_u | 3200 mov edx, [esp + 4 + 8] // dst_u |
| 2704 mov edi, [esp + 4 + 12] // dst_v | 3201 mov edi, [esp + 4 + 12] // dst_v |
| 2705 mov ecx, [esp + 4 + 16] // pix | 3202 mov ecx, [esp + 4 + 16] // pix |
| 2706 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3203 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 2707 psrlw xmm5, 8 | 3204 psrlw xmm5, 8 |
| 2708 sub edi, edx | 3205 sub edi, edx |
| (...skipping 17 matching lines...) Expand all Loading... |
| 2726 jg convertloop | 3223 jg convertloop |
| 2727 | 3224 |
| 2728 pop edi | 3225 pop edi |
| 2729 ret | 3226 ret |
| 2730 } | 3227 } |
| 2731 } | 3228 } |
| 2732 | 3229 |
| 2733 #endif // HAS_SPLITUVROW_SSE2 | 3230 #endif // HAS_SPLITUVROW_SSE2 |
| 2734 | 3231 |
| 2735 #ifdef HAS_SPLITUVROW_AVX2 | 3232 #ifdef HAS_SPLITUVROW_AVX2 |
| 2736 __declspec(naked) __declspec(align(16)) | 3233 __declspec(naked) |
| 2737 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 3234 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
| 2738 __asm { | 3235 __asm { |
| 2739 push edi | 3236 push edi |
| 2740 mov eax, [esp + 4 + 4] // src_uv | 3237 mov eax, [esp + 4 + 4] // src_uv |
| 2741 mov edx, [esp + 4 + 8] // dst_u | 3238 mov edx, [esp + 4 + 8] // dst_u |
| 2742 mov edi, [esp + 4 + 12] // dst_v | 3239 mov edi, [esp + 4 + 12] // dst_v |
| 2743 mov ecx, [esp + 4 + 16] // pix | 3240 mov ecx, [esp + 4 + 16] // pix |
| 2744 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3241 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 2745 vpsrlw ymm5, ymm5, 8 | 3242 vpsrlw ymm5, ymm5, 8 |
| 2746 sub edi, edx | 3243 sub edi, edx |
| (...skipping 17 matching lines...) Expand all Loading... |
| 2764 jg convertloop | 3261 jg convertloop |
| 2765 | 3262 |
| 2766 pop edi | 3263 pop edi |
| 2767 vzeroupper | 3264 vzeroupper |
| 2768 ret | 3265 ret |
| 2769 } | 3266 } |
| 2770 } | 3267 } |
| 2771 #endif // HAS_SPLITUVROW_AVX2 | 3268 #endif // HAS_SPLITUVROW_AVX2 |
| 2772 | 3269 |
| 2773 #ifdef HAS_MERGEUVROW_SSE2 | 3270 #ifdef HAS_MERGEUVROW_SSE2 |
| 2774 __declspec(naked) __declspec(align(16)) | 3271 __declspec(naked) |
| 2775 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 3272 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
| 2776 int width) { | 3273 int width) { |
| 2777 __asm { | 3274 __asm { |
| 2778 push edi | 3275 push edi |
| 2779 mov eax, [esp + 4 + 4] // src_u | 3276 mov eax, [esp + 4 + 4] // src_u |
| 2780 mov edx, [esp + 4 + 8] // src_v | 3277 mov edx, [esp + 4 + 8] // src_v |
| 2781 mov edi, [esp + 4 + 12] // dst_uv | 3278 mov edi, [esp + 4 + 12] // dst_uv |
| 2782 mov ecx, [esp + 4 + 16] // width | 3279 mov ecx, [esp + 4 + 16] // width |
| 2783 sub edx, eax | 3280 sub edx, eax |
| 2784 | 3281 |
| (...skipping 10 matching lines...) Expand all Loading... |
| 2795 sub ecx, 16 | 3292 sub ecx, 16 |
| 2796 jg convertloop | 3293 jg convertloop |
| 2797 | 3294 |
| 2798 pop edi | 3295 pop edi |
| 2799 ret | 3296 ret |
| 2800 } | 3297 } |
| 2801 } | 3298 } |
| 2802 #endif // HAS_MERGEUVROW_SSE2 | 3299 #endif // HAS_MERGEUVROW_SSE2 |
| 2803 | 3300 |
| 2804 #ifdef HAS_MERGEUVROW_AVX2 | 3301 #ifdef HAS_MERGEUVROW_AVX2 |
| 2805 __declspec(naked) __declspec(align(16)) | 3302 __declspec(naked) |
| 2806 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 3303 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
| 2807 int width) { | 3304 int width) { |
| 2808 __asm { | 3305 __asm { |
| 2809 push edi | 3306 push edi |
| 2810 mov eax, [esp + 4 + 4] // src_u | 3307 mov eax, [esp + 4 + 4] // src_u |
| 2811 mov edx, [esp + 4 + 8] // src_v | 3308 mov edx, [esp + 4 + 8] // src_v |
| 2812 mov edi, [esp + 4 + 12] // dst_uv | 3309 mov edi, [esp + 4 + 12] // dst_uv |
| 2813 mov ecx, [esp + 4 + 16] // width | 3310 mov ecx, [esp + 4 + 16] // width |
| 2814 sub edx, eax | 3311 sub edx, eax |
| 2815 | 3312 |
| (...skipping 13 matching lines...) Expand all Loading... |
| 2829 | 3326 |
| 2830 pop edi | 3327 pop edi |
| 2831 vzeroupper | 3328 vzeroupper |
| 2832 ret | 3329 ret |
| 2833 } | 3330 } |
| 2834 } | 3331 } |
| 2835 #endif // HAS_MERGEUVROW_AVX2 | 3332 #endif // HAS_MERGEUVROW_AVX2 |
| 2836 | 3333 |
| 2837 #ifdef HAS_COPYROW_SSE2 | 3334 #ifdef HAS_COPYROW_SSE2 |
| 2838 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. | 3335 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. |
| 2839 __declspec(naked) __declspec(align(16)) | 3336 __declspec(naked) |
| 2840 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { | 3337 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { |
| 2841 __asm { | 3338 __asm { |
| 2842 mov eax, [esp + 4] // src | 3339 mov eax, [esp + 4] // src |
| 2843 mov edx, [esp + 8] // dst | 3340 mov edx, [esp + 8] // dst |
| 2844 mov ecx, [esp + 12] // count | 3341 mov ecx, [esp + 12] // count |
| 2845 | 3342 |
| 2846 convertloop: | 3343 convertloop: |
| 2847 movdqu xmm0, [eax] | 3344 movdqu xmm0, [eax] |
| 2848 movdqu xmm1, [eax + 16] | 3345 movdqu xmm1, [eax + 16] |
| 2849 lea eax, [eax + 32] | 3346 lea eax, [eax + 32] |
| 2850 movdqu [edx], xmm0 | 3347 movdqu [edx], xmm0 |
| 2851 movdqu [edx + 16], xmm1 | 3348 movdqu [edx + 16], xmm1 |
| 2852 lea edx, [edx + 32] | 3349 lea edx, [edx + 32] |
| 2853 sub ecx, 32 | 3350 sub ecx, 32 |
| 2854 jg convertloop | 3351 jg convertloop |
| 2855 ret | 3352 ret |
| 2856 } | 3353 } |
| 2857 } | 3354 } |
| 2858 #endif // HAS_COPYROW_SSE2 | 3355 #endif // HAS_COPYROW_SSE2 |
| 2859 | 3356 |
| 2860 #ifdef HAS_COPYROW_AVX | 3357 #ifdef HAS_COPYROW_AVX |
| 2861 // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. | 3358 // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. |
| 2862 __declspec(naked) __declspec(align(16)) | 3359 __declspec(naked) |
| 2863 void CopyRow_AVX(const uint8* src, uint8* dst, int count) { | 3360 void CopyRow_AVX(const uint8* src, uint8* dst, int count) { |
| 2864 __asm { | 3361 __asm { |
| 2865 mov eax, [esp + 4] // src | 3362 mov eax, [esp + 4] // src |
| 2866 mov edx, [esp + 8] // dst | 3363 mov edx, [esp + 8] // dst |
| 2867 mov ecx, [esp + 12] // count | 3364 mov ecx, [esp + 12] // count |
| 2868 | 3365 |
| 2869 convertloop: | 3366 convertloop: |
| 2870 vmovdqu ymm0, [eax] | 3367 vmovdqu ymm0, [eax] |
| 2871 vmovdqu ymm1, [eax + 32] | 3368 vmovdqu ymm1, [eax + 32] |
| 2872 lea eax, [eax + 64] | 3369 lea eax, [eax + 64] |
| 2873 vmovdqu [edx], ymm0 | 3370 vmovdqu [edx], ymm0 |
| 2874 vmovdqu [edx + 32], ymm1 | 3371 vmovdqu [edx + 32], ymm1 |
| 2875 lea edx, [edx + 64] | 3372 lea edx, [edx + 64] |
| 2876 sub ecx, 64 | 3373 sub ecx, 64 |
| 2877 jg convertloop | 3374 jg convertloop |
| 2878 | 3375 |
| 2879 vzeroupper | 3376 vzeroupper |
| 2880 ret | 3377 ret |
| 2881 } | 3378 } |
| 2882 } | 3379 } |
| 2883 #endif // HAS_COPYROW_AVX | 3380 #endif // HAS_COPYROW_AVX |
| 2884 | 3381 |
| 2885 // Multiple of 1. | 3382 // Multiple of 1. |
| 2886 __declspec(naked) __declspec(align(16)) | 3383 __declspec(naked) |
| 2887 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { | 3384 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { |
| 2888 __asm { | 3385 __asm { |
| 2889 mov eax, esi | 3386 mov eax, esi |
| 2890 mov edx, edi | 3387 mov edx, edi |
| 2891 mov esi, [esp + 4] // src | 3388 mov esi, [esp + 4] // src |
| 2892 mov edi, [esp + 8] // dst | 3389 mov edi, [esp + 8] // dst |
| 2893 mov ecx, [esp + 12] // count | 3390 mov ecx, [esp + 12] // count |
| 2894 rep movsb | 3391 rep movsb |
| 2895 mov edi, edx | 3392 mov edi, edx |
| 2896 mov esi, eax | 3393 mov esi, eax |
| 2897 ret | 3394 ret |
| 2898 } | 3395 } |
| 2899 } | 3396 } |
| 2900 | 3397 |
| 2901 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 | 3398 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
| 2902 // width in pixels | 3399 // width in pixels |
| 2903 __declspec(naked) __declspec(align(16)) | 3400 __declspec(naked) |
| 2904 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | 3401 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 2905 __asm { | 3402 __asm { |
| 2906 mov eax, [esp + 4] // src | 3403 mov eax, [esp + 4] // src |
| 2907 mov edx, [esp + 8] // dst | 3404 mov edx, [esp + 8] // dst |
| 2908 mov ecx, [esp + 12] // count | 3405 mov ecx, [esp + 12] // count |
| 2909 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 | 3406 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
| 2910 pslld xmm0, 24 | 3407 pslld xmm0, 24 |
| 2911 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff | 3408 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
| 2912 psrld xmm1, 8 | 3409 psrld xmm1, 8 |
| 2913 | 3410 |
| (...skipping 15 matching lines...) Expand all Loading... |
| 2929 sub ecx, 8 | 3426 sub ecx, 8 |
| 2930 jg convertloop | 3427 jg convertloop |
| 2931 | 3428 |
| 2932 ret | 3429 ret |
| 2933 } | 3430 } |
| 2934 } | 3431 } |
| 2935 #endif // HAS_ARGBCOPYALPHAROW_SSE2 | 3432 #endif // HAS_ARGBCOPYALPHAROW_SSE2 |
| 2936 | 3433 |
| 2937 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 | 3434 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 |
| 2938 // width in pixels | 3435 // width in pixels |
| 2939 __declspec(naked) __declspec(align(16)) | 3436 __declspec(naked) |
| 2940 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | 3437 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 2941 __asm { | 3438 __asm { |
| 2942 mov eax, [esp + 4] // src | 3439 mov eax, [esp + 4] // src |
| 2943 mov edx, [esp + 8] // dst | 3440 mov edx, [esp + 8] // dst |
| 2944 mov ecx, [esp + 12] // count | 3441 mov ecx, [esp + 12] // count |
| 2945 vpcmpeqb ymm0, ymm0, ymm0 | 3442 vpcmpeqb ymm0, ymm0, ymm0 |
| 2946 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff | 3443 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
| 2947 | 3444 |
| 2948 convertloop: | 3445 convertloop: |
| 2949 vmovdqu ymm1, [eax] | 3446 vmovdqu ymm1, [eax] |
| 2950 vmovdqu ymm2, [eax + 32] | 3447 vmovdqu ymm2, [eax + 32] |
| 2951 lea eax, [eax + 64] | 3448 lea eax, [eax + 64] |
| 2952 vpblendvb ymm1, ymm1, [edx], ymm0 | 3449 vpblendvb ymm1, ymm1, [edx], ymm0 |
| 2953 vpblendvb ymm2, ymm2, [edx + 32], ymm0 | 3450 vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
| 2954 vmovdqu [edx], ymm1 | 3451 vmovdqu [edx], ymm1 |
| 2955 vmovdqu [edx + 32], ymm2 | 3452 vmovdqu [edx + 32], ymm2 |
| 2956 lea edx, [edx + 64] | 3453 lea edx, [edx + 64] |
| 2957 sub ecx, 16 | 3454 sub ecx, 16 |
| 2958 jg convertloop | 3455 jg convertloop |
| 2959 | 3456 |
| 2960 vzeroupper | 3457 vzeroupper |
| 2961 ret | 3458 ret |
| 2962 } | 3459 } |
| 2963 } | 3460 } |
| 2964 #endif // HAS_ARGBCOPYALPHAROW_AVX2 | 3461 #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
| 2965 | 3462 |
| 2966 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 | 3463 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 |
| 2967 // width in pixels | 3464 // width in pixels |
| 2968 __declspec(naked) __declspec(align(16)) | 3465 __declspec(naked) |
| 2969 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | 3466 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 2970 __asm { | 3467 __asm { |
| 2971 mov eax, [esp + 4] // src | 3468 mov eax, [esp + 4] // src |
| 2972 mov edx, [esp + 8] // dst | 3469 mov edx, [esp + 8] // dst |
| 2973 mov ecx, [esp + 12] // count | 3470 mov ecx, [esp + 12] // count |
| 2974 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 | 3471 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
| 2975 pslld xmm0, 24 | 3472 pslld xmm0, 24 |
| 2976 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff | 3473 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
| 2977 psrld xmm1, 8 | 3474 psrld xmm1, 8 |
| 2978 | 3475 |
| (...skipping 17 matching lines...) Expand all Loading... |
| 2996 sub ecx, 8 | 3493 sub ecx, 8 |
| 2997 jg convertloop | 3494 jg convertloop |
| 2998 | 3495 |
| 2999 ret | 3496 ret |
| 3000 } | 3497 } |
| 3001 } | 3498 } |
| 3002 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 | 3499 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 |
| 3003 | 3500 |
| 3004 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 | 3501 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 |
| 3005 // width in pixels | 3502 // width in pixels |
| 3006 __declspec(naked) __declspec(align(16)) | 3503 __declspec(naked) |
| 3007 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | 3504 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 3008 __asm { | 3505 __asm { |
| 3009 mov eax, [esp + 4] // src | 3506 mov eax, [esp + 4] // src |
| 3010 mov edx, [esp + 8] // dst | 3507 mov edx, [esp + 8] // dst |
| 3011 mov ecx, [esp + 12] // count | 3508 mov ecx, [esp + 12] // count |
| 3012 vpcmpeqb ymm0, ymm0, ymm0 | 3509 vpcmpeqb ymm0, ymm0, ymm0 |
| 3013 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff | 3510 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
| 3014 | 3511 |
| 3015 convertloop: | 3512 convertloop: |
| 3016 vpmovzxbd ymm1, qword ptr [eax] | 3513 vpmovzxbd ymm1, qword ptr [eax] |
| (...skipping 11 matching lines...) Expand all Loading... |
| 3028 | 3525 |
| 3029 vzeroupper | 3526 vzeroupper |
| 3030 ret | 3527 ret |
| 3031 } | 3528 } |
| 3032 } | 3529 } |
| 3033 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 | 3530 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
| 3034 | 3531 |
| 3035 #ifdef HAS_SETROW_X86 | 3532 #ifdef HAS_SETROW_X86 |
| 3036 // Write 'count' bytes using an 8 bit value repeated. | 3533 // Write 'count' bytes using an 8 bit value repeated. |
| 3037 // Count should be multiple of 4. | 3534 // Count should be multiple of 4. |
| 3038 __declspec(naked) __declspec(align(16)) | 3535 __declspec(naked) |
| 3039 void SetRow_X86(uint8* dst, uint8 v8, int count) { | 3536 void SetRow_X86(uint8* dst, uint8 v8, int count) { |
| 3040 __asm { | 3537 __asm { |
| 3041 movzx eax, byte ptr [esp + 8] // v8 | 3538 movzx eax, byte ptr [esp + 8] // v8 |
| 3042 mov edx, 0x01010101 // Duplicate byte to all bytes. | 3539 mov edx, 0x01010101 // Duplicate byte to all bytes. |
| 3043 mul edx // overwrites edx with upper part of result. | 3540 mul edx // overwrites edx with upper part of result. |
| 3044 mov edx, edi | 3541 mov edx, edi |
| 3045 mov edi, [esp + 4] // dst | 3542 mov edi, [esp + 4] // dst |
| 3046 mov ecx, [esp + 12] // count | 3543 mov ecx, [esp + 12] // count |
| 3047 shr ecx, 2 | 3544 shr ecx, 2 |
| 3048 rep stosd | 3545 rep stosd |
| 3049 mov edi, edx | 3546 mov edi, edx |
| 3050 ret | 3547 ret |
| 3051 } | 3548 } |
| 3052 } | 3549 } |
| 3053 | 3550 |
| 3054 // Write 'count' bytes using an 8 bit value repeated. | 3551 // Write 'count' bytes using an 8 bit value repeated. |
| 3055 __declspec(naked) __declspec(align(16)) | 3552 __declspec(naked) |
| 3056 void SetRow_ERMS(uint8* dst, uint8 v8, int count) { | 3553 void SetRow_ERMS(uint8* dst, uint8 v8, int count) { |
| 3057 __asm { | 3554 __asm { |
| 3058 mov edx, edi | 3555 mov edx, edi |
| 3059 mov edi, [esp + 4] // dst | 3556 mov edi, [esp + 4] // dst |
| 3060 mov eax, [esp + 8] // v8 | 3557 mov eax, [esp + 8] // v8 |
| 3061 mov ecx, [esp + 12] // count | 3558 mov ecx, [esp + 12] // count |
| 3062 rep stosb | 3559 rep stosb |
| 3063 mov edi, edx | 3560 mov edi, edx |
| 3064 ret | 3561 ret |
| 3065 } | 3562 } |
| 3066 } | 3563 } |
| 3067 | 3564 |
| 3068 // Write 'count' 32 bit values. | 3565 // Write 'count' 32 bit values. |
| 3069 __declspec(naked) __declspec(align(16)) | 3566 __declspec(naked) |
| 3070 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { | 3567 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { |
| 3071 __asm { | 3568 __asm { |
| 3072 mov edx, edi | 3569 mov edx, edi |
| 3073 mov edi, [esp + 4] // dst | 3570 mov edi, [esp + 4] // dst |
| 3074 mov eax, [esp + 8] // v32 | 3571 mov eax, [esp + 8] // v32 |
| 3075 mov ecx, [esp + 12] // count | 3572 mov ecx, [esp + 12] // count |
| 3076 rep stosd | 3573 rep stosd |
| 3077 mov edi, edx | 3574 mov edi, edx |
| 3078 ret | 3575 ret |
| 3079 } | 3576 } |
| 3080 } | 3577 } |
| 3081 #endif // HAS_SETROW_X86 | 3578 #endif // HAS_SETROW_X86 |
| 3082 | 3579 |
| 3083 #ifdef HAS_YUY2TOYROW_AVX2 | 3580 #ifdef HAS_YUY2TOYROW_AVX2 |
| 3084 __declspec(naked) __declspec(align(16)) | 3581 __declspec(naked) |
| 3085 void YUY2ToYRow_AVX2(const uint8* src_yuy2, | 3582 void YUY2ToYRow_AVX2(const uint8* src_yuy2, |
| 3086 uint8* dst_y, int pix) { | 3583 uint8* dst_y, int pix) { |
| 3087 __asm { | 3584 __asm { |
| 3088 mov eax, [esp + 4] // src_yuy2 | 3585 mov eax, [esp + 4] // src_yuy2 |
| 3089 mov edx, [esp + 8] // dst_y | 3586 mov edx, [esp + 8] // dst_y |
| 3090 mov ecx, [esp + 12] // pix | 3587 mov ecx, [esp + 12] // pix |
| 3091 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3588 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 3092 vpsrlw ymm5, ymm5, 8 | 3589 vpsrlw ymm5, ymm5, 8 |
| 3093 | 3590 |
| 3094 convertloop: | 3591 convertloop: |
| 3095 vmovdqu ymm0, [eax] | 3592 vmovdqu ymm0, [eax] |
| 3096 vmovdqu ymm1, [eax + 32] | 3593 vmovdqu ymm1, [eax + 32] |
| 3097 lea eax, [eax + 64] | 3594 lea eax, [eax + 64] |
| 3098 vpand ymm0, ymm0, ymm5 // even bytes are Y | 3595 vpand ymm0, ymm0, ymm5 // even bytes are Y |
| 3099 vpand ymm1, ymm1, ymm5 | 3596 vpand ymm1, ymm1, ymm5 |
| 3100 vpackuswb ymm0, ymm0, ymm1 // mutates. | 3597 vpackuswb ymm0, ymm0, ymm1 // mutates. |
| 3101 vpermq ymm0, ymm0, 0xd8 | 3598 vpermq ymm0, ymm0, 0xd8 |
| 3102 vmovdqu [edx], ymm0 | 3599 vmovdqu [edx], ymm0 |
| 3103 lea edx, [edx + 32] | 3600 lea edx, [edx + 32] |
| 3104 sub ecx, 32 | 3601 sub ecx, 32 |
| 3105 jg convertloop | 3602 jg convertloop |
| 3106 vzeroupper | 3603 vzeroupper |
| 3107 ret | 3604 ret |
| 3108 } | 3605 } |
| 3109 } | 3606 } |
| 3110 | 3607 |
| 3111 __declspec(naked) __declspec(align(16)) | 3608 __declspec(naked) |
| 3112 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | 3609 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, |
| 3113 uint8* dst_u, uint8* dst_v, int pix) { | 3610 uint8* dst_u, uint8* dst_v, int pix) { |
| 3114 __asm { | 3611 __asm { |
| 3115 push esi | 3612 push esi |
| 3116 push edi | 3613 push edi |
| 3117 mov eax, [esp + 8 + 4] // src_yuy2 | 3614 mov eax, [esp + 8 + 4] // src_yuy2 |
| 3118 mov esi, [esp + 8 + 8] // stride_yuy2 | 3615 mov esi, [esp + 8 + 8] // stride_yuy2 |
| 3119 mov edx, [esp + 8 + 12] // dst_u | 3616 mov edx, [esp + 8 + 12] // dst_u |
| 3120 mov edi, [esp + 8 + 16] // dst_v | 3617 mov edi, [esp + 8 + 16] // dst_v |
| 3121 mov ecx, [esp + 8 + 20] // pix | 3618 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 23 matching lines...) Expand all Loading... |
| 3145 sub ecx, 32 | 3642 sub ecx, 32 |
| 3146 jg convertloop | 3643 jg convertloop |
| 3147 | 3644 |
| 3148 pop edi | 3645 pop edi |
| 3149 pop esi | 3646 pop esi |
| 3150 vzeroupper | 3647 vzeroupper |
| 3151 ret | 3648 ret |
| 3152 } | 3649 } |
| 3153 } | 3650 } |
| 3154 | 3651 |
| 3155 __declspec(naked) __declspec(align(16)) | 3652 __declspec(naked) |
| 3156 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | 3653 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, |
| 3157 uint8* dst_u, uint8* dst_v, int pix) { | 3654 uint8* dst_u, uint8* dst_v, int pix) { |
| 3158 __asm { | 3655 __asm { |
| 3159 push edi | 3656 push edi |
| 3160 mov eax, [esp + 4 + 4] // src_yuy2 | 3657 mov eax, [esp + 4 + 4] // src_yuy2 |
| 3161 mov edx, [esp + 4 + 8] // dst_u | 3658 mov edx, [esp + 4 + 8] // dst_u |
| 3162 mov edi, [esp + 4 + 12] // dst_v | 3659 mov edi, [esp + 4 + 12] // dst_v |
| 3163 mov ecx, [esp + 4 + 16] // pix | 3660 mov ecx, [esp + 4 + 16] // pix |
| 3164 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3661 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 3165 vpsrlw ymm5, ymm5, 8 | 3662 vpsrlw ymm5, ymm5, 8 |
| (...skipping 18 matching lines...) Expand all Loading... |
| 3184 lea edx, [edx + 16] | 3681 lea edx, [edx + 16] |
| 3185 sub ecx, 32 | 3682 sub ecx, 32 |
| 3186 jg convertloop | 3683 jg convertloop |
| 3187 | 3684 |
| 3188 pop edi | 3685 pop edi |
| 3189 vzeroupper | 3686 vzeroupper |
| 3190 ret | 3687 ret |
| 3191 } | 3688 } |
| 3192 } | 3689 } |
| 3193 | 3690 |
| 3194 __declspec(naked) __declspec(align(16)) | 3691 __declspec(naked) |
| 3195 void UYVYToYRow_AVX2(const uint8* src_uyvy, | 3692 void UYVYToYRow_AVX2(const uint8* src_uyvy, |
| 3196 uint8* dst_y, int pix) { | 3693 uint8* dst_y, int pix) { |
| 3197 __asm { | 3694 __asm { |
| 3198 mov eax, [esp + 4] // src_uyvy | 3695 mov eax, [esp + 4] // src_uyvy |
| 3199 mov edx, [esp + 8] // dst_y | 3696 mov edx, [esp + 8] // dst_y |
| 3200 mov ecx, [esp + 12] // pix | 3697 mov ecx, [esp + 12] // pix |
| 3201 | 3698 |
| 3202 convertloop: | 3699 convertloop: |
| 3203 vmovdqu ymm0, [eax] | 3700 vmovdqu ymm0, [eax] |
| 3204 vmovdqu ymm1, [eax + 32] | 3701 vmovdqu ymm1, [eax + 32] |
| 3205 lea eax, [eax + 64] | 3702 lea eax, [eax + 64] |
| 3206 vpsrlw ymm0, ymm0, 8 // odd bytes are Y | 3703 vpsrlw ymm0, ymm0, 8 // odd bytes are Y |
| 3207 vpsrlw ymm1, ymm1, 8 | 3704 vpsrlw ymm1, ymm1, 8 |
| 3208 vpackuswb ymm0, ymm0, ymm1 // mutates. | 3705 vpackuswb ymm0, ymm0, ymm1 // mutates. |
| 3209 vpermq ymm0, ymm0, 0xd8 | 3706 vpermq ymm0, ymm0, 0xd8 |
| 3210 vmovdqu [edx], ymm0 | 3707 vmovdqu [edx], ymm0 |
| 3211 lea edx, [edx + 32] | 3708 lea edx, [edx + 32] |
| 3212 sub ecx, 32 | 3709 sub ecx, 32 |
| 3213 jg convertloop | 3710 jg convertloop |
| 3214 vzeroupper | 3711 vzeroupper |
| 3215 ret | 3712 ret |
| 3216 } | 3713 } |
| 3217 } | 3714 } |
| 3218 | 3715 |
| 3219 __declspec(naked) __declspec(align(16)) | 3716 __declspec(naked) |
| 3220 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | 3717 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, |
| 3221 uint8* dst_u, uint8* dst_v, int pix) { | 3718 uint8* dst_u, uint8* dst_v, int pix) { |
| 3222 __asm { | 3719 __asm { |
| 3223 push esi | 3720 push esi |
| 3224 push edi | 3721 push edi |
| 3225 mov eax, [esp + 8 + 4] // src_yuy2 | 3722 mov eax, [esp + 8 + 4] // src_yuy2 |
| 3226 mov esi, [esp + 8 + 8] // stride_yuy2 | 3723 mov esi, [esp + 8 + 8] // stride_yuy2 |
| 3227 mov edx, [esp + 8 + 12] // dst_u | 3724 mov edx, [esp + 8 + 12] // dst_u |
| 3228 mov edi, [esp + 8 + 16] // dst_v | 3725 mov edi, [esp + 8 + 16] // dst_v |
| 3229 mov ecx, [esp + 8 + 20] // pix | 3726 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 23 matching lines...) Expand all Loading... |
| 3253 sub ecx, 32 | 3750 sub ecx, 32 |
| 3254 jg convertloop | 3751 jg convertloop |
| 3255 | 3752 |
| 3256 pop edi | 3753 pop edi |
| 3257 pop esi | 3754 pop esi |
| 3258 vzeroupper | 3755 vzeroupper |
| 3259 ret | 3756 ret |
| 3260 } | 3757 } |
| 3261 } | 3758 } |
| 3262 | 3759 |
| 3263 __declspec(naked) __declspec(align(16)) | 3760 __declspec(naked) |
| 3264 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | 3761 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, |
| 3265 uint8* dst_u, uint8* dst_v, int pix) { | 3762 uint8* dst_u, uint8* dst_v, int pix) { |
| 3266 __asm { | 3763 __asm { |
| 3267 push edi | 3764 push edi |
| 3268 mov eax, [esp + 4 + 4] // src_yuy2 | 3765 mov eax, [esp + 4 + 4] // src_yuy2 |
| 3269 mov edx, [esp + 4 + 8] // dst_u | 3766 mov edx, [esp + 4 + 8] // dst_u |
| 3270 mov edi, [esp + 4 + 12] // dst_v | 3767 mov edi, [esp + 4 + 12] // dst_v |
| 3271 mov ecx, [esp + 4 + 16] // pix | 3768 mov ecx, [esp + 4 + 16] // pix |
| 3272 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3769 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 3273 vpsrlw ymm5, ymm5, 8 | 3770 vpsrlw ymm5, ymm5, 8 |
| (...skipping 20 matching lines...) Expand all Loading... |
| 3294 jg convertloop | 3791 jg convertloop |
| 3295 | 3792 |
| 3296 pop edi | 3793 pop edi |
| 3297 vzeroupper | 3794 vzeroupper |
| 3298 ret | 3795 ret |
| 3299 } | 3796 } |
| 3300 } | 3797 } |
| 3301 #endif // HAS_YUY2TOYROW_AVX2 | 3798 #endif // HAS_YUY2TOYROW_AVX2 |
| 3302 | 3799 |
| 3303 #ifdef HAS_YUY2TOYROW_SSE2 | 3800 #ifdef HAS_YUY2TOYROW_SSE2 |
| 3304 __declspec(naked) __declspec(align(16)) | 3801 __declspec(naked) |
| 3305 void YUY2ToYRow_SSE2(const uint8* src_yuy2, | 3802 void YUY2ToYRow_SSE2(const uint8* src_yuy2, |
| 3306 uint8* dst_y, int pix) { | 3803 uint8* dst_y, int pix) { |
| 3307 __asm { | 3804 __asm { |
| 3308 mov eax, [esp + 4] // src_yuy2 | 3805 mov eax, [esp + 4] // src_yuy2 |
| 3309 mov edx, [esp + 8] // dst_y | 3806 mov edx, [esp + 8] // dst_y |
| 3310 mov ecx, [esp + 12] // pix | 3807 mov ecx, [esp + 12] // pix |
| 3311 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3808 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 3312 psrlw xmm5, 8 | 3809 psrlw xmm5, 8 |
| 3313 | 3810 |
| 3314 convertloop: | 3811 convertloop: |
| 3315 movdqu xmm0, [eax] | 3812 movdqu xmm0, [eax] |
| 3316 movdqu xmm1, [eax + 16] | 3813 movdqu xmm1, [eax + 16] |
| 3317 lea eax, [eax + 32] | 3814 lea eax, [eax + 32] |
| 3318 pand xmm0, xmm5 // even bytes are Y | 3815 pand xmm0, xmm5 // even bytes are Y |
| 3319 pand xmm1, xmm5 | 3816 pand xmm1, xmm5 |
| 3320 packuswb xmm0, xmm1 | 3817 packuswb xmm0, xmm1 |
| 3321 movdqu [edx], xmm0 | 3818 movdqu [edx], xmm0 |
| 3322 lea edx, [edx + 16] | 3819 lea edx, [edx + 16] |
| 3323 sub ecx, 16 | 3820 sub ecx, 16 |
| 3324 jg convertloop | 3821 jg convertloop |
| 3325 ret | 3822 ret |
| 3326 } | 3823 } |
| 3327 } | 3824 } |
| 3328 | 3825 |
| 3329 __declspec(naked) __declspec(align(16)) | 3826 __declspec(naked) |
| 3330 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | 3827 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, |
| 3331 uint8* dst_u, uint8* dst_v, int pix) { | 3828 uint8* dst_u, uint8* dst_v, int pix) { |
| 3332 __asm { | 3829 __asm { |
| 3333 push esi | 3830 push esi |
| 3334 push edi | 3831 push edi |
| 3335 mov eax, [esp + 8 + 4] // src_yuy2 | 3832 mov eax, [esp + 8 + 4] // src_yuy2 |
| 3336 mov esi, [esp + 8 + 8] // stride_yuy2 | 3833 mov esi, [esp + 8 + 8] // stride_yuy2 |
| 3337 mov edx, [esp + 8 + 12] // dst_u | 3834 mov edx, [esp + 8 + 12] // dst_u |
| 3338 mov edi, [esp + 8 + 16] // dst_v | 3835 mov edi, [esp + 8 + 16] // dst_v |
| 3339 mov ecx, [esp + 8 + 20] // pix | 3836 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 22 matching lines...) Expand all Loading... |
| 3362 lea edx, [edx + 8] | 3859 lea edx, [edx + 8] |
| 3363 sub ecx, 16 | 3860 sub ecx, 16 |
| 3364 jg convertloop | 3861 jg convertloop |
| 3365 | 3862 |
| 3366 pop edi | 3863 pop edi |
| 3367 pop esi | 3864 pop esi |
| 3368 ret | 3865 ret |
| 3369 } | 3866 } |
| 3370 } | 3867 } |
| 3371 | 3868 |
| 3372 __declspec(naked) __declspec(align(16)) | 3869 __declspec(naked) |
| 3373 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | 3870 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
| 3374 uint8* dst_u, uint8* dst_v, int pix) { | 3871 uint8* dst_u, uint8* dst_v, int pix) { |
| 3375 __asm { | 3872 __asm { |
| 3376 push edi | 3873 push edi |
| 3377 mov eax, [esp + 4 + 4] // src_yuy2 | 3874 mov eax, [esp + 4 + 4] // src_yuy2 |
| 3378 mov edx, [esp + 4 + 8] // dst_u | 3875 mov edx, [esp + 4 + 8] // dst_u |
| 3379 mov edi, [esp + 4 + 12] // dst_v | 3876 mov edi, [esp + 4 + 12] // dst_v |
| 3380 mov ecx, [esp + 4 + 16] // pix | 3877 mov ecx, [esp + 4 + 16] // pix |
| 3381 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3878 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 3382 psrlw xmm5, 8 | 3879 psrlw xmm5, 8 |
| (...skipping 15 matching lines...) Expand all Loading... |
| 3398 movq qword ptr [edx + edi], xmm1 | 3895 movq qword ptr [edx + edi], xmm1 |
| 3399 lea edx, [edx + 8] | 3896 lea edx, [edx + 8] |
| 3400 sub ecx, 16 | 3897 sub ecx, 16 |
| 3401 jg convertloop | 3898 jg convertloop |
| 3402 | 3899 |
| 3403 pop edi | 3900 pop edi |
| 3404 ret | 3901 ret |
| 3405 } | 3902 } |
| 3406 } | 3903 } |
| 3407 | 3904 |
| 3408 __declspec(naked) __declspec(align(16)) | 3905 __declspec(naked) |
| 3409 void UYVYToYRow_SSE2(const uint8* src_uyvy, | 3906 void UYVYToYRow_SSE2(const uint8* src_uyvy, |
| 3410 uint8* dst_y, int pix) { | 3907 uint8* dst_y, int pix) { |
| 3411 __asm { | 3908 __asm { |
| 3412 mov eax, [esp + 4] // src_uyvy | 3909 mov eax, [esp + 4] // src_uyvy |
| 3413 mov edx, [esp + 8] // dst_y | 3910 mov edx, [esp + 8] // dst_y |
| 3414 mov ecx, [esp + 12] // pix | 3911 mov ecx, [esp + 12] // pix |
| 3415 | 3912 |
| 3416 convertloop: | 3913 convertloop: |
| 3417 movdqu xmm0, [eax] | 3914 movdqu xmm0, [eax] |
| 3418 movdqu xmm1, [eax + 16] | 3915 movdqu xmm1, [eax + 16] |
| 3419 lea eax, [eax + 32] | 3916 lea eax, [eax + 32] |
| 3420 psrlw xmm0, 8 // odd bytes are Y | 3917 psrlw xmm0, 8 // odd bytes are Y |
| 3421 psrlw xmm1, 8 | 3918 psrlw xmm1, 8 |
| 3422 packuswb xmm0, xmm1 | 3919 packuswb xmm0, xmm1 |
| 3423 movdqu [edx], xmm0 | 3920 movdqu [edx], xmm0 |
| 3424 lea edx, [edx + 16] | 3921 lea edx, [edx + 16] |
| 3425 sub ecx, 16 | 3922 sub ecx, 16 |
| 3426 jg convertloop | 3923 jg convertloop |
| 3427 ret | 3924 ret |
| 3428 } | 3925 } |
| 3429 } | 3926 } |
| 3430 | 3927 |
| 3431 __declspec(naked) __declspec(align(16)) | 3928 __declspec(naked) |
| 3432 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | 3929 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, |
| 3433 uint8* dst_u, uint8* dst_v, int pix) { | 3930 uint8* dst_u, uint8* dst_v, int pix) { |
| 3434 __asm { | 3931 __asm { |
| 3435 push esi | 3932 push esi |
| 3436 push edi | 3933 push edi |
| 3437 mov eax, [esp + 8 + 4] // src_yuy2 | 3934 mov eax, [esp + 8 + 4] // src_yuy2 |
| 3438 mov esi, [esp + 8 + 8] // stride_yuy2 | 3935 mov esi, [esp + 8 + 8] // stride_yuy2 |
| 3439 mov edx, [esp + 8 + 12] // dst_u | 3936 mov edx, [esp + 8 + 12] // dst_u |
| 3440 mov edi, [esp + 8 + 16] // dst_v | 3937 mov edi, [esp + 8 + 16] // dst_v |
| 3441 mov ecx, [esp + 8 + 20] // pix | 3938 mov ecx, [esp + 8 + 20] // pix |
| (...skipping 22 matching lines...) Expand all Loading... |
| 3464 lea edx, [edx + 8] | 3961 lea edx, [edx + 8] |
| 3465 sub ecx, 16 | 3962 sub ecx, 16 |
| 3466 jg convertloop | 3963 jg convertloop |
| 3467 | 3964 |
| 3468 pop edi | 3965 pop edi |
| 3469 pop esi | 3966 pop esi |
| 3470 ret | 3967 ret |
| 3471 } | 3968 } |
| 3472 } | 3969 } |
| 3473 | 3970 |
| 3474 __declspec(naked) __declspec(align(16)) | 3971 __declspec(naked) |
| 3475 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | 3972 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
| 3476 uint8* dst_u, uint8* dst_v, int pix) { | 3973 uint8* dst_u, uint8* dst_v, int pix) { |
| 3477 __asm { | 3974 __asm { |
| 3478 push edi | 3975 push edi |
| 3479 mov eax, [esp + 4 + 4] // src_yuy2 | 3976 mov eax, [esp + 4 + 4] // src_yuy2 |
| 3480 mov edx, [esp + 4 + 8] // dst_u | 3977 mov edx, [esp + 4 + 8] // dst_u |
| 3481 mov edi, [esp + 4 + 12] // dst_v | 3978 mov edi, [esp + 4 + 12] // dst_v |
| 3482 mov ecx, [esp + 4 + 16] // pix | 3979 mov ecx, [esp + 4 + 16] // pix |
| 3483 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3980 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 3484 psrlw xmm5, 8 | 3981 psrlw xmm5, 8 |
| (...skipping 18 matching lines...) Expand all Loading... |
| 3503 jg convertloop | 4000 jg convertloop |
| 3504 | 4001 |
| 3505 pop edi | 4002 pop edi |
| 3506 ret | 4003 ret |
| 3507 } | 4004 } |
| 3508 } | 4005 } |
| 3509 #endif // HAS_YUY2TOYROW_SSE2 | 4006 #endif // HAS_YUY2TOYROW_SSE2 |
| 3510 | 4007 |
| 3511 #ifdef HAS_ARGBBLENDROW_SSE2 | 4008 #ifdef HAS_ARGBBLENDROW_SSE2 |
| 3512 // Blend 8 pixels at a time. | 4009 // Blend 8 pixels at a time. |
| 3513 __declspec(naked) __declspec(align(16)) | 4010 __declspec(naked) |
| 3514 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4011 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
| 3515 uint8* dst_argb, int width) { | 4012 uint8* dst_argb, int width) { |
| 3516 __asm { | 4013 __asm { |
| 3517 push esi | 4014 push esi |
| 3518 mov eax, [esp + 4 + 4] // src_argb0 | 4015 mov eax, [esp + 4 + 4] // src_argb0 |
| 3519 mov esi, [esp + 4 + 8] // src_argb1 | 4016 mov esi, [esp + 4 + 8] // src_argb1 |
| 3520 mov edx, [esp + 4 + 12] // dst_argb | 4017 mov edx, [esp + 4 + 12] // dst_argb |
| 3521 mov ecx, [esp + 4 + 16] // width | 4018 mov ecx, [esp + 4 + 16] // width |
| 3522 pcmpeqb xmm7, xmm7 // generate constant 1 | 4019 pcmpeqb xmm7, xmm7 // generate constant 1 |
| 3523 psrlw xmm7, 15 | 4020 psrlw xmm7, 15 |
| 3524 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff | 4021 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
| 3525 psrlw xmm6, 8 | 4022 psrlw xmm6, 8 |
| 3526 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 | 4023 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
| 3527 psllw xmm5, 8 | 4024 psllw xmm5, 8 |
| 3528 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 4025 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
| 3529 pslld xmm4, 24 | 4026 pslld xmm4, 24 |
| 3530 | 4027 sub ecx, 4 |
| 3531 sub ecx, 1 | 4028 jl convertloop4b // less than 4 pixels? |
| 3532 je convertloop1 // only 1 pixel? | |
| 3533 jl convertloop1b | |
| 3534 | |
| 3535 // 1 pixel loop until destination pointer is aligned. | |
| 3536 alignloop1: | |
| 3537 test edx, 15 // aligned? | |
| 3538 je alignloop1b | |
| 3539 movd xmm3, [eax] | |
| 3540 lea eax, [eax + 4] | |
| 3541 movdqa xmm0, xmm3 // src argb | |
| 3542 pxor xmm3, xmm4 // ~alpha | |
| 3543 movd xmm2, [esi] // _r_b | |
| 3544 psrlw xmm3, 8 // alpha | |
| 3545 pshufhw xmm3, xmm3, 0F5h // 8 alpha words | |
| 3546 pshuflw xmm3, xmm3, 0F5h | |
| 3547 pand xmm2, xmm6 // _r_b | |
| 3548 paddw xmm3, xmm7 // 256 - alpha | |
| 3549 pmullw xmm2, xmm3 // _r_b * alpha | |
| 3550 movd xmm1, [esi] // _a_g | |
| 3551 lea esi, [esi + 4] | |
| 3552 psrlw xmm1, 8 // _a_g | |
| 3553 por xmm0, xmm4 // set alpha to 255 | |
| 3554 pmullw xmm1, xmm3 // _a_g * alpha | |
| 3555 psrlw xmm2, 8 // _r_b convert to 8 bits again | |
| 3556 paddusb xmm0, xmm2 // + src argb | |
| 3557 pand xmm1, xmm5 // a_g_ convert to 8 bits again | |
| 3558 paddusb xmm0, xmm1 // + src argb | |
| 3559 movd [edx], xmm0 | |
| 3560 lea edx, [edx + 4] | |
| 3561 sub ecx, 1 | |
| 3562 jge alignloop1 | |
| 3563 | |
| 3564 alignloop1b: | |
| 3565 add ecx, 1 - 4 | |
| 3566 jl convertloop4b | |
| 3567 | 4029 |
| 3568 // 4 pixel loop. | 4030 // 4 pixel loop. |
| 3569 convertloop4: | 4031 convertloop4: |
| 3570 movdqu xmm3, [eax] // src argb | 4032 movdqu xmm3, [eax] // src argb |
| 3571 lea eax, [eax + 16] | 4033 lea eax, [eax + 16] |
| 3572 movdqa xmm0, xmm3 // src argb | 4034 movdqa xmm0, xmm3 // src argb |
| 3573 pxor xmm3, xmm4 // ~alpha | 4035 pxor xmm3, xmm4 // ~alpha |
| 3574 movdqu xmm2, [esi] // _r_b | 4036 movdqu xmm2, [esi] // _r_b |
| 3575 psrlw xmm3, 8 // alpha | 4037 psrlw xmm3, 8 // alpha |
| 3576 pshufhw xmm3, xmm3, 0F5h // 8 alpha words | 4038 pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
| (...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3637 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | 4099 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
| 3638 }; | 4100 }; |
| 3639 // Same as SSE2, but replaces: | 4101 // Same as SSE2, but replaces: |
| 3640 // psrlw xmm3, 8 // alpha | 4102 // psrlw xmm3, 8 // alpha |
| 3641 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words | 4103 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
| 3642 // pshuflw xmm3, xmm3, 0F5h | 4104 // pshuflw xmm3, xmm3, 0F5h |
| 3643 // with.. | 4105 // with.. |
| 3644 // pshufb xmm3, kShuffleAlpha // alpha | 4106 // pshufb xmm3, kShuffleAlpha // alpha |
| 3645 // Blend 8 pixels at a time. | 4107 // Blend 8 pixels at a time. |
| 3646 | 4108 |
| 3647 __declspec(naked) __declspec(align(16)) | 4109 __declspec(naked) |
| 3648 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 4110 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
| 3649 uint8* dst_argb, int width) { | 4111 uint8* dst_argb, int width) { |
| 3650 __asm { | 4112 __asm { |
| 3651 push esi | 4113 push esi |
| 3652 mov eax, [esp + 4 + 4] // src_argb0 | 4114 mov eax, [esp + 4 + 4] // src_argb0 |
| 3653 mov esi, [esp + 4 + 8] // src_argb1 | 4115 mov esi, [esp + 4 + 8] // src_argb1 |
| 3654 mov edx, [esp + 4 + 12] // dst_argb | 4116 mov edx, [esp + 4 + 12] // dst_argb |
| 3655 mov ecx, [esp + 4 + 16] // width | 4117 mov ecx, [esp + 4 + 16] // width |
| 3656 pcmpeqb xmm7, xmm7 // generate constant 0x0001 | 4118 pcmpeqb xmm7, xmm7 // generate constant 0x0001 |
| 3657 psrlw xmm7, 15 | 4119 psrlw xmm7, 15 |
| 3658 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff | 4120 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
| 3659 psrlw xmm6, 8 | 4121 psrlw xmm6, 8 |
| 3660 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 | 4122 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
| 3661 psllw xmm5, 8 | 4123 psllw xmm5, 8 |
| 3662 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 4124 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
| 3663 pslld xmm4, 24 | 4125 pslld xmm4, 24 |
| 3664 | 4126 sub ecx, 4 |
| 3665 sub ecx, 1 | 4127 jl convertloop4b // less than 4 pixels? |
| 3666 je convertloop1 // only 1 pixel? | |
| 3667 jl convertloop1b | |
| 3668 | |
| 3669 // 1 pixel loop until destination pointer is aligned. | |
| 3670 alignloop1: | |
| 3671 test edx, 15 // aligned? | |
| 3672 je alignloop1b | |
| 3673 movd xmm3, [eax] | |
| 3674 lea eax, [eax + 4] | |
| 3675 movdqa xmm0, xmm3 // src argb | |
| 3676 pxor xmm3, xmm4 // ~alpha | |
| 3677 movd xmm2, [esi] // _r_b | |
| 3678 pshufb xmm3, kShuffleAlpha // alpha | |
| 3679 pand xmm2, xmm6 // _r_b | |
| 3680 paddw xmm3, xmm7 // 256 - alpha | |
| 3681 pmullw xmm2, xmm3 // _r_b * alpha | |
| 3682 movd xmm1, [esi] // _a_g | |
| 3683 lea esi, [esi + 4] | |
| 3684 psrlw xmm1, 8 // _a_g | |
| 3685 por xmm0, xmm4 // set alpha to 255 | |
| 3686 pmullw xmm1, xmm3 // _a_g * alpha | |
| 3687 psrlw xmm2, 8 // _r_b convert to 8 bits again | |
| 3688 paddusb xmm0, xmm2 // + src argb | |
| 3689 pand xmm1, xmm5 // a_g_ convert to 8 bits again | |
| 3690 paddusb xmm0, xmm1 // + src argb | |
| 3691 movd [edx], xmm0 | |
| 3692 lea edx, [edx + 4] | |
| 3693 sub ecx, 1 | |
| 3694 jge alignloop1 | |
| 3695 | |
| 3696 alignloop1b: | |
| 3697 add ecx, 1 - 4 | |
| 3698 jl convertloop4b | |
| 3699 | 4128 |
| 3700 // 4 pixel loop. | 4129 // 4 pixel loop. |
| 3701 convertloop4: | 4130 convertloop4: |
| 3702 movdqu xmm3, [eax] // src argb | 4131 movdqu xmm3, [eax] // src argb |
| 3703 lea eax, [eax + 16] | 4132 lea eax, [eax + 16] |
| 3704 movdqa xmm0, xmm3 // src argb | 4133 movdqa xmm0, xmm3 // src argb |
| 3705 pxor xmm3, xmm4 // ~alpha | 4134 pxor xmm3, xmm4 // ~alpha |
| 3706 movdqu xmm2, [esi] // _r_b | 4135 movdqu xmm2, [esi] // _r_b |
| 3707 pshufb xmm3, kShuffleAlpha // alpha | 4136 pshufb xmm3, kShuffleAlpha // alpha |
| 3708 pand xmm2, xmm6 // _r_b | 4137 pand xmm2, xmm6 // _r_b |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3753 | 4182 |
| 3754 convertloop1b: | 4183 convertloop1b: |
| 3755 pop esi | 4184 pop esi |
| 3756 ret | 4185 ret |
| 3757 } | 4186 } |
| 3758 } | 4187 } |
| 3759 #endif // HAS_ARGBBLENDROW_SSSE3 | 4188 #endif // HAS_ARGBBLENDROW_SSSE3 |
| 3760 | 4189 |
| 3761 #ifdef HAS_ARGBATTENUATEROW_SSE2 | 4190 #ifdef HAS_ARGBATTENUATEROW_SSE2 |
| 3762 // Attenuate 4 pixels at a time. | 4191 // Attenuate 4 pixels at a time. |
| 3763 __declspec(naked) __declspec(align(16)) | 4192 __declspec(naked) |
| 3764 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { | 4193 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { |
| 3765 __asm { | 4194 __asm { |
| 3766 mov eax, [esp + 4] // src_argb0 | 4195 mov eax, [esp + 4] // src_argb0 |
| 3767 mov edx, [esp + 8] // dst_argb | 4196 mov edx, [esp + 8] // dst_argb |
| 3768 mov ecx, [esp + 12] // width | 4197 mov ecx, [esp + 12] // width |
| 3769 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | 4198 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
| 3770 pslld xmm4, 24 | 4199 pslld xmm4, 24 |
| 3771 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff | 4200 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff |
| 3772 psrld xmm5, 8 | 4201 psrld xmm5, 8 |
| 3773 | 4202 |
| (...skipping 28 matching lines...) Expand all Loading... |
| 3802 | 4231 |
| 3803 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 4232 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
| 3804 // Shuffle table duplicating alpha. | 4233 // Shuffle table duplicating alpha. |
| 3805 static const uvec8 kShuffleAlpha0 = { | 4234 static const uvec8 kShuffleAlpha0 = { |
| 3806 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, | 4235 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, |
| 3807 }; | 4236 }; |
| 3808 static const uvec8 kShuffleAlpha1 = { | 4237 static const uvec8 kShuffleAlpha1 = { |
| 3809 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | 4238 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
| 3810 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, | 4239 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, |
| 3811 }; | 4240 }; |
| 3812 __declspec(naked) __declspec(align(16)) | 4241 __declspec(naked) |
| 3813 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | 4242 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
| 3814 __asm { | 4243 __asm { |
| 3815 mov eax, [esp + 4] // src_argb0 | 4244 mov eax, [esp + 4] // src_argb0 |
| 3816 mov edx, [esp + 8] // dst_argb | 4245 mov edx, [esp + 8] // dst_argb |
| 3817 mov ecx, [esp + 12] // width | 4246 mov ecx, [esp + 12] // width |
| 3818 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 | 4247 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 |
| 3819 pslld xmm3, 24 | 4248 pslld xmm3, 24 |
| 3820 movdqa xmm4, kShuffleAlpha0 | 4249 movdqa xmm4, kShuffleAlpha0 |
| 3821 movdqa xmm5, kShuffleAlpha1 | 4250 movdqa xmm5, kShuffleAlpha1 |
| 3822 | 4251 |
| (...skipping 23 matching lines...) Expand all Loading... |
| 3846 ret | 4275 ret |
| 3847 } | 4276 } |
| 3848 } | 4277 } |
| 3849 #endif // HAS_ARGBATTENUATEROW_SSSE3 | 4278 #endif // HAS_ARGBATTENUATEROW_SSSE3 |
| 3850 | 4279 |
| 3851 #ifdef HAS_ARGBATTENUATEROW_AVX2 | 4280 #ifdef HAS_ARGBATTENUATEROW_AVX2 |
| 3852 // Shuffle table duplicating alpha. | 4281 // Shuffle table duplicating alpha. |
| 3853 static const uvec8 kShuffleAlpha_AVX2 = { | 4282 static const uvec8 kShuffleAlpha_AVX2 = { |
| 3854 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u | 4283 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u |
| 3855 }; | 4284 }; |
| 3856 __declspec(naked) __declspec(align(16)) | 4285 __declspec(naked) |
| 3857 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { | 4286 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { |
| 3858 __asm { | 4287 __asm { |
| 3859 mov eax, [esp + 4] // src_argb0 | 4288 mov eax, [esp + 4] // src_argb0 |
| 3860 mov edx, [esp + 8] // dst_argb | 4289 mov edx, [esp + 8] // dst_argb |
| 3861 mov ecx, [esp + 12] // width | 4290 mov ecx, [esp + 12] // width |
| 3862 sub edx, eax | 4291 sub edx, eax |
| 3863 vbroadcastf128 ymm4,kShuffleAlpha_AVX2 | 4292 vbroadcastf128 ymm4,kShuffleAlpha_AVX2 |
| 3864 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 | 4293 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
| 3865 vpslld ymm5, ymm5, 24 | 4294 vpslld ymm5, ymm5, 24 |
| 3866 | 4295 |
| (...skipping 16 matching lines...) Expand all Loading... |
| 3883 jg convertloop | 4312 jg convertloop |
| 3884 | 4313 |
| 3885 vzeroupper | 4314 vzeroupper |
| 3886 ret | 4315 ret |
| 3887 } | 4316 } |
| 3888 } | 4317 } |
| 3889 #endif // HAS_ARGBATTENUATEROW_AVX2 | 4318 #endif // HAS_ARGBATTENUATEROW_AVX2 |
| 3890 | 4319 |
| 3891 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 | 4320 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
| 3892 // Unattenuate 4 pixels at a time. | 4321 // Unattenuate 4 pixels at a time. |
| 3893 __declspec(naked) __declspec(align(16)) | 4322 __declspec(naked) |
| 3894 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 4323 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
| 3895 int width) { | 4324 int width) { |
| 3896 __asm { | 4325 __asm { |
| 3897 push esi | 4326 push esi |
| 3898 push edi | 4327 push edi |
| 3899 mov eax, [esp + 8 + 4] // src_argb0 | 4328 mov eax, [esp + 8 + 4] // src_argb0 |
| 3900 mov edx, [esp + 8 + 8] // dst_argb | 4329 mov edx, [esp + 8 + 8] // dst_argb |
| 3901 mov ecx, [esp + 8 + 12] // width | 4330 mov ecx, [esp + 8 + 12] // width |
| 3902 | 4331 |
| 3903 convertloop: | 4332 convertloop: |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3937 #endif // HAS_ARGBUNATTENUATEROW_SSE2 | 4366 #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
| 3938 | 4367 |
| 3939 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 | 4368 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 |
| 3940 // Shuffle table duplicating alpha. | 4369 // Shuffle table duplicating alpha. |
| 3941 static const uvec8 kUnattenShuffleAlpha_AVX2 = { | 4370 static const uvec8 kUnattenShuffleAlpha_AVX2 = { |
| 3942 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u | 4371 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u |
| 3943 }; | 4372 }; |
| 3944 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. | 4373 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. |
| 3945 // USE_GATHER is not on by default, due to being a slow instruction. | 4374 // USE_GATHER is not on by default, due to being a slow instruction. |
| 3946 #ifdef USE_GATHER | 4375 #ifdef USE_GATHER |
| 3947 __declspec(naked) __declspec(align(16)) | 4376 __declspec(naked) |
| 3948 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 4377 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
| 3949 int width) { | 4378 int width) { |
| 3950 __asm { | 4379 __asm { |
| 3951 mov eax, [esp + 4] // src_argb0 | 4380 mov eax, [esp + 4] // src_argb0 |
| 3952 mov edx, [esp + 8] // dst_argb | 4381 mov edx, [esp + 8] // dst_argb |
| 3953 mov ecx, [esp + 12] // width | 4382 mov ecx, [esp + 12] // width |
| 3954 sub edx, eax | 4383 sub edx, eax |
| 3955 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 | 4384 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 |
| 3956 | 4385 |
| 3957 convertloop: | 4386 convertloop: |
| (...skipping 13 matching lines...) Expand all Loading... |
| 3971 vmovdqu [eax + edx], ymm0 | 4400 vmovdqu [eax + edx], ymm0 |
| 3972 lea eax, [eax + 32] | 4401 lea eax, [eax + 32] |
| 3973 sub ecx, 8 | 4402 sub ecx, 8 |
| 3974 jg convertloop | 4403 jg convertloop |
| 3975 | 4404 |
| 3976 vzeroupper | 4405 vzeroupper |
| 3977 ret | 4406 ret |
| 3978 } | 4407 } |
| 3979 } | 4408 } |
| 3980 #else // USE_GATHER | 4409 #else // USE_GATHER |
| 3981 __declspec(naked) __declspec(align(16)) | 4410 __declspec(naked) |
| 3982 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 4411 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
| 3983 int width) { | 4412 int width) { |
| 3984 __asm { | 4413 __asm { |
| 3985 | 4414 |
| 3986 mov eax, [esp + 4] // src_argb0 | 4415 mov eax, [esp + 4] // src_argb0 |
| 3987 mov edx, [esp + 8] // dst_argb | 4416 mov edx, [esp + 8] // dst_argb |
| 3988 mov ecx, [esp + 12] // width | 4417 mov ecx, [esp + 12] // width |
| 3989 sub edx, eax | 4418 sub edx, eax |
| 3990 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 | 4419 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 |
| 3991 | 4420 |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4038 pop esi | 4467 pop esi |
| 4039 vzeroupper | 4468 vzeroupper |
| 4040 ret | 4469 ret |
| 4041 } | 4470 } |
| 4042 } | 4471 } |
| 4043 #endif // USE_GATHER | 4472 #endif // USE_GATHER |
| 4044 #endif // HAS_ARGBATTENUATEROW_AVX2 | 4473 #endif // HAS_ARGBATTENUATEROW_AVX2 |
| 4045 | 4474 |
| 4046 #ifdef HAS_ARGBGRAYROW_SSSE3 | 4475 #ifdef HAS_ARGBGRAYROW_SSSE3 |
| 4047 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. | 4476 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. |
| 4048 __declspec(naked) __declspec(align(16)) | 4477 __declspec(naked) |
| 4049 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | 4478 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
| 4050 __asm { | 4479 __asm { |
| 4051 mov eax, [esp + 4] /* src_argb */ | 4480 mov eax, [esp + 4] /* src_argb */ |
| 4052 mov edx, [esp + 8] /* dst_argb */ | 4481 mov edx, [esp + 8] /* dst_argb */ |
| 4053 mov ecx, [esp + 12] /* width */ | 4482 mov ecx, [esp + 12] /* width */ |
| 4054 movdqa xmm4, kARGBToYJ | 4483 movdqa xmm4, kARGBToYJ |
| 4055 movdqa xmm5, kAddYJ64 | 4484 movdqa xmm5, kAddYJ64 |
| 4056 | 4485 |
| 4057 convertloop: | 4486 convertloop: |
| 4058 movdqu xmm0, [eax] // G | 4487 movdqu xmm0, [eax] // G |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4097 | 4526 |
| 4098 static const vec8 kARGBToSepiaG = { | 4527 static const vec8 kARGBToSepiaG = { |
| 4099 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 | 4528 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 |
| 4100 }; | 4529 }; |
| 4101 | 4530 |
| 4102 static const vec8 kARGBToSepiaR = { | 4531 static const vec8 kARGBToSepiaR = { |
| 4103 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 | 4532 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 |
| 4104 }; | 4533 }; |
| 4105 | 4534 |
| 4106 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | 4535 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
| 4107 __declspec(naked) __declspec(align(16)) | 4536 __declspec(naked) |
| 4108 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { | 4537 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { |
| 4109 __asm { | 4538 __asm { |
| 4110 mov eax, [esp + 4] /* dst_argb */ | 4539 mov eax, [esp + 4] /* dst_argb */ |
| 4111 mov ecx, [esp + 8] /* width */ | 4540 mov ecx, [esp + 8] /* width */ |
| 4112 movdqa xmm2, kARGBToSepiaB | 4541 movdqa xmm2, kARGBToSepiaB |
| 4113 movdqa xmm3, kARGBToSepiaG | 4542 movdqa xmm3, kARGBToSepiaG |
| 4114 movdqa xmm4, kARGBToSepiaR | 4543 movdqa xmm4, kARGBToSepiaR |
| 4115 | 4544 |
| 4116 convertloop: | 4545 convertloop: |
| 4117 movdqu xmm0, [eax] // B | 4546 movdqu xmm0, [eax] // B |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4154 ret | 4583 ret |
| 4155 } | 4584 } |
| 4156 } | 4585 } |
| 4157 #endif // HAS_ARGBSEPIAROW_SSSE3 | 4586 #endif // HAS_ARGBSEPIAROW_SSSE3 |
| 4158 | 4587 |
| 4159 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 | 4588 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
| 4160 // Tranform 8 ARGB pixels (32 bytes) with color matrix. | 4589 // Tranform 8 ARGB pixels (32 bytes) with color matrix. |
| 4161 // Same as Sepia except matrix is provided. | 4590 // Same as Sepia except matrix is provided. |
| 4162 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R | 4591 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R |
| 4163 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. | 4592 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. |
| 4164 __declspec(naked) __declspec(align(16)) | 4593 __declspec(naked) |
| 4165 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 4594 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
| 4166 const int8* matrix_argb, int width) { | 4595 const int8* matrix_argb, int width) { |
| 4167 __asm { | 4596 __asm { |
| 4168 mov eax, [esp + 4] /* src_argb */ | 4597 mov eax, [esp + 4] /* src_argb */ |
| 4169 mov edx, [esp + 8] /* dst_argb */ | 4598 mov edx, [esp + 8] /* dst_argb */ |
| 4170 mov ecx, [esp + 12] /* matrix_argb */ | 4599 mov ecx, [esp + 12] /* matrix_argb */ |
| 4171 movdqu xmm5, [ecx] | 4600 movdqu xmm5, [ecx] |
| 4172 pshufd xmm2, xmm5, 0x00 | 4601 pshufd xmm2, xmm5, 0x00 |
| 4173 pshufd xmm3, xmm5, 0x55 | 4602 pshufd xmm3, xmm5, 0x55 |
| 4174 pshufd xmm4, xmm5, 0xaa | 4603 pshufd xmm4, xmm5, 0xaa |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4215 lea edx, [edx + 32] | 4644 lea edx, [edx + 32] |
| 4216 sub ecx, 8 | 4645 sub ecx, 8 |
| 4217 jg convertloop | 4646 jg convertloop |
| 4218 ret | 4647 ret |
| 4219 } | 4648 } |
| 4220 } | 4649 } |
| 4221 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 | 4650 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
| 4222 | 4651 |
| 4223 #ifdef HAS_ARGBQUANTIZEROW_SSE2 | 4652 #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
| 4224 // Quantize 4 ARGB pixels (16 bytes). | 4653 // Quantize 4 ARGB pixels (16 bytes). |
| 4225 __declspec(naked) __declspec(align(16)) | 4654 __declspec(naked) |
| 4226 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, | 4655 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, |
| 4227 int interval_offset, int width) { | 4656 int interval_offset, int width) { |
| 4228 __asm { | 4657 __asm { |
| 4229 mov eax, [esp + 4] /* dst_argb */ | 4658 mov eax, [esp + 4] /* dst_argb */ |
| 4230 movd xmm2, [esp + 8] /* scale */ | 4659 movd xmm2, [esp + 8] /* scale */ |
| 4231 movd xmm3, [esp + 12] /* interval_size */ | 4660 movd xmm3, [esp + 12] /* interval_size */ |
| 4232 movd xmm4, [esp + 16] /* interval_offset */ | 4661 movd xmm4, [esp + 16] /* interval_offset */ |
| 4233 mov ecx, [esp + 20] /* width */ | 4662 mov ecx, [esp + 20] /* width */ |
| 4234 pshuflw xmm2, xmm2, 040h | 4663 pshuflw xmm2, xmm2, 040h |
| 4235 pshufd xmm2, xmm2, 044h | 4664 pshufd xmm2, xmm2, 044h |
| (...skipping 24 matching lines...) Expand all Loading... |
| 4260 lea eax, [eax + 16] | 4689 lea eax, [eax + 16] |
| 4261 sub ecx, 4 | 4690 sub ecx, 4 |
| 4262 jg convertloop | 4691 jg convertloop |
| 4263 ret | 4692 ret |
| 4264 } | 4693 } |
| 4265 } | 4694 } |
| 4266 #endif // HAS_ARGBQUANTIZEROW_SSE2 | 4695 #endif // HAS_ARGBQUANTIZEROW_SSE2 |
| 4267 | 4696 |
| 4268 #ifdef HAS_ARGBSHADEROW_SSE2 | 4697 #ifdef HAS_ARGBSHADEROW_SSE2 |
| 4269 // Shade 4 pixels at a time by specified value. | 4698 // Shade 4 pixels at a time by specified value. |
| 4270 __declspec(naked) __declspec(align(16)) | 4699 __declspec(naked) |
| 4271 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, | 4700 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, |
| 4272 uint32 value) { | 4701 uint32 value) { |
| 4273 __asm { | 4702 __asm { |
| 4274 mov eax, [esp + 4] // src_argb | 4703 mov eax, [esp + 4] // src_argb |
| 4275 mov edx, [esp + 8] // dst_argb | 4704 mov edx, [esp + 8] // dst_argb |
| 4276 mov ecx, [esp + 12] // width | 4705 mov ecx, [esp + 12] // width |
| 4277 movd xmm2, [esp + 16] // value | 4706 movd xmm2, [esp + 16] // value |
| 4278 punpcklbw xmm2, xmm2 | 4707 punpcklbw xmm2, xmm2 |
| 4279 punpcklqdq xmm2, xmm2 | 4708 punpcklqdq xmm2, xmm2 |
| 4280 | 4709 |
| (...skipping 13 matching lines...) Expand all Loading... |
| 4294 sub ecx, 4 | 4723 sub ecx, 4 |
| 4295 jg convertloop | 4724 jg convertloop |
| 4296 | 4725 |
| 4297 ret | 4726 ret |
| 4298 } | 4727 } |
| 4299 } | 4728 } |
| 4300 #endif // HAS_ARGBSHADEROW_SSE2 | 4729 #endif // HAS_ARGBSHADEROW_SSE2 |
| 4301 | 4730 |
| 4302 #ifdef HAS_ARGBMULTIPLYROW_SSE2 | 4731 #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
| 4303 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. | 4732 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. |
| 4304 __declspec(naked) __declspec(align(16)) | 4733 __declspec(naked) |
| 4305 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4734 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
| 4306 uint8* dst_argb, int width) { | 4735 uint8* dst_argb, int width) { |
| 4307 __asm { | 4736 __asm { |
| 4308 push esi | 4737 push esi |
| 4309 mov eax, [esp + 4 + 4] // src_argb0 | 4738 mov eax, [esp + 4 + 4] // src_argb0 |
| 4310 mov esi, [esp + 4 + 8] // src_argb1 | 4739 mov esi, [esp + 4 + 8] // src_argb1 |
| 4311 mov edx, [esp + 4 + 12] // dst_argb | 4740 mov edx, [esp + 4 + 12] // dst_argb |
| 4312 mov ecx, [esp + 4 + 16] // width | 4741 mov ecx, [esp + 4 + 16] // width |
| 4313 pxor xmm5, xmm5 // constant 0 | 4742 pxor xmm5, xmm5 // constant 0 |
| 4314 | 4743 |
| (...skipping 18 matching lines...) Expand all Loading... |
| 4333 | 4762 |
| 4334 pop esi | 4763 pop esi |
| 4335 ret | 4764 ret |
| 4336 } | 4765 } |
| 4337 } | 4766 } |
| 4338 #endif // HAS_ARGBMULTIPLYROW_SSE2 | 4767 #endif // HAS_ARGBMULTIPLYROW_SSE2 |
| 4339 | 4768 |
| 4340 #ifdef HAS_ARGBADDROW_SSE2 | 4769 #ifdef HAS_ARGBADDROW_SSE2 |
| 4341 // Add 2 rows of ARGB pixels together, 4 pixels at a time. | 4770 // Add 2 rows of ARGB pixels together, 4 pixels at a time. |
| 4342 // TODO(fbarchard): Port this to posix, neon and other math functions. | 4771 // TODO(fbarchard): Port this to posix, neon and other math functions. |
| 4343 __declspec(naked) __declspec(align(16)) | 4772 __declspec(naked) |
| 4344 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4773 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
| 4345 uint8* dst_argb, int width) { | 4774 uint8* dst_argb, int width) { |
| 4346 __asm { | 4775 __asm { |
| 4347 push esi | 4776 push esi |
| 4348 mov eax, [esp + 4 + 4] // src_argb0 | 4777 mov eax, [esp + 4 + 4] // src_argb0 |
| 4349 mov esi, [esp + 4 + 8] // src_argb1 | 4778 mov esi, [esp + 4 + 8] // src_argb1 |
| 4350 mov edx, [esp + 4 + 12] // dst_argb | 4779 mov edx, [esp + 4 + 12] // dst_argb |
| 4351 mov ecx, [esp + 4 + 16] // width | 4780 mov ecx, [esp + 4 + 16] // width |
| 4352 | 4781 |
| 4353 sub ecx, 4 | 4782 sub ecx, 4 |
| (...skipping 27 matching lines...) Expand all Loading... |
| 4381 | 4810 |
| 4382 convertloop19: | 4811 convertloop19: |
| 4383 pop esi | 4812 pop esi |
| 4384 ret | 4813 ret |
| 4385 } | 4814 } |
| 4386 } | 4815 } |
| 4387 #endif // HAS_ARGBADDROW_SSE2 | 4816 #endif // HAS_ARGBADDROW_SSE2 |
| 4388 | 4817 |
| 4389 #ifdef HAS_ARGBSUBTRACTROW_SSE2 | 4818 #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
| 4390 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. | 4819 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. |
| 4391 __declspec(naked) __declspec(align(16)) | 4820 __declspec(naked) |
| 4392 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4821 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
| 4393 uint8* dst_argb, int width) { | 4822 uint8* dst_argb, int width) { |
| 4394 __asm { | 4823 __asm { |
| 4395 push esi | 4824 push esi |
| 4396 mov eax, [esp + 4 + 4] // src_argb0 | 4825 mov eax, [esp + 4 + 4] // src_argb0 |
| 4397 mov esi, [esp + 4 + 8] // src_argb1 | 4826 mov esi, [esp + 4 + 8] // src_argb1 |
| 4398 mov edx, [esp + 4 + 12] // dst_argb | 4827 mov edx, [esp + 4 + 12] // dst_argb |
| 4399 mov ecx, [esp + 4 + 16] // width | 4828 mov ecx, [esp + 4 + 16] // width |
| 4400 | 4829 |
| 4401 convertloop: | 4830 convertloop: |
| 4402 movdqu xmm0, [eax] // read 4 pixels from src_argb0 | 4831 movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
| 4403 lea eax, [eax + 16] | 4832 lea eax, [eax + 16] |
| 4404 movdqu xmm1, [esi] // read 4 pixels from src_argb1 | 4833 movdqu xmm1, [esi] // read 4 pixels from src_argb1 |
| 4405 lea esi, [esi + 16] | 4834 lea esi, [esi + 16] |
| 4406 psubusb xmm0, xmm1 // src_argb0 - src_argb1 | 4835 psubusb xmm0, xmm1 // src_argb0 - src_argb1 |
| 4407 movdqu [edx], xmm0 | 4836 movdqu [edx], xmm0 |
| 4408 lea edx, [edx + 16] | 4837 lea edx, [edx + 16] |
| 4409 sub ecx, 4 | 4838 sub ecx, 4 |
| 4410 jg convertloop | 4839 jg convertloop |
| 4411 | 4840 |
| 4412 pop esi | 4841 pop esi |
| 4413 ret | 4842 ret |
| 4414 } | 4843 } |
| 4415 } | 4844 } |
| 4416 #endif // HAS_ARGBSUBTRACTROW_SSE2 | 4845 #endif // HAS_ARGBSUBTRACTROW_SSE2 |
| 4417 | 4846 |
| 4418 #ifdef HAS_ARGBMULTIPLYROW_AVX2 | 4847 #ifdef HAS_ARGBMULTIPLYROW_AVX2 |
| 4419 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 4848 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
| 4420 __declspec(naked) __declspec(align(16)) | 4849 __declspec(naked) |
| 4421 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4850 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
| 4422 uint8* dst_argb, int width) { | 4851 uint8* dst_argb, int width) { |
| 4423 __asm { | 4852 __asm { |
| 4424 push esi | 4853 push esi |
| 4425 mov eax, [esp + 4 + 4] // src_argb0 | 4854 mov eax, [esp + 4 + 4] // src_argb0 |
| 4426 mov esi, [esp + 4 + 8] // src_argb1 | 4855 mov esi, [esp + 4 + 8] // src_argb1 |
| 4427 mov edx, [esp + 4 + 12] // dst_argb | 4856 mov edx, [esp + 4 + 12] // dst_argb |
| 4428 mov ecx, [esp + 4 + 16] // width | 4857 mov ecx, [esp + 4 + 16] // width |
| 4429 vpxor ymm5, ymm5, ymm5 // constant 0 | 4858 vpxor ymm5, ymm5, ymm5 // constant 0 |
| 4430 | 4859 |
| (...skipping 16 matching lines...) Expand all Loading... |
| 4447 | 4876 |
| 4448 pop esi | 4877 pop esi |
| 4449 vzeroupper | 4878 vzeroupper |
| 4450 ret | 4879 ret |
| 4451 } | 4880 } |
| 4452 } | 4881 } |
| 4453 #endif // HAS_ARGBMULTIPLYROW_AVX2 | 4882 #endif // HAS_ARGBMULTIPLYROW_AVX2 |
| 4454 | 4883 |
| 4455 #ifdef HAS_ARGBADDROW_AVX2 | 4884 #ifdef HAS_ARGBADDROW_AVX2 |
| 4456 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | 4885 // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
| 4457 __declspec(naked) __declspec(align(16)) | 4886 __declspec(naked) |
| 4458 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4887 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
| 4459 uint8* dst_argb, int width) { | 4888 uint8* dst_argb, int width) { |
| 4460 __asm { | 4889 __asm { |
| 4461 push esi | 4890 push esi |
| 4462 mov eax, [esp + 4 + 4] // src_argb0 | 4891 mov eax, [esp + 4 + 4] // src_argb0 |
| 4463 mov esi, [esp + 4 + 8] // src_argb1 | 4892 mov esi, [esp + 4 + 8] // src_argb1 |
| 4464 mov edx, [esp + 4 + 12] // dst_argb | 4893 mov edx, [esp + 4 + 12] // dst_argb |
| 4465 mov ecx, [esp + 4 + 16] // width | 4894 mov ecx, [esp + 4 + 16] // width |
| 4466 | 4895 |
| 4467 convertloop: | 4896 convertloop: |
| 4468 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 | 4897 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 |
| 4469 lea eax, [eax + 32] | 4898 lea eax, [eax + 32] |
| 4470 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 | 4899 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 |
| 4471 lea esi, [esi + 32] | 4900 lea esi, [esi + 32] |
| 4472 vmovdqu [edx], ymm0 | 4901 vmovdqu [edx], ymm0 |
| 4473 lea edx, [edx + 32] | 4902 lea edx, [edx + 32] |
| 4474 sub ecx, 8 | 4903 sub ecx, 8 |
| 4475 jg convertloop | 4904 jg convertloop |
| 4476 | 4905 |
| 4477 pop esi | 4906 pop esi |
| 4478 vzeroupper | 4907 vzeroupper |
| 4479 ret | 4908 ret |
| 4480 } | 4909 } |
| 4481 } | 4910 } |
| 4482 #endif // HAS_ARGBADDROW_AVX2 | 4911 #endif // HAS_ARGBADDROW_AVX2 |
| 4483 | 4912 |
| 4484 #ifdef HAS_ARGBSUBTRACTROW_AVX2 | 4913 #ifdef HAS_ARGBSUBTRACTROW_AVX2 |
| 4485 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. | 4914 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. |
| 4486 __declspec(naked) __declspec(align(16)) | 4915 __declspec(naked) |
| 4487 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4916 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
| 4488 uint8* dst_argb, int width) { | 4917 uint8* dst_argb, int width) { |
| 4489 __asm { | 4918 __asm { |
| 4490 push esi | 4919 push esi |
| 4491 mov eax, [esp + 4 + 4] // src_argb0 | 4920 mov eax, [esp + 4 + 4] // src_argb0 |
| 4492 mov esi, [esp + 4 + 8] // src_argb1 | 4921 mov esi, [esp + 4 + 8] // src_argb1 |
| 4493 mov edx, [esp + 4 + 12] // dst_argb | 4922 mov edx, [esp + 4 + 12] // dst_argb |
| 4494 mov ecx, [esp + 4 + 16] // width | 4923 mov ecx, [esp + 4 + 16] // width |
| 4495 | 4924 |
| 4496 convertloop: | 4925 convertloop: |
| (...skipping 11 matching lines...) Expand all Loading... |
| 4508 ret | 4937 ret |
| 4509 } | 4938 } |
| 4510 } | 4939 } |
| 4511 #endif // HAS_ARGBSUBTRACTROW_AVX2 | 4940 #endif // HAS_ARGBSUBTRACTROW_AVX2 |
| 4512 | 4941 |
| 4513 #ifdef HAS_SOBELXROW_SSE2 | 4942 #ifdef HAS_SOBELXROW_SSE2 |
| 4514 // SobelX as a matrix is | 4943 // SobelX as a matrix is |
| 4515 // -1 0 1 | 4944 // -1 0 1 |
| 4516 // -2 0 2 | 4945 // -2 0 2 |
| 4517 // -1 0 1 | 4946 // -1 0 1 |
| 4518 __declspec(naked) __declspec(align(16)) | 4947 __declspec(naked) |
| 4519 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 4948 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
| 4520 const uint8* src_y2, uint8* dst_sobelx, int width) { | 4949 const uint8* src_y2, uint8* dst_sobelx, int width) { |
| 4521 __asm { | 4950 __asm { |
| 4522 push esi | 4951 push esi |
| 4523 push edi | 4952 push edi |
| 4524 mov eax, [esp + 8 + 4] // src_y0 | 4953 mov eax, [esp + 8 + 4] // src_y0 |
| 4525 mov esi, [esp + 8 + 8] // src_y1 | 4954 mov esi, [esp + 8 + 8] // src_y1 |
| 4526 mov edi, [esp + 8 + 12] // src_y2 | 4955 mov edi, [esp + 8 + 12] // src_y2 |
| 4527 mov edx, [esp + 8 + 16] // dst_sobelx | 4956 mov edx, [esp + 8 + 16] // dst_sobelx |
| 4528 mov ecx, [esp + 8 + 20] // width | 4957 mov ecx, [esp + 8 + 20] // width |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4564 ret | 4993 ret |
| 4565 } | 4994 } |
| 4566 } | 4995 } |
| 4567 #endif // HAS_SOBELXROW_SSE2 | 4996 #endif // HAS_SOBELXROW_SSE2 |
| 4568 | 4997 |
| 4569 #ifdef HAS_SOBELYROW_SSE2 | 4998 #ifdef HAS_SOBELYROW_SSE2 |
| 4570 // SobelY as a matrix is | 4999 // SobelY as a matrix is |
| 4571 // -1 -2 -1 | 5000 // -1 -2 -1 |
| 4572 // 0 0 0 | 5001 // 0 0 0 |
| 4573 // 1 2 1 | 5002 // 1 2 1 |
| 4574 __declspec(naked) __declspec(align(16)) | 5003 __declspec(naked) |
| 4575 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 5004 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
| 4576 uint8* dst_sobely, int width) { | 5005 uint8* dst_sobely, int width) { |
| 4577 __asm { | 5006 __asm { |
| 4578 push esi | 5007 push esi |
| 4579 mov eax, [esp + 4 + 4] // src_y0 | 5008 mov eax, [esp + 4 + 4] // src_y0 |
| 4580 mov esi, [esp + 4 + 8] // src_y1 | 5009 mov esi, [esp + 4 + 8] // src_y1 |
| 4581 mov edx, [esp + 4 + 12] // dst_sobely | 5010 mov edx, [esp + 4 + 12] // dst_sobely |
| 4582 mov ecx, [esp + 4 + 16] // width | 5011 mov ecx, [esp + 4 + 16] // width |
| 4583 sub esi, eax | 5012 sub esi, eax |
| 4584 sub edx, eax | 5013 sub edx, eax |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4617 } | 5046 } |
| 4618 } | 5047 } |
| 4619 #endif // HAS_SOBELYROW_SSE2 | 5048 #endif // HAS_SOBELYROW_SSE2 |
| 4620 | 5049 |
| 4621 #ifdef HAS_SOBELROW_SSE2 | 5050 #ifdef HAS_SOBELROW_SSE2 |
| 4622 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | 5051 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
| 4623 // A = 255 | 5052 // A = 255 |
| 4624 // R = Sobel | 5053 // R = Sobel |
| 4625 // G = Sobel | 5054 // G = Sobel |
| 4626 // B = Sobel | 5055 // B = Sobel |
| 4627 __declspec(naked) __declspec(align(16)) | 5056 __declspec(naked) |
| 4628 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 5057 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
| 4629 uint8* dst_argb, int width) { | 5058 uint8* dst_argb, int width) { |
| 4630 __asm { | 5059 __asm { |
| 4631 push esi | 5060 push esi |
| 4632 mov eax, [esp + 4 + 4] // src_sobelx | 5061 mov eax, [esp + 4 + 4] // src_sobelx |
| 4633 mov esi, [esp + 4 + 8] // src_sobely | 5062 mov esi, [esp + 4 + 8] // src_sobely |
| 4634 mov edx, [esp + 4 + 12] // dst_argb | 5063 mov edx, [esp + 4 + 12] // dst_argb |
| 4635 mov ecx, [esp + 4 + 16] // width | 5064 mov ecx, [esp + 4 + 16] // width |
| 4636 sub esi, eax | 5065 sub esi, eax |
| 4637 pcmpeqb xmm5, xmm5 // alpha 255 | 5066 pcmpeqb xmm5, xmm5 // alpha 255 |
| (...skipping 26 matching lines...) Expand all Loading... |
| 4664 jg convertloop | 5093 jg convertloop |
| 4665 | 5094 |
| 4666 pop esi | 5095 pop esi |
| 4667 ret | 5096 ret |
| 4668 } | 5097 } |
| 4669 } | 5098 } |
| 4670 #endif // HAS_SOBELROW_SSE2 | 5099 #endif // HAS_SOBELROW_SSE2 |
| 4671 | 5100 |
| 4672 #ifdef HAS_SOBELTOPLANEROW_SSE2 | 5101 #ifdef HAS_SOBELTOPLANEROW_SSE2 |
| 4673 // Adds Sobel X and Sobel Y and stores Sobel into a plane. | 5102 // Adds Sobel X and Sobel Y and stores Sobel into a plane. |
| 4674 __declspec(naked) __declspec(align(16)) | 5103 __declspec(naked) |
| 4675 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 5104 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
| 4676 uint8* dst_y, int width) { | 5105 uint8* dst_y, int width) { |
| 4677 __asm { | 5106 __asm { |
| 4678 push esi | 5107 push esi |
| 4679 mov eax, [esp + 4 + 4] // src_sobelx | 5108 mov eax, [esp + 4 + 4] // src_sobelx |
| 4680 mov esi, [esp + 4 + 8] // src_sobely | 5109 mov esi, [esp + 4 + 8] // src_sobely |
| 4681 mov edx, [esp + 4 + 12] // dst_argb | 5110 mov edx, [esp + 4 + 12] // dst_argb |
| 4682 mov ecx, [esp + 4 + 16] // width | 5111 mov ecx, [esp + 4 + 16] // width |
| 4683 sub esi, eax | 5112 sub esi, eax |
| 4684 | 5113 |
| (...skipping 12 matching lines...) Expand all Loading... |
| 4697 } | 5126 } |
| 4698 } | 5127 } |
| 4699 #endif // HAS_SOBELTOPLANEROW_SSE2 | 5128 #endif // HAS_SOBELTOPLANEROW_SSE2 |
| 4700 | 5129 |
| 4701 #ifdef HAS_SOBELXYROW_SSE2 | 5130 #ifdef HAS_SOBELXYROW_SSE2 |
| 4702 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | 5131 // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
| 4703 // A = 255 | 5132 // A = 255 |
| 4704 // R = Sobel X | 5133 // R = Sobel X |
| 4705 // G = Sobel | 5134 // G = Sobel |
| 4706 // B = Sobel Y | 5135 // B = Sobel Y |
| 4707 __declspec(naked) __declspec(align(16)) | 5136 __declspec(naked) |
| 4708 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 5137 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
| 4709 uint8* dst_argb, int width) { | 5138 uint8* dst_argb, int width) { |
| 4710 __asm { | 5139 __asm { |
| 4711 push esi | 5140 push esi |
| 4712 mov eax, [esp + 4 + 4] // src_sobelx | 5141 mov eax, [esp + 4 + 4] // src_sobelx |
| 4713 mov esi, [esp + 4 + 8] // src_sobely | 5142 mov esi, [esp + 4 + 8] // src_sobely |
| 4714 mov edx, [esp + 4 + 12] // dst_argb | 5143 mov edx, [esp + 4 + 12] // dst_argb |
| 4715 mov ecx, [esp + 4 + 16] // width | 5144 mov ecx, [esp + 4 + 16] // width |
| 4716 sub esi, eax | 5145 sub esi, eax |
| 4717 pcmpeqb xmm5, xmm5 // alpha 255 | 5146 pcmpeqb xmm5, xmm5 // alpha 255 |
| (...skipping 266 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4984 sub ecx, 1 | 5413 sub ecx, 1 |
| 4985 jge l1 | 5414 jge l1 |
| 4986 | 5415 |
| 4987 l1b: | 5416 l1b: |
| 4988 } | 5417 } |
| 4989 } | 5418 } |
| 4990 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 | 5419 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
| 4991 | 5420 |
| 4992 #ifdef HAS_ARGBAFFINEROW_SSE2 | 5421 #ifdef HAS_ARGBAFFINEROW_SSE2 |
| 4993 // Copy ARGB pixels from source image with slope to a row of destination. | 5422 // Copy ARGB pixels from source image with slope to a row of destination. |
| 4994 __declspec(naked) __declspec(align(16)) | 5423 __declspec(naked) |
| 4995 LIBYUV_API | 5424 LIBYUV_API |
| 4996 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, | 5425 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, |
| 4997 uint8* dst_argb, const float* uv_dudv, int width) { | 5426 uint8* dst_argb, const float* uv_dudv, int width) { |
| 4998 __asm { | 5427 __asm { |
| 4999 push esi | 5428 push esi |
| 5000 push edi | 5429 push edi |
| 5001 mov eax, [esp + 12] // src_argb | 5430 mov eax, [esp + 12] // src_argb |
| 5002 mov esi, [esp + 16] // stride | 5431 mov esi, [esp + 16] // stride |
| 5003 mov edx, [esp + 20] // dst_argb | 5432 mov edx, [esp + 20] // dst_argb |
| 5004 mov ecx, [esp + 24] // pointer to uv_dudv | 5433 mov ecx, [esp + 24] // pointer to uv_dudv |
| (...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5069 l1b: | 5498 l1b: |
| 5070 pop edi | 5499 pop edi |
| 5071 pop esi | 5500 pop esi |
| 5072 ret | 5501 ret |
| 5073 } | 5502 } |
| 5074 } | 5503 } |
| 5075 #endif // HAS_ARGBAFFINEROW_SSE2 | 5504 #endif // HAS_ARGBAFFINEROW_SSE2 |
| 5076 | 5505 |
| 5077 #ifdef HAS_INTERPOLATEROW_AVX2 | 5506 #ifdef HAS_INTERPOLATEROW_AVX2 |
| 5078 // Bilinear filter 32x2 -> 32x1 | 5507 // Bilinear filter 32x2 -> 32x1 |
| 5079 __declspec(naked) __declspec(align(16)) | 5508 __declspec(naked) |
| 5080 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, | 5509 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, |
| 5081 ptrdiff_t src_stride, int dst_width, | 5510 ptrdiff_t src_stride, int dst_width, |
| 5082 int source_y_fraction) { | 5511 int source_y_fraction) { |
| 5083 __asm { | 5512 __asm { |
| 5084 push esi | 5513 push esi |
| 5085 push edi | 5514 push edi |
| 5086 mov edi, [esp + 8 + 4] // dst_ptr | 5515 mov edi, [esp + 8 + 4] // dst_ptr |
| 5087 mov esi, [esp + 8 + 8] // src_ptr | 5516 mov esi, [esp + 8 + 8] // src_ptr |
| 5088 mov edx, [esp + 8 + 12] // src_stride | 5517 mov edx, [esp + 8 + 12] // src_stride |
| 5089 mov ecx, [esp + 8 + 16] // dst_width | 5518 mov ecx, [esp + 8 + 16] // dst_width |
| (...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5166 xloop99: | 5595 xloop99: |
| 5167 pop edi | 5596 pop edi |
| 5168 pop esi | 5597 pop esi |
| 5169 vzeroupper | 5598 vzeroupper |
| 5170 ret | 5599 ret |
| 5171 } | 5600 } |
| 5172 } | 5601 } |
| 5173 #endif // HAS_INTERPOLATEROW_AVX2 | 5602 #endif // HAS_INTERPOLATEROW_AVX2 |
| 5174 | 5603 |
| 5175 // Bilinear filter 16x2 -> 16x1 | 5604 // Bilinear filter 16x2 -> 16x1 |
| 5176 __declspec(naked) __declspec(align(16)) | 5605 __declspec(naked) |
| 5177 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 5606 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
| 5178 ptrdiff_t src_stride, int dst_width, | 5607 ptrdiff_t src_stride, int dst_width, |
| 5179 int source_y_fraction) { | 5608 int source_y_fraction) { |
| 5180 __asm { | 5609 __asm { |
| 5181 push esi | 5610 push esi |
| 5182 push edi | 5611 push edi |
| 5183 mov edi, [esp + 8 + 4] // dst_ptr | 5612 mov edi, [esp + 8 + 4] // dst_ptr |
| 5184 mov esi, [esp + 8 + 8] // src_ptr | 5613 mov esi, [esp + 8 + 8] // src_ptr |
| 5185 mov edx, [esp + 8 + 12] // src_stride | 5614 mov edx, [esp + 8 + 12] // src_stride |
| 5186 mov ecx, [esp + 8 + 16] // dst_width | 5615 mov ecx, [esp + 8 + 16] // dst_width |
| (...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5267 | 5696 |
| 5268 xloop99: | 5697 xloop99: |
| 5269 pop edi | 5698 pop edi |
| 5270 pop esi | 5699 pop esi |
| 5271 ret | 5700 ret |
| 5272 } | 5701 } |
| 5273 } | 5702 } |
| 5274 | 5703 |
| 5275 #ifdef HAS_INTERPOLATEROW_SSE2 | 5704 #ifdef HAS_INTERPOLATEROW_SSE2 |
| 5276 // Bilinear filter 16x2 -> 16x1 | 5705 // Bilinear filter 16x2 -> 16x1 |
| 5277 __declspec(naked) __declspec(align(16)) | 5706 __declspec(naked) |
| 5278 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, | 5707 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
| 5279 ptrdiff_t src_stride, int dst_width, | 5708 ptrdiff_t src_stride, int dst_width, |
| 5280 int source_y_fraction) { | 5709 int source_y_fraction) { |
| 5281 __asm { | 5710 __asm { |
| 5282 push esi | 5711 push esi |
| 5283 push edi | 5712 push edi |
| 5284 mov edi, [esp + 8 + 4] // dst_ptr | 5713 mov edi, [esp + 8 + 4] // dst_ptr |
| 5285 mov esi, [esp + 8 + 8] // src_ptr | 5714 mov esi, [esp + 8 + 8] // src_ptr |
| 5286 mov edx, [esp + 8 + 12] // src_stride | 5715 mov edx, [esp + 8 + 12] // src_stride |
| 5287 mov ecx, [esp + 8 + 16] // dst_width | 5716 mov ecx, [esp + 8 + 16] // dst_width |
| (...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5373 jg xloop100 | 5802 jg xloop100 |
| 5374 | 5803 |
| 5375 xloop99: | 5804 xloop99: |
| 5376 pop edi | 5805 pop edi |
| 5377 pop esi | 5806 pop esi |
| 5378 ret | 5807 ret |
| 5379 } | 5808 } |
| 5380 } | 5809 } |
| 5381 #endif // HAS_INTERPOLATEROW_SSE2 | 5810 #endif // HAS_INTERPOLATEROW_SSE2 |
| 5382 | 5811 |
| 5383 // Specialized ARGB to Bayer that just isolates G channel. | |
| 5384 __declspec(naked) __declspec(align(16)) | |
| 5385 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, | |
| 5386 uint32 selector, int pix) { | |
| 5387 __asm { | |
| 5388 mov eax, [esp + 4] // src_argb | |
| 5389 mov edx, [esp + 8] // dst_bayer | |
| 5390 // selector | |
| 5391 mov ecx, [esp + 16] // pix | |
| 5392 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff | |
| 5393 psrld xmm5, 24 | |
| 5394 | |
| 5395 wloop: | |
| 5396 movdqu xmm0, [eax] | |
| 5397 movdqu xmm1, [eax + 16] | |
| 5398 lea eax, [eax + 32] | |
| 5399 psrld xmm0, 8 // Move green to bottom. | |
| 5400 psrld xmm1, 8 | |
| 5401 pand xmm0, xmm5 | |
| 5402 pand xmm1, xmm5 | |
| 5403 packssdw xmm0, xmm1 | |
| 5404 packuswb xmm0, xmm1 | |
| 5405 movq qword ptr [edx], xmm0 | |
| 5406 lea edx, [edx + 8] | |
| 5407 sub ecx, 8 | |
| 5408 jg wloop | |
| 5409 ret | |
| 5410 } | |
| 5411 } | |
| 5412 | |
| 5413 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5812 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
| 5414 __declspec(naked) __declspec(align(16)) | 5813 __declspec(naked) |
| 5415 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 5814 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
| 5416 const uint8* shuffler, int pix) { | 5815 const uint8* shuffler, int pix) { |
| 5417 __asm { | 5816 __asm { |
| 5418 mov eax, [esp + 4] // src_argb | 5817 mov eax, [esp + 4] // src_argb |
| 5419 mov edx, [esp + 8] // dst_argb | 5818 mov edx, [esp + 8] // dst_argb |
| 5420 mov ecx, [esp + 12] // shuffler | 5819 mov ecx, [esp + 12] // shuffler |
| 5421 movdqu xmm5, [ecx] | 5820 movdqu xmm5, [ecx] |
| 5422 mov ecx, [esp + 16] // pix | 5821 mov ecx, [esp + 16] // pix |
| 5423 | 5822 |
| 5424 wloop: | 5823 wloop: |
| 5425 movdqu xmm0, [eax] | 5824 movdqu xmm0, [eax] |
| 5426 movdqu xmm1, [eax + 16] | 5825 movdqu xmm1, [eax + 16] |
| 5427 lea eax, [eax + 32] | 5826 lea eax, [eax + 32] |
| 5428 pshufb xmm0, xmm5 | 5827 pshufb xmm0, xmm5 |
| 5429 pshufb xmm1, xmm5 | 5828 pshufb xmm1, xmm5 |
| 5430 movdqu [edx], xmm0 | 5829 movdqu [edx], xmm0 |
| 5431 movdqu [edx + 16], xmm1 | 5830 movdqu [edx + 16], xmm1 |
| 5432 lea edx, [edx + 32] | 5831 lea edx, [edx + 32] |
| 5433 sub ecx, 8 | 5832 sub ecx, 8 |
| 5434 jg wloop | 5833 jg wloop |
| 5435 ret | 5834 ret |
| 5436 } | 5835 } |
| 5437 } | 5836 } |
| 5438 | 5837 |
| 5439 #ifdef HAS_ARGBSHUFFLEROW_AVX2 | 5838 #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
| 5440 __declspec(naked) __declspec(align(16)) | 5839 __declspec(naked) |
| 5441 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 5840 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
| 5442 const uint8* shuffler, int pix) { | 5841 const uint8* shuffler, int pix) { |
| 5443 __asm { | 5842 __asm { |
| 5444 mov eax, [esp + 4] // src_argb | 5843 mov eax, [esp + 4] // src_argb |
| 5445 mov edx, [esp + 8] // dst_argb | 5844 mov edx, [esp + 8] // dst_argb |
| 5446 mov ecx, [esp + 12] // shuffler | 5845 mov ecx, [esp + 12] // shuffler |
| 5447 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. | 5846 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. |
| 5448 mov ecx, [esp + 16] // pix | 5847 mov ecx, [esp + 16] // pix |
| 5449 | 5848 |
| 5450 wloop: | 5849 wloop: |
| 5451 vmovdqu ymm0, [eax] | 5850 vmovdqu ymm0, [eax] |
| 5452 vmovdqu ymm1, [eax + 32] | 5851 vmovdqu ymm1, [eax + 32] |
| 5453 lea eax, [eax + 64] | 5852 lea eax, [eax + 64] |
| 5454 vpshufb ymm0, ymm0, ymm5 | 5853 vpshufb ymm0, ymm0, ymm5 |
| 5455 vpshufb ymm1, ymm1, ymm5 | 5854 vpshufb ymm1, ymm1, ymm5 |
| 5456 vmovdqu [edx], ymm0 | 5855 vmovdqu [edx], ymm0 |
| 5457 vmovdqu [edx + 32], ymm1 | 5856 vmovdqu [edx + 32], ymm1 |
| 5458 lea edx, [edx + 64] | 5857 lea edx, [edx + 64] |
| 5459 sub ecx, 16 | 5858 sub ecx, 16 |
| 5460 jg wloop | 5859 jg wloop |
| 5461 | 5860 |
| 5462 vzeroupper | 5861 vzeroupper |
| 5463 ret | 5862 ret |
| 5464 } | 5863 } |
| 5465 } | 5864 } |
| 5466 #endif // HAS_ARGBSHUFFLEROW_AVX2 | 5865 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
| 5467 | 5866 |
| 5468 __declspec(naked) __declspec(align(16)) | 5867 __declspec(naked) |
| 5469 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 5868 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
| 5470 const uint8* shuffler, int pix) { | 5869 const uint8* shuffler, int pix) { |
| 5471 __asm { | 5870 __asm { |
| 5472 push ebx | 5871 push ebx |
| 5473 push esi | 5872 push esi |
| 5474 mov eax, [esp + 8 + 4] // src_argb | 5873 mov eax, [esp + 8 + 4] // src_argb |
| 5475 mov edx, [esp + 8 + 8] // dst_argb | 5874 mov edx, [esp + 8 + 8] // dst_argb |
| 5476 mov esi, [esp + 8 + 12] // shuffler | 5875 mov esi, [esp + 8 + 12] // shuffler |
| 5477 mov ecx, [esp + 8 + 16] // pix | 5876 mov ecx, [esp + 8 + 16] // pix |
| 5478 pxor xmm5, xmm5 | 5877 pxor xmm5, xmm5 |
| (...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5580 ret | 5979 ret |
| 5581 } | 5980 } |
| 5582 } | 5981 } |
| 5583 | 5982 |
| 5584 // YUY2 - Macro-pixel = 2 image pixels | 5983 // YUY2 - Macro-pixel = 2 image pixels |
| 5585 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... | 5984 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... |
| 5586 | 5985 |
| 5587 // UYVY - Macro-pixel = 2 image pixels | 5986 // UYVY - Macro-pixel = 2 image pixels |
| 5588 // U0Y0V0Y1 | 5987 // U0Y0V0Y1 |
| 5589 | 5988 |
| 5590 __declspec(naked) __declspec(align(16)) | 5989 __declspec(naked) |
| 5591 void I422ToYUY2Row_SSE2(const uint8* src_y, | 5990 void I422ToYUY2Row_SSE2(const uint8* src_y, |
| 5592 const uint8* src_u, | 5991 const uint8* src_u, |
| 5593 const uint8* src_v, | 5992 const uint8* src_v, |
| 5594 uint8* dst_frame, int width) { | 5993 uint8* dst_frame, int width) { |
| 5595 __asm { | 5994 __asm { |
| 5596 push esi | 5995 push esi |
| 5597 push edi | 5996 push edi |
| 5598 mov eax, [esp + 8 + 4] // src_y | 5997 mov eax, [esp + 8 + 4] // src_y |
| 5599 mov esi, [esp + 8 + 8] // src_u | 5998 mov esi, [esp + 8 + 8] // src_u |
| 5600 mov edx, [esp + 8 + 12] // src_v | 5999 mov edx, [esp + 8 + 12] // src_v |
| (...skipping 16 matching lines...) Expand all Loading... |
| 5617 lea edi, [edi + 32] | 6016 lea edi, [edi + 32] |
| 5618 sub ecx, 16 | 6017 sub ecx, 16 |
| 5619 jg convertloop | 6018 jg convertloop |
| 5620 | 6019 |
| 5621 pop edi | 6020 pop edi |
| 5622 pop esi | 6021 pop esi |
| 5623 ret | 6022 ret |
| 5624 } | 6023 } |
| 5625 } | 6024 } |
| 5626 | 6025 |
| 5627 __declspec(naked) __declspec(align(16)) | 6026 __declspec(naked) |
| 5628 void I422ToUYVYRow_SSE2(const uint8* src_y, | 6027 void I422ToUYVYRow_SSE2(const uint8* src_y, |
| 5629 const uint8* src_u, | 6028 const uint8* src_u, |
| 5630 const uint8* src_v, | 6029 const uint8* src_v, |
| 5631 uint8* dst_frame, int width) { | 6030 uint8* dst_frame, int width) { |
| 5632 __asm { | 6031 __asm { |
| 5633 push esi | 6032 push esi |
| 5634 push edi | 6033 push edi |
| 5635 mov eax, [esp + 8 + 4] // src_y | 6034 mov eax, [esp + 8 + 4] // src_y |
| 5636 mov esi, [esp + 8 + 8] // src_u | 6035 mov esi, [esp + 8 + 8] // src_u |
| 5637 mov edx, [esp + 8 + 12] // src_v | 6036 mov edx, [esp + 8 + 12] // src_v |
| (...skipping 17 matching lines...) Expand all Loading... |
| 5655 sub ecx, 16 | 6054 sub ecx, 16 |
| 5656 jg convertloop | 6055 jg convertloop |
| 5657 | 6056 |
| 5658 pop edi | 6057 pop edi |
| 5659 pop esi | 6058 pop esi |
| 5660 ret | 6059 ret |
| 5661 } | 6060 } |
| 5662 } | 6061 } |
| 5663 | 6062 |
| 5664 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 | 6063 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 |
| 5665 __declspec(naked) __declspec(align(16)) | 6064 __declspec(naked) |
| 5666 void ARGBPolynomialRow_SSE2(const uint8* src_argb, | 6065 void ARGBPolynomialRow_SSE2(const uint8* src_argb, |
| 5667 uint8* dst_argb, const float* poly, | 6066 uint8* dst_argb, const float* poly, |
| 5668 int width) { | 6067 int width) { |
| 5669 __asm { | 6068 __asm { |
| 5670 push esi | 6069 push esi |
| 5671 mov eax, [esp + 4 + 4] /* src_argb */ | 6070 mov eax, [esp + 4 + 4] /* src_argb */ |
| 5672 mov edx, [esp + 4 + 8] /* dst_argb */ | 6071 mov edx, [esp + 4 + 8] /* dst_argb */ |
| 5673 mov esi, [esp + 4 + 12] /* poly */ | 6072 mov esi, [esp + 4 + 12] /* poly */ |
| 5674 mov ecx, [esp + 4 + 16] /* width */ | 6073 mov ecx, [esp + 4 + 16] /* width */ |
| 5675 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. | 6074 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5714 lea edx, [edx + 8] | 6113 lea edx, [edx + 8] |
| 5715 sub ecx, 2 | 6114 sub ecx, 2 |
| 5716 jg convertloop | 6115 jg convertloop |
| 5717 pop esi | 6116 pop esi |
| 5718 ret | 6117 ret |
| 5719 } | 6118 } |
| 5720 } | 6119 } |
| 5721 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 | 6120 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 |
| 5722 | 6121 |
| 5723 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 | 6122 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 |
| 5724 __declspec(naked) __declspec(align(16)) | 6123 __declspec(naked) |
| 5725 void ARGBPolynomialRow_AVX2(const uint8* src_argb, | 6124 void ARGBPolynomialRow_AVX2(const uint8* src_argb, |
| 5726 uint8* dst_argb, const float* poly, | 6125 uint8* dst_argb, const float* poly, |
| 5727 int width) { | 6126 int width) { |
| 5728 __asm { | 6127 __asm { |
| 5729 mov eax, [esp + 4] /* src_argb */ | 6128 mov eax, [esp + 4] /* src_argb */ |
| 5730 mov edx, [esp + 8] /* dst_argb */ | 6129 mov edx, [esp + 8] /* dst_argb */ |
| 5731 mov ecx, [esp + 12] /* poly */ | 6130 mov ecx, [esp + 12] /* poly */ |
| 5732 vbroadcastf128 ymm4, [ecx] // C0 | 6131 vbroadcastf128 ymm4, [ecx] // C0 |
| 5733 vbroadcastf128 ymm5, [ecx + 16] // C1 | 6132 vbroadcastf128 ymm5, [ecx + 16] // C1 |
| 5734 vbroadcastf128 ymm6, [ecx + 32] // C2 | 6133 vbroadcastf128 ymm6, [ecx + 32] // C2 |
| (...skipping 19 matching lines...) Expand all Loading... |
| 5754 sub ecx, 2 | 6153 sub ecx, 2 |
| 5755 jg convertloop | 6154 jg convertloop |
| 5756 vzeroupper | 6155 vzeroupper |
| 5757 ret | 6156 ret |
| 5758 } | 6157 } |
| 5759 } | 6158 } |
| 5760 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 | 6159 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 |
| 5761 | 6160 |
| 5762 #ifdef HAS_ARGBCOLORTABLEROW_X86 | 6161 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
| 5763 // Tranform ARGB pixels with color table. | 6162 // Tranform ARGB pixels with color table. |
| 5764 __declspec(naked) __declspec(align(16)) | 6163 __declspec(naked) |
| 5765 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 6164 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, |
| 5766 int width) { | 6165 int width) { |
| 5767 __asm { | 6166 __asm { |
| 5768 push esi | 6167 push esi |
| 5769 mov eax, [esp + 4 + 4] /* dst_argb */ | 6168 mov eax, [esp + 4 + 4] /* dst_argb */ |
| 5770 mov esi, [esp + 4 + 8] /* table_argb */ | 6169 mov esi, [esp + 4 + 8] /* table_argb */ |
| 5771 mov ecx, [esp + 4 + 12] /* width */ | 6170 mov ecx, [esp + 4 + 12] /* width */ |
| 5772 | 6171 |
| 5773 // 1 pixel loop. | 6172 // 1 pixel loop. |
| 5774 convertloop: | 6173 convertloop: |
| (...skipping 13 matching lines...) Expand all Loading... |
| 5788 dec ecx | 6187 dec ecx |
| 5789 jg convertloop | 6188 jg convertloop |
| 5790 pop esi | 6189 pop esi |
| 5791 ret | 6190 ret |
| 5792 } | 6191 } |
| 5793 } | 6192 } |
| 5794 #endif // HAS_ARGBCOLORTABLEROW_X86 | 6193 #endif // HAS_ARGBCOLORTABLEROW_X86 |
| 5795 | 6194 |
| 5796 #ifdef HAS_RGBCOLORTABLEROW_X86 | 6195 #ifdef HAS_RGBCOLORTABLEROW_X86 |
| 5797 // Tranform RGB pixels with color table. | 6196 // Tranform RGB pixels with color table. |
| 5798 __declspec(naked) __declspec(align(16)) | 6197 __declspec(naked) |
| 5799 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { | 6198 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { |
| 5800 __asm { | 6199 __asm { |
| 5801 push esi | 6200 push esi |
| 5802 mov eax, [esp + 4 + 4] /* dst_argb */ | 6201 mov eax, [esp + 4 + 4] /* dst_argb */ |
| 5803 mov esi, [esp + 4 + 8] /* table_argb */ | 6202 mov esi, [esp + 4 + 8] /* table_argb */ |
| 5804 mov ecx, [esp + 4 + 12] /* width */ | 6203 mov ecx, [esp + 4 + 12] /* width */ |
| 5805 | 6204 |
| 5806 // 1 pixel loop. | 6205 // 1 pixel loop. |
| 5807 convertloop: | 6206 convertloop: |
| 5808 movzx edx, byte ptr [eax] | 6207 movzx edx, byte ptr [eax] |
| (...skipping 10 matching lines...) Expand all Loading... |
| 5819 jg convertloop | 6218 jg convertloop |
| 5820 | 6219 |
| 5821 pop esi | 6220 pop esi |
| 5822 ret | 6221 ret |
| 5823 } | 6222 } |
| 5824 } | 6223 } |
| 5825 #endif // HAS_RGBCOLORTABLEROW_X86 | 6224 #endif // HAS_RGBCOLORTABLEROW_X86 |
| 5826 | 6225 |
| 5827 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6226 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 5828 // Tranform RGB pixels with luma table. | 6227 // Tranform RGB pixels with luma table. |
| 5829 __declspec(naked) __declspec(align(16)) | 6228 __declspec(naked) |
| 5830 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 6229 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
| 5831 int width, | 6230 int width, |
| 5832 const uint8* luma, uint32 lumacoeff) { | 6231 const uint8* luma, uint32 lumacoeff) { |
| 5833 __asm { | 6232 __asm { |
| 5834 push esi | 6233 push esi |
| 5835 push edi | 6234 push edi |
| 5836 mov eax, [esp + 8 + 4] /* src_argb */ | 6235 mov eax, [esp + 8 + 4] /* src_argb */ |
| 5837 mov edi, [esp + 8 + 8] /* dst_argb */ | 6236 mov edi, [esp + 8 + 8] /* dst_argb */ |
| 5838 mov ecx, [esp + 8 + 12] /* width */ | 6237 mov ecx, [esp + 8 + 12] /* width */ |
| 5839 movd xmm2, dword ptr [esp + 8 + 16] // luma table | 6238 movd xmm2, dword ptr [esp + 8 + 16] // luma table |
| (...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5917 jg convertloop | 6316 jg convertloop |
| 5918 | 6317 |
| 5919 pop edi | 6318 pop edi |
| 5920 pop esi | 6319 pop esi |
| 5921 ret | 6320 ret |
| 5922 } | 6321 } |
| 5923 } | 6322 } |
| 5924 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6323 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 5925 | 6324 |
| 5926 #endif // defined(_M_X64) | 6325 #endif // defined(_M_X64) |
| 5927 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) | 6326 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
| 5928 | 6327 |
| 5929 #ifdef __cplusplus | 6328 #ifdef __cplusplus |
| 5930 } // extern "C" | 6329 } // extern "C" |
| 5931 } // namespace libyuv | 6330 } // namespace libyuv |
| 5932 #endif | 6331 #endif |
| OLD | NEW |