Index: source/libvpx/third_party/libyuv/source/row_win.cc
===================================================================
--- source/libvpx/third_party/libyuv/source/row_win.cc (revision 290053)
+++ source/libvpx/third_party/libyuv/source/row_win.cc (working copy)
@@ -8,16 +8,180 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
+#if defined(_M_X64)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(127,(int8)(2.018 * 64)) */
+#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
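+// Per channel, the conversion below is a 6-bit fixed-point version of
+//   B = 1.164 * (Y - 16) + 2.018 * (U - 128)
+// (with UB clamped to 127), computed as
+//   B = (YG * (Y - 16) + UB * U + VB * V - BB) >> 6
+// and likewise for G and R; BB/BG/BR fold the -128 bias on U and V into a
+// single subtract.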
+
+static const vec8 kUVToB = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
+// 64 bit: Visual C++ does not support inline assembly for x64, so this
+// path uses SSE2/SSSE3 intrinsics instead.
+#if defined(_M_X64)
+
+// Aligned destination version.
+__declspec(align(16))
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+
+  __m128i xmm0, xmm1, xmm2, xmm3;
+  const __m128i xmm5 = _mm_set1_epi8(-1);
+  const __m128i xmm4 = _mm_setzero_si128();
+  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
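+  // v_buf is only read as u_buf + offset, so the loop advances u_buf alone.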
+
+  while (width > 0) {
+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
+    xmm1 = _mm_load_si128(&xmm0);
+    xmm2 = _mm_load_si128(&xmm0);
+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
+    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
+    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
+    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
+    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
+    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
+    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+    xmm0 = _mm_adds_epi16(xmm0, xmm3);
+    xmm1 = _mm_adds_epi16(xmm1, xmm3);
+    xmm2 = _mm_adds_epi16(xmm2, xmm3);
+    xmm0 = _mm_srai_epi16(xmm0, 6);
+    xmm1 = _mm_srai_epi16(xmm1, 6);
+    xmm2 = _mm_srai_epi16(xmm2, 6);
+    xmm0 = _mm_packus_epi16(xmm0, xmm0);
+    xmm1 = _mm_packus_epi16(xmm1, xmm1);
+    xmm2 = _mm_packus_epi16(xmm2, xmm2);
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
+    xmm1 = _mm_load_si128(&xmm0);
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
+    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
+
+    _mm_store_si128((__m128i*)dst_argb, xmm0);
+    _mm_store_si128((__m128i*)(dst_argb + 16), xmm1);
+
+    y_buf += 8;
+    u_buf += 4;
+    dst_argb += 32;
+    width -= 8;
+  }
+}
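+
+// Roughly equivalent scalar sketch of one iteration above (illustrative
+// only, not part of the build; clamp255() is a hypothetical helper that
+// clamps to [0, 255], and the SIMD code also saturates intermediates to
+// 16 bits):
+//   for (int i = 0; i < 8; ++i) {
+//     int y1 = (y_buf[i] - 16) * YG;
+//     int u = u_buf[i / 2];
+//     int v = u_buf[i / 2 + offset];
+//     dst_argb[4 * i + 0] = clamp255((UB * u + VB * v - BB + y1) >> 6);  // B
+//     dst_argb[4 * i + 1] = clamp255((UG * u + VG * v - BG + y1) >> 6);  // G
+//     dst_argb[4 * i + 2] = clamp255((UR * u + VR * v - BR + y1) >> 6);  // R
+//     dst_argb[4 * i + 3] = 255;                                         // A
+//   }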
+
+// Unaligned destination version.
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* dst_argb,
+                                   int width) {
+
+  __m128i xmm0, xmm1, xmm2, xmm3;
+  const __m128i xmm5 = _mm_set1_epi8(-1);
+  const __m128i xmm4 = _mm_setzero_si128();
+  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+
+  while (width > 0) {
+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
+    xmm1 = _mm_load_si128(&xmm0);
+    xmm2 = _mm_load_si128(&xmm0);
+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
+    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
+    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
+    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
+    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
+    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
+    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+    xmm0 = _mm_adds_epi16(xmm0, xmm3);
+    xmm1 = _mm_adds_epi16(xmm1, xmm3);
+    xmm2 = _mm_adds_epi16(xmm2, xmm3);
+    xmm0 = _mm_srai_epi16(xmm0, 6);
+    xmm1 = _mm_srai_epi16(xmm1, 6);
+    xmm2 = _mm_srai_epi16(xmm2, 6);
+    xmm0 = _mm_packus_epi16(xmm0, xmm0);
+    xmm1 = _mm_packus_epi16(xmm1, xmm1);
+    xmm2 = _mm_packus_epi16(xmm2, xmm2);
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
+    xmm1 = _mm_load_si128(&xmm0);
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
+    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
+
+    _mm_storeu_si128((__m128i*)dst_argb, xmm0);
+    _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1);
+
+    y_buf += 8;
+    u_buf += 4;
+    dst_argb += 32;
+    width -= 8;
+  }
+}
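+
+// Callers are expected to pick the aligned-store version above only when
+// dst_argb is 16-byte aligned; the two loops are otherwise identical.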
+// 32 bit
+#else  // defined(_M_X64)
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Constants for ARGB.
@@ -2030,21 +2194,6 @@
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
 #ifdef HAS_I422TOARGBROW_AVX2
 static const lvec8 kUVToB_AVX = {
@@ -2079,10 +2228,10 @@
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
 __declspec(naked) __declspec(align(16))
 void I422ToARGBRow_AVX2(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         int width) {
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
   __asm {
     push       esi
     push       edi
@@ -2150,36 +2299,6 @@
 #ifdef HAS_I422TOARGBROW_SSSE3
-static const vec8 kUVToB = {
-  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-
-static const vec8 kUVToR = {
-  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-
-static const vec8 kUVToG = {
-  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-
-static const vec8 kVUToB = {
-  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
-};
-
-static const vec8 kVUToR = {
-  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
-};
-
-static const vec8 kVUToG = {
-  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
-
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-
 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
 // Read 8 UV from 444.
@@ -7276,7 +7395,8 @@
 }
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#endif  // defined(_M_X64)
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
 #ifdef __cplusplus
 }  // extern "C"