| Index: source/libvpx/third_party/libyuv/source/row_win.cc
|
| diff --git a/source/libvpx/third_party/libyuv/source/row_win.cc b/source/libvpx/third_party/libyuv/source/row_win.cc
|
| index d79c353960bdd7a7fa53d660b15c976b92f1cef6..6e9d04c0e4efcb4e3c9a0cf613313eeb9ffdf061 100644
|
| --- a/source/libvpx/third_party/libyuv/source/row_win.cc
|
| +++ b/source/libvpx/third_party/libyuv/source/row_win.cc
|
| @@ -24,55 +24,63 @@ extern "C" {
|
| #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
|
| (defined(_M_IX86) || defined(_M_X64))
|
|
|
| -#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
|
| -
|
| -#define UB 127 /* min(127,(int8)(2.018 * 64)) */
|
| -#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
|
| -#define UR 0
|
| -
|
| -#define VB 0
|
| -#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
|
| -#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
|
| -
|
| -// Bias
|
| -#define BB UB * 128 + VB * 128
|
| -#define BG UG * 128 + VG * 128
|
| -#define BR UR * 128 + VR * 128
|
| -
|
| -static const vec8 kUVToB = {
|
| - UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
|
| -};
|
| -
|
| -static const vec8 kUVToR = {
|
| - UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
|
| -};
|
| -
|
| -static const vec8 kUVToG = {
|
| - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
|
| +// YUV to RGB conversion constants.
|
| +// Y contribution to R,G,B. Scale and bias.
|
| +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
|
| +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
|
| +
|
| +// U and V contributions to R,G,B.
|
| +#define UB -128 /* -min(128, round(2.018 * 64)) */
|
| +#define UG 25 /* -round(-0.391 * 64) */
|
| +#define VG 52 /* -round(-0.813 * 64) */
|
| +#define VR -102 /* -round(1.596 * 64) */
|
| +
|
| +// Bias values to subtract 16 from Y and 128 from U and V.
|
| +#define BB (UB * 128 - YGB)
|
| +#define BG (UG * 128 + VG * 128 - YGB)
|
| +#define BR (VR * 128 - YGB)
|
| +
|
| +struct YuvConstants {
|
| + lvec8 kUVToB; // 0
|
| + lvec8 kUVToG; // 32
|
| + lvec8 kUVToR; // 64
|
| + lvec16 kUVBiasB; // 96
|
| + lvec16 kUVBiasG; // 128
|
| + lvec16 kUVBiasR; // 160
|
| + lvec16 kYToRgb; // 192
|
| };
|
|
|
| -static const vec8 kVUToB = {
|
| - VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
|
| +// BT601 constants for YUV to RGB.
|
| +static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
|
| + { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
|
| + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
|
| + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
|
| + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
|
| + { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
|
| + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
|
| + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
|
| + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
|
| + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
|
| + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
|
| };
|
|
|
| -static const vec8 kVUToR = {
|
| - VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
|
| +// BT601 constants for NV21 where chroma plane is VU instead of UV.
|
| +static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
|
| + { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
|
| + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
|
| + { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
|
| + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
|
| + { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
|
| + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
|
| + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
|
| + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
|
| + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
|
| + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
|
| };
|
|
|
| -static const vec8 kVUToG = {
|
| - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
|
| -};
|
| -
|
| -static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
|
| -static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
|
| -static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
|
| -static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
|
| -static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
|
| -
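|
| As an aside, here is a minimal scalar sketch (an illustration, not part of this
| patch) of the arithmetic the new fixed-point constants encode; it uses the YG,
| UB, UG, VG, VR and BB/BG/BR macros defined above, and Clamp255 is a local
| helper. The SIMD rows below compute the same thing a register's worth of pixels
| at a time, so e.g. blue comes out as roughly 1.164*(Y-16) + 2.018*(U-128):
|
|     static int Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }
|
|     static void YuvPixelSketch(int y, int u, int v, int* b, int* g, int* r) {
|       int y1 = (y * 0x0101 * YG) >> 16;                    /* about 1.164 * 64 * y */
|       *b = Clamp255((BB - (u * UB) + y1) >> 6);            /* UB < 0 */
|       *g = Clamp255((BG - (u * UG + v * VG) + y1) >> 6);
|       *r = Clamp255((BR - (v * VR) + y1) >> 6);            /* VR < 0 */
|     }
|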
|
| // 64 bit
|
| #if defined(_M_X64)
|
|
|
| -// Aligned destination version.
|
| __declspec(align(16))
|
| void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| @@ -81,60 +89,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| int width) {
|
| __m128i xmm0, xmm1, xmm2, xmm3;
|
| const __m128i xmm5 = _mm_set1_epi8(-1);
|
| - const __m128i xmm4 = _mm_setzero_si128();
|
| - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
|
| -
|
| - while (width > 0) {
|
| - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
|
| - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
|
| - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
| - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
|
| - xmm1 = _mm_load_si128(&xmm0);
|
| - xmm2 = _mm_load_si128(&xmm0);
|
| - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
|
| - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
|
| - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
|
| - xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
|
| - xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
|
| - xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
|
| - xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
|
| - xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
|
| - xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
|
| - xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
|
| - xmm0 = _mm_adds_epi16(xmm0, xmm3);
|
| - xmm1 = _mm_adds_epi16(xmm1, xmm3);
|
| - xmm2 = _mm_adds_epi16(xmm2, xmm3);
|
| - xmm0 = _mm_srai_epi16(xmm0, 6);
|
| - xmm1 = _mm_srai_epi16(xmm1, 6);
|
| - xmm2 = _mm_srai_epi16(xmm2, 6);
|
| - xmm0 = _mm_packus_epi16(xmm0, xmm0);
|
| - xmm1 = _mm_packus_epi16(xmm1, xmm1);
|
| - xmm2 = _mm_packus_epi16(xmm2, xmm2);
|
| - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
| - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
|
| - xmm1 = _mm_load_si128(&xmm0);
|
| - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
|
| - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
|
| -
|
| - _mm_store_si128((__m128i *)dst_argb, xmm0);
|
| - _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
|
| -
|
| - y_buf += 8;
|
| - u_buf += 4;
|
| - dst_argb += 32;
|
| - width -= 8;
|
| - }
|
| -}
|
| -
|
| -// Unaligned destination version.
|
| -void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
| - const uint8* u_buf,
|
| - const uint8* v_buf,
|
| - uint8* dst_argb,
|
| - int width) {
|
| - __m128i xmm0, xmm1, xmm2, xmm3;
|
| - const __m128i xmm5 = _mm_set1_epi8(-1);
|
| - const __m128i xmm4 = _mm_setzero_si128();
|
| const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
|
|
|
| while (width > 0) {
|
| @@ -142,18 +96,17 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
| xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
|
| xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
| xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
|
| - xmm1 = _mm_load_si128(&xmm0);
|
| - xmm2 = _mm_load_si128(&xmm0);
|
| - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
|
| - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
|
| - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
|
| - xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
|
| - xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
|
| - xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
|
| + xmm1 = _mm_loadu_si128(&xmm0);
|
| + xmm2 = _mm_loadu_si128(&xmm0);
|
| + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
|
| + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
|
| + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
|
| + xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
|
| + xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
|
| + xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
|
| xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
|
| - xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
|
| - xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
|
| - xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
|
| + xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
|
| + xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
|
| xmm0 = _mm_adds_epi16(xmm0, xmm3);
|
| xmm1 = _mm_adds_epi16(xmm1, xmm3);
|
| xmm2 = _mm_adds_epi16(xmm2, xmm3);
|
| @@ -165,7 +118,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
| xmm2 = _mm_packus_epi16(xmm2, xmm2);
|
| xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
|
| xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
|
| - xmm1 = _mm_load_si128(&xmm0);
|
| + xmm1 = _mm_loadu_si128(&xmm0);
|
| xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
|
| xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
|
|
|
| @@ -178,6 +131,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
| width -= 8;
|
| }
|
| }
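|
| A brief sketch (an assumption about the intent, not text from this patch) of why
| the rewritten loop above drops the old kYSub16 subtract and kYToRgb multiply:
| unpacking the Y byte against itself yields y * 0x0101 in each 16-bit lane, the
| unsigned high multiply by YG keeps (y * 0x0101 * 18997) >> 16, about
| 1.164 * 64 * y, and the -16 offset is instead folded into the YGB term of the
| BB/BG/BR biases:
|
|     #include <stdint.h>
|     static inline int ScaleYSketch(uint8_t y) {
|       return (int)(((uint32_t)y * 0x0101u * 18997u) >> 16);  /* ~74.5 * y */
|     }
|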
|
| +
|
| // 32 bit
|
| #else // defined(_M_X64)
|
|
|
| @@ -209,15 +163,10 @@ static const vec8 kARGBToVJ = {
|
| -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
|
| };
|
|
|
| -// vpermd for vphaddw + vpackuswb vpermd.
|
| -static const lvec32 kPermdARGBToY_AVX = {
|
| - 0, 4, 1, 5, 2, 6, 3, 7
|
| -};
|
| -
|
| // vpshufb for vphaddw + vpackuswb packed to shorts.
|
| static const lvec8 kShufARGBToUV_AVX = {
|
| 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
|
| - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
|
| + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
|
| };
|
|
|
| // Constants for BGRA.
|
| @@ -263,6 +212,7 @@ static const uvec8 kAddY16 = {
|
| 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
|
| };
|
|
|
| +// 7 bit fixed point 0.5.
|
| static const vec16 kAddYJ64 = {
|
| 64, 64, 64, 64, 64, 64, 64, 64
|
| };
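|
| A small sketch (not part of this patch) of what this 0.5 bias is for: the YJ
| rows add 64 to the 7-bit fixed-point luma sum before shifting, so the result is
| rounded to nearest instead of truncated:
|
|     static inline unsigned char RoundShift7(int sum) {
|       return (unsigned char)((sum + 64) >> 7);   /* e.g. 100 -> 1, not 0 */
|     }
|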
|
| @@ -316,36 +266,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
| pcmpeqb xmm5, xmm5 // generate mask 0xff000000
|
| pslld xmm5, 24
|
|
|
| - align 4
|
| - convertloop:
|
| - movq xmm0, qword ptr [eax]
|
| - lea eax, [eax + 8]
|
| - punpcklbw xmm0, xmm0
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm0
|
| - punpckhwd xmm1, xmm1
|
| - por xmm0, xmm5
|
| - por xmm1, xmm5
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| - sub ecx, 8
|
| - jg convertloop
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
|
| - int pix) {
|
| - __asm {
|
| - mov eax, [esp + 4] // src_y
|
| - mov edx, [esp + 8] // dst_argb
|
| - mov ecx, [esp + 12] // pix
|
| - pcmpeqb xmm5, xmm5 // generate mask 0xff000000
|
| - pslld xmm5, 24
|
| -
|
| - align 4
|
| convertloop:
|
| movq xmm0, qword ptr [eax]
|
| lea eax, [eax + 8]
|
| @@ -374,7 +294,6 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
| pslld xmm5, 24
|
| movdqa xmm4, kShuffleMaskRGB24ToARGB
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -386,18 +305,18 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
| por xmm2, xmm5
|
| palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
|
| pshufb xmm0, xmm4
|
| - movdqa [edx + 32], xmm2
|
| + movdqu [edx + 32], xmm2
|
| por xmm0, xmm5
|
| pshufb xmm1, xmm4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| por xmm1, xmm5
|
| palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
|
| pshufb xmm3, xmm4
|
| - movdqa [edx + 16], xmm1
|
| + movdqu [edx + 16], xmm1
|
| por xmm3, xmm5
|
| - sub ecx, 16
|
| - movdqa [edx + 48], xmm3
|
| + movdqu [edx + 48], xmm3
|
| lea edx, [edx + 64]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| @@ -414,7 +333,6 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
|
| pslld xmm5, 24
|
| movdqa xmm4, kShuffleMaskRAWToARGB
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -426,18 +344,18 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
|
| por xmm2, xmm5
|
| palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
|
| pshufb xmm0, xmm4
|
| - movdqa [edx + 32], xmm2
|
| + movdqu [edx + 32], xmm2
|
| por xmm0, xmm5
|
| pshufb xmm1, xmm4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| por xmm1, xmm5
|
| palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
|
| pshufb xmm3, xmm4
|
| - movdqa [edx + 16], xmm1
|
| + movdqu [edx + 16], xmm1
|
| por xmm3, xmm5
|
| - sub ecx, 16
|
| - movdqa [edx + 48], xmm3
|
| + movdqu [edx + 48], xmm3
|
| lea edx, [edx + 64]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| @@ -474,7 +392,6 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
|
| sub edx, eax
|
| sub edx, eax
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax] // fetch 8 pixels of bgr565
|
| movdqa xmm1, xmm0
|
| @@ -491,8 +408,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
|
| movdqa xmm2, xmm1
|
| punpcklbw xmm1, xmm0
|
| punpckhbw xmm2, xmm0
|
| - movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
|
| - movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
|
| + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
|
| + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
|
| lea eax, [eax + 16]
|
| sub ecx, 8
|
| jg convertloop
|
| @@ -524,7 +441,6 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
|
| sub edx, eax
|
| sub edx, eax
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax] // fetch 8 pixels of 1555
|
| movdqa xmm1, xmm0
|
| @@ -545,8 +461,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
|
| movdqa xmm2, xmm1
|
| punpcklbw xmm1, xmm0
|
| punpckhbw xmm2, xmm0
|
| - movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
|
| - movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
|
| + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
|
| + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
|
| lea eax, [eax + 16]
|
| sub ecx, 8
|
| jg convertloop
|
| @@ -570,7 +486,6 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
|
| sub edx, eax
|
| sub edx, eax
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
|
| movdqa xmm2, xmm0
|
| @@ -585,8 +500,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
|
| movdqa xmm1, xmm0
|
| punpcklbw xmm0, xmm2
|
| punpckhbw xmm1, xmm2
|
| - movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
|
| - movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
|
| + movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
|
| + movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
|
| lea eax, [eax + 16]
|
| sub ecx, 8
|
| jg convertloop
|
| @@ -602,7 +517,6 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| mov ecx, [esp + 12] // pix
|
| movdqa xmm6, kShuffleMaskARGBToRGB24
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax] // fetch 16 pixels of argb
|
| movdqu xmm1, [eax + 16]
|
| @@ -641,7 +555,6 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| mov ecx, [esp + 12] // pix
|
| movdqa xmm6, kShuffleMaskARGBToRAW
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax] // fetch 16 pixels of argb
|
| movdqu xmm1, [eax + 16]
|
| @@ -686,9 +599,8 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
|
| pslld xmm5, 11
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // fetch 4 pixels of argb
|
| + movdqu xmm0, [eax] // fetch 4 pixels of argb
|
| movdqa xmm1, xmm0 // B
|
| movdqa xmm2, xmm0 // G
|
| pslld xmm0, 8 // R
|
| @@ -726,9 +638,8 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
|
| pslld xmm7, 15
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // fetch 4 pixels of argb
|
| + movdqu xmm0, [eax] // fetch 4 pixels of argb
|
| movdqa xmm1, xmm0 // B
|
| movdqa xmm2, xmm0 // G
|
| movdqa xmm3, xmm0 // R
|
| @@ -764,14 +675,13 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| movdqa xmm3, xmm4 // generate mask 0x00f000f0
|
| psrlw xmm3, 8
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // fetch 4 pixels of argb
|
| + movdqu xmm0, [eax] // fetch 4 pixels of argb
|
| movdqa xmm1, xmm0
|
| pand xmm0, xmm3 // low nibble
|
| pand xmm1, xmm4 // high nibble
|
| - psrl xmm0, 4
|
| - psrl xmm1, 8
|
| + psrld xmm0, 4
|
| + psrld xmm1, 8
|
| por xmm0, xmm1
|
| packuswb xmm0, xmm0
|
| lea eax, [eax + 16]
|
| @@ -783,6 +693,116 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| }
|
| }
|
|
|
| +#ifdef HAS_ARGBTORGB565ROW_AVX2
|
| +__declspec(naked) __declspec(align(16))
|
| +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_argb
|
| + mov edx, [esp + 8] // dst_rgb
|
| + mov ecx, [esp + 12] // pix
|
| + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
|
| + vpsrld ymm3, ymm3, 27
|
| + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
|
| + vpsrld ymm4, ymm4, 26
|
| + vpslld ymm4, ymm4, 5
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800
|
| + vpslld ymm5, ymm5, 11
|
| +
|
| + convertloop:
|
| + vmovdqu ymm0, [eax] // fetch 8 pixels of argb
|
| + vpsrld ymm2, ymm0, 5 // G
|
| + vpsrld ymm1, ymm0, 3 // B
|
| + vpslld ymm0, ymm0, 8 // R
|
| + vpand ymm2, ymm2, ymm4 // G
|
| + vpand ymm1, ymm1, ymm3 // B
|
| + vpsrad ymm0, ymm0, 16 // R
|
| + vpand ymm0, ymm0, ymm5 // R
|
| + vpor ymm1, ymm1, ymm2 // BG
|
| + vpor ymm0, ymm0, ymm1 // BGR
|
| + vpackssdw ymm0, ymm0, ymm0
|
| + vpermq ymm0, ymm0, 0xd8
|
| + lea eax, [eax + 32]
|
| + vmovdqu [edx], xmm0 // store 8 pixels of RGB565
|
| + lea edx, [edx + 16]
|
| + sub ecx, 8
|
| + jg convertloop
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_ARGBTORGB565ROW_AVX2
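|
| For reference, a per-pixel scalar sketch (an illustration, not part of this
| patch) of the RGB565 packing that the SSE2 and AVX2 rows above implement with
| their shift-and-mask sequences: keep the top 5/6/5 bits of B, G and R:
|
|     #include <stdint.h>
|     static inline uint16_t PackRGB565Sketch(uint32_t argb) {  /* bytes B,G,R,A */
|       uint32_t b = (argb >> 3) & 0x001f;   /* B >> 3, bits 0..4   */
|       uint32_t g = (argb >> 5) & 0x07e0;   /* G >> 2, bits 5..10  */
|       uint32_t r = (argb >> 8) & 0xf800;   /* R >> 3, bits 11..15 */
|       return (uint16_t)(b | g | r);
|     }
|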
|
| +
|
| +#ifdef HAS_ARGBTOARGB1555ROW_AVX2
|
| +__declspec(naked) __declspec(align(16))
|
| +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_argb
|
| + mov edx, [esp + 8] // dst_rgb
|
| + mov ecx, [esp + 12] // pix
|
| + vpcmpeqb ymm4, ymm4, ymm4
|
| + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
|
| + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
|
| + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
|
| + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
|
| + vpslld ymm7, ymm7, 15
|
| +
|
| + convertloop:
|
| + vmovdqu ymm0, [eax] // fetch 8 pixels of argb
|
| + vpsrld ymm3, ymm0, 9 // R
|
| + vpsrld ymm2, ymm0, 6 // G
|
| + vpsrld ymm1, ymm0, 3 // B
|
| + vpsrad ymm0, ymm0, 16 // A
|
| + vpand ymm3, ymm3, ymm6 // R
|
| + vpand ymm2, ymm2, ymm5 // G
|
| + vpand ymm1, ymm1, ymm4 // B
|
| + vpand ymm0, ymm0, ymm7 // A
|
| + vpor ymm0, ymm0, ymm1 // BA
|
| + vpor ymm2, ymm2, ymm3 // GR
|
| + vpor ymm0, ymm0, ymm2 // BGRA
|
| + vpackssdw ymm0, ymm0, ymm0
|
| + vpermq ymm0, ymm0, 0xd8
|
| + lea eax, [eax + 32]
|
| + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
|
| + lea edx, [edx + 16]
|
| + sub ecx, 8
|
| + jg convertloop
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_ARGBTOARGB1555ROW_AVX2
|
| +
|
| +#ifdef HAS_ARGBTOARGB4444ROW_AVX2
|
| +__declspec(naked) __declspec(align(16))
|
| +void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
| + __asm {
|
| + mov eax, [esp + 4] // src_argb
|
| + mov edx, [esp + 8] // dst_rgb
|
| + mov ecx, [esp + 12] // pix
|
| + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
|
| + vpsllw ymm4, ymm4, 12
|
| + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
|
| +
|
| + convertloop:
|
| + vmovdqu ymm0, [eax] // fetch 8 pixels of argb
|
| + vpand ymm1, ymm0, ymm4 // high nibble
|
| + vpand ymm0, ymm0, ymm3 // low nibble
|
| + vpsrld ymm1, ymm1, 8
|
| + vpsrld ymm0, ymm0, 4
|
| + vpor ymm0, ymm0, ymm1
|
| + vpackuswb ymm0, ymm0, ymm0
|
| + vpermq ymm0, ymm0, 0xd8
|
| + lea eax, [eax + 32]
|
| + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
|
| + lea edx, [edx + 16]
|
| + sub ecx, 8
|
| + jg convertloop
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_ARGBTOARGB4444ROW_AVX2
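|
| Similarly, a scalar sketch (not from this patch) of the ARGB4444 packing above:
| the two masks keep the high nibble of each channel and the shifts line the
| nibbles up before the bytes are re-packed:
|
|     #include <stdint.h>
|     static inline uint16_t PackARGB4444Sketch(uint32_t argb) {  /* bytes B,G,R,A */
|       return (uint16_t)(((argb >> 4)  & 0x000f) |   /* B */
|                         ((argb >> 8)  & 0x00f0) |   /* G */
|                         ((argb >> 12) & 0x0f00) |   /* R */
|                         ((argb >> 16) & 0xf000));   /* A */
|     }
|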
|
| +
|
| // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
|
| __declspec(naked) __declspec(align(16))
|
| void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| @@ -790,15 +810,14 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| mov eax, [esp + 4] /* src_argb */
|
| mov edx, [esp + 8] /* dst_y */
|
| mov ecx, [esp + 12] /* pix */
|
| - movdqa xmm5, kAddY16
|
| movdqa xmm4, kARGBToY
|
| + movdqa xmm5, kAddY16
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm2, [eax + 32]
|
| + movdqu xmm3, [eax + 48]
|
| pmaddubsw xmm0, xmm4
|
| pmaddubsw xmm1, xmm4
|
| pmaddubsw xmm2, xmm4
|
| @@ -810,15 +829,16 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| psrlw xmm2, 7
|
| packuswb xmm0, xmm2
|
| paddb xmm0, xmm5
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| }
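|
| A scalar sketch (assumed equivalents, not part of this patch) of the per-pixel
| luma math the row above computes with pmaddubsw/phaddw: kARGBToY is assumed to
| hold the usual BT601 coefficients of roughly 13, 65 and 33 for B, G and R in
| 7-bit fixed point, and kAddY16 supplies the +16 offset:
|
|     static inline unsigned char ARGBToYSketch(int b, int g, int r) {
|       return (unsigned char)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
|     }
|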
|
|
|
| -// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
|
| +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
|
| +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
|
| __declspec(naked) __declspec(align(16))
|
| void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| @@ -828,12 +848,11 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| movdqa xmm4, kARGBToYJ
|
| movdqa xmm5, kAddYJ64
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm2, [eax + 32]
|
| + movdqu xmm3, [eax + 48]
|
| pmaddubsw xmm0, xmm4
|
| pmaddubsw xmm1, xmm4
|
| pmaddubsw xmm2, xmm4
|
| @@ -846,15 +865,20 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| psrlw xmm0, 7
|
| psrlw xmm2, 7
|
| packuswb xmm0, xmm2
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| }
|
|
|
| #ifdef HAS_ARGBTOYROW_AVX2
|
| +// vpermd for vphaddw + vpackuswb vpermd.
|
| +static const lvec32 kPermdARGBToY_AVX = {
|
| + 0, 4, 1, 5, 2, 6, 3, 7
|
| +};
|
| +
|
| // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
|
| __declspec(naked) __declspec(align(32))
|
| void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
| @@ -864,9 +888,8 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
| mov ecx, [esp + 12] /* pix */
|
| vbroadcastf128 ymm4, kARGBToY
|
| vbroadcastf128 ymm5, kAddY16
|
| - vmovdqa ymm6, kPermdARGBToY_AVX
|
| + vmovdqu ymm6, kPermdARGBToY_AVX
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax]
|
| vmovdqu ymm1, [eax + 32]
|
| @@ -883,10 +906,10 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
| vpsrlw ymm2, ymm2, 7
|
| vpackuswb ymm0, ymm0, ymm2 // mutates.
|
| vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
|
| - vpaddb ymm0, ymm0, ymm5
|
| - sub ecx, 32
|
| + vpaddb ymm0, ymm0, ymm5 // add 16 for Y
|
| vmovdqu [edx], ymm0
|
| lea edx, [edx + 32]
|
| + sub ecx, 32
|
| jg convertloop
|
| vzeroupper
|
| ret
|
| @@ -904,9 +927,8 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
| mov ecx, [esp + 12] /* pix */
|
| vbroadcastf128 ymm4, kARGBToYJ
|
| vbroadcastf128 ymm5, kAddYJ64
|
| - vmovdqa ymm6, kPermdARGBToY_AVX
|
| + vmovdqu ymm6, kPermdARGBToY_AVX
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax]
|
| vmovdqu ymm1, [eax + 32]
|
| @@ -925,9 +947,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
| vpsrlw ymm2, ymm2, 7
|
| vpackuswb ymm0, ymm0, ymm2 // mutates.
|
| vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
|
| - sub ecx, 32
|
| vmovdqu [edx], ymm0
|
| lea edx, [edx + 32]
|
| + sub ecx, 32
|
| jg convertloop
|
|
|
| vzeroupper
|
| @@ -937,15 +959,14 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
| #endif // HAS_ARGBTOYJROW_AVX2
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
| mov edx, [esp + 8] /* dst_y */
|
| mov ecx, [esp + 12] /* pix */
|
| + movdqa xmm4, kBGRAToY
|
| movdqa xmm5, kAddY16
|
| - movdqa xmm4, kARGBToY
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -962,24 +983,23 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| psrlw xmm2, 7
|
| packuswb xmm0, xmm2
|
| paddb xmm0, xmm5
|
| - sub ecx, 16
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
| mov edx, [esp + 8] /* dst_y */
|
| mov ecx, [esp + 12] /* pix */
|
| - movdqa xmm4, kARGBToYJ
|
| - movdqa xmm5, kAddYJ64
|
| + movdqa xmm4, kABGRToY
|
| + movdqa xmm5, kAddY16
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -992,34 +1012,32 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| lea eax, [eax + 64]
|
| phaddw xmm0, xmm1
|
| phaddw xmm2, xmm3
|
| - paddw xmm0, xmm5
|
| - paddw xmm2, xmm5
|
| psrlw xmm0, 7
|
| psrlw xmm2, 7
|
| packuswb xmm0, xmm2
|
| - sub ecx, 16
|
| + paddb xmm0, xmm5
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| __asm {
|
| mov eax, [esp + 4] /* src_argb */
|
| mov edx, [esp + 8] /* dst_y */
|
| mov ecx, [esp + 12] /* pix */
|
| + movdqa xmm4, kRGBAToY
|
| movdqa xmm5, kAddY16
|
| - movdqa xmm4, kBGRAToY
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm2, [eax + 32]
|
| + movdqu xmm3, [eax + 48]
|
| pmaddubsw xmm0, xmm4
|
| pmaddubsw xmm1, xmm4
|
| pmaddubsw xmm2, xmm4
|
| @@ -1031,187 +1049,87 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| psrlw xmm2, 7
|
| packuswb xmm0, xmm2
|
| paddb xmm0, xmm5
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| + uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| - mov eax, [esp + 4] /* src_argb */
|
| - mov edx, [esp + 8] /* dst_y */
|
| - mov ecx, [esp + 12] /* pix */
|
| - movdqa xmm5, kAddY16
|
| - movdqa xmm4, kBGRAToY
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // src_argb
|
| + mov esi, [esp + 8 + 8] // src_stride_argb
|
| + mov edx, [esp + 8 + 12] // dst_u
|
| + mov edi, [esp + 8 + 16] // dst_v
|
| + mov ecx, [esp + 8 + 20] // pix
|
| + movdqa xmm5, kAddUV128
|
| + movdqa xmm6, kARGBToV
|
| + movdqa xmm7, kARGBToU
|
| + sub edi, edx // stride from u to v
|
|
|
| - align 4
|
| convertloop:
|
| + /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| movdqu xmm0, [eax]
|
| + movdqu xmm4, [eax + esi]
|
| + pavgb xmm0, xmm4
|
| movdqu xmm1, [eax + 16]
|
| + movdqu xmm4, [eax + esi + 16]
|
| + pavgb xmm1, xmm4
|
| movdqu xmm2, [eax + 32]
|
| + movdqu xmm4, [eax + esi + 32]
|
| + pavgb xmm2, xmm4
|
| movdqu xmm3, [eax + 48]
|
| - pmaddubsw xmm0, xmm4
|
| - pmaddubsw xmm1, xmm4
|
| - pmaddubsw xmm2, xmm4
|
| - pmaddubsw xmm3, xmm4
|
| - lea eax, [eax + 64]
|
| - phaddw xmm0, xmm1
|
| - phaddw xmm2, xmm3
|
| - psrlw xmm0, 7
|
| - psrlw xmm2, 7
|
| - packuswb xmm0, xmm2
|
| - paddb xmm0, xmm5
|
| + movdqu xmm4, [eax + esi + 48]
|
| + pavgb xmm3, xmm4
|
| +
|
| + lea eax, [eax + 64]
|
| + movdqa xmm4, xmm0
|
| + shufps xmm0, xmm1, 0x88
|
| + shufps xmm4, xmm1, 0xdd
|
| + pavgb xmm0, xmm4
|
| + movdqa xmm4, xmm2
|
| + shufps xmm2, xmm3, 0x88
|
| + shufps xmm4, xmm3, 0xdd
|
| + pavgb xmm2, xmm4
|
| +
|
| + // step 2 - convert to U and V
|
| + // from here down is very similar to Y code except
|
| +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
|
| + movdqa xmm1, xmm0
|
| + movdqa xmm3, xmm2
|
| + pmaddubsw xmm0, xmm7 // U
|
| + pmaddubsw xmm2, xmm7
|
| + pmaddubsw xmm1, xmm6 // V
|
| + pmaddubsw xmm3, xmm6
|
| + phaddw xmm0, xmm2
|
| + phaddw xmm1, xmm3
|
| + psraw xmm0, 8
|
| + psraw xmm1, 8
|
| + packsswb xmm0, xmm1
|
| + paddb xmm0, xmm5 // -> unsigned
|
| +
|
| + // step 3 - store 8 U and 8 V values
|
| + movlps qword ptr [edx], xmm0 // U
|
| + movhps qword ptr [edx + edi], xmm0 // V
|
| + lea edx, [edx + 8]
|
| sub ecx, 16
|
| - movdqu [edx], xmm0
|
| - lea edx, [edx + 16]
|
| jg convertloop
|
| +
|
| + pop edi
|
| + pop esi
|
| ret
|
| }
|
| }
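|
| In scalar terms (a sketch under assumptions, not text from this patch), step 1
| above is a rounding 2x2 box average of the ARGB bytes via pavgb and shufps, and
| step 2 applies the chroma coefficients followed by the +128 offset from
| kAddUV128; kARGBToU and kARGBToV are assumed to hold roughly (112, -74, -38)
| and (-18, -94, 112) for B, G, R in 8-bit fixed point:
|
|     static inline unsigned char RGBToUSketch(int b, int g, int r) {
|       return (unsigned char)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
|     }
|     static inline unsigned char RGBToVSketch(int b, int g, int r) {
|       return (unsigned char)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
|     }
|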
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| - __asm {
|
| - mov eax, [esp + 4] /* src_argb */
|
| - mov edx, [esp + 8] /* dst_y */
|
| - mov ecx, [esp + 12] /* pix */
|
| - movdqa xmm5, kAddY16
|
| - movdqa xmm4, kABGRToY
|
| -
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| - pmaddubsw xmm0, xmm4
|
| - pmaddubsw xmm1, xmm4
|
| - pmaddubsw xmm2, xmm4
|
| - pmaddubsw xmm3, xmm4
|
| - lea eax, [eax + 64]
|
| - phaddw xmm0, xmm1
|
| - phaddw xmm2, xmm3
|
| - psrlw xmm0, 7
|
| - psrlw xmm2, 7
|
| - packuswb xmm0, xmm2
|
| - paddb xmm0, xmm5
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - jg convertloop
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| - __asm {
|
| - mov eax, [esp + 4] /* src_argb */
|
| - mov edx, [esp + 8] /* dst_y */
|
| - mov ecx, [esp + 12] /* pix */
|
| - movdqa xmm5, kAddY16
|
| - movdqa xmm4, kABGRToY
|
| -
|
| - align 4
|
| - convertloop:
|
| - movdqu xmm0, [eax]
|
| - movdqu xmm1, [eax + 16]
|
| - movdqu xmm2, [eax + 32]
|
| - movdqu xmm3, [eax + 48]
|
| - pmaddubsw xmm0, xmm4
|
| - pmaddubsw xmm1, xmm4
|
| - pmaddubsw xmm2, xmm4
|
| - pmaddubsw xmm3, xmm4
|
| - lea eax, [eax + 64]
|
| - phaddw xmm0, xmm1
|
| - phaddw xmm2, xmm3
|
| - psrlw xmm0, 7
|
| - psrlw xmm2, 7
|
| - packuswb xmm0, xmm2
|
| - paddb xmm0, xmm5
|
| - sub ecx, 16
|
| - movdqu [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - jg convertloop
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| - __asm {
|
| - mov eax, [esp + 4] /* src_argb */
|
| - mov edx, [esp + 8] /* dst_y */
|
| - mov ecx, [esp + 12] /* pix */
|
| - movdqa xmm5, kAddY16
|
| - movdqa xmm4, kRGBAToY
|
| -
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| - pmaddubsw xmm0, xmm4
|
| - pmaddubsw xmm1, xmm4
|
| - pmaddubsw xmm2, xmm4
|
| - pmaddubsw xmm3, xmm4
|
| - lea eax, [eax + 64]
|
| - phaddw xmm0, xmm1
|
| - phaddw xmm2, xmm3
|
| - psrlw xmm0, 7
|
| - psrlw xmm2, 7
|
| - packuswb xmm0, xmm2
|
| - paddb xmm0, xmm5
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - jg convertloop
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
| - __asm {
|
| - mov eax, [esp + 4] /* src_argb */
|
| - mov edx, [esp + 8] /* dst_y */
|
| - mov ecx, [esp + 12] /* pix */
|
| - movdqa xmm5, kAddY16
|
| - movdqa xmm4, kRGBAToY
|
| -
|
| - align 4
|
| - convertloop:
|
| - movdqu xmm0, [eax]
|
| - movdqu xmm1, [eax + 16]
|
| - movdqu xmm2, [eax + 32]
|
| - movdqu xmm3, [eax + 48]
|
| - pmaddubsw xmm0, xmm4
|
| - pmaddubsw xmm1, xmm4
|
| - pmaddubsw xmm2, xmm4
|
| - pmaddubsw xmm3, xmm4
|
| - lea eax, [eax + 64]
|
| - phaddw xmm0, xmm1
|
| - phaddw xmm2, xmm3
|
| - psrlw xmm0, 7
|
| - psrlw xmm2, 7
|
| - packuswb xmm0, xmm2
|
| - paddb xmm0, xmm5
|
| - sub ecx, 16
|
| - movdqu [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - jg convertloop
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| + uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| push esi
|
| push edi
|
| @@ -1220,88 +1138,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| mov edx, [esp + 8 + 12] // dst_u
|
| mov edi, [esp + 8 + 16] // dst_v
|
| mov ecx, [esp + 8 + 20] // pix
|
| - movdqa xmm7, kARGBToU
|
| - movdqa xmm6, kARGBToV
|
| - movdqa xmm5, kAddUV128
|
| + movdqa xmm5, kAddUVJ128
|
| + movdqa xmm6, kARGBToVJ
|
| + movdqa xmm7, kARGBToUJ
|
| sub edi, edx // stride from u to v
|
|
|
| - align 4
|
| convertloop:
|
| /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| - pavgb xmm0, [eax + esi]
|
| - pavgb xmm1, [eax + esi + 16]
|
| - pavgb xmm2, [eax + esi + 32]
|
| - pavgb xmm3, [eax + esi + 48]
|
| - lea eax, [eax + 64]
|
| - movdqa xmm4, xmm0
|
| - shufps xmm0, xmm1, 0x88
|
| - shufps xmm4, xmm1, 0xdd
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm4, [eax + esi]
|
| pavgb xmm0, xmm4
|
| - movdqa xmm4, xmm2
|
| - shufps xmm2, xmm3, 0x88
|
| - shufps xmm4, xmm3, 0xdd
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm4, [eax + esi + 16]
|
| + pavgb xmm1, xmm4
|
| + movdqu xmm2, [eax + 32]
|
| + movdqu xmm4, [eax + esi + 32]
|
| pavgb xmm2, xmm4
|
| + movdqu xmm3, [eax + 48]
|
| + movdqu xmm4, [eax + esi + 48]
|
| + pavgb xmm3, xmm4
|
|
|
| - // step 2 - convert to U and V
|
| - // from here down is very similar to Y code except
|
| - // instead of 16 different pixels, its 8 pixels of U and 8 of V
|
| - movdqa xmm1, xmm0
|
| - movdqa xmm3, xmm2
|
| - pmaddubsw xmm0, xmm7 // U
|
| - pmaddubsw xmm2, xmm7
|
| - pmaddubsw xmm1, xmm6 // V
|
| - pmaddubsw xmm3, xmm6
|
| - phaddw xmm0, xmm2
|
| - phaddw xmm1, xmm3
|
| - psraw xmm0, 8
|
| - psraw xmm1, 8
|
| - packsswb xmm0, xmm1
|
| - paddb xmm0, xmm5 // -> unsigned
|
| -
|
| - // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| - movlps qword ptr [edx], xmm0 // U
|
| - movhps qword ptr [edx + edi], xmm0 // V
|
| - lea edx, [edx + 8]
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov eax, [esp + 8 + 4] // src_argb
|
| - mov esi, [esp + 8 + 8] // src_stride_argb
|
| - mov edx, [esp + 8 + 12] // dst_u
|
| - mov edi, [esp + 8 + 16] // dst_v
|
| - mov ecx, [esp + 8 + 20] // pix
|
| - movdqa xmm7, kARGBToUJ
|
| - movdqa xmm6, kARGBToVJ
|
| - movdqa xmm5, kAddUVJ128
|
| - sub edi, edx // stride from u to v
|
| -
|
| - align 4
|
| - convertloop:
|
| - /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| - pavgb xmm0, [eax + esi]
|
| - pavgb xmm1, [eax + esi + 16]
|
| - pavgb xmm2, [eax + esi + 32]
|
| - pavgb xmm3, [eax + esi + 48]
|
| lea eax, [eax + 64]
|
| movdqa xmm4, xmm0
|
| shufps xmm0, xmm1, 0x88
|
| @@ -1330,10 +1186,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| packsswb xmm0, xmm1
|
|
|
| // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| movlps qword ptr [edx], xmm0 // U
|
| movhps qword ptr [edx + edi], xmm0 // V
|
| lea edx, [edx + 8]
|
| + sub ecx, 16
|
| jg convertloop
|
|
|
| pop edi
|
| @@ -1359,7 +1215,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
| vbroadcastf128 ymm7, kARGBToU
|
| sub edi, edx // stride from u to v
|
|
|
| - align 4
|
| convertloop:
|
| /* step 1 - subsample 32x2 argb pixels to 16x1 */
|
| vmovdqu ymm0, [eax]
|
| @@ -1395,10 +1250,10 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
| vpaddb ymm0, ymm0, ymm5 // -> unsigned
|
|
|
| // step 3 - store 16 U and 16 V values
|
| - sub ecx, 32
|
| vextractf128 [edx], ymm0, 0 // U
|
| vextractf128 [edx + edi], ymm0, 1 // V
|
| lea edx, [edx + 16]
|
| + sub ecx, 32
|
| jg convertloop
|
|
|
| pop edi
|
| @@ -1410,237 +1265,36 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
| #endif // HAS_ARGBTOUVROW_AVX2
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| +void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
|
| + uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| - push esi
|
| push edi
|
| - mov eax, [esp + 8 + 4] // src_argb
|
| - mov esi, [esp + 8 + 8] // src_stride_argb
|
| - mov edx, [esp + 8 + 12] // dst_u
|
| - mov edi, [esp + 8 + 16] // dst_v
|
| - mov ecx, [esp + 8 + 20] // pix
|
| - movdqa xmm7, kARGBToU
|
| - movdqa xmm6, kARGBToV
|
| + mov eax, [esp + 4 + 4] // src_argb
|
| + mov edx, [esp + 4 + 8] // dst_u
|
| + mov edi, [esp + 4 + 12] // dst_v
|
| + mov ecx, [esp + 4 + 16] // pix
|
| movdqa xmm5, kAddUV128
|
| + movdqa xmm6, kARGBToV
|
| + movdqa xmm7, kARGBToU
|
| sub edi, edx // stride from u to v
|
|
|
| - align 4
|
| convertloop:
|
| - /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| - movdqu xmm0, [eax]
|
| + /* convert to U and V */
|
| + movdqu xmm0, [eax] // U
|
| movdqu xmm1, [eax + 16]
|
| movdqu xmm2, [eax + 32]
|
| movdqu xmm3, [eax + 48]
|
| - movdqu xmm4, [eax + esi]
|
| - pavgb xmm0, xmm4
|
| - movdqu xmm4, [eax + esi + 16]
|
| - pavgb xmm1, xmm4
|
| - movdqu xmm4, [eax + esi + 32]
|
| - pavgb xmm2, xmm4
|
| - movdqu xmm4, [eax + esi + 48]
|
| - pavgb xmm3, xmm4
|
| - lea eax, [eax + 64]
|
| - movdqa xmm4, xmm0
|
| - shufps xmm0, xmm1, 0x88
|
| - shufps xmm4, xmm1, 0xdd
|
| - pavgb xmm0, xmm4
|
| - movdqa xmm4, xmm2
|
| - shufps xmm2, xmm3, 0x88
|
| - shufps xmm4, xmm3, 0xdd
|
| - pavgb xmm2, xmm4
|
| -
|
| - // step 2 - convert to U and V
|
| - // from here down is very similar to Y code except
|
| - // instead of 16 different pixels, its 8 pixels of U and 8 of V
|
| - movdqa xmm1, xmm0
|
| - movdqa xmm3, xmm2
|
| - pmaddubsw xmm0, xmm7 // U
|
| + pmaddubsw xmm0, xmm7
|
| + pmaddubsw xmm1, xmm7
|
| pmaddubsw xmm2, xmm7
|
| - pmaddubsw xmm1, xmm6 // V
|
| - pmaddubsw xmm3, xmm6
|
| - phaddw xmm0, xmm2
|
| - phaddw xmm1, xmm3
|
| + pmaddubsw xmm3, xmm7
|
| + phaddw xmm0, xmm1
|
| + phaddw xmm2, xmm3
|
| psraw xmm0, 8
|
| - psraw xmm1, 8
|
| - packsswb xmm0, xmm1
|
| - paddb xmm0, xmm5 // -> unsigned
|
| -
|
| - // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| - movlps qword ptr [edx], xmm0 // U
|
| - movhps qword ptr [edx + edi], xmm0 // V
|
| - lea edx, [edx + 8]
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov eax, [esp + 8 + 4] // src_argb
|
| - mov esi, [esp + 8 + 8] // src_stride_argb
|
| - mov edx, [esp + 8 + 12] // dst_u
|
| - mov edi, [esp + 8 + 16] // dst_v
|
| - mov ecx, [esp + 8 + 20] // pix
|
| - movdqa xmm7, kARGBToUJ
|
| - movdqa xmm6, kARGBToVJ
|
| - movdqa xmm5, kAddUVJ128
|
| - sub edi, edx // stride from u to v
|
| -
|
| - align 4
|
| - convertloop:
|
| - /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| - movdqu xmm0, [eax]
|
| - movdqu xmm1, [eax + 16]
|
| - movdqu xmm2, [eax + 32]
|
| - movdqu xmm3, [eax + 48]
|
| - movdqu xmm4, [eax + esi]
|
| - pavgb xmm0, xmm4
|
| - movdqu xmm4, [eax + esi + 16]
|
| - pavgb xmm1, xmm4
|
| - movdqu xmm4, [eax + esi + 32]
|
| - pavgb xmm2, xmm4
|
| - movdqu xmm4, [eax + esi + 48]
|
| - pavgb xmm3, xmm4
|
| - lea eax, [eax + 64]
|
| - movdqa xmm4, xmm0
|
| - shufps xmm0, xmm1, 0x88
|
| - shufps xmm4, xmm1, 0xdd
|
| - pavgb xmm0, xmm4
|
| - movdqa xmm4, xmm2
|
| - shufps xmm2, xmm3, 0x88
|
| - shufps xmm4, xmm3, 0xdd
|
| - pavgb xmm2, xmm4
|
| -
|
| - // step 2 - convert to U and V
|
| - // from here down is very similar to Y code except
|
| - // instead of 16 different pixels, its 8 pixels of U and 8 of V
|
| - movdqa xmm1, xmm0
|
| - movdqa xmm3, xmm2
|
| - pmaddubsw xmm0, xmm7 // U
|
| - pmaddubsw xmm2, xmm7
|
| - pmaddubsw xmm1, xmm6 // V
|
| - pmaddubsw xmm3, xmm6
|
| - phaddw xmm0, xmm2
|
| - phaddw xmm1, xmm3
|
| - paddw xmm0, xmm5 // +.5 rounding -> unsigned
|
| - paddw xmm1, xmm5
|
| - psraw xmm0, 8
|
| - psraw xmm1, 8
|
| - packsswb xmm0, xmm1
|
| -
|
| - // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| - movlps qword ptr [edx], xmm0 // U
|
| - movhps qword ptr [edx + edi], xmm0 // V
|
| - lea edx, [edx + 8]
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| - __asm {
|
| - push edi
|
| - mov eax, [esp + 4 + 4] // src_argb
|
| - mov edx, [esp + 4 + 8] // dst_u
|
| - mov edi, [esp + 4 + 12] // dst_v
|
| - mov ecx, [esp + 4 + 16] // pix
|
| - movdqa xmm7, kARGBToU
|
| - movdqa xmm6, kARGBToV
|
| - movdqa xmm5, kAddUV128
|
| - sub edi, edx // stride from u to v
|
| -
|
| - align 4
|
| - convertloop:
|
| - /* convert to U and V */
|
| - movdqa xmm0, [eax] // U
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| - pmaddubsw xmm0, xmm7
|
| - pmaddubsw xmm1, xmm7
|
| - pmaddubsw xmm2, xmm7
|
| - pmaddubsw xmm3, xmm7
|
| - phaddw xmm0, xmm1
|
| - phaddw xmm2, xmm3
|
| - psraw xmm0, 8
|
| - psraw xmm2, 8
|
| - packsswb xmm0, xmm2
|
| - paddb xmm0, xmm5
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| -
|
| - movdqa xmm0, [eax] // V
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| - pmaddubsw xmm0, xmm6
|
| - pmaddubsw xmm1, xmm6
|
| - pmaddubsw xmm2, xmm6
|
| - pmaddubsw xmm3, xmm6
|
| - phaddw xmm0, xmm1
|
| - phaddw xmm2, xmm3
|
| - psraw xmm0, 8
|
| - psraw xmm2, 8
|
| - packsswb xmm0, xmm2
|
| - paddb xmm0, xmm5
|
| - lea eax, [eax + 64]
|
| - movdqa [edx + edi], xmm0
|
| - lea edx, [edx + 16]
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| - __asm {
|
| - push edi
|
| - mov eax, [esp + 4 + 4] // src_argb
|
| - mov edx, [esp + 4 + 8] // dst_u
|
| - mov edi, [esp + 4 + 12] // dst_v
|
| - mov ecx, [esp + 4 + 16] // pix
|
| - movdqa xmm7, kARGBToU
|
| - movdqa xmm6, kARGBToV
|
| - movdqa xmm5, kAddUV128
|
| - sub edi, edx // stride from u to v
|
| -
|
| - align 4
|
| - convertloop:
|
| - /* convert to U and V */
|
| - movdqu xmm0, [eax] // U
|
| - movdqu xmm1, [eax + 16]
|
| - movdqu xmm2, [eax + 32]
|
| - movdqu xmm3, [eax + 48]
|
| - pmaddubsw xmm0, xmm7
|
| - pmaddubsw xmm1, xmm7
|
| - pmaddubsw xmm2, xmm7
|
| - pmaddubsw xmm3, xmm7
|
| - phaddw xmm0, xmm1
|
| - phaddw xmm2, xmm3
|
| - psraw xmm0, 8
|
| - psraw xmm2, 8
|
| - packsswb xmm0, xmm2
|
| - paddb xmm0, xmm5
|
| - sub ecx, 16
|
| - movdqu [edx], xmm0
|
| + psraw xmm2, 8
|
| + packsswb xmm0, xmm2
|
| + paddb xmm0, xmm5
|
| + movdqu [edx], xmm0
|
|
|
| movdqu xmm0, [eax] // V
|
| movdqu xmm1, [eax + 16]
|
| @@ -1659,6 +1313,7 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
|
| lea eax, [eax + 64]
|
| movdqu [edx + edi], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
|
|
| pop edi
|
| @@ -1666,287 +1321,26 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
|
| }
|
| }
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| -void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| - __asm {
|
| - push edi
|
| - mov eax, [esp + 4 + 4] // src_argb
|
| - mov edx, [esp + 4 + 8] // dst_u
|
| - mov edi, [esp + 4 + 12] // dst_v
|
| - mov ecx, [esp + 4 + 16] // pix
|
| - movdqa xmm7, kARGBToU
|
| - movdqa xmm6, kARGBToV
|
| - movdqa xmm5, kAddUV128
|
| - sub edi, edx // stride from u to v
|
| -
|
| - align 4
|
| - convertloop:
|
| - /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| - lea eax, [eax + 64]
|
| - movdqa xmm4, xmm0
|
| - shufps xmm0, xmm1, 0x88
|
| - shufps xmm4, xmm1, 0xdd
|
| - pavgb xmm0, xmm4
|
| - movdqa xmm4, xmm2
|
| - shufps xmm2, xmm3, 0x88
|
| - shufps xmm4, xmm3, 0xdd
|
| - pavgb xmm2, xmm4
|
| -
|
| - // step 2 - convert to U and V
|
| - // from here down is very similar to Y code except
|
| - // instead of 16 different pixels, its 8 pixels of U and 8 of V
|
| - movdqa xmm1, xmm0
|
| - movdqa xmm3, xmm2
|
| - pmaddubsw xmm0, xmm7 // U
|
| - pmaddubsw xmm2, xmm7
|
| - pmaddubsw xmm1, xmm6 // V
|
| - pmaddubsw xmm3, xmm6
|
| - phaddw xmm0, xmm2
|
| - phaddw xmm1, xmm3
|
| - psraw xmm0, 8
|
| - psraw xmm1, 8
|
| - packsswb xmm0, xmm1
|
| - paddb xmm0, xmm5 // -> unsigned
|
| -
|
| - // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| - movlps qword ptr [edx], xmm0 // U
|
| - movhps qword ptr [edx + edi], xmm0 // V
|
| - lea edx, [edx + 8]
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| - __asm {
|
| - push edi
|
| - mov eax, [esp + 4 + 4] // src_argb
|
| - mov edx, [esp + 4 + 8] // dst_u
|
| - mov edi, [esp + 4 + 12] // dst_v
|
| - mov ecx, [esp + 4 + 16] // pix
|
| - movdqa xmm7, kARGBToU
|
| - movdqa xmm6, kARGBToV
|
| - movdqa xmm5, kAddUV128
|
| - sub edi, edx // stride from u to v
|
| -
|
| - align 4
|
| - convertloop:
|
| - /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| - movdqu xmm0, [eax]
|
| - movdqu xmm1, [eax + 16]
|
| - movdqu xmm2, [eax + 32]
|
| - movdqu xmm3, [eax + 48]
|
| - lea eax, [eax + 64]
|
| - movdqa xmm4, xmm0
|
| - shufps xmm0, xmm1, 0x88
|
| - shufps xmm4, xmm1, 0xdd
|
| - pavgb xmm0, xmm4
|
| - movdqa xmm4, xmm2
|
| - shufps xmm2, xmm3, 0x88
|
| - shufps xmm4, xmm3, 0xdd
|
| - pavgb xmm2, xmm4
|
| -
|
| - // step 2 - convert to U and V
|
| - // from here down is very similar to Y code except
|
| - // instead of 16 different pixels, its 8 pixels of U and 8 of V
|
| - movdqa xmm1, xmm0
|
| - movdqa xmm3, xmm2
|
| - pmaddubsw xmm0, xmm7 // U
|
| - pmaddubsw xmm2, xmm7
|
| - pmaddubsw xmm1, xmm6 // V
|
| - pmaddubsw xmm3, xmm6
|
| - phaddw xmm0, xmm2
|
| - phaddw xmm1, xmm3
|
| - psraw xmm0, 8
|
| - psraw xmm1, 8
|
| - packsswb xmm0, xmm1
|
| - paddb xmm0, xmm5 // -> unsigned
|
| -
|
| - // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| - movlps qword ptr [edx], xmm0 // U
|
| - movhps qword ptr [edx + edi], xmm0 // V
|
| - lea edx, [edx + 8]
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov eax, [esp + 8 + 4] // src_argb
|
| - mov esi, [esp + 8 + 8] // src_stride_argb
|
| - mov edx, [esp + 8 + 12] // dst_u
|
| - mov edi, [esp + 8 + 16] // dst_v
|
| - mov ecx, [esp + 8 + 20] // pix
|
| - movdqa xmm7, kBGRAToU
|
| - movdqa xmm6, kBGRAToV
|
| - movdqa xmm5, kAddUV128
|
| - sub edi, edx // stride from u to v
|
| -
|
| - align 4
|
| - convertloop:
|
| - /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| - pavgb xmm0, [eax + esi]
|
| - pavgb xmm1, [eax + esi + 16]
|
| - pavgb xmm2, [eax + esi + 32]
|
| - pavgb xmm3, [eax + esi + 48]
|
| - lea eax, [eax + 64]
|
| - movdqa xmm4, xmm0
|
| - shufps xmm0, xmm1, 0x88
|
| - shufps xmm4, xmm1, 0xdd
|
| - pavgb xmm0, xmm4
|
| - movdqa xmm4, xmm2
|
| - shufps xmm2, xmm3, 0x88
|
| - shufps xmm4, xmm3, 0xdd
|
| - pavgb xmm2, xmm4
|
| -
|
| - // step 2 - convert to U and V
|
| - // from here down is very similar to Y code except
|
| - // instead of 16 different pixels, its 8 pixels of U and 8 of V
|
| - movdqa xmm1, xmm0
|
| - movdqa xmm3, xmm2
|
| - pmaddubsw xmm0, xmm7 // U
|
| - pmaddubsw xmm2, xmm7
|
| - pmaddubsw xmm1, xmm6 // V
|
| - pmaddubsw xmm3, xmm6
|
| - phaddw xmm0, xmm2
|
| - phaddw xmm1, xmm3
|
| - psraw xmm0, 8
|
| - psraw xmm1, 8
|
| - packsswb xmm0, xmm1
|
| - paddb xmm0, xmm5 // -> unsigned
|
| -
|
| - // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| - movlps qword ptr [edx], xmm0 // U
|
| - movhps qword ptr [edx + edi], xmm0 // V
|
| - lea edx, [edx + 8]
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov eax, [esp + 8 + 4] // src_argb
|
| - mov esi, [esp + 8 + 8] // src_stride_argb
|
| - mov edx, [esp + 8 + 12] // dst_u
|
| - mov edi, [esp + 8 + 16] // dst_v
|
| - mov ecx, [esp + 8 + 20] // pix
|
| - movdqa xmm7, kBGRAToU
|
| - movdqa xmm6, kBGRAToV
|
| - movdqa xmm5, kAddUV128
|
| - sub edi, edx // stride from u to v
|
| -
|
| - align 4
|
| - convertloop:
|
| - /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| - movdqu xmm0, [eax]
|
| - movdqu xmm1, [eax + 16]
|
| - movdqu xmm2, [eax + 32]
|
| - movdqu xmm3, [eax + 48]
|
| - movdqu xmm4, [eax + esi]
|
| - pavgb xmm0, xmm4
|
| - movdqu xmm4, [eax + esi + 16]
|
| - pavgb xmm1, xmm4
|
| - movdqu xmm4, [eax + esi + 32]
|
| - pavgb xmm2, xmm4
|
| - movdqu xmm4, [eax + esi + 48]
|
| - pavgb xmm3, xmm4
|
| - lea eax, [eax + 64]
|
| - movdqa xmm4, xmm0
|
| - shufps xmm0, xmm1, 0x88
|
| - shufps xmm4, xmm1, 0xdd
|
| - pavgb xmm0, xmm4
|
| - movdqa xmm4, xmm2
|
| - shufps xmm2, xmm3, 0x88
|
| - shufps xmm4, xmm3, 0xdd
|
| - pavgb xmm2, xmm4
|
| -
|
| - // step 2 - convert to U and V
|
| - // from here down is very similar to Y code except
|
| - // instead of 16 different pixels, its 8 pixels of U and 8 of V
|
| - movdqa xmm1, xmm0
|
| - movdqa xmm3, xmm2
|
| - pmaddubsw xmm0, xmm7 // U
|
| - pmaddubsw xmm2, xmm7
|
| - pmaddubsw xmm1, xmm6 // V
|
| - pmaddubsw xmm3, xmm6
|
| - phaddw xmm0, xmm2
|
| - phaddw xmm1, xmm3
|
| - psraw xmm0, 8
|
| - psraw xmm1, 8
|
| - packsswb xmm0, xmm1
|
| - paddb xmm0, xmm5 // -> unsigned
|
| -
|
| - // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| - movlps qword ptr [edx], xmm0 // U
|
| - movhps qword ptr [edx + edi], xmm0 // V
|
| - lea edx, [edx + 8]
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| +__declspec(naked) __declspec(align(16))
|
| +void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
|
| + uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| - push esi
|
| push edi
|
| - mov eax, [esp + 8 + 4] // src_argb
|
| - mov esi, [esp + 8 + 8] // src_stride_argb
|
| - mov edx, [esp + 8 + 12] // dst_u
|
| - mov edi, [esp + 8 + 16] // dst_v
|
| - mov ecx, [esp + 8 + 20] // pix
|
| - movdqa xmm7, kABGRToU
|
| - movdqa xmm6, kABGRToV
|
| + mov eax, [esp + 4 + 4] // src_argb
|
| + mov edx, [esp + 4 + 8] // dst_u
|
| + mov edi, [esp + 4 + 12] // dst_v
|
| + mov ecx, [esp + 4 + 16] // pix
|
| movdqa xmm5, kAddUV128
|
| + movdqa xmm6, kARGBToV
|
| + movdqa xmm7, kARGBToU
|
| sub edi, edx // stride from u to v
|
|
|
| - align 4
|
| convertloop:
|
| /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| - pavgb xmm0, [eax + esi]
|
| - pavgb xmm1, [eax + esi + 16]
|
| - pavgb xmm2, [eax + esi + 32]
|
| - pavgb xmm3, [eax + esi + 48]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm2, [eax + 32]
|
| + movdqu xmm3, [eax + 48]
|
| lea eax, [eax + 64]
|
| movdqa xmm4, xmm0
|
| shufps xmm0, xmm1, 0x88
|
| @@ -1974,21 +1368,20 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| paddb xmm0, xmm5 // -> unsigned
|
|
|
| // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| movlps qword ptr [edx], xmm0 // U
|
| movhps qword ptr [edx + edi], xmm0 // V
|
| lea edx, [edx + 8]
|
| + sub ecx, 16
|
| jg convertloop
|
|
|
| pop edi
|
| - pop esi
|
| ret
|
| }
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| + uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| push esi
|
| push edi
|
| @@ -1997,26 +1390,26 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| mov edx, [esp + 8 + 12] // dst_u
|
| mov edi, [esp + 8 + 16] // dst_v
|
| mov ecx, [esp + 8 + 20] // pix
|
| - movdqa xmm7, kABGRToU
|
| - movdqa xmm6, kABGRToV
|
| movdqa xmm5, kAddUV128
|
| + movdqa xmm6, kBGRAToV
|
| + movdqa xmm7, kBGRAToU
|
| sub edi, edx // stride from u to v
|
|
|
| - align 4
|
| convertloop:
|
| /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| movdqu xmm0, [eax]
|
| - movdqu xmm1, [eax + 16]
|
| - movdqu xmm2, [eax + 32]
|
| - movdqu xmm3, [eax + 48]
|
| movdqu xmm4, [eax + esi]
|
| pavgb xmm0, xmm4
|
| + movdqu xmm1, [eax + 16]
|
| movdqu xmm4, [eax + esi + 16]
|
| pavgb xmm1, xmm4
|
| + movdqu xmm2, [eax + 32]
|
| movdqu xmm4, [eax + esi + 32]
|
| pavgb xmm2, xmm4
|
| + movdqu xmm3, [eax + 48]
|
| movdqu xmm4, [eax + esi + 48]
|
| pavgb xmm3, xmm4
|
| +
|
| lea eax, [eax + 64]
|
| movdqa xmm4, xmm0
|
| shufps xmm0, xmm1, 0x88
|
| @@ -2044,10 +1437,10 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| paddb xmm0, xmm5 // -> unsigned
|
|
|
| // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| movlps qword ptr [edx], xmm0 // U
|
| movhps qword ptr [edx + edi], xmm0 // V
|
| lea edx, [edx + 8]
|
| + sub ecx, 16
|
| jg convertloop
|
|
|
| pop edi
|
| @@ -2057,7 +1450,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| push esi
|
| @@ -2067,22 +1460,26 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| mov edx, [esp + 8 + 12] // dst_u
|
| mov edi, [esp + 8 + 16] // dst_v
|
| mov ecx, [esp + 8 + 20] // pix
|
| - movdqa xmm7, kRGBAToU
|
| - movdqa xmm6, kRGBAToV
|
| movdqa xmm5, kAddUV128
|
| + movdqa xmm6, kABGRToV
|
| + movdqa xmm7, kABGRToU
|
| sub edi, edx // stride from u to v
|
|
|
| - align 4
|
| convertloop:
|
| /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| - pavgb xmm0, [eax + esi]
|
| - pavgb xmm1, [eax + esi + 16]
|
| - pavgb xmm2, [eax + esi + 32]
|
| - pavgb xmm3, [eax + esi + 48]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm4, [eax + esi]
|
| + pavgb xmm0, xmm4
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm4, [eax + esi + 16]
|
| + pavgb xmm1, xmm4
|
| + movdqu xmm2, [eax + 32]
|
| + movdqu xmm4, [eax + esi + 32]
|
| + pavgb xmm2, xmm4
|
| + movdqu xmm3, [eax + 48]
|
| + movdqu xmm4, [eax + esi + 48]
|
| + pavgb xmm3, xmm4
|
| +
|
| lea eax, [eax + 64]
|
| movdqa xmm4, xmm0
|
| shufps xmm0, xmm1, 0x88
|
| @@ -2110,10 +1507,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| paddb xmm0, xmm5 // -> unsigned
|
|
|
| // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| movlps qword ptr [edx], xmm0 // U
|
| movhps qword ptr [edx + edi], xmm0 // V
|
| lea edx, [edx + 8]
|
| + sub ecx, 16
|
| jg convertloop
|
|
|
| pop edi
|
| @@ -2123,8 +1520,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| - uint8* dst_u, uint8* dst_v, int width) {
|
| +void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| + uint8* dst_u, uint8* dst_v, int width) {
|
| __asm {
|
| push esi
|
| push edi
|
| @@ -2133,26 +1530,26 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| mov edx, [esp + 8 + 12] // dst_u
|
| mov edi, [esp + 8 + 16] // dst_v
|
| mov ecx, [esp + 8 + 20] // pix
|
| - movdqa xmm7, kRGBAToU
|
| - movdqa xmm6, kRGBAToV
|
| movdqa xmm5, kAddUV128
|
| + movdqa xmm6, kRGBAToV
|
| + movdqa xmm7, kRGBAToU
|
| sub edi, edx // stride from u to v
|
|
|
| - align 4
|
| convertloop:
|
| /* step 1 - subsample 16x2 argb pixels to 8x1 */
|
| movdqu xmm0, [eax]
|
| - movdqu xmm1, [eax + 16]
|
| - movdqu xmm2, [eax + 32]
|
| - movdqu xmm3, [eax + 48]
|
| movdqu xmm4, [eax + esi]
|
| pavgb xmm0, xmm4
|
| + movdqu xmm1, [eax + 16]
|
| movdqu xmm4, [eax + esi + 16]
|
| pavgb xmm1, xmm4
|
| + movdqu xmm2, [eax + 32]
|
| movdqu xmm4, [eax + esi + 32]
|
| pavgb xmm2, xmm4
|
| + movdqu xmm3, [eax + 48]
|
| movdqu xmm4, [eax + esi + 48]
|
| pavgb xmm3, xmm4
|
| +
|
| lea eax, [eax + 64]
|
| movdqa xmm4, xmm0
|
| shufps xmm0, xmm1, 0x88
|
| @@ -2180,10 +1577,10 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| paddb xmm0, xmm5 // -> unsigned
|
|
|
| // step 3 - store 8 U and 8 V values
|
| - sub ecx, 16
|
| movlps qword ptr [edx], xmm0 // U
|
| movhps qword ptr [edx + edi], xmm0 // V
|
| lea edx, [edx + 8]
|
| + sub ecx, 16
|
| jg convertloop
|
|
|
| pop edi
|
| @@ -2193,36 +1590,68 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
| }
|
| #endif // HAS_ARGBTOYROW_SSSE3
|
|
|
| -#ifdef HAS_I422TOARGBROW_AVX2
|
| -
|
| -static const lvec8 kUVToB_AVX = {
|
| - UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
|
| - UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
|
| -};
|
| -static const lvec8 kUVToR_AVX = {
|
| - UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
|
| - UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
|
| -};
|
| -static const lvec8 kUVToG_AVX = {
|
| - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
|
| - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
|
| -};
|
| -static const lvec16 kYToRgb_AVX = {
|
| - YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
|
| -};
|
| -static const lvec16 kYSub16_AVX = {
|
| - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
|
| -};
|
| -static const lvec16 kUVBiasB_AVX = {
|
| - BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
|
| -};
|
| -static const lvec16 kUVBiasG_AVX = {
|
| - BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
|
| -};
|
| -static const lvec16 kUVBiasR_AVX = {
|
| - BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
|
| -};
|
| +// Read 8 UV from 422, upsample to 16 UV.
|
| +#define READYUV422_AVX2 __asm { \
|
| + __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
|
| + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
|
| + __asm lea esi, [esi + 8] \
|
| + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
|
| + __asm vpermq ymm0, ymm0, 0xd8 \
|
| + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
|
| + }
|
| +
|
| +// Read 8 UV from NV12, upsample to 16 UV.
|
| +#define READNV12_AVX2 __asm { \
|
| + __asm vmovdqu xmm0, [esi] /* UV */ \
|
| + __asm lea esi, [esi + 16] \
|
| + __asm vpermq ymm0, ymm0, 0xd8 \
|
| + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
|
| + }
|
| +
|
| +// Convert 16 pixels: 16 UV and 16 Y.
|
| +#define YUVTORGB_AVX2(YuvConstants) __asm { \
|
| + /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \
|
| + __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \
|
| + __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \
|
| + __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \
|
| + __asm vmovdqu ymm3, YuvConstants.kUVBiasR \
|
| + __asm vpsubw ymm2, ymm3, ymm2 \
|
| + __asm vmovdqu ymm3, YuvConstants.kUVBiasG \
|
| + __asm vpsubw ymm1, ymm3, ymm1 \
|
| + __asm vmovdqu ymm3, YuvConstants.kUVBiasB \
|
| + __asm vpsubw ymm0, ymm3, ymm0 \
|
| + /* Step 2: Find Y contribution to 16 R,G,B values */ \
|
| + __asm vmovdqu xmm3, [eax] /* NOLINT */ \
|
| + __asm lea eax, [eax + 16] \
|
| + __asm vpermq ymm3, ymm3, 0xd8 \
|
| + __asm vpunpcklbw ymm3, ymm3, ymm3 \
|
| + __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \
|
| + __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \
|
| + __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \
|
| + __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \
|
| + __asm vpsraw ymm0, ymm0, 6 \
|
| + __asm vpsraw ymm1, ymm1, 6 \
|
| + __asm vpsraw ymm2, ymm2, 6 \
|
| + __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
|
| + __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
|
| + __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
|
| + }
|
| +
|
| +// Store 16 ARGB values.
|
| +#define STOREARGB_AVX2 __asm { \
|
| + /* Step 3: Weave into ARGB */ \
|
| + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
|
| + __asm vpermq ymm0, ymm0, 0xd8 \
|
| + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
|
| + __asm vpermq ymm2, ymm2, 0xd8 \
|
| + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
|
| + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
|
| + __asm vmovdqu [edx], ymm1 \
|
| + __asm vmovdqu [edx + 32], ymm0 \
|
| + __asm lea edx, [edx + 64] \
|
| + }
|
|
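| READYUV422_AVX2 upsamples 8 UV samples to 16 simply by duplicating each UV pair across two pixels, and YUVTORGB_AVX2 then applies the fixed-point conversion; the SSSE3 YUVTORGB macro later in the file follows the same model at half width. As a hedged scalar picture of what one pixel works out to (coefficient and bias names are symbolic; their values come from the YuvConstants rows):
|
| #include <stdint.h>
|
| // Hedged per-pixel model of YUVTORGB_AVX2 / YUVTORGB. Not the kernel itself;
| // yg, ub..vr, bias_* stand for the 16-bit words held in the constant rows.
| static inline uint8_t Clamp8(int v) {
|   return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
| }
|
| static inline void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
|                                   int yg, int ub, int vb, int ug, int vg,
|                                   int ur, int vr, int bias_b, int bias_g,
|                                   int bias_r, uint8_t* b, uint8_t* g,
|                                   uint8_t* r) {
|   // vpunpcklbw y,y then vpmulhuw kYToRgb: high 16 bits of (y * 0x0101) * yg.
|   int y1 = (int)(((uint32_t)(y * 0x0101) * (uint32_t)yg) >> 16);
|   // vpmaddubsw against kUVTo{B,G,R}, subtracted from the kUVBias rows,
|   // then the Y term added, shifted right by 6, packed with saturation.
|   *b = Clamp8((y1 + (bias_b - (u * ub + v * vb))) >> 6);
|   *g = Clamp8((y1 + (bias_g - (u * ug + v * vg))) >> 6);
|   *r = Clamp8((y1 + (bias_r - (u * ur + v * vr))) >> 6);
| }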
|
| +#ifdef HAS_I422TOARGBROW_AVX2
|
| // 16 pixels
|
| // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
|
| __declspec(naked) __declspec(align(16))
|
| @@ -2241,63 +1670,222 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edi, esi
|
| vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
| - vpxor ymm4, ymm4, ymm4
|
|
|
| - align 4
|
| convertloop:
|
| - vmovq xmm0, qword ptr [esi] // U
|
| - vmovq xmm1, qword ptr [esi + edi] // V
|
| - lea esi, [esi + 8]
|
| - vpunpcklbw ymm0, ymm0, ymm1 // UV
|
| - vpermq ymm0, ymm0, 0xd8
|
| - vpunpcklwd ymm0, ymm0, ymm0 // UVUV
|
| - vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV
|
| - vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV
|
| - vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV
|
| - vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed
|
| - vpsubw ymm1, ymm1, kUVBiasG_AVX
|
| - vpsubw ymm0, ymm0, kUVBiasR_AVX
|
| -
|
| - // Step 2: Find Y contribution to 16 R,G,B values
|
| - vmovdqu xmm3, [eax] // NOLINT
|
| - lea eax, [eax + 16]
|
| - vpermq ymm3, ymm3, 0xd8
|
| - vpunpcklbw ymm3, ymm3, ymm4
|
| - vpsubsw ymm3, ymm3, kYSub16_AVX
|
| - vpmullw ymm3, ymm3, kYToRgb_AVX
|
| - vpaddsw ymm2, ymm2, ymm3 // B += Y
|
| - vpaddsw ymm1, ymm1, ymm3 // G += Y
|
| - vpaddsw ymm0, ymm0, ymm3 // R += Y
|
| - vpsraw ymm2, ymm2, 6
|
| - vpsraw ymm1, ymm1, 6
|
| - vpsraw ymm0, ymm0, 6
|
| - vpackuswb ymm2, ymm2, ymm2 // B
|
| - vpackuswb ymm1, ymm1, ymm1 // G
|
| - vpackuswb ymm0, ymm0, ymm0 // R
|
| -
|
| - // Step 3: Weave into ARGB
|
| - vpunpcklbw ymm2, ymm2, ymm1 // BG
|
| + READYUV422_AVX2
|
| + YUVTORGB_AVX2(kYuvConstants)
|
| + STOREARGB_AVX2
|
| +
|
| + sub ecx, 16
|
| + jg convertloop
|
| +
|
| + pop edi
|
| + pop esi
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_I422TOARGBROW_AVX2
|
| +
|
| +#ifdef HAS_NV12TOARGBROW_AVX2
|
| +// 16 pixels.
|
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
|
| +__declspec(naked) __declspec(align(16))
|
| +void NV12ToARGBRow_AVX2(const uint8* y_buf,
|
| + const uint8* uv_buf,
|
| + uint8* dst_argb,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + mov eax, [esp + 4 + 4] // Y
|
| + mov esi, [esp + 4 + 8] // UV
|
| + mov edx, [esp + 4 + 12] // argb
|
| + mov ecx, [esp + 4 + 16] // width
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
| +
|
| + convertloop:
|
| + READNV12_AVX2
|
| + YUVTORGB_AVX2(kYuvConstants)
|
| + STOREARGB_AVX2
|
| +
|
| + sub ecx, 16
|
| + jg convertloop
|
| +
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_NV12TOARGBROW_AVX2
|
| +
|
| +#ifdef HAS_NV21TOARGBROW_AVX2
|
| +// 16 pixels.
|
| +// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
|
| +__declspec(naked) __declspec(align(16))
|
| +void NV21ToARGBRow_AVX2(const uint8* y_buf,
|
| + const uint8* uv_buf,
|
| + uint8* dst_argb,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + mov eax, [esp + 4 + 4] // Y
|
| + mov esi, [esp + 4 + 8] // UV
|
| + mov edx, [esp + 4 + 12] // argb
|
| + mov ecx, [esp + 4 + 16] // width
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
| +
|
| + convertloop:
|
| + READNV12_AVX2
|
| + YUVTORGB_AVX2(kYvuConstants)
|
| + STOREARGB_AVX2
|
| +
|
| + sub ecx, 16
|
| + jg convertloop
|
| +
|
| + pop esi
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_NV21TOARGBROW_AVX2
|
| +
|
| +#ifdef HAS_I422TOBGRAROW_AVX2
|
| +// 16 pixels
|
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
|
| +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
|
| +__declspec(naked) __declspec(align(16))
|
| +void I422ToBGRARow_AVX2(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + uint8* dst_argb,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // Y
|
| + mov esi, [esp + 8 + 8] // U
|
| + mov edi, [esp + 8 + 12] // V
|
| + mov edx, [esp + 8 + 16] // argb
|
| + mov ecx, [esp + 8 + 20] // width
|
| + sub edi, esi
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
| +
|
| + convertloop:
|
| + READYUV422_AVX2
|
| + YUVTORGB_AVX2(kYuvConstants)
|
| +
|
| + // Step 3: Weave into BGRA
|
| + vpunpcklbw ymm1, ymm1, ymm0 // GB
|
| + vpermq ymm1, ymm1, 0xd8
|
| + vpunpcklbw ymm2, ymm5, ymm2 // AR
|
| vpermq ymm2, ymm2, 0xd8
|
| - vpunpcklbw ymm0, ymm0, ymm5 // RA
|
| - vpermq ymm0, ymm0, 0xd8
|
| - vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels
|
| - vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels
|
| - vmovdqu [edx], ymm1
|
| + vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels
|
| + vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels
|
| + vmovdqu [edx], ymm0
|
| vmovdqu [edx + 32], ymm2
|
| lea edx, [edx + 64]
|
| sub ecx, 16
|
| jg convertloop
|
| +
|
| + pop edi
|
| + pop esi
|
| vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_I422TOBGRAROW_AVX2
|
| +
|
| +#ifdef HAS_I422TORGBAROW_AVX2
|
| +// 16 pixels
|
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
|
| +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
|
| +__declspec(naked) __declspec(align(16))
|
| +void I422ToRGBARow_AVX2(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + uint8* dst_argb,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // Y
|
| + mov esi, [esp + 8 + 8] // U
|
| + mov edi, [esp + 8 + 12] // V
|
| + mov edx, [esp + 8 + 16] // argb
|
| + mov ecx, [esp + 8 + 20] // width
|
| + sub edi, esi
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
| +
|
| + convertloop:
|
| + READYUV422_AVX2
|
| + YUVTORGB_AVX2(kYuvConstants)
|
| +
|
| + // Step 3: Weave into RGBA
|
| + vpunpcklbw ymm1, ymm1, ymm2 // GR
|
| + vpermq ymm1, ymm1, 0xd8
|
| + vpunpcklbw ymm2, ymm5, ymm0 // AB
|
| + vpermq ymm2, ymm2, 0xd8
|
| + vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels
|
| + vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels
|
| + vmovdqu [edx], ymm0
|
| + vmovdqu [edx + 32], ymm1
|
| + lea edx, [edx + 64]
|
| + sub ecx, 16
|
| + jg convertloop
|
|
|
| pop edi
|
| pop esi
|
| + vzeroupper
|
| ret
|
| }
|
| }
|
| -#endif // HAS_I422TOARGBROW_AVX2
|
| +#endif // HAS_I422TORGBAROW_AVX2
|
| +
|
| +#ifdef HAS_I422TOABGRROW_AVX2
|
| +// 16 pixels
|
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
|
| +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
|
| +__declspec(naked) __declspec(align(16))
|
| +void I422ToABGRRow_AVX2(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + uint8* dst_argb,
|
| + int width) {
|
| + __asm {
|
| + push esi
|
| + push edi
|
| + mov eax, [esp + 8 + 4] // Y
|
| + mov esi, [esp + 8 + 8] // U
|
| + mov edi, [esp + 8 + 12] // V
|
| + mov edx, [esp + 8 + 16] // argb
|
| + mov ecx, [esp + 8 + 20] // width
|
| + sub edi, esi
|
| + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
| +
|
| + convertloop:
|
| + READYUV422_AVX2
|
| + YUVTORGB_AVX2(kYuvConstants)
|
|
|
| -#ifdef HAS_I422TOARGBROW_SSSE3
|
| + // Step 3: Weave into ABGR
|
| + vpunpcklbw ymm1, ymm2, ymm1 // RG
|
| + vpermq ymm1, ymm1, 0xd8
|
| + vpunpcklbw ymm2, ymm0, ymm5 // BA
|
| + vpermq ymm2, ymm2, 0xd8
|
| + vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels
|
| + vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels
|
| + vmovdqu [edx], ymm0
|
| + vmovdqu [edx + 32], ymm1
|
| + lea edx, [edx + 64]
|
| + sub ecx, 16
|
| + jg convertloop
|
| +
|
| + pop edi
|
| + pop esi
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_I422TOABGRROW_AVX2
|
|
|
| +#if defined(HAS_I422TOARGBROW_SSSE3)
|
| // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
|
|
|
| // Read 8 UV from 444.
|
| @@ -2336,51 +1924,26 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
|
| __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
|
| }
|
|
|
| -// Convert 8 pixels: 8 UV and 8 Y.
|
| -#define YUVTORGB __asm { \
|
| - /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
|
| - __asm movdqa xmm1, xmm0 \
|
| - __asm movdqa xmm2, xmm0 \
|
| - __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
|
| - __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
|
| - __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
|
| - __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
|
| - __asm psubw xmm1, kUVBiasG \
|
| - __asm psubw xmm2, kUVBiasR \
|
| - /* Step 2: Find Y contribution to 8 R,G,B values */ \
|
| - __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
|
| - __asm lea eax, [eax + 8] \
|
| - __asm punpcklbw xmm3, xmm4 \
|
| - __asm psubsw xmm3, kYSub16 \
|
| - __asm pmullw xmm3, kYToRgb \
|
| - __asm paddsw xmm0, xmm3 /* B += Y */ \
|
| - __asm paddsw xmm1, xmm3 /* G += Y */ \
|
| - __asm paddsw xmm2, xmm3 /* R += Y */ \
|
| - __asm psraw xmm0, 6 \
|
| - __asm psraw xmm1, 6 \
|
| - __asm psraw xmm2, 6 \
|
| - __asm packuswb xmm0, xmm0 /* B */ \
|
| - __asm packuswb xmm1, xmm1 /* G */ \
|
| - __asm packuswb xmm2, xmm2 /* R */ \
|
| - }
|
| -
|
| -// Convert 8 pixels: 8 VU and 8 Y.
|
| -#define YVUTORGB __asm { \
|
| +// Convert 8 pixels: 8 UV and 8 Y.
|
| +#define YUVTORGB(YuvConstants) __asm { \
|
| /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
|
| __asm movdqa xmm1, xmm0 \
|
| __asm movdqa xmm2, xmm0 \
|
| - __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
|
| - __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
|
| - __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
|
| - __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
|
| - __asm psubw xmm1, kUVBiasG \
|
| - __asm psubw xmm2, kUVBiasR \
|
| + __asm movdqa xmm3, xmm0 \
|
| + __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \
|
| + __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \
|
| + __asm psubw xmm0, xmm1 \
|
| + __asm movdqa xmm1, YuvConstants.kUVBiasG \
|
| + __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \
|
| + __asm psubw xmm1, xmm2 \
|
| + __asm movdqa xmm2, YuvConstants.kUVBiasR \
|
| + __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \
|
| + __asm psubw xmm2, xmm3 \
|
| /* Step 2: Find Y contribution to 8 R,G,B values */ \
|
| __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
|
| __asm lea eax, [eax + 8] \
|
| - __asm punpcklbw xmm3, xmm4 \
|
| - __asm psubsw xmm3, kYSub16 \
|
| - __asm pmullw xmm3, kYToRgb \
|
| + __asm punpcklbw xmm3, xmm3 \
|
| + __asm pmulhuw xmm3, YuvConstants.kYToRgb \
|
| __asm paddsw xmm0, xmm3 /* B += Y */ \
|
| __asm paddsw xmm1, xmm3 /* G += Y */ \
|
| __asm paddsw xmm2, xmm3 /* R += Y */ \
|
| @@ -2392,7 +1955,131 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
|
| __asm packuswb xmm2, xmm2 /* R */ \
|
| }
|
|
|
| -// 8 pixels, dest aligned 16.
|
| +// Store 8 ARGB values.
|
| +#define STOREARGB __asm { \
|
| + /* Step 3: Weave into ARGB */ \
|
| + __asm punpcklbw xmm0, xmm1 /* BG */ \
|
| + __asm punpcklbw xmm2, xmm5 /* RA */ \
|
| + __asm movdqa xmm1, xmm0 \
|
| + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
|
| + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
|
| + __asm movdqu [edx], xmm0 \
|
| + __asm movdqu [edx + 16], xmm1 \
|
| + __asm lea edx, [edx + 32] \
|
| + }
|
| +
|
| +// Store 8 BGRA values.
|
| +#define STOREBGRA __asm { \
|
| + /* Step 3: Weave into BGRA */ \
|
| + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
|
| + __asm punpcklbw xmm1, xmm0 /* GB */ \
|
| + __asm punpcklbw xmm5, xmm2 /* AR */ \
|
| + __asm movdqa xmm0, xmm5 \
|
| + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
|
| + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
|
| + __asm movdqu [edx], xmm5 \
|
| + __asm movdqu [edx + 16], xmm0 \
|
| + __asm lea edx, [edx + 32] \
|
| + }
|
| +
|
| +// Store 8 ABGR values.
|
| +#define STOREABGR __asm { \
|
| + /* Step 3: Weave into ABGR */ \
|
| + __asm punpcklbw xmm2, xmm1 /* RG */ \
|
| + __asm punpcklbw xmm0, xmm5 /* BA */ \
|
| + __asm movdqa xmm1, xmm2 \
|
| + __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
|
| + __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
|
| + __asm movdqu [edx], xmm2 \
|
| + __asm movdqu [edx + 16], xmm1 \
|
| + __asm lea edx, [edx + 32] \
|
| + }
|
| +
|
| +// Store 8 RGBA values.
|
| +#define STORERGBA __asm { \
|
| + /* Step 3: Weave into RGBA */ \
|
| + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
|
| + __asm punpcklbw xmm1, xmm2 /* GR */ \
|
| + __asm punpcklbw xmm5, xmm0 /* AB */ \
|
| + __asm movdqa xmm0, xmm5 \
|
| + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
|
| + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
|
| + __asm movdqu [edx], xmm5 \
|
| + __asm movdqu [edx + 16], xmm0 \
|
| + __asm lea edx, [edx + 32] \
|
| + }
|
| +
|
| +// Store 8 RGB24 values.
|
| +#define STORERGB24 __asm { \
|
| + /* Step 3: Weave into RRGB */ \
|
| + __asm punpcklbw xmm0, xmm1 /* BG */ \
|
| + __asm punpcklbw xmm2, xmm2 /* RR */ \
|
| + __asm movdqa xmm1, xmm0 \
|
| + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
|
| + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
|
| + /* Step 4: RRGB -> RGB24 */ \
|
| + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
|
| + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
|
| + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
|
| + __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \
|
| + __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \
|
| + __asm lea edx, [edx + 24] \
|
| + }
|
| +
|
| +// Store 8 RAW values.
|
| +#define STORERAW __asm { \
|
| + /* Step 3: Weave into RRGB */ \
|
| + __asm punpcklbw xmm0, xmm1 /* BG */ \
|
| + __asm punpcklbw xmm2, xmm2 /* RR */ \
|
| + __asm movdqa xmm1, xmm0 \
|
| + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
|
| + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
|
| + /* Step 4: RRGB -> RAW */ \
|
| + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
|
| + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
|
| + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
|
| + __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \
|
| + __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \
|
| + __asm lea edx, [edx + 24] \
|
| + }
|
| +
|
| +// Store 8 RGB565 values.
|
| +#define STORERGB565 __asm { \
|
| + /* Step 3: Weave into RRGB */ \
|
| + __asm punpcklbw xmm0, xmm1 /* BG */ \
|
| + __asm punpcklbw xmm2, xmm2 /* RR */ \
|
| + __asm movdqa xmm1, xmm0 \
|
| + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
|
| + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
|
| + /* Step 4: RRGB -> RGB565 */ \
|
| + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
|
| + __asm movdqa xmm2, xmm0 /* G */ \
|
| + __asm pslld xmm0, 8 /* R */ \
|
| + __asm psrld xmm3, 3 /* B */ \
|
| + __asm psrld xmm2, 5 /* G */ \
|
| + __asm psrad xmm0, 16 /* R */ \
|
| + __asm pand xmm3, xmm5 /* B */ \
|
| + __asm pand xmm2, xmm6 /* G */ \
|
| + __asm pand xmm0, xmm7 /* R */ \
|
| + __asm por xmm3, xmm2 /* BG */ \
|
| + __asm por xmm0, xmm3 /* BGR */ \
|
| + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
|
| + __asm movdqa xmm2, xmm1 /* G */ \
|
| + __asm pslld xmm1, 8 /* R */ \
|
| + __asm psrld xmm3, 3 /* B */ \
|
| + __asm psrld xmm2, 5 /* G */ \
|
| + __asm psrad xmm1, 16 /* R */ \
|
| + __asm pand xmm3, xmm5 /* B */ \
|
| + __asm pand xmm2, xmm6 /* G */ \
|
| + __asm pand xmm1, xmm7 /* R */ \
|
| + __asm por xmm3, xmm2 /* BG */ \
|
| + __asm por xmm1, xmm3 /* BGR */ \
|
| + __asm packssdw xmm0, xmm1 \
|
| + __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \
|
| + __asm lea edx, [edx + 16] \
|
| + }
|
| +
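| The STORERGB565 macro above packs each converted pixel into 5-6-5 bits with that shift/mask/or sequence; per pixel the packing is equivalent to the sketch below (scalar illustration only, not the vectorized code).
|
| #include <stdint.h>
|
| // One-pixel equivalent of the STORERGB565 packing: 5 bits of B, 6 of G, 5 of R.
| static inline uint16_t PackRGB565(uint8_t b, uint8_t g, uint8_t r) {
|   return (uint16_t)((b >> 3) |           // bits 0..4
|                     ((g >> 2) << 5) |    // bits 5..10
|                     ((r >> 3) << 11));   // bits 11..15
| }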
|
| +// 8 pixels.
|
| // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| __declspec(naked) __declspec(align(16))
|
| void I444ToARGBRow_SSSE3(const uint8* y_buf,
|
| @@ -2410,22 +2097,12 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edi, esi
|
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
|
|
| - align 4
|
| convertloop:
|
| READYUV444
|
| - YUVTORGB
|
| + YUVTORGB(kYuvConstants)
|
| + STOREARGB
|
|
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm5 // RA
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
|
|
| @@ -2435,8 +2112,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
|
| }
|
| }
|
|
|
| -// 8 pixels, dest aligned 16.
|
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| +// 8 pixels.
|
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
|
| __declspec(naked) __declspec(align(16))
|
| void I422ToRGB24Row_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| @@ -2452,27 +2129,14 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
|
| mov edx, [esp + 8 + 16] // rgb24
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edi, esi
|
| - pxor xmm4, xmm4
|
| movdqa xmm5, kShuffleMaskARGBToRGB24_0
|
| movdqa xmm6, kShuffleMaskARGBToRGB24
|
|
|
| - align 4
|
| convertloop:
|
| READYUV422
|
| - YUVTORGB
|
| + YUVTORGB(kYuvConstants)
|
| + STORERGB24
|
|
|
| - // Step 3: Weave into RRGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm2 // RR
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRR first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRR next 4 pixels
|
| - pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
|
| - pshufb xmm1, xmm6 // Pack into first 12 bytes.
|
| - palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
|
| - movq qword ptr [edx], xmm0 // First 8 bytes
|
| - movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
|
| - lea edx, [edx + 24]
|
| sub ecx, 8
|
| jg convertloop
|
|
|
| @@ -2482,8 +2146,8 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
|
| }
|
| }
|
|
|
| -// 8 pixels, dest aligned 16.
|
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| +// 8 pixels.
|
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
|
| __declspec(naked) __declspec(align(16))
|
| void I422ToRAWRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| @@ -2499,27 +2163,14 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
|
| mov edx, [esp + 8 + 16] // raw
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edi, esi
|
| - pxor xmm4, xmm4
|
| movdqa xmm5, kShuffleMaskARGBToRAW_0
|
| movdqa xmm6, kShuffleMaskARGBToRAW
|
|
|
| - align 4
|
| convertloop:
|
| READYUV422
|
| - YUVTORGB
|
| + YUVTORGB(kYuvConstants)
|
| + STORERAW
|
|
|
| - // Step 3: Weave into RRGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm2 // RR
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRR first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRR next 4 pixels
|
| - pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
|
| - pshufb xmm1, xmm6 // Pack into first 12 bytes.
|
| - palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
|
| - movq qword ptr [edx], xmm0 // First 8 bytes
|
| - movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
|
| - lea edx, [edx + 24]
|
| sub ecx, 8
|
| jg convertloop
|
|
|
| @@ -2529,8 +2180,8 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
|
| }
|
| }
|
|
|
| -// 8 pixels, dest unaligned.
|
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| +// 8 pixels.
|
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
|
| __declspec(naked) __declspec(align(16))
|
| void I422ToRGB565Row_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| @@ -2546,7 +2197,6 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
|
| mov edx, [esp + 8 + 16] // rgb565
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edi, esi
|
| - pxor xmm4, xmm4
|
| pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
|
| psrld xmm5, 27
|
| pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
|
| @@ -2555,45 +2205,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
|
| pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
|
| pslld xmm7, 11
|
|
|
| - align 4
|
| convertloop:
|
| READYUV422
|
| - YUVTORGB
|
| + YUVTORGB(kYuvConstants)
|
| + STORERGB565
|
|
|
| - // Step 3: Weave into RRGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm2 // RR
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRR first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRR next 4 pixels
|
| -
|
| - // Step 3b: RRGB -> RGB565
|
| - movdqa xmm3, xmm0 // B first 4 pixels of argb
|
| - movdqa xmm2, xmm0 // G
|
| - pslld xmm0, 8 // R
|
| - psrld xmm3, 3 // B
|
| - psrld xmm2, 5 // G
|
| - psrad xmm0, 16 // R
|
| - pand xmm3, xmm5 // B
|
| - pand xmm2, xmm6 // G
|
| - pand xmm0, xmm7 // R
|
| - por xmm3, xmm2 // BG
|
| - por xmm0, xmm3 // BGR
|
| - movdqa xmm3, xmm1 // B next 4 pixels of argb
|
| - movdqa xmm2, xmm1 // G
|
| - pslld xmm1, 8 // R
|
| - psrld xmm3, 3 // B
|
| - psrld xmm2, 5 // G
|
| - psrad xmm1, 16 // R
|
| - pand xmm3, xmm5 // B
|
| - pand xmm2, xmm6 // G
|
| - pand xmm1, xmm7 // R
|
| - por xmm3, xmm2 // BG
|
| - por xmm1, xmm3 // BGR
|
| - packssdw xmm0, xmm1
|
| sub ecx, 8
|
| - movdqu [edx], xmm0 // store 8 pixels of RGB565
|
| - lea edx, [edx + 16]
|
| jg convertloop
|
|
|
| pop edi
|
| @@ -2602,7 +2219,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
|
| }
|
| }
|
|
|
| -// 8 pixels, dest aligned 16.
|
| +// 8 pixels.
|
| // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| __declspec(naked) __declspec(align(16))
|
| void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| @@ -2620,22 +2237,12 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edi, esi
|
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
|
|
| - align 4
|
| convertloop:
|
| READYUV422
|
| - YUVTORGB
|
| + YUVTORGB(kYuvConstants)
|
| + STOREARGB
|
|
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm5 // RA
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
|
|
| @@ -2645,491 +2252,119 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
| }
|
| }
|
|
|
| -// 8 pixels, dest aligned 16.
|
| +// 8 pixels.
|
| // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| // Similar to I420 but duplicate UV once more.
|
| __declspec(naked) __declspec(align(16))
|
| void I411ToARGBRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| - uint8* dst_argb,
|
| - int width) {
|
| - __asm {
|
| - push ebx
|
| - push esi
|
| - push edi
|
| - mov eax, [esp + 12 + 4] // Y
|
| - mov esi, [esp + 12 + 8] // U
|
| - mov edi, [esp + 12 + 12] // V
|
| - mov edx, [esp + 12 + 16] // argb
|
| - mov ecx, [esp + 12 + 20] // width
|
| - sub edi, esi
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
| -
|
| - align 4
|
| - convertloop:
|
| - READYUV411 // modifies EBX
|
| - YUVTORGB
|
| -
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm5 // RA
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| - sub ecx, 8
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - pop ebx
|
| - ret
|
| - }
|
| -}
|
| -
|
| -// 8 pixels, dest aligned 16.
|
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| -void NV12ToARGBRow_SSSE3(const uint8* y_buf,
|
| - const uint8* uv_buf,
|
| - uint8* dst_argb,
|
| - int width) {
|
| - __asm {
|
| - push esi
|
| - mov eax, [esp + 4 + 4] // Y
|
| - mov esi, [esp + 4 + 8] // UV
|
| - mov edx, [esp + 4 + 12] // argb
|
| - mov ecx, [esp + 4 + 16] // width
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
| -
|
| - align 4
|
| - convertloop:
|
| - READNV12
|
| - YUVTORGB
|
| -
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm5 // RA
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| - sub ecx, 8
|
| - jg convertloop
|
| -
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -// 8 pixels, dest aligned 16.
|
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| -void NV21ToARGBRow_SSSE3(const uint8* y_buf,
|
| - const uint8* uv_buf,
|
| - uint8* dst_argb,
|
| - int width) {
|
| - __asm {
|
| - push esi
|
| - mov eax, [esp + 4 + 4] // Y
|
| - mov esi, [esp + 4 + 8] // VU
|
| - mov edx, [esp + 4 + 12] // argb
|
| - mov ecx, [esp + 4 + 16] // width
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
| -
|
| - align 4
|
| - convertloop:
|
| - READNV12
|
| - YVUTORGB
|
| -
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm5 // RA
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| - sub ecx, 8
|
| - jg convertloop
|
| -
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -// 8 pixels, unaligned.
|
| -// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| -void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
| - const uint8* u_buf,
|
| - const uint8* v_buf,
|
| - uint8* dst_argb,
|
| - int width) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov eax, [esp + 8 + 4] // Y
|
| - mov esi, [esp + 8 + 8] // U
|
| - mov edi, [esp + 8 + 12] // V
|
| - mov edx, [esp + 8 + 16] // argb
|
| - mov ecx, [esp + 8 + 20] // width
|
| - sub edi, esi
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
| -
|
| - align 4
|
| - convertloop:
|
| - READYUV444
|
| - YUVTORGB
|
| -
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm5 // RA
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
| - movdqu [edx], xmm0
|
| - movdqu [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| - sub ecx, 8
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -// 8 pixels, unaligned.
|
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| -void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
| - const uint8* u_buf,
|
| - const uint8* v_buf,
|
| - uint8* dst_argb,
|
| - int width) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov eax, [esp + 8 + 4] // Y
|
| - mov esi, [esp + 8 + 8] // U
|
| - mov edi, [esp + 8 + 12] // V
|
| - mov edx, [esp + 8 + 16] // argb
|
| - mov ecx, [esp + 8 + 20] // width
|
| - sub edi, esi
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
| -
|
| - align 4
|
| - convertloop:
|
| - READYUV422
|
| - YUVTORGB
|
| -
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm5 // RA
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
| - movdqu [edx], xmm0
|
| - movdqu [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| - sub ecx, 8
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -// 8 pixels, unaligned.
|
| -// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -// Similar to I420 but duplicate UV once more.
|
| -__declspec(naked) __declspec(align(16))
|
| -void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
| - const uint8* u_buf,
|
| - const uint8* v_buf,
|
| - uint8* dst_argb,
|
| - int width) {
|
| - __asm {
|
| - push ebx
|
| - push esi
|
| - push edi
|
| - mov eax, [esp + 12 + 4] // Y
|
| - mov esi, [esp + 12 + 8] // U
|
| - mov edi, [esp + 12 + 12] // V
|
| - mov edx, [esp + 12 + 16] // argb
|
| - mov ecx, [esp + 12 + 20] // width
|
| - sub edi, esi
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
| -
|
| - align 4
|
| - convertloop:
|
| - READYUV411 // modifies EBX
|
| - YUVTORGB
|
| -
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm5 // RA
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
| - movdqu [edx], xmm0
|
| - movdqu [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| - sub ecx, 8
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - pop ebx
|
| - ret
|
| - }
|
| -}
|
| -
|
| -// 8 pixels, dest aligned 16.
|
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| -void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
| - const uint8* uv_buf,
|
| - uint8* dst_argb,
|
| - int width) {
|
| - __asm {
|
| - push esi
|
| - mov eax, [esp + 4 + 4] // Y
|
| - mov esi, [esp + 4 + 8] // UV
|
| - mov edx, [esp + 4 + 12] // argb
|
| - mov ecx, [esp + 4 + 16] // width
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
| -
|
| - align 4
|
| - convertloop:
|
| - READNV12
|
| - YUVTORGB
|
| -
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm5 // RA
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
| - movdqu [edx], xmm0
|
| - movdqu [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| - sub ecx, 8
|
| - jg convertloop
|
| -
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -// 8 pixels, dest aligned 16.
|
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| -__declspec(naked) __declspec(align(16))
|
| -void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
| - const uint8* uv_buf,
|
| - uint8* dst_argb,
|
| - int width) {
|
| - __asm {
|
| - push esi
|
| - mov eax, [esp + 4 + 4] // Y
|
| - mov esi, [esp + 4 + 8] // VU
|
| - mov edx, [esp + 4 + 12] // argb
|
| - mov ecx, [esp + 4 + 16] // width
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
| -
|
| - align 4
|
| - convertloop:
|
| - READNV12
|
| - YVUTORGB
|
| -
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm0, xmm1 // BG
|
| - punpcklbw xmm2, xmm5 // RA
|
| - movdqa xmm1, xmm0
|
| - punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
| - punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
| - movdqu [edx], xmm0
|
| - movdqu [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| - sub ecx, 8
|
| - jg convertloop
|
| -
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void I422ToBGRARow_SSSE3(const uint8* y_buf,
|
| - const uint8* u_buf,
|
| - const uint8* v_buf,
|
| - uint8* dst_bgra,
|
| + uint8* dst_argb,
|
| int width) {
|
| __asm {
|
| + push ebx
|
| push esi
|
| push edi
|
| - mov eax, [esp + 8 + 4] // Y
|
| - mov esi, [esp + 8 + 8] // U
|
| - mov edi, [esp + 8 + 12] // V
|
| - mov edx, [esp + 8 + 16] // bgra
|
| - mov ecx, [esp + 8 + 20] // width
|
| + mov eax, [esp + 12 + 4] // Y
|
| + mov esi, [esp + 12 + 8] // U
|
| + mov edi, [esp + 12 + 12] // V
|
| + mov edx, [esp + 12 + 16] // argb
|
| + mov ecx, [esp + 12 + 20] // width
|
| sub edi, esi
|
| - pxor xmm4, xmm4
|
| + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
|
|
| - align 4
|
| convertloop:
|
| - READYUV422
|
| - YUVTORGB
|
| + READYUV411 // modifies EBX
|
| + YUVTORGB(kYuvConstants)
|
| + STOREARGB
|
|
|
| - // Step 3: Weave into BGRA
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - punpcklbw xmm1, xmm0 // GB
|
| - punpcklbw xmm5, xmm2 // AR
|
| - movdqa xmm0, xmm5
|
| - punpcklwd xmm5, xmm1 // BGRA first 4 pixels
|
| - punpckhwd xmm0, xmm1 // BGRA next 4 pixels
|
| - movdqa [edx], xmm5
|
| - movdqa [edx + 16], xmm0
|
| - lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
|
|
| pop edi
|
| pop esi
|
| + pop ebx
|
| ret
|
| }
|
| }
|
|
|
| +// 8 pixels.
|
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| __declspec(naked) __declspec(align(16))
|
| -void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
|
| - const uint8* u_buf,
|
| - const uint8* v_buf,
|
| - uint8* dst_bgra,
|
| - int width) {
|
| +void NV12ToARGBRow_SSSE3(const uint8* y_buf,
|
| + const uint8* uv_buf,
|
| + uint8* dst_argb,
|
| + int width) {
|
| __asm {
|
| push esi
|
| - push edi
|
| - mov eax, [esp + 8 + 4] // Y
|
| - mov esi, [esp + 8 + 8] // U
|
| - mov edi, [esp + 8 + 12] // V
|
| - mov edx, [esp + 8 + 16] // bgra
|
| - mov ecx, [esp + 8 + 20] // width
|
| - sub edi, esi
|
| - pxor xmm4, xmm4
|
| + mov eax, [esp + 4 + 4] // Y
|
| + mov esi, [esp + 4 + 8] // UV
|
| + mov edx, [esp + 4 + 12] // argb
|
| + mov ecx, [esp + 4 + 16] // width
|
| + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
|
|
| - align 4
|
| convertloop:
|
| - READYUV422
|
| - YUVTORGB
|
| + READNV12
|
| + YUVTORGB(kYuvConstants)
|
| + STOREARGB
|
|
|
| - // Step 3: Weave into BGRA
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - punpcklbw xmm1, xmm0 // GB
|
| - punpcklbw xmm5, xmm2 // AR
|
| - movdqa xmm0, xmm5
|
| - punpcklwd xmm5, xmm1 // BGRA first 4 pixels
|
| - punpckhwd xmm0, xmm1 // BGRA next 4 pixels
|
| - movdqu [edx], xmm5
|
| - movdqu [edx + 16], xmm0
|
| - lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
|
|
| - pop edi
|
| pop esi
|
| ret
|
| }
|
| }
|
|
|
| +// 8 pixels.
|
| +// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
|
| __declspec(naked) __declspec(align(16))
|
| -void I422ToABGRRow_SSSE3(const uint8* y_buf,
|
| - const uint8* u_buf,
|
| - const uint8* v_buf,
|
| - uint8* dst_abgr,
|
| +void NV21ToARGBRow_SSSE3(const uint8* y_buf,
|
| + const uint8* uv_buf,
|
| + uint8* dst_argb,
|
| int width) {
|
| __asm {
|
| push esi
|
| - push edi
|
| - mov eax, [esp + 8 + 4] // Y
|
| - mov esi, [esp + 8 + 8] // U
|
| - mov edi, [esp + 8 + 12] // V
|
| - mov edx, [esp + 8 + 16] // abgr
|
| - mov ecx, [esp + 8 + 20] // width
|
| - sub edi, esi
|
| + mov eax, [esp + 4 + 4] // Y
|
| + mov esi, [esp + 4 + 8] // UV
|
| + mov edx, [esp + 4 + 12] // argb
|
| + mov ecx, [esp + 4 + 16] // width
|
| pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
|
|
| - align 4
|
| convertloop:
|
| - READYUV422
|
| - YUVTORGB
|
| + READNV12
|
| + YUVTORGB(kYvuConstants)
|
| + STOREARGB
|
|
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm2, xmm1 // RG
|
| - punpcklbw xmm0, xmm5 // BA
|
| - movdqa xmm1, xmm2
|
| - punpcklwd xmm2, xmm0 // RGBA first 4 pixels
|
| - punpckhwd xmm1, xmm0 // RGBA next 4 pixels
|
| - movdqa [edx], xmm2
|
| - movdqa [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
|
|
| - pop edi
|
| pop esi
|
| ret
|
| }
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
|
| - const uint8* u_buf,
|
| - const uint8* v_buf,
|
| - uint8* dst_abgr,
|
| - int width) {
|
| +void I422ToBGRARow_SSSE3(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + uint8* dst_bgra,
|
| + int width) {
|
| __asm {
|
| push esi
|
| push edi
|
| mov eax, [esp + 8 + 4] // Y
|
| mov esi, [esp + 8 + 8] // U
|
| mov edi, [esp + 8 + 12] // V
|
| - mov edx, [esp + 8 + 16] // abgr
|
| + mov edx, [esp + 8 + 16] // bgra
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edi, esi
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - pxor xmm4, xmm4
|
|
|
| - align 4
|
| convertloop:
|
| READYUV422
|
| - YUVTORGB
|
| + YUVTORGB(kYuvConstants)
|
| + STOREBGRA
|
|
|
| - // Step 3: Weave into ARGB
|
| - punpcklbw xmm2, xmm1 // RG
|
| - punpcklbw xmm0, xmm5 // BA
|
| - movdqa xmm1, xmm2
|
| - punpcklwd xmm2, xmm0 // RGBA first 4 pixels
|
| - punpckhwd xmm1, xmm0 // RGBA next 4 pixels
|
| - movdqu [edx], xmm2
|
| - movdqu [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
|
|
| @@ -3140,10 +2375,10 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void I422ToRGBARow_SSSE3(const uint8* y_buf,
|
| +void I422ToABGRRow_SSSE3(const uint8* y_buf,
|
| const uint8* u_buf,
|
| const uint8* v_buf,
|
| - uint8* dst_rgba,
|
| + uint8* dst_abgr,
|
| int width) {
|
| __asm {
|
| push esi
|
| @@ -3151,26 +2386,16 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
|
| mov eax, [esp + 8 + 4] // Y
|
| mov esi, [esp + 8 + 8] // U
|
| mov edi, [esp + 8 + 12] // V
|
| - mov edx, [esp + 8 + 16] // rgba
|
| + mov edx, [esp + 8 + 16] // abgr
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edi, esi
|
| - pxor xmm4, xmm4
|
| + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
|
|
| - align 4
|
| convertloop:
|
| READYUV422
|
| - YUVTORGB
|
| + YUVTORGB(kYuvConstants)
|
| + STOREABGR
|
|
|
| - // Step 3: Weave into RGBA
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - punpcklbw xmm1, xmm2 // GR
|
| - punpcklbw xmm5, xmm0 // AB
|
| - movdqa xmm0, xmm5
|
| - punpcklwd xmm5, xmm1 // RGBA first 4 pixels
|
| - punpckhwd xmm0, xmm1 // RGBA next 4 pixels
|
| - movdqa [edx], xmm5
|
| - movdqa [edx + 16], xmm0
|
| - lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
|
|
| @@ -3181,11 +2406,11 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
|
| - const uint8* u_buf,
|
| - const uint8* v_buf,
|
| - uint8* dst_rgba,
|
| - int width) {
|
| +void I422ToRGBARow_SSSE3(const uint8* y_buf,
|
| + const uint8* u_buf,
|
| + const uint8* v_buf,
|
| + uint8* dst_rgba,
|
| + int width) {
|
| __asm {
|
| push esi
|
| push edi
|
| @@ -3195,23 +2420,12 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
|
| mov edx, [esp + 8 + 16] // rgba
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edi, esi
|
| - pxor xmm4, xmm4
|
|
|
| - align 4
|
| convertloop:
|
| READYUV422
|
| - YUVTORGB
|
| + YUVTORGB(kYuvConstants)
|
| + STORERGBA
|
|
|
| - // Step 3: Weave into RGBA
|
| - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
| - punpcklbw xmm1, xmm2 // GR
|
| - punpcklbw xmm5, xmm0 // AB
|
| - movdqa xmm0, xmm5
|
| - punpcklwd xmm5, xmm1 // RGBA first 4 pixels
|
| - punpckhwd xmm0, xmm1 // RGBA next 4 pixels
|
| - movdqu [edx], xmm5
|
| - movdqu [edx + 16], xmm0
|
| - lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
|
|
| @@ -3224,32 +2438,32 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
|
| #endif // HAS_I422TOARGBROW_SSSE3
|
|
|
| #ifdef HAS_YTOARGBROW_SSE2
|
| +// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
|
| __declspec(naked) __declspec(align(16))
|
| void YToARGBRow_SSE2(const uint8* y_buf,
|
| uint8* rgb_buf,
|
| int width) {
|
| __asm {
|
| - pxor xmm5, xmm5
|
| - pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
| - pslld xmm4, 24
|
| - mov eax, 0x00100010
|
| - movd xmm3, eax
|
| - pshufd xmm3, xmm3, 0
|
| - mov eax, 0x004a004a // 74
|
| + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
|
| movd xmm2, eax
|
| pshufd xmm2, xmm2,0
|
| + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
|
| + movd xmm3, eax
|
| + pshufd xmm3, xmm3, 0
|
| + pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
| + pslld xmm4, 24
|
| +
|
| mov eax, [esp + 4] // Y
|
| mov edx, [esp + 8] // rgb
|
| mov ecx, [esp + 12] // width
|
|
|
| - align 4
|
| convertloop:
|
| // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
|
| movq xmm0, qword ptr [eax]
|
| lea eax, [eax + 8]
|
| - punpcklbw xmm0, xmm5 // 0.Y
|
| + punpcklbw xmm0, xmm0 // Y.Y
|
| + pmulhuw xmm0, xmm2
|
| psubusw xmm0, xmm3
|
| - pmullw xmm0, xmm2
|
| psrlw xmm0, 6
|
| packuswb xmm0, xmm0 // G
|
|
|
| @@ -3260,23 +2474,74 @@ void YToARGBRow_SSE2(const uint8* y_buf,
|
| punpckhwd xmm1, xmm1 // BGRA next 4 pixels
|
| por xmm0, xmm4
|
| por xmm1, xmm4
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| + movdqu [edx], xmm0
|
| + movdqu [edx + 16], xmm1
|
| lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
| -
|
| ret
|
| }
|
| }
|
| #endif // HAS_YTOARGBROW_SSE2
|
|
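| The two immediates loaded into xmm2/xmm3 above implement (y - 16) * 1.164 entirely in fixed point. A scalar sketch of that arithmetic (illustrative only):
|
| #include <stdint.h>
|
| // Duplicate y into both bytes (y * 0x0101), keep the high word of the
| // multiply by 0x4a35, subtract the 0x0488 bias with unsigned saturation,
| // then shift back by 6 and saturate to 8 bits, as the kernel does.
| static inline uint8_t ScaleY_Sketch(uint8_t y) {
|   int g = (int)(((uint32_t)(y * 0x0101) * 0x4a35u) >> 16);
|   g -= 0x0488;
|   if (g < 0) g = 0;                      // psubusw saturates at zero
|   g >>= 6;
|   return (uint8_t)(g > 255 ? 255 : g);   // packuswb saturates at 255
| }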
|
| +#ifdef HAS_YTOARGBROW_AVX2
|
| +// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
|
| +// note: vpunpcklbw mutates and vpackuswb unmutates.
|
| +__declspec(naked) __declspec(align(16))
|
| +void YToARGBRow_AVX2(const uint8* y_buf,
|
| + uint8* rgb_buf,
|
| + int width) {
|
| + __asm {
|
| + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
|
| + vmovd xmm2, eax
|
| + vbroadcastss ymm2, xmm2
|
| + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
|
| + vmovd xmm3, eax
|
| + vbroadcastss ymm3, xmm3
|
| + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
|
| + vpslld ymm4, ymm4, 24
|
| +
|
| + mov eax, [esp + 4] // Y
|
| + mov edx, [esp + 8] // rgb
|
| + mov ecx, [esp + 12] // width
|
| +
|
| + convertloop:
|
| + // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
|
| + vmovdqu xmm0, [eax]
|
| + lea eax, [eax + 16]
|
| + vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
|
| + vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
|
| + vpmulhuw ymm0, ymm0, ymm2
|
| + vpsubusw ymm0, ymm0, ymm3
|
| + vpsrlw ymm0, ymm0, 6
|
| + vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
|
| +
|
| + // TODO(fbarchard): Weave alpha with unpack.
|
| + // Step 2: Weave into ARGB
|
| + vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
|
| + vpermq ymm1, ymm1, 0xd8
|
| + vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
|
| + vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
|
| + vpor ymm0, ymm0, ymm4
|
| + vpor ymm1, ymm1, ymm4
|
| + vmovdqu [edx], ymm0
|
| + vmovdqu [edx + 32], ymm1
|
| + lea edx, [edx + 64]
|
| + sub ecx, 16
|
| + jg convertloop
|
| + vzeroupper
|
| + ret
|
| + }
|
| +}
|
| +#endif // HAS_YTOARGBROW_AVX2
|
| +
|
| #ifdef HAS_MIRRORROW_SSSE3
|
| // Shuffle table for reversing the bytes.
|
| static const uvec8 kShuffleMirror = {
|
| 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
|
| };
|
|
|
| +// TODO(fbarchard): Replace lea with -16 offset.
|
| __declspec(naked) __declspec(align(16))
|
| void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| @@ -3284,15 +2549,13 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
| mov edx, [esp + 8] // dst
|
| mov ecx, [esp + 12] // width
|
| movdqa xmm5, kShuffleMirror
|
| - lea eax, [eax - 16]
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax + ecx]
|
| + movdqu xmm0, [eax - 16 + ecx]
|
| pshufb xmm0, xmm5
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| @@ -3300,29 +2563,21 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
| #endif // HAS_MIRRORROW_SSSE3
|
|
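| MirrorRow_SSSE3 reverses bytes by reading 16 bytes back from the end of the row and shuffling with kShuffleMirror; the AVX2 and SSE2 variants below do the same with wider or shift-based reversals, and the ARGBMirrorRow variants further down reverse whole 4-byte pixels instead. The scalar behaviour being vectorized is simply the following (a reference sketch, not the SIMD code):
|
| #include <stdint.h>
|
| // Reverse bytes within a row (MirrorRow family).
| static void MirrorRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
|   for (int x = 0; x < width; ++x) {
|     dst[x] = src[width - 1 - x];
|   }
| }
|
| // Reverse whole 4-byte pixels, keeping channel order (ARGBMirrorRow family).
| static void ARGBMirrorRow_Sketch(const uint32_t* src, uint32_t* dst,
|                                  int width) {
|   for (int x = 0; x < width; ++x) {
|     dst[x] = src[width - 1 - x];
|   }
| }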
|
| #ifdef HAS_MIRRORROW_AVX2
|
| -// Shuffle table for reversing the bytes.
|
| -static const ulvec8 kShuffleMirror_AVX2 = {
|
| - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
|
| - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
|
| -};
|
| -
|
| __declspec(naked) __declspec(align(16))
|
| void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| mov edx, [esp + 8] // dst
|
| mov ecx, [esp + 12] // width
|
| - vmovdqa ymm5, kShuffleMirror_AVX2
|
| - lea eax, [eax - 32]
|
| + vbroadcastf128 ymm5, kShuffleMirror
|
|
|
| - align 4
|
| convertloop:
|
| - vmovdqu ymm0, [eax + ecx]
|
| + vmovdqu ymm0, [eax - 32 + ecx]
|
| vpshufb ymm0, ymm0, ymm5
|
| vpermq ymm0, ymm0, 0x4e // swap high and low halves
|
| - sub ecx, 32
|
| vmovdqu [edx], ymm0
|
| lea edx, [edx + 32]
|
| + sub ecx, 32
|
| jg convertloop
|
| vzeroupper
|
| ret
|
| @@ -3331,19 +2586,15 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| #endif // HAS_MIRRORROW_AVX2
|
|
|
| #ifdef HAS_MIRRORROW_SSE2
|
| -// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
|
| -// version can not.
|
| __declspec(naked) __declspec(align(16))
|
| void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| mov edx, [esp + 8] // dst
|
| mov ecx, [esp + 12] // width
|
| - lea eax, [eax - 16]
|
|
|
| - align 4
|
| convertloop:
|
| - movdqu xmm0, [eax + ecx]
|
| + movdqu xmm0, [eax - 16 + ecx]
|
| movdqa xmm1, xmm0 // swap bytes
|
| psllw xmm0, 8
|
| psrlw xmm1, 8
|
| @@ -3351,9 +2602,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
| pshuflw xmm0, xmm0, 0x1b // swap words
|
| pshufhw xmm0, xmm0, 0x1b
|
| pshufd xmm0, xmm0, 0x4e // swap qwords
|
| - sub ecx, 16
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| @@ -3379,15 +2630,14 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
| lea eax, [eax + ecx * 2 - 16]
|
| sub edi, edx
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax]
|
| + movdqu xmm0, [eax]
|
| lea eax, [eax - 16]
|
| pshufb xmm0, xmm1
|
| - sub ecx, 8
|
| movlpd qword ptr [edx], xmm0
|
| movhpd qword ptr [edx + edi], xmm0
|
| lea edx, [edx + 8]
|
| + sub ecx, 8
|
| jg convertloop
|
|
|
| pop edi
|
| @@ -3396,34 +2646,27 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
| }
|
| #endif // HAS_MIRRORROW_UV_SSSE3
|
|
|
| -#ifdef HAS_ARGBMIRRORROW_SSSE3
|
| -// Shuffle table for reversing the bytes.
|
| -static const uvec8 kARGBShuffleMirror = {
|
| - 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
|
| -};
|
| -
|
| +#ifdef HAS_ARGBMIRRORROW_SSE2
|
| __declspec(naked) __declspec(align(16))
|
| -void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
| +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
| __asm {
|
| mov eax, [esp + 4] // src
|
| mov edx, [esp + 8] // dst
|
| mov ecx, [esp + 12] // width
|
| lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
|
| - movdqa xmm5, kARGBShuffleMirror
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax]
|
| + movdqu xmm0, [eax]
|
| lea eax, [eax - 16]
|
| - pshufb xmm0, xmm5
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| + pshufd xmm0, xmm0, 0x1b
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg convertloop
|
| ret
|
| }
|
| }
|
| -#endif // HAS_ARGBMIRRORROW_SSSE3
|
| +#endif // HAS_ARGBMIRRORROW_SSE2
|
|
|
| #ifdef HAS_ARGBMIRRORROW_AVX2
|
| // Shuffle table for reversing the bytes.
|
| @@ -3437,15 +2680,13 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| mov eax, [esp + 4] // src
|
| mov edx, [esp + 8] // dst
|
| mov ecx, [esp + 12] // width
|
| - lea eax, [eax - 32]
|
| - vmovdqa ymm5, kARGBShuffleMirror_AVX2
|
| + vmovdqu ymm5, kARGBShuffleMirror_AVX2
|
|
|
| - align 4
|
| convertloop:
|
| - vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
|
| - sub ecx, 8
|
| + vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
|
| vmovdqu [edx], ymm0
|
| lea edx, [edx + 32]
|
| + sub ecx, 8
|
| jg convertloop
|
| vzeroupper
|
| ret
|
| @@ -3466,44 +2707,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
| psrlw xmm5, 8
|
| sub edi, edx
|
|
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - lea eax, [eax + 32]
|
| - movdqa xmm2, xmm0
|
| - movdqa xmm3, xmm1
|
| - pand xmm0, xmm5 // even bytes
|
| - pand xmm1, xmm5
|
| - packuswb xmm0, xmm1
|
| - psrlw xmm2, 8 // odd bytes
|
| - psrlw xmm3, 8
|
| - packuswb xmm2, xmm3
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + edi], xmm2
|
| - lea edx, [edx + 16]
|
| - sub ecx, 16
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
|
| - int pix) {
|
| - __asm {
|
| - push edi
|
| - mov eax, [esp + 4 + 4] // src_uv
|
| - mov edx, [esp + 4 + 8] // dst_u
|
| - mov edi, [esp + 4 + 12] // dst_v
|
| - mov ecx, [esp + 4 + 16] // pix
|
| - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| - psrlw xmm5, 8
|
| - sub edi, edx
|
| -
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -3526,6 +2729,7 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
|
| ret
|
| }
|
| }
|
| +
|
| #endif // HAS_SPLITUVROW_SSE2
|
|
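| SplitUVRow separates an interleaved UV plane into U and V planes; the SSE2 kernel above does this with the 0x00ff mask for even bytes and a shift for odd bytes. The scalar equivalent (sketch):
|
| #include <stdint.h>
|
| // Even bytes of the interleaved plane are U, odd bytes are V.
| static void SplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
|                               uint8_t* dst_v, int pix) {
|   for (int x = 0; x < pix; ++x) {
|     dst_u[x] = src_uv[2 * x];
|     dst_v[x] = src_uv[2 * x + 1];
|   }
| }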
|
| #ifdef HAS_SPLITUVROW_AVX2
|
| @@ -3541,7 +2745,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
| vpsrlw ymm5, ymm5, 8
|
| sub edi, edx
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax]
|
| vmovdqu ymm1, [eax + 32]
|
| @@ -3579,37 +2782,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
|
| mov ecx, [esp + 4 + 16] // width
|
| sub edx, eax
|
|
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax] // read 16 U's
|
| - movdqa xmm1, [eax + edx] // and 16 V's
|
| - lea eax, [eax + 16]
|
| - movdqa xmm2, xmm0
|
| - punpcklbw xmm0, xmm1 // first 8 UV pairs
|
| - punpckhbw xmm2, xmm1 // next 8 UV pairs
|
| - movdqa [edi], xmm0
|
| - movdqa [edi + 16], xmm2
|
| - lea edi, [edi + 32]
|
| - sub ecx, 16
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
|
| - uint8* dst_uv, int width) {
|
| - __asm {
|
| - push edi
|
| - mov eax, [esp + 4 + 4] // src_u
|
| - mov edx, [esp + 4 + 8] // src_v
|
| - mov edi, [esp + 4 + 12] // dst_uv
|
| - mov ecx, [esp + 4 + 16] // width
|
| - sub edx, eax
|
| -
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax] // read 16 U's
|
| movdqu xmm1, [eax + edx] // and 16 V's
|
| @@ -3641,17 +2813,16 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
|
| mov ecx, [esp + 4 + 16] // width
|
| sub edx, eax
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax] // read 32 U's
|
| vmovdqu ymm1, [eax + edx] // and 32 V's
|
| lea eax, [eax + 32]
|
| vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
|
| vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
|
| - vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
|
| - vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
|
| - vmovdqu [edi], ymm1
|
| - vmovdqu [edi + 32], ymm2
|
| + vextractf128 [edi], ymm2, 0 // bytes 0..15
|
| + vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
|
| + vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
|
| + vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
|
| lea edi, [edi + 64]
|
| sub ecx, 32
|
| jg convertloop
|
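| MergeUVRow interleaves a row of U and a row of V into UV pairs; on AVX2 the
| vpunpck results are lane-mutated, so the new code stores the four 128-bit
| halves with vextractf128 in the correct order rather than re-permuting with
| vperm2i128. A scalar sketch (hypothetical helper, same effect as MergeUVRow_C):
|
| static void MergeUVRowSketch(const uint8* src_u, const uint8* src_v,
|                              uint8* dst_uv, int width) {
|   for (int i = 0; i < width; ++i) {
|     dst_uv[0] = src_u[i];  // U first
|     dst_uv[1] = src_v[i];  // then V
|     dst_uv += 2;
|   }
| }
|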
| @@ -3672,13 +2843,12 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
|
| mov edx, [esp + 8] // dst
|
| mov ecx, [esp + 12] // count
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| lea eax, [eax + 32]
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| + movdqu [edx], xmm0
|
| + movdqu [edx + 16], xmm1
|
| lea edx, [edx + 32]
|
| sub ecx, 32
|
| jg convertloop
|
| @@ -3687,39 +2857,46 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
|
| }
|
| #endif // HAS_COPYROW_SSE2
|
|
|
| -// Unaligned Multiple of 1.
|
| +#ifdef HAS_COPYROW_AVX
|
| +// CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
|
| __declspec(naked) __declspec(align(16))
|
| -void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
|
| +void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
|
| __asm {
|
| - mov eax, esi
|
| - mov edx, edi
|
| - mov esi, [esp + 4] // src
|
| - mov edi, [esp + 8] // dst
|
| + mov eax, [esp + 4] // src
|
| + mov edx, [esp + 8] // dst
|
| mov ecx, [esp + 12] // count
|
| - rep movsb
|
| - mov edi, edx
|
| - mov esi, eax
|
| +
|
| + convertloop:
|
| + vmovdqu ymm0, [eax]
|
| + vmovdqu ymm1, [eax + 32]
|
| + lea eax, [eax + 64]
|
| + vmovdqu [edx], ymm0
|
| + vmovdqu [edx + 32], ymm1
|
| + lea edx, [edx + 64]
|
| + sub ecx, 64
|
| + jg convertloop
|
| +
|
| + vzeroupper
|
| ret
|
| }
|
| }
|
| +#endif // HAS_COPYROW_AVX
|
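| CopyRow_AVX is a memcpy specialised for rows whose length is a multiple of 64
| bytes, moving two unaligned 32-byte vectors per iteration. A trivial scalar
| sketch of the contract (hypothetical helper):
|
| static void CopyRowSketch(const uint8* src, uint8* dst, int count) {
|   for (int i = 0; i < count; ++i) {
|     dst[i] = src[i];  // the asm moves 64 bytes per iteration with vmovdqu
|   }
| }
|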
|
|
| -#ifdef HAS_COPYROW_X86
|
| +// Multiple of 1.
|
| __declspec(naked) __declspec(align(16))
|
| -void CopyRow_X86(const uint8* src, uint8* dst, int count) {
|
| +void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
|
| __asm {
|
| mov eax, esi
|
| mov edx, edi
|
| mov esi, [esp + 4] // src
|
| mov edi, [esp + 8] // dst
|
| mov ecx, [esp + 12] // count
|
| - shr ecx, 2
|
| - rep movsd
|
| + rep movsb
|
| mov edi, edx
|
| mov esi, eax
|
| ret
|
| }
|
| }
|
| -#endif // HAS_COPYROW_X86
|
|
|
| #ifdef HAS_ARGBCOPYALPHAROW_SSE2
|
| // width in pixels
|
| @@ -3734,21 +2911,20 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
|
| pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
|
| psrld xmm1, 8
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm2, [eax]
|
| - movdqa xmm3, [eax + 16]
|
| + movdqu xmm2, [eax]
|
| + movdqu xmm3, [eax + 16]
|
| lea eax, [eax + 32]
|
| - movdqa xmm4, [edx]
|
| - movdqa xmm5, [edx + 16]
|
| + movdqu xmm4, [edx]
|
| + movdqu xmm5, [edx + 16]
|
| pand xmm2, xmm0
|
| pand xmm3, xmm0
|
| pand xmm4, xmm1
|
| pand xmm5, xmm1
|
| por xmm2, xmm4
|
| por xmm3, xmm5
|
| - movdqa [edx], xmm2
|
| - movdqa [edx + 16], xmm3
|
| + movdqu [edx], xmm2
|
| + movdqu [edx + 16], xmm3
|
| lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
| @@ -3769,7 +2945,6 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| vpcmpeqb ymm0, ymm0, ymm0
|
| vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm1, [eax]
|
| vmovdqu ymm2, [eax + 32]
|
| @@ -3801,23 +2976,22 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
|
| pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
|
| psrld xmm1, 8
|
|
|
| - align 4
|
| convertloop:
|
| movq xmm2, qword ptr [eax] // 8 Y's
|
| lea eax, [eax + 8]
|
| punpcklbw xmm2, xmm2
|
| punpckhwd xmm3, xmm2
|
| punpcklwd xmm2, xmm2
|
| - movdqa xmm4, [edx]
|
| - movdqa xmm5, [edx + 16]
|
| + movdqu xmm4, [edx]
|
| + movdqu xmm5, [edx + 16]
|
| pand xmm2, xmm0
|
| pand xmm3, xmm0
|
| pand xmm4, xmm1
|
| pand xmm5, xmm1
|
| por xmm2, xmm4
|
| por xmm3, xmm5
|
| - movdqa [edx], xmm2
|
| - movdqa [edx + 16], xmm3
|
| + movdqu [edx], xmm2
|
| + movdqu [edx + 16], xmm3
|
| lea edx, [edx + 32]
|
| sub ecx, 8
|
| jg convertloop
|
| @@ -3838,7 +3012,6 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| vpcmpeqb ymm0, ymm0, ymm0
|
| vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
|
|
|
| - align 4
|
| convertloop:
|
| vpmovzxbd ymm1, qword ptr [eax]
|
| vpmovzxbd ymm2, qword ptr [eax + 8]
|
| @@ -3860,13 +3033,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
|
| #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
|
|
|
| #ifdef HAS_SETROW_X86
|
| -// SetRow8 writes 'count' bytes using a 32 bit value repeated.
|
| +// Write 'count' bytes using an 8 bit value repeated.
|
| +// Count should be a multiple of 4.
|
| __declspec(naked) __declspec(align(16))
|
| -void SetRow_X86(uint8* dst, uint32 v32, int count) {
|
| +void SetRow_X86(uint8* dst, uint8 v8, int count) {
|
| __asm {
|
| + movzx eax, byte ptr [esp + 8] // v8
|
| + mov edx, 0x01010101 // Duplicate byte to all bytes.
|
| + mul edx // overwrites edx with upper part of result.
|
| mov edx, edi
|
| mov edi, [esp + 4] // dst
|
| - mov eax, [esp + 8] // v32
|
| mov ecx, [esp + 12] // count
|
| shr ecx, 2
|
| rep stosd
|
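| SetRow_X86 now takes an 8-bit value: multiplying it by 0x01010101 replicates
| the byte into all four bytes of eax so rep stosd can store four bytes at a
| time, which is why count should be a multiple of 4. A scalar sketch of the
| trick (hypothetical helper, assuming libyuv's uint8/uint32 typedefs):
|
| static void SetRowSketch(uint8* dst, uint8 v8, int count) {
|   uint32 v32 = (uint32)v8 * 0x01010101u;  // e.g. 0x5A -> 0x5A5A5A5A
|   uint32* d = (uint32*)dst;
|   for (int i = 0; i < count / 4; ++i) {   // count assumed a multiple of 4
|     d[i] = v32;
|   }
| }
|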
| @@ -3875,33 +3051,30 @@ void SetRow_X86(uint8* dst, uint32 v32, int count) {
|
| }
|
| }
|
|
|
| -// SetRow32 writes 'count' words using a 32 bit value repeated.
|
| +// Write 'count' bytes using an 8 bit value repeated.
|
| +__declspec(naked) __declspec(align(16))
|
| +void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
|
| + __asm {
|
| + mov edx, edi
|
| + mov edi, [esp + 4] // dst
|
| + mov eax, [esp + 8] // v8
|
| + mov ecx, [esp + 12] // count
|
| + rep stosb
|
| + mov edi, edx
|
| + ret
|
| + }
|
| +}
|
| +
|
| +// Write 'count' 32 bit values.
|
| __declspec(naked) __declspec(align(16))
|
| -void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
|
| - int dst_stride, int height) {
|
| +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
|
| __asm {
|
| - push esi
|
| - push edi
|
| - push ebp
|
| - mov edi, [esp + 12 + 4] // dst
|
| - mov eax, [esp + 12 + 8] // v32
|
| - mov ebp, [esp + 12 + 12] // width
|
| - mov edx, [esp + 12 + 16] // dst_stride
|
| - mov esi, [esp + 12 + 20] // height
|
| - lea ecx, [ebp * 4]
|
| - sub edx, ecx // stride - width * 4
|
| -
|
| - align 4
|
| - convertloop:
|
| - mov ecx, ebp
|
| + mov edx, edi
|
| + mov edi, [esp + 4] // dst
|
| + mov eax, [esp + 8] // v32
|
| + mov ecx, [esp + 12] // count
|
| rep stosd
|
| - add edi, edx
|
| - sub esi, 1
|
| - jg convertloop
|
| -
|
| - pop ebp
|
| - pop edi
|
| - pop esi
|
| + mov edi, edx
|
| ret
|
| }
|
| }
|
| @@ -3918,7 +3091,6 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
|
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
|
| vpsrlw ymm5, ymm5, 8
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax]
|
| vmovdqu ymm1, [eax + 32]
|
| @@ -3927,9 +3099,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
|
| vpand ymm1, ymm1, ymm5
|
| vpackuswb ymm0, ymm0, ymm1 // mutates.
|
| vpermq ymm0, ymm0, 0xd8
|
| - sub ecx, 32
|
| vmovdqu [edx], ymm0
|
| lea edx, [edx + 32]
|
| + sub ecx, 32
|
| jg convertloop
|
| vzeroupper
|
| ret
|
| @@ -3951,7 +3123,6 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
|
| vpsrlw ymm5, ymm5, 8
|
| sub edi, edx
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax]
|
| vmovdqu ymm1, [eax + 32]
|
| @@ -3994,7 +3165,6 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
|
| vpsrlw ymm5, ymm5, 8
|
| sub edi, edx
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax]
|
| vmovdqu ymm1, [eax + 32]
|
| @@ -4029,7 +3199,6 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
|
| mov edx, [esp + 8] // dst_y
|
| mov ecx, [esp + 12] // pix
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax]
|
| vmovdqu ymm1, [eax + 32]
|
| @@ -4038,12 +3207,12 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
|
| vpsrlw ymm1, ymm1, 8
|
| vpackuswb ymm0, ymm0, ymm1 // mutates.
|
| vpermq ymm0, ymm0, 0xd8
|
| - sub ecx, 32
|
| vmovdqu [edx], ymm0
|
| lea edx, [edx + 32]
|
| + sub ecx, 32
|
| jg convertloop
|
| - ret
|
| vzeroupper
|
| + ret
|
| }
|
| }
|
|
|
| @@ -4062,7 +3231,6 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
|
| vpsrlw ymm5, ymm5, 8
|
| sub edi, edx
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax]
|
| vmovdqu ymm1, [eax + 32]
|
| @@ -4105,7 +3273,6 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
|
| vpsrlw ymm5, ymm5, 8
|
| sub edi, edx
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax]
|
| vmovdqu ymm1, [eax + 32]
|
| @@ -4144,114 +3311,6 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
|
| pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| psrlw xmm5, 8
|
|
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - lea eax, [eax + 32]
|
| - pand xmm0, xmm5 // even bytes are Y
|
| - pand xmm1, xmm5
|
| - packuswb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - jg convertloop
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
| - uint8* dst_u, uint8* dst_v, int pix) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov eax, [esp + 8 + 4] // src_yuy2
|
| - mov esi, [esp + 8 + 8] // stride_yuy2
|
| - mov edx, [esp + 8 + 12] // dst_u
|
| - mov edi, [esp + 8 + 16] // dst_v
|
| - mov ecx, [esp + 8 + 20] // pix
|
| - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| - psrlw xmm5, 8
|
| - sub edi, edx
|
| -
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + esi]
|
| - movdqa xmm3, [eax + esi + 16]
|
| - lea eax, [eax + 32]
|
| - pavgb xmm0, xmm2
|
| - pavgb xmm1, xmm3
|
| - psrlw xmm0, 8 // YUYV -> UVUV
|
| - psrlw xmm1, 8
|
| - packuswb xmm0, xmm1
|
| - movdqa xmm1, xmm0
|
| - pand xmm0, xmm5 // U
|
| - packuswb xmm0, xmm0
|
| - psrlw xmm1, 8 // V
|
| - packuswb xmm1, xmm1
|
| - movq qword ptr [edx], xmm0
|
| - movq qword ptr [edx + edi], xmm1
|
| - lea edx, [edx + 8]
|
| - sub ecx, 16
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
|
| - uint8* dst_u, uint8* dst_v, int pix) {
|
| - __asm {
|
| - push edi
|
| - mov eax, [esp + 4 + 4] // src_yuy2
|
| - mov edx, [esp + 4 + 8] // dst_u
|
| - mov edi, [esp + 4 + 12] // dst_v
|
| - mov ecx, [esp + 4 + 16] // pix
|
| - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| - psrlw xmm5, 8
|
| - sub edi, edx
|
| -
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - lea eax, [eax + 32]
|
| - psrlw xmm0, 8 // YUYV -> UVUV
|
| - psrlw xmm1, 8
|
| - packuswb xmm0, xmm1
|
| - movdqa xmm1, xmm0
|
| - pand xmm0, xmm5 // U
|
| - packuswb xmm0, xmm0
|
| - psrlw xmm1, 8 // V
|
| - packuswb xmm1, xmm1
|
| - movq qword ptr [edx], xmm0
|
| - movq qword ptr [edx + edi], xmm1
|
| - lea edx, [edx + 8]
|
| - sub ecx, 16
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
|
| - uint8* dst_y, int pix) {
|
| - __asm {
|
| - mov eax, [esp + 4] // src_yuy2
|
| - mov edx, [esp + 8] // dst_y
|
| - mov ecx, [esp + 12] // pix
|
| - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| - psrlw xmm5, 8
|
| -
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -4259,17 +3318,17 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
|
| pand xmm0, xmm5 // even bytes are Y
|
| pand xmm1, xmm5
|
| packuswb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| }
|
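| YUY2 packs pixels as Y0 U Y1 V, so the even bytes carry luma and the odd bytes
| carry interleaved chroma; the row functions mask (pand) or shift (psrlw) to
| keep one or the other. A scalar sketch of the luma extraction (hypothetical
| helper, pix assumed even):
|
| static void YUY2ToYRowSketch(const uint8* src_yuy2, uint8* dst_y, int pix) {
|   for (int i = 0; i < pix; ++i) {
|     dst_y[i] = src_yuy2[i * 2];  // Y0, Y1, ... skipping the U/V bytes
|   }
| }
|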
|
|
| __declspec(naked) __declspec(align(16))
|
| -void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
| - uint8* dst_u, uint8* dst_v, int pix) {
|
| +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
| + uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| push esi
|
| push edi
|
| @@ -4282,7 +3341,6 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
| psrlw xmm5, 8
|
| sub edi, edx
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -4312,8 +3370,8 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
|
| - uint8* dst_u, uint8* dst_v, int pix) {
|
| +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
|
| + uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| push edi
|
| mov eax, [esp + 4 + 4] // src_yuy2
|
| @@ -4324,7 +3382,6 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
|
| psrlw xmm5, 8
|
| sub edi, edx
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -4356,112 +3413,6 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
|
| mov edx, [esp + 8] // dst_y
|
| mov ecx, [esp + 12] // pix
|
|
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - lea eax, [eax + 32]
|
| - psrlw xmm0, 8 // odd bytes are Y
|
| - psrlw xmm1, 8
|
| - packuswb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - jg convertloop
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
| - uint8* dst_u, uint8* dst_v, int pix) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov eax, [esp + 8 + 4] // src_yuy2
|
| - mov esi, [esp + 8 + 8] // stride_yuy2
|
| - mov edx, [esp + 8 + 12] // dst_u
|
| - mov edi, [esp + 8 + 16] // dst_v
|
| - mov ecx, [esp + 8 + 20] // pix
|
| - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| - psrlw xmm5, 8
|
| - sub edi, edx
|
| -
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + esi]
|
| - movdqa xmm3, [eax + esi + 16]
|
| - lea eax, [eax + 32]
|
| - pavgb xmm0, xmm2
|
| - pavgb xmm1, xmm3
|
| - pand xmm0, xmm5 // UYVY -> UVUV
|
| - pand xmm1, xmm5
|
| - packuswb xmm0, xmm1
|
| - movdqa xmm1, xmm0
|
| - pand xmm0, xmm5 // U
|
| - packuswb xmm0, xmm0
|
| - psrlw xmm1, 8 // V
|
| - packuswb xmm1, xmm1
|
| - movq qword ptr [edx], xmm0
|
| - movq qword ptr [edx + edi], xmm1
|
| - lea edx, [edx + 8]
|
| - sub ecx, 16
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
|
| - uint8* dst_u, uint8* dst_v, int pix) {
|
| - __asm {
|
| - push edi
|
| - mov eax, [esp + 4 + 4] // src_yuy2
|
| - mov edx, [esp + 4 + 8] // dst_u
|
| - mov edi, [esp + 4 + 12] // dst_v
|
| - mov ecx, [esp + 4 + 16] // pix
|
| - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
| - psrlw xmm5, 8
|
| - sub edi, edx
|
| -
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - lea eax, [eax + 32]
|
| - pand xmm0, xmm5 // UYVY -> UVUV
|
| - pand xmm1, xmm5
|
| - packuswb xmm0, xmm1
|
| - movdqa xmm1, xmm0
|
| - pand xmm0, xmm5 // U
|
| - packuswb xmm0, xmm0
|
| - psrlw xmm1, 8 // V
|
| - packuswb xmm1, xmm1
|
| - movq qword ptr [edx], xmm0
|
| - movq qword ptr [edx + edi], xmm1
|
| - lea edx, [edx + 8]
|
| - sub ecx, 16
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
|
| - uint8* dst_y, int pix) {
|
| - __asm {
|
| - mov eax, [esp + 4] // src_uyvy
|
| - mov edx, [esp + 8] // dst_y
|
| - mov ecx, [esp + 12] // pix
|
| -
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -4469,17 +3420,17 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
|
| psrlw xmm0, 8 // odd bytes are Y
|
| psrlw xmm1, 8
|
| packuswb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
| ret
|
| }
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
| - uint8* dst_u, uint8* dst_v, int pix) {
|
| +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
| + uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| push esi
|
| push edi
|
| @@ -4492,7 +3443,6 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
| psrlw xmm5, 8
|
| sub edi, edx
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -4522,8 +3472,8 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
| }
|
|
|
| __declspec(naked) __declspec(align(16))
|
| -void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
|
| - uint8* dst_u, uint8* dst_v, int pix) {
|
| +void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
|
| + uint8* dst_u, uint8* dst_v, int pix) {
|
| __asm {
|
| push edi
|
| mov eax, [esp + 4 + 4] // src_yuy2
|
| @@ -4534,7 +3484,6 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
|
| psrlw xmm5, 8
|
| sub edi, edx
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| @@ -4607,9 +3556,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| paddusb xmm0, xmm2 // + src argb
|
| pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
| paddusb xmm0, xmm1 // + src argb
|
| - sub ecx, 1
|
| movd [edx], xmm0
|
| lea edx, [edx + 4]
|
| + sub ecx, 1
|
| jge alignloop1
|
|
|
| alignloop1b:
|
| @@ -4638,9 +3587,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| paddusb xmm0, xmm2 // + src argb
|
| pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
| paddusb xmm0, xmm1 // + src argb
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jge convertloop4
|
|
|
| convertloop4b:
|
| @@ -4669,9 +3618,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| paddusb xmm0, xmm2 // + src argb
|
| pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
| paddusb xmm0, xmm1 // + src argb
|
| - sub ecx, 1
|
| movd [edx], xmm0
|
| lea edx, [edx + 4]
|
| + sub ecx, 1
|
| jge convertloop1
|
|
|
| convertloop1b:
|
| @@ -4739,48 +3688,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
| paddusb xmm0, xmm2 // + src argb
|
| pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
| paddusb xmm0, xmm1 // + src argb
|
| - sub ecx, 1
|
| movd [edx], xmm0
|
| lea edx, [edx + 4]
|
| + sub ecx, 1
|
| jge alignloop1
|
|
|
| alignloop1b:
|
| add ecx, 1 - 4
|
| jl convertloop4b
|
|
|
| - test eax, 15 // unaligned?
|
| - jne convertuloop4
|
| - test esi, 15 // unaligned?
|
| - jne convertuloop4
|
| -
|
| // 4 pixel loop.
|
| convertloop4:
|
| - movdqa xmm3, [eax] // src argb
|
| - lea eax, [eax + 16]
|
| - movdqa xmm0, xmm3 // src argb
|
| - pxor xmm3, xmm4 // ~alpha
|
| - movdqa xmm2, [esi] // _r_b
|
| - pshufb xmm3, kShuffleAlpha // alpha
|
| - pand xmm2, xmm6 // _r_b
|
| - paddw xmm3, xmm7 // 256 - alpha
|
| - pmullw xmm2, xmm3 // _r_b * alpha
|
| - movdqa xmm1, [esi] // _a_g
|
| - lea esi, [esi + 16]
|
| - psrlw xmm1, 8 // _a_g
|
| - por xmm0, xmm4 // set alpha to 255
|
| - pmullw xmm1, xmm3 // _a_g * alpha
|
| - psrlw xmm2, 8 // _r_b convert to 8 bits again
|
| - paddusb xmm0, xmm2 // + src argb
|
| - pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
| - paddusb xmm0, xmm1 // + src argb
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| - lea edx, [edx + 16]
|
| - jge convertloop4
|
| - jmp convertloop4b
|
| -
|
| - // 4 pixel unaligned loop.
|
| - convertuloop4:
|
| movdqu xmm3, [eax] // src argb
|
| lea eax, [eax + 16]
|
| movdqa xmm0, xmm3 // src argb
|
| @@ -4799,10 +3717,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
| paddusb xmm0, xmm2 // + src argb
|
| pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
| paddusb xmm0, xmm1 // + src argb
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| - jge convertuloop4
|
| + sub ecx, 4
|
| + jge convertloop4
|
|
|
| convertloop4b:
|
| add ecx, 4 - 1
|
| @@ -4828,9 +3746,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
| paddusb xmm0, xmm2 // + src argb
|
| pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
| paddusb xmm0, xmm1 // + src argb
|
| - sub ecx, 1
|
| movd [edx], xmm0
|
| lea edx, [edx + 4]
|
| + sub ecx, 1
|
| jge convertloop1
|
|
|
| convertloop1b:
|
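| Both blend rows implement "src over dst" using the source alpha: each colour
| channel becomes roughly src + ((dst * (256 - a)) >> 8) and the result alpha is
| forced to 255. A hedged per-pixel sketch (hypothetical helpers, not the shipped
| ARGBBlendRow_C; the asm saturates with paddusb, modelled here by Clamp255u):
|
| static inline uint32 Clamp255u(uint32 v) { return v > 255 ? 255 : v; }
|
| static inline uint32 BlendPixelSketch(uint32 src, uint32 dst) {
|   uint32 a  = src >> 24;   // source alpha
|   uint32 ia = 256 - a;     // the asm works with 256 - alpha
|   uint32 b = Clamp255u((src & 0xff) + (((dst & 0xff) * ia) >> 8));
|   uint32 g = Clamp255u(((src >> 8) & 0xff) + ((((dst >> 8) & 0xff) * ia) >> 8));
|   uint32 r = Clamp255u(((src >> 16) & 0xff) + ((((dst >> 16) & 0xff) * ia) >> 8));
|   return 0xff000000u | (r << 16) | (g << 8) | b;  // alpha forced to 255
| }
|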
| @@ -4842,7 +3760,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
|
|
| #ifdef HAS_ARGBATTENUATEROW_SSE2
|
| // Attenuate 4 pixels at a time.
|
| -// Aligned to 16 bytes.
|
| __declspec(naked) __declspec(align(16))
|
| void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
| __asm {
|
| @@ -4854,19 +3771,18 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
| pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
|
| psrld xmm5, 8
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // read 4 pixels
|
| + movdqu xmm0, [eax] // read 4 pixels
|
| punpcklbw xmm0, xmm0 // first 2
|
| pshufhw xmm2, xmm0, 0FFh // 8 alpha words
|
| pshuflw xmm2, xmm2, 0FFh
|
| pmulhuw xmm0, xmm2 // rgb * a
|
| - movdqa xmm1, [eax] // read 4 pixels
|
| + movdqu xmm1, [eax] // read 4 pixels
|
| punpckhbw xmm1, xmm1 // next 2 pixels
|
| pshufhw xmm2, xmm1, 0FFh // 8 alpha words
|
| pshuflw xmm2, xmm2, 0FFh
|
| pmulhuw xmm1, xmm2 // rgb * a
|
| - movdqa xmm2, [eax] // alphas
|
| + movdqu xmm2, [eax] // alphas
|
| lea eax, [eax + 16]
|
| psrlw xmm0, 8
|
| pand xmm2, xmm4
|
| @@ -4874,9 +3790,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
| packuswb xmm0, xmm1
|
| pand xmm0, xmm5 // keep original alphas
|
| por xmm0, xmm2
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg convertloop
|
|
|
| ret
|
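| Attenuation premultiplies each colour channel by the pixel's own alpha and
| leaves the alpha byte unchanged; the SSE2 path approximates the divide by 255
| with pmulhuw on byte-doubled values. A hedged scalar sketch (hypothetical
| helper, not the shipped ARGBAttenuateRow_C):
|
| static void ARGBAttenuateRowSketch(const uint8* src_argb, uint8* dst_argb,
|                                    int width) {
|   for (int i = 0; i < width; ++i) {
|     int a = src_argb[3];
|     dst_argb[0] = (uint8)((src_argb[0] * a) / 255);  // B * A
|     dst_argb[1] = (uint8)((src_argb[1] * a) / 255);  // G * A
|     dst_argb[2] = (uint8)((src_argb[2] * a) / 255);  // R * A
|     dst_argb[3] = (uint8)a;                          // alpha unchanged
|     src_argb += 4;
|     dst_argb += 4;
|   }
| }
|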
| @@ -4904,7 +3820,6 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
| movdqa xmm4, kShuffleAlpha0
|
| movdqa xmm5, kShuffleAlpha1
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax] // read 4 pixels
|
| pshufb xmm0, xmm4 // isolate first 2 alphas
|
| @@ -4923,9 +3838,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
| psrlw xmm1, 8
|
| packuswb xmm0, xmm1
|
| por xmm0, xmm2 // copy original alpha
|
| - sub ecx, 4
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg convertloop
|
|
|
| ret
|
| @@ -4935,11 +3850,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
|
|
| #ifdef HAS_ARGBATTENUATEROW_AVX2
|
| // Shuffle table duplicating alpha.
|
| -static const ulvec8 kShuffleAlpha_AVX2 = {
|
| - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
|
| - 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
|
| - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
|
| - 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
|
| +static const uvec8 kShuffleAlpha_AVX2 = {
|
| + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
|
| };
|
| __declspec(naked) __declspec(align(16))
|
| void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
| @@ -4948,11 +3860,10 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
| mov edx, [esp + 8] // dst_argb
|
| mov ecx, [esp + 12] // width
|
| sub edx, eax
|
| - vmovdqa ymm4, kShuffleAlpha_AVX2
|
| + vbroadcastf128 ymm4, kShuffleAlpha_AVX2
|
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
|
| vpslld ymm5, ymm5, 24
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm6, [eax] // read 8 pixels.
|
| vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
|
| @@ -4966,9 +3877,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
| vpsrlw ymm1, ymm1, 8
|
| vpackuswb ymm0, ymm0, ymm1 // unmutated.
|
| vpor ymm0, ymm0, ymm6 // copy original alpha
|
| - sub ecx, 8
|
| vmovdqu [eax + edx], ymm0
|
| lea eax, [eax + 32]
|
| + sub ecx, 8
|
| jg convertloop
|
|
|
| vzeroupper
|
| @@ -4979,7 +3890,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
|
|
| #ifdef HAS_ARGBUNATTENUATEROW_SSE2
|
| // Unattenuate 4 pixels at a time.
|
| -// Aligned to 16 bytes.
|
| __declspec(naked) __declspec(align(16))
|
| void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| int width) {
|
| @@ -4990,7 +3900,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| mov edx, [esp + 8 + 8] // dst_argb
|
| mov ecx, [esp + 8 + 12] // width
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax] // read 4 pixels
|
| movzx esi, byte ptr [eax + 3] // first alpha
|
| @@ -5016,9 +3925,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| lea eax, [eax + 16]
|
|
|
| packuswb xmm0, xmm1
|
| - sub ecx, 4
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg convertloop
|
| pop edi
|
| pop esi
|
| @@ -5029,9 +3938,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
|
|
| #ifdef HAS_ARGBUNATTENUATEROW_AVX2
|
| // Shuffle table duplicating alpha.
|
| -static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
|
| - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
|
| - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
|
| +static const uvec8 kUnattenShuffleAlpha_AVX2 = {
|
| + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
|
| };
|
| // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
|
| // USE_GATHER is not on by default, due to being a slow instruction.
|
| @@ -5044,9 +3952,8 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
| mov edx, [esp + 8] // dst_argb
|
| mov ecx, [esp + 12] // width
|
| sub edx, eax
|
| - vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
|
| + vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm6, [eax] // read 8 pixels.
|
| vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
|
| @@ -5061,9 +3968,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
| vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
|
| vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
|
| vpackuswb ymm0, ymm0, ymm1 // unmutated.
|
| - sub ecx, 8
|
| vmovdqu [eax + edx], ymm0
|
| lea eax, [eax + 32]
|
| + sub ecx, 8
|
| jg convertloop
|
|
|
| vzeroupper
|
| @@ -5080,12 +3987,11 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
| mov edx, [esp + 8] // dst_argb
|
| mov ecx, [esp + 12] // width
|
| sub edx, eax
|
| - vmovdqa ymm5, kUnattenShuffleAlpha_AVX2
|
| + vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
|
|
|
| push esi
|
| push edi
|
|
|
| - align 4
|
| convertloop:
|
| // replace VPGATHER
|
| movzx esi, byte ptr [eax + 3] // alpha0
|
| @@ -5123,9 +4029,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
| vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
|
| vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
|
| vpackuswb ymm0, ymm0, ymm1 // unmutated.
|
| - sub ecx, 8
|
| vmovdqu [eax + edx], ymm0
|
| lea eax, [eax + 32]
|
| + sub ecx, 8
|
| jg convertloop
|
|
|
| pop edi
|
| @@ -5148,18 +4054,17 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
| movdqa xmm4, kARGBToYJ
|
| movdqa xmm5, kAddYJ64
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // G
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm0, [eax] // G
|
| + movdqu xmm1, [eax + 16]
|
| pmaddubsw xmm0, xmm4
|
| pmaddubsw xmm1, xmm4
|
| phaddw xmm0, xmm1
|
| paddw xmm0, xmm5 // Add .5 for rounding.
|
| psrlw xmm0, 7
|
| packuswb xmm0, xmm0 // 8 G bytes
|
| - movdqa xmm2, [eax] // A
|
| - movdqa xmm3, [eax + 16]
|
| + movdqu xmm2, [eax] // A
|
| + movdqu xmm3, [eax + 16]
|
| lea eax, [eax + 32]
|
| psrld xmm2, 24
|
| psrld xmm3, 24
|
| @@ -5171,10 +4076,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
| movdqa xmm1, xmm0
|
| punpcklwd xmm0, xmm3 // GGGA first 4
|
| punpckhwd xmm1, xmm3 // GGGA next 4
|
| - sub ecx, 8
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| + movdqu [edx], xmm0
|
| + movdqu [edx + 16], xmm1
|
| lea edx, [edx + 32]
|
| + sub ecx, 8
|
| jg convertloop
|
| ret
|
| }
|
| @@ -5208,32 +4113,31 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
|
| movdqa xmm3, kARGBToSepiaG
|
| movdqa xmm4, kARGBToSepiaR
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // B
|
| - movdqa xmm6, [eax + 16]
|
| + movdqu xmm0, [eax] // B
|
| + movdqu xmm6, [eax + 16]
|
| pmaddubsw xmm0, xmm2
|
| pmaddubsw xmm6, xmm2
|
| phaddw xmm0, xmm6
|
| psrlw xmm0, 7
|
| packuswb xmm0, xmm0 // 8 B values
|
| - movdqa xmm5, [eax] // G
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm5, [eax] // G
|
| + movdqu xmm1, [eax + 16]
|
| pmaddubsw xmm5, xmm3
|
| pmaddubsw xmm1, xmm3
|
| phaddw xmm5, xmm1
|
| psrlw xmm5, 7
|
| packuswb xmm5, xmm5 // 8 G values
|
| punpcklbw xmm0, xmm5 // 8 BG values
|
| - movdqa xmm5, [eax] // R
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm5, [eax] // R
|
| + movdqu xmm1, [eax + 16]
|
| pmaddubsw xmm5, xmm4
|
| pmaddubsw xmm1, xmm4
|
| phaddw xmm5, xmm1
|
| psrlw xmm5, 7
|
| packuswb xmm5, xmm5 // 8 R values
|
| - movdqa xmm6, [eax] // A
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm6, [eax] // A
|
| + movdqu xmm1, [eax + 16]
|
| psrld xmm6, 24
|
| psrld xmm1, 24
|
| packuswb xmm6, xmm1
|
| @@ -5242,10 +4146,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
|
| movdqa xmm1, xmm0 // Weave BG, RA together
|
| punpcklwd xmm0, xmm5 // BGRA first 4
|
| punpckhwd xmm1, xmm5 // BGRA next 4
|
| - sub ecx, 8
|
| - movdqa [eax], xmm0
|
| - movdqa [eax + 16], xmm1
|
| + movdqu [eax], xmm0
|
| + movdqu [eax + 16], xmm1
|
| lea eax, [eax + 32]
|
| + sub ecx, 8
|
| jg convertloop
|
| ret
|
| }
|
| @@ -5271,14 +4175,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| pshufd xmm5, xmm5, 0xff
|
| mov ecx, [esp + 16] /* width */
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // B
|
| - movdqa xmm7, [eax + 16]
|
| + movdqu xmm0, [eax] // B
|
| + movdqu xmm7, [eax + 16]
|
| pmaddubsw xmm0, xmm2
|
| pmaddubsw xmm7, xmm2
|
| - movdqa xmm6, [eax] // G
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm6, [eax] // G
|
| + movdqu xmm1, [eax + 16]
|
| pmaddubsw xmm6, xmm3
|
| pmaddubsw xmm1, xmm3
|
| phaddsw xmm0, xmm7 // B
|
| @@ -5288,13 +4191,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| packuswb xmm0, xmm0 // 8 B values
|
| packuswb xmm6, xmm6 // 8 G values
|
| punpcklbw xmm0, xmm6 // 8 BG values
|
| - movdqa xmm1, [eax] // R
|
| - movdqa xmm7, [eax + 16]
|
| + movdqu xmm1, [eax] // R
|
| + movdqu xmm7, [eax + 16]
|
| pmaddubsw xmm1, xmm4
|
| pmaddubsw xmm7, xmm4
|
| phaddsw xmm1, xmm7 // R
|
| - movdqa xmm6, [eax] // A
|
| - movdqa xmm7, [eax + 16]
|
| + movdqu xmm6, [eax] // A
|
| + movdqu xmm7, [eax + 16]
|
| pmaddubsw xmm6, xmm5
|
| pmaddubsw xmm7, xmm5
|
| phaddsw xmm6, xmm7 // A
|
| @@ -5306,11 +4209,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| movdqa xmm6, xmm0 // Weave BG, RA together
|
| punpcklwd xmm0, xmm1 // BGRA first 4
|
| punpckhwd xmm6, xmm1 // BGRA next 4
|
| - sub ecx, 8
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm6
|
| + movdqu [edx], xmm0
|
| + movdqu [edx + 16], xmm6
|
| lea eax, [eax + 32]
|
| lea edx, [edx + 32]
|
| + sub ecx, 8
|
| jg convertloop
|
| ret
|
| }
|
| @@ -5319,7 +4222,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
|
|
| #ifdef HAS_ARGBQUANTIZEROW_SSE2
|
| // Quantize 4 ARGB pixels (16 bytes).
|
| -// Aligned to 16 bytes.
|
| __declspec(naked) __declspec(align(16))
|
| void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
| int interval_offset, int width) {
|
| @@ -5339,25 +4241,24 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
| pcmpeqb xmm6, xmm6 // generate mask 0xff000000
|
| pslld xmm6, 24
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // read 4 pixels
|
| + movdqu xmm0, [eax] // read 4 pixels
|
| punpcklbw xmm0, xmm5 // first 2 pixels
|
| pmulhuw xmm0, xmm2 // pixel * scale >> 16
|
| - movdqa xmm1, [eax] // read 4 pixels
|
| + movdqu xmm1, [eax] // read 4 pixels
|
| punpckhbw xmm1, xmm5 // next 2 pixels
|
| pmulhuw xmm1, xmm2
|
| pmullw xmm0, xmm3 // * interval_size
|
| - movdqa xmm7, [eax] // read 4 pixels
|
| + movdqu xmm7, [eax] // read 4 pixels
|
| pmullw xmm1, xmm3
|
| pand xmm7, xmm6 // mask alpha
|
| paddw xmm0, xmm4 // + interval_size / 2
|
| paddw xmm1, xmm4
|
| packuswb xmm0, xmm1
|
| por xmm0, xmm7
|
| - sub ecx, 4
|
| - movdqa [eax], xmm0
|
| + movdqu [eax], xmm0
|
| lea eax, [eax + 16]
|
| + sub ecx, 4
|
| jg convertloop
|
| ret
|
| }
|
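| The quantize step maps each colour channel onto a coarse grid: (v * scale) >> 16
| selects a bucket, which is then rescaled by interval_size plus an offset, while
| the 0xff000000 mask keeps the original alpha. A hedged scalar sketch of that
| formula (hypothetical helper; my reading of the scalar reference, so treat the
| exact rounding as approximate):
|
| static void ARGBQuantizeRowSketch(uint8* dst_argb, int scale, int interval_size,
|                                   int interval_offset, int width) {
|   for (int x = 0; x < width; ++x) {
|     for (int c = 0; c < 3; ++c) {  // B, G, R; byte 3 (alpha) is untouched
|       int v = dst_argb[c];
|       dst_argb[c] = (uint8)((v * scale >> 16) * interval_size + interval_offset);
|     }
|     dst_argb += 4;
|   }
| }
|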
| @@ -5366,7 +4267,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
|
|
| #ifdef HAS_ARGBSHADEROW_SSE2
|
| // Shade 4 pixels at a time by specified value.
|
| -// Aligned to 16 bytes.
|
| __declspec(naked) __declspec(align(16))
|
| void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
| uint32 value) {
|
| @@ -5378,9 +4278,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
| punpcklbw xmm2, xmm2
|
| punpcklqdq xmm2, xmm2
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // read 4 pixels
|
| + movdqu xmm0, [eax] // read 4 pixels
|
| lea eax, [eax + 16]
|
| movdqa xmm1, xmm0
|
| punpcklbw xmm0, xmm0 // first 2
|
| @@ -5390,9 +4289,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
| psrlw xmm0, 8
|
| psrlw xmm1, 8
|
| packuswb xmm0, xmm1
|
| - sub ecx, 4
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg convertloop
|
|
|
| ret
|
| @@ -5413,7 +4312,6 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| mov ecx, [esp + 4 + 16] // width
|
| pxor xmm5, xmm5 // constant 0
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax] // read 4 pixels from src_argb0
|
| movdqu xmm2, [esi] // read 4 pixels from src_argb1
|
| @@ -5428,9 +4326,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| lea eax, [eax + 16]
|
| lea esi, [esi + 16]
|
| packuswb xmm0, xmm1
|
| - sub ecx, 4
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg convertloop
|
|
|
| pop esi
|
| @@ -5455,16 +4353,15 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| sub ecx, 4
|
| jl convertloop49
|
|
|
| - align 4
|
| convertloop4:
|
| movdqu xmm0, [eax] // read 4 pixels from src_argb0
|
| lea eax, [eax + 16]
|
| movdqu xmm1, [esi] // read 4 pixels from src_argb1
|
| lea esi, [esi + 16]
|
| paddusb xmm0, xmm1 // src_argb0 + src_argb1
|
| - sub ecx, 4
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jge convertloop4
|
|
|
| convertloop49:
|
| @@ -5477,9 +4374,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| movd xmm1, [esi] // read 1 pixels from src_argb1
|
| lea esi, [esi + 4]
|
| paddusb xmm0, xmm1 // src_argb0 + src_argb1
|
| - sub ecx, 1
|
| movd [edx], xmm0
|
| lea edx, [edx + 4]
|
| + sub ecx, 1
|
| jge convertloop1
|
|
|
| convertloop19:
|
| @@ -5501,16 +4398,15 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
| mov edx, [esp + 4 + 12] // dst_argb
|
| mov ecx, [esp + 4 + 16] // width
|
|
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, [eax] // read 4 pixels from src_argb0
|
| lea eax, [eax + 16]
|
| movdqu xmm1, [esi] // read 4 pixels from src_argb1
|
| lea esi, [esi + 16]
|
| psubusb xmm0, xmm1 // src_argb0 - src_argb1
|
| - sub ecx, 4
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg convertloop
|
|
|
| pop esi
|
| @@ -5532,7 +4428,6 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
|
| mov ecx, [esp + 4 + 16] // width
|
| vpxor ymm5, ymm5, ymm5 // constant 0
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
|
| lea eax, [eax + 32]
|
| @@ -5569,7 +4464,6 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
|
| mov edx, [esp + 4 + 12] // dst_argb
|
| mov ecx, [esp + 4 + 16] // width
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
|
| lea eax, [eax + 32]
|
| @@ -5599,7 +4493,6 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
|
| mov edx, [esp + 4 + 12] // dst_argb
|
| mov ecx, [esp + 4 + 16] // width
|
|
|
| - align 4
|
| convertloop:
|
| vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
|
| lea eax, [eax + 32]
|
| @@ -5638,7 +4531,6 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
| sub edx, eax
|
| pxor xmm5, xmm5 // constant 0
|
|
|
| - align 4
|
| convertloop:
|
| movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
|
| movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
|
| @@ -5662,9 +4554,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
| psubw xmm1, xmm0
|
| pmaxsw xmm0, xmm1
|
| packuswb xmm0, xmm0
|
| - sub ecx, 8
|
| movq qword ptr [eax + edx], xmm0
|
| lea eax, [eax + 8]
|
| + sub ecx, 8
|
| jg convertloop
|
|
|
| pop edi
|
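| SobelX differences samples two columns apart on three consecutive source rows,
| weights the middle row by 2, and stores the absolute value saturated to a byte
| (pmaxsw of x and -x is the abs, packuswb the saturation). A hedged scalar
| sketch, assuming the usual three-row signature (hypothetical helper):
|
| static void SobelXRowSketch(const uint8* src_y0, const uint8* src_y1,
|                             const uint8* src_y2, uint8* dst_sobelx, int width) {
|   for (int i = 0; i < width; ++i) {
|     int a = src_y0[i] - src_y0[i + 2];
|     int b = src_y1[i] - src_y1[i + 2];
|     int c = src_y2[i] - src_y2[i + 2];
|     int sobel = a + 2 * b + c;
|     if (sobel < 0) sobel = -sobel;                        // pmaxsw(x, -x)
|     dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);   // packuswb saturates
|   }
| }
|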
| @@ -5692,7 +4584,6 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
| sub edx, eax
|
| pxor xmm5, xmm5 // constant 0
|
|
|
| - align 4
|
| convertloop:
|
| movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
|
| movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
|
| @@ -5716,9 +4607,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
| psubw xmm1, xmm0
|
| pmaxsw xmm0, xmm1
|
| packuswb xmm0, xmm0
|
| - sub ecx, 8
|
| movq qword ptr [eax + edx], xmm0
|
| lea eax, [eax + 8]
|
| + sub ecx, 8
|
| jg convertloop
|
|
|
| pop esi
|
| @@ -5746,10 +4637,9 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
| pcmpeqb xmm5, xmm5 // alpha 255
|
| pslld xmm5, 24 // 0xff000000
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // read 16 pixels src_sobelx
|
| - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
|
| + movdqu xmm0, [eax] // read 16 pixels src_sobelx
|
| + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
|
| lea eax, [eax + 16]
|
| paddusb xmm0, xmm1 // sobel = sobelx + sobely
|
| movdqa xmm2, xmm0 // GG
|
| @@ -5765,12 +4655,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
| punpckhwd xmm0, xmm0 // Last 4
|
| por xmm3, xmm5 // GGGA
|
| por xmm0, xmm5
|
| - sub ecx, 16
|
| - movdqa [edx], xmm1
|
| - movdqa [edx + 16], xmm2
|
| - movdqa [edx + 32], xmm3
|
| - movdqa [edx + 48], xmm0
|
| + movdqu [edx], xmm1
|
| + movdqu [edx + 16], xmm2
|
| + movdqu [edx + 32], xmm3
|
| + movdqu [edx + 48], xmm0
|
| lea edx, [edx + 64]
|
| + sub ecx, 16
|
| jg convertloop
|
|
|
| pop esi
|
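| SobelRow adds the two gradient planes with a saturating add and expands the
| magnitude into grey ARGB pixels: the value goes into B, G and R and alpha is
| set to 255. A scalar sketch (hypothetical helper):
|
| static void SobelRowSketch(const uint8* src_sobelx, const uint8* src_sobely,
|                            uint8* dst_argb, int width) {
|   for (int i = 0; i < width; ++i) {
|     int s = src_sobelx[i] + src_sobely[i];
|     if (s > 255) s = 255;            // paddusb saturates
|     dst_argb[0] = (uint8)s;          // B
|     dst_argb[1] = (uint8)s;          // G
|     dst_argb[2] = (uint8)s;          // R
|     dst_argb[3] = 255;               // alpha from the 0xff000000 mask
|     dst_argb += 4;
|   }
| }
|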
| @@ -5792,15 +4682,14 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
| mov ecx, [esp + 4 + 16] // width
|
| sub esi, eax
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // read 16 pixels src_sobelx
|
| - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
|
| + movdqu xmm0, [eax] // read 16 pixels src_sobelx
|
| + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
|
| lea eax, [eax + 16]
|
| paddusb xmm0, xmm1 // sobel = sobelx + sobely
|
| - sub ecx, 16
|
| - movdqa [edx], xmm0
|
| + movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 16
|
| jg convertloop
|
|
|
| pop esi
|
| @@ -5827,10 +4716,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
| sub esi, eax
|
| pcmpeqb xmm5, xmm5 // alpha 255
|
|
|
| - align 4
|
| convertloop:
|
| - movdqa xmm0, [eax] // read 16 pixels src_sobelx
|
| - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
|
| + movdqu xmm0, [eax] // read 16 pixels src_sobelx
|
| + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
|
| lea eax, [eax + 16]
|
| movdqa xmm2, xmm0
|
| paddusb xmm2, xmm1 // sobel = sobelx + sobely
|
| @@ -5846,12 +4734,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
| movdqa xmm7, xmm1 // YSXA
|
| punpcklwd xmm7, xmm0 // Next 4
|
| punpckhwd xmm1, xmm0 // Last 4
|
| - sub ecx, 16
|
| - movdqa [edx], xmm6
|
| - movdqa [edx + 16], xmm4
|
| - movdqa [edx + 32], xmm7
|
| - movdqa [edx + 48], xmm1
|
| + movdqu [edx], xmm6
|
| + movdqu [edx + 16], xmm4
|
| + movdqu [edx + 32], xmm7
|
| + movdqu [edx + 48], xmm1
|
| lea edx, [edx + 64]
|
| + sub ecx, 16
|
| jg convertloop
|
|
|
| pop esi
|
| @@ -5872,8 +4760,7 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
| // area is the number of pixels in the area being averaged.
|
| // dst points to pixel to store result to.
|
| // count is number of averaged pixels to produce.
|
| -// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
|
| -// aligned.
|
| +// Does 4 pixels at a time.
|
| void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
|
| int width, int area, uint8* dst,
|
| int count) {
|
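| The averaging uses the summed-area-table identity: the window sum for one
| channel is recovered from four corner values and divided by area. A hedged
| per-channel sketch (hypothetical helper; 'w' is the offset to the window's
| right edge, following the scalar reference as I understand it):
|
| static uint8 BoxAverageSketch(const int32* topleft, const int32* botleft,
|                               int w, int area) {
|   int32 sum = botleft[w] + topleft[0] - botleft[0] - topleft[w];
|   return (uint8)(sum / area);
| }
|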
| @@ -5903,13 +4790,12 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
|
| packssdw xmm5, xmm5 // 16 bit shorts
|
|
|
| // 4 pixel loop small blocks.
|
| - align 4
|
| s4:
|
| // top left
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm2, [eax + 32]
|
| + movdqu xmm3, [eax + 48]
|
|
|
| // - top right
|
| psubd xmm0, [eax + edx * 4]
|
| @@ -5946,13 +4832,12 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
|
| jmp l4b
|
|
|
| // 4 pixel loop
|
| - align 4
|
| l4:
|
| // top left
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - movdqa xmm2, [eax + 32]
|
| - movdqa xmm3, [eax + 48]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| + movdqu xmm2, [eax + 32]
|
| + movdqu xmm3, [eax + 48]
|
|
|
| // - top right
|
| psubd xmm0, [eax + edx * 4]
|
| @@ -5999,9 +4884,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
|
| jl l1b
|
|
|
| // 1 pixel loop
|
| - align 4
|
| l1:
|
| - movdqa xmm0, [eax]
|
| + movdqu xmm0, [eax]
|
| psubd xmm0, [eax + edx * 4]
|
| lea eax, [eax + 16]
|
| psubd xmm0, [esi]
|
| @@ -6040,7 +4924,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
|
| jne l4b
|
|
|
| // 4 pixel loop
|
| - align 4
|
| l4:
|
| movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
|
| lea eax, [eax + 16]
|
| @@ -6057,26 +4940,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
|
| punpckhwd xmm5, xmm1
|
|
|
| paddd xmm0, xmm2
|
| - movdqa xmm2, [esi] // previous row above.
|
| + movdqu xmm2, [esi] // previous row above.
|
| paddd xmm2, xmm0
|
|
|
| paddd xmm0, xmm3
|
| - movdqa xmm3, [esi + 16]
|
| + movdqu xmm3, [esi + 16]
|
| paddd xmm3, xmm0
|
|
|
| paddd xmm0, xmm4
|
| - movdqa xmm4, [esi + 32]
|
| + movdqu xmm4, [esi + 32]
|
| paddd xmm4, xmm0
|
|
|
| paddd xmm0, xmm5
|
| - movdqa xmm5, [esi + 48]
|
| + movdqu xmm5, [esi + 48]
|
| lea esi, [esi + 64]
|
| paddd xmm5, xmm0
|
|
|
| - movdqa [edx], xmm2
|
| - movdqa [edx + 16], xmm3
|
| - movdqa [edx + 32], xmm4
|
| - movdqa [edx + 48], xmm5
|
| + movdqu [edx], xmm2
|
| + movdqu [edx + 16], xmm3
|
| + movdqu [edx + 32], xmm4
|
| + movdqu [edx + 48], xmm5
|
|
|
| lea edx, [edx + 64]
|
| sub ecx, 4
|
| @@ -6087,7 +4970,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
|
| jl l1b
|
|
|
| // 1 pixel loop
|
| - align 4
|
| l1:
|
| movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
|
| lea eax, [eax + 4]
|
| @@ -6142,7 +5024,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
|
| addps xmm4, xmm4 // dudv *= 4
|
|
|
| // 4 pixel loop
|
| - align 4
|
| l4:
|
| cvttps2dq xmm0, xmm2 // x, y float to int first 2
|
| cvttps2dq xmm1, xmm3 // x, y float to int next 2
|
| @@ -6156,256 +5037,47 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
|
| movd xmm6, [eax + edi] // read pixel 1
|
| punpckldq xmm1, xmm6 // combine pixel 0 and 1
|
| addps xmm2, xmm4 // x, y += dx, dy first 2
|
| - movq qword ptr [edx], xmm1
|
| - movd esi, xmm0
|
| - pshufd xmm0, xmm0, 0x39 // shift right
|
| - movd edi, xmm0
|
| - movd xmm6, [eax + esi] // read pixel 2
|
| - movd xmm0, [eax + edi] // read pixel 3
|
| - punpckldq xmm6, xmm0 // combine pixel 2 and 3
|
| - addps xmm3, xmm4 // x, y += dx, dy next 2
|
| - sub ecx, 4
|
| - movq qword ptr 8[edx], xmm6
|
| - lea edx, [edx + 16]
|
| - jge l4
|
| -
|
| - l4b:
|
| - add ecx, 4 - 1
|
| - jl l1b
|
| -
|
| - // 1 pixel loop
|
| - align 4
|
| - l1:
|
| - cvttps2dq xmm0, xmm2 // x, y float to int
|
| - packssdw xmm0, xmm0 // x, y as shorts
|
| - pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
|
| - addps xmm2, xmm7 // x, y += dx, dy
|
| - movd esi, xmm0
|
| - movd xmm0, [eax + esi] // copy a pixel
|
| - sub ecx, 1
|
| - movd [edx], xmm0
|
| - lea edx, [edx + 4]
|
| - jge l1
|
| - l1b:
|
| - pop edi
|
| - pop esi
|
| - ret
|
| - }
|
| -}
|
| -#endif // HAS_ARGBAFFINEROW_SSE2
|
| -
|
| -#ifdef HAS_INTERPOLATEROW_AVX2
|
| -// Bilinear filter 16x2 -> 16x1
|
| -__declspec(naked) __declspec(align(16))
|
| -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
| - ptrdiff_t src_stride, int dst_width,
|
| - int source_y_fraction) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov edi, [esp + 8 + 4] // dst_ptr
|
| - mov esi, [esp + 8 + 8] // src_ptr
|
| - mov edx, [esp + 8 + 12] // src_stride
|
| - mov ecx, [esp + 8 + 16] // dst_width
|
| - mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
| - shr eax, 1
|
| - // Dispatch to specialized filters if applicable.
|
| - cmp eax, 0
|
| - je xloop100 // 0 / 128. Blend 100 / 0.
|
| - sub edi, esi
|
| - cmp eax, 32
|
| - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
|
| - cmp eax, 64
|
| - je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
|
| - cmp eax, 96
|
| - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
|
| -
|
| - vmovd xmm0, eax // high fraction 0..127
|
| - neg eax
|
| - add eax, 128
|
| - vmovd xmm5, eax // low fraction 128..1
|
| - vpunpcklbw xmm5, xmm5, xmm0
|
| - vpunpcklwd xmm5, xmm5, xmm5
|
| - vpxor ymm0, ymm0, ymm0
|
| - vpermd ymm5, ymm0, ymm5
|
| -
|
| - align 4
|
| - xloop:
|
| - vmovdqu ymm0, [esi]
|
| - vmovdqu ymm2, [esi + edx]
|
| - vpunpckhbw ymm1, ymm0, ymm2 // mutates
|
| - vpunpcklbw ymm0, ymm0, ymm2 // mutates
|
| - vpmaddubsw ymm0, ymm0, ymm5
|
| - vpmaddubsw ymm1, ymm1, ymm5
|
| - vpsrlw ymm0, ymm0, 7
|
| - vpsrlw ymm1, ymm1, 7
|
| - vpackuswb ymm0, ymm0, ymm1 // unmutates
|
| - sub ecx, 32
|
| - vmovdqu [esi + edi], ymm0
|
| - lea esi, [esi + 32]
|
| - jg xloop
|
| - jmp xloop99
|
| -
|
| - // Blend 25 / 75.
|
| - align 4
|
| - xloop25:
|
| - vmovdqu ymm0, [esi]
|
| - vpavgb ymm0, ymm0, [esi + edx]
|
| - vpavgb ymm0, ymm0, [esi + edx]
|
| - sub ecx, 32
|
| - vmovdqu [esi + edi], ymm0
|
| - lea esi, [esi + 32]
|
| - jg xloop25
|
| - jmp xloop99
|
| -
|
| - // Blend 50 / 50.
|
| - align 4
|
| - xloop50:
|
| - vmovdqu ymm0, [esi]
|
| - vpavgb ymm0, ymm0, [esi + edx]
|
| - sub ecx, 32
|
| - vmovdqu [esi + edi], ymm0
|
| - lea esi, [esi + 32]
|
| - jg xloop50
|
| - jmp xloop99
|
| -
|
| - // Blend 75 / 25.
|
| - align 4
|
| - xloop75:
|
| - vmovdqu ymm0, [esi + edx]
|
| - vpavgb ymm0, ymm0, [esi]
|
| - vpavgb ymm0, ymm0, [esi]
|
| - sub ecx, 32
|
| - vmovdqu [esi + edi], ymm0
|
| - lea esi, [esi + 32]
|
| - jg xloop75
|
| - jmp xloop99
|
| -
|
| - // Blend 100 / 0 - Copy row unchanged.
|
| - align 4
|
| - xloop100:
|
| - rep movsb
|
| -
|
| - xloop99:
|
| - pop edi
|
| - pop esi
|
| - vzeroupper
|
| - ret
|
| - }
|
| -}
|
| -#endif // HAS_INTERPOLATEROW_AVX2
|
| -
|
| -#ifdef HAS_INTERPOLATEROW_SSSE3
|
| -// Bilinear filter 16x2 -> 16x1
|
| -__declspec(naked) __declspec(align(16))
|
| -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| - ptrdiff_t src_stride, int dst_width,
|
| - int source_y_fraction) {
|
| - __asm {
|
| - push esi
|
| - push edi
|
| - mov edi, [esp + 8 + 4] // dst_ptr
|
| - mov esi, [esp + 8 + 8] // src_ptr
|
| - mov edx, [esp + 8 + 12] // src_stride
|
| - mov ecx, [esp + 8 + 16] // dst_width
|
| - mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
| - sub edi, esi
|
| - shr eax, 1
|
| - // Dispatch to specialized filters if applicable.
|
| - cmp eax, 0
|
| - je xloop100 // 0 / 128. Blend 100 / 0.
|
| - cmp eax, 32
|
| - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
|
| - cmp eax, 64
|
| - je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
|
| - cmp eax, 96
|
| - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
|
| -
|
| - movd xmm0, eax // high fraction 0..127
|
| - neg eax
|
| - add eax, 128
|
| - movd xmm5, eax // low fraction 128..1
|
| - punpcklbw xmm5, xmm0
|
| - punpcklwd xmm5, xmm5
|
| - pshufd xmm5, xmm5, 0
|
| -
|
| - align 4
|
| - xloop:
|
| - movdqa xmm0, [esi]
|
| - movdqa xmm2, [esi + edx]
|
| - movdqa xmm1, xmm0
|
| - punpcklbw xmm0, xmm2
|
| - punpckhbw xmm1, xmm2
|
| - pmaddubsw xmm0, xmm5
|
| - pmaddubsw xmm1, xmm5
|
| - psrlw xmm0, 7
|
| - psrlw xmm1, 7
|
| - packuswb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - jg xloop
|
| - jmp xloop99
|
| -
|
| - // Blend 25 / 75.
|
| - align 4
|
| - xloop25:
|
| - movdqa xmm0, [esi]
|
| - movdqa xmm1, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - jg xloop25
|
| - jmp xloop99
|
| -
|
| - // Blend 50 / 50.
|
| - align 4
|
| - xloop50:
|
| - movdqa xmm0, [esi]
|
| - movdqa xmm1, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - jg xloop50
|
| - jmp xloop99
|
| -
|
| - // Blend 75 / 25.
|
| - align 4
|
| - xloop75:
|
| - movdqa xmm1, [esi]
|
| - movdqa xmm0, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - jg xloop75
|
| - jmp xloop99
|
| + movq qword ptr [edx], xmm1
|
| + movd esi, xmm0
|
| + pshufd xmm0, xmm0, 0x39 // shift right
|
| + movd edi, xmm0
|
| + movd xmm6, [eax + esi] // read pixel 2
|
| + movd xmm0, [eax + edi] // read pixel 3
|
| + punpckldq xmm6, xmm0 // combine pixel 2 and 3
|
| + addps xmm3, xmm4 // x, y += dx, dy next 2
|
| + movq qword ptr 8[edx], xmm6
|
| + lea edx, [edx + 16]
|
| + sub ecx, 4
|
| + jge l4
|
|
|
| - // Blend 100 / 0 - Copy row unchanged.
|
| - align 4
|
| - xloop100:
|
| - movdqa xmm0, [esi]
|
| - sub ecx, 16
|
| - movdqa [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - jg xloop100
|
| + l4b:
|
| + add ecx, 4 - 1
|
| + jl l1b
|
|
|
| - xloop99:
|
| + // 1 pixel loop
|
| + l1:
|
| + cvttps2dq xmm0, xmm2 // x, y float to int
|
| + packssdw xmm0, xmm0 // x, y as shorts
|
| + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
|
| + addps xmm2, xmm7 // x, y += dx, dy
|
| + movd esi, xmm0
|
| + movd xmm0, [eax + esi] // copy a pixel
|
| + movd [edx], xmm0
|
| + lea edx, [edx + 4]
|
| + sub ecx, 1
|
| + jge l1
|
| + l1b:
|
| pop edi
|
| pop esi
|
| ret
|
| }
|
| }
|
| -#endif // HAS_INTERPOLATEROW_SSSE3
|
| +#endif // HAS_ARGBAFFINEROW_SSE2
|
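| The affine row walks a (u, v) coordinate pair across the destination,
| truncating to integers each step and fetching one ARGB pixel from
| src + v * stride + u * 4; the SIMD loop batches four coordinates and folds the
| address math into pmaddwd against (4, stride). A hedged scalar sketch
| (hypothetical helper, close to the scalar reference):
|
| static void ARGBAffineRowSketch(const uint8* src_argb, int src_argb_stride,
|                                 uint8* dst_argb, const float* uv_dudv,
|                                 int width) {
|   float u = uv_dudv[0];
|   float v = uv_dudv[1];
|   for (int i = 0; i < width; ++i) {
|     int x = (int)u;
|     int y = (int)v;
|     // offset = x * 4 + y * stride, as in the pmaddwd comment above.
|     *(uint32*)(dst_argb) =
|         *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
|     dst_argb += 4;
|     u += uv_dudv[2];  // du
|     v += uv_dudv[3];  // dv
|   }
| }
|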
|
|
| -#ifdef HAS_INTERPOLATEROW_SSE2
|
| -// Bilinear filter 16x2 -> 16x1
|
| +#ifdef HAS_INTERPOLATEROW_AVX2
|
| +// Bilinear filter 32x2 -> 32x1
|
| __declspec(naked) __declspec(align(16))
|
| -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
| ptrdiff_t src_stride, int dst_width,
|
| int source_y_fraction) {
|
| __asm {
|
| @@ -6416,110 +5088,95 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| mov edx, [esp + 8 + 12] // src_stride
|
| mov ecx, [esp + 8 + 16] // dst_width
|
| mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
| - sub edi, esi
|
| + shr eax, 1
|
| // Dispatch to specialized filters if applicable.
|
| cmp eax, 0
|
| - je xloop100 // 0 / 256. Blend 100 / 0.
|
| + je xloop100 // 0 / 128. Blend 100 / 0.
|
| + sub edi, esi
|
| + cmp eax, 32
|
| + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
|
| cmp eax, 64
|
| - je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
|
| - cmp eax, 128
|
| - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
|
| - cmp eax, 192
|
| - je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
|
| + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
|
| + cmp eax, 96
|
| + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
|
|
|
| - movd xmm5, eax // xmm5 = y fraction
|
| - punpcklbw xmm5, xmm5
|
| - psrlw xmm5, 1
|
| - punpcklwd xmm5, xmm5
|
| - punpckldq xmm5, xmm5
|
| - punpcklqdq xmm5, xmm5
|
| - pxor xmm4, xmm4
|
| + vmovd xmm0, eax // high fraction 0..127
|
| + neg eax
|
| + add eax, 128
|
| + vmovd xmm5, eax // low fraction 128..1
|
| + vpunpcklbw xmm5, xmm5, xmm0
|
| + vpunpcklwd xmm5, xmm5, xmm5
|
| + vpxor ymm0, ymm0, ymm0
|
| + vpermd ymm5, ymm0, ymm5
|
|
|
| - align 4
|
| xloop:
|
| - movdqa xmm0, [esi] // row0
|
| - movdqa xmm2, [esi + edx] // row1
|
| - movdqa xmm1, xmm0
|
| - movdqa xmm3, xmm2
|
| - punpcklbw xmm2, xmm4
|
| - punpckhbw xmm3, xmm4
|
| - punpcklbw xmm0, xmm4
|
| - punpckhbw xmm1, xmm4
|
| - psubw xmm2, xmm0 // row1 - row0
|
| - psubw xmm3, xmm1
|
| - paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
|
| - paddw xmm3, xmm3
|
| - pmulhw xmm2, xmm5 // scale diff
|
| - pmulhw xmm3, xmm5
|
| - paddw xmm0, xmm2 // sum rows
|
| - paddw xmm1, xmm3
|
| - packuswb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| + vmovdqu ymm0, [esi]
|
| + vmovdqu ymm2, [esi + edx]
|
| + vpunpckhbw ymm1, ymm0, ymm2 // mutates
|
| + vpunpcklbw ymm0, ymm0, ymm2 // mutates
|
| + vpmaddubsw ymm0, ymm0, ymm5
|
| + vpmaddubsw ymm1, ymm1, ymm5
|
| + vpsrlw ymm0, ymm0, 7
|
| + vpsrlw ymm1, ymm1, 7
|
| + vpackuswb ymm0, ymm0, ymm1 // unmutates
|
| + vmovdqu [esi + edi], ymm0
|
| + lea esi, [esi + 32]
|
| + sub ecx, 32
|
| jg xloop
|
| jmp xloop99
|
|
|
| - // Blend 25 / 75.
|
| - align 4
|
| - xloop25:
|
| - movdqa xmm0, [esi]
|
| - movdqa xmm1, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - jg xloop25
|
| - jmp xloop99
|
| -
|
| - // Blend 50 / 50.
|
| - align 4
|
| - xloop50:
|
| - movdqa xmm0, [esi]
|
| - movdqa xmm1, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - jg xloop50
|
| - jmp xloop99
|
| -
|
| - // Blend 75 / 25.
|
| - align 4
|
| - xloop75:
|
| - movdqa xmm1, [esi]
|
| - movdqa xmm0, [esi + edx]
|
| - pavgb xmm0, xmm1
|
| - pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| - movdqa [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - jg xloop75
|
| - jmp xloop99
|
| -
|
| - // Blend 100 / 0 - Copy row unchanged.
|
| - align 4
|
| - xloop100:
|
| - movdqa xmm0, [esi]
|
| - sub ecx, 16
|
| - movdqa [esi + edi], xmm0
|
| - lea esi, [esi + 16]
|
| - jg xloop100
|
| + // Blend 25 / 75.
|
| + xloop25:
|
| + vmovdqu ymm0, [esi]
|
| + vmovdqu ymm1, [esi + edx]
|
| + vpavgb ymm0, ymm0, ymm1
|
| + vpavgb ymm0, ymm0, ymm1
|
| + vmovdqu [esi + edi], ymm0
|
| + lea esi, [esi + 32]
|
| + sub ecx, 32
|
| + jg xloop25
|
| + jmp xloop99
|
| +
|
| + // Blend 50 / 50.
|
| + xloop50:
|
| + vmovdqu ymm0, [esi]
|
| + vpavgb ymm0, ymm0, [esi + edx]
|
| + vmovdqu [esi + edi], ymm0
|
| + lea esi, [esi + 32]
|
| + sub ecx, 32
|
| + jg xloop50
|
| + jmp xloop99
|
| +
|
| + // Blend 75 / 25.
|
| + xloop75:
|
| + vmovdqu ymm1, [esi]
|
| + vmovdqu ymm0, [esi + edx]
|
| + vpavgb ymm0, ymm0, ymm1
|
| + vpavgb ymm0, ymm0, ymm1
|
| + vmovdqu [esi + edi], ymm0
|
| + lea esi, [esi + 32]
|
| + sub ecx, 32
|
| + jg xloop75
|
| + jmp xloop99
|
| +
|
| + // Blend 100 / 0 - Copy row unchanged.
|
| + xloop100:
|
| + rep movsb
|
|
|
| xloop99:
|
| pop edi
|
| pop esi
|
| + vzeroupper
|
| ret
|
| }
|
| }
|
| -#endif // HAS_INTERPOLATEROW_SSE2
|
| +#endif // HAS_INTERPOLATEROW_AVX2
|
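| InterpolateRow blends two adjacent rows by source_y_fraction / 256, with fast
| paths when the fraction is exactly 0, 1/4, 1/2 or 3/4 (a copy, or pavgb once or
| twice); the AVX2 version halves the fraction to 0..127 so the pmaddubsw weights
| fit in signed bytes. A hedged scalar sketch of the general path (hypothetical
| helper, not the shipped C version):
|
| static void InterpolateRowSketch(uint8* dst_ptr, const uint8* src_ptr,
|                                  ptrdiff_t src_stride, int dst_width,
|                                  int source_y_fraction) {
|   const uint8* src1 = src_ptr + src_stride;  // the row below
|   int y1 = source_y_fraction;                // 0..255
|   int y0 = 256 - y1;
|   for (int i = 0; i < dst_width; ++i) {
|     dst_ptr[i] = (uint8)((src_ptr[i] * y0 + src1[i] * y1) >> 8);
|   }
| }
|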
|
|
| // Bilinear filter 16x2 -> 16x1
|
| __declspec(naked) __declspec(align(16))
|
| -void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| - ptrdiff_t src_stride, int dst_width,
|
| - int source_y_fraction) {
|
| +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| + ptrdiff_t src_stride, int dst_width,
|
| + int source_y_fraction) {
|
| __asm {
|
| push esi
|
| push edi
|
| @@ -6548,7 +5205,6 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| punpcklwd xmm5, xmm5
|
| pshufd xmm5, xmm5, 0
|
|
|
| - align 4
|
| xloop:
|
| movdqu xmm0, [esi]
|
| movdqu xmm2, [esi + edx]
|
| @@ -6560,57 +5216,53 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| psrlw xmm0, 7
|
| psrlw xmm1, 7
|
| packuswb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [esi + edi], xmm0
|
| lea esi, [esi + 16]
|
| + sub ecx, 16
|
| jg xloop
|
| jmp xloop99
|
|
|
| // Blend 25 / 75.
|
| - align 4
|
| xloop25:
|
| movdqu xmm0, [esi]
|
| movdqu xmm1, [esi + edx]
|
| pavgb xmm0, xmm1
|
| pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [esi + edi], xmm0
|
| lea esi, [esi + 16]
|
| + sub ecx, 16
|
| jg xloop25
|
| jmp xloop99
|
|
|
| // Blend 50 / 50.
|
| - align 4
|
| xloop50:
|
| movdqu xmm0, [esi]
|
| movdqu xmm1, [esi + edx]
|
| pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [esi + edi], xmm0
|
| lea esi, [esi + 16]
|
| + sub ecx, 16
|
| jg xloop50
|
| jmp xloop99
|
|
|
| // Blend 75 / 25.
|
| - align 4
|
| xloop75:
|
| movdqu xmm1, [esi]
|
| movdqu xmm0, [esi + edx]
|
| pavgb xmm0, xmm1
|
| pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [esi + edi], xmm0
|
| lea esi, [esi + 16]
|
| + sub ecx, 16
|
| jg xloop75
|
| jmp xloop99
|
|
|
| // Blend 100 / 0 - Copy row unchanged.
|
| - align 4
|
| xloop100:
|
| movdqu xmm0, [esi]
|
| - sub ecx, 16
|
| movdqu [esi + edi], xmm0
|
| lea esi, [esi + 16]
|
| + sub ecx, 16
|
| jg xloop100
|
|
|
| xloop99:
|
| @@ -6623,9 +5275,9 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
| #ifdef HAS_INTERPOLATEROW_SSE2
|
| // Bilinear filter 16x2 -> 16x1
|
| __declspec(naked) __declspec(align(16))
|
| -void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| - ptrdiff_t src_stride, int dst_width,
|
| - int source_y_fraction) {
|
| +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| + ptrdiff_t src_stride, int dst_width,
|
| + int source_y_fraction) {
|
| __asm {
|
| push esi
|
| push edi
|
| @@ -6653,7 +5305,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| punpcklqdq xmm5, xmm5
|
| pxor xmm4, xmm4
|
|
|
| - align 4
|
| xloop:
|
| movdqu xmm0, [esi] // row0
|
| movdqu xmm2, [esi + edx] // row1
|
| @@ -6672,57 +5323,53 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| paddw xmm0, xmm2 // sum rows
|
| paddw xmm1, xmm3
|
| packuswb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [esi + edi], xmm0
|
| lea esi, [esi + 16]
|
| + sub ecx, 16
|
| jg xloop
|
| jmp xloop99
|
|
|
| // Blend 25 / 75.
|
| - align 4
|
| xloop25:
|
| movdqu xmm0, [esi]
|
| movdqu xmm1, [esi + edx]
|
| pavgb xmm0, xmm1
|
| pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [esi + edi], xmm0
|
| lea esi, [esi + 16]
|
| + sub ecx, 16
|
| jg xloop25
|
| jmp xloop99
|
|
|
| // Blend 50 / 50.
|
| - align 4
|
| xloop50:
|
| movdqu xmm0, [esi]
|
| movdqu xmm1, [esi + edx]
|
| pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [esi + edi], xmm0
|
| lea esi, [esi + 16]
|
| + sub ecx, 16
|
| jg xloop50
|
| jmp xloop99
|
|
|
| // Blend 75 / 25.
|
| - align 4
|
| xloop75:
|
| movdqu xmm1, [esi]
|
| movdqu xmm0, [esi + edx]
|
| pavgb xmm0, xmm1
|
| pavgb xmm0, xmm1
|
| - sub ecx, 16
|
| movdqu [esi + edi], xmm0
|
| lea esi, [esi + 16]
|
| + sub ecx, 16
|
| jg xloop75
|
| jmp xloop99
|
|
|
| // Blend 100 / 0 - Copy row unchanged.
|
| - align 4
|
| xloop100:
|
| movdqu xmm0, [esi]
|
| - sub ecx, 16
|
| movdqu [esi + edi], xmm0
|
| lea esi, [esi + 16]
|
| + sub ecx, 16
|
| jg xloop100
|
|
|
| xloop99:
|
| @@ -6733,84 +5380,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
| }
|
| #endif // HAS_INTERPOLATEROW_SSE2
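The SSE2 variant above avoids pmaddubsw by widening each row to 16 bits, scaling the row difference, and adding it back to row0, which is what the "9 bits * 15 bits = 8.16", "scale diff", and "sum rows" comments in this file describe. Below is a hedged per-byte sketch of that arithmetic; the scale value is derived from source_y_fraction in setup code outside this hunk, so its exact form here is an assumption.

    #include <stdint.h>

    /* Illustrative sketch of the SSE2 fixed-point blend for one byte pair. */
    static uint8_t BlendByteSse2Style(uint8_t row0, uint8_t row1, int scale_q15) {
      int diff = (int)row1 - (int)row0;           /* psubw: signed 9-bit difference */
      int scaled = (2 * diff * scale_q15) >> 16;  /* paddw doubles it, pmulhw keeps the high word */
      int v = row0 + scaled;                      /* paddw: add scaled difference back */
      if (v < 0) v = 0;                           /* packuswb saturates to 0..255 */
      if (v > 255) v = 255;
      return (uint8_t)v;
    }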
|
|
|
| -__declspec(naked) __declspec(align(16))
|
| -void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
|
| - uint8* dst_uv, int pix) {
|
| - __asm {
|
| - push edi
|
| - mov eax, [esp + 4 + 4] // src_uv
|
| - mov edx, [esp + 4 + 8] // src_uv_stride
|
| - mov edi, [esp + 4 + 12] // dst_v
|
| - mov ecx, [esp + 4 + 16] // pix
|
| - sub edi, eax
|
| -
|
| - align 4
|
| - convertloop:
|
| - movdqa xmm0, [eax]
|
| - pavgb xmm0, [eax + edx]
|
| - sub ecx, 16
|
| - movdqa [eax + edi], xmm0
|
| - lea eax, [eax + 16]
|
| - jg convertloop
|
| - pop edi
|
| - ret
|
| - }
|
| -}
|
| -
|
| -#ifdef HAS_HALFROW_AVX2
|
| -__declspec(naked) __declspec(align(16))
|
| -void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
|
| - uint8* dst_uv, int pix) {
|
| - __asm {
|
| - push edi
|
| - mov eax, [esp + 4 + 4] // src_uv
|
| - mov edx, [esp + 4 + 8] // src_uv_stride
|
| - mov edi, [esp + 4 + 12] // dst_v
|
| - mov ecx, [esp + 4 + 16] // pix
|
| - sub edi, eax
|
| -
|
| - align 4
|
| - convertloop:
|
| - vmovdqu ymm0, [eax]
|
| - vpavgb ymm0, ymm0, [eax + edx]
|
| - sub ecx, 32
|
| - vmovdqu [eax + edi], ymm0
|
| - lea eax, [eax + 32]
|
| - jg convertloop
|
| -
|
| - pop edi
|
| - vzeroupper
|
| - ret
|
| - }
|
| -}
|
| -#endif // HAS_HALFROW_AVX2
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
|
| - uint32 selector, int pix) {
|
| - __asm {
|
| - mov eax, [esp + 4] // src_argb
|
| - mov edx, [esp + 8] // dst_bayer
|
| - movd xmm5, [esp + 12] // selector
|
| - mov ecx, [esp + 16] // pix
|
| - pshufd xmm5, xmm5, 0
|
| -
|
| - align 4
|
| - wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - lea eax, [eax + 32]
|
| - pshufb xmm0, xmm5
|
| - pshufb xmm1, xmm5
|
| - punpckldq xmm0, xmm1
|
| - sub ecx, 8
|
| - movq qword ptr [edx], xmm0
|
| - lea edx, [edx + 8]
|
| - jg wloop
|
| - ret
|
| - }
|
| -}
|
| -
|
| // Specialized ARGB to Bayer that just isolates G channel.
|
| __declspec(naked) __declspec(align(16))
|
| void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
|
| @@ -6823,10 +5392,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
|
| pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
|
| psrld xmm5, 24
|
|
|
| - align 4
|
| wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| + movdqu xmm0, [eax]
|
| + movdqu xmm1, [eax + 16]
|
| lea eax, [eax + 32]
|
| psrld xmm0, 8 // Move green to bottom.
|
| psrld xmm1, 8
|
| @@ -6834,9 +5402,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
|
| pand xmm1, xmm5
|
| packssdw xmm0, xmm1
|
| packuswb xmm0, xmm1
|
| - sub ecx, 8
|
| movq qword ptr [edx], xmm0
|
| lea edx, [edx + 8]
|
| + sub ecx, 8
|
| jg wloop
|
| ret
|
| }
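As the comment above this function notes, it only isolates the G channel: psrld moves byte 1 of each pixel to the bottom, pand masks it, and the pack instructions compress the dwords to bytes. A scalar sketch (the function name is assumed):

    #include <stdint.h>

    /* Copy the G byte (offset 1 of each 4-byte little-endian BGRA pixel). */
    static void ARGBToBayerGGScalar(const uint8_t* src_argb, uint8_t* dst_bayer,
                                    int pix) {
      for (int x = 0; x < pix; ++x) {
        dst_bayer[x] = src_argb[x * 4 + 1];
      }
    }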
|
| @@ -6850,46 +5418,19 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| mov eax, [esp + 4] // src_argb
|
| mov edx, [esp + 8] // dst_argb
|
| mov ecx, [esp + 12] // shuffler
|
| - movdqa xmm5, [ecx]
|
| - mov ecx, [esp + 16] // pix
|
| -
|
| - align 4
|
| - wloop:
|
| - movdqa xmm0, [eax]
|
| - movdqa xmm1, [eax + 16]
|
| - lea eax, [eax + 32]
|
| - pshufb xmm0, xmm5
|
| - pshufb xmm1, xmm5
|
| - sub ecx, 8
|
| - movdqa [edx], xmm0
|
| - movdqa [edx + 16], xmm1
|
| - lea edx, [edx + 32]
|
| - jg wloop
|
| - ret
|
| - }
|
| -}
|
| -
|
| -__declspec(naked) __declspec(align(16))
|
| -void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| - const uint8* shuffler, int pix) {
|
| - __asm {
|
| - mov eax, [esp + 4] // src_argb
|
| - mov edx, [esp + 8] // dst_argb
|
| - mov ecx, [esp + 12] // shuffler
|
| - movdqa xmm5, [ecx]
|
| + movdqu xmm5, [ecx]
|
| mov ecx, [esp + 16] // pix
|
|
|
| - align 4
|
| wloop:
|
| movdqu xmm0, [eax]
|
| movdqu xmm1, [eax + 16]
|
| lea eax, [eax + 32]
|
| pshufb xmm0, xmm5
|
| pshufb xmm1, xmm5
|
| - sub ecx, 8
|
| movdqu [edx], xmm0
|
| movdqu [edx + 16], xmm1
|
| lea edx, [edx + 32]
|
| + sub ecx, 8
|
| jg wloop
|
| ret
|
| }
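ARGBShuffleRow reorders the four bytes of every pixel using the pshufb mask loaded from the shuffler argument. A hedged scalar sketch follows, assuming the first four shuffler entries give the within-pixel source index for each output byte (the SIMD masks repeat that pattern across the register):

    #include <stdint.h>

    /* Per-pixel byte reorder; e.g. a shuffler beginning {2, 1, 0, 3} swaps B and R. */
    static void ARGBShuffleScalar(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const uint8_t* shuffler, int pix) {
      for (int x = 0; x < pix; ++x) {
        for (int j = 0; j < 4; ++j) {
          dst_argb[x * 4 + j] = src_argb[x * 4 + shuffler[j]];
        }
      }
    }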
|
| @@ -6906,17 +5447,16 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
| vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
|
| mov ecx, [esp + 16] // pix
|
|
|
| - align 4
|
| wloop:
|
| vmovdqu ymm0, [eax]
|
| vmovdqu ymm1, [eax + 32]
|
| lea eax, [eax + 64]
|
| vpshufb ymm0, ymm0, ymm5
|
| vpshufb ymm1, ymm1, ymm5
|
| - sub ecx, 16
|
| vmovdqu [edx], ymm0
|
| vmovdqu [edx + 32], ymm1
|
| lea edx, [edx + 64]
|
| + sub ecx, 16
|
| jg wloop
|
|
|
| vzeroupper
|
| @@ -6967,7 +5507,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| jg shuf_any1
|
| jmp shuf99
|
|
|
| - align 4
|
| shuf_0123:
|
| movdqu xmm0, [eax]
|
| lea eax, [eax + 16]
|
| @@ -6979,13 +5518,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| pshufhw xmm1, xmm1, 01Bh
|
| pshuflw xmm1, xmm1, 01Bh
|
| packuswb xmm0, xmm1
|
| - sub ecx, 4
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg shuf_0123
|
| jmp shuf99
|
|
|
| - align 4
|
| shuf_0321:
|
| movdqu xmm0, [eax]
|
| lea eax, [eax + 16]
|
| @@ -6997,13 +5535,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| pshufhw xmm1, xmm1, 039h
|
| pshuflw xmm1, xmm1, 039h
|
| packuswb xmm0, xmm1
|
| - sub ecx, 4
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg shuf_0321
|
| jmp shuf99
|
|
|
| - align 4
|
| shuf_2103:
|
| movdqu xmm0, [eax]
|
| lea eax, [eax + 16]
|
| @@ -7015,13 +5552,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| pshufhw xmm1, xmm1, 093h
|
| pshuflw xmm1, xmm1, 093h
|
| packuswb xmm0, xmm1
|
| - sub ecx, 4
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg shuf_2103
|
| jmp shuf99
|
|
|
| - align 4
|
| shuf_3012:
|
| movdqu xmm0, [eax]
|
| lea eax, [eax + 16]
|
| @@ -7033,9 +5569,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
| pshufhw xmm1, xmm1, 0C6h
|
| pshuflw xmm1, xmm1, 0C6h
|
| packuswb xmm0, xmm1
|
| - sub ecx, 4
|
| movdqu [edx], xmm0
|
| lea edx, [edx + 16]
|
| + sub ecx, 4
|
| jg shuf_3012
|
|
|
| shuf99:
|
| @@ -7066,7 +5602,6 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edx, esi
|
|
|
| - align 4
|
| convertloop:
|
| movq xmm2, qword ptr [esi] // U
|
| movq xmm3, qword ptr [esi + edx] // V
|
| @@ -7104,7 +5639,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
|
| mov ecx, [esp + 8 + 20] // width
|
| sub edx, esi
|
|
|
| - align 4
|
| convertloop:
|
| movq xmm2, qword ptr [esi] // U
|
| movq xmm3, qword ptr [esi + edx] // V
|
| @@ -7141,7 +5675,6 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
| pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
|
|
|
| // 2 pixel loop.
|
| - align 4
|
| convertloop:
|
| // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
|
| // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
|
| @@ -7177,9 +5710,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
| cvttps2dq xmm4, xmm4
|
| packuswb xmm0, xmm4
|
| packuswb xmm0, xmm0
|
| - sub ecx, 2
|
| movq qword ptr [edx], xmm0
|
| lea edx, [edx + 8]
|
| + sub ecx, 2
|
| jg convertloop
|
| pop esi
|
| ret
|
| @@ -7203,7 +5736,6 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
| mov ecx, [esp + 16] /* width */
|
|
|
| // 2 pixel loop.
|
| - align 4
|
| convertloop:
|
| vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
|
| lea eax, [eax + 8]
|
| @@ -7217,9 +5749,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
| vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
| vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
| vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
|
| - sub ecx, 2
|
| vmovq qword ptr [edx], xmm0
|
| lea edx, [edx + 8]
|
| + sub ecx, 2
|
| jg convertloop
|
| vzeroupper
|
| ret
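Both ARGBPolynomialRow variants evaluate a cubic polynomial per channel in float and clamp the result back to a byte. A hedged sketch of one channel follows; the poly coefficient layout used here (one float per channel for each power) is an assumption for illustration and is not visible in this hunk.

    #include <stdint.h>

    /* Cubic per-channel mapping with saturation; coefficient layout is assumed. */
    static uint8_t PolynomialChannel(uint8_t value, const float* poly, int channel) {
      float x = (float)value;
      float r = poly[channel] + poly[channel + 4] * x +
                poly[channel + 8] * x * x + poly[channel + 12] * x * x * x;
      if (r < 0.0f) r = 0.0f;
      if (r > 255.0f) r = 255.0f;
      return (uint8_t)r;
    }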
|
| @@ -7239,7 +5771,6 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
|
| mov ecx, [esp + 4 + 12] /* width */
|
|
|
| // 1 pixel loop.
|
| - align 4
|
| convertloop:
|
| movzx edx, byte ptr [eax]
|
| lea eax, [eax + 4]
|
| @@ -7273,7 +5804,6 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
|
| mov ecx, [esp + 4 + 12] /* width */
|
|
|
| // 1 pixel loop.
|
| - align 4
|
| convertloop:
|
| movzx edx, byte ptr [eax]
|
| lea eax, [eax + 4]
|
| @@ -7315,7 +5845,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| pxor xmm5, xmm5
|
|
|
| // 4 pixel loop.
|
| - align 4
|
| convertloop:
|
| movdqu xmm0, qword ptr [eax] // generate luma ptr
|
| pmaddubsw xmm0, xmm3
|
| @@ -7382,9 +5911,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
| movzx edx, byte ptr [eax + 15] // copy alpha.
|
| mov byte ptr [edi + 15], dl
|
|
|
| - sub ecx, 4
|
| lea eax, [eax + 16]
|
| lea edi, [edi + 16]
|
| + sub ecx, 4
|
| jg convertloop
|
|
|
| pop edi
|
|
|