| Index: source/libvpx/third_party/libyuv/source/row_win.cc
 | 
| diff --git a/source/libvpx/third_party/libyuv/source/row_win.cc b/source/libvpx/third_party/libyuv/source/row_win.cc
 | 
| index d79c353960bdd7a7fa53d660b15c976b92f1cef6..6e9d04c0e4efcb4e3c9a0cf613313eeb9ffdf061 100644
 | 
| --- a/source/libvpx/third_party/libyuv/source/row_win.cc
 | 
| +++ b/source/libvpx/third_party/libyuv/source/row_win.cc
 | 
| @@ -24,55 +24,63 @@ extern "C" {
 | 
|  #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
 | 
|      (defined(_M_IX86) || defined(_M_X64))
 | 
|  
 | 
| -#define YG 74  /* (int8)(1.164 * 64 + 0.5) */
 | 
| -
 | 
| -#define UB 127  /* min(127,(int8)(2.018 * 64)) */
 | 
| -#define UG -25  /* (int8)(-0.391 * 64 - 0.5) */
 | 
| -#define UR 0
 | 
| -
 | 
| -#define VB 0
 | 
| -#define VG -52  /* (int8)(-0.813 * 64 - 0.5) */
 | 
| -#define VR 102  /* (int8)(1.596 * 64 + 0.5) */
 | 
| -
 | 
| -// Bias
 | 
| -#define BB UB * 128 + VB * 128
 | 
| -#define BG UG * 128 + VG * 128
 | 
| -#define BR UR * 128 + VR * 128
 | 
| -
 | 
| -static const vec8 kUVToB = {
 | 
| -  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
 | 
| -};
 | 
| -
 | 
| -static const vec8 kUVToR = {
 | 
| -  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
 | 
| -};
 | 
| -
 | 
| -static const vec8 kUVToG = {
 | 
| -  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
 | 
| +// YUV to RGB conversion constants.
 | 
| +// Y contribution to R,G,B.  Scale and bias.
 | 
| +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
 | 
| +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
 | 
| +
 | 
| +// U and V contributions to R,G,B.
 | 
| +#define UB -128 /* -min(128, round(2.018 * 64)) */
 | 
| +#define UG 25 /* -round(-0.391 * 64) */
 | 
| +#define VG 52 /* -round(-0.813 * 64) */
 | 
| +#define VR -102 /* -round(1.596 * 64) */
 | 
| +
 | 
| +// Bias values to subtract 16 from Y and 128 from U and V.
 | 
| +#define BB (UB * 128            - YGB)
 | 
| +#define BG (UG * 128 + VG * 128 - YGB)
 | 
| +#define BR            (VR * 128 - YGB)
 | 
| +
 | 
| +struct YuvConstants {
 | 
| +  lvec8 kUVToB;     // 0
 | 
| +  lvec8 kUVToG;     // 32
 | 
| +  lvec8 kUVToR;     // 64
 | 
| +  lvec16 kUVBiasB;  // 96
 | 
| +  lvec16 kUVBiasG;  // 128
 | 
| +  lvec16 kUVBiasR;  // 160
 | 
| +  lvec16 kYToRgb;   // 192
 | 
|  };
 | 
|  
 | 
| -static const vec8 kVUToB = {
 | 
| -  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
 | 
| +// BT601 constants for YUV to RGB.
 | 
| +static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
 | 
| +  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
 | 
| +    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
 | 
| +  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
 | 
| +    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
 | 
| +  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
 | 
| +    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
 | 
| +  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
 | 
| +  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
 | 
| +  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
 | 
| +  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
 | 
|  };
 | 
|  
 | 
| -static const vec8 kVUToR = {
 | 
| -  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
 | 
| +// BT601 constants for NV21 where chroma plane is VU instead of UV.
 | 
| +static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
 | 
| +  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
 | 
| +    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
 | 
| +  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
 | 
| +    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
 | 
| +  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
 | 
| +    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
 | 
| +  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
 | 
| +  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
 | 
| +  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
 | 
| +  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
 | 
|  };
 | 
|  
 | 
| -static const vec8 kVUToG = {
 | 
| -  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
 | 
| -};
 | 
| -
 | 
| -static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
 | 
| -static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
 | 
| -static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
 | 
| -static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
 | 
| -static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
 | 
| -
 | 
|  // 64 bit
 | 
|  #if defined(_M_X64)
 | 
|  
 | 
| -// Aligned destination version.
 | 
|  __declspec(align(16))
 | 
|  void I422ToARGBRow_SSSE3(const uint8* y_buf,
 | 
|                           const uint8* u_buf,
 | 
| @@ -81,60 +89,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
 | 
|                           int width) {
 | 
|    __m128i xmm0, xmm1, xmm2, xmm3;
 | 
|    const __m128i xmm5 = _mm_set1_epi8(-1);
 | 
| -  const __m128i xmm4 = _mm_setzero_si128();
 | 
| -  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
 | 
| -
 | 
| -  while (width > 0) {
 | 
| -    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
 | 
| -    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
 | 
| -    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
 | 
| -    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
 | 
| -    xmm1 = _mm_load_si128(&xmm0);
 | 
| -    xmm2 = _mm_load_si128(&xmm0);
 | 
| -    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
 | 
| -    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
 | 
| -    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
 | 
| -    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
 | 
| -    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
 | 
| -    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
 | 
| -    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
 | 
| -    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
 | 
| -    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
 | 
| -    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
 | 
| -    xmm0 = _mm_adds_epi16(xmm0, xmm3);
 | 
| -    xmm1 = _mm_adds_epi16(xmm1, xmm3);
 | 
| -    xmm2 = _mm_adds_epi16(xmm2, xmm3);
 | 
| -    xmm0 = _mm_srai_epi16(xmm0, 6);
 | 
| -    xmm1 = _mm_srai_epi16(xmm1, 6);
 | 
| -    xmm2 = _mm_srai_epi16(xmm2, 6);
 | 
| -    xmm0 = _mm_packus_epi16(xmm0, xmm0);
 | 
| -    xmm1 = _mm_packus_epi16(xmm1, xmm1);
 | 
| -    xmm2 = _mm_packus_epi16(xmm2, xmm2);
 | 
| -    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
 | 
| -    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
 | 
| -    xmm1 = _mm_load_si128(&xmm0);
 | 
| -    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
 | 
| -    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
 | 
| -
 | 
| -    _mm_store_si128((__m128i *)dst_argb, xmm0);
 | 
| -    _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
 | 
| -
 | 
| -    y_buf += 8;
 | 
| -    u_buf += 4;
 | 
| -    dst_argb += 32;
 | 
| -    width -= 8;
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -// Unaligned destination version.
 | 
| -void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
| -                                   const uint8* u_buf,
 | 
| -                                   const uint8* v_buf,
 | 
| -                                   uint8* dst_argb,
 | 
| -                                   int width) {
 | 
| -  __m128i xmm0, xmm1, xmm2, xmm3;
 | 
| -  const __m128i xmm5 = _mm_set1_epi8(-1);
 | 
| -  const __m128i xmm4 = _mm_setzero_si128();
 | 
|    const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
 | 
|  
 | 
|    while (width > 0) {
 | 
| @@ -142,18 +96,17 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
|      xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
 | 
|      xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
 | 
|      xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
 | 
| -    xmm1 = _mm_load_si128(&xmm0);
 | 
| -    xmm2 = _mm_load_si128(&xmm0);
 | 
| -    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
 | 
| -    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
 | 
| -    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
 | 
| -    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
 | 
| -    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
 | 
| -    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
 | 
| +    xmm1 = _mm_loadu_si128(&xmm0);
 | 
| +    xmm2 = _mm_loadu_si128(&xmm0);
 | 
| +    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
 | 
| +    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
 | 
| +    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
 | 
| +    xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
 | 
| +    xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
 | 
| +    xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
 | 
|      xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
 | 
| -    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
 | 
| -    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
 | 
| -    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
 | 
| +    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
 | 
| +    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
 | 
|      xmm0 = _mm_adds_epi16(xmm0, xmm3);
 | 
|      xmm1 = _mm_adds_epi16(xmm1, xmm3);
 | 
|      xmm2 = _mm_adds_epi16(xmm2, xmm3);
 | 
| @@ -165,7 +118,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
|      xmm2 = _mm_packus_epi16(xmm2, xmm2);
 | 
|      xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
 | 
|      xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
 | 
| -    xmm1 = _mm_load_si128(&xmm0);
 | 
| +    xmm1 = _mm_loadu_si128(&xmm0);
 | 
|      xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
 | 
|      xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
 | 
|  
 | 
| @@ -178,6 +131,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
|      width -= 8;
 | 
|    }
 | 
|  }
 | 
| +
 | 
|  // 32 bit
 | 
|  #else  // defined(_M_X64)
 | 
|  
 | 
| @@ -209,15 +163,10 @@ static const vec8 kARGBToVJ = {
 | 
|    -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
 | 
|  };
 | 
|  
 | 
| -// vpermd for vphaddw + vpackuswb vpermd.
 | 
| -static const lvec32 kPermdARGBToY_AVX = {
 | 
| -  0, 4, 1, 5, 2, 6, 3, 7
 | 
| -};
 | 
| -
 | 
|  // vpshufb for vphaddw + vpackuswb packed to shorts.
 | 
|  static const lvec8 kShufARGBToUV_AVX = {
 | 
|    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
 | 
| -  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
 | 
| +  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
 | 
|  };
 | 
|  
 | 
|  // Constants for BGRA.
 | 
| @@ -263,6 +212,7 @@ static const uvec8 kAddY16 = {
 | 
|    16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 | 
|  };
 | 
|  
 | 
| +// 7 bit fixed point 0.5.
 | 
|  static const vec16 kAddYJ64 = {
 | 
|    64, 64, 64, 64, 64, 64, 64, 64
 | 
|  };
 | 
| @@ -316,36 +266,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
 | 
|      pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
 | 
|      pslld      xmm5, 24
 | 
|  
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    movq       xmm0, qword ptr [eax]
 | 
| -    lea        eax,  [eax + 8]
 | 
| -    punpcklbw  xmm0, xmm0
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm0
 | 
| -    punpckhwd  xmm1, xmm1
 | 
| -    por        xmm0, xmm5
 | 
| -    por        xmm1, xmm5
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| -    lea        edx, [edx + 32]
 | 
| -    sub        ecx, 8
 | 
| -    jg         convertloop
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
 | 
| -                                  int pix) {
 | 
| -  __asm {
 | 
| -    mov        eax, [esp + 4]        // src_y
 | 
| -    mov        edx, [esp + 8]        // dst_argb
 | 
| -    mov        ecx, [esp + 12]       // pix
 | 
| -    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
 | 
| -    pslld      xmm5, 24
 | 
| -
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movq       xmm0, qword ptr [eax]
 | 
|      lea        eax,  [eax + 8]
 | 
| @@ -374,7 +294,6 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
 | 
|      pslld     xmm5, 24
 | 
|      movdqa    xmm4, kShuffleMaskRGB24ToARGB
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu    xmm0, [eax]
 | 
|      movdqu    xmm1, [eax + 16]
 | 
| @@ -386,18 +305,18 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
 | 
|      por       xmm2, xmm5
 | 
|      palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
 | 
|      pshufb    xmm0, xmm4
 | 
| -    movdqa    [edx + 32], xmm2
 | 
| +    movdqu    [edx + 32], xmm2
 | 
|      por       xmm0, xmm5
 | 
|      pshufb    xmm1, xmm4
 | 
| -    movdqa    [edx], xmm0
 | 
| +    movdqu    [edx], xmm0
 | 
|      por       xmm1, xmm5
 | 
|      palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
 | 
|      pshufb    xmm3, xmm4
 | 
| -    movdqa    [edx + 16], xmm1
 | 
| +    movdqu    [edx + 16], xmm1
 | 
|      por       xmm3, xmm5
 | 
| -    sub       ecx, 16
 | 
| -    movdqa    [edx + 48], xmm3
 | 
| +    movdqu    [edx + 48], xmm3
 | 
|      lea       edx, [edx + 64]
 | 
| +    sub       ecx, 16
 | 
|      jg        convertloop
 | 
|      ret
 | 
|    }
 | 
| @@ -414,7 +333,6 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
 | 
|      pslld     xmm5, 24
 | 
|      movdqa    xmm4, kShuffleMaskRAWToARGB
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu    xmm0, [eax]
 | 
|      movdqu    xmm1, [eax + 16]
 | 
| @@ -426,18 +344,18 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
 | 
|      por       xmm2, xmm5
 | 
|      palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
 | 
|      pshufb    xmm0, xmm4
 | 
| -    movdqa    [edx + 32], xmm2
 | 
| +    movdqu    [edx + 32], xmm2
 | 
|      por       xmm0, xmm5
 | 
|      pshufb    xmm1, xmm4
 | 
| -    movdqa    [edx], xmm0
 | 
| +    movdqu    [edx], xmm0
 | 
|      por       xmm1, xmm5
 | 
|      palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
 | 
|      pshufb    xmm3, xmm4
 | 
| -    movdqa    [edx + 16], xmm1
 | 
| +    movdqu    [edx + 16], xmm1
 | 
|      por       xmm3, xmm5
 | 
| -    sub       ecx, 16
 | 
| -    movdqa    [edx + 48], xmm3
 | 
| +    movdqu    [edx + 48], xmm3
 | 
|      lea       edx, [edx + 64]
 | 
| +    sub       ecx, 16
 | 
|      jg        convertloop
 | 
|      ret
 | 
|    }
 | 
| @@ -474,7 +392,6 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
 | 
|      sub       edx, eax
 | 
|      sub       edx, eax
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
 | 
|      movdqa    xmm1, xmm0
 | 
| @@ -491,8 +408,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
 | 
|      movdqa    xmm2, xmm1
 | 
|      punpcklbw xmm1, xmm0
 | 
|      punpckhbw xmm2, xmm0
 | 
| -    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
 | 
| -    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
 | 
| +    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
 | 
| +    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
 | 
|      lea       eax, [eax + 16]
 | 
|      sub       ecx, 8
 | 
|      jg        convertloop
 | 
| @@ -524,7 +441,6 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
 | 
|      sub       edx, eax
 | 
|      sub       edx, eax
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
 | 
|      movdqa    xmm1, xmm0
 | 
| @@ -545,8 +461,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
 | 
|      movdqa    xmm2, xmm1
 | 
|      punpcklbw xmm1, xmm0
 | 
|      punpckhbw xmm2, xmm0
 | 
| -    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
 | 
| -    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
 | 
| +    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
 | 
| +    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
 | 
|      lea       eax, [eax + 16]
 | 
|      sub       ecx, 8
 | 
|      jg        convertloop
 | 
| @@ -570,7 +486,6 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
 | 
|      sub       edx, eax
 | 
|      sub       edx, eax
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
 | 
|      movdqa    xmm2, xmm0
 | 
| @@ -585,8 +500,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
 | 
|      movdqa    xmm1, xmm0
 | 
|      punpcklbw xmm0, xmm2
 | 
|      punpckhbw xmm1, xmm2
 | 
| -    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
 | 
| -    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
 | 
| +    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
 | 
| +    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
 | 
|      lea       eax, [eax + 16]
 | 
|      sub       ecx, 8
 | 
|      jg        convertloop
 | 
| @@ -602,7 +517,6 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
 | 
|      mov       ecx, [esp + 12]  // pix
 | 
|      movdqa    xmm6, kShuffleMaskARGBToRGB24
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu    xmm0, [eax]   // fetch 16 pixels of argb
 | 
|      movdqu    xmm1, [eax + 16]
 | 
| @@ -641,7 +555,6 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
 | 
|      mov       ecx, [esp + 12]  // pix
 | 
|      movdqa    xmm6, kShuffleMaskARGBToRAW
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu    xmm0, [eax]   // fetch 16 pixels of argb
 | 
|      movdqu    xmm1, [eax + 16]
 | 
| @@ -686,9 +599,8 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 | 
|      pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
 | 
|      pslld     xmm5, 11
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
 | 
| +    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
 | 
|      movdqa    xmm1, xmm0    // B
 | 
|      movdqa    xmm2, xmm0    // G
 | 
|      pslld     xmm0, 8       // R
 | 
| @@ -726,9 +638,8 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 | 
|      pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
 | 
|      pslld     xmm7, 15
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
 | 
| +    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
 | 
|      movdqa    xmm1, xmm0    // B
 | 
|      movdqa    xmm2, xmm0    // G
 | 
|      movdqa    xmm3, xmm0    // R
 | 
| @@ -764,14 +675,13 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 | 
|      movdqa    xmm3, xmm4       // generate mask 0x00f000f0
 | 
|      psrlw     xmm3, 8
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
 | 
| +    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
 | 
|      movdqa    xmm1, xmm0
 | 
|      pand      xmm0, xmm3    // low nibble
 | 
|      pand      xmm1, xmm4    // high nibble
 | 
| -    psrl      xmm0, 4
 | 
| -    psrl      xmm1, 8
 | 
| +    psrld     xmm0, 4
 | 
| +    psrld     xmm1, 8
 | 
|      por       xmm0, xmm1
 | 
|      packuswb  xmm0, xmm0
 | 
|      lea       eax, [eax + 16]
 | 
| @@ -783,6 +693,116 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 | 
|    }
 | 
|  }
 | 
|  
 | 
| +#ifdef HAS_ARGBTORGB565ROW_AVX2
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 | 
| +  __asm {
 | 
| +    mov        eax, [esp + 4]      // src_argb
 | 
| +    mov        edx, [esp + 8]      // dst_rgb
 | 
| +    mov        ecx, [esp + 12]     // pix
 | 
| +    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
 | 
| +    vpsrld     ymm3, ymm3, 27
 | 
| +    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
 | 
| +    vpsrld     ymm4, ymm4, 26
 | 
| +    vpslld     ymm4, ymm4, 5
 | 
| +    vpcmpeqb   ymm5, ymm5, ymm5    // generate mask 0xfffff800
 | 
| +    vpslld     ymm5, ymm5, 11
 | 
| +
 | 
| + convertloop:
 | 
| +    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
 | 
| +    vpsrld     ymm2, ymm0, 5       // G
 | 
| +    vpsrld     ymm1, ymm0, 3       // B
 | 
| +    vpslld     ymm0, ymm0, 8       // R
 | 
| +    vpand      ymm2, ymm2, ymm4    // G
 | 
| +    vpand      ymm1, ymm1, ymm3    // B
 | 
| +    vpsrad     ymm0, ymm0, 16      // R
 | 
| +    vpand      ymm0, ymm0, ymm5    // R
 | 
| +    vpor       ymm1, ymm1, ymm2    // BG
 | 
| +    vpor       ymm0, ymm0, ymm1    // BGR
 | 
| +    vpackssdw  ymm0, ymm0, ymm0
 | 
| +    vpermq     ymm0, ymm0, 0xd8
 | 
| +    lea        eax, [eax + 32]
 | 
| +    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
 | 
| +    lea        edx, [edx + 16]
 | 
| +    sub        ecx, 8
 | 
| +    jg         convertloop
 | 
| +    vzeroupper
 | 
| +    ret
 | 
| +  }
 | 
| +}
 | 
| +#endif  // HAS_ARGBTORGB565ROW_AVX2
 | 
| +
 | 
| +#ifdef HAS_ARGBTOARGB1555ROW_AVX2
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 | 
| +  __asm {
 | 
| +    mov        eax, [esp + 4]      // src_argb
 | 
| +    mov        edx, [esp + 8]      // dst_rgb
 | 
| +    mov        ecx, [esp + 12]     // pix
 | 
| +    vpcmpeqb   ymm4, ymm4, ymm4
 | 
| +    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
 | 
| +    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
 | 
| +    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
 | 
| +    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
 | 
| +    vpslld     ymm7, ymm7, 15
 | 
| +
 | 
| + convertloop:
 | 
| +    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
 | 
| +    vpsrld     ymm3, ymm0, 9       // R
 | 
| +    vpsrld     ymm2, ymm0, 6       // G
 | 
| +    vpsrld     ymm1, ymm0, 3       // B
 | 
| +    vpsrad     ymm0, ymm0, 16      // A
 | 
| +    vpand      ymm3, ymm3, ymm6    // R
 | 
| +    vpand      ymm2, ymm2, ymm5    // G
 | 
| +    vpand      ymm1, ymm1, ymm4    // B
 | 
| +    vpand      ymm0, ymm0, ymm7    // A
 | 
| +    vpor       ymm0, ymm0, ymm1    // BA
 | 
| +    vpor       ymm2, ymm2, ymm3    // GR
 | 
| +    vpor       ymm0, ymm0, ymm2    // BGRA
 | 
| +    vpackssdw  ymm0, ymm0, ymm0
 | 
| +    vpermq     ymm0, ymm0, 0xd8
 | 
| +    lea        eax, [eax + 32]
 | 
| +    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
 | 
| +    lea        edx, [edx + 16]
 | 
| +    sub        ecx, 8
 | 
| +    jg         convertloop
 | 
| +    vzeroupper
 | 
| +    ret
 | 
| +  }
 | 
| +}
 | 
| +#endif  // HAS_ARGBTOARGB1555ROW_AVX2
 | 
| +
 | 
| +#ifdef HAS_ARGBTOARGB4444ROW_AVX2
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 | 
| +  __asm {
 | 
| +    mov        eax, [esp + 4]   // src_argb
 | 
| +    mov        edx, [esp + 8]   // dst_rgb
 | 
| +    mov        ecx, [esp + 12]  // pix
 | 
| +    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
 | 
| +    vpsllw     ymm4, ymm4, 12
 | 
| +    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
 | 
| +
 | 
| + convertloop:
 | 
| +    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
 | 
| +    vpand      ymm1, ymm0, ymm4    // high nibble
 | 
| +    vpand      ymm0, ymm0, ymm3    // low nibble
 | 
| +    vpsrld     ymm1, ymm1, 8
 | 
| +    vpsrld     ymm0, ymm0, 4
 | 
| +    vpor       ymm0, ymm0, ymm1
 | 
| +    vpackuswb  ymm0, ymm0, ymm0
 | 
| +    vpermq     ymm0, ymm0, 0xd8
 | 
| +    lea        eax, [eax + 32]
 | 
| +    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
 | 
| +    lea        edx, [edx + 16]
 | 
| +    sub        ecx, 8
 | 
| +    jg         convertloop
 | 
| +    vzeroupper
 | 
| +    ret
 | 
| +  }
 | 
| +}
 | 
| +#endif  // HAS_ARGBTOARGB4444ROW_AVX2
 | 
| +
 | 
|  // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
| @@ -790,15 +810,14 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      mov        eax, [esp + 4]   /* src_argb */
 | 
|      mov        edx, [esp + 8]   /* dst_y */
 | 
|      mov        ecx, [esp + 12]  /* pix */
 | 
| -    movdqa     xmm5, kAddY16
 | 
|      movdqa     xmm4, kARGBToY
 | 
| +    movdqa     xmm5, kAddY16
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| +    movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
| +    movdqu     xmm2, [eax + 32]
 | 
| +    movdqu     xmm3, [eax + 48]
 | 
|      pmaddubsw  xmm0, xmm4
 | 
|      pmaddubsw  xmm1, xmm4
 | 
|      pmaddubsw  xmm2, xmm4
 | 
| @@ -810,15 +829,16 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      psrlw      xmm2, 7
 | 
|      packuswb   xmm0, xmm2
 | 
|      paddb      xmm0, xmm5
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [edx], xmm0
 | 
| +    movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
| -// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
 | 
| +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
 | 
| +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|    __asm {
 | 
| @@ -828,12 +848,11 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      movdqa     xmm4, kARGBToYJ
 | 
|      movdqa     xmm5, kAddYJ64
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| +    movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
| +    movdqu     xmm2, [eax + 32]
 | 
| +    movdqu     xmm3, [eax + 48]
 | 
|      pmaddubsw  xmm0, xmm4
 | 
|      pmaddubsw  xmm1, xmm4
 | 
|      pmaddubsw  xmm2, xmm4
 | 
| @@ -846,15 +865,20 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      psrlw      xmm0, 7
 | 
|      psrlw      xmm2, 7
 | 
|      packuswb   xmm0, xmm2
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [edx], xmm0
 | 
| +    movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
|  #ifdef HAS_ARGBTOYROW_AVX2
 | 
| +// vpermd for vphaddw + vpackuswb vpermd.
 | 
| +static const lvec32 kPermdARGBToY_AVX = {
 | 
| +  0, 4, 1, 5, 2, 6, 3, 7
 | 
| +};
 | 
| +
 | 
|  // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
 | 
|  __declspec(naked) __declspec(align(32))
 | 
|  void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
| @@ -864,9 +888,8 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      mov        ecx, [esp + 12]  /* pix */
 | 
|      vbroadcastf128 ymm4, kARGBToY
 | 
|      vbroadcastf128 ymm5, kAddY16
 | 
| -    vmovdqa    ymm6, kPermdARGBToY_AVX
 | 
| +    vmovdqu    ymm6, kPermdARGBToY_AVX
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      vmovdqu    ymm0, [eax]
 | 
|      vmovdqu    ymm1, [eax + 32]
 | 
| @@ -883,10 +906,10 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      vpsrlw     ymm2, ymm2, 7
 | 
|      vpackuswb  ymm0, ymm0, ymm2  // mutates.
 | 
|      vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
 | 
| -    vpaddb     ymm0, ymm0, ymm5
 | 
| -    sub        ecx, 32
 | 
| +    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
 | 
|      vmovdqu    [edx], ymm0
 | 
|      lea        edx, [edx + 32]
 | 
| +    sub        ecx, 32
 | 
|      jg         convertloop
 | 
|      vzeroupper
 | 
|      ret
 | 
| @@ -904,9 +927,8 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      mov        ecx, [esp + 12]  /* pix */
 | 
|      vbroadcastf128 ymm4, kARGBToYJ
 | 
|      vbroadcastf128 ymm5, kAddYJ64
 | 
| -    vmovdqa    ymm6, kPermdARGBToY_AVX
 | 
| +    vmovdqu    ymm6, kPermdARGBToY_AVX
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      vmovdqu    ymm0, [eax]
 | 
|      vmovdqu    ymm1, [eax + 32]
 | 
| @@ -925,9 +947,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      vpsrlw     ymm2, ymm2, 7
 | 
|      vpackuswb  ymm0, ymm0, ymm2  // mutates.
 | 
|      vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
 | 
| -    sub        ecx, 32
 | 
|      vmovdqu    [edx], ymm0
 | 
|      lea        edx, [edx + 32]
 | 
| +    sub        ecx, 32
 | 
|      jg         convertloop
 | 
|  
 | 
|      vzeroupper
 | 
| @@ -937,15 +959,14 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|  #endif  //  HAS_ARGBTOYJROW_AVX2
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
| +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|    __asm {
 | 
|      mov        eax, [esp + 4]   /* src_argb */
 | 
|      mov        edx, [esp + 8]   /* dst_y */
 | 
|      mov        ecx, [esp + 12]  /* pix */
 | 
| +    movdqa     xmm4, kBGRAToY
 | 
|      movdqa     xmm5, kAddY16
 | 
| -    movdqa     xmm4, kARGBToY
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu     xmm0, [eax]
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| @@ -962,24 +983,23 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      psrlw      xmm2, 7
 | 
|      packuswb   xmm0, xmm2
 | 
|      paddb      xmm0, xmm5
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
| +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|    __asm {
 | 
|      mov        eax, [esp + 4]   /* src_argb */
 | 
|      mov        edx, [esp + 8]   /* dst_y */
 | 
|      mov        ecx, [esp + 12]  /* pix */
 | 
| -    movdqa     xmm4, kARGBToYJ
 | 
| -    movdqa     xmm5, kAddYJ64
 | 
| +    movdqa     xmm4, kABGRToY
 | 
| +    movdqa     xmm5, kAddY16
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu     xmm0, [eax]
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| @@ -992,34 +1012,32 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      lea        eax, [eax + 64]
 | 
|      phaddw     xmm0, xmm1
 | 
|      phaddw     xmm2, xmm3
 | 
| -    paddw      xmm0, xmm5
 | 
| -    paddw      xmm2, xmm5
 | 
|      psrlw      xmm0, 7
 | 
|      psrlw      xmm2, 7
 | 
|      packuswb   xmm0, xmm2
 | 
| -    sub        ecx, 16
 | 
| +    paddb      xmm0, xmm5
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
| +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|    __asm {
 | 
|      mov        eax, [esp + 4]   /* src_argb */
 | 
|      mov        edx, [esp + 8]   /* dst_y */
 | 
|      mov        ecx, [esp + 12]  /* pix */
 | 
| +    movdqa     xmm4, kRGBAToY
 | 
|      movdqa     xmm5, kAddY16
 | 
| -    movdqa     xmm4, kBGRAToY
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| +    movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
| +    movdqu     xmm2, [eax + 32]
 | 
| +    movdqu     xmm3, [eax + 48]
 | 
|      pmaddubsw  xmm0, xmm4
 | 
|      pmaddubsw  xmm1, xmm4
 | 
|      pmaddubsw  xmm2, xmm4
 | 
| @@ -1031,187 +1049,87 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
|      psrlw      xmm2, 7
 | 
|      packuswb   xmm0, xmm2
 | 
|      paddb      xmm0, xmm5
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [edx], xmm0
 | 
| +    movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
| +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| +                       uint8* dst_u, uint8* dst_v, int width) {
 | 
|    __asm {
 | 
| -    mov        eax, [esp + 4]   /* src_argb */
 | 
| -    mov        edx, [esp + 8]   /* dst_y */
 | 
| -    mov        ecx, [esp + 12]  /* pix */
 | 
| -    movdqa     xmm5, kAddY16
 | 
| -    movdqa     xmm4, kBGRAToY
 | 
| +    push       esi
 | 
| +    push       edi
 | 
| +    mov        eax, [esp + 8 + 4]   // src_argb
 | 
| +    mov        esi, [esp + 8 + 8]   // src_stride_argb
 | 
| +    mov        edx, [esp + 8 + 12]  // dst_u
 | 
| +    mov        edi, [esp + 8 + 16]  // dst_v
 | 
| +    mov        ecx, [esp + 8 + 20]  // pix
 | 
| +    movdqa     xmm5, kAddUV128
 | 
| +    movdqa     xmm6, kARGBToV
 | 
| +    movdqa     xmm7, kARGBToU
 | 
| +    sub        edi, edx             // stride from u to v
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
|      movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm4, [eax + esi]
 | 
| +    pavgb      xmm0, xmm4
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| +    movdqu     xmm4, [eax + esi + 16]
 | 
| +    pavgb      xmm1, xmm4
 | 
|      movdqu     xmm2, [eax + 32]
 | 
| +    movdqu     xmm4, [eax + esi + 32]
 | 
| +    pavgb      xmm2, xmm4
 | 
|      movdqu     xmm3, [eax + 48]
 | 
| -    pmaddubsw  xmm0, xmm4
 | 
| -    pmaddubsw  xmm1, xmm4
 | 
| -    pmaddubsw  xmm2, xmm4
 | 
| -    pmaddubsw  xmm3, xmm4
 | 
| -    lea        eax, [eax + 64]
 | 
| -    phaddw     xmm0, xmm1
 | 
| -    phaddw     xmm2, xmm3
 | 
| -    psrlw      xmm0, 7
 | 
| -    psrlw      xmm2, 7
 | 
| -    packuswb   xmm0, xmm2
 | 
| -    paddb      xmm0, xmm5
 | 
| +    movdqu     xmm4, [eax + esi + 48]
 | 
| +    pavgb      xmm3, xmm4
 | 
| +
 | 
| +    lea        eax,  [eax + 64]
 | 
| +    movdqa     xmm4, xmm0
 | 
| +    shufps     xmm0, xmm1, 0x88
 | 
| +    shufps     xmm4, xmm1, 0xdd
 | 
| +    pavgb      xmm0, xmm4
 | 
| +    movdqa     xmm4, xmm2
 | 
| +    shufps     xmm2, xmm3, 0x88
 | 
| +    shufps     xmm4, xmm3, 0xdd
 | 
| +    pavgb      xmm2, xmm4
 | 
| +
 | 
| +    // step 2 - convert to U and V
 | 
| +    // from here down is very similar to Y code except
 | 
| +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
 | 
| +    movdqa     xmm1, xmm0
 | 
| +    movdqa     xmm3, xmm2
 | 
| +    pmaddubsw  xmm0, xmm7  // U
 | 
| +    pmaddubsw  xmm2, xmm7
 | 
| +    pmaddubsw  xmm1, xmm6  // V
 | 
| +    pmaddubsw  xmm3, xmm6
 | 
| +    phaddw     xmm0, xmm2
 | 
| +    phaddw     xmm1, xmm3
 | 
| +    psraw      xmm0, 8
 | 
| +    psraw      xmm1, 8
 | 
| +    packsswb   xmm0, xmm1
 | 
| +    paddb      xmm0, xmm5            // -> unsigned
 | 
| +
 | 
| +    // step 3 - store 8 U and 8 V values
 | 
| +    movlps     qword ptr [edx], xmm0 // U
 | 
| +    movhps     qword ptr [edx + edi], xmm0 // V
 | 
| +    lea        edx, [edx + 8]
 | 
|      sub        ecx, 16
 | 
| -    movdqu     [edx], xmm0
 | 
| -    lea        edx, [edx + 16]
 | 
|      jg         convertloop
 | 
| +
 | 
| +    pop        edi
 | 
| +    pop        esi
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
| -  __asm {
 | 
| -    mov        eax, [esp + 4]   /* src_argb */
 | 
| -    mov        edx, [esp + 8]   /* dst_y */
 | 
| -    mov        ecx, [esp + 12]  /* pix */
 | 
| -    movdqa     xmm5, kAddY16
 | 
| -    movdqa     xmm4, kABGRToY
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| -    pmaddubsw  xmm0, xmm4
 | 
| -    pmaddubsw  xmm1, xmm4
 | 
| -    pmaddubsw  xmm2, xmm4
 | 
| -    pmaddubsw  xmm3, xmm4
 | 
| -    lea        eax, [eax + 64]
 | 
| -    phaddw     xmm0, xmm1
 | 
| -    phaddw     xmm2, xmm3
 | 
| -    psrlw      xmm0, 7
 | 
| -    psrlw      xmm2, 7
 | 
| -    packuswb   xmm0, xmm2
 | 
| -    paddb      xmm0, xmm5
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [edx], xmm0
 | 
| -    lea        edx, [edx + 16]
 | 
| -    jg         convertloop
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
| -  __asm {
 | 
| -    mov        eax, [esp + 4]   /* src_argb */
 | 
| -    mov        edx, [esp + 8]   /* dst_y */
 | 
| -    mov        ecx, [esp + 12]  /* pix */
 | 
| -    movdqa     xmm5, kAddY16
 | 
| -    movdqa     xmm4, kABGRToY
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    movdqu     xmm0, [eax]
 | 
| -    movdqu     xmm1, [eax + 16]
 | 
| -    movdqu     xmm2, [eax + 32]
 | 
| -    movdqu     xmm3, [eax + 48]
 | 
| -    pmaddubsw  xmm0, xmm4
 | 
| -    pmaddubsw  xmm1, xmm4
 | 
| -    pmaddubsw  xmm2, xmm4
 | 
| -    pmaddubsw  xmm3, xmm4
 | 
| -    lea        eax, [eax + 64]
 | 
| -    phaddw     xmm0, xmm1
 | 
| -    phaddw     xmm2, xmm3
 | 
| -    psrlw      xmm0, 7
 | 
| -    psrlw      xmm2, 7
 | 
| -    packuswb   xmm0, xmm2
 | 
| -    paddb      xmm0, xmm5
 | 
| -    sub        ecx, 16
 | 
| -    movdqu     [edx], xmm0
 | 
| -    lea        edx, [edx + 16]
 | 
| -    jg         convertloop
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
| -  __asm {
 | 
| -    mov        eax, [esp + 4]   /* src_argb */
 | 
| -    mov        edx, [esp + 8]   /* dst_y */
 | 
| -    mov        ecx, [esp + 12]  /* pix */
 | 
| -    movdqa     xmm5, kAddY16
 | 
| -    movdqa     xmm4, kRGBAToY
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| -    pmaddubsw  xmm0, xmm4
 | 
| -    pmaddubsw  xmm1, xmm4
 | 
| -    pmaddubsw  xmm2, xmm4
 | 
| -    pmaddubsw  xmm3, xmm4
 | 
| -    lea        eax, [eax + 64]
 | 
| -    phaddw     xmm0, xmm1
 | 
| -    phaddw     xmm2, xmm3
 | 
| -    psrlw      xmm0, 7
 | 
| -    psrlw      xmm2, 7
 | 
| -    packuswb   xmm0, xmm2
 | 
| -    paddb      xmm0, xmm5
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [edx], xmm0
 | 
| -    lea        edx, [edx + 16]
 | 
| -    jg         convertloop
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 | 
| -  __asm {
 | 
| -    mov        eax, [esp + 4]   /* src_argb */
 | 
| -    mov        edx, [esp + 8]   /* dst_y */
 | 
| -    mov        ecx, [esp + 12]  /* pix */
 | 
| -    movdqa     xmm5, kAddY16
 | 
| -    movdqa     xmm4, kRGBAToY
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    movdqu     xmm0, [eax]
 | 
| -    movdqu     xmm1, [eax + 16]
 | 
| -    movdqu     xmm2, [eax + 32]
 | 
| -    movdqu     xmm3, [eax + 48]
 | 
| -    pmaddubsw  xmm0, xmm4
 | 
| -    pmaddubsw  xmm1, xmm4
 | 
| -    pmaddubsw  xmm2, xmm4
 | 
| -    pmaddubsw  xmm3, xmm4
 | 
| -    lea        eax, [eax + 64]
 | 
| -    phaddw     xmm0, xmm1
 | 
| -    phaddw     xmm2, xmm3
 | 
| -    psrlw      xmm0, 7
 | 
| -    psrlw      xmm2, 7
 | 
| -    packuswb   xmm0, xmm2
 | 
| -    paddb      xmm0, xmm5
 | 
| -    sub        ecx, 16
 | 
| -    movdqu     [edx], xmm0
 | 
| -    lea        edx, [edx + 16]
 | 
| -    jg         convertloop
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| -                       uint8* dst_u, uint8* dst_v, int width) {
 | 
| +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| +                        uint8* dst_u, uint8* dst_v, int width) {
 | 
|    __asm {
 | 
|      push       esi
 | 
|      push       edi
 | 
| @@ -1220,88 +1138,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|      mov        edx, [esp + 8 + 12]  // dst_u
 | 
|      mov        edi, [esp + 8 + 16]  // dst_v
 | 
|      mov        ecx, [esp + 8 + 20]  // pix
 | 
| -    movdqa     xmm7, kARGBToU
 | 
| -    movdqa     xmm6, kARGBToV
 | 
| -    movdqa     xmm5, kAddUV128
 | 
| +    movdqa     xmm5, kAddUVJ128
 | 
| +    movdqa     xmm6, kARGBToVJ
 | 
| +    movdqa     xmm7, kARGBToUJ
 | 
|      sub        edi, edx             // stride from u to v
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| -    pavgb      xmm0, [eax + esi]
 | 
| -    pavgb      xmm1, [eax + esi + 16]
 | 
| -    pavgb      xmm2, [eax + esi + 32]
 | 
| -    pavgb      xmm3, [eax + esi + 48]
 | 
| -    lea        eax,  [eax + 64]
 | 
| -    movdqa     xmm4, xmm0
 | 
| -    shufps     xmm0, xmm1, 0x88
 | 
| -    shufps     xmm4, xmm1, 0xdd
 | 
| +    movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm4, [eax + esi]
 | 
|      pavgb      xmm0, xmm4
 | 
| -    movdqa     xmm4, xmm2
 | 
| -    shufps     xmm2, xmm3, 0x88
 | 
| -    shufps     xmm4, xmm3, 0xdd
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
| +    movdqu     xmm4, [eax + esi + 16]
 | 
| +    pavgb      xmm1, xmm4
 | 
| +    movdqu     xmm2, [eax + 32]
 | 
| +    movdqu     xmm4, [eax + esi + 32]
 | 
|      pavgb      xmm2, xmm4
 | 
| +    movdqu     xmm3, [eax + 48]
 | 
| +    movdqu     xmm4, [eax + esi + 48]
 | 
| +    pavgb      xmm3, xmm4
 | 
|  
 | 
| -    // step 2 - convert to U and V
 | 
| -    // from here down is very similar to Y code except
 | 
| -    // instead of 16 different pixels, its 8 pixels of U and 8 of V
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    movdqa     xmm3, xmm2
 | 
| -    pmaddubsw  xmm0, xmm7  // U
 | 
| -    pmaddubsw  xmm2, xmm7
 | 
| -    pmaddubsw  xmm1, xmm6  // V
 | 
| -    pmaddubsw  xmm3, xmm6
 | 
| -    phaddw     xmm0, xmm2
 | 
| -    phaddw     xmm1, xmm3
 | 
| -    psraw      xmm0, 8
 | 
| -    psraw      xmm1, 8
 | 
| -    packsswb   xmm0, xmm1
 | 
| -    paddb      xmm0, xmm5            // -> unsigned
 | 
| -
 | 
| -    // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
| -    movlps     qword ptr [edx], xmm0 // U
 | 
| -    movhps     qword ptr [edx + edi], xmm0 // V
 | 
| -    lea        edx, [edx + 8]
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| -                        uint8* dst_u, uint8* dst_v, int width) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // src_argb
 | 
| -    mov        esi, [esp + 8 + 8]   // src_stride_argb
 | 
| -    mov        edx, [esp + 8 + 12]  // dst_u
 | 
| -    mov        edi, [esp + 8 + 16]  // dst_v
 | 
| -    mov        ecx, [esp + 8 + 20]  // pix
 | 
| -    movdqa     xmm7, kARGBToUJ
 | 
| -    movdqa     xmm6, kARGBToVJ
 | 
| -    movdqa     xmm5, kAddUVJ128
 | 
| -    sub        edi, edx             // stride from u to v
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| -    pavgb      xmm0, [eax + esi]
 | 
| -    pavgb      xmm1, [eax + esi + 16]
 | 
| -    pavgb      xmm2, [eax + esi + 32]
 | 
| -    pavgb      xmm3, [eax + esi + 48]
 | 
|      lea        eax,  [eax + 64]
 | 
|      movdqa     xmm4, xmm0
 | 
|      shufps     xmm0, xmm1, 0x88
 | 
| @@ -1330,10 +1186,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|      packsswb   xmm0, xmm1
 | 
|  
 | 
|      // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
|      movlps     qword ptr [edx], xmm0 // U
 | 
|      movhps     qword ptr [edx + edi], xmm0 // V
 | 
|      lea        edx, [edx + 8]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| @@ -1359,7 +1215,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
 | 
|      vbroadcastf128 ymm7, kARGBToU
 | 
|      sub        edi, edx             // stride from u to v
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      /* step 1 - subsample 32x2 argb pixels to 16x1 */
 | 
|      vmovdqu    ymm0, [eax]
 | 
| @@ -1395,10 +1250,10 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
 | 
|      vpaddb     ymm0, ymm0, ymm5  // -> unsigned
 | 
|  
 | 
|      // step 3 - store 16 U and 16 V values
 | 
| -    sub         ecx, 32
 | 
|      vextractf128 [edx], ymm0, 0 // U
 | 
|      vextractf128 [edx + edi], ymm0, 1 // V
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 32
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| @@ -1410,237 +1265,36 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
 | 
|  #endif  // HAS_ARGBTOUVROW_AVX2
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| -                                 uint8* dst_u, uint8* dst_v, int width) {
 | 
| +void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
 | 
| +                          uint8* dst_u, uint8* dst_v, int width) {
 | 
|    __asm {
 | 
| -    push       esi
 | 
|      push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // src_argb
 | 
| -    mov        esi, [esp + 8 + 8]   // src_stride_argb
 | 
| -    mov        edx, [esp + 8 + 12]  // dst_u
 | 
| -    mov        edi, [esp + 8 + 16]  // dst_v
 | 
| -    mov        ecx, [esp + 8 + 20]  // pix
 | 
| -    movdqa     xmm7, kARGBToU
 | 
| -    movdqa     xmm6, kARGBToV
 | 
| +    mov        eax, [esp + 4 + 4]   // src_argb
 | 
| +    mov        edx, [esp + 4 + 8]   // dst_u
 | 
| +    mov        edi, [esp + 4 + 12]  // dst_v
 | 
| +    mov        ecx, [esp + 4 + 16]  // pix
 | 
|      movdqa     xmm5, kAddUV128
 | 
| +    movdqa     xmm6, kARGBToV
 | 
| +    movdqa     xmm7, kARGBToU
 | 
|      sub        edi, edx             // stride from u to v
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
| -    movdqu     xmm0, [eax]
 | 
| +    /* convert to U and V */
 | 
| +    movdqu     xmm0, [eax]          // U
 | 
|      movdqu     xmm1, [eax + 16]
 | 
|      movdqu     xmm2, [eax + 32]
 | 
|      movdqu     xmm3, [eax + 48]
 | 
| -    movdqu     xmm4, [eax + esi]
 | 
| -    pavgb      xmm0, xmm4
 | 
| -    movdqu     xmm4, [eax + esi + 16]
 | 
| -    pavgb      xmm1, xmm4
 | 
| -    movdqu     xmm4, [eax + esi + 32]
 | 
| -    pavgb      xmm2, xmm4
 | 
| -    movdqu     xmm4, [eax + esi + 48]
 | 
| -    pavgb      xmm3, xmm4
 | 
| -    lea        eax,  [eax + 64]
 | 
| -    movdqa     xmm4, xmm0
 | 
| -    shufps     xmm0, xmm1, 0x88
 | 
| -    shufps     xmm4, xmm1, 0xdd
 | 
| -    pavgb      xmm0, xmm4
 | 
| -    movdqa     xmm4, xmm2
 | 
| -    shufps     xmm2, xmm3, 0x88
 | 
| -    shufps     xmm4, xmm3, 0xdd
 | 
| -    pavgb      xmm2, xmm4
 | 
| -
 | 
| -    // step 2 - convert to U and V
 | 
| -    // from here down is very similar to Y code except
 | 
| -    // instead of 16 different pixels, its 8 pixels of U and 8 of V
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    movdqa     xmm3, xmm2
 | 
| -    pmaddubsw  xmm0, xmm7  // U
 | 
| +    pmaddubsw  xmm0, xmm7
 | 
| +    pmaddubsw  xmm1, xmm7
 | 
|      pmaddubsw  xmm2, xmm7
 | 
| -    pmaddubsw  xmm1, xmm6  // V
 | 
| -    pmaddubsw  xmm3, xmm6
 | 
| -    phaddw     xmm0, xmm2
 | 
| -    phaddw     xmm1, xmm3
 | 
| +    pmaddubsw  xmm3, xmm7
 | 
| +    phaddw     xmm0, xmm1
 | 
| +    phaddw     xmm2, xmm3
 | 
|      psraw      xmm0, 8
 | 
| -    psraw      xmm1, 8
 | 
| -    packsswb   xmm0, xmm1
 | 
| -    paddb      xmm0, xmm5            // -> unsigned
 | 
| -
 | 
| -    // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
| -    movlps     qword ptr [edx], xmm0 // U
 | 
| -    movhps     qword ptr [edx + edi], xmm0 // V
 | 
| -    lea        edx, [edx + 8]
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| -                                 uint8* dst_u, uint8* dst_v, int width) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // src_argb
 | 
| -    mov        esi, [esp + 8 + 8]   // src_stride_argb
 | 
| -    mov        edx, [esp + 8 + 12]  // dst_u
 | 
| -    mov        edi, [esp + 8 + 16]  // dst_v
 | 
| -    mov        ecx, [esp + 8 + 20]  // pix
 | 
| -    movdqa     xmm7, kARGBToUJ
 | 
| -    movdqa     xmm6, kARGBToVJ
 | 
| -    movdqa     xmm5, kAddUVJ128
 | 
| -    sub        edi, edx             // stride from u to v
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
| -    movdqu     xmm0, [eax]
 | 
| -    movdqu     xmm1, [eax + 16]
 | 
| -    movdqu     xmm2, [eax + 32]
 | 
| -    movdqu     xmm3, [eax + 48]
 | 
| -    movdqu     xmm4, [eax + esi]
 | 
| -    pavgb      xmm0, xmm4
 | 
| -    movdqu     xmm4, [eax + esi + 16]
 | 
| -    pavgb      xmm1, xmm4
 | 
| -    movdqu     xmm4, [eax + esi + 32]
 | 
| -    pavgb      xmm2, xmm4
 | 
| -    movdqu     xmm4, [eax + esi + 48]
 | 
| -    pavgb      xmm3, xmm4
 | 
| -    lea        eax,  [eax + 64]
 | 
| -    movdqa     xmm4, xmm0
 | 
| -    shufps     xmm0, xmm1, 0x88
 | 
| -    shufps     xmm4, xmm1, 0xdd
 | 
| -    pavgb      xmm0, xmm4
 | 
| -    movdqa     xmm4, xmm2
 | 
| -    shufps     xmm2, xmm3, 0x88
 | 
| -    shufps     xmm4, xmm3, 0xdd
 | 
| -    pavgb      xmm2, xmm4
 | 
| -
 | 
| -    // step 2 - convert to U and V
 | 
| -    // from here down is very similar to Y code except
 | 
| -    // instead of 16 different pixels, its 8 pixels of U and 8 of V
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    movdqa     xmm3, xmm2
 | 
| -    pmaddubsw  xmm0, xmm7  // U
 | 
| -    pmaddubsw  xmm2, xmm7
 | 
| -    pmaddubsw  xmm1, xmm6  // V
 | 
| -    pmaddubsw  xmm3, xmm6
 | 
| -    phaddw     xmm0, xmm2
 | 
| -    phaddw     xmm1, xmm3
 | 
| -    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
 | 
| -    paddw      xmm1, xmm5
 | 
| -    psraw      xmm0, 8
 | 
| -    psraw      xmm1, 8
 | 
| -    packsswb   xmm0, xmm1
 | 
| -
 | 
| -    // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
| -    movlps     qword ptr [edx], xmm0 // U
 | 
| -    movhps     qword ptr [edx + edi], xmm0 // V
 | 
| -    lea        edx, [edx + 8]
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
 | 
| -                          uint8* dst_u, uint8* dst_v, int width) {
 | 
| -  __asm {
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 4 + 4]   // src_argb
 | 
| -    mov        edx, [esp + 4 + 8]   // dst_u
 | 
| -    mov        edi, [esp + 4 + 12]  // dst_v
 | 
| -    mov        ecx, [esp + 4 + 16]  // pix
 | 
| -    movdqa     xmm7, kARGBToU
 | 
| -    movdqa     xmm6, kARGBToV
 | 
| -    movdqa     xmm5, kAddUV128
 | 
| -    sub        edi, edx             // stride from u to v
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    /* convert to U and V */
 | 
| -    movdqa     xmm0, [eax]          // U
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| -    pmaddubsw  xmm0, xmm7
 | 
| -    pmaddubsw  xmm1, xmm7
 | 
| -    pmaddubsw  xmm2, xmm7
 | 
| -    pmaddubsw  xmm3, xmm7
 | 
| -    phaddw     xmm0, xmm1
 | 
| -    phaddw     xmm2, xmm3
 | 
| -    psraw      xmm0, 8
 | 
| -    psraw      xmm2, 8
 | 
| -    packsswb   xmm0, xmm2
 | 
| -    paddb      xmm0, xmm5
 | 
| -    sub        ecx,  16
 | 
| -    movdqa     [edx], xmm0
 | 
| -
 | 
| -    movdqa     xmm0, [eax]          // V
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| -    pmaddubsw  xmm0, xmm6
 | 
| -    pmaddubsw  xmm1, xmm6
 | 
| -    pmaddubsw  xmm2, xmm6
 | 
| -    pmaddubsw  xmm3, xmm6
 | 
| -    phaddw     xmm0, xmm1
 | 
| -    phaddw     xmm2, xmm3
 | 
| -    psraw      xmm0, 8
 | 
| -    psraw      xmm2, 8
 | 
| -    packsswb   xmm0, xmm2
 | 
| -    paddb      xmm0, xmm5
 | 
| -    lea        eax,  [eax + 64]
 | 
| -    movdqa     [edx + edi], xmm0
 | 
| -    lea        edx,  [edx + 16]
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
 | 
| -                                    uint8* dst_u, uint8* dst_v, int width) {
 | 
| -  __asm {
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 4 + 4]   // src_argb
 | 
| -    mov        edx, [esp + 4 + 8]   // dst_u
 | 
| -    mov        edi, [esp + 4 + 12]  // dst_v
 | 
| -    mov        ecx, [esp + 4 + 16]  // pix
 | 
| -    movdqa     xmm7, kARGBToU
 | 
| -    movdqa     xmm6, kARGBToV
 | 
| -    movdqa     xmm5, kAddUV128
 | 
| -    sub        edi, edx             // stride from u to v
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    /* convert to U and V */
 | 
| -    movdqu     xmm0, [eax]          // U
 | 
| -    movdqu     xmm1, [eax + 16]
 | 
| -    movdqu     xmm2, [eax + 32]
 | 
| -    movdqu     xmm3, [eax + 48]
 | 
| -    pmaddubsw  xmm0, xmm7
 | 
| -    pmaddubsw  xmm1, xmm7
 | 
| -    pmaddubsw  xmm2, xmm7
 | 
| -    pmaddubsw  xmm3, xmm7
 | 
| -    phaddw     xmm0, xmm1
 | 
| -    phaddw     xmm2, xmm3
 | 
| -    psraw      xmm0, 8
 | 
| -    psraw      xmm2, 8
 | 
| -    packsswb   xmm0, xmm2
 | 
| -    paddb      xmm0, xmm5
 | 
| -    sub        ecx,  16
 | 
| -    movdqu     [edx], xmm0
 | 
| +    psraw      xmm2, 8
 | 
| +    packsswb   xmm0, xmm2
 | 
| +    paddb      xmm0, xmm5
 | 
| +    movdqu     [edx], xmm0
 | 
|  
 | 
|      movdqu     xmm0, [eax]          // V
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| @@ -1659,6 +1313,7 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
 | 
|      lea        eax,  [eax + 64]
 | 
|      movdqu     [edx + edi], xmm0
 | 
|      lea        edx,  [edx + 16]
 | 
| +    sub        ecx,  16
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| @@ -1666,287 +1321,26 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
 | 
|    }
 | 
|  }
 | 
|  
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
 | 
| -                          uint8* dst_u, uint8* dst_v, int width) {
 | 
| -  __asm {
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 4 + 4]   // src_argb
 | 
| -    mov        edx, [esp + 4 + 8]   // dst_u
 | 
| -    mov        edi, [esp + 4 + 12]  // dst_v
 | 
| -    mov        ecx, [esp + 4 + 16]  // pix
 | 
| -    movdqa     xmm7, kARGBToU
 | 
| -    movdqa     xmm6, kARGBToV
 | 
| -    movdqa     xmm5, kAddUV128
 | 
| -    sub        edi, edx             // stride from u to v
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| -    lea        eax,  [eax + 64]
 | 
| -    movdqa     xmm4, xmm0
 | 
| -    shufps     xmm0, xmm1, 0x88
 | 
| -    shufps     xmm4, xmm1, 0xdd
 | 
| -    pavgb      xmm0, xmm4
 | 
| -    movdqa     xmm4, xmm2
 | 
| -    shufps     xmm2, xmm3, 0x88
 | 
| -    shufps     xmm4, xmm3, 0xdd
 | 
| -    pavgb      xmm2, xmm4
 | 
| -
 | 
| -    // step 2 - convert to U and V
 | 
| -    // from here down is very similar to Y code except
 | 
| -    // instead of 16 different pixels, its 8 pixels of U and 8 of V
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    movdqa     xmm3, xmm2
 | 
| -    pmaddubsw  xmm0, xmm7  // U
 | 
| -    pmaddubsw  xmm2, xmm7
 | 
| -    pmaddubsw  xmm1, xmm6  // V
 | 
| -    pmaddubsw  xmm3, xmm6
 | 
| -    phaddw     xmm0, xmm2
 | 
| -    phaddw     xmm1, xmm3
 | 
| -    psraw      xmm0, 8
 | 
| -    psraw      xmm1, 8
 | 
| -    packsswb   xmm0, xmm1
 | 
| -    paddb      xmm0, xmm5            // -> unsigned
 | 
| -
 | 
| -    // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
| -    movlps     qword ptr [edx], xmm0 // U
 | 
| -    movhps     qword ptr [edx + edi], xmm0 // V
 | 
| -    lea        edx, [edx + 8]
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
 | 
| -                                    uint8* dst_u, uint8* dst_v, int width) {
 | 
| -  __asm {
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 4 + 4]   // src_argb
 | 
| -    mov        edx, [esp + 4 + 8]   // dst_u
 | 
| -    mov        edi, [esp + 4 + 12]  // dst_v
 | 
| -    mov        ecx, [esp + 4 + 16]  // pix
 | 
| -    movdqa     xmm7, kARGBToU
 | 
| -    movdqa     xmm6, kARGBToV
 | 
| -    movdqa     xmm5, kAddUV128
 | 
| -    sub        edi, edx             // stride from u to v
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
| -    movdqu     xmm0, [eax]
 | 
| -    movdqu     xmm1, [eax + 16]
 | 
| -    movdqu     xmm2, [eax + 32]
 | 
| -    movdqu     xmm3, [eax + 48]
 | 
| -    lea        eax,  [eax + 64]
 | 
| -    movdqa     xmm4, xmm0
 | 
| -    shufps     xmm0, xmm1, 0x88
 | 
| -    shufps     xmm4, xmm1, 0xdd
 | 
| -    pavgb      xmm0, xmm4
 | 
| -    movdqa     xmm4, xmm2
 | 
| -    shufps     xmm2, xmm3, 0x88
 | 
| -    shufps     xmm4, xmm3, 0xdd
 | 
| -    pavgb      xmm2, xmm4
 | 
| -
 | 
| -    // step 2 - convert to U and V
 | 
| -    // from here down is very similar to Y code except
 | 
| -    // instead of 16 different pixels, its 8 pixels of U and 8 of V
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    movdqa     xmm3, xmm2
 | 
| -    pmaddubsw  xmm0, xmm7  // U
 | 
| -    pmaddubsw  xmm2, xmm7
 | 
| -    pmaddubsw  xmm1, xmm6  // V
 | 
| -    pmaddubsw  xmm3, xmm6
 | 
| -    phaddw     xmm0, xmm2
 | 
| -    phaddw     xmm1, xmm3
 | 
| -    psraw      xmm0, 8
 | 
| -    psraw      xmm1, 8
 | 
| -    packsswb   xmm0, xmm1
 | 
| -    paddb      xmm0, xmm5            // -> unsigned
 | 
| -
 | 
| -    // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
| -    movlps     qword ptr [edx], xmm0 // U
 | 
| -    movhps     qword ptr [edx + edi], xmm0 // V
 | 
| -    lea        edx, [edx + 8]
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| -                       uint8* dst_u, uint8* dst_v, int width) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // src_argb
 | 
| -    mov        esi, [esp + 8 + 8]   // src_stride_argb
 | 
| -    mov        edx, [esp + 8 + 12]  // dst_u
 | 
| -    mov        edi, [esp + 8 + 16]  // dst_v
 | 
| -    mov        ecx, [esp + 8 + 20]  // pix
 | 
| -    movdqa     xmm7, kBGRAToU
 | 
| -    movdqa     xmm6, kBGRAToV
 | 
| -    movdqa     xmm5, kAddUV128
 | 
| -    sub        edi, edx             // stride from u to v
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| -    pavgb      xmm0, [eax + esi]
 | 
| -    pavgb      xmm1, [eax + esi + 16]
 | 
| -    pavgb      xmm2, [eax + esi + 32]
 | 
| -    pavgb      xmm3, [eax + esi + 48]
 | 
| -    lea        eax,  [eax + 64]
 | 
| -    movdqa     xmm4, xmm0
 | 
| -    shufps     xmm0, xmm1, 0x88
 | 
| -    shufps     xmm4, xmm1, 0xdd
 | 
| -    pavgb      xmm0, xmm4
 | 
| -    movdqa     xmm4, xmm2
 | 
| -    shufps     xmm2, xmm3, 0x88
 | 
| -    shufps     xmm4, xmm3, 0xdd
 | 
| -    pavgb      xmm2, xmm4
 | 
| -
 | 
| -    // step 2 - convert to U and V
 | 
| -    // from here down is very similar to Y code except
 | 
| -    // instead of 16 different pixels, its 8 pixels of U and 8 of V
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    movdqa     xmm3, xmm2
 | 
| -    pmaddubsw  xmm0, xmm7  // U
 | 
| -    pmaddubsw  xmm2, xmm7
 | 
| -    pmaddubsw  xmm1, xmm6  // V
 | 
| -    pmaddubsw  xmm3, xmm6
 | 
| -    phaddw     xmm0, xmm2
 | 
| -    phaddw     xmm1, xmm3
 | 
| -    psraw      xmm0, 8
 | 
| -    psraw      xmm1, 8
 | 
| -    packsswb   xmm0, xmm1
 | 
| -    paddb      xmm0, xmm5            // -> unsigned
 | 
| -
 | 
| -    // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
| -    movlps     qword ptr [edx], xmm0 // U
 | 
| -    movhps     qword ptr [edx + edi], xmm0 // V
 | 
| -    lea        edx, [edx + 8]
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| -                                 uint8* dst_u, uint8* dst_v, int width) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // src_argb
 | 
| -    mov        esi, [esp + 8 + 8]   // src_stride_argb
 | 
| -    mov        edx, [esp + 8 + 12]  // dst_u
 | 
| -    mov        edi, [esp + 8 + 16]  // dst_v
 | 
| -    mov        ecx, [esp + 8 + 20]  // pix
 | 
| -    movdqa     xmm7, kBGRAToU
 | 
| -    movdqa     xmm6, kBGRAToV
 | 
| -    movdqa     xmm5, kAddUV128
 | 
| -    sub        edi, edx             // stride from u to v
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
| -    movdqu     xmm0, [eax]
 | 
| -    movdqu     xmm1, [eax + 16]
 | 
| -    movdqu     xmm2, [eax + 32]
 | 
| -    movdqu     xmm3, [eax + 48]
 | 
| -    movdqu     xmm4, [eax + esi]
 | 
| -    pavgb      xmm0, xmm4
 | 
| -    movdqu     xmm4, [eax + esi + 16]
 | 
| -    pavgb      xmm1, xmm4
 | 
| -    movdqu     xmm4, [eax + esi + 32]
 | 
| -    pavgb      xmm2, xmm4
 | 
| -    movdqu     xmm4, [eax + esi + 48]
 | 
| -    pavgb      xmm3, xmm4
 | 
| -    lea        eax,  [eax + 64]
 | 
| -    movdqa     xmm4, xmm0
 | 
| -    shufps     xmm0, xmm1, 0x88
 | 
| -    shufps     xmm4, xmm1, 0xdd
 | 
| -    pavgb      xmm0, xmm4
 | 
| -    movdqa     xmm4, xmm2
 | 
| -    shufps     xmm2, xmm3, 0x88
 | 
| -    shufps     xmm4, xmm3, 0xdd
 | 
| -    pavgb      xmm2, xmm4
 | 
| -
 | 
| -    // step 2 - convert to U and V
 | 
| -    // from here down is very similar to Y code except
 | 
| -    // instead of 16 different pixels, its 8 pixels of U and 8 of V
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    movdqa     xmm3, xmm2
 | 
| -    pmaddubsw  xmm0, xmm7  // U
 | 
| -    pmaddubsw  xmm2, xmm7
 | 
| -    pmaddubsw  xmm1, xmm6  // V
 | 
| -    pmaddubsw  xmm3, xmm6
 | 
| -    phaddw     xmm0, xmm2
 | 
| -    phaddw     xmm1, xmm3
 | 
| -    psraw      xmm0, 8
 | 
| -    psraw      xmm1, 8
 | 
| -    packsswb   xmm0, xmm1
 | 
| -    paddb      xmm0, xmm5            // -> unsigned
 | 
| -
 | 
| -    // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
| -    movlps     qword ptr [edx], xmm0 // U
 | 
| -    movhps     qword ptr [edx + edi], xmm0 // V
 | 
| -    lea        edx, [edx + 8]
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| -                       uint8* dst_u, uint8* dst_v, int width) {
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
 | 
| +                          uint8* dst_u, uint8* dst_v, int width) {
 | 
|    __asm {
 | 
| -    push       esi
 | 
|      push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // src_argb
 | 
| -    mov        esi, [esp + 8 + 8]   // src_stride_argb
 | 
| -    mov        edx, [esp + 8 + 12]  // dst_u
 | 
| -    mov        edi, [esp + 8 + 16]  // dst_v
 | 
| -    mov        ecx, [esp + 8 + 20]  // pix
 | 
| -    movdqa     xmm7, kABGRToU
 | 
| -    movdqa     xmm6, kABGRToV
 | 
| +    mov        eax, [esp + 4 + 4]   // src_argb
 | 
| +    mov        edx, [esp + 4 + 8]   // dst_u
 | 
| +    mov        edi, [esp + 4 + 12]  // dst_v
 | 
| +    mov        ecx, [esp + 4 + 16]  // pix
 | 
|      movdqa     xmm5, kAddUV128
 | 
| +    movdqa     xmm6, kARGBToV
 | 
| +    movdqa     xmm7, kARGBToU
 | 
|      sub        edi, edx             // stride from u to v
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| -    pavgb      xmm0, [eax + esi]
 | 
| -    pavgb      xmm1, [eax + esi + 16]
 | 
| -    pavgb      xmm2, [eax + esi + 32]
 | 
| -    pavgb      xmm3, [eax + esi + 48]
 | 
| +    movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
| +    movdqu     xmm2, [eax + 32]
 | 
| +    movdqu     xmm3, [eax + 48]
 | 
|      lea        eax,  [eax + 64]
 | 
|      movdqa     xmm4, xmm0
 | 
|      shufps     xmm0, xmm1, 0x88
 | 
| @@ -1974,21 +1368,20 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|      paddb      xmm0, xmm5            // -> unsigned
 | 
|  
 | 
|      // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
|      movlps     qword ptr [edx], xmm0 // U
 | 
|      movhps     qword ptr [edx + edi], xmm0 // V
 | 
|      lea        edx, [edx + 8]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| -    pop        esi
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| -                                 uint8* dst_u, uint8* dst_v, int width) {
 | 
| +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| +                       uint8* dst_u, uint8* dst_v, int width) {
 | 
|    __asm {
 | 
|      push       esi
 | 
|      push       edi
 | 
| @@ -1997,26 +1390,26 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|      mov        edx, [esp + 8 + 12]  // dst_u
 | 
|      mov        edi, [esp + 8 + 16]  // dst_v
 | 
|      mov        ecx, [esp + 8 + 20]  // pix
 | 
| -    movdqa     xmm7, kABGRToU
 | 
| -    movdqa     xmm6, kABGRToV
 | 
|      movdqa     xmm5, kAddUV128
 | 
| +    movdqa     xmm6, kBGRAToV
 | 
| +    movdqa     xmm7, kBGRAToU
 | 
|      sub        edi, edx             // stride from u to v
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
|      movdqu     xmm0, [eax]
 | 
| -    movdqu     xmm1, [eax + 16]
 | 
| -    movdqu     xmm2, [eax + 32]
 | 
| -    movdqu     xmm3, [eax + 48]
 | 
|      movdqu     xmm4, [eax + esi]
 | 
|      pavgb      xmm0, xmm4
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
|      movdqu     xmm4, [eax + esi + 16]
 | 
|      pavgb      xmm1, xmm4
 | 
| +    movdqu     xmm2, [eax + 32]
 | 
|      movdqu     xmm4, [eax + esi + 32]
 | 
|      pavgb      xmm2, xmm4
 | 
| +    movdqu     xmm3, [eax + 48]
 | 
|      movdqu     xmm4, [eax + esi + 48]
 | 
|      pavgb      xmm3, xmm4
 | 
| +
 | 
|      lea        eax,  [eax + 64]
 | 
|      movdqa     xmm4, xmm0
 | 
|      shufps     xmm0, xmm1, 0x88
 | 
| @@ -2044,10 +1437,10 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|      paddb      xmm0, xmm5            // -> unsigned
 | 
|  
 | 
|      // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
|      movlps     qword ptr [edx], xmm0 // U
 | 
|      movhps     qword ptr [edx + edi], xmm0 // V
 | 
|      lea        edx, [edx + 8]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| @@ -2057,7 +1450,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|                         uint8* dst_u, uint8* dst_v, int width) {
 | 
|    __asm {
 | 
|      push       esi
 | 
| @@ -2067,22 +1460,26 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|      mov        edx, [esp + 8 + 12]  // dst_u
 | 
|      mov        edi, [esp + 8 + 16]  // dst_v
 | 
|      mov        ecx, [esp + 8 + 20]  // pix
 | 
| -    movdqa     xmm7, kRGBAToU
 | 
| -    movdqa     xmm6, kRGBAToV
 | 
|      movdqa     xmm5, kAddUV128
 | 
| +    movdqa     xmm6, kABGRToV
 | 
| +    movdqa     xmm7, kABGRToU
 | 
|      sub        edi, edx             // stride from u to v
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| -    pavgb      xmm0, [eax + esi]
 | 
| -    pavgb      xmm1, [eax + esi + 16]
 | 
| -    pavgb      xmm2, [eax + esi + 32]
 | 
| -    pavgb      xmm3, [eax + esi + 48]
 | 
| +    movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm4, [eax + esi]
 | 
| +    pavgb      xmm0, xmm4
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
| +    movdqu     xmm4, [eax + esi + 16]
 | 
| +    pavgb      xmm1, xmm4
 | 
| +    movdqu     xmm2, [eax + 32]
 | 
| +    movdqu     xmm4, [eax + esi + 32]
 | 
| +    pavgb      xmm2, xmm4
 | 
| +    movdqu     xmm3, [eax + 48]
 | 
| +    movdqu     xmm4, [eax + esi + 48]
 | 
| +    pavgb      xmm3, xmm4
 | 
| +
 | 
|      lea        eax,  [eax + 64]
 | 
|      movdqa     xmm4, xmm0
 | 
|      shufps     xmm0, xmm1, 0x88
 | 
| @@ -2110,10 +1507,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|      paddb      xmm0, xmm5            // -> unsigned
 | 
|  
 | 
|      // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
|      movlps     qword ptr [edx], xmm0 // U
 | 
|      movhps     qword ptr [edx + edi], xmm0 // V
 | 
|      lea        edx, [edx + 8]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| @@ -2123,8 +1520,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| -                                 uint8* dst_u, uint8* dst_v, int width) {
 | 
| +void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
| +                       uint8* dst_u, uint8* dst_v, int width) {
 | 
|    __asm {
 | 
|      push       esi
 | 
|      push       edi
 | 
| @@ -2133,26 +1530,26 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|      mov        edx, [esp + 8 + 12]  // dst_u
 | 
|      mov        edi, [esp + 8 + 16]  // dst_v
 | 
|      mov        ecx, [esp + 8 + 20]  // pix
 | 
| -    movdqa     xmm7, kRGBAToU
 | 
| -    movdqa     xmm6, kRGBAToV
 | 
|      movdqa     xmm5, kAddUV128
 | 
| +    movdqa     xmm6, kRGBAToV
 | 
| +    movdqa     xmm7, kRGBAToU
 | 
|      sub        edi, edx             // stride from u to v
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      /* step 1 - subsample 16x2 argb pixels to 8x1 */
 | 
|      movdqu     xmm0, [eax]
 | 
| -    movdqu     xmm1, [eax + 16]
 | 
| -    movdqu     xmm2, [eax + 32]
 | 
| -    movdqu     xmm3, [eax + 48]
 | 
|      movdqu     xmm4, [eax + esi]
 | 
|      pavgb      xmm0, xmm4
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
|      movdqu     xmm4, [eax + esi + 16]
 | 
|      pavgb      xmm1, xmm4
 | 
| +    movdqu     xmm2, [eax + 32]
 | 
|      movdqu     xmm4, [eax + esi + 32]
 | 
|      pavgb      xmm2, xmm4
 | 
| +    movdqu     xmm3, [eax + 48]
 | 
|      movdqu     xmm4, [eax + esi + 48]
 | 
|      pavgb      xmm3, xmm4
 | 
| +
 | 
|      lea        eax,  [eax + 64]
 | 
|      movdqa     xmm4, xmm0
 | 
|      shufps     xmm0, xmm1, 0x88
 | 
| @@ -2180,10 +1577,10 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|      paddb      xmm0, xmm5            // -> unsigned
 | 
|  
 | 
|      // step 3 - store 8 U and 8 V values
 | 
| -    sub        ecx, 16
 | 
|      movlps     qword ptr [edx], xmm0 // U
 | 
|      movhps     qword ptr [edx + edi], xmm0 // V
 | 
|      lea        edx, [edx + 8]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| @@ -2193,36 +1590,68 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 | 
|  }
 | 
|  #endif  // HAS_ARGBTOYROW_SSSE3
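The ToUV row functions above all follow the three steps spelled out in their comments: box-filter the ARGB input down to one pixel per output chroma sample, multiply by the per-format coefficient vectors (kARGBToU/kARGBToV, kBGRAToU/kBGRAToV, etc., defined earlier in this file), and re-bias the signed result with kAddUV128. A rough scalar model of steps 2 and 3, with the coefficient values left symbolic and the helper name purely illustrative (not part of libyuv):

    // pmaddubsw + phaddw + psraw 8 computes (b*kb + g*kg + r*kr) >> 8 per
    // averaged pixel; packsswb + paddb 0x80 then re-biases it to unsigned.
    // Assumes an arithmetic right shift of the (possibly negative) sum.
    static unsigned char UVComponent(int b, int g, int r,
                                     int kb, int kg, int kr) {
      return (unsigned char)(((b * kb + g * kg + r * kr) >> 8) + 128);
    }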
 | 
|  
 | 
| -#ifdef HAS_I422TOARGBROW_AVX2
 | 
| -
 | 
| -static const lvec8 kUVToB_AVX = {
 | 
| -  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
 | 
| -  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
 | 
| -};
 | 
| -static const lvec8 kUVToR_AVX = {
 | 
| -  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
 | 
| -  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
 | 
| -};
 | 
| -static const lvec8 kUVToG_AVX = {
 | 
| -  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
 | 
| -  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
 | 
| -};
 | 
| -static const lvec16 kYToRgb_AVX = {
 | 
| -  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
 | 
| -};
 | 
| -static const lvec16 kYSub16_AVX = {
 | 
| -  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 | 
| -};
 | 
| -static const lvec16 kUVBiasB_AVX = {
 | 
| -  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
 | 
| -};
 | 
| -static const lvec16 kUVBiasG_AVX = {
 | 
| -  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
 | 
| -};
 | 
| -static const lvec16 kUVBiasR_AVX = {
 | 
| -  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
 | 
| -};
 | 
| +// Read 8 UV from 422, upsample to 16 UV.
 | 
| +#define READYUV422_AVX2 __asm {                                                \
 | 
| +    __asm vmovq      xmm0, qword ptr [esi]        /* U */         /* NOLINT */ \
 | 
| +    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */         /* NOLINT */ \
 | 
| +    __asm lea        esi,  [esi + 8]                                           \
 | 
| +    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
 | 
| +    __asm vpermq     ymm0, ymm0, 0xd8                                          \
 | 
| +    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
 | 
| +  }
 | 
| +
 | 
| +// Read 8 UV from NV12, upsample to 16 UV.
 | 
| +#define READNV12_AVX2 __asm {                                                  \
 | 
| +    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
 | 
| +    __asm lea        esi,  [esi + 16]                                          \
 | 
| +    __asm vpermq     ymm0, ymm0, 0xd8                                          \
 | 
| +    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
 | 
| +  }
 | 
| +
 | 
| +// Convert 16 pixels: 16 UV and 16 Y.
 | 
| +#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
 | 
| +    /* Step 1: Find 8 UV contributions to 16 R,G,B values */                   \
 | 
| +    __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR        /* scale R UV */   \
 | 
| +    __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG        /* scale G UV */   \
 | 
| +    __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB        /* scale B UV */   \
 | 
| +    __asm vmovdqu    ymm3, YuvConstants.kUVBiasR                               \
 | 
| +    __asm vpsubw     ymm2, ymm3, ymm2                                          \
 | 
| +    __asm vmovdqu    ymm3, YuvConstants.kUVBiasG                               \
 | 
| +    __asm vpsubw     ymm1, ymm3, ymm1                                          \
 | 
| +    __asm vmovdqu    ymm3, YuvConstants.kUVBiasB                               \
 | 
| +    __asm vpsubw     ymm0, ymm3, ymm0                                          \
 | 
| +    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
 | 
| +    __asm vmovdqu    xmm3, [eax]                  /* NOLINT */                 \
 | 
| +    __asm lea        eax, [eax + 16]                                           \
 | 
| +    __asm vpermq     ymm3, ymm3, 0xd8                                          \
 | 
| +    __asm vpunpcklbw ymm3, ymm3, ymm3                                          \
 | 
| +    __asm vpmulhuw   ymm3, ymm3, YuvConstants.kYToRgb                          \
 | 
| +    __asm vpaddsw    ymm0, ymm0, ymm3           /* B += Y */                   \
 | 
| +    __asm vpaddsw    ymm1, ymm1, ymm3           /* G += Y */                   \
 | 
| +    __asm vpaddsw    ymm2, ymm2, ymm3           /* R += Y */                   \
 | 
| +    __asm vpsraw     ymm0, ymm0, 6                                             \
 | 
| +    __asm vpsraw     ymm1, ymm1, 6                                             \
 | 
| +    __asm vpsraw     ymm2, ymm2, 6                                             \
 | 
| +    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
 | 
| +    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
 | 
| +    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
 | 
| +  }
 | 
| +
 | 
| +// Store 16 ARGB values.
 | 
| +#define STOREARGB_AVX2 __asm {                                                 \
 | 
| +    /* Step 3: Weave into ARGB */                                              \
 | 
| +    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
 | 
| +    __asm vpermq     ymm0, ymm0, 0xd8                                          \
 | 
| +    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
 | 
| +    __asm vpermq     ymm2, ymm2, 0xd8                                          \
 | 
| +    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
 | 
| +    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
 | 
| +    __asm vmovdqu    [edx], ymm1                                               \
 | 
| +    __asm vmovdqu    [edx + 32], ymm0                                          \
 | 
| +    __asm lea        edx,  [edx + 64]                                          \
 | 
| +  }
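Together these macros implement the fixed-point BT601 transform built from the YG/YGB/UB/UG/VG/VR constants and the BB/BG/BR biases defined at the top of this patch: READYUV422_AVX2 (or READNV12_AVX2) supplies an upsampled UV pair per pixel, YUVTORGB_AVX2 does the arithmetic, and STOREARGB_AVX2 weaves in the constant alpha. A minimal scalar sketch of the same per-pixel math; the helper names are illustrative, not part of libyuv:

    static unsigned char Clamp255(int v) {
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    // One pixel of the transform that YUVTORGB_AVX2 / YUVTORGB vectorize.
    static void YuvToRgbPixel(unsigned char y, unsigned char u, unsigned char v,
                              unsigned char* b, unsigned char* g,
                              unsigned char* r) {
      // vpunpcklbw ymm3, ymm3, ymm3 replicates Y into a 16-bit word (y * 0x0101);
      // vpmulhuw by YG keeps the high 16 bits, i.e. roughly 1.164 * 64 * y.
      int y1 = (int)(((unsigned)(y * 0x0101) * YG) >> 16);
      // The UV products are subtracted from the biases (vpsubw), Y is added
      // (vpaddsw), and the 6 fraction bits are shifted out before packing.
      *b = Clamp255((y1 + BB - u * UB) >> 6);
      *g = Clamp255((y1 + BG - (u * UG + v * VG)) >> 6);
      *r = Clamp255((y1 + BR - v * VR) >> 6);
    }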
 | 
|  
 | 
| +#ifdef HAS_I422TOARGBROW_AVX2
 | 
|  // 16 pixels
 | 
|  // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| @@ -2241,63 +1670,222 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
 | 
|      mov        ecx, [esp + 8 + 20]  // width
 | 
|      sub        edi, esi
 | 
|      vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 | 
| -    vpxor      ymm4, ymm4, ymm4
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    vmovq      xmm0, qword ptr [esi]          //  U
 | 
| -    vmovq      xmm1, qword ptr [esi + edi]    //  V
 | 
| -    lea        esi,  [esi + 8]
 | 
| -    vpunpcklbw ymm0, ymm0, ymm1               // UV
 | 
| -    vpermq     ymm0, ymm0, 0xd8
 | 
| -    vpunpcklwd ymm0, ymm0, ymm0              // UVUV
 | 
| -    vpmaddubsw ymm2, ymm0, kUVToB_AVX        // scale B UV
 | 
| -    vpmaddubsw ymm1, ymm0, kUVToG_AVX        // scale G UV
 | 
| -    vpmaddubsw ymm0, ymm0, kUVToR_AVX        // scale R UV
 | 
| -    vpsubw     ymm2, ymm2, kUVBiasB_AVX      // unbias back to signed
 | 
| -    vpsubw     ymm1, ymm1, kUVBiasG_AVX
 | 
| -    vpsubw     ymm0, ymm0, kUVBiasR_AVX
 | 
| -
 | 
| -    // Step 2: Find Y contribution to 16 R,G,B values
 | 
| -    vmovdqu    xmm3, [eax]                  // NOLINT
 | 
| -    lea        eax, [eax + 16]
 | 
| -    vpermq     ymm3, ymm3, 0xd8
 | 
| -    vpunpcklbw ymm3, ymm3, ymm4
 | 
| -    vpsubsw    ymm3, ymm3, kYSub16_AVX
 | 
| -    vpmullw    ymm3, ymm3, kYToRgb_AVX
 | 
| -    vpaddsw    ymm2, ymm2, ymm3           // B += Y
 | 
| -    vpaddsw    ymm1, ymm1, ymm3           // G += Y
 | 
| -    vpaddsw    ymm0, ymm0, ymm3           // R += Y
 | 
| -    vpsraw     ymm2, ymm2, 6
 | 
| -    vpsraw     ymm1, ymm1, 6
 | 
| -    vpsraw     ymm0, ymm0, 6
 | 
| -    vpackuswb  ymm2, ymm2, ymm2           // B
 | 
| -    vpackuswb  ymm1, ymm1, ymm1           // G
 | 
| -    vpackuswb  ymm0, ymm0, ymm0           // R
 | 
| -
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    vpunpcklbw ymm2, ymm2, ymm1           // BG
 | 
| +    READYUV422_AVX2
 | 
| +    YUVTORGB_AVX2(kYuvConstants)
 | 
| +    STOREARGB_AVX2
 | 
| +
 | 
| +    sub        ecx, 16
 | 
| +    jg         convertloop
 | 
| +
 | 
| +    pop        edi
 | 
| +    pop        esi
 | 
| +    vzeroupper
 | 
| +    ret
 | 
| +  }
 | 
| +}
 | 
| +#endif  // HAS_I422TOARGBROW_AVX2
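The loop body handles 16 pixels per iteration (sub ecx, 16 / jg convertloop), so this row function is meant to be driven once per row by a higher-level converter, which also provides a fallback when AVX2 is unavailable or the width needs a remainder pass. A hypothetical caller sketch; the pointer and stride names are placeholders, not part of this patch:

    // I422 chroma has full vertical resolution, so U and V advance every row.
    for (int row = 0; row < height; ++row) {
      I422ToARGBRow_AVX2(src_y + row * src_stride_y,
                         src_u + row * src_stride_u,
                         src_v + row * src_stride_v,
                         dst_argb + row * dst_stride_argb,
                         width);
    }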
 | 
| +
 | 
| +#ifdef HAS_NV12TOARGBROW_AVX2
 | 
| +// 16 pixels.
 | 
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void NV12ToARGBRow_AVX2(const uint8* y_buf,
 | 
| +                        const uint8* uv_buf,
 | 
| +                        uint8* dst_argb,
 | 
| +                        int width) {
 | 
| +  __asm {
 | 
| +    push       esi
 | 
| +    mov        eax, [esp + 4 + 4]   // Y
 | 
| +    mov        esi, [esp + 4 + 8]   // UV
 | 
| +    mov        edx, [esp + 4 + 12]  // argb
 | 
| +    mov        ecx, [esp + 4 + 16]  // width
 | 
| +    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 | 
| +
 | 
| + convertloop:
 | 
| +    READNV12_AVX2
 | 
| +    YUVTORGB_AVX2(kYuvConstants)
 | 
| +    STOREARGB_AVX2
 | 
| +
 | 
| +    sub        ecx, 16
 | 
| +    jg         convertloop
 | 
| +
 | 
| +    pop        esi
 | 
| +    ret
 | 
| +  }
 | 
| +}
 | 
| +#endif  // HAS_NV12TOARGBROW_AVX2
 | 
| +
 | 
| +#ifdef HAS_NV21TOARGBROW_AVX2
 | 
| +// 16 pixels.
 | 
| +// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void NV21ToARGBRow_AVX2(const uint8* y_buf,
 | 
| +                        const uint8* uv_buf,
 | 
| +                        uint8* dst_argb,
 | 
| +                        int width) {
 | 
| +  __asm {
 | 
| +    push       esi
 | 
| +    mov        eax, [esp + 4 + 4]   // Y
 | 
| +    mov        esi, [esp + 4 + 8]   // UV
 | 
| +    mov        edx, [esp + 4 + 12]  // argb
 | 
| +    mov        ecx, [esp + 4 + 16]  // width
 | 
| +    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 | 
| +
 | 
| + convertloop:
 | 
| +    READNV12_AVX2
 | 
| +    YUVTORGB_AVX2(kYvuConstants)
 | 
| +    STOREARGB_AVX2
 | 
| +
 | 
| +    sub        ecx, 16
 | 
| +    jg         convertloop
 | 
| +
 | 
| +    pop        esi
 | 
| +    ret
 | 
| +  }
 | 
| +}
 | 
| +#endif  // HAS_NV21TOARGBROW_AVX2
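NV21 differs from NV12 only in that the interleaved chroma bytes are ordered V,U rather than U,V, so the same READNV12_AVX2 read is reused and only kYvuConstants (whose multipliers are swapped within each pair) is passed to YUVTORGB_AVX2. A small sketch of the per-pair vpmaddubsw step this relies on; the helper name is illustrative:

    // vpmaddubsw over one chroma byte pair: unsigned data bytes times signed
    // coefficient bytes, summed into one 16-bit term.
    static int ChromaTerm(const unsigned char pair[2], const signed char k[2]) {
      return pair[0] * k[0] + pair[1] * k[1];
    }
    // NV12 pairs are {U, V} against e.g. {UG, VG}; NV21 pairs are {V, U}
    // against the swapped {VG, UG} from kYvuConstants, so the term is identical.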
 | 
| +
 | 
| +#ifdef HAS_I422TOBGRAROW_AVX2
 | 
| +// 16 pixels
 | 
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
 | 
| +// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void I422ToBGRARow_AVX2(const uint8* y_buf,
 | 
| +                        const uint8* u_buf,
 | 
| +                        const uint8* v_buf,
 | 
| +                        uint8* dst_argb,
 | 
| +                        int width) {
 | 
| +  __asm {
 | 
| +    push       esi
 | 
| +    push       edi
 | 
| +    mov        eax, [esp + 8 + 4]   // Y
 | 
| +    mov        esi, [esp + 8 + 8]   // U
 | 
| +    mov        edi, [esp + 8 + 12]  // V
 | 
| +    mov        edx, [esp + 8 + 16]  // argb
 | 
| +    mov        ecx, [esp + 8 + 20]  // width
 | 
| +    sub        edi, esi
 | 
| +    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 | 
| +
 | 
| + convertloop:
 | 
| +    READYUV422_AVX2
 | 
| +    YUVTORGB_AVX2(kYuvConstants)
 | 
| +
 | 
| +    // Step 3: Weave into BGRA
 | 
| +    vpunpcklbw ymm1, ymm1, ymm0           // GB
 | 
| +    vpermq     ymm1, ymm1, 0xd8
 | 
| +    vpunpcklbw ymm2, ymm5, ymm2           // AR
 | 
|      vpermq     ymm2, ymm2, 0xd8
 | 
| -    vpunpcklbw ymm0, ymm0, ymm5           // RA
 | 
| -    vpermq     ymm0, ymm0, 0xd8
 | 
| -    vpunpcklwd ymm1, ymm2, ymm0           // BGRA first 8 pixels
 | 
| -    vpunpckhwd ymm2, ymm2, ymm0           // BGRA next 8 pixels
 | 
| -    vmovdqu    [edx], ymm1
 | 
| +    vpunpcklwd ymm0, ymm2, ymm1           // ARGB first 8 pixels
 | 
| +    vpunpckhwd ymm2, ymm2, ymm1           // ARGB next 8 pixels
 | 
| +    vmovdqu    [edx], ymm0
 | 
|      vmovdqu    [edx + 32], ymm2
 | 
|      lea        edx,  [edx + 64]
 | 
|      sub        ecx, 16
 | 
|      jg         convertloop
 | 
| +
 | 
| +    pop        edi
 | 
| +    pop        esi
 | 
|      vzeroupper
 | 
| +    ret
 | 
| +  }
 | 
| +}
 | 
| +#endif  // HAS_I422TOBGRAROW_AVX2
 | 
| +
 | 
| +#ifdef HAS_I422TORGBAROW_AVX2
 | 
| +// 16 pixels
 | 
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
 | 
| +// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void I422ToRGBARow_AVX2(const uint8* y_buf,
 | 
| +                        const uint8* u_buf,
 | 
| +                        const uint8* v_buf,
 | 
| +                        uint8* dst_argb,
 | 
| +                        int width) {
 | 
| +  __asm {
 | 
| +    push       esi
 | 
| +    push       edi
 | 
| +    mov        eax, [esp + 8 + 4]   // Y
 | 
| +    mov        esi, [esp + 8 + 8]   // U
 | 
| +    mov        edi, [esp + 8 + 12]  // V
 | 
| +    mov        edx, [esp + 8 + 16]  // argb
 | 
| +    mov        ecx, [esp + 8 + 20]  // width
 | 
| +    sub        edi, esi
 | 
| +    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 | 
| +
 | 
| + convertloop:
 | 
| +    READYUV422_AVX2
 | 
| +    YUVTORGB_AVX2(kYuvConstants)
 | 
| +
 | 
| +    // Step 3: Weave into RGBA
 | 
| +    vpunpcklbw ymm1, ymm1, ymm2           // GR
 | 
| +    vpermq     ymm1, ymm1, 0xd8
 | 
| +    vpunpcklbw ymm2, ymm5, ymm0           // AB
 | 
| +    vpermq     ymm2, ymm2, 0xd8
 | 
| +    vpunpcklwd ymm0, ymm2, ymm1           // ABGR first 8 pixels
 | 
| +    vpunpckhwd ymm1, ymm2, ymm1           // ABGR next 8 pixels
 | 
| +    vmovdqu    [edx], ymm0
 | 
| +    vmovdqu    [edx + 32], ymm1
 | 
| +    lea        edx,  [edx + 64]
 | 
| +    sub        ecx, 16
 | 
| +    jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
|      pop        esi
 | 
| +    vzeroupper
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
| -#endif  // HAS_I422TOARGBROW_AVX2
 | 
| +#endif  // HAS_I422TORGBAROW_AVX2
 | 
| +
 | 
| +#ifdef HAS_I422TOABGRROW_AVX2
 | 
| +// 16 pixels
 | 
| +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
 | 
| +// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void I422ToABGRRow_AVX2(const uint8* y_buf,
 | 
| +                        const uint8* u_buf,
 | 
| +                        const uint8* v_buf,
 | 
| +                        uint8* dst_argb,
 | 
| +                        int width) {
 | 
| +  __asm {
 | 
| +    push       esi
 | 
| +    push       edi
 | 
| +    mov        eax, [esp + 8 + 4]   // Y
 | 
| +    mov        esi, [esp + 8 + 8]   // U
 | 
| +    mov        edi, [esp + 8 + 12]  // V
 | 
| +    mov        edx, [esp + 8 + 16]  // argb
 | 
| +    mov        ecx, [esp + 8 + 20]  // width
 | 
| +    sub        edi, esi
 | 
| +    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 | 
| +
 | 
| + convertloop:
 | 
| +    READYUV422_AVX2
 | 
| +    YUVTORGB_AVX2(kYuvConstants)
 | 
|  
 | 
| -#ifdef HAS_I422TOARGBROW_SSSE3
 | 
| +    // Step 3: Weave into ABGR
 | 
| +    vpunpcklbw ymm1, ymm2, ymm1           // RG
 | 
| +    vpermq     ymm1, ymm1, 0xd8
 | 
| +    vpunpcklbw ymm2, ymm0, ymm5           // BA
 | 
| +    vpermq     ymm2, ymm2, 0xd8
 | 
| +    vpunpcklwd ymm0, ymm1, ymm2           // RGBA first 8 pixels
 | 
| +    vpunpckhwd ymm1, ymm1, ymm2           // RGBA next 8 pixels
 | 
| +    vmovdqu    [edx], ymm0
 | 
| +    vmovdqu    [edx + 32], ymm1
 | 
| +    lea        edx,  [edx + 64]
 | 
| +    sub        ecx, 16
 | 
| +    jg         convertloop
 | 
| +
 | 
| +    pop        edi
 | 
| +    pop        esi
 | 
| +    vzeroupper
 | 
| +    ret
 | 
| +  }
 | 
| +}
 | 
| +#endif  // HAS_I422TOABGRROW_AVX2
 | 
|  
 | 
| +#if defined(HAS_I422TOARGBROW_SSSE3)
 | 
|  // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
 | 
|  
 | 
|  // Read 8 UV from 444.
 | 
| @@ -2336,51 +1924,26 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
 | 
|      __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
 | 
|    }
 | 
|  
 | 
| -// Convert 8 pixels: 8 UV and 8 Y.
 | 
| -#define YUVTORGB __asm {                                                       \
 | 
| -    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
 | 
| -    __asm movdqa     xmm1, xmm0                                                \
 | 
| -    __asm movdqa     xmm2, xmm0                                                \
 | 
| -    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
 | 
| -    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
 | 
| -    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
 | 
| -    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
 | 
| -    __asm psubw      xmm1, kUVBiasG                                            \
 | 
| -    __asm psubw      xmm2, kUVBiasR                                            \
 | 
| -    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
 | 
| -    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
 | 
| -    __asm lea        eax, [eax + 8]                                            \
 | 
| -    __asm punpcklbw  xmm3, xmm4                                                \
 | 
| -    __asm psubsw     xmm3, kYSub16                                             \
 | 
| -    __asm pmullw     xmm3, kYToRgb                                             \
 | 
| -    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
 | 
| -    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
 | 
| -    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
 | 
| -    __asm psraw      xmm0, 6                                                   \
 | 
| -    __asm psraw      xmm1, 6                                                   \
 | 
| -    __asm psraw      xmm2, 6                                                   \
 | 
| -    __asm packuswb   xmm0, xmm0           /* B */                              \
 | 
| -    __asm packuswb   xmm1, xmm1           /* G */                              \
 | 
| -    __asm packuswb   xmm2, xmm2           /* R */                              \
 | 
| -  }
 | 
| -
 | 
| -// Convert 8 pixels: 8 VU and 8 Y.
 | 
| -#define YVUTORGB __asm {                                                       \
 | 
| +// Convert 8 pixels: 8 UV and 8 Y.
 | 
| +#define YUVTORGB(YuvConstants) __asm {                                         \
 | 
|      /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
 | 
|      __asm movdqa     xmm1, xmm0                                                \
 | 
|      __asm movdqa     xmm2, xmm0                                                \
 | 
| -    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
 | 
| -    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
 | 
| -    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
 | 
| -    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
 | 
| -    __asm psubw      xmm1, kUVBiasG                                            \
 | 
| -    __asm psubw      xmm2, kUVBiasR                                            \
 | 
| +    __asm movdqa     xmm3, xmm0                                                \
 | 
| +    __asm movdqa     xmm0, YuvConstants.kUVBiasB /* unbias back to signed */   \
 | 
| +    __asm pmaddubsw  xmm1, YuvConstants.kUVToB   /* scale B UV */              \
 | 
| +    __asm psubw      xmm0, xmm1                                                \
 | 
| +    __asm movdqa     xmm1, YuvConstants.kUVBiasG                               \
 | 
| +    __asm pmaddubsw  xmm2, YuvConstants.kUVToG   /* scale G UV */              \
 | 
| +    __asm psubw      xmm1, xmm2                                                \
 | 
| +    __asm movdqa     xmm2, YuvConstants.kUVBiasR                               \
 | 
| +    __asm pmaddubsw  xmm3, YuvConstants.kUVToR   /* scale R UV */              \
 | 
| +    __asm psubw      xmm2, xmm3                                                \
 | 
|      /* Step 2: Find Y contribution to 8 R,G,B values */                        \
 | 
|      __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
 | 
|      __asm lea        eax, [eax + 8]                                            \
 | 
| -    __asm punpcklbw  xmm3, xmm4                                                \
 | 
| -    __asm psubsw     xmm3, kYSub16                                             \
 | 
| -    __asm pmullw     xmm3, kYToRgb                                             \
 | 
| +    __asm punpcklbw  xmm3, xmm3                                                \
 | 
| +    __asm pmulhuw    xmm3, YuvConstants.kYToRgb                                \
 | 
|      __asm paddsw     xmm0, xmm3           /* B += Y */                         \
 | 
|      __asm paddsw     xmm1, xmm3           /* G += Y */                         \
 | 
|      __asm paddsw     xmm2, xmm3           /* R += Y */                         \
 | 
| @@ -2392,7 +1955,131 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
 | 
|      __asm packuswb   xmm2, xmm2           /* R */                              \
 | 
|    }
 | 
|  
 | 
| -// 8 pixels, dest aligned 16.
 | 
| +// Store 8 ARGB values.
 | 
| +#define STOREARGB __asm {                                                      \
 | 
| +    /* Step 3: Weave into ARGB */                                              \
 | 
| +    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
 | 
| +    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
 | 
| +    __asm movdqa     xmm1, xmm0                                                \
 | 
| +    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
 | 
| +    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
 | 
| +    __asm movdqu     [edx], xmm0                                               \
 | 
| +    __asm movdqu     [edx + 16], xmm1                                          \
 | 
| +    __asm lea        edx,  [edx + 32]                                          \
 | 
| +  }
 | 
| +
 | 
| +// Store 8 BGRA values.
 | 
| +#define STOREBGRA __asm {                                                      \
 | 
| +    /* Step 3: Weave into BGRA */                                              \
 | 
| +    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
 | 
| +    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
 | 
| +    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
 | 
| +    __asm movdqa     xmm0, xmm5                                                \
 | 
| +    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
 | 
| +    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
 | 
| +    __asm movdqu     [edx], xmm5                                               \
 | 
| +    __asm movdqu     [edx + 16], xmm0                                          \
 | 
| +    __asm lea        edx,  [edx + 32]                                          \
 | 
| +  }
 | 
| +
 | 
| +// Store 8 ABGR values.
 | 
| +#define STOREABGR __asm {                                                      \
 | 
| +    /* Step 3: Weave into ABGR */                                              \
 | 
| +    __asm punpcklbw  xmm2, xmm1           /* RG */                             \
 | 
| +    __asm punpcklbw  xmm0, xmm5           /* BA */                             \
 | 
| +    __asm movdqa     xmm1, xmm2                                                \
 | 
| +    __asm punpcklwd  xmm2, xmm0           /* RGBA first 4 pixels */            \
 | 
| +    __asm punpckhwd  xmm1, xmm0           /* RGBA next 4 pixels */             \
 | 
| +    __asm movdqu     [edx], xmm2                                               \
 | 
| +    __asm movdqu     [edx + 16], xmm1                                          \
 | 
| +    __asm lea        edx,  [edx + 32]                                          \
 | 
| +  }
 | 
| +
 | 
| +// Store 8 RGBA values.
 | 
| +#define STORERGBA __asm {                                                      \
 | 
| +    /* Step 3: Weave into RGBA */                                              \
 | 
| +    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
 | 
| +    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
 | 
| +    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
 | 
| +    __asm movdqa     xmm0, xmm5                                                \
 | 
| +    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
 | 
| +    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
 | 
| +    __asm movdqu     [edx], xmm5                                               \
 | 
| +    __asm movdqu     [edx + 16], xmm0                                          \
 | 
| +    __asm lea        edx,  [edx + 32]                                          \
 | 
| +  }
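The four store macros differ only in which of B, G, R and the constant 0xff alpha lands in each byte position; per libyuv's naming, ARGB stores B,G,R,A in memory, BGRA stores A,R,G,B, ABGR stores R,G,B,A and RGBA stores A,B,G,R. A scalar model of STOREARGB for one pixel; the helper name is illustrative:

    // Interleave the packed B/G/R results with the 0xff alpha (xmm5) into
    // libyuv ARGB byte order: B, G, R, A in memory.
    static void StoreARGBPixel(unsigned char b, unsigned char g, unsigned char r,
                               unsigned char* dst) {
      dst[0] = b;
      dst[1] = g;
      dst[2] = r;
      dst[3] = 0xff;
    }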
 | 
| +
 | 
| +// Store 8 RGB24 values.
 | 
| +#define STORERGB24 __asm {                                                     \
 | 
| +    /* Step 3: Weave into RRGB */                                              \
 | 
| +    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
 | 
| +    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
 | 
| +    __asm movdqa     xmm1, xmm0                                                \
 | 
| +    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
 | 
| +    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
 | 
| +    /* Step 4: RRGB -> RGB24 */                                                \
 | 
| +    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
 | 
| +    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
 | 
| +    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
 | 
| +    __asm movq       qword ptr [edx], xmm0  /* First 8 bytes */                \
 | 
| +    __asm movdqu     [edx + 8], xmm1      /* Last 16 bytes */                  \
 | 
| +    __asm lea        edx,  [edx + 24]                                          \
 | 
| +  }
 | 
| +
 | 
| +// Store 8 RAW values.
 | 
| +#define STORERAW __asm {                                                       \
 | 
| +    /* Step 3: Weave into RRGB */                                              \
 | 
| +    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
 | 
| +    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
 | 
| +    __asm movdqa     xmm1, xmm0                                                \
 | 
| +    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
 | 
| +    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
 | 
| +    /* Step 4: RRGB -> RAW */                                                  \
 | 
| +    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
 | 
| +    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
 | 
| +    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
 | 
| +    __asm movq       qword ptr [edx], xmm0  /* First 8 bytes */                \
 | 
| +    __asm movdqu     [edx + 8], xmm1      /* Last 16 bytes */                  \
 | 
| +    __asm lea        edx,  [edx + 24]                                          \
 | 
| +  }
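STORERGB24 and STORERAW weave B, G and a duplicated R, then use pshufb with the kShuffleMaskARGBToRGB24* / kShuffleMaskARGBToRAW* masks (defined elsewhere in this file) to squeeze 8 pixels into 24 bytes. The net effect is dropping the fourth byte of each pixel; a rough scalar model, assuming libyuv's usual byte orders (RGB24 stores B,G,R per pixel, RAW stores R,G,B):

    // Pack 8 pixels of separate B/G/R bytes into 24 bytes of RGB24.
    // For RAW, swap the roles of b[] and r[].
    static void Pack8ToRGB24(const unsigned char* b, const unsigned char* g,
                             const unsigned char* r, unsigned char* dst) {
      for (int i = 0; i < 8; ++i) {
        dst[3 * i + 0] = b[i];
        dst[3 * i + 1] = g[i];
        dst[3 * i + 2] = r[i];
      }
    }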
 | 
| +
 | 
| +// Store 8 RGB565 values.
 | 
| +#define STORERGB565 __asm {                                                    \
 | 
| +    /* Step 3: Weave into RRGB */                                              \
 | 
| +    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
 | 
| +    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
 | 
| +    __asm movdqa     xmm1, xmm0                                                \
 | 
| +    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
 | 
| +    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
 | 
| +    /* Step 4: RRGB -> RGB565 */                                               \
 | 
| +    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
 | 
| +    __asm movdqa     xmm2, xmm0    /* G */                                     \
 | 
| +    __asm pslld      xmm0, 8       /* R */                                     \
 | 
| +    __asm psrld      xmm3, 3       /* B */                                     \
 | 
| +    __asm psrld      xmm2, 5       /* G */                                     \
 | 
| +    __asm psrad      xmm0, 16      /* R */                                     \
 | 
| +    __asm pand       xmm3, xmm5    /* B */                                     \
 | 
| +    __asm pand       xmm2, xmm6    /* G */                                     \
 | 
| +    __asm pand       xmm0, xmm7    /* R */                                     \
 | 
| +    __asm por        xmm3, xmm2    /* BG */                                    \
 | 
| +    __asm por        xmm0, xmm3    /* BGR */                                   \
 | 
| +    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
 | 
| +    __asm movdqa     xmm2, xmm1    /* G */                                     \
 | 
| +    __asm pslld      xmm1, 8       /* R */                                     \
 | 
| +    __asm psrld      xmm3, 3       /* B */                                     \
 | 
| +    __asm psrld      xmm2, 5       /* G */                                     \
 | 
| +    __asm psrad      xmm1, 16      /* R */                                     \
 | 
| +    __asm pand       xmm3, xmm5    /* B */                                     \
 | 
| +    __asm pand       xmm2, xmm6    /* G */                                     \
 | 
| +    __asm pand       xmm1, xmm7    /* R */                                     \
 | 
| +    __asm por        xmm3, xmm2    /* BG */                                    \
 | 
| +    __asm por        xmm1, xmm3    /* BGR */                                   \
 | 
| +    __asm packssdw   xmm0, xmm1                                                \
 | 
| +    __asm movdqu     [edx], xmm0   /* store 8 pixels of RGB565 */              \
 | 
| +    __asm lea        edx, [edx + 16]                                           \
 | 
| +  }
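STORERGB565 assembles each 16-bit pixel with shifts and ors against the 0x0000001f / 0x000007e0 / 0xfffff800 masks built in the RGB565 row function's prologue (further down in this diff), four pixels per register. The scalar equivalent of that packing; the helper name is illustrative:

    // RGB565 layout: 5 bits of B in bits 0..4, 6 bits of G in bits 5..10,
    // 5 bits of R in bits 11..15.
    static unsigned short PackRGB565(unsigned char b, unsigned char g,
                                     unsigned char r) {
      return (unsigned short)((b >> 3) | ((unsigned)(g >> 2) << 5) |
                              ((unsigned)(r >> 3) << 11));
    }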
 | 
| +
 | 
| +// 8 pixels.
 | 
|  // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void I444ToARGBRow_SSSE3(const uint8* y_buf,
 | 
| @@ -2410,22 +2097,12 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
 | 
|      mov        ecx, [esp + 8 + 20]  // width
 | 
|      sub        edi, esi
 | 
|      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      READYUV444
 | 
| -    YUVTORGB
 | 
| +    YUVTORGB(kYuvConstants)
 | 
| +    STOREARGB
 | 
|  
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm5           // RA
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
| @@ -2435,8 +2112,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
 | 
|    }
 | 
|  }
 | 
|  
 | 
| -// 8 pixels, dest aligned 16.
 | 
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
| +// 8 pixels.
 | 
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void I422ToRGB24Row_SSSE3(const uint8* y_buf,
 | 
|                            const uint8* u_buf,
 | 
| @@ -2452,27 +2129,14 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
 | 
|      mov        edx, [esp + 8 + 16]  // rgb24
 | 
|      mov        ecx, [esp + 8 + 20]  // width
 | 
|      sub        edi, esi
 | 
| -    pxor       xmm4, xmm4
 | 
|      movdqa     xmm5, kShuffleMaskARGBToRGB24_0
 | 
|      movdqa     xmm6, kShuffleMaskARGBToRGB24
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      READYUV422
 | 
| -    YUVTORGB
 | 
| +    YUVTORGB(kYuvConstants)
 | 
| +    STORERGB24
 | 
|  
 | 
| -    // Step 3: Weave into RRGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm2           // RR
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
 | 
| -    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
 | 
| -    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
 | 
| -    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
 | 
| -    movq       qword ptr [edx], xmm0  // First 8 bytes
 | 
| -    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
 | 
| -    lea        edx,  [edx + 24]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
| @@ -2482,8 +2146,8 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
 | 
|    }
 | 
|  }
 | 
|  
 | 
| -// 8 pixels, dest aligned 16.
 | 
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
| +// 8 pixels.
 | 
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void I422ToRAWRow_SSSE3(const uint8* y_buf,
 | 
|                          const uint8* u_buf,
 | 
| @@ -2499,27 +2163,14 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
 | 
|      mov        edx, [esp + 8 + 16]  // raw
 | 
|      mov        ecx, [esp + 8 + 20]  // width
 | 
|      sub        edi, esi
 | 
| -    pxor       xmm4, xmm4
 | 
|      movdqa     xmm5, kShuffleMaskARGBToRAW_0
 | 
|      movdqa     xmm6, kShuffleMaskARGBToRAW
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      READYUV422
 | 
| -    YUVTORGB
 | 
| +    YUVTORGB(kYuvConstants)
 | 
| +    STORERAW
 | 
|  
 | 
| -    // Step 3: Weave into RRGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm2           // RR
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
 | 
| -    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
 | 
| -    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
 | 
| -    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
 | 
| -    movq       qword ptr [edx], xmm0  // First 8 bytes
 | 
| -    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
 | 
| -    lea        edx,  [edx + 24]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
| @@ -2529,8 +2180,8 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
 | 
|    }
 | 
|  }
 | 
|  
 | 
| -// 8 pixels, dest unaligned.
 | 
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
| +// 8 pixels
 | 
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void I422ToRGB565Row_SSSE3(const uint8* y_buf,
 | 
|                             const uint8* u_buf,
 | 
| @@ -2546,7 +2197,6 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
 | 
|      mov        edx, [esp + 8 + 16]  // rgb565
 | 
|      mov        ecx, [esp + 8 + 20]  // width
 | 
|      sub        edi, esi
 | 
| -    pxor       xmm4, xmm4
 | 
|      pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
 | 
|      psrld      xmm5, 27
 | 
|      pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
 | 
| @@ -2555,45 +2205,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
 | 
|      pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
 | 
|      pslld      xmm7, 11
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      READYUV422
 | 
| -    YUVTORGB
 | 
| +    YUVTORGB(kYuvConstants)
 | 
| +    STORERGB565
 | 
|  
 | 
| -    // Step 3: Weave into RRGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm2           // RR
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
 | 
| -
 | 
| -    // Step 3b: RRGB -> RGB565
 | 
| -    movdqa     xmm3, xmm0    // B  first 4 pixels of argb
 | 
| -    movdqa     xmm2, xmm0    // G
 | 
| -    pslld      xmm0, 8       // R
 | 
| -    psrld      xmm3, 3       // B
 | 
| -    psrld      xmm2, 5       // G
 | 
| -    psrad      xmm0, 16      // R
 | 
| -    pand       xmm3, xmm5    // B
 | 
| -    pand       xmm2, xmm6    // G
 | 
| -    pand       xmm0, xmm7    // R
 | 
| -    por        xmm3, xmm2    // BG
 | 
| -    por        xmm0, xmm3    // BGR
 | 
| -    movdqa     xmm3, xmm1    // B  next 4 pixels of argb
 | 
| -    movdqa     xmm2, xmm1    // G
 | 
| -    pslld      xmm1, 8       // R
 | 
| -    psrld      xmm3, 3       // B
 | 
| -    psrld      xmm2, 5       // G
 | 
| -    psrad      xmm1, 16      // R
 | 
| -    pand       xmm3, xmm5    // B
 | 
| -    pand       xmm2, xmm6    // G
 | 
| -    pand       xmm1, xmm7    // R
 | 
| -    por        xmm3, xmm2    // BG
 | 
| -    por        xmm1, xmm3    // BGR
 | 
| -    packssdw   xmm0, xmm1
 | 
|      sub        ecx, 8
 | 
| -    movdqu     [edx], xmm0   // store 8 pixels of RGB565
 | 
| -    lea        edx, [edx + 16]
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| @@ -2602,7 +2219,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
 | 
|    }
 | 
|  }
 | 
|  
 | 
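STORERGB565 packs the woven pixels the way the deleted shift-and-mask sequence did; the masks built above (0x0000001f, 0x000007e0, 0xfffff800) select the 5-bit blue, 6-bit green and 5-bit red fields. A scalar sketch of the packing (illustrative, not the macro itself):

    /* Pack one pixel into RGB565: red in bits 15..11, green in 10..5, blue in 4..0. */
    static unsigned short PackRGB565(unsigned char b, unsigned char g, unsigned char r) {
      return (unsigned short)((b >> 3) |           /* mask 0x001f */
                              ((g >> 2) << 5) |    /* mask 0x07e0 */
                              ((r >> 3) << 11));   /* mask 0xf800 */
    }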
| -// 8 pixels, dest aligned 16.
 | 
| +// 8 pixels.
 | 
|  // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void I422ToARGBRow_SSSE3(const uint8* y_buf,
 | 
| @@ -2620,22 +2237,12 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
 | 
|      mov        ecx, [esp + 8 + 20]  // width
 | 
|      sub        edi, esi
 | 
|      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      READYUV422
 | 
| -    YUVTORGB
 | 
| +    YUVTORGB(kYuvConstants)
 | 
| +    STOREARGB
 | 
|  
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm5           // RA
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
| @@ -2645,491 +2252,119 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
 | 
|    }
 | 
|  }
 | 
|  
 | 
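STOREARGB performs the weave that the deleted punpcklbw/punpcklwd sequence used to do inline: B, G, R and the all-ones alpha from xmm5 are interleaved into libyuv ARGB, which is byte order B, G, R, A in memory. Scalar sketch (illustrative):

    /* One pixel of the STOREARGB weave. */
    static void StoreArgbPixel(unsigned char b, unsigned char g, unsigned char r,
                               unsigned char* dst_argb) {
      dst_argb[0] = b;     /* libyuv ARGB is little-endian: B, G, R, A in memory */
      dst_argb[1] = g;
      dst_argb[2] = r;
      dst_argb[3] = 0xff;  /* alpha from the pcmpeqb xmm5 mask */
    }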
| -// 8 pixels, dest aligned 16.
 | 
| +// 8 pixels.
 | 
|  // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
|  // Similar to I420 but duplicate UV once more.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void I411ToARGBRow_SSSE3(const uint8* y_buf,
 | 
|                           const uint8* u_buf,
 | 
|                           const uint8* v_buf,
 | 
| -                         uint8* dst_argb,
 | 
| -                         int width) {
 | 
| -  __asm {
 | 
| -    push       ebx
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 12 + 4]   // Y
 | 
| -    mov        esi, [esp + 12 + 8]   // U
 | 
| -    mov        edi, [esp + 12 + 12]  // V
 | 
| -    mov        edx, [esp + 12 + 16]  // argb
 | 
| -    mov        ecx, [esp + 12 + 20]  // width
 | 
| -    sub        edi, esi
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    READYUV411  // modifies EBX
 | 
| -    YUVTORGB
 | 
| -
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm5           // RA
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
| -    sub        ecx, 8
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    pop        ebx
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -// 8 pixels, dest aligned 16.
 | 
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void NV12ToARGBRow_SSSE3(const uint8* y_buf,
 | 
| -                         const uint8* uv_buf,
 | 
| -                         uint8* dst_argb,
 | 
| -                         int width) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    mov        eax, [esp + 4 + 4]   // Y
 | 
| -    mov        esi, [esp + 4 + 8]   // UV
 | 
| -    mov        edx, [esp + 4 + 12]  // argb
 | 
| -    mov        ecx, [esp + 4 + 16]  // width
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    READNV12
 | 
| -    YUVTORGB
 | 
| -
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm5           // RA
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
| -    sub        ecx, 8
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -// 8 pixels, dest aligned 16.
 | 
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void NV21ToARGBRow_SSSE3(const uint8* y_buf,
 | 
| -                         const uint8* uv_buf,
 | 
| -                         uint8* dst_argb,
 | 
| -                         int width) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    mov        eax, [esp + 4 + 4]   // Y
 | 
| -    mov        esi, [esp + 4 + 8]   // VU
 | 
| -    mov        edx, [esp + 4 + 12]  // argb
 | 
| -    mov        ecx, [esp + 4 + 16]  // width
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    READNV12
 | 
| -    YVUTORGB
 | 
| -
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm5           // RA
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
| -    sub        ecx, 8
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -// 8 pixels, unaligned.
 | 
| -// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
| -                                   const uint8* u_buf,
 | 
| -                                   const uint8* v_buf,
 | 
| -                                   uint8* dst_argb,
 | 
| -                                   int width) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // Y
 | 
| -    mov        esi, [esp + 8 + 8]   // U
 | 
| -    mov        edi, [esp + 8 + 12]  // V
 | 
| -    mov        edx, [esp + 8 + 16]  // argb
 | 
| -    mov        ecx, [esp + 8 + 20]  // width
 | 
| -    sub        edi, esi
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    READYUV444
 | 
| -    YUVTORGB
 | 
| -
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm5           // RA
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
 | 
| -    movdqu     [edx], xmm0
 | 
| -    movdqu     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
| -    sub        ecx, 8
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -// 8 pixels, unaligned.
 | 
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
| -                                   const uint8* u_buf,
 | 
| -                                   const uint8* v_buf,
 | 
| -                                   uint8* dst_argb,
 | 
| -                                   int width) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // Y
 | 
| -    mov        esi, [esp + 8 + 8]   // U
 | 
| -    mov        edi, [esp + 8 + 12]  // V
 | 
| -    mov        edx, [esp + 8 + 16]  // argb
 | 
| -    mov        ecx, [esp + 8 + 20]  // width
 | 
| -    sub        edi, esi
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    READYUV422
 | 
| -    YUVTORGB
 | 
| -
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm5           // RA
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
 | 
| -    movdqu     [edx], xmm0
 | 
| -    movdqu     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
| -    sub        ecx, 8
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -// 8 pixels, unaligned.
 | 
| -// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
| -// Similar to I420 but duplicate UV once more.
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
| -                                   const uint8* u_buf,
 | 
| -                                   const uint8* v_buf,
 | 
| -                                   uint8* dst_argb,
 | 
| -                                   int width) {
 | 
| -  __asm {
 | 
| -    push       ebx
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 12 + 4]   // Y
 | 
| -    mov        esi, [esp + 12 + 8]   // U
 | 
| -    mov        edi, [esp + 12 + 12]  // V
 | 
| -    mov        edx, [esp + 12 + 16]  // argb
 | 
| -    mov        ecx, [esp + 12 + 20]  // width
 | 
| -    sub        edi, esi
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    READYUV411  // modifies EBX
 | 
| -    YUVTORGB
 | 
| -
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm5           // RA
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
 | 
| -    movdqu     [edx], xmm0
 | 
| -    movdqu     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
| -    sub        ecx, 8
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    pop        ebx
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -// 8 pixels, dest aligned 16.
 | 
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
| -                                   const uint8* uv_buf,
 | 
| -                                   uint8* dst_argb,
 | 
| -                                   int width) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    mov        eax, [esp + 4 + 4]   // Y
 | 
| -    mov        esi, [esp + 4 + 8]   // UV
 | 
| -    mov        edx, [esp + 4 + 12]  // argb
 | 
| -    mov        ecx, [esp + 4 + 16]  // width
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    READNV12
 | 
| -    YUVTORGB
 | 
| -
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm5           // RA
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
 | 
| -    movdqu     [edx], xmm0
 | 
| -    movdqu     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
| -    sub        ecx, 8
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -// 8 pixels, dest aligned 16.
 | 
| -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
| -                                   const uint8* uv_buf,
 | 
| -                                   uint8* dst_argb,
 | 
| -                                   int width) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    mov        eax, [esp + 4 + 4]   // Y
 | 
| -    mov        esi, [esp + 4 + 8]   // VU
 | 
| -    mov        edx, [esp + 4 + 12]  // argb
 | 
| -    mov        ecx, [esp + 4 + 16]  // width
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
| -
 | 
| -    align      4
 | 
| - convertloop:
 | 
| -    READNV12
 | 
| -    YVUTORGB
 | 
| -
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm0, xmm1           // BG
 | 
| -    punpcklbw  xmm2, xmm5           // RA
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
 | 
| -    movdqu     [edx], xmm0
 | 
| -    movdqu     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
| -    sub        ecx, 8
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void I422ToBGRARow_SSSE3(const uint8* y_buf,
 | 
| -                         const uint8* u_buf,
 | 
| -                         const uint8* v_buf,
 | 
| -                         uint8* dst_bgra,
 | 
| +                         uint8* dst_argb,
 | 
|                           int width) {
 | 
|    __asm {
 | 
| +    push       ebx
 | 
|      push       esi
 | 
|      push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // Y
 | 
| -    mov        esi, [esp + 8 + 8]   // U
 | 
| -    mov        edi, [esp + 8 + 12]  // V
 | 
| -    mov        edx, [esp + 8 + 16]  // bgra
 | 
| -    mov        ecx, [esp + 8 + 20]  // width
 | 
| +    mov        eax, [esp + 12 + 4]   // Y
 | 
| +    mov        esi, [esp + 12 + 8]   // U
 | 
| +    mov        edi, [esp + 12 + 12]  // V
 | 
| +    mov        edx, [esp + 12 + 16]  // argb
 | 
| +    mov        ecx, [esp + 12 + 20]  // width
 | 
|      sub        edi, esi
 | 
| -    pxor       xmm4, xmm4
 | 
| +    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    READYUV422
 | 
| -    YUVTORGB
 | 
| +    READYUV411  // modifies EBX
 | 
| +    YUVTORGB(kYuvConstants)
 | 
| +    STOREARGB
 | 
|  
 | 
| -    // Step 3: Weave into BGRA
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    punpcklbw  xmm1, xmm0           // GB
 | 
| -    punpcklbw  xmm5, xmm2           // AR
 | 
| -    movdqa     xmm0, xmm5
 | 
| -    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
 | 
| -    movdqa     [edx], xmm5
 | 
| -    movdqa     [edx + 16], xmm0
 | 
| -    lea        edx,  [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
|      pop        esi
 | 
| +    pop        ebx
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
| +// 8 pixels.
 | 
| +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
 | 
| -                                   const uint8* u_buf,
 | 
| -                                   const uint8* v_buf,
 | 
| -                                   uint8* dst_bgra,
 | 
| -                                   int width) {
 | 
| +void NV12ToARGBRow_SSSE3(const uint8* y_buf,
 | 
| +                         const uint8* uv_buf,
 | 
| +                         uint8* dst_argb,
 | 
| +                         int width) {
 | 
|    __asm {
 | 
|      push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // Y
 | 
| -    mov        esi, [esp + 8 + 8]   // U
 | 
| -    mov        edi, [esp + 8 + 12]  // V
 | 
| -    mov        edx, [esp + 8 + 16]  // bgra
 | 
| -    mov        ecx, [esp + 8 + 20]  // width
 | 
| -    sub        edi, esi
 | 
| -    pxor       xmm4, xmm4
 | 
| +    mov        eax, [esp + 4 + 4]   // Y
 | 
| +    mov        esi, [esp + 4 + 8]   // UV
 | 
| +    mov        edx, [esp + 4 + 12]  // argb
 | 
| +    mov        ecx, [esp + 4 + 16]  // width
 | 
| +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    READYUV422
 | 
| -    YUVTORGB
 | 
| +    READNV12
 | 
| +    YUVTORGB(kYuvConstants)
 | 
| +    STOREARGB
 | 
|  
 | 
| -    // Step 3: Weave into BGRA
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    punpcklbw  xmm1, xmm0           // GB
 | 
| -    punpcklbw  xmm5, xmm2           // AR
 | 
| -    movdqa     xmm0, xmm5
 | 
| -    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
 | 
| -    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
 | 
| -    movdqu     [edx], xmm5
 | 
| -    movdqu     [edx + 16], xmm0
 | 
| -    lea        edx,  [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
| -    pop        edi
 | 
|      pop        esi
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
| +// 8 pixels.
 | 
| +// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void I422ToABGRRow_SSSE3(const uint8* y_buf,
 | 
| -                         const uint8* u_buf,
 | 
| -                         const uint8* v_buf,
 | 
| -                         uint8* dst_abgr,
 | 
| +void NV21ToARGBRow_SSSE3(const uint8* y_buf,
 | 
| +                         const uint8* uv_buf,
 | 
| +                         uint8* dst_argb,
 | 
|                           int width) {
 | 
|    __asm {
 | 
|      push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 8 + 4]   // Y
 | 
| -    mov        esi, [esp + 8 + 8]   // U
 | 
| -    mov        edi, [esp + 8 + 12]  // V
 | 
| -    mov        edx, [esp + 8 + 16]  // abgr
 | 
| -    mov        ecx, [esp + 8 + 20]  // width
 | 
| -    sub        edi, esi
 | 
| +    mov        eax, [esp + 4 + 4]   // Y
 | 
| +    mov        esi, [esp + 4 + 8]   // UV
 | 
| +    mov        edx, [esp + 4 + 12]  // argb
 | 
| +    mov        ecx, [esp + 4 + 16]  // width
 | 
|      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    READYUV422
 | 
| -    YUVTORGB
 | 
| +    READNV12
 | 
| +    YUVTORGB(kYvuConstants)
 | 
| +    STOREARGB
 | 
|  
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm2, xmm1           // RG
 | 
| -    punpcklbw  xmm0, xmm5           // BA
 | 
| -    movdqa     xmm1, xmm2
 | 
| -    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
 | 
| -    movdqa     [edx], xmm2
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
| -    pop        edi
 | 
|      pop        esi
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
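NV12ToARGBRow and NV21ToARGBRow now share READNV12 and STOREARGB; the only difference is the constants table, because NV21 interleaves its chroma plane as V,U rather than U,V. Instead of swapping the chroma bytes, the patch swaps the U/V coefficients in kYvuConstants. In scalar terms the chroma fetch differs like this (illustrative sketch):

    /* For chroma sample i: NV12 stores U then V, NV21 stores V then U. */
    static void ReadNVChroma(const unsigned char* chroma, int i, int is_nv21,
                             unsigned char* u, unsigned char* v) {
      *u = chroma[2 * i + (is_nv21 ? 1 : 0)];
      *v = chroma[2 * i + (is_nv21 ? 0 : 1)];
    }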
|  __declspec(naked) __declspec(align(16))
 | 
| -void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
| -                                   const uint8* u_buf,
 | 
| -                                   const uint8* v_buf,
 | 
| -                                   uint8* dst_abgr,
 | 
| -                                   int width) {
 | 
| +void I422ToBGRARow_SSSE3(const uint8* y_buf,
 | 
| +                         const uint8* u_buf,
 | 
| +                         const uint8* v_buf,
 | 
| +                         uint8* dst_bgra,
 | 
| +                         int width) {
 | 
|    __asm {
 | 
|      push       esi
 | 
|      push       edi
 | 
|      mov        eax, [esp + 8 + 4]   // Y
 | 
|      mov        esi, [esp + 8 + 8]   // U
 | 
|      mov        edi, [esp + 8 + 12]  // V
 | 
| -    mov        edx, [esp + 8 + 16]  // abgr
 | 
| +    mov        edx, [esp + 8 + 16]  // bgra
 | 
|      mov        ecx, [esp + 8 + 20]  // width
 | 
|      sub        edi, esi
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    pxor       xmm4, xmm4
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      READYUV422
 | 
| -    YUVTORGB
 | 
| +    YUVTORGB(kYuvConstants)
 | 
| +    STOREBGRA
 | 
|  
 | 
| -    // Step 3: Weave into ARGB
 | 
| -    punpcklbw  xmm2, xmm1           // RG
 | 
| -    punpcklbw  xmm0, xmm5           // BA
 | 
| -    movdqa     xmm1, xmm2
 | 
| -    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
 | 
| -    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
 | 
| -    movdqu     [edx], xmm2
 | 
| -    movdqu     [edx + 16], xmm1
 | 
| -    lea        edx,  [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
| @@ -3140,10 +2375,10 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void I422ToRGBARow_SSSE3(const uint8* y_buf,
 | 
| +void I422ToABGRRow_SSSE3(const uint8* y_buf,
 | 
|                           const uint8* u_buf,
 | 
|                           const uint8* v_buf,
 | 
| -                         uint8* dst_rgba,
 | 
| +                         uint8* dst_abgr,
 | 
|                           int width) {
 | 
|    __asm {
 | 
|      push       esi
 | 
| @@ -3151,26 +2386,16 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
 | 
|      mov        eax, [esp + 8 + 4]   // Y
 | 
|      mov        esi, [esp + 8 + 8]   // U
 | 
|      mov        edi, [esp + 8 + 12]  // V
 | 
| -    mov        edx, [esp + 8 + 16]  // rgba
 | 
| +    mov        edx, [esp + 8 + 16]  // abgr
 | 
|      mov        ecx, [esp + 8 + 20]  // width
 | 
|      sub        edi, esi
 | 
| -    pxor       xmm4, xmm4
 | 
| +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      READYUV422
 | 
| -    YUVTORGB
 | 
| +    YUVTORGB(kYuvConstants)
 | 
| +    STOREABGR
 | 
|  
 | 
| -    // Step 3: Weave into RGBA
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    punpcklbw  xmm1, xmm2           // GR
 | 
| -    punpcklbw  xmm5, xmm0           // AB
 | 
| -    movdqa     xmm0, xmm5
 | 
| -    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
 | 
| -    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
 | 
| -    movdqa     [edx], xmm5
 | 
| -    movdqa     [edx + 16], xmm0
 | 
| -    lea        edx,  [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
| @@ -3181,11 +2406,11 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
 | 
| -                                   const uint8* u_buf,
 | 
| -                                   const uint8* v_buf,
 | 
| -                                   uint8* dst_rgba,
 | 
| -                                   int width) {
 | 
| +void I422ToRGBARow_SSSE3(const uint8* y_buf,
 | 
| +                         const uint8* u_buf,
 | 
| +                         const uint8* v_buf,
 | 
| +                         uint8* dst_rgba,
 | 
| +                         int width) {
 | 
|    __asm {
 | 
|      push       esi
 | 
|      push       edi
 | 
| @@ -3195,23 +2420,12 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
 | 
|      mov        edx, [esp + 8 + 16]  // rgba
 | 
|      mov        ecx, [esp + 8 + 20]  // width
 | 
|      sub        edi, esi
 | 
| -    pxor       xmm4, xmm4
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      READYUV422
 | 
| -    YUVTORGB
 | 
| +    YUVTORGB(kYuvConstants)
 | 
| +    STORERGBA
 | 
|  
 | 
| -    // Step 3: Weave into RGBA
 | 
| -    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 | 
| -    punpcklbw  xmm1, xmm2           // GR
 | 
| -    punpcklbw  xmm5, xmm0           // AB
 | 
| -    movdqa     xmm0, xmm5
 | 
| -    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
 | 
| -    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
 | 
| -    movdqu     [edx], xmm5
 | 
| -    movdqu     [edx + 16], xmm0
 | 
| -    lea        edx,  [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
| @@ -3224,32 +2438,32 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
 | 
|  #endif  // HAS_I422TOARGBROW_SSSE3
 | 
|  
 | 
|  #ifdef HAS_YTOARGBROW_SSE2
 | 
| +// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void YToARGBRow_SSE2(const uint8* y_buf,
 | 
|                       uint8* rgb_buf,
 | 
|                       int width) {
 | 
|    __asm {
 | 
| -    pxor       xmm5, xmm5
 | 
| -    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
 | 
| -    pslld      xmm4, 24
 | 
| -    mov        eax, 0x00100010
 | 
| -    movd       xmm3, eax
 | 
| -    pshufd     xmm3, xmm3, 0
 | 
| -    mov        eax, 0x004a004a       // 74
 | 
| +    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
 | 
|      movd       xmm2, eax
 | 
|      pshufd     xmm2, xmm2,0
 | 
| +    mov        eax, 0x04880488      // 0488 = 1160 = 1.164 * 64 * 16, adjusted for even error distribution
 | 
| +    movd       xmm3, eax
 | 
| +    pshufd     xmm3, xmm3, 0
 | 
| +    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
 | 
| +    pslld      xmm4, 24
 | 
| +
 | 
|      mov        eax, [esp + 4]       // Y
 | 
|      mov        edx, [esp + 8]       // rgb
 | 
|      mov        ecx, [esp + 12]      // width
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
 | 
|      movq       xmm0, qword ptr [eax]
 | 
|      lea        eax, [eax + 8]
 | 
| -    punpcklbw  xmm0, xmm5           // 0.Y
 | 
| +    punpcklbw  xmm0, xmm0           // Y.Y
 | 
| +    pmulhuw    xmm0, xmm2
 | 
|      psubusw    xmm0, xmm3
 | 
| -    pmullw     xmm0, xmm2
 | 
|      psrlw      xmm0, 6
 | 
|      packuswb   xmm0, xmm0           // G
 | 
|  
 | 
| @@ -3260,23 +2474,74 @@ void YToARGBRow_SSE2(const uint8* y_buf,
 | 
|      punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
 | 
|      por        xmm0, xmm4
 | 
|      por        xmm1, xmm4
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| +    movdqu     [edx], xmm0
 | 
| +    movdqu     [edx + 16], xmm1
 | 
|      lea        edx,  [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
| -
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  #endif  // HAS_YTOARGBROW_SSE2
 | 
|  
 | 
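The rewritten YToARGBRow_SSE2 computes the Y expansion in the same fixed-point form as YUVTORGB: punpcklbw against itself turns Y into the 16-bit value Y*257, pmulhuw by 0x4a35 multiplies by YG/65536, psubusw subtracts YGB = 1160 with saturation, and psrlw 6 removes the fraction bits. A quick scalar check of that arithmetic (not part of the patch):

    /* Scalar check of the fixed-point Y scaling used above. */
    static unsigned char ScaleY(unsigned char y) {
      unsigned int g = ((unsigned int)y * 257 * 18997) >> 16;  /* punpcklbw + pmulhuw  */
      g = g > 1160 ? g - 1160 : 0;                             /* psubusw (saturating) */
      g >>= 6;                                                 /* psrlw 6              */
      return (unsigned char)(g > 255 ? 255 : g);               /* packuswb clamp       */
    }
    /* ScaleY(16) == 0, ScaleY(128) == 130, ScaleY(235) == 255, i.e. it tracks
       1.164 * (Y - 16) to within the rounding of 6-bit fixed point. */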
| +#ifdef HAS_YTOARGBROW_AVX2
 | 
| +// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
 | 
| +// note: vpunpcklbw mutates and vpackuswb unmutates.
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void YToARGBRow_AVX2(const uint8* y_buf,
 | 
| +                     uint8* rgb_buf,
 | 
| +                     int width) {
 | 
| +  __asm {
 | 
| +    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
 | 
| +    vmovd      xmm2, eax
 | 
| +    vbroadcastss ymm2, xmm2
 | 
| +    mov        eax, 0x04880488      // 0488 = 1160 = 1.164 * 64 * 16, adjusted for even error distribution
 | 
| +    vmovd      xmm3, eax
 | 
| +    vbroadcastss ymm3, xmm3
 | 
| +    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
 | 
| +    vpslld     ymm4, ymm4, 24
 | 
| +
 | 
| +    mov        eax, [esp + 4]       // Y
 | 
| +    mov        edx, [esp + 8]       // rgb
 | 
| +    mov        ecx, [esp + 12]      // width
 | 
| +
 | 
| + convertloop:
 | 
| +    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
 | 
| +    vmovdqu    xmm0, [eax]
 | 
| +    lea        eax, [eax + 16]
 | 
| +    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
 | 
| +    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
 | 
| +    vpmulhuw   ymm0, ymm0, ymm2
 | 
| +    vpsubusw   ymm0, ymm0, ymm3
 | 
| +    vpsrlw     ymm0, ymm0, 6
 | 
| +    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
 | 
| +
 | 
| +    // TODO(fbarchard): Weave alpha with unpack.
 | 
| +    // Step 2: Weave into ARGB
 | 
| +    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
 | 
| +    vpermq     ymm1, ymm1, 0xd8
 | 
| +    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
 | 
| +    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
 | 
| +    vpor       ymm0, ymm0, ymm4
 | 
| +    vpor       ymm1, ymm1, ymm4
 | 
| +    vmovdqu    [edx], ymm0
 | 
| +    vmovdqu    [edx + 32], ymm1
 | 
| +    lea        edx,  [edx + 64]
 | 
| +    sub        ecx, 16
 | 
| +    jg         convertloop
 | 
| +    vzeroupper
 | 
| +    ret
 | 
| +  }
 | 
| +}
 | 
| +#endif  // HAS_YTOARGBROW_AVX2
 | 
| +
 | 
|  #ifdef HAS_MIRRORROW_SSSE3
 | 
|  // Shuffle table for reversing the bytes.
 | 
|  static const uvec8 kShuffleMirror = {
 | 
|    15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 | 
|  };
 | 
|  
 | 
| +// TODO(fbarchard): Replace lea with -16 offset.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
 | 
|    __asm {
 | 
| @@ -3284,15 +2549,13 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
 | 
|      mov       edx, [esp + 8]   // dst
 | 
|      mov       ecx, [esp + 12]  // width
 | 
|      movdqa    xmm5, kShuffleMirror
 | 
| -    lea       eax, [eax - 16]
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa    xmm0, [eax + ecx]
 | 
| +    movdqu    xmm0, [eax - 16 + ecx]
 | 
|      pshufb    xmm0, xmm5
 | 
| -    sub       ecx, 16
 | 
| -    movdqa    [edx], xmm0
 | 
| +    movdqu    [edx], xmm0
 | 
|      lea       edx, [edx + 16]
 | 
| +    sub       ecx, 16
 | 
|      jg        convertloop
 | 
|      ret
 | 
|    }
 | 
| @@ -3300,29 +2563,21 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
 | 
|  #endif  // HAS_MIRRORROW_SSSE3
 | 
|  
 | 
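MirrorRow_SSSE3 drops the aligned-load requirement and the pre-biased pointer: the loop now reads the next 16 source bytes with [eax - 16 + ecx] as ecx counts down, so it walks the row from the tail. The row as a whole is just a byte reversal; scalar equivalent (illustrative):

    /* Scalar equivalent of MirrorRow: reverse 'width' bytes. */
    static void MirrorRowC(const unsigned char* src, unsigned char* dst, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        dst[i] = src[width - 1 - i];
      }
    }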
|  #ifdef HAS_MIRRORROW_AVX2
 | 
| -// Shuffle table for reversing the bytes.
 | 
| -static const ulvec8 kShuffleMirror_AVX2 = {
 | 
| -  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
 | 
| -  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 | 
| -};
 | 
| -
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
 | 
|    __asm {
 | 
|      mov       eax, [esp + 4]   // src
 | 
|      mov       edx, [esp + 8]   // dst
 | 
|      mov       ecx, [esp + 12]  // width
 | 
| -    vmovdqa   ymm5, kShuffleMirror_AVX2
 | 
| -    lea       eax, [eax - 32]
 | 
| +    vbroadcastf128 ymm5, kShuffleMirror
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    vmovdqu   ymm0, [eax + ecx]
 | 
| +    vmovdqu   ymm0, [eax - 32 + ecx]
 | 
|      vpshufb   ymm0, ymm0, ymm5
 | 
|      vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
 | 
| -    sub       ecx, 32
 | 
|      vmovdqu   [edx], ymm0
 | 
|      lea       edx, [edx + 32]
 | 
| +    sub       ecx, 32
 | 
|      jg        convertloop
 | 
|      vzeroupper
 | 
|      ret
 | 
| @@ -3331,19 +2586,15 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
 | 
|  #endif  // HAS_MIRRORROW_AVX2
 | 
|  
 | 
|  #ifdef HAS_MIRRORROW_SSE2
 | 
| -// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
 | 
| -// version can not.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
 | 
|    __asm {
 | 
|      mov       eax, [esp + 4]   // src
 | 
|      mov       edx, [esp + 8]   // dst
 | 
|      mov       ecx, [esp + 12]  // width
 | 
| -    lea       eax, [eax - 16]
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqu    xmm0, [eax + ecx]
 | 
| +    movdqu    xmm0, [eax - 16 + ecx]
 | 
|      movdqa    xmm1, xmm0        // swap bytes
 | 
|      psllw     xmm0, 8
 | 
|      psrlw     xmm1, 8
 | 
| @@ -3351,9 +2602,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
 | 
|      pshuflw   xmm0, xmm0, 0x1b  // swap words
 | 
|      pshufhw   xmm0, xmm0, 0x1b
 | 
|      pshufd    xmm0, xmm0, 0x4e  // swap qwords
 | 
| -    sub       ecx, 16
 | 
|      movdqu    [edx], xmm0
 | 
|      lea       edx, [edx + 16]
 | 
| +    sub       ecx, 16
 | 
|      jg        convertloop
 | 
|      ret
 | 
|    }
 | 
| @@ -3379,15 +2630,14 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 | 
|      lea       eax, [eax + ecx * 2 - 16]
 | 
|      sub       edi, edx
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa    xmm0, [eax]
 | 
| +    movdqu    xmm0, [eax]
 | 
|      lea       eax, [eax - 16]
 | 
|      pshufb    xmm0, xmm1
 | 
| -    sub       ecx, 8
 | 
|      movlpd    qword ptr [edx], xmm0
 | 
|      movhpd    qword ptr [edx + edi], xmm0
 | 
|      lea       edx, [edx + 8]
 | 
| +    sub       ecx, 8
 | 
|      jg        convertloop
 | 
|  
 | 
|      pop       edi
 | 
| @@ -3396,34 +2646,27 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 | 
|  }
 | 
|  #endif  // HAS_MIRRORROW_UV_SSSE3
 | 
|  
 | 
| -#ifdef HAS_ARGBMIRRORROW_SSSE3
 | 
| -// Shuffle table for reversing the bytes.
 | 
| -static const uvec8 kARGBShuffleMirror = {
 | 
| -  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
 | 
| -};
 | 
| -
 | 
| +#ifdef HAS_ARGBMIRRORROW_SSE2
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
 | 
| +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
 | 
|    __asm {
 | 
|      mov       eax, [esp + 4]   // src
 | 
|      mov       edx, [esp + 8]   // dst
 | 
|      mov       ecx, [esp + 12]  // width
 | 
|      lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
 | 
| -    movdqa    xmm5, kARGBShuffleMirror
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa    xmm0, [eax]
 | 
| +    movdqu    xmm0, [eax]
 | 
|      lea       eax, [eax - 16]
 | 
| -    pshufb    xmm0, xmm5
 | 
| -    sub       ecx, 4
 | 
| -    movdqa    [edx], xmm0
 | 
| +    pshufd    xmm0, xmm0, 0x1b
 | 
| +    movdqu    [edx], xmm0
 | 
|      lea       edx, [edx + 16]
 | 
| +    sub       ecx, 4
 | 
|      jg        convertloop
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
| -#endif  // HAS_ARGBMIRRORROW_SSSE3
 | 
| +#endif  // HAS_ARGBMIRRORROW_SSE2
 | 
|  
 | 
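ARGBMirrorRow no longer needs SSSE3: because each ARGB pixel is a whole 32-bit word, pshufd with immediate 0x1b (reverse the four dwords in a register) mirrors four pixels at a time, so the kARGBShuffleMirror byte table goes away. What that immediate does, in scalar terms (illustrative):

    /* Effect of pshufd xmm0, xmm0, 0x1b on four ARGB pixels held in one register. */
    static void ReverseFourPixels(const unsigned int p[4], unsigned int out[4]) {
      out[0] = p[3];
      out[1] = p[2];
      out[2] = p[1];
      out[3] = p[0];   /* bytes inside each pixel keep their order */
    }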
|  #ifdef HAS_ARGBMIRRORROW_AVX2
 | 
|  // Shuffle table for reversing the bytes.
 | 
| @@ -3437,15 +2680,13 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
 | 
|      mov       eax, [esp + 4]   // src
 | 
|      mov       edx, [esp + 8]   // dst
 | 
|      mov       ecx, [esp + 12]  // width
 | 
| -    lea       eax, [eax - 32]
 | 
| -    vmovdqa   ymm5, kARGBShuffleMirror_AVX2
 | 
| +    vmovdqu   ymm5, kARGBShuffleMirror_AVX2
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    vpermd    ymm0, ymm5, [eax + ecx * 4]  // permute dword order
 | 
| -    sub       ecx, 8
 | 
| +    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
 | 
|      vmovdqu   [edx], ymm0
 | 
|      lea       edx, [edx + 32]
 | 
| +    sub       ecx, 8
 | 
|      jg        convertloop
 | 
|      vzeroupper
 | 
|      ret
 | 
| @@ -3466,44 +2707,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 | 
|      psrlw      xmm5, 8
 | 
|      sub        edi, edx
 | 
|  
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    lea        eax,  [eax + 32]
 | 
| -    movdqa     xmm2, xmm0
 | 
| -    movdqa     xmm3, xmm1
 | 
| -    pand       xmm0, xmm5   // even bytes
 | 
| -    pand       xmm1, xmm5
 | 
| -    packuswb   xmm0, xmm1
 | 
| -    psrlw      xmm2, 8      // odd bytes
 | 
| -    psrlw      xmm3, 8
 | 
| -    packuswb   xmm2, xmm3
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + edi], xmm2
 | 
| -    lea        edx, [edx + 16]
 | 
| -    sub        ecx, 16
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 | 
| -                               int pix) {
 | 
| -  __asm {
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 4 + 4]    // src_uv
 | 
| -    mov        edx, [esp + 4 + 8]    // dst_u
 | 
| -    mov        edi, [esp + 4 + 12]   // dst_v
 | 
| -    mov        ecx, [esp + 4 + 16]   // pix
 | 
| -    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
 | 
| -    psrlw      xmm5, 8
 | 
| -    sub        edi, edx
 | 
| -
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movdqu     xmm0, [eax]
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| @@ -3526,6 +2729,7 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
| +
 | 
|  #endif  // HAS_SPLITUVROW_SSE2
 | 
|  
 | 
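SplitUVRow_SSE2 and SplitUVRow_Unaligned_SSE2 collapse into one movdqu-based routine. The mask 0x00ff00ff keeps the even bytes (U) and the psrlw-by-8 path keeps the odd bytes (V); scalar equivalent (illustrative):

    /* Scalar equivalent of SplitUVRow: de-interleave a UV plane. */
    static void SplitUVRowC(const unsigned char* src_uv,
                            unsigned char* dst_u, unsigned char* dst_v, int pix) {
      int i;
      for (i = 0; i < pix; ++i) {
        dst_u[i] = src_uv[2 * i + 0];  /* even bytes */
        dst_v[i] = src_uv[2 * i + 1];  /* odd bytes  */
      }
    }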
|  #ifdef HAS_SPLITUVROW_AVX2
 | 
| @@ -3541,7 +2745,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 | 
|      vpsrlw     ymm5, ymm5, 8
 | 
|      sub        edi, edx
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      vmovdqu    ymm0, [eax]
 | 
|      vmovdqu    ymm1, [eax + 32]
 | 
| @@ -3579,37 +2782,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 | 
|      mov        ecx, [esp + 4 + 16]   // width
 | 
|      sub        edx, eax
 | 
|  
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    movdqa     xmm0, [eax]      // read 16 U's
 | 
| -    movdqa     xmm1, [eax + edx]  // and 16 V's
 | 
| -    lea        eax,  [eax + 16]
 | 
| -    movdqa     xmm2, xmm0
 | 
| -    punpcklbw  xmm0, xmm1       // first 8 UV pairs
 | 
| -    punpckhbw  xmm2, xmm1       // next 8 UV pairs
 | 
| -    movdqa     [edi], xmm0
 | 
| -    movdqa     [edi + 16], xmm2
 | 
| -    lea        edi, [edi + 32]
 | 
| -    sub        ecx, 16
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
 | 
| -                               uint8* dst_uv, int width) {
 | 
| -  __asm {
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 4 + 4]    // src_u
 | 
| -    mov        edx, [esp + 4 + 8]    // src_v
 | 
| -    mov        edi, [esp + 4 + 12]   // dst_uv
 | 
| -    mov        ecx, [esp + 4 + 16]   // width
 | 
| -    sub        edx, eax
 | 
| -
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movdqu     xmm0, [eax]      // read 16 U's
 | 
|      movdqu     xmm1, [eax + edx]  // and 16 V's
 | 
| @@ -3641,17 +2813,16 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 | 
|      mov        ecx, [esp + 4 + 16]   // width
 | 
|      sub        edx, eax
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      vmovdqu    ymm0, [eax]           // read 32 U's
 | 
|      vmovdqu    ymm1, [eax + edx]     // and 32 V's
 | 
|      lea        eax,  [eax + 32]
 | 
|      vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
 | 
|      vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
 | 
| -    vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
 | 
| -    vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
 | 
| -    vmovdqu    [edi], ymm1
 | 
| -    vmovdqu    [edi + 32], ymm2
 | 
| +    vextractf128 [edi], ymm2, 0       // bytes 0..15
 | 
| +    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
 | 
| +    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
 | 
| +    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
 | 
|      lea        edi, [edi + 64]
 | 
|      sub        ecx, 32
 | 
|      jg         convertloop
 | 
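In MergeUVRow_AVX2, vpunpcklbw/vpunpckhbw interleave within each 128-bit lane, so the result comes out lane-swizzled; instead of repairing that with two vperm2i128 instructions, the patch stores the four 128-bit halves directly in the right order with vextractf128. The row itself is still a plain interleave; scalar equivalent (illustrative):

    /* Scalar equivalent of MergeUVRow: interleave U and V into a UV plane. */
    static void MergeUVRowC(const unsigned char* src_u, const unsigned char* src_v,
                            unsigned char* dst_uv, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        dst_uv[2 * i + 0] = src_u[i];
        dst_uv[2 * i + 1] = src_v[i];
      }
    }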
| @@ -3672,13 +2843,12 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
 | 
|      mov        edx, [esp + 8]   // dst
 | 
|      mov        ecx, [esp + 12]  // count
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| +    movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
|      lea        eax, [eax + 32]
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| +    movdqu     [edx], xmm0
 | 
| +    movdqu     [edx + 16], xmm1
 | 
|      lea        edx, [edx + 32]
 | 
|      sub        ecx, 32
 | 
|      jg         convertloop
 | 
| @@ -3687,39 +2857,46 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
 | 
|  }
 | 
|  #endif  // HAS_COPYROW_SSE2
 | 
|  
 | 
| -// Unaligned Multiple of 1.
 | 
| +#ifdef HAS_COPYROW_AVX
 | 
| +// CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
 | 
| +void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
 | 
|    __asm {
 | 
| -    mov        eax, esi
 | 
| -    mov        edx, edi
 | 
| -    mov        esi, [esp + 4]   // src
 | 
| -    mov        edi, [esp + 8]   // dst
 | 
| +    mov        eax, [esp + 4]   // src
 | 
| +    mov        edx, [esp + 8]   // dst
 | 
|      mov        ecx, [esp + 12]  // count
 | 
| -    rep movsb
 | 
| -    mov        edi, edx
 | 
| -    mov        esi, eax
 | 
| +
 | 
| +  convertloop:
 | 
| +    vmovdqu    ymm0, [eax]
 | 
| +    vmovdqu    ymm1, [eax + 32]
 | 
| +    lea        eax, [eax + 64]
 | 
| +    vmovdqu    [edx], ymm0
 | 
| +    vmovdqu    [edx + 32], ymm1
 | 
| +    lea        edx, [edx + 64]
 | 
| +    sub        ecx, 64
 | 
| +    jg         convertloop
 | 
| +
 | 
| +    vzeroupper
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
| +#endif  // HAS_COPYROW_AVX
 | 
|  
 | 
| -#ifdef HAS_COPYROW_X86
 | 
| +// Multiple of 1.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void CopyRow_X86(const uint8* src, uint8* dst, int count) {
 | 
| +void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
 | 
|    __asm {
 | 
|      mov        eax, esi
 | 
|      mov        edx, edi
 | 
|      mov        esi, [esp + 4]   // src
 | 
|      mov        edi, [esp + 8]   // dst
 | 
|      mov        ecx, [esp + 12]  // count
 | 
| -    shr        ecx, 2
 | 
| -    rep movsd
 | 
| +    rep movsb
 | 
|      mov        edi, edx
 | 
|      mov        esi, eax
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
| -#endif  // HAS_COPYROW_X86
 | 
|  
 | 
|  #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 | 
|  // width in pixels
 | 
| @@ -3734,21 +2911,20 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
 | 
|      pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
 | 
|      psrld      xmm1, 8
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
| -    movdqa     xmm2, [eax]
 | 
| -    movdqa     xmm3, [eax + 16]
 | 
| +    movdqu     xmm2, [eax]
 | 
| +    movdqu     xmm3, [eax + 16]
 | 
|      lea        eax, [eax + 32]
 | 
| -    movdqa     xmm4, [edx]
 | 
| -    movdqa     xmm5, [edx + 16]
 | 
| +    movdqu     xmm4, [edx]
 | 
| +    movdqu     xmm5, [edx + 16]
 | 
|      pand       xmm2, xmm0
 | 
|      pand       xmm3, xmm0
 | 
|      pand       xmm4, xmm1
 | 
|      pand       xmm5, xmm1
 | 
|      por        xmm2, xmm4
 | 
|      por        xmm3, xmm5
 | 
| -    movdqa     [edx], xmm2
 | 
| -    movdqa     [edx + 16], xmm3
 | 
| +    movdqu     [edx], xmm2
 | 
| +    movdqu     [edx + 16], xmm3
 | 
|      lea        edx, [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
| @@ -3769,7 +2945,6 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
 | 
|      vpcmpeqb   ymm0, ymm0, ymm0
 | 
|      vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      vmovdqu    ymm1, [eax]
 | 
|      vmovdqu    ymm2, [eax + 32]
 | 
| @@ -3801,23 +2976,22 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
 | 
|      pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
 | 
|      psrld      xmm1, 8
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movq       xmm2, qword ptr [eax]  // 8 Y's
 | 
|      lea        eax, [eax + 8]
 | 
|      punpcklbw  xmm2, xmm2
 | 
|      punpckhwd  xmm3, xmm2
 | 
|      punpcklwd  xmm2, xmm2
 | 
| -    movdqa     xmm4, [edx]
 | 
| -    movdqa     xmm5, [edx + 16]
 | 
| +    movdqu     xmm4, [edx]
 | 
| +    movdqu     xmm5, [edx + 16]
 | 
|      pand       xmm2, xmm0
 | 
|      pand       xmm3, xmm0
 | 
|      pand       xmm4, xmm1
 | 
|      pand       xmm5, xmm1
 | 
|      por        xmm2, xmm4
 | 
|      por        xmm3, xmm5
 | 
| -    movdqa     [edx], xmm2
 | 
| -    movdqa     [edx + 16], xmm3
 | 
| +    movdqu     [edx], xmm2
 | 
| +    movdqu     [edx + 16], xmm3
 | 
|      lea        edx, [edx + 32]
 | 
|      sub        ecx, 8
 | 
|      jg         convertloop
 | 
| @@ -3838,7 +3012,6 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
 | 
|      vpcmpeqb   ymm0, ymm0, ymm0
 | 
|      vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      vpmovzxbd  ymm1, qword ptr [eax]
 | 
|      vpmovzxbd  ymm2, qword ptr [eax + 8]
 | 
| @@ -3860,13 +3033,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
 | 
|  #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
 | 
|  
 | 
|  #ifdef HAS_SETROW_X86
 | 
| -// SetRow8 writes 'count' bytes using a 32 bit value repeated.
 | 
| +// Write 'count' bytes using an 8 bit value repeated.
 | 
| +// Count should be a multiple of 4.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void SetRow_X86(uint8* dst, uint32 v32, int count) {
 | 
| +void SetRow_X86(uint8* dst, uint8 v8, int count) {
 | 
|    __asm {
 | 
| +    movzx      eax, byte ptr [esp + 8]    // v8
 | 
| +    mov        edx, 0x01010101  // Duplicate byte to all bytes.
 | 
| +    mul        edx              // overwrites edx with upper part of result.
 | 
|      mov        edx, edi
 | 
|      mov        edi, [esp + 4]   // dst
 | 
| -    mov        eax, [esp + 8]   // v32
 | 
|      mov        ecx, [esp + 12]  // count
 | 
|      shr        ecx, 2
 | 
|      rep stosd
 | 
| @@ -3875,33 +3051,30 @@ void SetRow_X86(uint8* dst, uint32 v32, int count) {
 | 
|    }
 | 
|  }
 | 
|  
 | 
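SetRow_X86 now takes an 8-bit value: the movzx/mul pair replicates the byte into all four bytes of eax (v8 * 0x01010101) before rep stosd writes count/4 dwords, which is why count should be a multiple of 4; SetRow_ERMS uses rep stosb and has no such restriction. Scalar sketch of the replication trick (illustrative; assumes dst is 4-byte aligned like the stosd path):

    /* How the mul by 0x01010101 turns an 8-bit fill value into a 32-bit one. */
    static void SetRowC(unsigned char* dst, unsigned char v8, int count) {
      unsigned int v32 = v8 * 0x01010101u;   /* v8 copied into all 4 bytes */
      unsigned int* d = (unsigned int*)dst;
      int i;
      for (i = 0; i < count / 4; ++i) {
        d[i] = v32;
      }
    }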
| -// SetRow32 writes 'count' words using a 32 bit value repeated.
 | 
| +// Write 'count' bytes using an 8 bit value repeated.
 | 
| +__declspec(naked) __declspec(align(16))
 | 
| +void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
 | 
| +  __asm {
 | 
| +    mov        edx, edi
 | 
| +    mov        edi, [esp + 4]   // dst
 | 
| +    mov        eax, [esp + 8]   // v8
 | 
| +    mov        ecx, [esp + 12]  // count
 | 
| +    rep stosb
 | 
| +    mov        edi, edx
 | 
| +    ret
 | 
| +  }
 | 
| +}
 | 
| +
 | 
| +// Write 'count' 32 bit values.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
 | 
| -                   int dst_stride, int height) {
 | 
| +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
 | 
|    __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    push       ebp
 | 
| -    mov        edi, [esp + 12 + 4]   // dst
 | 
| -    mov        eax, [esp + 12 + 8]   // v32
 | 
| -    mov        ebp, [esp + 12 + 12]  // width
 | 
| -    mov        edx, [esp + 12 + 16]  // dst_stride
 | 
| -    mov        esi, [esp + 12 + 20]  // height
 | 
| -    lea        ecx, [ebp * 4]
 | 
| -    sub        edx, ecx             // stride - width * 4
 | 
| -
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    mov        ecx, ebp
 | 
| +    mov        edx, edi
 | 
| +    mov        edi, [esp + 4]   // dst
 | 
| +    mov        eax, [esp + 8]   // v32
 | 
| +    mov        ecx, [esp + 12]  // count
 | 
|      rep stosd
 | 
| -    add        edi, edx
 | 
| -    sub        esi, 1
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        ebp
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| +    mov        edi, edx
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
| @@ -3918,7 +3091,6 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
 | 
|      vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
 | 
|      vpsrlw     ymm5, ymm5, 8
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      vmovdqu    ymm0, [eax]
 | 
|      vmovdqu    ymm1, [eax + 32]
 | 
| @@ -3927,9 +3099,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
 | 
|      vpand      ymm1, ymm1, ymm5
 | 
|      vpackuswb  ymm0, ymm0, ymm1   // mutates.
 | 
|      vpermq     ymm0, ymm0, 0xd8
 | 
| -    sub        ecx, 32
 | 
|      vmovdqu    [edx], ymm0
 | 
|      lea        edx, [edx + 32]
 | 
| +    sub        ecx, 32
 | 
|      jg         convertloop
 | 
|      vzeroupper
 | 
|      ret
 | 
| @@ -3951,7 +3123,6 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
 | 
|      vpsrlw     ymm5, ymm5, 8
 | 
|      sub        edi, edx
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      vmovdqu    ymm0, [eax]
 | 
|      vmovdqu    ymm1, [eax + 32]
 | 
| @@ -3994,7 +3165,6 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
 | 
|      vpsrlw     ymm5, ymm5, 8
 | 
|      sub        edi, edx
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      vmovdqu    ymm0, [eax]
 | 
|      vmovdqu    ymm1, [eax + 32]
 | 
| @@ -4029,7 +3199,6 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
 | 
|      mov        edx, [esp + 8]    // dst_y
 | 
|      mov        ecx, [esp + 12]   // pix
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      vmovdqu    ymm0, [eax]
 | 
|      vmovdqu    ymm1, [eax + 32]
 | 
| @@ -4038,12 +3207,12 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
 | 
|      vpsrlw     ymm1, ymm1, 8
 | 
|      vpackuswb  ymm0, ymm0, ymm1   // mutates.
 | 
|      vpermq     ymm0, ymm0, 0xd8
 | 
| -    sub        ecx, 32
 | 
|      vmovdqu    [edx], ymm0
 | 
|      lea        edx, [edx + 32]
 | 
| +    sub        ecx, 32
 | 
|      jg         convertloop
 | 
| -    ret
 | 
|      vzeroupper
 | 
| +    ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
| @@ -4062,7 +3231,6 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
 | 
|      vpsrlw     ymm5, ymm5, 8
 | 
|      sub        edi, edx
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      vmovdqu    ymm0, [eax]
 | 
|      vmovdqu    ymm1, [eax + 32]
 | 
| @@ -4105,7 +3273,6 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
 | 
|      vpsrlw     ymm5, ymm5, 8
 | 
|      sub        edi, edx
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      vmovdqu    ymm0, [eax]
 | 
|      vmovdqu    ymm1, [eax + 32]
 | 
| @@ -4144,114 +3311,6 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
 | 
|      pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
 | 
|      psrlw      xmm5, 8
 | 
|  
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    lea        eax,  [eax + 32]
 | 
| -    pand       xmm0, xmm5   // even bytes are Y
 | 
| -    pand       xmm1, xmm5
 | 
| -    packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [edx], xmm0
 | 
| -    lea        edx, [edx + 16]
 | 
| -    jg         convertloop
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
 | 
| -                      uint8* dst_u, uint8* dst_v, int pix) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 8 + 4]    // src_yuy2
 | 
| -    mov        esi, [esp + 8 + 8]    // stride_yuy2
 | 
| -    mov        edx, [esp + 8 + 12]   // dst_u
 | 
| -    mov        edi, [esp + 8 + 16]   // dst_v
 | 
| -    mov        ecx, [esp + 8 + 20]   // pix
 | 
| -    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
 | 
| -    psrlw      xmm5, 8
 | 
| -    sub        edi, edx
 | 
| -
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + esi]
 | 
| -    movdqa     xmm3, [eax + esi + 16]
 | 
| -    lea        eax,  [eax + 32]
 | 
| -    pavgb      xmm0, xmm2
 | 
| -    pavgb      xmm1, xmm3
 | 
| -    psrlw      xmm0, 8      // YUYV -> UVUV
 | 
| -    psrlw      xmm1, 8
 | 
| -    packuswb   xmm0, xmm1
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    pand       xmm0, xmm5  // U
 | 
| -    packuswb   xmm0, xmm0
 | 
| -    psrlw      xmm1, 8     // V
 | 
| -    packuswb   xmm1, xmm1
 | 
| -    movq       qword ptr [edx], xmm0
 | 
| -    movq       qword ptr [edx + edi], xmm1
 | 
| -    lea        edx, [edx + 8]
 | 
| -    sub        ecx, 16
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
 | 
| -                         uint8* dst_u, uint8* dst_v, int pix) {
 | 
| -  __asm {
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 4 + 4]    // src_yuy2
 | 
| -    mov        edx, [esp + 4 + 8]    // dst_u
 | 
| -    mov        edi, [esp + 4 + 12]   // dst_v
 | 
| -    mov        ecx, [esp + 4 + 16]   // pix
 | 
| -    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
 | 
| -    psrlw      xmm5, 8
 | 
| -    sub        edi, edx
 | 
| -
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    lea        eax,  [eax + 32]
 | 
| -    psrlw      xmm0, 8      // YUYV -> UVUV
 | 
| -    psrlw      xmm1, 8
 | 
| -    packuswb   xmm0, xmm1
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    pand       xmm0, xmm5  // U
 | 
| -    packuswb   xmm0, xmm0
 | 
| -    psrlw      xmm1, 8     // V
 | 
| -    packuswb   xmm1, xmm1
 | 
| -    movq       qword ptr [edx], xmm0
 | 
| -    movq       qword ptr [edx + edi], xmm1
 | 
| -    lea        edx, [edx + 8]
 | 
| -    sub        ecx, 16
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
 | 
| -                               uint8* dst_y, int pix) {
 | 
| -  __asm {
 | 
| -    mov        eax, [esp + 4]    // src_yuy2
 | 
| -    mov        edx, [esp + 8]    // dst_y
 | 
| -    mov        ecx, [esp + 12]   // pix
 | 
| -    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
 | 
| -    psrlw      xmm5, 8
 | 
| -
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movdqu     xmm0, [eax]
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| @@ -4259,17 +3318,17 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
 | 
|      pand       xmm0, xmm5   // even bytes are Y
 | 
|      pand       xmm1, xmm5
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
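Throughout this region the aligned (movdqa) YUY2 and UYVY variants are removed and the movdqu loops take over under the original names. The extraction itself is unchanged: YUY2 stores Y0 U0 Y1 V0 per pixel pair, so luma is the even bytes (pand 0x00ff00ff), while UYVY stores U0 Y0 V0 Y1, so luma is the odd bytes (psrlw 8). Scalar equivalents (illustrative):

    /* Scalar equivalents of YUY2ToYRow and UYVYToYRow. */
    static void YUY2ToYRowC(const unsigned char* src_yuy2, unsigned char* dst_y, int pix) {
      int i;
      for (i = 0; i < pix; ++i) {
        dst_y[i] = src_yuy2[2 * i];      /* even bytes are Y in YUY2 */
      }
    }
    static void UYVYToYRowC(const unsigned char* src_uyvy, unsigned char* dst_y, int pix) {
      int i;
      for (i = 0; i < pix; ++i) {
        dst_y[i] = src_uyvy[2 * i + 1];  /* odd bytes are Y in UYVY */
      }
    }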
|  __declspec(naked) __declspec(align(16))
 | 
| -void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
 | 
| -                                uint8* dst_u, uint8* dst_v, int pix) {
 | 
| +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
 | 
| +                      uint8* dst_u, uint8* dst_v, int pix) {
 | 
|    __asm {
 | 
|      push       esi
 | 
|      push       edi
 | 
| @@ -4282,7 +3341,6 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
 | 
|      psrlw      xmm5, 8
 | 
|      sub        edi, edx
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movdqu     xmm0, [eax]
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| @@ -4312,8 +3370,8 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
 | 
| -                                   uint8* dst_u, uint8* dst_v, int pix) {
 | 
| +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
 | 
| +                         uint8* dst_u, uint8* dst_v, int pix) {
 | 
|    __asm {
 | 
|      push       edi
 | 
|      mov        eax, [esp + 4 + 4]    // src_yuy2
 | 
| @@ -4324,7 +3382,6 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
 | 
|      psrlw      xmm5, 8
 | 
|      sub        edi, edx
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movdqu     xmm0, [eax]
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| @@ -4356,112 +3413,6 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
 | 
|      mov        edx, [esp + 8]    // dst_y
 | 
|      mov        ecx, [esp + 12]   // pix
 | 
|  
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    lea        eax,  [eax + 32]
 | 
| -    psrlw      xmm0, 8    // odd bytes are Y
 | 
| -    psrlw      xmm1, 8
 | 
| -    packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [edx], xmm0
 | 
| -    lea        edx, [edx + 16]
 | 
| -    jg         convertloop
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
 | 
| -                      uint8* dst_u, uint8* dst_v, int pix) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 8 + 4]    // src_yuy2
 | 
| -    mov        esi, [esp + 8 + 8]    // stride_yuy2
 | 
| -    mov        edx, [esp + 8 + 12]   // dst_u
 | 
| -    mov        edi, [esp + 8 + 16]   // dst_v
 | 
| -    mov        ecx, [esp + 8 + 20]   // pix
 | 
| -    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
 | 
| -    psrlw      xmm5, 8
 | 
| -    sub        edi, edx
 | 
| -
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + esi]
 | 
| -    movdqa     xmm3, [eax + esi + 16]
 | 
| -    lea        eax,  [eax + 32]
 | 
| -    pavgb      xmm0, xmm2
 | 
| -    pavgb      xmm1, xmm3
 | 
| -    pand       xmm0, xmm5   // UYVY -> UVUV
 | 
| -    pand       xmm1, xmm5
 | 
| -    packuswb   xmm0, xmm1
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    pand       xmm0, xmm5  // U
 | 
| -    packuswb   xmm0, xmm0
 | 
| -    psrlw      xmm1, 8     // V
 | 
| -    packuswb   xmm1, xmm1
 | 
| -    movq       qword ptr [edx], xmm0
 | 
| -    movq       qword ptr [edx + edi], xmm1
 | 
| -    lea        edx, [edx + 8]
 | 
| -    sub        ecx, 16
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 | 
| -                         uint8* dst_u, uint8* dst_v, int pix) {
 | 
| -  __asm {
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 4 + 4]    // src_yuy2
 | 
| -    mov        edx, [esp + 4 + 8]    // dst_u
 | 
| -    mov        edi, [esp + 4 + 12]   // dst_v
 | 
| -    mov        ecx, [esp + 4 + 16]   // pix
 | 
| -    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
 | 
| -    psrlw      xmm5, 8
 | 
| -    sub        edi, edx
 | 
| -
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    lea        eax,  [eax + 32]
 | 
| -    pand       xmm0, xmm5   // UYVY -> UVUV
 | 
| -    pand       xmm1, xmm5
 | 
| -    packuswb   xmm0, xmm1
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    pand       xmm0, xmm5  // U
 | 
| -    packuswb   xmm0, xmm0
 | 
| -    psrlw      xmm1, 8     // V
 | 
| -    packuswb   xmm1, xmm1
 | 
| -    movq       qword ptr [edx], xmm0
 | 
| -    movq       qword ptr [edx + edi], xmm1
 | 
| -    lea        edx, [edx + 8]
 | 
| -    sub        ecx, 16
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
 | 
| -                               uint8* dst_y, int pix) {
 | 
| -  __asm {
 | 
| -    mov        eax, [esp + 4]    // src_uyvy
 | 
| -    mov        edx, [esp + 8]    // dst_y
 | 
| -    mov        ecx, [esp + 12]   // pix
 | 
| -
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movdqu     xmm0, [eax]
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| @@ -4469,17 +3420,17 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
 | 
|      psrlw      xmm0, 8    // odd bytes are Y
 | 
|      psrlw      xmm1, 8
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
 | 
| -                                uint8* dst_u, uint8* dst_v, int pix) {
 | 
| +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
 | 
| +                      uint8* dst_u, uint8* dst_v, int pix) {
 | 
|    __asm {
 | 
|      push       esi
 | 
|      push       edi
 | 
| @@ -4492,7 +3443,6 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
 | 
|      psrlw      xmm5, 8
 | 
|      sub        edi, edx
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movdqu     xmm0, [eax]
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| @@ -4522,8 +3472,8 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
 | 
|  }
 | 
|  
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
 | 
| -                                   uint8* dst_u, uint8* dst_v, int pix) {
 | 
| +void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 | 
| +                         uint8* dst_u, uint8* dst_v, int pix) {
 | 
|    __asm {
 | 
|      push       edi
 | 
|      mov        eax, [esp + 4 + 4]    // src_uyvy

 | 
| @@ -4534,7 +3484,6 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
 | 
|      psrlw      xmm5, 8
 | 
|      sub        edi, edx
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movdqu     xmm0, [eax]
 | 
|      movdqu     xmm1, [eax + 16]
 | 
| @@ -4607,9 +3556,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      paddusb    xmm0, xmm2       // + src argb
 | 
|      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
 | 
|      paddusb    xmm0, xmm1       // + src argb
 | 
| -    sub        ecx, 1
 | 
|      movd       [edx], xmm0
 | 
|      lea        edx, [edx + 4]
 | 
| +    sub        ecx, 1
 | 
|      jge        alignloop1
 | 
|  
 | 
|    alignloop1b:
 | 
| @@ -4638,9 +3587,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      paddusb    xmm0, xmm2       // + src argb
 | 
|      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
 | 
|      paddusb    xmm0, xmm1       // + src argb
 | 
| -    sub        ecx, 4
 | 
| -    movdqa     [edx], xmm0
 | 
| +    movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jge        convertloop4
 | 
|  
 | 
|    convertloop4b:
 | 
| @@ -4669,9 +3618,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      paddusb    xmm0, xmm2       // + src argb
 | 
|      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
 | 
|      paddusb    xmm0, xmm1       // + src argb
 | 
| -    sub        ecx, 1
 | 
|      movd       [edx], xmm0
 | 
|      lea        edx, [edx + 4]
 | 
| +    sub        ecx, 1
 | 
|      jge        convertloop1
 | 
|  
 | 
|    convertloop1b:
 | 
| @@ -4739,48 +3688,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      paddusb    xmm0, xmm2       // + src argb
 | 
|      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
 | 
|      paddusb    xmm0, xmm1       // + src argb
 | 
| -    sub        ecx, 1
 | 
|      movd       [edx], xmm0
 | 
|      lea        edx, [edx + 4]
 | 
| +    sub        ecx, 1
 | 
|      jge        alignloop1
 | 
|  
 | 
|    alignloop1b:
 | 
|      add        ecx, 1 - 4
 | 
|      jl         convertloop4b
 | 
|  
 | 
| -    test       eax, 15          // unaligned?
 | 
| -    jne        convertuloop4
 | 
| -    test       esi, 15          // unaligned?
 | 
| -    jne        convertuloop4
 | 
| -
 | 
|      // 4 pixel loop.
 | 
|    convertloop4:
 | 
| -    movdqa     xmm3, [eax]      // src argb
 | 
| -    lea        eax, [eax + 16]
 | 
| -    movdqa     xmm0, xmm3       // src argb
 | 
| -    pxor       xmm3, xmm4       // ~alpha
 | 
| -    movdqa     xmm2, [esi]      // _r_b
 | 
| -    pshufb     xmm3, kShuffleAlpha // alpha
 | 
| -    pand       xmm2, xmm6       // _r_b
 | 
| -    paddw      xmm3, xmm7       // 256 - alpha
 | 
| -    pmullw     xmm2, xmm3       // _r_b * alpha
 | 
| -    movdqa     xmm1, [esi]      // _a_g
 | 
| -    lea        esi, [esi + 16]
 | 
| -    psrlw      xmm1, 8          // _a_g
 | 
| -    por        xmm0, xmm4       // set alpha to 255
 | 
| -    pmullw     xmm1, xmm3       // _a_g * alpha
 | 
| -    psrlw      xmm2, 8          // _r_b convert to 8 bits again
 | 
| -    paddusb    xmm0, xmm2       // + src argb
 | 
| -    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
 | 
| -    paddusb    xmm0, xmm1       // + src argb
 | 
| -    sub        ecx, 4
 | 
| -    movdqa     [edx], xmm0
 | 
| -    lea        edx, [edx + 16]
 | 
| -    jge        convertloop4
 | 
| -    jmp        convertloop4b
 | 
| -
 | 
| -    // 4 pixel unaligned loop.
 | 
| -  convertuloop4:
 | 
|      movdqu     xmm3, [eax]      // src argb
 | 
|      lea        eax, [eax + 16]
 | 
|      movdqa     xmm0, xmm3       // src argb
 | 
| @@ -4799,10 +3717,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      paddusb    xmm0, xmm2       // + src argb
 | 
|      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
 | 
|      paddusb    xmm0, xmm1       // + src argb
 | 
| -    sub        ecx, 4
 | 
| -    movdqa     [edx], xmm0
 | 
| +    movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| -    jge        convertuloop4
 | 
| +    sub        ecx, 4
 | 
| +    jge        convertloop4
 | 
|  
 | 
|    convertloop4b:
 | 
|      add        ecx, 4 - 1
 | 
| @@ -4828,9 +3746,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      paddusb    xmm0, xmm2       // + src argb
 | 
|      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
 | 
|      paddusb    xmm0, xmm1       // + src argb
 | 
| -    sub        ecx, 1
 | 
|      movd       [edx], xmm0
 | 
|      lea        edx, [edx + 4]
 | 
| +    sub        ecx, 1
 | 
|      jge        convertloop1
 | 
|  
 | 
|    convertloop1b:
 | 
| @@ -4842,7 +3760,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 | 
|  
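Per pixel, the ARGBBlendRow kernels above compute an "over" composite that treats the source as alpha-premultiplied and forces the destination alpha to 255. A scalar sketch of that computation (illustrative only; the _Sketch name is made up and the >>8 step mirrors the "256 - alpha" scaling in the assembly, whose rounding differs slightly from an exact /255):

static void ARGBBlendRow_Sketch(const unsigned char* src_argb0,
                                const unsigned char* src_argb1,
                                unsigned char* dst_argb, int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    unsigned int a = src_argb0[3];               /* source alpha */
    for (c = 0; c < 3; ++c) {                    /* B, G, R */
      unsigned int v = src_argb0[c] + ((src_argb1[c] * (256 - a)) >> 8);
      dst_argb[c] = (unsigned char)(v > 255 ? 255 : v);  /* saturate like paddusb */
    }
    dst_argb[3] = 255;                           /* alpha forced to opaque */
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}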
 | 
|  #ifdef HAS_ARGBATTENUATEROW_SSE2
 | 
|  // Attenuate 4 pixels at a time.
 | 
| -// Aligned to 16 bytes.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|    __asm {
 | 
| @@ -4854,19 +3771,18 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|      pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
 | 
|      psrld      xmm5, 8
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]      // read 4 pixels
 | 
| +    movdqu     xmm0, [eax]      // read 4 pixels
 | 
|      punpcklbw  xmm0, xmm0       // first 2
 | 
|      pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
 | 
|      pshuflw    xmm2, xmm2, 0FFh
 | 
|      pmulhuw    xmm0, xmm2       // rgb * a
 | 
| -    movdqa     xmm1, [eax]      // read 4 pixels
 | 
| +    movdqu     xmm1, [eax]      // read 4 pixels
 | 
|      punpckhbw  xmm1, xmm1       // next 2 pixels
 | 
|      pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
 | 
|      pshuflw    xmm2, xmm2, 0FFh
 | 
|      pmulhuw    xmm1, xmm2       // rgb * a
 | 
| -    movdqa     xmm2, [eax]      // alphas
 | 
| +    movdqu     xmm2, [eax]      // alphas
 | 
|      lea        eax, [eax + 16]
 | 
|      psrlw      xmm0, 8
 | 
|      pand       xmm2, xmm4
 | 
| @@ -4874,9 +3790,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|      packuswb   xmm0, xmm1
 | 
|      pand       xmm0, xmm5       // keep original alphas
 | 
|      por        xmm0, xmm2
 | 
| -    sub        ecx, 4
 | 
| -    movdqa     [edx], xmm0
 | 
| +    movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         convertloop
 | 
|  
 | 
|      ret
 | 
| @@ -4904,7 +3820,6 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|      movdqa     xmm4, kShuffleAlpha0
 | 
|      movdqa     xmm5, kShuffleAlpha1
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu     xmm0, [eax]      // read 4 pixels
 | 
|      pshufb     xmm0, xmm4       // isolate first 2 alphas
 | 
| @@ -4923,9 +3838,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|      psrlw      xmm1, 8
 | 
|      packuswb   xmm0, xmm1
 | 
|      por        xmm0, xmm2       // copy original alpha
 | 
| -    sub        ecx, 4
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         convertloop
 | 
|  
 | 
|      ret
 | 
| @@ -4935,11 +3850,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|  
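The attenuate kernels in this section scale each color channel by the pixel's own alpha, roughly value * alpha / 255, and leave the alpha channel untouched. A scalar sketch under that reading (the _Sketch name is invented, and the SIMD byte-duplication trick via punpcklbw/pmulhuw rounds slightly differently than an exact /255):

static void ARGBAttenuateRow_Sketch(const unsigned char* src_argb,
                                    unsigned char* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    unsigned int a = src_argb[3];
    dst_argb[0] = (unsigned char)((src_argb[0] * a) / 255);  /* B */
    dst_argb[1] = (unsigned char)((src_argb[1] * a) / 255);  /* G */
    dst_argb[2] = (unsigned char)((src_argb[2] * a) / 255);  /* R */
    dst_argb[3] = (unsigned char)a;                          /* A unchanged */
    src_argb += 4;
    dst_argb += 4;
  }
}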
 | 
|  #ifdef HAS_ARGBATTENUATEROW_AVX2
 | 
|  // Shuffle table duplicating alpha.
 | 
| -static const ulvec8 kShuffleAlpha_AVX2 = {
 | 
| -  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
 | 
| -  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
 | 
| -  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
 | 
| -  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
 | 
| +static const uvec8 kShuffleAlpha_AVX2 = {
 | 
| +  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
 | 
|  };
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
| @@ -4948,11 +3860,10 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|      mov        edx, [esp + 8]   // dst_argb
 | 
|      mov        ecx, [esp + 12]  // width
 | 
|      sub        edx, eax
 | 
| -    vmovdqa    ymm4, kShuffleAlpha_AVX2
 | 
| +    vbroadcastf128 ymm4, kShuffleAlpha_AVX2
 | 
|      vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
 | 
|      vpslld     ymm5, ymm5, 24
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      vmovdqu    ymm6, [eax]       // read 8 pixels.
 | 
|      vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
 | 
| @@ -4966,9 +3877,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|      vpsrlw     ymm1, ymm1, 8
 | 
|      vpackuswb  ymm0, ymm0, ymm1  // unmutated.
 | 
|      vpor       ymm0, ymm0, ymm6  // copy original alpha
 | 
| -    sub        ecx, 8
 | 
|      vmovdqu    [eax + edx], ymm0
 | 
|      lea        eax, [eax + 32]
 | 
| +    sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
|      vzeroupper
 | 
| @@ -4979,7 +3890,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|  
 | 
|  #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 | 
|  // Unattenuate 4 pixels at a time.
 | 
| -// Aligned to 16 bytes.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 | 
|                               int width) {
 | 
| @@ -4990,7 +3900,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 | 
|      mov        edx, [esp + 8 + 8]   // dst_argb
 | 
|      mov        ecx, [esp + 8 + 12]  // width
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu     xmm0, [eax]      // read 4 pixels
 | 
|      movzx      esi, byte ptr [eax + 3]  // first alpha
 | 
| @@ -5016,9 +3925,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 | 
|      lea        eax, [eax + 16]
 | 
|  
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 4
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         convertloop
 | 
|      pop        edi
 | 
|      pop        esi
 | 
| @@ -5029,9 +3938,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 | 
|  
 | 
|  #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 | 
|  // Shuffle table duplicating alpha.
 | 
| -static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
 | 
| -  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
 | 
| -  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
 | 
| +static const uvec8 kUnattenShuffleAlpha_AVX2 = {
 | 
| +  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
 | 
|  };
 | 
|  // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
 | 
|  // USE_GATHER is not on by default, due to being a slow instruction.
 | 
| @@ -5044,9 +3952,8 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 | 
|      mov        edx, [esp + 8]   // dst_argb
 | 
|      mov        ecx, [esp + 12]  // width
 | 
|      sub        edx, eax
 | 
| -    vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
 | 
| +    vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      vmovdqu    ymm6, [eax]       // read 8 pixels.
 | 
|      vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
 | 
| @@ -5061,9 +3968,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 | 
|      vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
 | 
|      vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
 | 
|      vpackuswb  ymm0, ymm0, ymm1  // unmutated.
 | 
| -    sub        ecx, 8
 | 
|      vmovdqu    [eax + edx], ymm0
 | 
|      lea        eax, [eax + 32]
 | 
| +    sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
|      vzeroupper
 | 
| @@ -5080,12 +3987,11 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 | 
|      mov        edx, [esp + 8]   // dst_argb
 | 
|      mov        ecx, [esp + 12]  // width
 | 
|      sub        edx, eax
 | 
| -    vmovdqa    ymm5, kUnattenShuffleAlpha_AVX2
 | 
| +    vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
 | 
|  
 | 
|      push       esi
 | 
|      push       edi
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      // replace VPGATHER
 | 
|      movzx      esi, byte ptr [eax + 3]                 // alpha0
 | 
| @@ -5123,9 +4029,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 | 
|      vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
 | 
|      vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
 | 
|      vpackuswb  ymm0, ymm0, ymm1  // unmutated.
 | 
| -    sub        ecx, 8
 | 
|      vmovdqu    [eax + edx], ymm0
 | 
|      lea        eax, [eax + 32]
 | 
| +    sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| @@ -5148,18 +4054,17 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|      movdqa     xmm4, kARGBToYJ
 | 
|      movdqa     xmm5, kAddYJ64
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]  // G
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| +    movdqu     xmm0, [eax]  // G
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
|      pmaddubsw  xmm0, xmm4
 | 
|      pmaddubsw  xmm1, xmm4
 | 
|      phaddw     xmm0, xmm1
 | 
|      paddw      xmm0, xmm5  // Add .5 for rounding.
 | 
|      psrlw      xmm0, 7
 | 
|      packuswb   xmm0, xmm0   // 8 G bytes
 | 
| -    movdqa     xmm2, [eax]  // A
 | 
| -    movdqa     xmm3, [eax + 16]
 | 
| +    movdqu     xmm2, [eax]  // A
 | 
| +    movdqu     xmm3, [eax + 16]
 | 
|      lea        eax, [eax + 32]
 | 
|      psrld      xmm2, 24
 | 
|      psrld      xmm3, 24
 | 
| @@ -5171,10 +4076,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 | 
|      movdqa     xmm1, xmm0
 | 
|      punpcklwd  xmm0, xmm3   // GGGA first 4
 | 
|      punpckhwd  xmm1, xmm3   // GGGA next 4
 | 
| -    sub        ecx, 8
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| +    movdqu     [edx], xmm0
 | 
| +    movdqu     [edx + 16], xmm1
 | 
|      lea        edx, [edx + 32]
 | 
| +    sub        ecx, 8
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
| @@ -5208,32 +4113,31 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
 | 
|      movdqa     xmm3, kARGBToSepiaG
 | 
|      movdqa     xmm4, kARGBToSepiaR
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]  // B
 | 
| -    movdqa     xmm6, [eax + 16]
 | 
| +    movdqu     xmm0, [eax]  // B
 | 
| +    movdqu     xmm6, [eax + 16]
 | 
|      pmaddubsw  xmm0, xmm2
 | 
|      pmaddubsw  xmm6, xmm2
 | 
|      phaddw     xmm0, xmm6
 | 
|      psrlw      xmm0, 7
 | 
|      packuswb   xmm0, xmm0   // 8 B values
 | 
| -    movdqa     xmm5, [eax]  // G
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| +    movdqu     xmm5, [eax]  // G
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
|      pmaddubsw  xmm5, xmm3
 | 
|      pmaddubsw  xmm1, xmm3
 | 
|      phaddw     xmm5, xmm1
 | 
|      psrlw      xmm5, 7
 | 
|      packuswb   xmm5, xmm5   // 8 G values
 | 
|      punpcklbw  xmm0, xmm5   // 8 BG values
 | 
| -    movdqa     xmm5, [eax]  // R
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| +    movdqu     xmm5, [eax]  // R
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
|      pmaddubsw  xmm5, xmm4
 | 
|      pmaddubsw  xmm1, xmm4
 | 
|      phaddw     xmm5, xmm1
 | 
|      psrlw      xmm5, 7
 | 
|      packuswb   xmm5, xmm5   // 8 R values
 | 
| -    movdqa     xmm6, [eax]  // A
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| +    movdqu     xmm6, [eax]  // A
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
|      psrld      xmm6, 24
 | 
|      psrld      xmm1, 24
 | 
|      packuswb   xmm6, xmm1
 | 
| @@ -5242,10 +4146,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
 | 
|      movdqa     xmm1, xmm0   // Weave BG, RA together
 | 
|      punpcklwd  xmm0, xmm5   // BGRA first 4
 | 
|      punpckhwd  xmm1, xmm5   // BGRA next 4
 | 
| -    sub        ecx, 8
 | 
| -    movdqa     [eax], xmm0
 | 
| -    movdqa     [eax + 16], xmm1
 | 
| +    movdqu     [eax], xmm0
 | 
| +    movdqu     [eax + 16], xmm1
 | 
|      lea        eax, [eax + 32]
 | 
| +    sub        ecx, 8
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
| @@ -5271,14 +4175,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 | 
|      pshufd     xmm5, xmm5, 0xff
 | 
|      mov        ecx, [esp + 16]  /* width */
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]  // B
 | 
| -    movdqa     xmm7, [eax + 16]
 | 
| +    movdqu     xmm0, [eax]  // B
 | 
| +    movdqu     xmm7, [eax + 16]
 | 
|      pmaddubsw  xmm0, xmm2
 | 
|      pmaddubsw  xmm7, xmm2
 | 
| -    movdqa     xmm6, [eax]  // G
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| +    movdqu     xmm6, [eax]  // G
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
|      pmaddubsw  xmm6, xmm3
 | 
|      pmaddubsw  xmm1, xmm3
 | 
|      phaddsw    xmm0, xmm7   // B
 | 
| @@ -5288,13 +4191,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 | 
|      packuswb   xmm0, xmm0   // 8 B values
 | 
|      packuswb   xmm6, xmm6   // 8 G values
 | 
|      punpcklbw  xmm0, xmm6   // 8 BG values
 | 
| -    movdqa     xmm1, [eax]  // R
 | 
| -    movdqa     xmm7, [eax + 16]
 | 
| +    movdqu     xmm1, [eax]  // R
 | 
| +    movdqu     xmm7, [eax + 16]
 | 
|      pmaddubsw  xmm1, xmm4
 | 
|      pmaddubsw  xmm7, xmm4
 | 
|      phaddsw    xmm1, xmm7   // R
 | 
| -    movdqa     xmm6, [eax]  // A
 | 
| -    movdqa     xmm7, [eax + 16]
 | 
| +    movdqu     xmm6, [eax]  // A
 | 
| +    movdqu     xmm7, [eax + 16]
 | 
|      pmaddubsw  xmm6, xmm5
 | 
|      pmaddubsw  xmm7, xmm5
 | 
|      phaddsw    xmm6, xmm7   // A
 | 
| @@ -5306,11 +4209,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 | 
|      movdqa     xmm6, xmm0   // Weave BG, RA together
 | 
|      punpcklwd  xmm0, xmm1   // BGRA first 4
 | 
|      punpckhwd  xmm6, xmm1   // BGRA next 4
 | 
| -    sub        ecx, 8
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm6
 | 
| +    movdqu     [edx], xmm0
 | 
| +    movdqu     [edx + 16], xmm6
 | 
|      lea        eax, [eax + 32]
 | 
|      lea        edx, [edx + 32]
 | 
| +    sub        ecx, 8
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
| @@ -5319,7 +4222,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 | 
|  
 | 
|  #ifdef HAS_ARGBQUANTIZEROW_SSE2
 | 
|  // Quantize 4 ARGB pixels (16 bytes).
 | 
| -// Aligned to 16 bytes.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 | 
|                            int interval_offset, int width) {
 | 
| @@ -5339,25 +4241,24 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 | 
|      pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
 | 
|      pslld      xmm6, 24
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]  // read 4 pixels
 | 
| +    movdqu     xmm0, [eax]  // read 4 pixels
 | 
|      punpcklbw  xmm0, xmm5   // first 2 pixels
 | 
|      pmulhuw    xmm0, xmm2   // pixel * scale >> 16
 | 
| -    movdqa     xmm1, [eax]  // read 4 pixels
 | 
| +    movdqu     xmm1, [eax]  // read 4 pixels
 | 
|      punpckhbw  xmm1, xmm5   // next 2 pixels
 | 
|      pmulhuw    xmm1, xmm2
 | 
|      pmullw     xmm0, xmm3   // * interval_size
 | 
| -    movdqa     xmm7, [eax]  // read 4 pixels
 | 
| +    movdqu     xmm7, [eax]  // read 4 pixels
 | 
|      pmullw     xmm1, xmm3
 | 
|      pand       xmm7, xmm6   // mask alpha
 | 
|      paddw      xmm0, xmm4   // + interval_size / 2
 | 
|      paddw      xmm1, xmm4
 | 
|      packuswb   xmm0, xmm1
 | 
|      por        xmm0, xmm7
 | 
| -    sub        ecx, 4
 | 
| -    movdqa     [eax], xmm0
 | 
| +    movdqu     [eax], xmm0
 | 
|      lea        eax, [eax + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         convertloop
 | 
|      ret
 | 
|    }
 | 
| @@ -5366,7 +4267,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 | 
|  
 | 
|  #ifdef HAS_ARGBSHADEROW_SSE2
 | 
|  // Shade 4 pixels at a time by specified value.
 | 
| -// Aligned to 16 bytes.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 | 
|                         uint32 value) {
 | 
| @@ -5378,9 +4278,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 | 
|      punpcklbw  xmm2, xmm2
 | 
|      punpcklqdq xmm2, xmm2
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]      // read 4 pixels
 | 
| +    movdqu     xmm0, [eax]      // read 4 pixels
 | 
|      lea        eax, [eax + 16]
 | 
|      movdqa     xmm1, xmm0
 | 
|      punpcklbw  xmm0, xmm0       // first 2
 | 
| @@ -5390,9 +4289,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 | 
|      psrlw      xmm0, 8
 | 
|      psrlw      xmm1, 8
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 4
 | 
| -    movdqa     [edx], xmm0
 | 
| +    movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         convertloop
 | 
|  
 | 
|      ret
 | 
| @@ -5413,7 +4312,6 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      mov        ecx, [esp + 4 + 16]  // width
 | 
|      pxor       xmm5, xmm5  // constant 0
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
 | 
|      movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
 | 
| @@ -5428,9 +4326,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      lea        eax, [eax + 16]
 | 
|      lea        esi, [esi + 16]
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 4
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        esi
 | 
| @@ -5455,16 +4353,15 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      sub        ecx, 4
 | 
|      jl         convertloop49
 | 
|  
 | 
| -    align      4
 | 
|   convertloop4:
 | 
|      movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
 | 
|      lea        eax, [eax + 16]
 | 
|      movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
 | 
|      lea        esi, [esi + 16]
 | 
|      paddusb    xmm0, xmm1         // src_argb0 + src_argb1
 | 
| -    sub        ecx, 4
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jge        convertloop4
 | 
|  
 | 
|   convertloop49:
 | 
| @@ -5477,9 +4374,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      movd       xmm1, [esi]        // read 1 pixels from src_argb1
 | 
|      lea        esi, [esi + 4]
 | 
|      paddusb    xmm0, xmm1         // src_argb0 + src_argb1
 | 
| -    sub        ecx, 1
 | 
|      movd       [edx], xmm0
 | 
|      lea        edx, [edx + 4]
 | 
| +    sub        ecx, 1
 | 
|      jge        convertloop1
 | 
|  
 | 
|   convertloop19:
 | 
| @@ -5501,16 +4398,15 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      mov        edx, [esp + 4 + 12]  // dst_argb
 | 
|      mov        ecx, [esp + 4 + 16]  // width
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
 | 
|      lea        eax, [eax + 16]
 | 
|      movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
 | 
|      lea        esi, [esi + 16]
 | 
|      psubusb    xmm0, xmm1         // src_argb0 - src_argb1
 | 
| -    sub        ecx, 4
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        esi
 | 
| @@ -5532,7 +4428,6 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      mov        ecx, [esp + 4 + 16]  // width
 | 
|      vpxor      ymm5, ymm5, ymm5     // constant 0
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
 | 
|      lea        eax, [eax + 32]
 | 
| @@ -5569,7 +4464,6 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      mov        edx, [esp + 4 + 12]  // dst_argb
 | 
|      mov        ecx, [esp + 4 + 16]  // width
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
 | 
|      lea        eax, [eax + 32]
 | 
| @@ -5599,7 +4493,6 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 | 
|      mov        edx, [esp + 4 + 12]  // dst_argb
 | 
|      mov        ecx, [esp + 4 + 16]  // width
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
 | 
|      lea        eax, [eax + 32]
 | 
| @@ -5638,7 +4531,6 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 | 
|      sub        edx, eax
 | 
|      pxor       xmm5, xmm5  // constant 0
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
 | 
|      movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
 | 
| @@ -5662,9 +4554,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 | 
|      psubw      xmm1, xmm0
 | 
|      pmaxsw     xmm0, xmm1
 | 
|      packuswb   xmm0, xmm0
 | 
| -    sub        ecx, 8
 | 
|      movq       qword ptr [eax + edx], xmm0
 | 
|      lea        eax, [eax + 8]
 | 
| +    sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| @@ -5692,7 +4584,6 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 | 
|      sub        edx, eax
 | 
|      pxor       xmm5, xmm5  // constant 0
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
 | 
|      movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
 | 
| @@ -5716,9 +4607,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 | 
|      psubw      xmm1, xmm0
 | 
|      pmaxsw     xmm0, xmm1
 | 
|      packuswb   xmm0, xmm0
 | 
| -    sub        ecx, 8
 | 
|      movq       qword ptr [eax + edx], xmm0
 | 
|      lea        eax, [eax + 8]
 | 
| +    sub        ecx, 8
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        esi
 | 
| @@ -5746,10 +4637,9 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 | 
|      pcmpeqb    xmm5, xmm5           // alpha 255
 | 
|      pslld      xmm5, 24             // 0xff000000
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
 | 
| -    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
 | 
| +    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
 | 
| +    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
 | 
|      lea        eax, [eax + 16]
 | 
|      paddusb    xmm0, xmm1             // sobel = sobelx + sobely
 | 
|      movdqa     xmm2, xmm0             // GG
 | 
| @@ -5765,12 +4655,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 | 
|      punpckhwd  xmm0, xmm0             // Last 4
 | 
|      por        xmm3, xmm5             // GGGA
 | 
|      por        xmm0, xmm5
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [edx], xmm1
 | 
| -    movdqa     [edx + 16], xmm2
 | 
| -    movdqa     [edx + 32], xmm3
 | 
| -    movdqa     [edx + 48], xmm0
 | 
| +    movdqu     [edx], xmm1
 | 
| +    movdqu     [edx + 16], xmm2
 | 
| +    movdqu     [edx + 32], xmm3
 | 
| +    movdqu     [edx + 48], xmm0
 | 
|      lea        edx, [edx + 64]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        esi
 | 
| @@ -5792,15 +4682,14 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 | 
|      mov        ecx, [esp + 4 + 16]  // width
 | 
|      sub        esi, eax
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
 | 
| -    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
 | 
| +    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
 | 
| +    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
 | 
|      lea        eax, [eax + 16]
 | 
|      paddusb    xmm0, xmm1             // sobel = sobelx + sobely
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [edx], xmm0
 | 
| +    movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        esi
 | 
| @@ -5827,10 +4716,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 | 
|      sub        esi, eax
 | 
|      pcmpeqb    xmm5, xmm5           // alpha 255
 | 
|  
 | 
| -    align      4
 | 
|   convertloop:
 | 
| -    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
 | 
| -    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
 | 
| +    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
 | 
| +    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
 | 
|      lea        eax, [eax + 16]
 | 
|      movdqa     xmm2, xmm0
 | 
|      paddusb    xmm2, xmm1             // sobel = sobelx + sobely
 | 
| @@ -5846,12 +4734,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 | 
|      movdqa     xmm7, xmm1             // YSXA
 | 
|      punpcklwd  xmm7, xmm0             // Next 4
 | 
|      punpckhwd  xmm1, xmm0             // Last 4
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [edx], xmm6
 | 
| -    movdqa     [edx + 16], xmm4
 | 
| -    movdqa     [edx + 32], xmm7
 | 
| -    movdqa     [edx + 48], xmm1
 | 
| +    movdqu     [edx], xmm6
 | 
| +    movdqu     [edx + 16], xmm4
 | 
| +    movdqu     [edx + 32], xmm7
 | 
| +    movdqu     [edx + 48], xmm1
 | 
|      lea        edx, [edx + 64]
 | 
| +    sub        ecx, 16
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        esi
 | 
| @@ -5872,8 +4760,7 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 | 
|  // area is the number of pixels in the area being averaged.
 | 
|  // dst points to pixel to store result to.
 | 
|  // count is number of averaged pixels to produce.
 | 
| -// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
 | 
| -// aligned.
 | 
| +// Does 4 pixels at a time.
 | 
|  void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
 | 
|                                      int width, int area, uint8* dst,
 | 
|                                      int count) {
 | 
| @@ -5903,13 +4790,12 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
 | 
|      packssdw   xmm5, xmm5           // 16 bit shorts
 | 
|  
 | 
|      // 4 pixel loop small blocks.
 | 
| -    align      4
 | 
|    s4:
 | 
|      // top left
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| +    movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
| +    movdqu     xmm2, [eax + 32]
 | 
| +    movdqu     xmm3, [eax + 48]
 | 
|  
 | 
|      // - top right
 | 
|      psubd      xmm0, [eax + edx * 4]
 | 
| @@ -5946,13 +4832,12 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
 | 
|      jmp        l4b
 | 
|  
 | 
|      // 4 pixel loop
 | 
| -    align      4
 | 
|    l4:
 | 
|      // top left
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    movdqa     xmm2, [eax + 32]
 | 
| -    movdqa     xmm3, [eax + 48]
 | 
| +    movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
| +    movdqu     xmm2, [eax + 32]
 | 
| +    movdqu     xmm3, [eax + 48]
 | 
|  
 | 
|      // - top right
 | 
|      psubd      xmm0, [eax + edx * 4]
 | 
| @@ -5999,9 +4884,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
 | 
|      jl         l1b
 | 
|  
 | 
|      // 1 pixel loop
 | 
| -    align      4
 | 
|    l1:
 | 
| -    movdqa     xmm0, [eax]
 | 
| +    movdqu     xmm0, [eax]
 | 
|      psubd      xmm0, [eax + edx * 4]
 | 
|      lea        eax, [eax + 16]
 | 
|      psubd      xmm0, [esi]
 | 
| @@ -6040,7 +4924,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
 | 
|      jne        l4b
 | 
|  
 | 
|      // 4 pixel loop
 | 
| -    align      4
 | 
|    l4:
 | 
|      movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
 | 
|      lea        eax, [eax + 16]
 | 
| @@ -6057,26 +4940,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
 | 
|      punpckhwd  xmm5, xmm1
 | 
|  
 | 
|      paddd      xmm0, xmm2
 | 
| -    movdqa     xmm2, [esi]  // previous row above.
 | 
| +    movdqu     xmm2, [esi]  // previous row above.
 | 
|      paddd      xmm2, xmm0
 | 
|  
 | 
|      paddd      xmm0, xmm3
 | 
| -    movdqa     xmm3, [esi + 16]
 | 
| +    movdqu     xmm3, [esi + 16]
 | 
|      paddd      xmm3, xmm0
 | 
|  
 | 
|      paddd      xmm0, xmm4
 | 
| -    movdqa     xmm4, [esi + 32]
 | 
| +    movdqu     xmm4, [esi + 32]
 | 
|      paddd      xmm4, xmm0
 | 
|  
 | 
|      paddd      xmm0, xmm5
 | 
| -    movdqa     xmm5, [esi + 48]
 | 
| +    movdqu     xmm5, [esi + 48]
 | 
|      lea        esi, [esi + 64]
 | 
|      paddd      xmm5, xmm0
 | 
|  
 | 
| -    movdqa     [edx], xmm2
 | 
| -    movdqa     [edx + 16], xmm3
 | 
| -    movdqa     [edx + 32], xmm4
 | 
| -    movdqa     [edx + 48], xmm5
 | 
| +    movdqu     [edx], xmm2
 | 
| +    movdqu     [edx + 16], xmm3
 | 
| +    movdqu     [edx + 32], xmm4
 | 
| +    movdqu     [edx + 48], xmm5
 | 
|  
 | 
|      lea        edx, [edx + 64]
 | 
|      sub        ecx, 4
 | 
| @@ -6087,7 +4970,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
 | 
|      jl         l1b
 | 
|  
 | 
|      // 1 pixel loop
 | 
| -    align      4
 | 
|    l1:
 | 
|      movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
 | 
|      lea        eax, [eax + 4]
 | 
| @@ -6142,7 +5024,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 | 
|      addps      xmm4, xmm4    // dudv *= 4
 | 
|  
 | 
|      // 4 pixel loop
 | 
| -    align      4
 | 
|    l4:
 | 
|      cvttps2dq  xmm0, xmm2    // x, y float to int first 2
 | 
|      cvttps2dq  xmm1, xmm3    // x, y float to int next 2
 | 
| @@ -6156,256 +5037,47 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 | 
|      movd       xmm6, [eax + edi]  // read pixel 1
 | 
|      punpckldq  xmm1, xmm6     // combine pixel 0 and 1
 | 
|      addps      xmm2, xmm4    // x, y += dx, dy first 2
 | 
| -    movq       qword ptr [edx], xmm1
 | 
| -    movd       esi, xmm0
 | 
| -    pshufd     xmm0, xmm0, 0x39  // shift right
 | 
| -    movd       edi, xmm0
 | 
| -    movd       xmm6, [eax + esi]  // read pixel 2
 | 
| -    movd       xmm0, [eax + edi]  // read pixel 3
 | 
| -    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
 | 
| -    addps      xmm3, xmm4    // x, y += dx, dy next 2
 | 
| -    sub        ecx, 4
 | 
| -    movq       qword ptr 8[edx], xmm6
 | 
| -    lea        edx, [edx + 16]
 | 
| -    jge        l4
 | 
| -
 | 
| -  l4b:
 | 
| -    add        ecx, 4 - 1
 | 
| -    jl         l1b
 | 
| -
 | 
| -    // 1 pixel loop
 | 
| -    align      4
 | 
| -  l1:
 | 
| -    cvttps2dq  xmm0, xmm2    // x, y float to int
 | 
| -    packssdw   xmm0, xmm0    // x, y as shorts
 | 
| -    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
 | 
| -    addps      xmm2, xmm7    // x, y += dx, dy
 | 
| -    movd       esi, xmm0
 | 
| -    movd       xmm0, [eax + esi]  // copy a pixel
 | 
| -    sub        ecx, 1
 | 
| -    movd       [edx], xmm0
 | 
| -    lea        edx, [edx + 4]
 | 
| -    jge        l1
 | 
| -  l1b:
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -#endif  // HAS_ARGBAFFINEROW_SSE2
 | 
| -
 | 
| -#ifdef HAS_INTERPOLATEROW_AVX2
 | 
| -// Bilinear filter 16x2 -> 16x1
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
 | 
| -                          ptrdiff_t src_stride, int dst_width,
 | 
| -                          int source_y_fraction) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        edi, [esp + 8 + 4]   // dst_ptr
 | 
| -    mov        esi, [esp + 8 + 8]   // src_ptr
 | 
| -    mov        edx, [esp + 8 + 12]  // src_stride
 | 
| -    mov        ecx, [esp + 8 + 16]  // dst_width
 | 
| -    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
 | 
| -    shr        eax, 1
 | 
| -    // Dispatch to specialized filters if applicable.
 | 
| -    cmp        eax, 0
 | 
| -    je         xloop100  // 0 / 128.  Blend 100 / 0.
 | 
| -    sub        edi, esi
 | 
| -    cmp        eax, 32
 | 
| -    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
 | 
| -    cmp        eax, 64
 | 
| -    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
 | 
| -    cmp        eax, 96
 | 
| -    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
 | 
| -
 | 
| -    vmovd      xmm0, eax  // high fraction 0..127
 | 
| -    neg        eax
 | 
| -    add        eax, 128
 | 
| -    vmovd      xmm5, eax  // low fraction 128..1
 | 
| -    vpunpcklbw xmm5, xmm5, xmm0
 | 
| -    vpunpcklwd xmm5, xmm5, xmm5
 | 
| -    vpxor      ymm0, ymm0, ymm0
 | 
| -    vpermd     ymm5, ymm0, ymm5
 | 
| -
 | 
| -    align      4
 | 
| -  xloop:
 | 
| -    vmovdqu    ymm0, [esi]
 | 
| -    vmovdqu    ymm2, [esi + edx]
 | 
| -    vpunpckhbw ymm1, ymm0, ymm2  // mutates
 | 
| -    vpunpcklbw ymm0, ymm0, ymm2  // mutates
 | 
| -    vpmaddubsw ymm0, ymm0, ymm5
 | 
| -    vpmaddubsw ymm1, ymm1, ymm5
 | 
| -    vpsrlw     ymm0, ymm0, 7
 | 
| -    vpsrlw     ymm1, ymm1, 7
 | 
| -    vpackuswb  ymm0, ymm0, ymm1  // unmutates
 | 
| -    sub        ecx, 32
 | 
| -    vmovdqu    [esi + edi], ymm0
 | 
| -    lea        esi, [esi + 32]
 | 
| -    jg         xloop
 | 
| -    jmp        xloop99
 | 
| -
 | 
| -    // Blend 25 / 75.
 | 
| -    align      4
 | 
| -  xloop25:
 | 
| -    vmovdqu    ymm0, [esi]
 | 
| -    vpavgb     ymm0, ymm0, [esi + edx]
 | 
| -    vpavgb     ymm0, ymm0, [esi + edx]
 | 
| -    sub        ecx, 32
 | 
| -    vmovdqu    [esi + edi], ymm0
 | 
| -    lea        esi, [esi + 32]
 | 
| -    jg         xloop25
 | 
| -    jmp        xloop99
 | 
| -
 | 
| -    // Blend 50 / 50.
 | 
| -    align      4
 | 
| -  xloop50:
 | 
| -    vmovdqu    ymm0, [esi]
 | 
| -    vpavgb     ymm0, ymm0, [esi + edx]
 | 
| -    sub        ecx, 32
 | 
| -    vmovdqu    [esi + edi], ymm0
 | 
| -    lea        esi, [esi + 32]
 | 
| -    jg         xloop50
 | 
| -    jmp        xloop99
 | 
| -
 | 
| -    // Blend 75 / 25.
 | 
| -    align      4
 | 
| -  xloop75:
 | 
| -    vmovdqu    ymm0, [esi + edx]
 | 
| -    vpavgb     ymm0, ymm0, [esi]
 | 
| -    vpavgb     ymm0, ymm0, [esi]
 | 
| -    sub        ecx, 32
 | 
| -    vmovdqu     [esi + edi], ymm0
 | 
| -    lea        esi, [esi + 32]
 | 
| -    jg         xloop75
 | 
| -    jmp        xloop99
 | 
| -
 | 
| -    // Blend 100 / 0 - Copy row unchanged.
 | 
| -    align      4
 | 
| -  xloop100:
 | 
| -    rep movsb
 | 
| -
 | 
| -  xloop99:
 | 
| -    pop        edi
 | 
| -    pop        esi
 | 
| -    vzeroupper
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -#endif  // HAS_INTERPOLATEROW_AVX2
 | 
| -
 | 
| -#ifdef HAS_INTERPOLATEROW_SSSE3
 | 
| -// Bilinear filter 16x2 -> 16x1
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 | 
| -                          ptrdiff_t src_stride, int dst_width,
 | 
| -                          int source_y_fraction) {
 | 
| -  __asm {
 | 
| -    push       esi
 | 
| -    push       edi
 | 
| -    mov        edi, [esp + 8 + 4]   // dst_ptr
 | 
| -    mov        esi, [esp + 8 + 8]   // src_ptr
 | 
| -    mov        edx, [esp + 8 + 12]  // src_stride
 | 
| -    mov        ecx, [esp + 8 + 16]  // dst_width
 | 
| -    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
 | 
| -    sub        edi, esi
 | 
| -    shr        eax, 1
 | 
| -    // Dispatch to specialized filters if applicable.
 | 
| -    cmp        eax, 0
 | 
| -    je         xloop100  // 0 / 128.  Blend 100 / 0.
 | 
| -    cmp        eax, 32
 | 
| -    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
 | 
| -    cmp        eax, 64
 | 
| -    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
 | 
| -    cmp        eax, 96
 | 
| -    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
 | 
| -
 | 
| -    movd       xmm0, eax  // high fraction 0..127
 | 
| -    neg        eax
 | 
| -    add        eax, 128
 | 
| -    movd       xmm5, eax  // low fraction 128..1
 | 
| -    punpcklbw  xmm5, xmm0
 | 
| -    punpcklwd  xmm5, xmm5
 | 
| -    pshufd     xmm5, xmm5, 0
 | 
| -
 | 
| -    align      4
 | 
| -  xloop:
 | 
| -    movdqa     xmm0, [esi]
 | 
| -    movdqa     xmm2, [esi + edx]
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    punpcklbw  xmm0, xmm2
 | 
| -    punpckhbw  xmm1, xmm2
 | 
| -    pmaddubsw  xmm0, xmm5
 | 
| -    pmaddubsw  xmm1, xmm5
 | 
| -    psrlw      xmm0, 7
 | 
| -    psrlw      xmm1, 7
 | 
| -    packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [esi + edi], xmm0
 | 
| -    lea        esi, [esi + 16]
 | 
| -    jg         xloop
 | 
| -    jmp        xloop99
 | 
| -
 | 
| -    // Blend 25 / 75.
 | 
| -    align      4
 | 
| -  xloop25:
 | 
| -    movdqa     xmm0, [esi]
 | 
| -    movdqa     xmm1, [esi + edx]
 | 
| -    pavgb      xmm0, xmm1
 | 
| -    pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [esi + edi], xmm0
 | 
| -    lea        esi, [esi + 16]
 | 
| -    jg         xloop25
 | 
| -    jmp        xloop99
 | 
| -
 | 
| -    // Blend 50 / 50.
 | 
| -    align      4
 | 
| -  xloop50:
 | 
| -    movdqa     xmm0, [esi]
 | 
| -    movdqa     xmm1, [esi + edx]
 | 
| -    pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [esi + edi], xmm0
 | 
| -    lea        esi, [esi + 16]
 | 
| -    jg         xloop50
 | 
| -    jmp        xloop99
 | 
| -
 | 
| -    // Blend 75 / 25.
 | 
| -    align      4
 | 
| -  xloop75:
 | 
| -    movdqa     xmm1, [esi]
 | 
| -    movdqa     xmm0, [esi + edx]
 | 
| -    pavgb      xmm0, xmm1
 | 
| -    pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [esi + edi], xmm0
 | 
| -    lea        esi, [esi + 16]
 | 
| -    jg         xloop75
 | 
| -    jmp        xloop99
 | 
| +    movq       qword ptr [edx], xmm1
 | 
| +    movd       esi, xmm0
 | 
| +    pshufd     xmm0, xmm0, 0x39  // shift right
 | 
| +    movd       edi, xmm0
 | 
| +    movd       xmm6, [eax + esi]  // read pixel 2
 | 
| +    movd       xmm0, [eax + edi]  // read pixel 3
 | 
| +    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
 | 
| +    addps      xmm3, xmm4    // x, y += dx, dy next 2
 | 
| +    movq       qword ptr 8[edx], xmm6
 | 
| +    lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
| +    jge        l4
 | 
|  
 | 
| -    // Blend 100 / 0 - Copy row unchanged.
 | 
| -    align      4
 | 
| -  xloop100:
 | 
| -    movdqa     xmm0, [esi]
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [esi + edi], xmm0
 | 
| -    lea        esi, [esi + 16]
 | 
| -    jg         xloop100
 | 
| +  l4b:
 | 
| +    add        ecx, 4 - 1
 | 
| +    jl         l1b
 | 
|  
 | 
| -  xloop99:
 | 
| +    // 1 pixel loop
 | 
| +  l1:
 | 
| +    cvttps2dq  xmm0, xmm2    // x, y float to int
 | 
| +    packssdw   xmm0, xmm0    // x, y as shorts
 | 
| +    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
 | 
| +    addps      xmm2, xmm7    // x, y += dx, dy
 | 
| +    movd       esi, xmm0
 | 
| +    movd       xmm0, [eax + esi]  // copy a pixel
 | 
| +    movd       [edx], xmm0
 | 
| +    lea        edx, [edx + 4]
 | 
| +    sub        ecx, 1
 | 
| +    jge        l1
 | 
| +  l1b:
 | 
|      pop        edi
 | 
|      pop        esi
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
| -#endif  // HAS_INTERPOLATEROW_SSSE3
 | 
| +#endif  // HAS_ARGBAFFINEROW_SSE2
 | 
|  
 | 
| -#ifdef HAS_INTERPOLATEROW_SSE2
 | 
| -// Bilinear filter 16x2 -> 16x1
 | 
| +#ifdef HAS_INTERPOLATEROW_AVX2
 | 
| +// Bilinear filter 32x2 -> 32x1
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 | 
| +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
 | 
|                           ptrdiff_t src_stride, int dst_width,
 | 
|                           int source_y_fraction) {
 | 
|    __asm {
 | 
| @@ -6416,110 +5088,95 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 | 
|      mov        edx, [esp + 8 + 12]  // src_stride
 | 
|      mov        ecx, [esp + 8 + 16]  // dst_width
 | 
|      mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
 | 
| -    sub        edi, esi
 | 
| +    shr        eax, 1
 | 
|      // Dispatch to specialized filters if applicable.
 | 
|      cmp        eax, 0
 | 
| -    je         xloop100  // 0 / 256.  Blend 100 / 0.
 | 
| +    je         xloop100  // 0 / 128.  Blend 100 / 0.
 | 
| +    sub        edi, esi
 | 
| +    cmp        eax, 32
 | 
| +    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
 | 
|      cmp        eax, 64
 | 
| -    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
 | 
| -    cmp        eax, 128
 | 
| -    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
 | 
| -    cmp        eax, 192
 | 
| -    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
 | 
| +    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
 | 
| +    cmp        eax, 96
 | 
| +    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
 | 
|  
 | 
| -    movd       xmm5, eax            // xmm5 = y fraction
 | 
| -    punpcklbw  xmm5, xmm5
 | 
| -    psrlw      xmm5, 1
 | 
| -    punpcklwd  xmm5, xmm5
 | 
| -    punpckldq  xmm5, xmm5
 | 
| -    punpcklqdq xmm5, xmm5
 | 
| -    pxor       xmm4, xmm4
 | 
| +    vmovd      xmm0, eax  // high fraction 0..127
 | 
| +    neg        eax
 | 
| +    add        eax, 128
 | 
| +    vmovd      xmm5, eax  // low fraction 128..1
 | 
| +    vpunpcklbw xmm5, xmm5, xmm0
 | 
| +    vpunpcklwd xmm5, xmm5, xmm5
 | 
| +    vpxor      ymm0, ymm0, ymm0
 | 
| +    vpermd     ymm5, ymm0, ymm5
 | 
|  
 | 
| -    align      4
 | 
|    xloop:
 | 
| -    movdqa     xmm0, [esi]  // row0
 | 
| -    movdqa     xmm2, [esi + edx]  // row1
 | 
| -    movdqa     xmm1, xmm0
 | 
| -    movdqa     xmm3, xmm2
 | 
| -    punpcklbw  xmm2, xmm4
 | 
| -    punpckhbw  xmm3, xmm4
 | 
| -    punpcklbw  xmm0, xmm4
 | 
| -    punpckhbw  xmm1, xmm4
 | 
| -    psubw      xmm2, xmm0  // row1 - row0
 | 
| -    psubw      xmm3, xmm1
 | 
| -    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
 | 
| -    paddw      xmm3, xmm3
 | 
| -    pmulhw     xmm2, xmm5  // scale diff
 | 
| -    pmulhw     xmm3, xmm5
 | 
| -    paddw      xmm0, xmm2  // sum rows
 | 
| -    paddw      xmm1, xmm3
 | 
| -    packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [esi + edi], xmm0
 | 
| -    lea        esi, [esi + 16]
 | 
| +    vmovdqu    ymm0, [esi]
 | 
| +    vmovdqu    ymm2, [esi + edx]
 | 
| +    vpunpckhbw ymm1, ymm0, ymm2  // mutates
 | 
| +    vpunpcklbw ymm0, ymm0, ymm2  // mutates
 | 
| +    vpmaddubsw ymm0, ymm0, ymm5
 | 
| +    vpmaddubsw ymm1, ymm1, ymm5
 | 
| +    vpsrlw     ymm0, ymm0, 7
 | 
| +    vpsrlw     ymm1, ymm1, 7
 | 
| +    vpackuswb  ymm0, ymm0, ymm1  // unmutates
 | 
| +    vmovdqu    [esi + edi], ymm0
 | 
| +    lea        esi, [esi + 32]
 | 
| +    sub        ecx, 32
 | 
|      jg         xloop
 | 
|      jmp        xloop99
 | 
|  
 | 
| -    // Blend 25 / 75.
 | 
| -    align      4
 | 
| -  xloop25:
 | 
| -    movdqa     xmm0, [esi]
 | 
| -    movdqa     xmm1, [esi + edx]
 | 
| -    pavgb      xmm0, xmm1
 | 
| -    pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [esi + edi], xmm0
 | 
| -    lea        esi, [esi + 16]
 | 
| -    jg         xloop25
 | 
| -    jmp        xloop99
 | 
| -
 | 
| -    // Blend 50 / 50.
 | 
| -    align      4
 | 
| -  xloop50:
 | 
| -    movdqa     xmm0, [esi]
 | 
| -    movdqa     xmm1, [esi + edx]
 | 
| -    pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [esi + edi], xmm0
 | 
| -    lea        esi, [esi + 16]
 | 
| -    jg         xloop50
 | 
| -    jmp        xloop99
 | 
| -
 | 
| -    // Blend 75 / 25.
 | 
| -    align      4
 | 
| -  xloop75:
 | 
| -    movdqa     xmm1, [esi]
 | 
| -    movdqa     xmm0, [esi + edx]
 | 
| -    pavgb      xmm0, xmm1
 | 
| -    pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [esi + edi], xmm0
 | 
| -    lea        esi, [esi + 16]
 | 
| -    jg         xloop75
 | 
| -    jmp        xloop99
 | 
| -
 | 
| -    // Blend 100 / 0 - Copy row unchanged.
 | 
| -    align      4
 | 
| -  xloop100:
 | 
| -    movdqa     xmm0, [esi]
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [esi + edi], xmm0
 | 
| -    lea        esi, [esi + 16]
 | 
| -    jg         xloop100
 | 
| +    // Blend 25 / 75.
 | 
| +  xloop25:
 | 
| +    vmovdqu    ymm0, [esi]
 | 
| +    vmovdqu    ymm1, [esi + edx]
 | 
| +    vpavgb     ymm0, ymm0, ymm1
 | 
| +    vpavgb     ymm0, ymm0, ymm1
 | 
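| +    // Applying vpavgb twice gives approximately 1/4 row0 + 3/4 row1,
 | 
| +    // rounding up.  xloop75 below mirrors this with the rows swapped.
 | 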
| +    vmovdqu    [esi + edi], ymm0
 | 
| +    lea        esi, [esi + 32]
 | 
| +    sub        ecx, 32
 | 
| +    jg         xloop25
 | 
| +    jmp        xloop99
 | 
| +
 | 
| +    // Blend 50 / 50.
 | 
| +  xloop50:
 | 
| +    vmovdqu    ymm0, [esi]
 | 
| +    vpavgb     ymm0, ymm0, [esi + edx]
 | 
| +    vmovdqu    [esi + edi], ymm0
 | 
| +    lea        esi, [esi + 32]
 | 
| +    sub        ecx, 32
 | 
| +    jg         xloop50
 | 
| +    jmp        xloop99
 | 
| +
 | 
| +    // Blend 75 / 25.
 | 
| +  xloop75:
 | 
| +    vmovdqu    ymm1, [esi]
 | 
| +    vmovdqu    ymm0, [esi + edx]
 | 
| +    vpavgb     ymm0, ymm0, ymm1
 | 
| +    vpavgb     ymm0, ymm0, ymm1
 | 
| +    vmovdqu    [esi + edi], ymm0
 | 
| +    lea        esi, [esi + 32]
 | 
| +    sub        ecx, 32
 | 
| +    jg         xloop75
 | 
| +    jmp        xloop99
 | 
| +
 | 
| +    // Blend 100 / 0 - Copy row unchanged.
 | 
| +  xloop100:
 | 
| +    rep movsb
 | 
|  
 | 
|    xloop99:
 | 
|      pop        edi
 | 
|      pop        esi
 | 
| +    vzeroupper
 | 
|      ret
 | 
|    }
 | 
|  }
 | 
| -#endif  // HAS_INTERPOLATEROW_SSE2
 | 
| +#endif  // HAS_INTERPOLATEROW_AVX2
 | 
|  
 | 
|  // Bilinear filter 16x2 -> 16x1
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 | 
| -                                    ptrdiff_t src_stride, int dst_width,
 | 
| -                                    int source_y_fraction) {
 | 
| +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 | 
| +                          ptrdiff_t src_stride, int dst_width,
 | 
| +                          int source_y_fraction) {
 | 
|    __asm {
 | 
|      push       esi
 | 
|      push       edi
 | 
| @@ -6548,7 +5205,6 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 | 
|      punpcklwd  xmm5, xmm5
 | 
|      pshufd     xmm5, xmm5, 0
 | 
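| +    // xmm5 now holds the same (128 - f, f) blend weights built for the
 | 
| +    // AVX2 path above.
 | 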
|  
 | 
| -    align      4
 | 
|    xloop:
 | 
|      movdqu     xmm0, [esi]
 | 
|      movdqu     xmm2, [esi + edx]
 | 
| @@ -6560,57 +5216,53 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 | 
|      psrlw      xmm0, 7
 | 
|      psrlw      xmm1, 7
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [esi + edi], xmm0
 | 
|      lea        esi, [esi + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         xloop
 | 
|      jmp        xloop99
 | 
|  
 | 
|      // Blend 25 / 75.
 | 
| -    align      4
 | 
|    xloop25:
 | 
|      movdqu     xmm0, [esi]
 | 
|      movdqu     xmm1, [esi + edx]
 | 
|      pavgb      xmm0, xmm1
 | 
|      pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [esi + edi], xmm0
 | 
|      lea        esi, [esi + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         xloop25
 | 
|      jmp        xloop99
 | 
|  
 | 
|      // Blend 50 / 50.
 | 
| -    align      4
 | 
|    xloop50:
 | 
|      movdqu     xmm0, [esi]
 | 
|      movdqu     xmm1, [esi + edx]
 | 
|      pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [esi + edi], xmm0
 | 
|      lea        esi, [esi + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         xloop50
 | 
|      jmp        xloop99
 | 
|  
 | 
|      // Blend 75 / 25.
 | 
| -    align      4
 | 
|    xloop75:
 | 
|      movdqu     xmm1, [esi]
 | 
|      movdqu     xmm0, [esi + edx]
 | 
|      pavgb      xmm0, xmm1
 | 
|      pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [esi + edi], xmm0
 | 
|      lea        esi, [esi + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         xloop75
 | 
|      jmp        xloop99
 | 
|  
 | 
|      // Blend 100 / 0 - Copy row unchanged.
 | 
| -    align      4
 | 
|    xloop100:
 | 
|      movdqu     xmm0, [esi]
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [esi + edi], xmm0
 | 
|      lea        esi, [esi + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         xloop100
 | 
|  
 | 
|    xloop99:
 | 
| @@ -6623,9 +5275,9 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 | 
|  #ifdef HAS_INTERPOLATEROW_SSE2
 | 
|  // Bilinear filter 16x2 -> 16x1
 | 
|  __declspec(naked) __declspec(align(16))
 | 
| -void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 | 
| -                                   ptrdiff_t src_stride, int dst_width,
 | 
| -                                   int source_y_fraction) {
 | 
| +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 | 
| +                         ptrdiff_t src_stride, int dst_width,
 | 
| +                         int source_y_fraction) {
 | 
|    __asm {
 | 
|      push       esi
 | 
|      push       edi
 | 
| @@ -6653,7 +5305,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 | 
|      punpcklqdq xmm5, xmm5
 | 
|      pxor       xmm4, xmm4
 | 
|  
 | 
| -    align      4
 | 
|    xloop:
 | 
|      movdqu     xmm0, [esi]  // row0
 | 
|      movdqu     xmm2, [esi + edx]  // row1
 | 
| @@ -6672,57 +5323,53 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 | 
|      paddw      xmm0, xmm2  // sum rows
 | 
|      paddw      xmm1, xmm3
 | 
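| +    // Effectively row0 + ((row1 - row0) * source_y_fraction) / 256,
 | 
| +    // using a signed multiply-high of the doubled difference.
 | 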
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [esi + edi], xmm0
 | 
|      lea        esi, [esi + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         xloop
 | 
|      jmp        xloop99
 | 
|  
 | 
|      // Blend 25 / 75.
 | 
| -    align      4
 | 
|    xloop25:
 | 
|      movdqu     xmm0, [esi]
 | 
|      movdqu     xmm1, [esi + edx]
 | 
|      pavgb      xmm0, xmm1
 | 
|      pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [esi + edi], xmm0
 | 
|      lea        esi, [esi + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         xloop25
 | 
|      jmp        xloop99
 | 
|  
 | 
|      // Blend 50 / 50.
 | 
| -    align      4
 | 
|    xloop50:
 | 
|      movdqu     xmm0, [esi]
 | 
|      movdqu     xmm1, [esi + edx]
 | 
|      pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [esi + edi], xmm0
 | 
|      lea        esi, [esi + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         xloop50
 | 
|      jmp        xloop99
 | 
|  
 | 
|      // Blend 75 / 25.
 | 
| -    align      4
 | 
|    xloop75:
 | 
|      movdqu     xmm1, [esi]
 | 
|      movdqu     xmm0, [esi + edx]
 | 
|      pavgb      xmm0, xmm1
 | 
|      pavgb      xmm0, xmm1
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [esi + edi], xmm0
 | 
|      lea        esi, [esi + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         xloop75
 | 
|      jmp        xloop99
 | 
|  
 | 
|      // Blend 100 / 0 - Copy row unchanged.
 | 
| -    align      4
 | 
|    xloop100:
 | 
|      movdqu     xmm0, [esi]
 | 
| -    sub        ecx, 16
 | 
|      movdqu     [esi + edi], xmm0
 | 
|      lea        esi, [esi + 16]
 | 
| +    sub        ecx, 16
 | 
|      jg         xloop100
 | 
|  
 | 
|    xloop99:
 | 
| @@ -6733,84 +5380,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 | 
|  }
 | 
|  #endif  // HAS_INTERPOLATEROW_SSE2
 | 
|  
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
 | 
| -                  uint8* dst_uv, int pix) {
 | 
| -  __asm {
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 4 + 4]    // src_uv
 | 
| -    mov        edx, [esp + 4 + 8]    // src_uv_stride
 | 
| -    mov        edi, [esp + 4 + 12]   // dst_v
 | 
| -    mov        ecx, [esp + 4 + 16]   // pix
 | 
| -    sub        edi, eax
 | 
| -
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    pavgb      xmm0, [eax + edx]
 | 
| -    sub        ecx, 16
 | 
| -    movdqa     [eax + edi], xmm0
 | 
| -    lea        eax,  [eax + 16]
 | 
| -    jg         convertloop
 | 
| -    pop        edi
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -#ifdef HAS_HALFROW_AVX2
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
 | 
| -                  uint8* dst_uv, int pix) {
 | 
| -  __asm {
 | 
| -    push       edi
 | 
| -    mov        eax, [esp + 4 + 4]    // src_uv
 | 
| -    mov        edx, [esp + 4 + 8]    // src_uv_stride
 | 
| -    mov        edi, [esp + 4 + 12]   // dst_v
 | 
| -    mov        ecx, [esp + 4 + 16]   // pix
 | 
| -    sub        edi, eax
 | 
| -
 | 
| -    align      4
 | 
| -  convertloop:
 | 
| -    vmovdqu    ymm0, [eax]
 | 
| -    vpavgb     ymm0, ymm0, [eax + edx]
 | 
| -    sub        ecx, 32
 | 
| -    vmovdqu    [eax + edi], ymm0
 | 
| -    lea        eax,  [eax + 32]
 | 
| -    jg         convertloop
 | 
| -
 | 
| -    pop        edi
 | 
| -    vzeroupper
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -#endif  // HAS_HALFROW_AVX2
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
 | 
| -                          uint32 selector, int pix) {
 | 
| -  __asm {
 | 
| -    mov        eax, [esp + 4]    // src_argb
 | 
| -    mov        edx, [esp + 8]    // dst_bayer
 | 
| -    movd       xmm5, [esp + 12]  // selector
 | 
| -    mov        ecx, [esp + 16]   // pix
 | 
| -    pshufd     xmm5, xmm5, 0
 | 
| -
 | 
| -    align      4
 | 
| -  wloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    lea        eax, [eax + 32]
 | 
| -    pshufb     xmm0, xmm5
 | 
| -    pshufb     xmm1, xmm5
 | 
| -    punpckldq  xmm0, xmm1
 | 
| -    sub        ecx, 8
 | 
| -    movq       qword ptr [edx], xmm0
 | 
| -    lea        edx, [edx + 8]
 | 
| -    jg         wloop
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
|  // Specialized ARGB to Bayer that just isolates G channel.
 | 
|  __declspec(naked) __declspec(align(16))
 | 
|  void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
 | 
| @@ -6823,10 +5392,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
 | 
|      pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
 | 
|      psrld      xmm5, 24
 | 
|  
 | 
| -    align      4
 | 
|    wloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| +    movdqu     xmm0, [eax]
 | 
| +    movdqu     xmm1, [eax + 16]
 | 
|      lea        eax, [eax + 32]
 | 
|      psrld      xmm0, 8  // Move green to bottom.
 | 
|      psrld      xmm1, 8
 | 
| @@ -6834,9 +5402,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
 | 
|      pand       xmm1, xmm5
 | 
|      packssdw   xmm0, xmm1
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 8
 | 
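| +    // The G bytes of 8 ARGB pixels are now packed into the low 8 bytes.
 | 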
|      movq       qword ptr [edx], xmm0
 | 
|      lea        edx, [edx + 8]
 | 
| +    sub        ecx, 8
 | 
|      jg         wloop
 | 
|      ret
 | 
|    }
 | 
| @@ -6850,46 +5418,19 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 | 
|      mov        eax, [esp + 4]    // src_argb
 | 
|      mov        edx, [esp + 8]    // dst_argb
 | 
|      mov        ecx, [esp + 12]   // shuffler
 | 
| -    movdqa     xmm5, [ecx]
 | 
| -    mov        ecx, [esp + 16]   // pix
 | 
| -
 | 
| -    align      4
 | 
| -  wloop:
 | 
| -    movdqa     xmm0, [eax]
 | 
| -    movdqa     xmm1, [eax + 16]
 | 
| -    lea        eax, [eax + 32]
 | 
| -    pshufb     xmm0, xmm5
 | 
| -    pshufb     xmm1, xmm5
 | 
| -    sub        ecx, 8
 | 
| -    movdqa     [edx], xmm0
 | 
| -    movdqa     [edx + 16], xmm1
 | 
| -    lea        edx, [edx + 32]
 | 
| -    jg         wloop
 | 
| -    ret
 | 
| -  }
 | 
| -}
 | 
| -
 | 
| -__declspec(naked) __declspec(align(16))
 | 
| -void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
 | 
| -                                    const uint8* shuffler, int pix) {
 | 
| -  __asm {
 | 
| -    mov        eax, [esp + 4]    // src_argb
 | 
| -    mov        edx, [esp + 8]    // dst_argb
 | 
| -    mov        ecx, [esp + 12]   // shuffler
 | 
| -    movdqa     xmm5, [ecx]
 | 
| +    movdqu     xmm5, [ecx]
 | 
|      mov        ecx, [esp + 16]   // pix
 | 
|  
 | 
| -    align      4
 | 
|    wloop:
 | 
|      movdqu     xmm0, [eax]
 | 
|      movdqu     xmm1, [eax + 16]
 | 
|      lea        eax, [eax + 32]
 | 
|      pshufb     xmm0, xmm5
 | 
|      pshufb     xmm1, xmm5
 | 
| -    sub        ecx, 8
 | 
|      movdqu     [edx], xmm0
 | 
|      movdqu     [edx + 16], xmm1
 | 
|      lea        edx, [edx + 32]
 | 
| +    sub        ecx, 8
 | 
|      jg         wloop
 | 
|      ret
 | 
|    }
 | 
| @@ -6906,17 +5447,16 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 | 
|      vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
 | 
|      mov        ecx, [esp + 16]    // pix
 | 
|  
 | 
| -    align      4
 | 
|    wloop:
 | 
|      vmovdqu    ymm0, [eax]
 | 
|      vmovdqu    ymm1, [eax + 32]
 | 
|      lea        eax, [eax + 64]
 | 
|      vpshufb    ymm0, ymm0, ymm5
 | 
|      vpshufb    ymm1, ymm1, ymm5
 | 
| -    sub        ecx, 16
 | 
|      vmovdqu    [edx], ymm0
 | 
|      vmovdqu    [edx + 32], ymm1
 | 
|      lea        edx, [edx + 64]
 | 
| +    sub        ecx, 16
 | 
|      jg         wloop
 | 
|  
 | 
|      vzeroupper
 | 
| @@ -6967,7 +5507,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 | 
|      jg         shuf_any1
 | 
|      jmp        shuf99
 | 
|  
 | 
| -    align      4
 | 
|    shuf_0123:
 | 
|      movdqu     xmm0, [eax]
 | 
|      lea        eax, [eax + 16]
 | 
| @@ -6979,13 +5518,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 | 
|      pshufhw    xmm1, xmm1, 01Bh
 | 
|      pshuflw    xmm1, xmm1, 01Bh
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 4
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         shuf_0123
 | 
|      jmp        shuf99
 | 
|  
 | 
| -    align      4
 | 
|    shuf_0321:
 | 
|      movdqu     xmm0, [eax]
 | 
|      lea        eax, [eax + 16]
 | 
| @@ -6997,13 +5535,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 | 
|      pshufhw    xmm1, xmm1, 039h
 | 
|      pshuflw    xmm1, xmm1, 039h
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 4
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         shuf_0321
 | 
|      jmp        shuf99
 | 
|  
 | 
| -    align      4
 | 
|    shuf_2103:
 | 
|      movdqu     xmm0, [eax]
 | 
|      lea        eax, [eax + 16]
 | 
| @@ -7015,13 +5552,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 | 
|      pshufhw    xmm1, xmm1, 093h
 | 
|      pshuflw    xmm1, xmm1, 093h
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 4
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         shuf_2103
 | 
|      jmp        shuf99
 | 
|  
 | 
| -    align      4
 | 
|    shuf_3012:
 | 
|      movdqu     xmm0, [eax]
 | 
|      lea        eax, [eax + 16]
 | 
| @@ -7033,9 +5569,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 | 
|      pshufhw    xmm1, xmm1, 0C6h
 | 
|      pshuflw    xmm1, xmm1, 0C6h
 | 
|      packuswb   xmm0, xmm1
 | 
| -    sub        ecx, 4
 | 
|      movdqu     [edx], xmm0
 | 
|      lea        edx, [edx + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         shuf_3012
 | 
|  
 | 
|    shuf99:
 | 
| @@ -7066,7 +5602,6 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
 | 
|      mov        ecx, [esp + 8 + 20]   // width
 | 
|      sub        edx, esi
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movq       xmm2, qword ptr [esi] // U
 | 
|      movq       xmm3, qword ptr [esi + edx] // V
 | 
| @@ -7104,7 +5639,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
 | 
|      mov        ecx, [esp + 8 + 20]   // width
 | 
|      sub        edx, esi
 | 
|  
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movq       xmm2, qword ptr [esi] // U
 | 
|      movq       xmm3, qword ptr [esi + edx] // V
 | 
| @@ -7141,7 +5675,6 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
 | 
|      pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
 | 
|  
 | 
|      // 2 pixel loop.
 | 
| -    align      4
 | 
|   convertloop:
 | 
|  //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
 | 
|  //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
 | 
| @@ -7177,9 +5710,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
 | 
|      cvttps2dq  xmm4, xmm4
 | 
|      packuswb   xmm0, xmm4
 | 
|      packuswb   xmm0, xmm0
 | 
| -    sub        ecx, 2
 | 
|      movq       qword ptr [edx], xmm0
 | 
|      lea        edx, [edx + 8]
 | 
| +    sub        ecx, 2
 | 
|      jg         convertloop
 | 
|      pop        esi
 | 
|      ret
 | 
| @@ -7203,7 +5736,6 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 | 
|      mov        ecx, [esp + 16]  /* width */
 | 
|  
 | 
|      // 2 pixel loop.
 | 
| -    align      4
 | 
|   convertloop:
 | 
|      vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
 | 
|      lea         eax, [eax + 8]
 | 
| @@ -7217,9 +5749,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 | 
|      vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
 | 
|      vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
 | 
|      vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
 | 
| -    sub         ecx, 2
 | 
|      vmovq       qword ptr [edx], xmm0
 | 
|      lea         edx, [edx + 8]
 | 
| +    sub         ecx, 2
 | 
|      jg          convertloop
 | 
|      vzeroupper
 | 
|      ret
 | 
| @@ -7239,7 +5771,6 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
 | 
|      mov        ecx, [esp + 4 + 12]  /* width */
 | 
|  
 | 
|      // 1 pixel loop.
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movzx      edx, byte ptr [eax]
 | 
|      lea        eax, [eax + 4]
 | 
| @@ -7273,7 +5804,6 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
 | 
|      mov        ecx, [esp + 4 + 12]  /* width */
 | 
|  
 | 
|      // 1 pixel loop.
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movzx      edx, byte ptr [eax]
 | 
|      lea        eax, [eax + 4]
 | 
| @@ -7315,7 +5845,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 | 
|      pxor       xmm5, xmm5
 | 
|  
 | 
|      // 4 pixel loop.
 | 
| -    align      4
 | 
|    convertloop:
 | 
|      movdqu     xmm0, qword ptr [eax]      // generate luma ptr
 | 
|      pmaddubsw  xmm0, xmm3
 | 
| @@ -7382,9 +5911,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 | 
|      movzx      edx, byte ptr [eax + 15]  // copy alpha.
 | 
|      mov        byte ptr [edi + 15], dl
 | 
|  
 | 
| -    sub        ecx, 4
 | 
|      lea        eax, [eax + 16]
 | 
|      lea        edi, [edi + 16]
 | 
| +    sub        ecx, 4
 | 
|      jg         convertloop
 | 
|  
 | 
|      pop        edi
 | 
| 
 |