Index: source/row_win.cc |
diff --git a/source/row_win.cc b/source/row_win.cc |
index b843998d89c0da59b6e41e3113dd96519db85d90..a8c16c3c1ef3a4c0ed99814f9c3898636213e504 100644 |
--- a/source/row_win.cc |
+++ b/source/row_win.cc |
@@ -1505,7 +1505,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
pmaddubsw xmm3, xmm6 |
phaddw xmm0, xmm2 |
phaddw xmm1, xmm3 |
- paddw xmm0, xmm5 // +.5 rounding -> unsigned |
+ paddw xmm0, xmm5 // +.5 rounding -> unsigned |
paddw xmm1, xmm5 |
psraw xmm0, 8 |
psraw xmm1, 8 |
@@ -1590,6 +1590,73 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
} |
#endif // HAS_ARGBTOUVROW_AVX2 |
+#ifdef HAS_ARGBTOUVJROW_AVX2 |
+__declspec(naked) |
+void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
+ uint8* dst_u, uint8* dst_v, int width) { |
+ __asm { |
+ push esi |
+ push edi |
+ mov eax, [esp + 8 + 4] // src_argb |
+ mov esi, [esp + 8 + 8] // src_stride_argb |
+ mov edx, [esp + 8 + 12] // dst_u |
+ mov edi, [esp + 8 + 16] // dst_v |
+ mov ecx, [esp + 8 + 20] // width |
+ vbroadcastf128 ymm5, xmmword ptr kAddUVJ128 // J = JPEG full-range bias |
+ vbroadcastf128 ymm6, xmmword ptr kARGBToVJ // JPEG full-range V coefficients |
+ vbroadcastf128 ymm7, xmmword ptr kARGBToUJ // JPEG full-range U coefficients |
+ sub edi, edx // stride from u to v |
+ |
+ convertloop: |
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */ |
+ vmovdqu ymm0, [eax] |
+ vmovdqu ymm1, [eax + 32] |
+ vmovdqu ymm2, [eax + 64] |
+ vmovdqu ymm3, [eax + 96] |
+ vpavgb ymm0, ymm0, [eax + esi] |
+ vpavgb ymm1, ymm1, [eax + esi + 32] |
+ vpavgb ymm2, ymm2, [eax + esi + 64] |
+ vpavgb ymm3, ymm3, [eax + esi + 96] |
+ lea eax, [eax + 128] |
+ vshufps ymm4, ymm0, ymm1, 0x88 |
+ vshufps ymm0, ymm0, ymm1, 0xdd |
+ vpavgb ymm0, ymm0, ymm4 // mutated by vshufps |
+ vshufps ymm4, ymm2, ymm3, 0x88 |
+ vshufps ymm2, ymm2, ymm3, 0xdd |
+ vpavgb ymm2, ymm2, ymm4 // mutated by vshufps |
+ |
+ // step 2 - convert to U and V |
+ // from here down is very similar to Y code except |
+ // instead of 32 different pixels, its 16 pixels of U and 16 of V |
+ vpmaddubsw ymm1, ymm0, ymm7 // U |
+ vpmaddubsw ymm3, ymm2, ymm7 |
+ vpmaddubsw ymm0, ymm0, ymm6 // V |
+ vpmaddubsw ymm2, ymm2, ymm6 |
+ vphaddw ymm1, ymm1, ymm3 // mutates |
+ vphaddw ymm0, ymm0, ymm2 |
+ vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned |
+ vpaddw ymm0, ymm0, ymm5 |
+ vpsraw ymm1, ymm1, 8 |
+ vpsraw ymm0, ymm0, 8 |
+ vpacksswb ymm0, ymm1, ymm0 // mutates |
+ vpermq ymm0, ymm0, 0xd8 // For vpacksswb |
+ vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw |
+ |
+ // step 3 - store 16 U and 16 V values |
+ vextractf128 [edx], ymm0, 0 // U |
+ vextractf128 [edx + edi], ymm0, 1 // V |
+ lea edx, [edx + 16] |
+ sub ecx, 32 |
+ jg convertloop |
+ |
+ pop edi |
+ pop esi |
+ vzeroupper |
+ ret |
+ } |
+} |
+#endif // HAS_ARGBTOUVJROW_AVX2 |
+ |
__declspec(naked) |
void ARGBToUV444Row_SSSE3(const uint8* src_argb0, |
uint8* dst_u, uint8* dst_v, int width) { |