Index: third_party/libwebp/dsp/lossless_enc_sse2.c |
diff --git a/third_party/libwebp/dsp/lossless_enc_sse2.c b/third_party/libwebp/dsp/lossless_enc_sse2.c |
index e8c9834184c9906e82e8d5011d6d8e3b8fcf00e9..7c894e7ca4d32f9fb95df2e8de89f3e46f2e2167 100644 |
--- a/third_party/libwebp/dsp/lossless_enc_sse2.c |
+++ b/third_party/libwebp/dsp/lossless_enc_sse2.c |
@@ -325,6 +325,57 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) { |
#undef ANALYZE_XY |
//------------------------------------------------------------------------------ |
+ /* Returns how many leading elements of array1 and array2 are equal, counting at most length elements. */ |
+static int VectorMismatch(const uint32_t* const array1, |
+ const uint32_t* const array2, int length) { |
+ int match_len; |
+ /* match_len: number of leading elements verified equal so far; each SSE2 compare covers 4 uint32_t. */ |
+ if (length >= 12) {  /* one full unrolled pass reads indices 0..11 */ |
+ __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]); |
+ __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]); |
+ match_len = 0; |
+ do { |
+ // Unrolled twice, and each step loads the next 4 elements before testing the |
+ // current compare (author-reported ~10% gain from both). NOTE(review): the prior note's "max_limit can be MAX_LENGTH=4096 at most" names nothing visible here; presumably it bounds length - confirm. |
+ const __m128i cmpA = _mm_cmpeq_epi32(A0, A1); |
+ const __m128i B0 = |
+ _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); |
+ const __m128i B1 = |
+ _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); |
+ if (_mm_movemask_epi8(cmpA) != 0xffff) break;  /* 0xffff means all 16 bytes compared equal */ |
+ match_len += 4; |
+ /* Second unrolled step: test the B pair while preloading the A pair for the next iteration. */ |
+ { |
+ const __m128i cmpB = _mm_cmpeq_epi32(B0, B1); |
+ A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); |
+ A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); |
+ if (_mm_movemask_epi8(cmpB) != 0xffff) break; |
+ match_len += 4; |
+ } |
+ } while (match_len + 12 < length);  /* keeps both preloads (up to match_len + 11) below length */ |
+ } else { |
+ match_len = 0; |
+ // Short input (length < 12): at most two 4-element SSE2 compares, written out inline. |
+ if (length >= 4 && |
+ _mm_movemask_epi8(_mm_cmpeq_epi32( |
+ _mm_loadu_si128((const __m128i*)&array1[0]), |
+ _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) { |
+ match_len = 4; |
+ if (length >= 8 && |
+ _mm_movemask_epi8(_mm_cmpeq_epi32( |
+ _mm_loadu_si128((const __m128i*)&array1[4]), |
+ _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) |
+ match_len = 8; |
+ } |
+ } |
+ /* Scalar tail: walks the remaining elements one by one and stops at the first mismatch or at length. */ |
+ while (match_len < length && array1[match_len] == array2[match_len]) { |
+ ++match_len; |
+ } |
+ return match_len; |
+} |
+ |
+//------------------------------------------------------------------------------ |
// Entry point |
extern void VP8LEncDspInitSSE2(void); |
@@ -336,6 +387,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { |
VP8LCollectColorRedTransforms = CollectColorRedTransforms; |
VP8LHistogramAdd = HistogramAdd; |
VP8LCombinedShannonEntropy = CombinedShannonEntropy; |
+ VP8LVectorMismatch = VectorMismatch; |
} |
#else // !WEBP_USE_SSE2 |