OLD | NEW |
1 // Copyright 2015 Google Inc. All Rights Reserved. | 1 // Copyright 2015 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
9 // | 9 // |
10 // SSE2 variant of methods for lossless encoder | 10 // SSE2 variant of methods for lossless encoder |
(...skipping 307 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
318 _mm_storeu_si128((__m128i*)tmp, sumXY_128); | 318 _mm_storeu_si128((__m128i*)tmp, sumXY_128); |
319 sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0]; | 319 sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0]; |
320 | 320 |
321 retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); | 321 retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); |
322 return (float)retval; | 322 return (float)retval; |
323 } | 323 } |
324 #undef ANALYZE_X_OR_Y | 324 #undef ANALYZE_X_OR_Y |
325 #undef ANALYZE_XY | 325 #undef ANALYZE_XY |
326 | 326 |
327 //------------------------------------------------------------------------------ | 327 //------------------------------------------------------------------------------ |
| 328 |
// Returns the length of the common prefix of array1[] and array2[], i.e. the
// index of the first element that differs, capped at 'length'.
// Blocks of four 32-bit values are compared with SSE2; whatever remains (the
// tail, or the exact position inside a block that failed) is resolved by the
// scalar loop at the end.
static int VectorMismatch(const uint32_t* const array1,
                          const uint32_t* const array2, int length) {
  int pos = 0;

  if (length < 12) {
    // Short input: try at most the first two blocks of four elements, each one
    // only if it lies entirely within 'length'.
    if (length >= 4) {
      const __m128i lo1 = _mm_loadu_si128((const __m128i*)&array1[0]);
      const __m128i lo2 = _mm_loadu_si128((const __m128i*)&array2[0]);
      if (_mm_movemask_epi8(_mm_cmpeq_epi32(lo1, lo2)) == 0xffff) {
        pos = 4;
        if (length >= 8) {
          const __m128i hi1 = _mm_loadu_si128((const __m128i*)&array1[4]);
          const __m128i hi2 = _mm_loadu_si128((const __m128i*)&array2[4]);
          if (_mm_movemask_epi8(_mm_cmpeq_epi32(hi1, hi2)) == 0xffff) {
            pos = 8;
          }
        }
      }
    }
  } else {
    // Long input: the loop body is unrolled twice and the operands of the next
    // comparison are loaded before branching on the current one (the original
    // author reports roughly a 10% gain from each of these).
    // On entry to every iteration 'pos + 12 <= length' holds, so the furthest
    // element touched (index pos + 11) is always inside the arrays.
    __m128i cur1 = _mm_loadu_si128((const __m128i*)&array1[0]);
    __m128i cur2 = _mm_loadu_si128((const __m128i*)&array2[0]);
    for (;;) {
      const __m128i eq_cur = _mm_cmpeq_epi32(cur1, cur2);
      const __m128i next1 = _mm_loadu_si128((const __m128i*)&array1[pos + 4]);
      const __m128i next2 = _mm_loadu_si128((const __m128i*)&array2[pos + 4]);
      // All 16 mask bits set <=> the four 32-bit lanes are equal.
      if (_mm_movemask_epi8(eq_cur) != 0xffff) break;
      pos += 4;

      {
        const __m128i eq_next = _mm_cmpeq_epi32(next1, next2);
        cur1 = _mm_loadu_si128((const __m128i*)&array1[pos + 4]);
        cur2 = _mm_loadu_si128((const __m128i*)&array2[pos + 4]);
        if (_mm_movemask_epi8(eq_next) != 0xffff) break;
        pos += 4;
      }

      // Stop once another full iteration could read past 'length'.
      if (pos + 12 >= length) break;
    }
  }

  // Scalar finish: advance element by element up to the first difference.
  for (; pos < length; ++pos) {
    if (array1[pos] != array2[pos]) break;
  }
  return pos;
}
| 377 |
| 378 //------------------------------------------------------------------------------ |
328 // Entry point | 379 // Entry point |
329 | 380 |
330 extern void VP8LEncDspInitSSE2(void); | 381 extern void VP8LEncDspInitSSE2(void); |
331 | 382 |
332 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { | 383 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { |
333 VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; | 384 VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; |
334 VP8LTransformColor = TransformColor; | 385 VP8LTransformColor = TransformColor; |
335 VP8LCollectColorBlueTransforms = CollectColorBlueTransforms; | 386 VP8LCollectColorBlueTransforms = CollectColorBlueTransforms; |
336 VP8LCollectColorRedTransforms = CollectColorRedTransforms; | 387 VP8LCollectColorRedTransforms = CollectColorRedTransforms; |
337 VP8LHistogramAdd = HistogramAdd; | 388 VP8LHistogramAdd = HistogramAdd; |
338 VP8LCombinedShannonEntropy = CombinedShannonEntropy; | 389 VP8LCombinedShannonEntropy = CombinedShannonEntropy; |
| 390 VP8LVectorMismatch = VectorMismatch; |
339 } | 391 } |
340 | 392 |
341 #else // !WEBP_USE_SSE2 | 393 #else // !WEBP_USE_SSE2 |
342 | 394 |
343 WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2) | 395 WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2) |
344 | 396 |
345 #endif // WEBP_USE_SSE2 | 397 #endif // WEBP_USE_SSE2 |
OLD | NEW |