| Index: src/opts/SkChecksum_opts.h
|
| diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h
|
| index 346b16b3f5847b9f29427f7da28fd4eeb23cde29..07fdfaab65165198f152e6ad24ce594dfe6d094e 100644
|
| --- a/src/opts/SkChecksum_opts.h
|
| +++ b/src/opts/SkChecksum_opts.h
|
| @@ -16,18 +16,18 @@
|
| #endif
|
|
|
| // TODO: ARMv8 has optional CRC instructions similar to SSE 4.2
|
| -// TODO: 32-bit x86 version: same sort of idea using only _mm_crc32_u32() and smaller
|
|
|
| namespace SK_OPTS_NS {
|
|
|
| -#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
|
| - template <typename T>
|
| - static inline T unaligned_load(const uint8_t* src) {
|
| - T val;
|
| - memcpy(&val, src, sizeof(val));
|
| - return val;
|
| - }
|
| +template <typename T>  // Reads a value of type T from the bytes starting at src.
|
| +static inline T unaligned_load(const uint8_t* src) {  // Returns the T by value; reads exactly sizeof(T) bytes.
|
| + T val;
|
| + memcpy(&val, src, sizeof(val));  // Byte-wise copy instead of dereferencing a casted T*; memcpy has no alignment requirement on src.
|
| + return val;
|
| +}
|
|
|
| +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
|
| + // This is not a CRC32. It's Just A Hash that uses the SSE 4.2 CRC32 instructions because they're fast.
|
| static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
|
| auto data = (const uint8_t*)vdata;
|
|
|
| @@ -82,21 +82,61 @@ namespace SK_OPTS_NS {
|
| return hash32;
|
| }
|
|
|
| +#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
|
| + // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64(). Hashes `bytes` bytes at vdata; returns the 32-bit hash.
|
| + static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {  // `hash` is the seed on entry and accumulates the result.
|
| + auto data = (const uint8_t*)vdata;
|
| +
|
| + if (bytes >= 12) {
|
| + // Run 3 independent hash chains (a, b, c), each using _mm_crc32_u32()
|
| + // to hash 4 bytes per step. Both "3" and "independent" are important:
|
| + // we can execute 3 of these instructions in parallel on a single core.
|
| + uint32_t a = hash,
|
| + b = hash,
|
| + c = hash;
|
| + size_t steps = bytes/12;  // Each step consumes 12 bytes: 4 per chain.
|
| + while (steps --> 0) {  // i.e. (steps-- > 0): the body runs exactly `steps` times.
|
| + a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
|
| + b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
|
| + c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
|
| + data += 12;
|
| + }
|
| + bytes %= 12;  // 0-11 bytes remain for the tail handling below.
|
| + hash = a^b^c;  // Fold the three chains back into a single value.
|
| + }
|
| +
|
| + SkASSERT(bytes < 12);
|
| + if (bytes >= 8) {  // 8-11 bytes left: consume 4 here so that fewer than 8 remain.
|
| + hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
|
| + bytes -= 4;
|
| + data += 4;
|
| + }
|
| +
|
| + SkASSERT(bytes < 8);
|
| + if (bytes & 4) {  // With bytes < 8, its set bits say which of the 4-, 2- and 1-byte pieces remain.
|
| + hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
|
| + data += 4;
|
| + }
|
| + if (bytes & 2) {
|
| + hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
|
| + data += 2;
|
| + }
|
| + if (bytes & 1) {
|
| + hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
|
| + }
|
| + return hash;
|
| + }
|
| +
|
| #else
|
| - static uint32_t hash_fn(const void* data, size_t bytes, uint32_t seed) {
|
| - // This is Murmur3.
|
| + // This is Murmur3.
|
| + static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
|
| + auto data = (const uint8_t*)vdata;
|
|
|
| - // Use may_alias to remind the compiler we're intentionally violating strict aliasing,
|
| - // and so not to apply strict-aliasing-based optimizations.
|
| - typedef uint32_t SK_ATTRIBUTE(may_alias) aliased_uint32_t;
|
| - typedef uint8_t SK_ATTRIBUTE(may_alias) aliased_uint8_t;
|
| + size_t original_bytes = bytes;
|
|
|
| // Handle 4 bytes at a time while possible.
|
| - const aliased_uint32_t* safe_data = (const aliased_uint32_t*)data;
|
| - const size_t words = bytes/4;
|
| - uint32_t hash = seed;
|
| - for (size_t i = 0; i < words; i++) {
|
| - uint32_t k = safe_data[i];
|
| + while (bytes >= 4) {
|
| + uint32_t k = unaligned_load<uint32_t>(data);
|
| k *= 0xcc9e2d51;
|
| k = (k << 15) | (k >> 17);
|
| k *= 0x1b873593;
|
| @@ -105,22 +145,24 @@ namespace SK_OPTS_NS {
|
| hash = (hash << 13) | (hash >> 19);
|
| hash *= 5;
|
| hash += 0xe6546b64;
|
| +
|
| + bytes -= 4;
|
| + data += 4;
|
| }
|
|
|
| // Handle last 0-3 bytes.
|
| - const aliased_uint8_t* safe_tail = (const uint8_t*)(safe_data + words);
|
| uint32_t k = 0;
|
| switch (bytes & 3) {
|
| - case 3: k ^= safe_tail[2] << 16;
|
| - case 2: k ^= safe_tail[1] << 8;
|
| - case 1: k ^= safe_tail[0] << 0;
|
| + case 3: k ^= data[2] << 16;
|
| + case 2: k ^= data[1] << 8;
|
| + case 1: k ^= data[0] << 0;
|
| k *= 0xcc9e2d51;
|
| k = (k << 15) | (k >> 17);
|
| k *= 0x1b873593;
|
| hash ^= k;
|
| }
|
|
|
| - hash ^= bytes;
|
| + hash ^= original_bytes;
|
| return SkChecksum::Mix(hash);
|
| }
|
| #endif
|
|
|