OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkChecksum_opts_DEFINED | 8 #ifndef SkChecksum_opts_DEFINED |
9 #define SkChecksum_opts_DEFINED | 9 #define SkChecksum_opts_DEFINED |
10 | 10 |
11 #include "SkChecksum.h" | 11 #include "SkChecksum.h" |
12 #include "SkTypes.h" | 12 #include "SkTypes.h" |
13 | 13 |
14 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 | 14 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 |
15 #include <immintrin.h> | 15 #include <immintrin.h> |
16 #endif | 16 #endif |
17 | 17 |
18 // TODO: ARMv8 has optional CRC instructions similar to SSE 4.2 | 18 // TODO: ARMv8 has optional CRC instructions similar to SSE 4.2 |
19 // TODO: 32-bit x86 version: same sort of idea using only _mm_crc32_u32() and smaller | |
20 | 19 |
21 namespace SK_OPTS_NS { | 20 namespace SK_OPTS_NS { |
22 | 21 |
| 22 template <typename T> |
| 23 static inline T unaligned_load(const uint8_t* src) { |
| 24 T val; |
| 25 memcpy(&val, src, sizeof(val)); |
| 26 return val; |
| 27 } |
| 28 |
23 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64)) | 29 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64)) |
24 template <typename T> | 30 // This is not a CRC32. It's Just A Hash that uses those instructions because they're fast. |
25 static inline T unaligned_load(const uint8_t* src) { | |
26 T val; | |
27 memcpy(&val, src, sizeof(val)); | |
28 return val; | |
29 } | |
30 | |
31 static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) { | 31 static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) { |
32 auto data = (const uint8_t*)vdata; | 32 auto data = (const uint8_t*)vdata; |
33 | 33 |
34 // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while. | 34 // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while. |
35 uint64_t hash = seed; | 35 uint64_t hash = seed; |
36 if (bytes >= 24) { | 36 if (bytes >= 24) { |
37 // We'll create 3 independent hashes, each using _mm_crc32_u64() | 37 // We'll create 3 independent hashes, each using _mm_crc32_u64() |
38 // to hash 8 bytes per step. Both 3 and independent are important: | 38 // to hash 8 bytes per step. Both 3 and independent are important: |
39 // we can execute 3 of these instructions in parallel on a single core. | 39 // we can execute 3 of these instructions in parallel on a single core. |
40 uint64_t a = hash, | 40 uint64_t a = hash, |
(...skipping 34 matching lines...)
75 if (bytes & 2) { | 75 if (bytes & 2) { |
76 hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data)); | 76 hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data)); |
77 data += 2; | 77 data += 2; |
78 } | 78 } |
79 if (bytes & 1) { | 79 if (bytes & 1) { |
80 hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data)); | 80 hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data)); |
81 } | 81 } |
82 return hash32; | 82 return hash32; |
83 } | 83 } |
84 | 84 |
| 85 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 |
| 86 // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64(). |
| 87 static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) { |
| 88 auto data = (const uint8_t*)vdata; |
| 89 |
| 90 if (bytes >= 12) { |
| 91 // We'll create 3 independent hashes, each using _mm_crc32_u32() |
| 92 // to hash 4 bytes per step. Both 3 and independent are important: |
| 93 // we can execute 3 of these instructions in parallel on a single core. |
| 94 uint32_t a = hash, |
| 95 b = hash, |
| 96 c = hash; |
| 97 size_t steps = bytes/12; |
| 98 while (steps --> 0) { |
| 99 a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0)); |
| 100 b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4)); |
| 101 c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8)); |
| 102 data += 12; |
| 103 } |
| 104 bytes %= 12; |
| 105 hash = a^b^c; |
| 106 } |
| 107 |
| 108 SkASSERT(bytes < 12); |
| 109 if (bytes >= 8) { |
| 110 hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data)); |
| 111 bytes -= 4; |
| 112 data += 4; |
| 113 } |
| 114 |
| 115 SkASSERT(bytes < 8); |
| 116 if (bytes & 4) { |
| 117 hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data)); |
| 118 data += 4; |
| 119 } |
| 120 if (bytes & 2) { |
| 121 hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data)); |
| 122 data += 2; |
| 123 } |
| 124 if (bytes & 1) { |
| 125 hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data)); |
| 126 } |
| 127 return hash; |
| 128 } |
| 129 |
85 #else | 130 #else |
86 static uint32_t hash_fn(const void* data, size_t bytes, uint32_t seed) { | 131 // This is Murmur3. |
87 // This is Murmur3. | 132 static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) { |
| 133 auto data = (const uint8_t*)vdata; |
88 | 134 |
89 // Use may_alias to remind the compiler we're intentionally violating st
rict aliasing, | 135 size_t original_bytes = bytes; |
90 // and so not to apply strict-aliasing-based optimizations. | |
91 typedef uint32_t SK_ATTRIBUTE(may_alias) aliased_uint32_t; | |
92 typedef uint8_t SK_ATTRIBUTE(may_alias) aliased_uint8_t; | |
93 | 136 |
94 // Handle 4 bytes at a time while possible. | 137 // Handle 4 bytes at a time while possible. |
95 const aliased_uint32_t* safe_data = (const aliased_uint32_t*)data; | 138 while (bytes >= 4) { |
96 const size_t words = bytes/4; | 139 uint32_t k = unaligned_load<uint32_t>(data); |
97 uint32_t hash = seed; | |
98 for (size_t i = 0; i < words; i++) { | |
99 uint32_t k = safe_data[i]; | |
100 k *= 0xcc9e2d51; | 140 k *= 0xcc9e2d51; |
101 k = (k << 15) | (k >> 17); | 141 k = (k << 15) | (k >> 17); |
102 k *= 0x1b873593; | 142 k *= 0x1b873593; |
103 | 143 |
104 hash ^= k; | 144 hash ^= k; |
105 hash = (hash << 13) | (hash >> 19); | 145 hash = (hash << 13) | (hash >> 19); |
106 hash *= 5; | 146 hash *= 5; |
107 hash += 0xe6546b64; | 147 hash += 0xe6546b64; |
| 148 |
| 149 bytes -= 4; |
| 150 data += 4; |
108 } | 151 } |
109 | 152 |
110 // Handle last 0-3 bytes. | 153 // Handle last 0-3 bytes. |
111 const aliased_uint8_t* safe_tail = (const uint8_t*)(safe_data + words); | |
112 uint32_t k = 0; | 154 uint32_t k = 0; |
113 switch (bytes & 3) { | 155 switch (bytes & 3) { |
114 case 3: k ^= safe_tail[2] << 16; | 156 case 3: k ^= data[2] << 16; |
115 case 2: k ^= safe_tail[1] << 8; | 157 case 2: k ^= data[1] << 8; |
116 case 1: k ^= safe_tail[0] << 0; | 158 case 1: k ^= data[0] << 0; |
117 k *= 0xcc9e2d51; | 159 k *= 0xcc9e2d51; |
118 k = (k << 15) | (k >> 17); | 160 k = (k << 15) | (k >> 17); |
119 k *= 0x1b873593; | 161 k *= 0x1b873593; |
120 hash ^= k; | 162 hash ^= k; |
121 } | 163 } |
122 | 164 |
123 hash ^= bytes; | 165 hash ^= original_bytes; |
124 return SkChecksum::Mix(hash); | 166 return SkChecksum::Mix(hash); |
125 } | 167 } |
126 #endif | 168 #endif |
127 | 169 |
128 } // namespace SK_OPTS_NS | 170 } // namespace SK_OPTS_NS |
129 | 171 |
130 #endif//SkChecksum_opts_DEFINED | 172 #endif//SkChecksum_opts_DEFINED |
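
A standalone sketch, not part of the patch, isolating the trick the comments above describe: three *independent* CRC streams let the CPU overlap the multi-cycle latency of crc32, while a single serial chain forces each step to wait for the previous one. It assumes an x86 compiler with SSE 4.2 enabled (e.g. -msse4.2); the names crc_serial and crc_3way are illustrative, not from the patch.

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    template <typename T>
    static T unaligned_load(const uint8_t* src) {
        T val;
        memcpy(&val, src, sizeof(val));  // safe for any alignment, no aliasing UB
        return val;
    }

    // Serial chain: step i+1 consumes step i's result, so the crc32
    // instructions must retire one after another.
    static uint32_t crc_serial(const uint8_t* data, size_t words, uint32_t h) {
        while (words --> 0) {
            h = _mm_crc32_u32(h, unaligned_load<uint32_t>(data));
            data += 4;
        }
        return h;
    }

    // Three independent chains, as in the patch's 32-bit hash_fn: the three
    // crc32 instructions in each iteration have no dependence on each other,
    // so they can be in flight simultaneously on one core.
    static uint32_t crc_3way(const uint8_t* data, size_t steps, uint32_t h) {
        uint32_t a = h, b = h, c = h;
        while (steps --> 0) {
            a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data + 0));
            b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data + 4));
            c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data + 8));
            data += 12;
        }
        return a ^ b ^ c;  // same combine as the patch: hash = a^b^c
    }

    int main() {
        uint8_t buf[24];
        for (int i = 0; i < 24; i++) { buf[i] = (uint8_t)i; }
        printf("serial: %08x\n", crc_serial(buf, 6, 0));
        printf("3-way:  %08x\n", crc_3way(buf, 2, 0));
    }

Both loops read the same 24 bytes, but only the striped form exposes instruction-level parallelism, which is why the patch requires at least 12 bytes (24 in the 64-bit path) before entering its three-way loop and falls back to single crc32 steps for the tail.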