Chromium Code Reviews| Index: tools/gn/uniquify.h |
| diff --git a/tools/gn/uniquify.h b/tools/gn/uniquify.h |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..ae8e4300dd3205bc08d6a7b7b3b6130fe0888266 |
| --- /dev/null |
| +++ b/tools/gn/uniquify.h |
| @@ -0,0 +1,141 @@ |
| +// Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#ifndef TOOLS_GN_UNIQUIFY_H_ |
| +#define TOOLS_GN_UNIQUIFY_H_ |
| + |
| +#include <algorithm> |
| + |
| +#include "base/containers/hash_tables.h" |
| + |
| +namespace internal { |
| + |
| +// This struct allows us to insert things by reference into a hash table. This |
| +// is used to avoid copying strings just to uniquify them. The hash function |
| +// of a UniquifyRef is that of the object it points to. |
| +// |
| +// It also caches the hash value which allows us to query and then insert |
| +// without recomputing the hash. |
| +template<typename T> |
| +struct UniquifyRef { |
| + UniquifyRef() : value(NULL), hash_val(0) {} |
| + UniquifyRef(T* v) : value(v) { |
| +#if defined(COMPILER_GCC) |
| + BASE_HASH_NAMESPACE::hash<T> h; |
| + hash_val = h(*value); |
| +#elif defined(COMPILER_MSVC) |
| + hash_val = BASE_HASH_NAMESPACE::hash_value(*value); |
| +#else |
| + #error write me |
| +#endif // COMPILER... |
| + } |
| + |
| + T* value; |
| + size_t hash_val; |
| + |
| + bool operator==(const UniquifyRef<T>& other) const { |
| + return *value == *other.value; |
| + } |
| + bool operator<(const UniquifyRef<T>& other) const { |
| + return *value < *other.value; |
| + } |
| +}; |
| + |
| +} // namespace internal |
| + |
| +namespace BASE_HASH_NAMESPACE { |
| + |
| +#if defined(COMPILER_GCC) |
| +template<typename T> struct hash< internal::UniquifyRef<T> > { |
| + std::size_t operator()(const internal::UniquifyRef<T>& v) const { |
| + return v.hash_val; |
| + } |
| +}; |
| +#elif defined(COMPILER_MSVC) |
| +template<typename T> |
| +inline size_t hash_value(const internal::UniquifyRef<T>& v) { |
| + return v.hash_val; |
| +} |
| +#endif // COMPILER... |
| + |
| +} // namespace BASE_HASH_NAMESPACE |
| + |
| +// Uniquifies based on brute-force equality comparisons. |
| +template<typename T> |
| +inline void UniquifyBruteForce(std::vector<T>* container) { |
| + if (container->size() <= 1) |
| + return; |
| + |
| + size_t dest_i = 1; |
| + for (size_t source_i = 1; source_i < container->size(); source_i++) { |
| + std::vector<T>::iterator last = container->begin() + dest_i; |
| + T& cur = (*container)[source_i]; |
| + if (std::find(container->begin(), last, cur) == last) { |
| + // Not seen yet. |
| + if (source_i != dest_i) |
| + std::swap(cur, (*container)[dest_i]); |
|
viettrungluu
2013/10/08 23:39:41
Having to do a swap is a little unfortunate (since
|
| + dest_i++; |
| + } |
| + } |
| + container->resize(dest_i); |
| +} |
| + |
| +// Uniquifies based on a hash-table comparison. |
| +template<typename T> |
| +inline void UniquifyHash(std::vector<T>* container) { |
| + if (container->size() <= 1) |
| + return; |
| + |
| + typedef internal::UniquifyRef<T> Ref; |
| + typedef base::hash_set<Ref> HashSet; |
| + |
| + HashSet hash; |
| + hash.insert(Ref(&(*container)[0])); |
| + |
| + size_t dest_i = 1; |
| + for (size_t source_i = 1; source_i < container->size(); source_i++) { |
| + Ref source_ref(&(*container)[source_i]); |
| + if (hash.find(source_ref) == hash.end()) { |
| + // Not seen yet. |
| + hash.insert(source_ref); |
| + if (source_i != dest_i) |
| + std::swap((*container)[source_i], (*container)[dest_i]); |
| + dest_i++; |
| + } |
| + // Otherwise we've seen this one before and we can skip it. |
| + } |
| + container->resize(dest_i); |
| +} |
| + |
| +// Uniqifies items in the given vector. The first occurance of a given item |
| +// will be kept, any remaining ones will be dropped. |
| +// |
| +// This function is somewhat like std::unique. std::unique only removes |
| +// sequential duplicates, and just moves duplicates to the end since it can't |
| +// resize the container via iteratos alone. |
| +// |
| +// Since this version assumes the input is a vector, we can actually do the |
| +// resize. We also remove duplicates no matter what the ordering is. |
| +// |
| +// The objects in the vecotr must support hashing and equality. |
| +template<typename T> |
| +inline void Uniquify(std::vector<T>* container) { |
| + // Containers this size or smaller get brute-force compares rather than |
| + // using a hash table. This saves a bunch of allocations for the hash table. |
| + // This number was picked based on measurements for 32-byte strings which |
| + // seem like a reasonable average compare for our workload (this test is in |
| + // the unittest file). |
| + // |
| + // Interestingly, for longer strings (assuming they're random and there are |
| + // mostly no duplicates) the threshold will actually be higher because a |
| + // string compare will terminate before checking all the characters, while a |
| + // hash has to walk the whole string. |
| + const size_t kBruteForceThreshold = 75; |
| + if (container->size() <= kBruteForceThreshold) |
| + UniquifyBruteForce(container); |
| + else |
| + UniquifyHash(container); |
| +} |
| + |
| +#endif // TOOLS_GN_UNIQUIFY_H_ |