| OLD | NEW |
| 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // We want to measure the similarity of two sequences of bytes as a surrogate | 5 // We want to measure the similarity of two sequences of bytes as a surrogate |
| 6 // for measuring how well a second sequence will compress differentially to the | 6 // for measuring how well a second sequence will compress differentially to the |
| 7 // first sequence. | 7 // first sequence. |
| 8 | 8 |
| 9 #include "courgette/difference_estimator.h" | 9 #include "courgette/difference_estimator.h" |
| 10 | 10 |
| 11 #include <stddef.h> |
| 12 #include <stdint.h> |
| 13 |
| 11 #include "base/containers/hash_tables.h" | 14 #include "base/containers/hash_tables.h" |
| 15 #include "base/macros.h" |
| 12 | 16 |
| 13 namespace courgette { | 17 namespace courgette { |
| 14 | 18 |
| 15 // Our difference measure is the number of k-tuples that occur in Subject that | 19 // Our difference measure is the number of k-tuples that occur in Subject that |
| 16 // don't occur in Base. | 20 // don't occur in Base. |
| 17 const int kTupleSize = 4; | 21 const int kTupleSize = 4; |
| 18 | 22 |
| 19 namespace { | 23 namespace { |
| 20 | 24 |
| 21 static_assert(kTupleSize >= 4 && kTupleSize <= 8, | 25 static_assert(kTupleSize >= 4 && kTupleSize <= 8, |
| 22 "kTupleSize should be between 4 and 8"); | 26 "kTupleSize should be between 4 and 8"); |
| 23 | 27 |
| 24 size_t HashTuple(const uint8* source) { | 28 size_t HashTuple(const uint8_t* source) { |
| 25 size_t hash1 = *reinterpret_cast<const uint32*>(source); | 29 size_t hash1 = *reinterpret_cast<const uint32_t*>(source); |
| 26 size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4); | 30 size_t hash2 = *reinterpret_cast<const uint32_t*>(source + kTupleSize - 4); |
| 27 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23); | 31 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23); |
| 28 return hash; | 32 return hash; |
| 29 } | 33 } |
| 30 | 34 |
| 31 bool RegionsEqual(const Region& a, const Region& b) { | 35 bool RegionsEqual(const Region& a, const Region& b) { |
| 32 if (a.length() != b.length()) | 36 if (a.length() != b.length()) |
| 33 return false; | 37 return false; |
| 34 return memcmp(a.start(), b.start(), a.length()) == 0; | 38 return memcmp(a.start(), b.start(), a.length()) == 0; |
| 35 } | 39 } |
| 36 | 40 |
| 37 } // anonymous namepace | 41 } // anonymous namepace |
| 38 | 42 |
| 39 class DifferenceEstimator::Base { | 43 class DifferenceEstimator::Base { |
| 40 public: | 44 public: |
| 41 explicit Base(const Region& region) : region_(region) { } | 45 explicit Base(const Region& region) : region_(region) { } |
| 42 | 46 |
| 43 void Init() { | 47 void Init() { |
| 44 if (region_.length() < kTupleSize) | 48 if (region_.length() < kTupleSize) |
| 45 return; | 49 return; |
| 46 const uint8* start = region_.start(); | 50 const uint8_t* start = region_.start(); |
| 47 const uint8* end = region_.end() - (kTupleSize - 1); | 51 const uint8_t* end = region_.end() - (kTupleSize - 1); |
| 48 for (const uint8* p = start; p < end; ++p) { | 52 for (const uint8_t* p = start; p < end; ++p) { |
| 49 size_t hash = HashTuple(p); | 53 size_t hash = HashTuple(p); |
| 50 hashes_.insert(hash); | 54 hashes_.insert(hash); |
| 51 } | 55 } |
| 52 } | 56 } |
| 53 | 57 |
| 54 const Region& region() const { return region_; } | 58 const Region& region() const { return region_; } |
| 55 | 59 |
| 56 private: | 60 private: |
| 57 Region region_; | 61 Region region_; |
| 58 base::hash_set<size_t> hashes_; | 62 base::hash_set<size_t> hashes_; |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 92 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject( | 96 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject( |
| 93 const Region& region) { | 97 const Region& region) { |
| 94 Subject* subject = new Subject(region); | 98 Subject* subject = new Subject(region); |
| 95 owned_subjects_.push_back(subject); | 99 owned_subjects_.push_back(subject); |
| 96 return subject; | 100 return subject; |
| 97 } | 101 } |
| 98 | 102 |
| 99 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) { | 103 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) { |
| 100 size_t mismatches = 0; | 104 size_t mismatches = 0; |
| 101 if (subject->region().length() >= kTupleSize) { | 105 if (subject->region().length() >= kTupleSize) { |
| 102 const uint8* start = subject->region().start(); | 106 const uint8_t* start = subject->region().start(); |
| 103 const uint8* end = subject->region().end() - (kTupleSize - 1); | 107 const uint8_t* end = subject->region().end() - (kTupleSize - 1); |
| 104 | 108 |
| 105 const uint8* p = start; | 109 const uint8_t* p = start; |
| 106 while (p < end) { | 110 while (p < end) { |
| 107 size_t hash = HashTuple(p); | 111 size_t hash = HashTuple(p); |
| 108 if (base->hashes_.find(hash) == base->hashes_.end()) { | 112 if (base->hashes_.find(hash) == base->hashes_.end()) { |
| 109 ++mismatches; | 113 ++mismatches; |
| 110 } | 114 } |
| 111 p += 1; | 115 p += 1; |
| 112 } | 116 } |
| 113 } | 117 } |
| 114 | 118 |
| 115 if (mismatches == 0) { | 119 if (mismatches == 0) { |
| 116 if (RegionsEqual(base->region(), subject->region())) | 120 if (RegionsEqual(base->region(), subject->region())) |
| 117 return 0; | 121 return 0; |
| 118 } | 122 } |
| 119 ++mismatches; // Guarantee not zero. | 123 ++mismatches; // Guarantee not zero. |
| 120 return mismatches; | 124 return mismatches; |
| 121 } | 125 } |
| 122 | 126 |
| 123 } // namespace | 127 } // namespace |
| OLD | NEW |