OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 // We want to measure the similarity of two sequences of bytes as a surrogate |
| 6 // for measuring how well a second sequence will compress differentially to the |
| 7 // first sequence. |
| 8 |
| 9 #include "courgette/difference_estimator.h" |
| 10 |
| 11 #include "base/hash_tables.h" |
| 12 |
| 13 namespace courgette { |
| 14 |
| 15 // Our difference measure is the number of k-tuples that occur in Subject that |
| 16 // don't occur in Base. |
| 17 const int kTupleSize = 4; |
| 18 |
| 19 namespace { |
| 20 |
| 21 COMPILE_ASSERT(kTupleSize >= 4 && kTupleSize <= 8, kTupleSize_between_4_and_8); |
| 22 |
| 23 size_t HashTuple(const uint8* source) { |
| 24 size_t hash1 = *reinterpret_cast<const uint32*>(source); |
| 25 size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4); |
| 26 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23); |
| 27 return hash; |
| 28 } |
| 29 |
| 30 bool RegionsEqual(const Region& a, const Region& b) { |
| 31 if (a.length() != b.length()) |
| 32 return false; |
| 33 return memcmp(a.start(), b.start(), a.length()) == 0; |
| 34 } |
| 35 |
| 36 } // anonymous namepace |
| 37 |
| 38 class DifferenceEstimator::Base { |
| 39 public: |
| 40 explicit Base(const Region& region) : region_(region) { } |
| 41 |
| 42 void Init() { |
| 43 const uint8* start = region_.start(); |
| 44 const uint8* end = region_.end() - (kTupleSize - 1); |
| 45 for (const uint8* p = start; p < end; ++p) { |
| 46 size_t hash = HashTuple(p); |
| 47 hashes_.insert(hash); |
| 48 } |
| 49 } |
| 50 |
| 51 const Region& region() const { return region_; } |
| 52 |
| 53 private: |
| 54 Region region_; |
| 55 base::hash_set<size_t> hashes_; |
| 56 |
| 57 friend class DifferenceEstimator; |
| 58 DISALLOW_COPY_AND_ASSIGN(Base); |
| 59 }; |
| 60 |
| 61 class DifferenceEstimator::Subject { |
| 62 public: |
| 63 explicit Subject(const Region& region) : region_(region) {} |
| 64 |
| 65 const Region& region() const { return region_; } |
| 66 |
| 67 private: |
| 68 Region region_; |
| 69 |
| 70 DISALLOW_COPY_AND_ASSIGN(Subject); |
| 71 }; |
| 72 |
| 73 DifferenceEstimator::DifferenceEstimator() {} |
| 74 |
| 75 DifferenceEstimator::~DifferenceEstimator() { |
| 76 for (size_t i = 0; i < owned_bases_.size(); ++i) |
| 77 delete owned_bases_[i]; |
| 78 for (size_t i = 0; i < owned_subjects_.size(); ++i) |
| 79 delete owned_subjects_[i]; |
| 80 } |
| 81 |
| 82 DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) { |
| 83 Base* base = new Base(region); |
| 84 base->Init(); |
| 85 owned_bases_.push_back(base); |
| 86 return base; |
| 87 } |
| 88 |
| 89 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject( |
| 90 const Region& region) { |
| 91 Subject* subject = new Subject(region); |
| 92 owned_subjects_.push_back(subject); |
| 93 return subject; |
| 94 } |
| 95 |
| 96 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) { |
| 97 size_t mismatches = 0; |
| 98 const uint8* start = subject->region().start(); |
| 99 const uint8* end = subject->region().end() - (kTupleSize - 1); |
| 100 |
| 101 const uint8* p = start; |
| 102 while (p < end) { |
| 103 size_t hash = HashTuple(p); |
| 104 if (base->hashes_.find(hash) == base->hashes_.end()) { |
| 105 ++mismatches; |
| 106 } |
| 107 p += 1; |
| 108 } |
| 109 |
| 110 if (mismatches == 0) { |
| 111 if (RegionsEqual(base->region(), subject->region())) |
| 112 return 0; |
| 113 } |
| 114 ++mismatches; // Guarantee not zero. |
| 115 return mismatches; |
| 116 } |
| 117 |
| 118 } // namespace |
OLD | NEW |