OLD | NEW |
| (Empty) |
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 // We want to measure the similarity of two sequences of bytes as a surrogate | |
6 // for measuring how well a second sequence will compress differentially to the | |
7 // first sequence. | |
8 | |
9 #include "third_party/courgette/difference_estimator.h" | |
10 | |
11 #include "base/hash_tables.h" | |
12 | |
13 namespace courgette { | |
14 | |
15 // Our difference measure is the number of k-tuples that occur in Subject that | |
16 // don't occur in Base. | |
17 const int kTupleSize = 4; | |
18 | |
19 namespace { | |
20 | |
21 COMPILE_ASSERT(kTupleSize >= 4 && kTupleSize <= 8, kTupleSize_between_4_and_8); | |
22 | |
23 size_t HashTuple(const uint8* source) { | |
24 size_t hash1 = *reinterpret_cast<const uint32*>(source); | |
25 size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4); | |
26 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23); | |
27 return hash; | |
28 } | |
29 | |
30 bool RegionsEqual(const Region& a, const Region& b) { | |
31 if (a.length() != b.length()) | |
32 return false; | |
33 return memcmp(a.start(), b.start(), a.length()) == 0; | |
34 } | |
35 | |
36 } // anonymous namepace | |
37 | |
38 class DifferenceEstimator::Base { | |
39 public: | |
40 explicit Base(const Region& region) : region_(region) { } | |
41 | |
42 void Init() { | |
43 const uint8* start = region_.start(); | |
44 const uint8* end = region_.end() - (kTupleSize - 1); | |
45 for (const uint8* p = start; p < end; ++p) { | |
46 size_t hash = HashTuple(p); | |
47 hashes_.insert(hash); | |
48 } | |
49 } | |
50 | |
51 const Region& region() const { return region_; } | |
52 | |
53 private: | |
54 Region region_; | |
55 base::hash_set<size_t> hashes_; | |
56 | |
57 friend class DifferenceEstimator; | |
58 DISALLOW_COPY_AND_ASSIGN(Base); | |
59 }; | |
60 | |
61 class DifferenceEstimator::Subject { | |
62 public: | |
63 explicit Subject(const Region& region) : region_(region) {} | |
64 | |
65 const Region& region() const { return region_; } | |
66 | |
67 private: | |
68 Region region_; | |
69 | |
70 DISALLOW_COPY_AND_ASSIGN(Subject); | |
71 }; | |
72 | |
73 DifferenceEstimator::DifferenceEstimator() {} | |
74 | |
75 DifferenceEstimator::~DifferenceEstimator() { | |
76 for (size_t i = 0; i < owned_bases_.size(); ++i) | |
77 delete owned_bases_[i]; | |
78 for (size_t i = 0; i < owned_subjects_.size(); ++i) | |
79 delete owned_subjects_[i]; | |
80 } | |
81 | |
82 DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) { | |
83 Base* base = new Base(region); | |
84 base->Init(); | |
85 owned_bases_.push_back(base); | |
86 return base; | |
87 } | |
88 | |
89 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject( | |
90 const Region& region) { | |
91 Subject* subject = new Subject(region); | |
92 owned_subjects_.push_back(subject); | |
93 return subject; | |
94 } | |
95 | |
96 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) { | |
97 size_t mismatches = 0; | |
98 const uint8* start = subject->region().start(); | |
99 const uint8* end = subject->region().end() - (kTupleSize - 1); | |
100 | |
101 const uint8* p = start; | |
102 while (p < end) { | |
103 size_t hash = HashTuple(p); | |
104 if (base->hashes_.find(hash) == base->hashes_.end()) { | |
105 ++mismatches; | |
106 } | |
107 p += 1; | |
108 } | |
109 | |
110 if (mismatches == 0) { | |
111 if (RegionsEqual(base->region(), subject->region())) | |
112 return 0; | |
113 } | |
114 ++mismatches; // Guarantee not zero. | |
115 return mismatches; | |
116 } | |
117 | |
118 } // namespace | |
OLD | NEW |