OLD | NEW |
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // We want to measure the similarity of two sequences of bytes as a surrogate | 5 // We want to measure the similarity of two sequences of bytes as a surrogate |
6 // for measuring how well a second sequence will compress differentially to the | 6 // for measuring how well a second sequence will compress differentially to the |
7 // first sequence. | 7 // first sequence. |
8 | 8 |
9 #include "courgette/difference_estimator.h" | 9 #include "courgette/difference_estimator.h" |
10 | 10 |
| 11 #include <stddef.h> |
| 12 #include <stdint.h> |
| 13 |
11 #include "base/containers/hash_tables.h" | 14 #include "base/containers/hash_tables.h" |
| 15 #include "base/macros.h" |
12 | 16 |
13 namespace courgette { | 17 namespace courgette { |
14 | 18 |
15 // Our difference measure is the number of k-tuples that occur in Subject that | 19 // Our difference measure is the number of k-tuples that occur in Subject that |
16 // don't occur in Base. | 20 // don't occur in Base. |
17 const int kTupleSize = 4; | 21 const int kTupleSize = 4; |
18 | 22 |
19 namespace { | 23 namespace { |
20 | 24 |
21 static_assert(kTupleSize >= 4 && kTupleSize <= 8, | 25 static_assert(kTupleSize >= 4 && kTupleSize <= 8, |
22 "kTupleSize should be between 4 and 8"); | 26 "kTupleSize should be between 4 and 8"); |
23 | 27 |
24 size_t HashTuple(const uint8* source) { | 28 size_t HashTuple(const uint8_t* source) { |
25 size_t hash1 = *reinterpret_cast<const uint32*>(source); | 29 size_t hash1 = *reinterpret_cast<const uint32_t*>(source); |
26 size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4); | 30 size_t hash2 = *reinterpret_cast<const uint32_t*>(source + kTupleSize - 4); |
27 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23); | 31 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23); |
28 return hash; | 32 return hash; |
29 } | 33 } |
30 | 34 |
31 bool RegionsEqual(const Region& a, const Region& b) { | 35 bool RegionsEqual(const Region& a, const Region& b) { |
32 if (a.length() != b.length()) | 36 if (a.length() != b.length()) |
33 return false; | 37 return false; |
34 return memcmp(a.start(), b.start(), a.length()) == 0; | 38 return memcmp(a.start(), b.start(), a.length()) == 0; |
35 } | 39 } |
36 | 40 |
37 } // anonymous namepace | 41 } // anonymous namepace |
38 | 42 |
39 class DifferenceEstimator::Base { | 43 class DifferenceEstimator::Base { |
40 public: | 44 public: |
41 explicit Base(const Region& region) : region_(region) { } | 45 explicit Base(const Region& region) : region_(region) { } |
42 | 46 |
43 void Init() { | 47 void Init() { |
44 if (region_.length() < kTupleSize) | 48 if (region_.length() < kTupleSize) |
45 return; | 49 return; |
46 const uint8* start = region_.start(); | 50 const uint8_t* start = region_.start(); |
47 const uint8* end = region_.end() - (kTupleSize - 1); | 51 const uint8_t* end = region_.end() - (kTupleSize - 1); |
48 for (const uint8* p = start; p < end; ++p) { | 52 for (const uint8_t* p = start; p < end; ++p) { |
49 size_t hash = HashTuple(p); | 53 size_t hash = HashTuple(p); |
50 hashes_.insert(hash); | 54 hashes_.insert(hash); |
51 } | 55 } |
52 } | 56 } |
53 | 57 |
54 const Region& region() const { return region_; } | 58 const Region& region() const { return region_; } |
55 | 59 |
56 private: | 60 private: |
57 Region region_; | 61 Region region_; |
58 base::hash_set<size_t> hashes_; | 62 base::hash_set<size_t> hashes_; |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
92 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject( | 96 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject( |
93 const Region& region) { | 97 const Region& region) { |
94 Subject* subject = new Subject(region); | 98 Subject* subject = new Subject(region); |
95 owned_subjects_.push_back(subject); | 99 owned_subjects_.push_back(subject); |
96 return subject; | 100 return subject; |
97 } | 101 } |
98 | 102 |
99 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) { | 103 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) { |
100 size_t mismatches = 0; | 104 size_t mismatches = 0; |
101 if (subject->region().length() >= kTupleSize) { | 105 if (subject->region().length() >= kTupleSize) { |
102 const uint8* start = subject->region().start(); | 106 const uint8_t* start = subject->region().start(); |
103 const uint8* end = subject->region().end() - (kTupleSize - 1); | 107 const uint8_t* end = subject->region().end() - (kTupleSize - 1); |
104 | 108 |
105 const uint8* p = start; | 109 const uint8_t* p = start; |
106 while (p < end) { | 110 while (p < end) { |
107 size_t hash = HashTuple(p); | 111 size_t hash = HashTuple(p); |
108 if (base->hashes_.find(hash) == base->hashes_.end()) { | 112 if (base->hashes_.find(hash) == base->hashes_.end()) { |
109 ++mismatches; | 113 ++mismatches; |
110 } | 114 } |
111 p += 1; | 115 p += 1; |
112 } | 116 } |
113 } | 117 } |
114 | 118 |
115 if (mismatches == 0) { | 119 if (mismatches == 0) { |
116 if (RegionsEqual(base->region(), subject->region())) | 120 if (RegionsEqual(base->region(), subject->region())) |
117 return 0; | 121 return 0; |
118 } | 122 } |
119 ++mismatches; // Guarantee not zero. | 123 ++mismatches; // Guarantee not zero. |
120 return mismatches; | 124 return mismatches; |
121 } | 125 } |
122 | 126 |
123 } // namespace | 127 } // namespace |
OLD | NEW |