| OLD | NEW |
| (Empty) | |
| 1 /* Copyright 2013 Google Inc. All Rights Reserved. |
| 2 |
| 3 Distributed under MIT license. |
| 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
| 5 */ |
| 6 |
| 7 // Functions for clustering similar histograms together. |
| 8 |
| 9 #ifndef BROTLI_ENC_CLUSTER_H_ |
| 10 #define BROTLI_ENC_CLUSTER_H_ |
| 11 |
| 12 #include <math.h> |
| 13 #include <algorithm> |
| 14 #include <utility> |
| 15 #include <vector> |
| 16 |
| 17 #include "./bit_cost.h" |
| 18 #include "./entropy_encode.h" |
| 19 #include "./fast_log.h" |
| 20 #include "./histogram.h" |
| 21 #include "./port.h" |
| 22 #include "./types.h" |
| 23 |
| 24 namespace brotli { |
| 25 |
| 26 struct HistogramPair { |
| 27 uint32_t idx1; |
| 28 uint32_t idx2; |
| 29 double cost_combo; |
| 30 double cost_diff; |
| 31 }; |
| 32 |
| 33 inline bool operator<(const HistogramPair& p1, const HistogramPair& p2) { |
| 34 if (p1.cost_diff != p2.cost_diff) { |
| 35 return p1.cost_diff > p2.cost_diff; |
| 36 } |
| 37 return (p1.idx2 - p1.idx1) > (p2.idx2 - p2.idx1); |
| 38 } |
| 39 |
| 40 // Returns entropy reduction of the context map when we combine two clusters. |
| 41 inline double ClusterCostDiff(size_t size_a, size_t size_b) { |
| 42 size_t size_c = size_a + size_b; |
| 43 return static_cast<double>(size_a) * FastLog2(size_a) + |
| 44 static_cast<double>(size_b) * FastLog2(size_b) - |
| 45 static_cast<double>(size_c) * FastLog2(size_c); |
| 46 } |
| 47 |
| 48 // Computes the bit cost reduction by combining out[idx1] and out[idx2] and if |
| 49 // it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. |
| 50 template<typename HistogramType> |
| 51 void CompareAndPushToQueue(const HistogramType* out, |
| 52 const uint32_t* cluster_size, |
| 53 uint32_t idx1, uint32_t idx2, |
| 54 size_t max_num_pairs, |
| 55 HistogramPair* pairs, |
| 56 size_t* num_pairs) { |
| 57 if (idx1 == idx2) { |
| 58 return; |
| 59 } |
| 60 if (idx2 < idx1) { |
| 61 uint32_t t = idx2; |
| 62 idx2 = idx1; |
| 63 idx1 = t; |
| 64 } |
| 65 bool store_pair = false; |
| 66 HistogramPair p; |
| 67 p.idx1 = idx1; |
| 68 p.idx2 = idx2; |
| 69 p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]); |
| 70 p.cost_diff -= out[idx1].bit_cost_; |
| 71 p.cost_diff -= out[idx2].bit_cost_; |
| 72 |
| 73 if (out[idx1].total_count_ == 0) { |
| 74 p.cost_combo = out[idx2].bit_cost_; |
| 75 store_pair = true; |
| 76 } else if (out[idx2].total_count_ == 0) { |
| 77 p.cost_combo = out[idx1].bit_cost_; |
| 78 store_pair = true; |
| 79 } else { |
| 80 double threshold = *num_pairs == 0 ? 1e99 : |
| 81 std::max(0.0, pairs[0].cost_diff); |
| 82 HistogramType combo = out[idx1]; |
| 83 combo.AddHistogram(out[idx2]); |
| 84 double cost_combo = PopulationCost(combo); |
| 85 if (cost_combo < threshold - p.cost_diff) { |
| 86 p.cost_combo = cost_combo; |
| 87 store_pair = true; |
| 88 } |
| 89 } |
| 90 if (store_pair) { |
| 91 p.cost_diff += p.cost_combo; |
| 92 if (*num_pairs > 0 && pairs[0] < p) { |
| 93 // Replace the top of the queue if needed. |
| 94 if (*num_pairs < max_num_pairs) { |
| 95 pairs[*num_pairs] = pairs[0]; |
| 96 ++(*num_pairs); |
| 97 } |
| 98 pairs[0] = p; |
| 99 } else if (*num_pairs < max_num_pairs) { |
| 100 pairs[*num_pairs] = p; |
| 101 ++(*num_pairs); |
| 102 } |
| 103 } |
| 104 } |
| 105 |
| 106 template<typename HistogramType> |
| 107 size_t HistogramCombine(HistogramType* out, |
| 108 uint32_t* cluster_size, |
| 109 uint32_t* symbols, |
| 110 uint32_t* clusters, |
| 111 HistogramPair* pairs, |
| 112 size_t num_clusters, |
| 113 size_t symbols_size, |
| 114 size_t max_clusters, |
| 115 size_t max_num_pairs) { |
| 116 double cost_diff_threshold = 0.0; |
| 117 size_t min_cluster_size = 1; |
| 118 |
| 119 // We maintain a vector of histogram pairs, with the property that the pair |
| 120 // with the maximum bit cost reduction is the first. |
| 121 size_t num_pairs = 0; |
| 122 for (size_t idx1 = 0; idx1 < num_clusters; ++idx1) { |
| 123 for (size_t idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) { |
| 124 CompareAndPushToQueue(out, cluster_size, clusters[idx1], clusters[idx2], |
| 125 max_num_pairs, &pairs[0], &num_pairs); |
| 126 } |
| 127 } |
| 128 |
| 129 while (num_clusters > min_cluster_size) { |
| 130 if (pairs[0].cost_diff >= cost_diff_threshold) { |
| 131 cost_diff_threshold = 1e99; |
| 132 min_cluster_size = max_clusters; |
| 133 continue; |
| 134 } |
| 135 // Take the best pair from the top of heap. |
| 136 uint32_t best_idx1 = pairs[0].idx1; |
| 137 uint32_t best_idx2 = pairs[0].idx2; |
| 138 out[best_idx1].AddHistogram(out[best_idx2]); |
| 139 out[best_idx1].bit_cost_ = pairs[0].cost_combo; |
| 140 cluster_size[best_idx1] += cluster_size[best_idx2]; |
| 141 for (size_t i = 0; i < symbols_size; ++i) { |
| 142 if (symbols[i] == best_idx2) { |
| 143 symbols[i] = best_idx1; |
| 144 } |
| 145 } |
| 146 for (size_t i = 0; i < num_clusters; ++i) { |
| 147 if (clusters[i] == best_idx2) { |
| 148 memmove(&clusters[i], &clusters[i + 1], |
| 149 (num_clusters - i - 1) * sizeof(clusters[0])); |
| 150 break; |
| 151 } |
| 152 } |
| 153 --num_clusters; |
| 154 // Remove pairs intersecting the just combined best pair. |
| 155 size_t copy_to_idx = 0; |
| 156 for (size_t i = 0; i < num_pairs; ++i) { |
| 157 HistogramPair& p = pairs[i]; |
| 158 if (p.idx1 == best_idx1 || p.idx2 == best_idx1 || |
| 159 p.idx1 == best_idx2 || p.idx2 == best_idx2) { |
| 160 // Remove invalid pair from the queue. |
| 161 continue; |
| 162 } |
| 163 if (pairs[0] < p) { |
| 164 // Replace the top of the queue if needed. |
| 165 HistogramPair front = pairs[0]; |
| 166 pairs[0] = p; |
| 167 pairs[copy_to_idx] = front; |
| 168 } else { |
| 169 pairs[copy_to_idx] = p; |
| 170 } |
| 171 ++copy_to_idx; |
| 172 } |
| 173 num_pairs = copy_to_idx; |
| 174 |
| 175 // Push new pairs formed with the combined histogram to the heap. |
| 176 for (size_t i = 0; i < num_clusters; ++i) { |
| 177 CompareAndPushToQueue(out, cluster_size, best_idx1, clusters[i], |
| 178 max_num_pairs, &pairs[0], &num_pairs); |
| 179 } |
| 180 } |
| 181 return num_clusters; |
| 182 } |
| 183 |
| 184 // ----------------------------------------------------------------------------- |
| 185 // Histogram refinement |
| 186 |
| 187 // What is the bit cost of moving histogram from cur_symbol to candidate. |
| 188 template<typename HistogramType> |
| 189 double HistogramBitCostDistance(const HistogramType& histogram, |
| 190 const HistogramType& candidate) { |
| 191 if (histogram.total_count_ == 0) { |
| 192 return 0.0; |
| 193 } |
| 194 HistogramType tmp = histogram; |
| 195 tmp.AddHistogram(candidate); |
| 196 return PopulationCost(tmp) - candidate.bit_cost_; |
| 197 } |
| 198 |
| 199 // Find the best 'out' histogram for each of the 'in' histograms. |
| 200 // When called, clusters[0..num_clusters) contains the unique values from |
| 201 // symbols[0..in_size), but this property is not preserved in this function. |
| 202 // Note: we assume that out[]->bit_cost_ is already up-to-date. |
| 203 template<typename HistogramType> |
| 204 void HistogramRemap(const HistogramType* in, size_t in_size, |
| 205 const uint32_t* clusters, size_t num_clusters, |
| 206 HistogramType* out, uint32_t* symbols) { |
| 207 for (size_t i = 0; i < in_size; ++i) { |
| 208 uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1]; |
| 209 double best_bits = HistogramBitCostDistance(in[i], out[best_out]); |
| 210 for (size_t j = 0; j < num_clusters; ++j) { |
| 211 const double cur_bits = HistogramBitCostDistance(in[i], out[clusters[j]]); |
| 212 if (cur_bits < best_bits) { |
| 213 best_bits = cur_bits; |
| 214 best_out = clusters[j]; |
| 215 } |
| 216 } |
| 217 symbols[i] = best_out; |
| 218 } |
| 219 |
| 220 // Recompute each out based on raw and symbols. |
| 221 for (size_t j = 0; j < num_clusters; ++j) { |
| 222 out[clusters[j]].Clear(); |
| 223 } |
| 224 for (size_t i = 0; i < in_size; ++i) { |
| 225 out[symbols[i]].AddHistogram(in[i]); |
| 226 } |
| 227 } |
| 228 |
| 229 // Reorders elements of the out[0..length) array and changes values in |
| 230 // symbols[0..length) array in the following way: |
| 231 // * when called, symbols[] contains indexes into out[], and has N unique |
| 232 // values (possibly N < length) |
| 233 // * on return, symbols'[i] = f(symbols[i]) and |
| 234 // out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length, |
| 235 // where f is a bijection between the range of symbols[] and [0..N), and |
| 236 // the first occurrences of values in symbols'[i] come in consecutive |
| 237 // increasing order. |
| 238 // Returns N, the number of unique values in symbols[]. |
| 239 template<typename HistogramType> |
| 240 size_t HistogramReindex(HistogramType* out, uint32_t* symbols, size_t length) { |
| 241 static const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max(); |
| 242 std::vector<uint32_t> new_index(length, kInvalidIndex); |
| 243 uint32_t next_index = 0; |
| 244 for (size_t i = 0; i < length; ++i) { |
| 245 if (new_index[symbols[i]] == kInvalidIndex) { |
| 246 new_index[symbols[i]] = next_index; |
| 247 ++next_index; |
| 248 } |
| 249 } |
| 250 std::vector<HistogramType> tmp(next_index); |
| 251 next_index = 0; |
| 252 for (size_t i = 0; i < length; ++i) { |
| 253 if (new_index[symbols[i]] == next_index) { |
| 254 tmp[next_index] = out[symbols[i]]; |
| 255 ++next_index; |
| 256 } |
| 257 symbols[i] = new_index[symbols[i]]; |
| 258 } |
| 259 for (size_t i = 0; i < next_index; ++i) { |
| 260 out[i] = tmp[i]; |
| 261 } |
| 262 return next_index; |
| 263 } |
| 264 |
| 265 // Clusters similar histograms in 'in' together, the selected histograms are |
| 266 // placed in 'out', and for each index in 'in', *histogram_symbols will |
| 267 // indicate which of the 'out' histograms is the best approximation. |
| 268 template<typename HistogramType> |
| 269 void ClusterHistograms(const std::vector<HistogramType>& in, |
| 270 size_t num_contexts, size_t num_blocks, |
| 271 size_t max_histograms, |
| 272 std::vector<HistogramType>* out, |
| 273 std::vector<uint32_t>* histogram_symbols) { |
| 274 const size_t in_size = num_contexts * num_blocks; |
| 275 assert(in_size == in.size()); |
| 276 std::vector<uint32_t> cluster_size(in_size, 1); |
| 277 std::vector<uint32_t> clusters(in_size); |
| 278 size_t num_clusters = 0; |
| 279 out->resize(in_size); |
| 280 histogram_symbols->resize(in_size); |
| 281 for (size_t i = 0; i < in_size; ++i) { |
| 282 (*out)[i] = in[i]; |
| 283 (*out)[i].bit_cost_ = PopulationCost(in[i]); |
| 284 (*histogram_symbols)[i] = static_cast<uint32_t>(i); |
| 285 } |
| 286 |
| 287 const size_t max_input_histograms = 64; |
| 288 // For the first pass of clustering, we allow all pairs. |
| 289 size_t max_num_pairs = max_input_histograms * max_input_histograms / 2; |
| 290 std::vector<HistogramPair> pairs(max_num_pairs + 1); |
| 291 |
| 292 for (size_t i = 0; i < in_size; i += max_input_histograms) { |
| 293 size_t num_to_combine = std::min(in_size - i, max_input_histograms); |
| 294 for (size_t j = 0; j < num_to_combine; ++j) { |
| 295 clusters[num_clusters + j] = static_cast<uint32_t>(i + j); |
| 296 } |
| 297 size_t num_new_clusters = |
| 298 HistogramCombine(&(*out)[0], &cluster_size[0], |
| 299 &(*histogram_symbols)[i], |
| 300 &clusters[num_clusters], &pairs[0], |
| 301 num_to_combine, num_to_combine, |
| 302 max_histograms, max_num_pairs); |
| 303 num_clusters += num_new_clusters; |
| 304 } |
| 305 |
| 306 // For the second pass, we limit the total number of histogram pairs. |
| 307 // After this limit is reached, we only keep searching for the best pair. |
| 308 max_num_pairs = |
| 309 std::min(64 * num_clusters, (num_clusters / 2) * num_clusters); |
| 310 pairs.resize(max_num_pairs + 1); |
| 311 |
| 312 // Collapse similar histograms. |
| 313 num_clusters = HistogramCombine(&(*out)[0], &cluster_size[0], |
| 314 &(*histogram_symbols)[0], &clusters[0], |
| 315 &pairs[0], num_clusters, in_size, |
| 316 max_histograms, max_num_pairs); |
| 317 |
| 318 // Find the optimal map from original histograms to the final ones. |
| 319 HistogramRemap(&in[0], in_size, &clusters[0], num_clusters, |
| 320 &(*out)[0], &(*histogram_symbols)[0]); |
| 321 |
| 322 // Convert the context map to a canonical form. |
| 323 size_t num_histograms = |
| 324 HistogramReindex(&(*out)[0], &(*histogram_symbols)[0], in_size); |
| 325 out->resize(num_histograms); |
| 326 } |
| 327 |
| 328 } // namespace brotli |
| 329 |
| 330 #endif // BROTLI_ENC_CLUSTER_H_ |
| OLD | NEW |