OLD | NEW |
| (Empty) |
1 /* Copyright 2013 Google Inc. All Rights Reserved. | |
2 | |
3 Distributed under MIT license. | |
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT | |
5 */ | |
6 | |
7 // Literal cost model to allow backward reference replacement to be efficient. | |
8 | |
9 #include "./literal_cost.h" | |
10 | |
11 #include <math.h> | |
12 #include <algorithm> | |
13 | |
14 #include "./fast_log.h" | |
15 #include "./types.h" | |
16 #include "./utf8_util.h" | |
17 | |
18 namespace brotli { | |
19 | |
20 static size_t UTF8Position(size_t last, size_t c, size_t clamp) { | |
21 if (c < 128) { | |
22 return 0; // Next one is the 'Byte 1' again. | |
23 } else if (c >= 192) { // Next one is the 'Byte 2' of utf-8 encoding. | |
24 return std::min<size_t>(1, clamp); | |
25 } else { | |
26 // Let's decide over the last byte if this ends the sequence. | |
27 if (last < 0xe0) { | |
28 return 0; // Completed two or three byte coding. | |
29 } else { // Next one is the 'Byte 3' of utf-8 encoding. | |
30 return std::min<size_t>(2, clamp); | |
31 } | |
32 } | |
33 } | |
34 | |
35 static size_t DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask, | |
36 const uint8_t *data) { | |
37 size_t counts[3] = { 0 }; | |
38 size_t max_utf8 = 1; // should be 2, but 1 compresses better. | |
39 size_t last_c = 0; | |
40 size_t utf8_pos = 0; | |
41 for (size_t i = 0; i < len; ++i) { | |
42 size_t c = data[(pos + i) & mask]; | |
43 utf8_pos = UTF8Position(last_c, c, 2); | |
44 ++counts[utf8_pos]; | |
45 last_c = c; | |
46 } | |
47 if (counts[2] < 500) { | |
48 max_utf8 = 1; | |
49 } | |
50 if (counts[1] + counts[2] < 25) { | |
51 max_utf8 = 0; | |
52 } | |
53 return max_utf8; | |
54 } | |
55 | |
56 static void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, | |
57 const uint8_t *data, float *cost) { | |
58 | |
59 // max_utf8 is 0 (normal ascii single byte modeling), | |
60 // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling). | |
61 const size_t max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data); | |
62 size_t histogram[3][256] = { { 0 } }; | |
63 size_t window_half = 495; | |
64 size_t in_window = std::min(window_half, len); | |
65 size_t in_window_utf8[3] = { 0 }; | |
66 | |
67 // Bootstrap histograms. | |
68 size_t last_c = 0; | |
69 size_t utf8_pos = 0; | |
70 for (size_t i = 0; i < in_window; ++i) { | |
71 size_t c = data[(pos + i) & mask]; | |
72 ++histogram[utf8_pos][c]; | |
73 ++in_window_utf8[utf8_pos]; | |
74 utf8_pos = UTF8Position(last_c, c, max_utf8); | |
75 last_c = c; | |
76 } | |
77 | |
78 // Compute bit costs with sliding window. | |
79 for (size_t i = 0; i < len; ++i) { | |
80 if (i >= window_half) { | |
81 // Remove a byte in the past. | |
82 size_t c = i < window_half + 1 ? | |
83 0 : data[(pos + i - window_half - 1) & mask]; | |
84 size_t last_c = i < window_half + 2 ? | |
85 0 : data[(pos + i - window_half - 2) & mask]; | |
86 size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8); | |
87 --histogram[utf8_pos2][data[(pos + i - window_half) & mask]]; | |
88 --in_window_utf8[utf8_pos2]; | |
89 } | |
90 if (i + window_half < len) { | |
91 // Add a byte in the future. | |
92 size_t c = data[(pos + i + window_half - 1) & mask]; | |
93 size_t last_c = data[(pos + i + window_half - 2) & mask]; | |
94 size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8); | |
95 ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]]; | |
96 ++in_window_utf8[utf8_pos2]; | |
97 } | |
98 size_t c = i < 1 ? 0 : data[(pos + i - 1) & mask]; | |
99 size_t last_c = i < 2 ? 0 : data[(pos + i - 2) & mask]; | |
100 size_t utf8_pos = UTF8Position(last_c, c, max_utf8); | |
101 size_t masked_pos = (pos + i) & mask; | |
102 size_t histo = histogram[utf8_pos][data[masked_pos]]; | |
103 if (histo == 0) { | |
104 histo = 1; | |
105 } | |
106 double lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo); | |
107 lit_cost += 0.02905; | |
108 if (lit_cost < 1.0) { | |
109 lit_cost *= 0.5; | |
110 lit_cost += 0.5; | |
111 } | |
112 // Make the first bytes more expensive -- seems to help, not sure why. | |
113 // Perhaps because the entropy source is changing its properties | |
114 // rapidly in the beginning of the file, perhaps because the beginning | |
115 // of the data is a statistical "anomaly". | |
116 if (i < 2000) { | |
117 lit_cost += 0.7 - (static_cast<double>(2000 - i) / 2000.0 * 0.35); | |
118 } | |
119 cost[i] = static_cast<float>(lit_cost); | |
120 } | |
121 } | |
122 | |
123 void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask, | |
124 const uint8_t *data, float *cost) { | |
125 if (IsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) { | |
126 EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost); | |
127 return; | |
128 } | |
129 size_t histogram[256] = { 0 }; | |
130 size_t window_half = 2000; | |
131 size_t in_window = std::min(window_half, len); | |
132 | |
133 // Bootstrap histogram. | |
134 for (size_t i = 0; i < in_window; ++i) { | |
135 ++histogram[data[(pos + i) & mask]]; | |
136 } | |
137 | |
138 // Compute bit costs with sliding window. | |
139 for (size_t i = 0; i < len; ++i) { | |
140 if (i >= window_half) { | |
141 // Remove a byte in the past. | |
142 --histogram[data[(pos + i - window_half) & mask]]; | |
143 --in_window; | |
144 } | |
145 if (i + window_half < len) { | |
146 // Add a byte in the future. | |
147 ++histogram[data[(pos + i + window_half) & mask]]; | |
148 ++in_window; | |
149 } | |
150 size_t histo = histogram[data[(pos + i) & mask]]; | |
151 if (histo == 0) { | |
152 histo = 1; | |
153 } | |
154 double lit_cost = FastLog2(in_window) - FastLog2(histo); | |
155 lit_cost += 0.029; | |
156 if (lit_cost < 1.0) { | |
157 lit_cost *= 0.5; | |
158 lit_cost += 0.5; | |
159 } | |
160 cost[i] = static_cast<float>(lit_cost); | |
161 } | |
162 } | |
163 | |
164 | |
165 } // namespace brotli | |
OLD | NEW |