Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(92)

Side by Side Diff: third_party/cld/encodings/compact_enc_det/compact_enc_det.cc

Issue 1956183002: CL for perf tryjob on linux (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 //
2 // Copyright 2006, 2007 Google Inc. All Rights Reserved.
3 // Author: dsites@google.com (Dick Sites)
4 //
5 // Design document: eng/designdocs/i18n/compact_encoding_detector.pdf
6
7 #include "encodings/compact_enc_det/compact_enc_det.h"
8
9 #include <math.h> // for sqrt
10 #include <stddef.h> // for size_t
11 #include <stdio.h> // for printf, fprintf, NULL, etc
12 #include <stdlib.h> // for qsort
13 #include <string.h> // for memset, memcpy, memcmp, etc
14 #include <memory>
15 #include <string> // for string, operator==, etc
16
17 //#include "base/basictypes.h" // for uint8, uint32, char32, etc
18 //#include "base/commandlineflags.h" // for DEFINE_bool, <anonymous>, etc
19 //#include "base/logging.h" // for COMPACT_GOOGLE_LOG_FATAL, etc
20 //#include "base/macros.h" // for COMPILE_ASSERT, arraysize, etc
21 #include "encodings/compact_enc_det/compact_enc_det_hint_code.h"
22 #include "encodings/compact_lang_det/win/cld_basictypes.h"
23 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
24 #include "encodings/compact_lang_det/win/cld_logging.h"
25 #include "encodings/compact_lang_det/win/cld_macros.h"
26
27 using std::string;
28
29 // TODO
30 // dsites 2007.10.09
31 //
32 // Consider font=TT-BHxxx as user-defined => binary
33 // Demote GB18030 if no 8x3x pair
34 // Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires
35 // Consider removing/ignoring bytes 01-1F to avoid crap pollution
36 // Possibly boost declared encoding in robust scan
37 // googlebot tiny files
38 // look for ranges of encodings
39 // consider tags just as > < within aligned block of 32
40 // flag too few characters in postproc (Latin 6 problem)
41 // Remove slow scan beyond 16KB
42 // Consider removing kMostLikelyEncoding or cut it in half
43
44
45 // A note on mixed encodings
46 //
47 // The most common encoding error on the web is a page containing a mixture of
48 // CP-1252 and UTF-8. A less common encoding error is a third-party feed that
49 // has been converted from CP-1252 to UTF-8 and then those bytes converted a
50 // second time to UTF-8. CED originally attempted to detect these error cases
51 // by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended
52 // implementation was to start these just below CP1252 and UTF8 respectively in
53 // overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are
54 // found.
55 //
56 // The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the
57 // UTF8CP1252 internal encoding was added late and not put into encodings.proto,
58 // so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and
59 // is removed in this November 2011 CL.
60 //
61 // Mixed encoding detection never worked out as well as envisioned, so the
62 // ced_allow_utf8utf8 flag normally disables all this.
63 //
64 // The effect is that CP-1252 and UTF-8 mixtures will usually be detected as
65 // UTF8, and the inputconverter code for UTF8 normally will convert bare
66 // CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8
67 // and double-UTF-8 mixtures will be detected as UTF-8, and the double
68 // conversion will stand.
69 //
70 // However, it is occasionally useful to use CED to detect double-converted
71 // UTF-8 coming from third-party data feeds, so they can be fixed at the source.
72 // For this purpose, the UTF8UTF8 encoding remains available under the
73 // ced_allow_utf8utf8 flag.
74 //
75 // When UTF8UTF8 is detected, the inputconverter code will undo the double
76 // conversion, giving good text.
77
78 // Norbert Runge has noted these words in CP1252 that are mistakenly identified
79 // as UTF-8 because of the last pair of characters:
80 // NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH
81 // drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N
82 // Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA
83 // Schoß\u201c 0xDF 0x93 U+00DF U+201C
84 // weiß\u201c 0xDF 0x93 U+00DF U+201C
85 // Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C
86 // süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE
87 // These four byte combinations now explicitly boost Latin1/CP1252.
88
89 // And for reference, here are a couple of Portuguese spellings
90 // that may be mistaken as double-byte encodings.
91 // informações 0xE7 0xF5
92 // traição 0xE7 0xE3
93
94
95 static const char* kVersion = "2.2";
96
97 DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, "
98 "to handle mixtures of CP1252 "
99 "converted to UTF-8 zero, one, "
100 "or two times");
101 DEFINE_int32(enc_detect_slow_max_kb, 16,
102 "Maximum number of Kbytes to examine for "
103 "7-bit-only (2022, Hz, UTF7) encoding detect. "
104 "You are unlikely to want to change this.");
105 DEFINE_int32(enc_detect_fast_max_kb, 256,
106 "Maximum number of Kbytes to examine for encoding detect. "
107 "You are unlikely to want to change this.");
108
109 DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility "
110 "difference 1st - 2nd to be considered reliable \n"
111 " 2 corresponds to min 4x difference\n"
112 " 4 corresponds to min 16x difference\n"
113 " 8 corresponds to min 256x difference\n"
114 " 10 corresponds to min 1024x difference\n"
115 " 20 corresponds to min 1Mx difference.");
116
117 // Text debug output options
118 DEFINE_bool(enc_detect_summary, false,
119 "Print first 16 interesting pairs at exit.");
120 DEFINE_bool(counts, false, "Count major-section usage");
121
122 // PostScript debug output options
123 DEFINE_bool(enc_detect_detail, false,
124 "Print PostScript of every update, to stderr.");
125 DEFINE_bool(enc_detect_detail2, false,
126 "More PostScript detail of every update, to stderr.");
127 DEFINE_bool(enc_detect_source, false, "Include source text in detail");
128 // Encoding name must exactly match FIRST column of kI18NInfoByEncoding in
129 // lang_enc.cc
130 DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name.");
131 DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name.");
132
133
134 // Only for experiments. Delete soon.
135 DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams");
136
137 // Demo-mode/debugging experiment
138 DEFINE_bool(demo_nodefault, false,
139 "Default to all equal; no boost for declared encoding.");
140 DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings");
141 DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr");
142
143
144 static const int XDECILOG2 = 3; // Multiplier for log base 2 ** n/10
145 static const int XLOG2 = 30; // Multiplier for log base 2 ** n
146
147 static const int kFinalPruneDifference = 10 * XLOG2;
148 // Final bits of minimum
149 // probability difference 1st-nth
150 // to be pruned
151
152 static const int kInititalPruneDifference = kFinalPruneDifference * 4;
153 // Initial bits of minimum
154 // probability difference 1st-nth
155 // to be pruned
156 //
157 static const int kPruneDiffDecrement = kFinalPruneDifference;
158 // Decrements bits of minimum
159 // probability difference 1st-nth
160 // to be pruned
161
162 static const int kSmallInitDiff = 2 * XLOG2; // bits of minimum
163 // probability difference, base to
164 // superset encodings
165
166 static const int kBoostInitial = 20 * XLOG2; // bits of boost for
167 // initial byte patterns (BOM, 00)
168
169 static const int kBadPairWhack = 20 * XLOG2; // bits of whack for
170 // one bad pair
171
172 static const int kBoostOnePair = 20 * XLOG2; // bits of boost for
173 // one good pair in Hz, etc.
174
175 static const int kGentleOnePair = 4 * XLOG2; // bits of boost for
176 // one good sequence
177 //
178 static const int kGentlePairWhack = 2 * XLOG2; // bits of whack
179 // for ill-formed sequence
180
181 static const int kGentlePairBoost = 2 * XLOG2; // bits of boost
182 // for well-formed sequence
183
184 static const int kBoostPerB64Byte = 2 * XLOG2; // bits of boost per
185 // Base64 byte (presumably in UTF-7 runs; TODO confirm at use site)
186
187 static const int kDeclaredEncBoost = 5 * XDECILOG2; // bits/10 of boost for
188 // best declared encoding per bigram
189
190 static const int kBestEncBoost = 5 * XDECILOG2; // bits/10 of boost for
191 // best encoding per bigram
192
193 static const int kTrigramBoost = 2 * XLOG2; // bits of boost for Latin127 tri
194
195 static const int kMaxPairs = 48; // Max interesting pairs to look at
196 // If you change this,
197 // adjust *PruneDiff*
198
199 static const int kPruneMask = 0x07; // Prune every 8 interesting pairs
200
201
202 static const int kBestPairsCount = 16; // For first N pairs, do extra boost
203 // based on most likely encoding
204 // of pair over entire web
205
206 static const int kDerateHintsBelow = 12; // If we have fewer than N bigrams,
207 // weaken the hints enough that
208 // unhinted encodings have a hope of
209 // rising to the top
210
211 static const int kMinRescanLength = 800; // Don't bother rescanning for
212 // unreliable encoding if fewer
213 // than this many bytes unscanned.
214 // We will rescan at most last half
215 // of this.
216
217 static const int kStrongBinary = 12; // Make F_BINARY the only encoding
218 static const int kWeakerBinary = 4; // Make F_BINARY likely encoding
219
220 // These are byte counts from front of file
221 static const int kBinaryHardAsciiLimit = 6 * 1024; // Not binary if all ASCII
222 static const int kBinarySoftAsciiLimit = 8 * 1024; // " if mostly ASCII
223
224 // We try here to avoid having title text dominate the encoding detection,
225 // for the not-infrequent error case of title in encoding1, body in encoding2:
226 // we want to bias toward encoding2 winning.
227 //
228 // kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we
229 // rarely cut off mid-character in the original (not-yet-detected) encoding.
230 // This matters most for UTF-8 two- and three-byte codes and for
231 // Shift-JIS three-byte codes.
232 static const int kMaxBigramsTagTitleText = 12; // Keep only some tag text
233 static const int kWeightshiftForTagTitleText = 4; // Give text in tags, etc.
234 // 1/16 normal weight
235
236 static const int kStrongPairs = 6; // Let reliable enc with this many
237 // pairs overcome missing hint
238
239 enum CEDInternalFlags {
240 kCEDNone = 0, // The empty flag
241 kCEDRescanning = 1, // Do not further recurse
242 kCEDSlowscore = 2, // Do extra scoring
243 kCEDForceTags = 4, // Always examine text inside tags
244 };
245
246 // Forward declaration
247 Encoding InternalDetectEncoding(
248 CEDInternalFlags flags, const char* text, int text_length,
249 const char* url_hint, const char* http_charset_hint,
250 const char* meta_charset_hint, const int encoding_hint,
251 const Language language_hint, // User interface lang
252 const CompactEncDet::TextCorpusType corpus_type,
253 bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
254 Encoding* second_best_enc);
255
256 typedef struct {
257 const uint8* hires[4]; // Pointers to possible high-resolution bigram deltas
258 uint8 x_bar; // Average byte2 value
259 uint8 y_bar; // Average byte1 value
260 uint8 x_stddev; // Standard deviation of byte2 value
261 uint8 y_stddev; // Standard deviation of byte1 value
262 int so; // Scaling offset -- add to probabilities below
263 const uint8 b1[256]; // Unigram probability for first byte of aligned bigram
264 const uint8 b2[256]; // Unigram probability for second byte of aligned bigram
265 const uint8 b12[256]; // Unigram probability for cross bytes of aligned bigram
266 } UnigramEntry;
267
268 //typedef struct {
269 // uint8 b12[256*256]; // Bigram probability for aligned bigram
270 //} FullBigramEntry;
271
272
273 // Include all the postproc-generated tables here:
274 // RankedEncoding
275 // kMapToEncoding
276 // unigram_table
277 // kMostLikelyEncoding
278 // kTLDHintProbs
279 // kCharsetHintProbs
280 // HintEntry, kMaxTldKey kMaxTldVector, etc.
281 // =============================================================================
282
283 #include "encodings/compact_enc_det/compact_enc_det_generated_tables.h"
284
285
286 #define F_ASCII F_Latin1 // "ASCII" is a misnomer, so this code uses "Latin1"
287
288 #define F_BINARY F_X_BINARYENC // We are mid-update for name change
289 #define F_UTF8UTF8 F_X_UTF8UTF8 // We are mid-update for name change
290 #define F_BIG5_CP950 F_BIG5 // We are mid-update for name change
291 #define F_Unicode F_UTF_16LE // We are mid-update for name change
292 // =============================================================================
293
294 // 7-bit encodings have at least one "interesting" byte value < 0x80
295 // (00 0E 1B + ~)
296 // JIS 2022-cn 2022-kr hz utf7
297 // Unicode UTF-16 UTF-32
298 // 8-bit encodings have no interesting byte values < 0x80
299 static const uint32 kSevenBitActive = 0x00000001; // needs <80 to detect
300 static const uint32 kUTF7Active = 0x00000002; // <80 and +
301 static const uint32 kHzActive = 0x00000004; // <80 and ~
302 static const uint32 kIso2022Active = 0x00000008; // <80 and 1B 0E 0F
303 static const uint32 kUTF8Active = 0x00000010;
304 static const uint32 kUTF8UTF8Active = 0x00000020;
305 static const uint32 kUTF1632Active = 0x00000040; // <80 and 00
306 static const uint32 kBinaryActive = 0x00000080; // <80 and 00
307 static const uint32 kTwobyteCode = 0x00000100; // Needs 8xxx
308 static const uint32 kIsIndicCode = 0x00000200; //
309 static const uint32 kHighAlphaCode = 0x00000400; // full alphabet in 8x-Fx
310 static const uint32 kHighAccentCode = 0x00000800; // accents in 8x-Fx
311 static const uint32 kEUCJPActive = 0x00001000; // Have to mess with phase
312
313
314 // Debug only. not thread safe
315 static int encdet_used = 0;
316 static int rescore_used = 0;
317 static int rescan_used = 0;
318 static int robust_used = 0;
319 static int looking_used = 0;
320 static int doing_used = 0;
321
322
323 // For debugging only -- about 256B/entry times about 500 = 128KB
324 // TODO: only allocate this if being used
325 typedef struct {
326 int offset;
327 int best_enc; // Best ranked encoding for this bigram, or
328 // -1 for overhead entries
329 string label;
330 int detail_enc_prob[NUM_RANKEDENCODING];
331 } DetailEntry;
332
333 static int watch1_rankedenc = -1; // Debug. not threadsafe
334 static int watch2_rankedenc = -1; // Debug. not threadsafe
335 ////static int next_detail_entry = 0; // Debug. not threadsafe
336 ////static DetailEntry details[kMaxPairs * 10]; // Allow 10 details per bigram
337 // End For debugging only
338
339 // Must match kTestPrintableAsciiTildePlus exit codes, minus one
340 enum PairSet {AsciiPair = 0, OtherPair = 1, NUM_PAIR_SETS = 2};
341
342 // The reasons for pruning
343 enum PruneReason {PRUNE_NORMAL, PRUNE_SLOWEND, PRUNE_FINAL};
344
345 static const char* kWhatSetName[] = {"Ascii", "Other"};
346
347
348 // State for encodings that do shift-out/shift-in between one- and two-byte
349 // regions (ISO-2022-xx, HZ)
350 enum StateSoSi {SOSI_NONE, SOSI_ERROR, SOSI_ONEBYTE, SOSI_TWOBYTE};
351
352 typedef struct {
353 const uint8* initial_src; // For calculating byte offsets
354 const uint8* limit_src; // Range of input source
355 const uint8* prior_src; // Source consumed by prior call to BoostPrune
356 const uint8* last_pair; // Last pair inserted into interesting_pairs
357
358 DetailEntry* debug_data; // Normally NULL. Ptr to debug data for
359 // FLAGS_enc_detect_detail PostScript data
360 int next_detail_entry; // Debug
361
362 bool done;
363 bool reliable;
364 bool hints_derated;
365 int declared_enc_1; // From http/meta hint
366 int declared_enc_2; // from http/meta hint
367 int prune_count; // Number of times we have pruned
368
369 int trigram_highwater_mark; // Byte offset of last trigram processing
370 bool looking_for_latin_trigrams; // True if we should test for doing
371 // Latin1/2/7 trigram processing
372 bool do_latin_trigrams; // True if we actually are scoring trigrams
373
374 // Miscellaneous state variables for difficult encodings
375 int binary_quadrants_count; // Number of four bigram quadrants seen:
376 // 0xxxxxxx0xxxxxxx 0xxxxxxx1xxxxxx
377 // 1xxxxxxx0xxxxxxx 1xxxxxxx1xxxxxx
378 int binary_8x4_count; // Number of 8x4 buckets seen:
379 uint32 binary_quadrants_seen; // Bit[i] set if bigram i.......i....... seen
380 uint32 binary_8x4_seen; // Bit[i] set if bigram iii.....ii...... seen
381 int utf7_starts; // Count of possible UTF-7 beginnings seen
382 int prior_utf7_offset; // Source consumed by prior UTF-7 string
383 int next_utf8_ministate; // Mini state for UTF-8 sequences
384 int utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors
385 int next_utf8utf8_ministate; // Mini state for UTF8UTF8 sequences
386 int utf8utf8_odd_byte; // UTF8UTF8 seq has odd number of bytes
387 int utf8utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors
388 StateSoSi next_2022_state; // Mini state for 2022 sequences
389 StateSoSi next_hz_state; // Mini state for HZ sequences
390 bool next_eucjp_oddphase; // Mini state for EUC-JP sequences
391 int byte32_count[8]; // Count of top 3 bits of byte1 of bigram
392 // 0x1x 2x3x 4x5x 6x7x 8x9x AxBx CxDx ExFx
393 uint32 active_special; // Bits showing which special cases are active
394
395 Encoding tld_hint; // Top TLD encoding or UNKNOWN
396 Encoding http_hint; // What the document says about itself or
397 Encoding meta_hint; // UNKNOWN_ENCODING. BOM is initial byte
398 Encoding bom_hint; // order mark for UTF-xx
399
400 // small cache of previous interesting bigrams
401 int next_prior_bigram;
402 int prior_bigram[4];
403 int prior_binary[1];
404
405 int top_rankedencoding; // Top two probabilities and families
406 int second_top_rankedencoding;
407 int top_prob;
408 int second_top_prob;
409 int prune_difference; // Prune things this much below the top prob
410 int rankedencoding_list_len; // Number of active encodings
411 int rankedencoding_list[NUM_RANKEDENCODING]; // List of active encodings
412 //
413 int enc_prob[NUM_RANKEDENCODING]; // Cumulative probability per enc
414 // This is where all the action is
415 int hint_prob[NUM_RANKEDENCODING]; // Initial hint probabilities
416 int hint_weight[NUM_RANKEDENCODING]; // Number of hints for this enc
417
418 // Two sets -- one for printable ASCII, one for the rest
419 int prior_interesting_pair[NUM_PAIR_SETS]; // Pairs consumed by prior call
420 int next_interesting_pair[NUM_PAIR_SETS]; // Next pair to write
421 char interesting_pairs[NUM_PAIR_SETS][kMaxPairs * 2]; // Two bytes per pair
422 int interesting_offsets[NUM_PAIR_SETS][kMaxPairs]; // Src offset of pair
423 int interesting_weightshift[NUM_PAIR_SETS][kMaxPairs]; // weightshift of pair
424 } DetectEncodingState;
425
426
427 // Record a debug event that changes probabilities
428 void SetDetailsEncProb(DetectEncodingState* destatep,
429 int offset, int best_enc, const char* label) {
430 int next = destatep->next_detail_entry;
431 destatep->debug_data[next].offset = offset;
432 destatep->debug_data[next].best_enc = best_enc;
433 destatep->debug_data[next].label = label;
434 memcpy(&destatep->debug_data[next].detail_enc_prob,
435 &destatep->enc_prob,
436 sizeof(destatep->enc_prob));
437 ++destatep->next_detail_entry;
438 }
439
440 // Record a debug event that changes probabilities, copy offset
441 void SetDetailsEncProbCopyOffset(DetectEncodingState* destatep,
442 int best_enc, const char* label) {
443 int next = destatep->next_detail_entry;
444 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
445 destatep->debug_data[next].best_enc = best_enc;
446 destatep->debug_data[next].label = label;
447 memcpy(&destatep->debug_data[next].detail_enc_prob,
448 &destatep->enc_prob,
449 sizeof(destatep->enc_prob));
450 ++destatep->next_detail_entry;
451 }
452
453 // Record a debug event that changes probs and has simple text label
454 void SetDetailsEncLabel(DetectEncodingState* destatep, const char* label) {
455 int next = destatep->next_detail_entry;
456 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
457 destatep->debug_data[next].best_enc = -1;
458 destatep->debug_data[next].label = label;
459 memcpy(&destatep->debug_data[next].detail_enc_prob,
460 &destatep->enc_prob,
461 sizeof(destatep->enc_prob));
462 ++destatep->next_detail_entry;
463 }
464
465 // Record a debug event that is just a text label, no change in probs
466 void SetDetailsLabel(DetectEncodingState* destatep, const char* label) {
467 int next = destatep->next_detail_entry;
468 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
469 destatep->debug_data[next].best_enc = -1;
470 destatep->debug_data[next].label = label;
471 memcpy(&destatep->debug_data[next].detail_enc_prob,
472 &destatep->debug_data[next - 1].detail_enc_prob,
473 sizeof(destatep->enc_prob));
474 ++destatep->next_detail_entry;
475 }
476
477
478 // Maps superset encodings to base, to see if 2 encodings are compatible
479 // (Non-identity mappings are marked "-->" below.)
480 static const Encoding kMapEncToBaseEncoding[] = {
481 ISO_8859_1, // 0: Teragram ASCII
482 ISO_8859_2, // 1: Teragram Latin2
483 ISO_8859_3, // 2: in BasisTech but not in Teragram
484 ISO_8859_4, // 3: Teragram Latin4
485 ISO_8859_5, // 4: Teragram ISO-8859-5
486 ISO_8859_6, // 5: Teragram Arabic
487 ISO_8859_7, // 6: Teragram Greek
488 MSFT_CP1255, // 7: Teragram Hebrew --> 36
489 ISO_8859_9, // 8: in BasisTech but not in Teragram
490 ISO_8859_10, // 9: in BasisTech but not in Teragram
491 JAPANESE_EUC_JP, // 10: Teragram EUC_JP
492 JAPANESE_SHIFT_JIS, // 11: Teragram SJS
493 JAPANESE_JIS, // 12: Teragram JIS
494 CHINESE_BIG5, // 13: Teragram BIG5
495 CHINESE_GB, // 14: Teragram GB
496 CHINESE_EUC_CN, // 15: Teragram EUC-CN
497 KOREAN_EUC_KR, // 16: Teragram KSC
498 UNICODE, // 17: Teragram Unicode
499 CHINESE_EUC_CN, // 18: Teragram EUC --> 15
500 CHINESE_EUC_CN, // 19: Teragram CNS --> 15
501 CHINESE_BIG5, // 20: Teragram BIG5_CP950 --> 13
502 JAPANESE_SHIFT_JIS, // 21: Teragram CP932 --> 11
503 UTF8, // 22
504 UNKNOWN_ENCODING, // 23
505 ISO_8859_1, // 24: ISO_8859_1 with all characters <= 127 --> 0
506 RUSSIAN_KOI8_R, // 25: Teragram KOI8R
507 RUSSIAN_CP1251, // 26: Teragram CP1251
508 ISO_8859_1, // 27: CP1252 aka MSFT euro ascii --> 0
509 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian
510 MSFT_CP1250, // 29: CP1250 aka MSFT eastern european
511 ISO_8859_1, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
512 ISO_8859_9, // 31: used for Turkish
513 ISO_8859_13, // 32: used in Baltic countries --> 43
514 ISO_8859_11, // 33: aka TIS-620, used for Thai
515 ISO_8859_11, // 34: used for Thai --> 33
516 MSFT_CP1256, // 35: used for Arabic
517 MSFT_CP1255, // 36: Logical Hebrew Microsoft
518 MSFT_CP1255, // 37: Iso Hebrew Logical --> 36
519 MSFT_CP1255, // 38: Iso Hebrew Visual --> 36
520 CZECH_CP852, // 39
521 ISO_8859_2, // 40: aka ISO_IR_139 aka KOI8_CS --> 1
522 MSFT_CP1253, // 41: used for Greek, but NOT a superset of 8859-7
523 RUSSIAN_CP866, // 42
524 ISO_8859_13, // 43
525 ISO_2022_KR, // 44
526 CHINESE_GB, // 45 GBK --> 14
527 CHINESE_GB, // 46 GB18030 --> 14
528 CHINESE_BIG5, // 47 BIG5_HKSCS --> 13
529 ISO_2022_KR, // 48 ISO_2022_CN --> 44
530 TSCII, // 49 Indic encoding
531 TAMIL_MONO, // 50 Indic encoding - Tamil
532 TAMIL_BI, // 51 Indic encoding - Tamil
533 JAGRAN, // 52 Indic encoding - Devanagari
534 MACINTOSH_ROMAN, // 53
535 UTF7, // 54
536 BHASKAR, // 55 Indic encoding - Devanagari
537 HTCHANAKYA, // 56 Indic encoding - Devanagari
538 UTF16BE, // 57
539 UTF16LE, // 58
540 UTF32BE, // 59
541 UTF32LE, // 60
542 BINARYENC, // 61
543 HZ_GB_2312, // 62
544 UTF8UTF8, // 63
545 TAM_ELANGO, // 64 Elango - Tamil
546 TAM_LTTMBARANI, // 65 Barani - Tamil
547 TAM_SHREE, // 66 Shree - Tamil
548 TAM_TBOOMIS, // 67 TBoomis - Tamil
549 TAM_TMNEWS, // 68 TMNews - Tamil
550 TAM_WEBTAMIL, // 69 Webtamil - Tamil
551 KDDI_SHIFT_JIS, // 70 KDDI Shift_JIS
552 DOCOMO_SHIFT_JIS, // 71 DoCoMo Shift_JIS
553 SOFTBANK_SHIFT_JIS, // 72 SoftBank Shift_JIS
554 KDDI_ISO_2022_JP, // 73 KDDI ISO-2022-JP
555 SOFTBANK_ISO_2022_JP, // 74 SOFTBANK ISO-2022-JP
556 };
557
558 COMPILE_ASSERT(arraysize(kMapEncToBaseEncoding) == NUM_ENCODINGS,
559 kMapEncToBaseEncoding_has_incorrect_size);
560
561 // Subscripted by Encoding enum value: base encodings map to 0, supersets to 1+, undesired to -1
562 // ("-->" notes give the index of the base encoding, as in kMapEncToBaseEncoding.)
563 static const int kMapEncToSuperLevel[] = {
564 0, // 0: Teragram ASCII
565 0, // 1: Teragram Latin2
566 0, // 2: in BasisTech but not in Teragram
567 0, // 3: Teragram Latin4
568 0, // 4: Teragram ISO-8859-5
569 0, // 5: Teragram Arabic
570 0, // 6: Teragram Greek
571 0, // 7: Teragram Hebrew
572 0, // 8: in BasisTech but not in Teragram
573 0, // 9: in BasisTech but not in Teragram
574 0, // 10: Teragram EUC_JP
575 0, // 11: Teragram SJS
576 0, // 12: Teragram JIS
577 0, // 13: Teragram BIG5
578 0, // 14: Teragram GB
579 0, // 15: Teragram EUC-CN
580 0, // 16: Teragram KSC
581 0, // 17: Teragram Unicode
582 -1, // 18: Teragram EUC --> 15
583 -1, // 19: Teragram CNS --> 15
584 1, // 20: Teragram BIG5_CP950 --> 13
585 1, // 21: Teragram CP932 --> 11
586 0, // 22: UTF8
587 -1, // 23: UNKNOWN_ENCODING
588 -1, // 24: ISO_8859_1 with all characters <= 127 --> 0
589 0, // 25: Teragram KOI8R
590 0, // 26: Teragram CP1251
591 1, // 27: CP1252 aka MSFT euro ascii --> 0
592 0, // 28: CP21866 aka KOI8_RU, used for Ukrainian
593 0, // 29: CP1250 aka MSFT eastern european
594 1, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
595 0, // 31: used for Turkish
596 1, // 32: used in Baltic countries --> 43
597 0, // 33: aka TIS-620, used for Thai
598 1, // 34: used for Thai --> 33
599 0, // 35: used for Arabic
600 0, // 36: Logical Hebrew Microsoft
601 -1, // 37: Iso Hebrew Logical --> 36
602 -1, // 38: Iso Hebrew Visual --> 36
603 0, // 39: CZECH_CP852
604 1, // 40: aka ISO_IR_139 aka KOI8_CS --> 1
605 0, // 41: used for Greek, NOT superset of 8859-7
606 0, // 42: RUSSIAN_CP866
607 0, // 43: ISO_8859_13
608 0, // 44: ISO_2022_KR
609 1, // 45 GBK --> 14
610 1, // 46 GB18030 --> 14
611 1, // 47 BIG5_HKSCS --> 13
612 1, // 48 ISO_2022_CN --> 44
613 0, // 49 Indic encoding
614 0, // 50 Indic encoding - Tamil
615 0, // 51 Indic encoding - Tamil
616 0, // 52 Indic encoding - Devanagari
617 0, // 53: MACINTOSH_ROMAN
618 0, // 54: UTF7
619 0, // 55 Indic encoding - Devanagari
620 0, // 56 Indic encoding - Devanagari
621 0, // 57: UTF16BE
622 0, // 58: UTF16LE
623 0, // 59: UTF32BE
624 0, // 60: UTF32LE
625 0, // 61: BINARYENC
626 0, // 62: HZ_GB_2312
627 2, // 63: UTF8UTF8
628 0, 0, 0, 0, 0, 0, // 64-69: six more Tamil
629 0, 0, 0, 0, 0, // 70-74: five encodings with emoji
630 };
631
632 COMPILE_ASSERT(arraysize(kMapEncToSuperLevel) == NUM_ENCODINGS,
633 kMapEncToSuperLevel_has_incorrect_size);
634
635
636
637 // Subscripted by Encoding enum value; each entry is a sum of the k*Active / k*Code bit masks
638 static const uint32 kSpecialMask[] = {
639 kHighAccentCode, // 0 ISO_8859_1
640 kHighAccentCode, // 1 ISO_8859_2
641 kHighAccentCode, // 2 ISO_8859_3
642 kHighAccentCode, // 3 ISO_8859_4
643 kHighAlphaCode, // 4 ISO_8859_5
644 kHighAlphaCode, // 5 ISO_8859_6
645 kHighAlphaCode, // 6 ISO_8859_7
646 kHighAlphaCode, // 7 ISO_8859_8
647 kHighAccentCode, // 8 ISO_8859_9
648 kHighAccentCode, // 9 ISO_8859_10
649
650 kTwobyteCode + kEUCJPActive, // 10 euc-jp
651 kTwobyteCode, // 11 Shift-JIS
652 kSevenBitActive + kIso2022Active, // 12 jis
653 kTwobyteCode, // 13 BIG5
654 kTwobyteCode, // 14 GB
655 kTwobyteCode, // 15 EUC-CN
656 kTwobyteCode, // 16 KSC
657 kSevenBitActive + kUTF1632Active, // 17 Unicode
658 kTwobyteCode, // 18 Teragram EUC
659 kTwobyteCode, // 19 Teragram CNS
660
661 kTwobyteCode, // 20 BIG5_CP950
662 kTwobyteCode, // 21 CP932
663 kUTF8Active, // 22 UTF-8
664 0, // 23 UNKNOWN_ENCODING
665 0, // 24 ISO_8859_1 with all characters <= 127
666 kHighAlphaCode, // 25 KOI8R
667 kHighAlphaCode, // 26 CP1251
668 kHighAccentCode, // 27 CP1252
669 kHighAlphaCode, // 28 KOI8_RU
670 kHighAccentCode, // 29 CP1250
671
672 kHighAccentCode, // 30 ISO_8859_1 euroized
673 kHighAccentCode, // 31 used for Turkish
674 kHighAccentCode, // 32 used in Baltic countries
675 kHighAlphaCode, // 33 ISO_8859_11 aka TIS-620
676 kHighAlphaCode, // 34 MSFT_CP874
677 kHighAlphaCode, // 35 MSFT_CP1256
678 kHighAlphaCode, // 36 MSFT_CP1255
679 kHighAlphaCode, // 37 ISO_8859_8_I
680 kHighAlphaCode, // 38 HEBREW_VISUAL
681 0, // 39 CZECH_CP852
682
683 0, // 40 KOI8_CS
684 kHighAlphaCode, // 41 MSFT_CP1253
685 kHighAlphaCode, // 42 RUSSIAN_CP866
686 kHighAccentCode, // 43 ISO_8859_13
687 kSevenBitActive + kIso2022Active, // 44 2022-kr
688 kTwobyteCode, // 45 GBK
689 kTwobyteCode, // 46 GB18030
690 kTwobyteCode, // 47 BIG5_HKSCS
691 kSevenBitActive + kIso2022Active, // 48 2022-cn
692 kHighAlphaCode + kIsIndicCode, // 49 TSCII
693
694 kHighAlphaCode + kIsIndicCode, // 50 TAMIL_MONO
695 kHighAlphaCode + kIsIndicCode, // 51 TAMIL_BI
696 kHighAlphaCode + kIsIndicCode, // 52 JAGRAN
697 kHighAccentCode, // 53 MACINTOSH_ROMAN
698 kSevenBitActive + kUTF7Active, // 54 UTF-7
699 kHighAlphaCode + kIsIndicCode, // 55 BHASKAR Indic encoding - Devanagari
700 kHighAlphaCode + kIsIndicCode, // 56 HTCHANAKYA Indic encoding - Devanagari
701 kSevenBitActive + kUTF1632Active, // 57 UTF16BE
702 kSevenBitActive + kUTF1632Active, // 58 UTF16LE
703 kSevenBitActive + kUTF1632Active, // 59 UTF32BE
704 kSevenBitActive + kUTF1632Active, // 60 UTF32LE
705
706 kSevenBitActive + kBinaryActive, // 61 BINARYENC
707 kSevenBitActive + kHzActive, // 62 HZ_GB_2312
708 kHighAccentCode + kUTF8Active + kUTF8UTF8Active, // 63 UTF8UTF8
709 kHighAlphaCode + kIsIndicCode, // 64 Elango - Tamil
710 kHighAlphaCode + kIsIndicCode, // 65 Barani - Tamil
711 kHighAlphaCode + kIsIndicCode, // 66 Shree - Tamil
712 kHighAlphaCode + kIsIndicCode, // 67 TBoomis - Tamil
713 kHighAlphaCode + kIsIndicCode, // 68 TMNews - Tamil
714 kHighAlphaCode + kIsIndicCode, // 69 Webtamil - Tamil
715 kTwobyteCode, // 70 KDDI Shift_JIS
716 kTwobyteCode, // 71 DoCoMo Shift_JIS
717 kTwobyteCode, // 72 SoftBank Shift_JIS
718 kSevenBitActive + kIso2022Active, // 73 KDDI-ISO-2022-JP
719 kSevenBitActive + kIso2022Active, // 74 SOFTBANK-ISO-2022-JP
720 };
721
722 COMPILE_ASSERT(arraysize(kSpecialMask) == NUM_ENCODINGS,
723 kSpecialMask_has_incorrect_size);
724
725
726 /***
727 kHighAlphaCode -- full alphabet in 8x-Fx range, not just accents
728
729 ISO_8859_5, // 4: Teragram ISO-8859-5 Cyrl UL bd
730 RUSSIAN_CP1251, // 26: Teragram CP1251 UL cdef
731 RUSSIAN_KOI8_R, // 25: Teragram KOI8R LU cdef
732 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, LU cdef
733 RUSSIAN_CP866, // 42 89ae
734
735 ISO_8859_6, // 5: Teragram Arabic nocase cde
736 MSFT_CP1256, // 35: used for Arabic nocase cde
737
738 ISO_8859_7, // 6: Teragram Greek UL cdef
739 MSFT_CP1253, // 41: used for Greek UL cdef
740
741 ISO_8859_8, // 7: Teragram Hebrew nocase ef
742 MSFT_CP1255, // 36: Logical Hebrew Microsoft nocase ef
743 ISO_8859_8_I, // 37: Iso Hebrew Logical nocase ef
744 HEBREW_VISUAL, // 38: Iso Hebrew Visual nocase ef
745
746 ISO_8859_11, // 33: aka TIS-620, used for Thai nocase abcde
747 MSFT_CP874, // 34: used for Thai nocase abcde
748
749 TSCII, // 49 8-f
750 TAMIL_MONO, // 50
751 TAMIL_BI, // 51
752 JAGRAN, // 52
753 BHASKAR, // 55 Indic encoding - Devanagari
754 HTCHANAKYA, // 56 Indic encoding - Devanagari
755 ***/
756
// Scan-stop table for the slow scan, subscripted by byte value.
//   0  keep scanning: TAB, LF, FF, CR, and printable ASCII except '+' and '~'
//   1  exit the scan loop: '+' (0x2B) or '~' (0x7E)
//   2  exit the scan loop: anything else -- NUL, ESC, SO, SI, the other C0
//      controls, DEL, and every byte 0x80-0xFF
// Form Feed (0x0C) is treated as ordinary text because that gives a better
// result for old Ascii text formatted for a TTY.
// Scanning with this table runs at about 500 MB/sec on a 2.8GHz P4.
static const char kTestPrintableAsciiTildePlus[256] = {
  /* 0x00 */ 2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2,
  /* 0x10 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0x20 */ 0,0,0,0,0,0,0,0, 0,0,0,1,0,0,0,0,
  /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,1,2,

  /* 0x80 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0x90 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xA0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xB0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xC0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xD0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xE0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xF0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
};
773
// Scan-stop table for the slow scan once Hz and UTF7 have been pruned away,
// subscripted by byte value.
//   0  keep scanning: TAB, LF, FF, CR, and all printable ASCII 0x20-0x7E
//   2  exit the scan loop: NUL, ESC, SO, SI, the other C0 controls, DEL,
//      and every byte 0x80-0xFF
// Form Feed (0x0C) is allowed here.
// Scanning with this table runs at about 550 MB/sec on a 2.8GHz P4.
static const char kTestPrintableAscii[256] = {
  /* 0x00 */ 2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2,
  /* 0x10 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0x20 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,2,

  /* 0x80 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0x90 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xA0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xB0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xC0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xD0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xE0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xF0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
};
789
790 // Used in first-four-byte testing
// Layout: one entry per byte value, 32 entries per source row.
//   1 = HT (0x09), LF (0x0A), CR (0x0D) and 0x20..0x7E
//   0 = everything else, including FF (0x0C), DEL (0x7F) and 0x80..0xFF
791 static const char kIsPrintableAscii[256] = {
792 0,0,0,0,0,0,0,0, 0,1,1,0,0,1,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x00-0x1F
793 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x20-0x3F
794 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x40-0x5F
795 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0,  // 0x60-0x7F
796
797 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x80-0x9F
798 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xA0-0xBF
799 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xC0-0xDF
800 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xE0-0xFF
801 };
802
803
// Maps a byte value to its 6-bit base64 digit value, or -1 if the byte is
// not a base64 digit. One entry per byte value, 16 entries per source row:
//   'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61,
//   '+' -> 62, '/' -> 63
804 static const signed char kBase64Value[256] = {
805 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,  // 0x00-0x0F
806 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,  // 0x10-0x1F
807 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,62,-1,-1,-1,63,  // 0x20-0x2F  + and /
808 52,53,54,55,56,57,58,59, 60,61,-1,-1,-1,-1,-1,-1,  // 0x30-0x3F  0..9
809
810 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,  // 0x40-0x4F  A..O
811 15,16,17,18,19,20,21,22, 23,24,25,-1,-1,-1,-1,-1,  // 0x50-0x5F  P..Z
812 -1,26,27,28,29,30,31,32, 33,34,35,36,37,38,39,40,  // 0x60-0x6F  a..o
813 41,42,43,44,45,46,47,48, 49,50,51,-1,-1,-1,-1,-1,  // 0x70-0x7F  p..z
814
815 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,  // 0x80-0x8F
816 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,  // 0x90-0x9F
817 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,  // 0xA0-0xAF
818 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,  // 0xB0-0xBF
819
820 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,  // 0xC0-0xCF
821 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,  // 0xD0-0xDF
822 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,  // 0xE0-0xEF
823 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,  // 0xF0-0xFF
824 };
825
826
// Next-state table for a small UTF-8 checking state machine.
// Each row is the current state; each column is the high nibble of the
// byte being examined; each entry is the state to move to. State 7 is the
// error state.
827 // Subscripted by <state, byte/16>
828 // Accepts Cx->8x Dx->8x Ex->8x->8x Fx->8x->8x->8x
829 //
830 // Fixed Problem: GB has sequences like B2DB B8D6 BDE1 B9B9
831 // which we can mis-parse as an error byte followed by good UTF-8:
832 // B2 DBB8 D6BD E1B9B9
833 // To counteract this, we now require an ASCII7 byte to resync out
834 // of the error state
835 // Next problem: good UTF-8 with bad byte
836 // efbc a012 eea4 bee7 b280 c2b7
837 // efbca0 12 eea4be e7b280 c2b7
838 // ^^ bad byte
839 // fix: change state0 byte 1x to be don't-care
840 //
841 // Short UTF-8 ending in ASCII7 byte should resync immediately:
842 // E0 20 E0 A6 AA should give one error and resync at 2nd E0
843 //
// Columns: 0x 1x 2x 3x 4x 5x 6x 7x, 8x 9x Ax Bx Cx Dx Ex Fx
844 static const char kMiniUTF8State[8][16] = {
845 {0,0,0,0,0,0,0,0, 7,7,7,7,1,1,2,4,}, // [0] start char (allow cr/lf/ht)
846 {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [1] continue 1 of 2
847 {0,7,0,0,0,0,0,0, 3,3,3,3,7,7,7,7,}, // [2] continue 1 of 3
848 {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [3] continue 2 of 3
849 {0,7,0,0,0,0,0,0, 5,5,5,5,7,7,7,7,}, // [4] continue 1 of 4
850 {0,7,0,0,0,0,0,0, 6,6,6,6,7,7,7,7,}, // [5] continue 2 of 4
851 {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [6] continue 3 of 4
852 {0,7,0,0,0,0,0,0, 7,7,7,7,7,7,7,7,}, // [7] error, soak up continues,
853 // ONLY resync after Ascii char
854 // then restart
855 };
// Companion to kMiniUTF8State with the same [8][16] shape and row labels;
// presumably subscripted the same way, by <state, byte/16> -- confirm at the
// use site. Each entry names which counter to bump for that transition.
856 // Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
857 static const char kMiniUTF8Count[8][16] = {
858 {0,0,0,0,0,0,0,0, 1,1,1,1,0,0,0,0,}, // [0] start char (allow cr/lf/ht)
859 {1,1,1,1,1,1,1,1, 2,2,2,2,1,1,1,1,}, // [1] continue 1 of 2
860 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [2] continue 1 of 3
861 {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,}, // [3] continue 2 of 3
862 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [4] continue 1 of 4
863 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [5] continue 2 of 4
864 {1,1,1,1,1,1,1,1, 4,4,4,4,1,1,1,1,}, // [6] continue 3 of 4
865 {0,1,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,}, // [7] error, soak up continues,
866 // then restart
867 };
868
869 // Subscripted by <state, f(byte1) + g(byte2)>
870 // where f(x)= E2->4, Cx->8 and C3->12 and 0 otherwise
871 // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
872 // (no checking for illegal bytes)
873 // Here are example patterns of CP1252 converted to UTF-8 0/1/2 times. We want
874 // to detect two, so we can back-convert to one.
875 // zero one two pattern
876 // ---- ------ ---------------- -----------------
877 // 81 C281 C382C281 C3->8x->C2->xx
878 // 98 CB9C C38BC593 C3->8x->C5->xx
879 // C3 C383 C383C692 C3->8x->C6->xx
880 // C8 C388 C383CB86 C3->8x->CB->xx
881 // 83 C692 C386E28099 C3->8x->E2->xx->8x
882 // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx
883 // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx
884 //
885 // We also want to detect bare-byte extra UTF-8 conversions:
886 // zero one two pattern
887 // ---- ------ ---------------- -----------------
888 // C3 C3 C383 C3->8x->C2->xx
889 // D3 D3 C393 C3->9x->C2->xx->C2->xx
890 // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx
891 // F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
892 //
893
894 /**
895 CP1252 => UTF8 => UTF8UTF8
896 80 => E282AC => C3A2E2809AC2AC
897 81 => C281 => C382C281
898 82 => E2809A => C3A2E282ACC5A1
899 83 => C692 => C386E28099
900 84 => E2809E => C3A2E282ACC5BE
901 85 => E280A6 => C3A2E282ACC2A6
902 86 => E280A0 => C3A2E282ACC2A0
903 87 => E280A1 => C3A2E282ACC2A1
904 88 => CB86 => C38BE280A0
905 89 => E280B0 => C3A2E282ACC2B0
906 8A => C5A0 => C385C2A0
907 8B => E280B9 => C3A2E282ACC2B9
908 8C => C592 => C385E28099
909 8D => C28D => C382C28D
910 8E => C5BD => C385C2BD
911 8F => C28F => C382C28F
912 90 => C290 => C382C290
913 91 => E28098 => C3A2E282ACCB9C
914 92 => E28099 => C3A2E282ACE284A2
915 93 => E2809C => C3A2E282ACC593
916 94 => E2809D => C3A2E282ACC29D
917 95 => E280A2 => C3A2E282ACC2A2
918 96 => E28093 => C3A2E282ACE2809C
919 97 => E28094 => C3A2E282ACE2809D
920 98 => CB9C => C38BC593
921 99 => E284A2 => C3A2E2809EC2A2
922 9A => C5A1 => C385C2A1
923 9B => E280BA => C3A2E282ACC2BA
924 9C => C593 => C385E2809C
925 9D => C29D => C382C29D
926 9E => C5BE => C385C2BE
927 9F => C5B8 => C385C2B8
928 A0 => C2A0 => C382C2A0
929 A1 => C2A1 => C382C2A1
930 A2 => C2A2 => C382C2A2
931 A3 => C2A3 => C382C2A3
932 A4 => C2A4 => C382C2A4
933 A5 => C2A5 => C382C2A5
934 A6 => C2A6 => C382C2A6
935 A7 => C2A7 => C382C2A7
936 A8 => C2A8 => C382C2A8
937 A9 => C2A9 => C382C2A9
938 AA => C2AA => C382C2AA
939 AB => C2AB => C382C2AB
940 AC => C2AC => C382C2AC
941 AD => C2AD => C382C2AD
942 AE => C2AE => C382C2AE
943 AF => C2AF => C382C2AF
944 B0 => C2B0 => C382C2B0
945 B1 => C2B1 => C382C2B1
946 B2 => C2B2 => C382C2B2
947 B3 => C2B3 => C382C2B3
948 B4 => C2B4 => C382C2B4
949 B5 => C2B5 => C382C2B5
950 B6 => C2B6 => C382C2B6
951 B7 => C2B7 => C382C2B7
952 B8 => C2B8 => C382C2B8
953 B9 => C2B9 => C382C2B9
954 BA => C2BA => C382C2BA
955 BB => C2BB => C382C2BB
956 BC => C2BC => C382C2BC
957 BD => C2BD => C382C2BD
958 BE => C2BE => C382C2BE
959 BF => C2BF => C382C2BF
960 C0 => C380 => C383E282AC
961 C1 => C381 => C383C281
962 C2 => C382 => C383E2809A
963 C3 => C383 => C383C692
964 C4 => C384 => C383E2809E
965 C5 => C385 => C383E280A6
966 C6 => C386 => C383E280A0
967 C7 => C387 => C383E280A1
968 C8 => C388 => C383CB86
969 C9 => C389 => C383E280B0
970 CA => C38A => C383C5A0
971 CB => C38B => C383E280B9
972 CC => C38C => C383C592
973 CD => C38D => C383C28D
974 CE => C38E => C383C5BD
975 CF => C38F => C383C28F
976 D0 => C390 => C383C290
977 D1 => C391 => C383E28098
978 D2 => C392 => C383E28099
979 D3 => C393 => C383E2809C
980 D4 => C394 => C383E2809D
981 D5 => C395 => C383E280A2
982 D6 => C396 => C383E28093
983 D7 => C397 => C383E28094
984 D8 => C398 => C383CB9C
985 D9 => C399 => C383E284A2
986 DA => C39A => C383C5A1
987 DB => C39B => C383E280BA
988 DC => C39C => C383C593
989 DD => C39D => C383C29D
990 DE => C39E => C383C5BE
991 DF => C39F => C383C5B8
992 E0 => C3A0 => C383C2A0
993 E1 => C3A1 => C383C2A1
994 E2 => C3A2 => C383C2A2
995 E3 => C3A3 => C383C2A3
996 E4 => C3A4 => C383C2A4
997 E5 => C3A5 => C383C2A5
998 E6 => C3A6 => C383C2A6
999 E7 => C3A7 => C383C2A7
1000 E8 => C3A8 => C383C2A8
1001 E9 => C3A9 => C383C2A9
1002 EA => C3AA => C383C2AA
1003 EB => C3AB => C383C2AB
1004 EC => C3AC => C383C2AC
1005 ED => C3AD => C383C2AD
1006 EE => C3AE => C383C2AE
1007 EF => C3AF => C383C2AF
1008 F0 => C3B0 => C383C2B0
1009 F1 => C3B1 => C383C2B1
1010 F2 => C3B2 => C383C2B2
1011 F3 => C3B3 => C383C2B3
1012 F4 => C3B4 => C383C2B4
1013 F5 => C3B5 => C383C2B5
1014 F6 => C3B6 => C383C2B6
1015 F7 => C3B7 => C383C2B7
1016 F8 => C3B8 => C383C2B8
1017 F9 => C3B9 => C383C2B9
1018 FA => C3BA => C383C2BA
1019 FB => C3BB => C383C2BB
1020 FC => C3BC => C383C2BC
1021 FD => C3BD => C383C2BD
1022 FE => C3BE => C383C2BE
1023 FF => C3BF => C383C2BF
1024 **/
1025
1026 // Subscripted by <state, f(byte1) + g(byte2)>
1027 // where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise
1028 // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
1029
1030 // 81 C281 C382C281 C3->8x->C2->xx
1031 // 98 CB9C C38BC593 C3->8x->C5->xx
1032 // C3 C383 C383C692 C3->8x->C6->xx
1033 // C8 C388 C383CB86 C3->8x->CB->xx
1034 // [0] [2] [0]
1035 // 83 C692 C386E28099 C3->8x->E2->xx->xx
1036 // odd_byte=0 [0] [2] [0+] odd_byte flipped
1037 // odd_byte=1 [0+] [2] [0] [0] odd_byte unflipped
1038 // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx
1039 // odd_byte=0 [0] [3] [4] [0+]
1040 // odd_byte=1 [0+] [3] [4] [4] [0]
1041 // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx
1042 // odd_byte=0 [0] [3] [4] [0] [0]
1043 // odd_byte=1 [0+] [3] [4] [4] [0+]
1044 //
1045 // When an E2xxxx sequence is encountered, we absorb the two bytes E2xx and flip
1046 // the odd_byte state. If that goes from 0 to 1, the next pair is offset up
1047 // by one byte, picking up the two bytes just after E2xxxx. If odd_byte goes
1048 // from 1 to 0, the next two bytes picked up are the two bytes xxxx of E2xxxx.
1049 // These are absorbed with no error in state 0 or state 4
1050 //
1051 // C3 C3 C383 C3->8x->C2->xx
1052 // D3 D3 C393 C3->9x->C2->xx->C2->xx
1053 // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx
1054 // F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
1055 // Counter3 for Fx Ex sequences is incremented at last C2
1056
// Next-state table for detecting text that was converted to UTF-8 twice.
// Rows are the current state. The four column groups below (four entries
// each) carry the labels xxxx, E2xx, CXxx and C3xx; within a group the
// entry is selected by the 8/9/a/b value shown in the second header line.
// NOTE(review): the header shows only three "8 9 a b" groups for four
// column groups -- likely lost alignment; confirm against UTF88Sub().
1057 static const char kMiniUTF8UTF8State[8][16] = {
1058 // xxxx E2xx CXxx C3xx
1059 // 8 9 a b 8 9 a b 8 9 a b
1060 {0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,}, // [0] looking for C38x/C3Ax/2020/8x 8x, or err
1061 {0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,}, // [1] error, back to looking
1062 {1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [2] C38x looking for CXxx/E2xxxx
1063 // + + + + // E2xxxx flips odd_byte
1064 {1,1,1,1,4,4,4,4, 7,7,7,7,1,1,1,1,}, // [3] C3Ax looking for E2xx or C2xx C2xx
1065 // + + + + // E2xxxx flips odd_byte
1066 {4,4,4,4,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx
1067 // + + + + // E2xxxx flips odd_byte
1068 {1,1,1,1,1,1,1,1, 6,6,6,6,1,1,1,1,}, // [5] C3Bx -- looking for C2xxC2xxC2xx
1069 {1,1,1,1,1,1,1,1, 7,7,7,7,1,1,1,1,}, // [6] C3Bx -- looking for C2xxC2xx
1070 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [7] C3Bx -- looking for C2xx
1071 };
// Companion to kMiniUTF8UTF8State with the same [8][16] shape and row
// labels; each entry names which counter to bump for that transition.
1072 // Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
1073 static const char kMiniUTF8UTF8Count[8][16] = {
1074 // xxxx E2xx C2Xx C3xx
1075 // 8 9 a b 8 9 a b 8 9 a b
1076 {0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,}, // [0] looking for C38x/C3Ax/2020/8x 8x, or err
1077 {0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,}, // [1] error, back to looking
1078 {1,1,1,1,3,3,3,3, 2,2,2,2,1,1,1,1,}, // [2] C38x looking for CXxx/E2xxxx
1079 // + + + + // E2xxxx flips odd_byte
1080 {1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [3] C3Ax looking for E2xx
1081 // + + + + // E2xxxx flips odd_byte
1082 {1,1,1,1,4,4,4,4, 4,4,4,4,1,1,1,1,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx
1083 // + + + + // E2xxxx flips odd_byte
1084 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [5] C3Bx -- looking for C2xxC2xxC2xx
1085 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [6] C3Bx -- looking for C2xxC2xx
1086 {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,}, // [7] C3Bx -- looking for C2xx
1087 };
1088
// Companion to kMiniUTF8UTF8State with the same [8][16] shape and row
// labels. Entries are 1 only in the E2xx columns of states [2], [3] and
// [4], the transitions the row comments mark as "E2xxxx flips odd_byte".
1089 static const char kMiniUTF8UTF8Odd[8][16] = {
1090 // xxxx E2xx C2Xx C3xx
1091 // 8 9 a b 8 9 a b 8 9 a b
1092 {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [0] looking for C38x/C3Ax/2020/8x 8x, or err
1093 {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [1] error, back to looking
1094 {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [2] C38x looking for CXxx/E2xxxx
1095 // + + + + // E2xxxx flips odd_byte
1096 {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [3] C3Ax looking for E2xx
1097 // + + + + // E2xxxx flips odd_byte
1098 {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx
1099 // + + + + // E2xxxx flips odd_byte
1100 {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [5] C3Bx -- looking for C2xxC2xxC2xx
1101 {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [6] C3Bx -- looking for C2xxC2xx
1102 {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [7] C3Bx -- looking for C2xx
1103 };
1104
// Turn a pair of bytes into the column subscript (0..15) for the UTF8UTF8
// tables above.
//   s0 selects the column group: C3 -> 12, C2/C5/C6/CB -> 8, E2 -> 4,
//      anything else -> 0.
//   s1 selects the column within the group from bits 5..4 of the byte
//      (8x -> 0, 9x -> 1, Ax -> 2, Bx -> 3).
// Both bytes are converted to unsigned before use, so the shift never
// operates on a negative value (right-shifting a negative value is
// implementation-defined before C++20).
int UTF88Sub(char s0, char s1) {
  const unsigned char u0 = static_cast<unsigned char>(s0);
  const unsigned char u1 = static_cast<unsigned char>(s1);
  int sub = (u1 >> 4) & 0x03;
  switch (u0) {
    case 0xc3:
      sub += 12;
      break;
    case 0xc2:
    case 0xc5:
    case 0xc6:
    case 0xcb:
      sub += 8;
      break;
    case 0xe2:
      sub += 4;
      break;
    default:
      break;  // All other lead bytes use column group 0
  }
  return sub;
}
1120
1121
1122
1123
1124
1125 // Default probability for an encoding rankedencoding
1126 // Based on a scan of 55M web pages
1127 // These values are 255 - log base 2**1/10 (occurrences / total)
1128 // Large values are most likely. This is the reverse of some Google code
1129 // 255 = 1.0, 245 = 1/2, 235 = 1/4, 15 = 1/2**24, 0 = 0 (< 1/50M)
1130 //
1131 // TODO change this to be per encoding, not permuted
1132 //
1133
1134
1135 // Support function for unit test program
1136 // Return ranked encoding corresponding to enc
1137 // (also exported to compact_enc_det_text.cc)
1138 int CompactEncDet::BackmapEncodingToRankedEncoding(Encoding enc) {
1139 for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
1140 if (kMapToEncoding[i] == enc) {
1141 return i;
1142 }
1143 }
1144 return -1;
1145 }
1146
1147
1148 string DecodeActive(uint32 active) {
1149 string temp("");
1150 if (active & kBinaryActive) {
1151 temp.append("Binary ");
1152 }
1153 if (active & kUTF1632Active) {
1154 temp.append("UTF1632 ");
1155 }
1156 if (active & kUTF8UTF8Active) {
1157 temp.append("UTF8UTF8 ");
1158 }
1159 if (active & kUTF8Active) {
1160 temp.append("UTF8 ");
1161 }
1162 if (active & kIso2022Active) {
1163 temp.append("Iso2022 ");
1164 }
1165 if (active & kHzActive) {
1166 temp.append("Hz ");
1167 }
1168 if (active & kUTF7Active) {
1169 temp.append("UTF7A ");
1170 }
1171 if (active & kSevenBitActive) {
1172 temp.append("SevenBit ");
1173 }
1174 if (active & kIsIndicCode) {
1175 temp.append("Indic ");
1176 }
1177 if (active & kHighAlphaCode) {
1178 temp.append("HighAlpha ");
1179 }
1180 if (active & kHighAccentCode) {
1181 temp.append("HighAccent ");
1182 }
1183 if (active & kEUCJPActive) {
1184 temp.append("EUCJP ");
1185 }
1186 return temp;
1187 }
1188
1189 static inline bool SevenBitEncoding(int enc) {
1190 return ((kSpecialMask[enc] & kSevenBitActive) != 0);
1191 }
1192 static inline bool TwoByteEncoding(int enc) {
1193 return ((kSpecialMask[enc] & kTwobyteCode) != 0);
1194 }
1195 static inline bool IndicEncoding(int enc) {
1196 return ((kSpecialMask[enc] & kIsIndicCode) != 0);
1197 }
1198 static inline bool HighAlphaEncoding(int enc) {
1199 return ((kSpecialMask[enc] & kHighAlphaCode) != 0);
1200 }
1201 static inline bool HighAccentEncoding(int enc) {
1202 return ((kSpecialMask[enc] & kHighAccentCode) != 0);
1203 }
1204
1205
1206 static inline bool AnyActive(DetectEncodingState* destatep) {
1207 return (destatep->active_special != 0);
1208 }
1209 static inline bool SevenBitActive(DetectEncodingState* destatep) {
1210 return (destatep->active_special & kSevenBitActive) != 0;
1211 }
1212
1213 static inline bool UTF7Active(DetectEncodingState* destatep) {
1214 return (destatep->active_special & kUTF7Active) != 0;
1215 }
1216
1217 static inline bool HzActive(DetectEncodingState* destatep) {
1218 return (destatep->active_special & kHzActive) != 0;
1219 }
1220 static inline bool Iso2022Active(DetectEncodingState* destatep) {
1221 return (destatep->active_special & kIso2022Active) != 0;
1222 }
1223 static inline bool UTF8Active(DetectEncodingState* destatep) {
1224 return (destatep->active_special & kUTF8Active) != 0;
1225 }
1226 static inline bool UTF8UTF8Active(DetectEncodingState* destatep) {
1227 return (destatep->active_special & kUTF8UTF8Active) != 0;
1228 }
1229 static inline bool UTF1632Active(DetectEncodingState* destatep) {
1230 return (destatep->active_special & kUTF1632Active) != 0;
1231 }
1232 static inline bool BinaryActive(DetectEncodingState* destatep) {
1233 return (destatep->active_special & kBinaryActive) != 0;
1234 }
1235 static inline bool UTF7OrHzActive(DetectEncodingState* destatep) {
1236 return (destatep->active_special & (kHzActive + kUTF7Active)) != 0;
1237 }
1238 static inline bool EUCJPActive(DetectEncodingState* destatep) {
1239 return ((destatep->active_special & kEUCJPActive) != 0);
1240 }
1241 static inline bool OtherActive(DetectEncodingState* destatep) {
1242 return (destatep->active_special & (kIso2022Active + kBinaryActive +
1243 kUTF8Active + kUTF8UTF8Active +
1244 kUTF1632Active + kEUCJPActive)) != 0;
1245 }
1246
1247
1248 static inline bool CEDFlagRescanning(CEDInternalFlags flags) {
1249 return (flags & kCEDRescanning) != 0;
1250 }
1251
1252 static inline bool CEDFlagSlowscore(CEDInternalFlags flags) {
1253 return (flags & kCEDSlowscore) != 0;
1254 }
1255
1256 static inline bool CEDFlagForceTags(CEDInternalFlags flags) {
1257 return (flags & kCEDForceTags) != 0;
1258 }
1259
1260
// Larger of two ints; returns b when the two are equal.
static inline int maxint(int a, int b) {
  if (a > b) {
    return a;
  }
  return b;
}

// Smaller of two ints; returns b when the two are equal.
static inline int minint(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
1263
1264 static inline const char* MyRankedEncName(int r_enc) {
1265 return MyEncodingName(kMapToEncoding[r_enc]);
1266 }
1267
1268
1269 // Only for debugging. not thread safe
// State shared by PsSourceInit/PsSourceFinish/PsSource/PsMark/PsHighlight.
1270 static const int kPsSourceWidth = 32;
1271 static int pssourcenext = 0; // debug only. not threadsafe. dump only >= this
1272 static int pssourcewidth = 0; // debug only.
// Mark line for the current source row: 2 chars per source byte plus 8
// chars of overscan. Allocated in PsSourceInit, freed in PsSourceFinish.
1273 static char* pssource_mark_buffer = NULL;
// Count of source rows dumped so far, and the starting offsets of the most
// recent rows, stored at subscript (row number & 0x0f).
// NOTE(review): these two are not static, unlike the variables above --
// confirm whether another file refers to them.
1274 int next_do_src_line;
1275 int do_src_offset[16];
1276
1277
1278 void PsSourceInit(int len) {
1279 pssourcenext = 0;
1280 pssourcewidth = len;
1281 delete[] pssource_mark_buffer;
1282 // Allocate 2 Ascii characters per input byte
1283 pssource_mark_buffer = new char[(pssourcewidth * 2) + 8]; // 8 = overscan
1284 memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
1285 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
1286
1287 next_do_src_line = 0;
1288 memset(do_src_offset, 0, sizeof(do_src_offset));
1289 }
1290
1291 void PsSourceFinish() {
1292 // Print preceding mark buffer
1293 int j = (pssourcewidth * 2) - 1;
1294 while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim
1295 pssource_mark_buffer[j + 1] = '\0';
1296 fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer);
1297 memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
1298 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
1299
1300 delete[] pssource_mark_buffer;
1301 pssource_mark_buffer = NULL;
1302 }
1303
1304 // Dump aligned len bytes src... if not already dumped
1305 void PsSource(const uint8* src, const uint8* isrc, const uint8* srclimit) {
1306 int offset = src - isrc;
1307 offset -= (offset % pssourcewidth); // round down to multiple of len bytes
1308 if (offset < pssourcenext) {
1309 return;
1310 }
1311 pssourcenext = offset + pssourcewidth; // Min offset for next dump
1312
1313 // Print preceding mark buffer
1314 int j = (pssourcewidth * 2) - 1;
1315 while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim
1316 pssource_mark_buffer[j + 1] = '\0';
1317 fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer);
1318 memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
1319 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
1320
1321 // Print source bytes
1322 const uint8* src_aligned = isrc + offset;
1323 int length = srclimit - src_aligned;
1324 length = minint(pssourcewidth, length);
1325
1326 fprintf(stderr, "(%05x ", offset);
1327 for (int i = 0; i < length; ++i) {
1328 char c = src_aligned[i];
1329 if (c == '\n') {c = ' ';}
1330 if (c == '\r') {c = ' ';}
1331 if (c == '\t') {c = ' ';}
1332 if (c == '(') {
1333 fprintf(stderr, "%s", "\\( ");
1334 } else if (c == ')') {
1335 fprintf(stderr, "%s", "\\) ");
1336 } else if (c == '\\') {
1337 fprintf(stderr, "%s", "\\\\ ");
1338 } else if ((0x20 <= c) && (c <= 0x7e)) {
1339 fprintf(stderr, "%c ", c);
1340 } else {
1341 fprintf(stderr, "%02x", c);
1342 }
1343 }
1344 fprintf(stderr, ") do-src\n");
1345 // Remember which source offsets are where, mod 16
1346 do_src_offset[next_do_src_line & 0x0f] = offset;
1347 ++next_do_src_line;
1348 }
1349
1350 // Mark bytes in just-previous source bytes
1351 void PsMark(const uint8* src, int len, const uint8* isrc, int weightshift) {
1352 int offset = src - isrc;
1353 offset = (offset % pssourcewidth); // mod len bytes
1354 char mark = (weightshift == 0) ? '-' : 'x';
1355
1356 pssource_mark_buffer[(offset * 2)] = '=';
1357 pssource_mark_buffer[(offset * 2) + 1] = '=';
1358 for (int i = 1; i < len; ++i) {
1359 pssource_mark_buffer[(offset + i) * 2] = mark;
1360 pssource_mark_buffer[((offset + i) * 2) + 1] = mark;
1361 }
1362 }
1363
1364
1365 // Highlight trigram bytes in just-previous source bytes
1366 // Unfortunately, we have to skip back N lines since source was printed for
1367 // up to 8 bigrams before we get here. Match on src+1 to handle 0/31 better
1368 void PsHighlight(const uint8* src, const uint8* isrc, int trigram_val, int n) {
1369 int offset = (src + 1) - isrc;
1370 int offset32 = (offset % pssourcewidth); // mod len bytes
1371 offset -= offset32; // round down to multiple of len bytes
1372
1373 for (int i = 1; i <= 16; ++i) {
1374 if (do_src_offset[(next_do_src_line - i) & 0x0f] == offset) {
1375 fprintf(stderr, "%d %d %d do-highlight%d\n",
1376 i, offset32 - 1, trigram_val, n);
1377 break;
1378 }
1379 }
1380 }
1381
1382
1383 void InitDetectEncodingState(DetectEncodingState* destatep) {
1384 destatep->initial_src = NULL; // Filled in by caller
1385 destatep->limit_src = NULL;
1386 destatep->prior_src = NULL;
1387 destatep->last_pair = NULL;
1388
1389 destatep->debug_data = NULL;
1390 destatep->next_detail_entry = 0;
1391
1392 destatep->done = false;
1393 destatep->reliable = false;
1394 destatep->hints_derated = false;
1395 //destatep->declared_enc_1 init in ApplyHints
1396 //destatep->declared_enc_2 init in ApplyHints
1397 destatep->prune_count = 0;
1398
1399 destatep->trigram_highwater_mark = 0;
1400 destatep->looking_for_latin_trigrams = false;
1401 destatep->do_latin_trigrams = false;
1402
1403 // Miscellaneous state variables for difficult encodings
1404 destatep->binary_quadrants_count = 0;
1405 destatep->binary_8x4_count = 0;
1406 destatep->binary_quadrants_seen = 0;
1407 destatep->binary_8x4_seen = 0;
1408 destatep->utf7_starts = 0;
1409 destatep->prior_utf7_offset = 0;
1410 destatep->next_utf8_ministate = 0;
1411 for (int i = 0; i < 6; i++) {destatep->utf8_minicount[i] = 0;}
1412 destatep->next_utf8utf8_ministate = 0;
1413 destatep->utf8utf8_odd_byte = 0;
1414 for (int i = 0; i < 6; i++) {destatep->utf8utf8_minicount[i] = 0;}
1415 destatep->next_2022_state = SOSI_NONE;
1416 destatep->next_hz_state = SOSI_NONE;
1417 destatep->next_eucjp_oddphase = false;
1418 for (int i = 0; i < 8; i++) {destatep->byte32_count[i] = 0;}
1419 destatep->active_special = 0xffffffff;
1420 destatep->tld_hint = UNKNOWN_ENCODING;
1421 destatep->http_hint = UNKNOWN_ENCODING;
1422 destatep->meta_hint = UNKNOWN_ENCODING;
1423 destatep->bom_hint = UNKNOWN_ENCODING;
1424 destatep->top_rankedencoding = 0; // ASCII [seven-bit] is the default
1425 destatep->second_top_rankedencoding = 0; // ASCII [seven-bit] is the default
1426 destatep->top_prob = -1;
1427 destatep->second_top_prob = -1;
1428 // This is wide for first pruning, shrinks for 2nd and later
1429 destatep->prune_difference = kInititalPruneDifference;
1430
1431 destatep->next_prior_bigram = 0;
1432 destatep->prior_bigram[0] = -1;
1433 destatep->prior_bigram[1] = -1;
1434 destatep->prior_bigram[2] = -1;
1435 destatep->prior_bigram[3] = -1;
1436
1437 destatep->prior_binary[0] = -1;
1438
1439 // Initialize with all but Indic encodings, which we never detect
1440 int k = 0;
1441 for (int rankedencoding = 0;
1442 rankedencoding < NUM_RANKEDENCODING;
1443 rankedencoding++) {
1444 Encoding enc = kMapToEncoding[rankedencoding];
1445 if (!IndicEncoding(enc)) {
1446 destatep->rankedencoding_list[k++] = rankedencoding;
1447 }
1448 }
1449 destatep->rankedencoding_list_len = k;
1450
1451 // This is where all the action is
1452 memset(destatep->enc_prob, 0, sizeof(destatep->enc_prob));
1453
1454 memset(destatep->hint_prob, 0, sizeof(destatep->hint_prob));
1455 memset(destatep->hint_weight, 0, sizeof(destatep->hint_weight));
1456
1457 destatep->prior_interesting_pair[AsciiPair] = 0;
1458 destatep->prior_interesting_pair[OtherPair] = 0;
1459 destatep->next_interesting_pair[AsciiPair] = 0;
1460 destatep->next_interesting_pair[OtherPair] = 0;
1461 // interesting_pairs/offsets/weightshifts not initialized; no need
1462 }
1463
1464 // Probability strings are uint8, with zeros removed via simple run-length:
1465 // (<skip-take byte> <data bytes>)*
1466 // skip-take:
1467 // 00 end
1468 // x0 skip 16 x locations, take 0 data values
1469 // xy skip x locations, take y data values
1470 // Multiply all the incoming values by 3 to account for 3x unigram sums
1471 //
1472 // {{0x77,0x69,0x6e,0x64,0x31,0x32,0x35,0x35,
1473 // 0x01,0xc2,0x10,0x41,0xfe,0x71,0xba,0x00,}}, // "wind1255"
1474 //
1475 // Weight is 0..100 percent
1476 //
1477 // Returns subscript of largest (most probable) value
1478 //
1479
1480
1481 // {{0x6e,0x6c,0x5f,0x5f, 0x05,0xb2,0xae,0xa0,0x32,0xa1,0x36,0x31,0x42,0x39,0x3 b,0x33,0x45,0x11,0x6f,0x00,}}, // "nl__"
1482 // // ASCII-7-bit=178 Latin1=174 UTF8=160 GB=50 CP1252=161 BIG5=49 Latin2=66 CP1251=57 CP1256=59 CP1250=51 Latin5=69 ISO-8859-15=111 [top ASC II-7-bit]
1483 int ApplyCompressedProb(const char* iprob, int len,
1484 int weight, DetectEncodingState* destatep) {
1485 int* dst = &destatep->enc_prob[0];
1486 int* dst2 = &destatep->hint_weight[0];
1487 const uint8* prob = reinterpret_cast<const uint8*>(iprob);
1488 const uint8* problimit = prob + len;
1489
1490 int largest = -1;
1491 int subscript_of_largest = 0;
1492
1493 // Continue with first byte and subsequent ones
1494 while (prob < problimit) {
1495 int skiptake = *prob++;
1496 int skip = (skiptake & 0xf0) >> 4;
1497 int take = skiptake & 0x0f;
1498 if (skiptake == 00) {
1499 break;
1500 } else if (take == 0) {
1501 dst += (skip << 4);
1502 dst2 += (skip << 4);
1503 } else {
1504 dst += skip; // Normal case
1505 dst2 += skip; // Normal case
1506 for (int i = 0; i < take; i++) {
1507 int enc = static_cast<int>(dst - &destatep->enc_prob[0]) + i;
1508 if (largest < prob[i]) {
1509 largest = prob[i];
1510 subscript_of_largest = enc;
1511 }
1512
1513 int increment = prob[i] * 3; // The actual increment
1514
1515 // Do maximum of previous hints plus this new one
1516 if (weight > 0) {
1517 increment = (increment * weight) / 100;
1518 dst[i] = maxint(dst[i], increment);
1519 dst2[i] = 1; // New total weight
1520 }
1521 }
1522 prob += take;
1523 dst += take;
1524 dst2 += take;
1525 }
1526 }
1527 return subscript_of_largest;
1528 }
1529
1530
// Returns subscript of largest (most probable) value [for unit test]
// iprob is a compressed probability string of at most len bytes, in the
// skip-take format: each control byte has the skip count in its high nibble
// and the take count in its low nibble; 00 ends the string, and a zero take
// count means skip 16 * skip locations.
// Returns 0 when the string holds no data value greater than zero.
// A control byte that asks for more data bytes than remain before len is
// cut short, so the string is never read past its end.
int TopCompressedProb(const char* iprob, int len) {
  const unsigned char* prob = reinterpret_cast<const unsigned char*>(iprob);
  const unsigned char* problimit = prob + len;
  int next_prob_sub = 0;
  int topprob = 0;
  int toprankenc = 0;

  while (prob < problimit) {
    const int skiptake = *prob++;
    if (skiptake == 0) {
      break;                            // End marker
    }
    const int skip = (skiptake & 0xf0) >> 4;
    int take = skiptake & 0x0f;
    if (take == 0) {
      next_prob_sub += (skip << 4);     // Long skip, no data
      continue;
    }

    next_prob_sub += skip;              // Normal case
    const int remaining = static_cast<int>(problimit - prob);
    if (take > remaining) {
      take = remaining;                 // Truncated string
    }
    for (int i = 0; i < take; i++) {
      if (topprob < prob[i]) {
        topprob = prob[i];
        toprankenc = next_prob_sub + i;
      }
    }
    prob += take;
    next_prob_sub += take;
  }
  return toprankenc;
}
1561
1562
1563 // Find subscript of matching key in first 8 bytes of sorted hint array, or -1
1564 int HintBinaryLookup8(const HintEntry* hintprobs, int hintprobssize,
1565 const char* norm_key) {
1566 // Key is always in range [lo..hi)
1567 int lo = 0;
1568 int hi = hintprobssize;
1569 while (lo < hi) {
1570 int mid = (lo + hi) >> 1;
1571 int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 8);
1572 if (comp < 0) {
1573 lo = mid + 1;
1574 } else if (comp > 0) {
1575 hi = mid;
1576 } else {
1577 return mid;
1578 }
1579 }
1580 return -1;
1581 }
1582
1583 // Find subscript of matching key in first 4 bytes of sorted hint array, or -1
1584 int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
1585 const char* norm_key) {
1586 // Key is always in range [lo..hi)
1587 int lo = 0;
1588 int hi = hintprobssize;
1589 while (lo < hi) {
1590 int mid = (lo + hi) >> 1;
1591 int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 4);
1592 if (comp < 0) {
1593 lo = mid + 1;
1594 } else if (comp > 0) {
1595 hi = mid;
1596 } else {
1597 return mid;
1598 }
1599 }
1600 return -1;
1601 }
1602
1603 static inline void Boost(DetectEncodingState* destatep, int r_enc, int boost) {
1604 destatep->enc_prob[r_enc] += boost;
1605 }
1606
1607 static inline void Whack(DetectEncodingState* destatep, int r_enc, int whack) {
1608 destatep->enc_prob[r_enc] -= whack;
1609 }
1610
1611 // Apply initial probability hint based on top level domain name
1612 // Weight is 0..100 percent
1613 // Return 1 if name match found
1614 int ApplyTldHint(const char* url_tld_hint, int weight,
1615 DetectEncodingState* destatep) {
1616 if (url_tld_hint[0] == '~') {
1617 return 0;
1618 }
1619 string normalized_tld = MakeChar4(string(url_tld_hint));
1620 int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
1621 normalized_tld.c_str());
1622 if (n >= 0) {
1623 // TLD is four bytes, probability table is ~12 bytes
1624 int best_sub = ApplyCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey],
1625 kMaxTldVector, weight, destatep);
1626 // Never boost ASCII7; do CP1252 instead
1627 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
1628 destatep->declared_enc_1 = best_sub;
1629 if (destatep->debug_data != NULL) {
1630 // Show TLD hint
1631 SetDetailsEncProb(destatep, 0, best_sub, url_tld_hint);
1632 }
1633 return 1;
1634 }
1635 return 0;
1636 }
1637
1638 // Apply initial probability hint based on charset= name
1639 // Weight is 0..100 percent
1640 // Return 1 if name match found
1641 int ApplyCharsetHint(const char* charset_hint, int weight,
1642 DetectEncodingState* destatep) {
1643 if (charset_hint[0] == '~') {
1644 return 0;
1645 }
1646 string normalized_charset = MakeChar44(string(charset_hint));
1647 int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,
1648 normalized_charset.c_str());
1649 if (n >= 0) {
1650 // Charset is eight bytes, probability table is ~eight bytes
1651 int best_sub = ApplyCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharse tKey],
1652 kMaxCharsetVector, weight, destatep);
1653 // Never boost ASCII7; do CP1252 instead
1654 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
1655 destatep->declared_enc_1 = best_sub;
1656
1657 // If first explicitly declared charset is confusable with Latin1/1252, put
1658 // both declared forms in declared_enc_*, displacing Latin1/1252.
1659 // This avoids a bit of Latin1 creep.
1660 // Also boost the declared encoding and its pair
1661 // TODO (dsites) This should all be folded into postproc-enc-detect.cc
1662 if ((destatep->http_hint == UNKNOWN_ENCODING) &&
1663 (destatep->meta_hint == UNKNOWN_ENCODING)) {
1664 // This is the first charset=hint
1665 switch (best_sub) {
1666 case F_Latin2: // 8859-2 Latin2, east euro
1667 destatep->declared_enc_2 = F_CP1250;
1668 Boost(destatep, F_Latin2, kGentleOnePair);
1669 Boost(destatep, F_CP1250, kGentleOnePair);
1670 break;
1671 case F_CP1250:
1672 destatep->declared_enc_2 = F_Latin2;
1673 Boost(destatep, F_Latin2, kGentleOnePair);
1674 Boost(destatep, F_CP1250, kGentleOnePair);
1675 break;
1676
1677 case F_Latin3: // 8859-3 Latin3, south euro, Esperanto
1678 destatep->declared_enc_2 = F_ASCII_7_bit;
1679 Boost(destatep, F_Latin3, kGentleOnePair);
1680 break;
1681
1682 case F_Latin4: // 8859-4 Latin4, north euro
1683 destatep->declared_enc_2 = F_ASCII_7_bit;
1684 Boost(destatep, F_Latin4, kGentleOnePair);
1685 break;
1686
1687 case F_ISO_8859_5: // 8859-5 Cyrillic
1688 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1251
1689 Boost(destatep, F_ISO_8859_5, kGentleOnePair); // (too different)
1690 break;
1691 case F_CP1251:
1692 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost -5
1693 Boost(destatep, F_CP1251, kGentleOnePair); // (too different)
1694 break;
1695
1696 case F_Arabic: // 8859-6 Arabic
1697 destatep->declared_enc_2 = F_CP1256;
1698 Boost(destatep, F_Arabic, kGentleOnePair);
1699 Boost(destatep, F_CP1256, kGentleOnePair);
1700 break;
1701 case F_CP1256:
1702 destatep->declared_enc_2 = F_Arabic;
1703 Boost(destatep, F_Arabic, kGentleOnePair);
1704 Boost(destatep, F_CP1256, kGentleOnePair);
1705 break;
1706
1707 case F_Greek: // 8859-7 Greek
1708 destatep->declared_enc_2 = F_CP1253;
1709 Boost(destatep, F_Greek, kGentleOnePair);
1710 Boost(destatep, F_CP1253, kGentleOnePair);
1711 break;
1712 case F_CP1253:
1713 destatep->declared_enc_2 = F_Greek;
1714 Boost(destatep, F_Greek, kGentleOnePair);
1715 Boost(destatep, F_CP1253, kGentleOnePair);
1716 break;
1717
1718 case F_Hebrew: // 8859-8 Hebrew
1719 destatep->declared_enc_2 = F_CP1255;
1720 Boost(destatep, F_Hebrew, kGentleOnePair);
1721 Boost(destatep, F_CP1255, kGentleOnePair);
1722 break;
1723 case F_CP1255:
1724 destatep->declared_enc_2 = F_Hebrew;
1725 Boost(destatep, F_Hebrew, kGentleOnePair);
1726 Boost(destatep, F_CP1255, kGentleOnePair);
1727 break;
1728
1729 case F_Latin5: // 8859-9 Latin5, Turkish
1730 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1254
1731 Boost(destatep, F_Latin5, kGentleOnePair); // (too different)
1732 break;
1733 case F_CP1254:
1734 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost Latin5
1735 Boost(destatep, F_CP1254, kGentleOnePair); // (too different)
1736 break;
1737
1738 case F_Latin6: // 8859-10 Latin6, Nordic
1739 destatep->declared_enc_2 = F_ASCII_7_bit;
1740 Boost(destatep, F_Latin6, kGentleOnePair);
1741 break;
1742
1743 case F_ISO_8859_11: // 8859-11 Thai,
1744 destatep->declared_enc_2 = F_CP874;
1745 Boost(destatep, F_ISO_8859_11, kGentleOnePair);
1746 Boost(destatep, F_CP874, kGentleOnePair);
1747 break;
1748 case F_CP874:
1749 destatep->declared_enc_2 = F_ISO_8859_11;
1750 Boost(destatep, F_ISO_8859_11, kGentleOnePair);
1751 Boost(destatep, F_CP874, kGentleOnePair);
1752 break;
1753
1754 case F_ISO_8859_13: // 8859-13 Latin7, Baltic
1755 destatep->declared_enc_2 = F_CP1257;
1756 Boost(destatep, F_ISO_8859_13, kGentleOnePair);
1757 Boost(destatep, F_CP1257, kGentleOnePair);
1758 break;
1759 case F_CP1257:
1760 destatep->declared_enc_2 = F_ISO_8859_13;
1761 Boost(destatep, F_ISO_8859_13, kGentleOnePair);
1762 Boost(destatep, F_CP1257, kGentleOnePair);
1763 break;
1764
1765 case F_ISO_8859_15: // 8859-15 Latin9, Latin0, Euro-ized Latin1
1766 destatep->declared_enc_2 = F_ASCII_7_bit;
1767 Boost(destatep, F_ISO_8859_15, kGentleOnePair);
1768 break;
1769
1770
1771 // Greek all-caps is confusable with KOI8x all-lower and Hebrew.
1772 // This turns some Greek documents into Cyrillic, etc. by mistake.
1773 // Greek and Hebrew are boosted explicitly above; do KOI8x here.
1774 // Boosting the declared encodingmakes it harder for the wrong one to
1775 // creep up.
1776 case F_KOI8R:
1777 Boost(destatep, F_KOI8R, kGentleOnePair);
1778 break;
1779 case F_KOI8U:
1780 Boost(destatep, F_KOI8U, kGentleOnePair);
1781 break;
1782
1783 default:
1784 break;
1785 }
1786 }
1787
1788 if (destatep->debug_data != NULL) {
1789 // Show charset hint
1790 SetDetailsEncProb(destatep, 0, best_sub, charset_hint);
1791 }
1792
1793 //
1794 // Some fix-ups for the declared encodings
1795 //
1796
1797 // If non-UTF8, non-Latin1/1252 encoding declared, disable UTF8 combos
1798 // TODO (dsites) This should all be folded into postproc-enc-detect.cc
1799 if ((best_sub != F_UTF8) &&
1800 (best_sub != F_Latin1) &&
1801 (best_sub != F_CP1252)) {
1802 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 4); // demote
1803 }
1804
1805 // Latin2 and CP1250 differ in the overlap part, such as B1 or B9
1806 // The initial probabilites for charset=Latin2 explicitly put CP1250
1807 // down twice as far as normal, and vice versa. This is done in
1808 // postproc-enc-detect.cc
1809
1810 // If charset=user-defined, treat as Binary --
1811 // we can safely only do low ASCII, might be Indic
1812 if (normalized_charset.substr(0,4) == "user") {
1813 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
1814 }
1815
1816 return 1;
1817 }
1818 return 0;
1819 }
1820
1821 // Apply initial probability hint based on caller-supplied encoding
1822 // Negative hint whacks ~encoding, non-negative boosts encoding
1823 //
1824 // Negative hints are an experiment to see if they might be useful.
1825 // Not operator used instead of unary minus to allow specifying not-zero
1826 int ApplyEncodingHint(const int encoding_hint, int weight,
1827 DetectEncodingState* destatep) {
1828 Encoding enc_hint = static_cast<Encoding>((encoding_hint < 0) ?
1829 ~encoding_hint : encoding_hint);
1830 // Map to the right internal subscript
1831 int rankedenc_hint = CompactEncDet::BackmapEncodingToRankedEncoding(enc_hint);
1832
1833 // I'm not sure how strong this hint should be. Weight 100% = 1 bigram
1834 int increment = (kBoostOnePair * weight) / 100;
1835
1836 if (encoding_hint < 0) {
1837 destatep->enc_prob[rankedenc_hint] -= increment;
1838 } else {
1839 destatep->enc_prob[rankedenc_hint] += increment;
1840 }
1841
1842 if (destatep->debug_data != NULL) {
1843 // Show encoding hint
1844 SetDetailsEncProb(destatep, 0, -1, MyEncodingName(enc_hint));
1845 }
1846 return 1;
1847 }
1848
1849 // Apply initial probability hint based on user interface language
1850 // Weight is 0..100 percent
1851 // Return 1 if name match found
1852 int ApplyUILangaugeHint(const Language language_hint,
1853 int weight, DetectEncodingState* destatep) {
1854 if (language_hint == UNKNOWN_LANGUAGE) {
1855 return 0;
1856 }
1857 string normalized_lang = MakeChar8(LanguageName(language_hint));
1858 int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,
1859 normalized_lang.c_str());
1860 if (n >= 0) {
1861 // Language is eight bytes, probability table is ~eight bytes
1862 int best_sub = ApplyCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey],
1863 kMaxLangVector, weight, destatep);
1864 // Never boost ASCII7; do CP1252 instead
1865 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
1866 destatep->declared_enc_1 = best_sub;
1867 if (destatep->debug_data != NULL) {
1868 // Show language hint
1869 SetDetailsEncProb(destatep, 0, best_sub, normalized_lang.c_str());
1870 }
1871 return 1;
1872 }
1873 return 0;
1874 }
1875
1876 // Apply initial probability hint based on corpus type (web, email, etc)
1877 // Weight is 0..100 percent IGNORED
1878 // Return 1 if name match found
1879 int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
1880 int weight, DetectEncodingState* destatep) {
1881
1882 for (int i = 0; i < NUM_RANKEDENCODING; i++) {
1883 // Set the default probability
1884 destatep->enc_prob[i] = kDefaultProb[i] * 3;
1885 // Deliberately set 2022 seven-bit encodings to zero,
1886 // so we can look for actual use
1887 // TODO (dsites) This should all be folded into postproc-enc-detect.cc
1888 if (SevenBitEncoding(kMapToEncoding[i])) {
1889 destatep->enc_prob[i] = 0;
1890 }
1891 }
1892
1893 // A little corpus distinction
1894 switch (corpus_type) {
1895 case CompactEncDet::WEB_CORPUS:
1896 case CompactEncDet::XML_CORPUS:
1897 // Allow double-converted UTF-8 to start nearly equal to normal UTF-8
1898 destatep->enc_prob[F_UTF8UTF8] =
1899 destatep->enc_prob[F_UTF8] - kSmallInitDiff;
1900 break;
1901 case CompactEncDet::QUERY_CORPUS:
1902 case CompactEncDet::EMAIL_CORPUS:
1903 default:
1904 break;
1905 }
1906
1907 if (FLAGS_demo_nodefault) {
1908 // Demo, make initial probs all zero
1909 for (int i = 0; i < NUM_RANKEDENCODING; i++) {
1910 destatep->enc_prob[i] = 0;
1911 }
1912 }
1913
1914 if (destatep->debug_data != NULL) {
1915 // Show default hint
1916 SetDetailsEncProb(destatep, 0, -1, "Default");
1917 }
1918 return 1;
1919 }
1920
1921
1922
// Do reverse search for c in [str..str+len)
// Note: initial pointer is to FRONT of string, not back
// Returns a pointer to the last occurrence of c, or NULL if c does not occur
// (including when len is zero).
const char* MyMemrchr(const char* str, char c, size_t len) {
  // Walk an index down from len so that no pointer before str is ever formed
  for (size_t remaining = len; remaining > 0; --remaining) {
    if (str[remaining - 1] == c) {return str + (remaining - 1);}
  }
  return NULL;
}
1932
1933
// Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
// Now that we are no longer trying to do Indic font-based encodings, we
// don't need the full URL and can go back to simple TLD. This test remains for
// backwards compatibility with any caller using full URL.
1938 static const int kMinURLLength = 11;
1939
1940 // Extract TLD from a full URL or just a TLD
1941 // Return hostname and length if a full URL
1942 void ExtractTLD(const char* url_hint, char* tld_hint, int tld_hint_len,
1943 const char** ret_host_start, int* ret_host_len) {
1944 // url_hint can either be a full URL (preferred) or just top-level domain name
1945 // Extract the TLD from a full URL and use it for
1946 // a normal TLD hint
1947
1948 strncpy(tld_hint, "~", tld_hint_len);
1949 tld_hint[tld_hint_len - 1] = '\0';
1950 *ret_host_start = NULL;
1951 *ret_host_len = 0;
1952
1953 int url_len = (url_hint != NULL) ? strlen(url_hint) : 0;
1954 if (url_len == 0) {
1955 // Empty TLD
1956 return;
1957 }
1958
1959 // Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
1960 if (kMinURLLength <= url_len) {
1961 // See if it really is a URL
1962 const char* first_slash = strchr(url_hint, '/');
1963 if ((first_slash != NULL) && (first_slash != url_hint) &&
1964 (first_slash[-1] == ':') && (first_slash[1] == '/') &&
1965 (memrchr(url_hint, '.', first_slash - url_hint) == NULL)) {
1966 // We found :// and no dot in front of it, so declare a real URL
1967
1968 const char* hostname_start = first_slash + 2;
1969 const char* hostname_end = strchr(hostname_start, '/');
1970 if (hostname_end == NULL) {
1971 // No slash; end is first byte off end of the URL string
1972 hostname_end = url_hint + url_len;
1973 }
1974 size_t hostname_len = hostname_end - hostname_start;
1975 const char* port_start =
1976 (const char*)memchr(hostname_start, ':', hostname_len);
1977 if (port_start != NULL) {
1978 // Port; shorten hostname
1979 hostname_end = port_start;
1980 hostname_len = hostname_end - hostname_start;
1981 }
1982
1983 const char* tld_start = MyMemrchr(hostname_start, '.', hostname_len);
1984 if (tld_start != NULL) {
1985 // Remember the TLD we just found
1986 int tld_len = hostname_start + hostname_len - tld_start - 1;
1987 if (tld_len > (tld_hint_len - 1)) {
1988 tld_len = tld_hint_len - 1;
1989 }
1990 memcpy(tld_hint, tld_start + 1, tld_len);
1991 tld_hint[tld_len] = '\0';
1992 }
1993 *ret_host_start = hostname_start;
1994 *ret_host_len = hostname_len;
1995 return;
1996 }
1997 } else {
1998 strncpy(tld_hint, url_hint, tld_hint_len);
1999 tld_hint[tld_hint_len - 1] = '\0';
2000 }
2001 }
2002
2003 // Apply hints, if any, to probabilities
2004 // NOTE: Encoding probabilites are all zero at this point
2005 void ApplyHints(const char* url_hint,
2006 const char* http_charset_hint,
2007 const char* meta_charset_hint,
2008 const int encoding_hint,
2009 const Language language_hint,
2010 const CompactEncDet::TextCorpusType corpus_type,
2011 DetectEncodingState* destatep) {
2012 int hint_count = 0;
2013 // url_hint can either be a full URL (preferred) or just top-level domain name
2014 // Extract the TLD from a full URL and use it for
2015 // a normal TLD hint
2016
2017 char tld_hint[16];
2018 const char* hostname_start = NULL;
2019 int hostname_len = 0;
2020 ExtractTLD(url_hint, tld_hint, sizeof(tld_hint),
2021 &hostname_start, &hostname_len);
2022
2023
2024 // Initial hints give slight boost to Ascii-7-bit and code page 1252
2025 // ApplyXxx routines copy enc_1 to enc_2 then update declared_enc_1
2026 // This gives a boost to 1252 if one of HTTP/META is specified,
2027 // but this could be the wrong thing to do if Latin2/3/4/etc. is specified
2028 destatep->declared_enc_1 = F_CP1252;
2029 destatep->declared_enc_2 = F_ASCII_7_bit;
2030
2031 // Applying various hints takes max of new hint and any old hint.
2032 // This does better on multiple hints that a weighted average
2033
2034 // Weight is 0..100 percent
2035 if ((http_charset_hint != NULL) && (http_charset_hint[0] != '~')) {
2036 destatep->declared_enc_2 = destatep->declared_enc_1;
2037 hint_count += ApplyCharsetHint(http_charset_hint, 100, destatep);
2038 destatep->http_hint = kMapToEncoding[destatep->declared_enc_1];
2039 if ((destatep->declared_enc_1 == F_CP1252) ||
2040 (destatep->declared_enc_1 == F_Latin1)) {
2041 destatep->looking_for_latin_trigrams = true;
2042 }
2043 }
2044 if ((meta_charset_hint != NULL) && (meta_charset_hint[0] != '~')) {
2045 destatep->declared_enc_2 = destatep->declared_enc_1;
2046 hint_count += ApplyCharsetHint(meta_charset_hint, 100, destatep);
2047 destatep->meta_hint = kMapToEncoding[destatep->declared_enc_1];
2048 if ((destatep->declared_enc_1 == F_CP1252) ||
2049 (destatep->declared_enc_1 == F_Latin1)) {
2050 destatep->looking_for_latin_trigrams = true;
2051 }
2052 }
2053 if (encoding_hint != UNKNOWN_ENCODING) {
2054 destatep->declared_enc_2 = destatep->declared_enc_1;
2055 hint_count += ApplyEncodingHint(encoding_hint, 50, destatep);
2056 }
2057 if (language_hint != UNKNOWN_LANGUAGE) {
2058 destatep->declared_enc_2 = destatep->declared_enc_1;
2059 hint_count += ApplyUILangaugeHint(language_hint, 50, destatep);
2060 }
2061 // Use top level domain if not .com and <=1 other hint was available
2062 if (url_hint != NULL) {
2063 destatep->tld_hint = CompactEncDet::TopEncodingOfTLDHint(tld_hint);
2064 if (hint_count == 0) {
2065 // Apply with weight 100%
2066 destatep->declared_enc_2 = destatep->declared_enc_1;
2067 hint_count += ApplyTldHint(tld_hint, 100, destatep);
2068 if ((destatep->declared_enc_1 == F_CP1252) ||
2069 (destatep->declared_enc_1 == F_Latin1)) {
2070 destatep->looking_for_latin_trigrams = true;
2071 }
2072 if (strcmp("hu", tld_hint) == 0) {
2073 // Hungarian is particularly difficult to separate Latin2 from Latin1,
2074 // so always look for trigram scanning if bare TLD=hu hint
2075 destatep->looking_for_latin_trigrams = true;
2076 }
2077 // Treat .com as no TLD hint at all
2078 } else if ((hint_count == 1) && (strcmp("com", tld_hint) != 0)) {
2079 // Either shift weighting or consider doing no TLD here -- seems to
2080 // distract from correct charset= hints. Or perhaps apply only if
2081 // charset = Latin1/1252...
2082 // Apply with weight 50%
2083 destatep->declared_enc_2 = destatep->declared_enc_1;
2084 hint_count += ApplyTldHint(tld_hint, 50, destatep);
2085 if ((destatep->declared_enc_1 == F_CP1252) ||
2086 (destatep->declared_enc_1 == F_Latin1)) {
2087 destatep->looking_for_latin_trigrams = true; // These need trigrams
2088 }
2089 }
2090 // Else ignore TLD hint entirely
2091 }
2092
2093 // Use all-web default distribution if not even a TLD hint
2094 if (hint_count == 0) {
2095 destatep->looking_for_latin_trigrams = true; // Default needs trigrams
2096 destatep->declared_enc_2 = destatep->declared_enc_1;
2097 hint_count += ApplyDefaultHint(corpus_type, 100, destatep);
2098 }
2099
2100
2101 // ISO-Microsoft Pairs
2102 // F_Latin1, F_CP1252,
2103 // F_Latin2, F_CP1250, NOT really strict subset/superset pairs
2104 // F_Latin3,
2105 // F_Latin4,
2106 // F_ISO_8859_5, F_CP1251,
2107 // F_Arabic, F_CP1256, NOT
2108 // F_Greek, F_CP1253, NOT really pairs
2109 // (or upgrade incvt to make Greek use CP)
2110 // F_Hebrew, F_CP1255, NOT really pairs
2111 // F_Latin5, F_CP1254,
2112 // F_Latin6,
2113 // F_ISO_8859_11,
2114 // F_ISO_8859_13, F_CP1257,
2115 // F_ISO_8859_15,
2116 // ISO-Microsoft Pairs
2117
2118 // Get important families started together
2119 // // This should fall out of the initializatoin vectors for charset,
2120 // but we need to get rid of families alltogetrher
2121 //
2122 // TODO make this more graceful
2123
2124 // Add small bias for subsets
2125
2126 // Subtract small bias for supersets
2127 destatep->enc_prob[F_CP932] = destatep->enc_prob[F_SJS] - kSmallInitDiff;
2128
2129 destatep->enc_prob[F_GBK] = destatep->enc_prob[F_GB] - kSmallInitDiff;
2130 destatep->enc_prob[F_GB18030] = destatep->enc_prob[F_GB] - kSmallInitDiff;
2131
2132 destatep->enc_prob[F_BIG5_CP950] = destatep->enc_prob[F_BIG5] -
2133 kSmallInitDiff;
2134 destatep->enc_prob[F_BIG5_HKSCS] = destatep->enc_prob[F_BIG5] -
2135 kSmallInitDiff;
2136
2137 // Deliberate over-bias Ascii7 and underbias Binary [unneeded]
2138 // destatep->enc_prob[F_ASCII_7_bit] = destatep->enc_prob[F_ASCII_7_bit] + kSm allInitDiff;
2139 // destatep->enc_prob[F_BINARY] = destatep->enc_prob[F_BINARY] - (kBoostInitia l / 2);
2140
2141 if (destatep->debug_data != NULL) {
2142 // Show state at end of hints
2143 SetDetailsEncProb(destatep, 0, -1, "Endhints");
2144 if(FLAGS_enc_detect_detail2) {
2145 // Add a line showing the watched encoding(s)
2146 if (watch1_rankedenc >= 0) {
2147 SetDetailsEncProb(destatep, 0,
2148 watch1_rankedenc, FLAGS_enc_detect_watch1.c_str());
2149 }
2150 if (watch2_rankedenc >= 0) {
2151 SetDetailsEncProb(destatep, 0,
2152 watch2_rankedenc, FLAGS_enc_detect_watch2.c_str());
2153 }
2154 } // End detail2
2155 }
2156
2157 // If duplicate hints, set second one to ASCII_7BIT to prevent double-boost
2158 if (destatep->declared_enc_1 == destatep->declared_enc_2) {
2159 destatep->declared_enc_2 = F_ASCII_7_bit;
2160 }
2161
2162 if (FLAGS_force127) {
2163 destatep->do_latin_trigrams = true;
2164 if (FLAGS_enc_detect_source) {
2165 PsHighlight(0, destatep->initial_src, 0, 2);
2166 }
2167 }
2168
2169
2170 if (FLAGS_counts && destatep->looking_for_latin_trigrams) {++looking_used;}
2171 if (FLAGS_counts && destatep->do_latin_trigrams) {++doing_used;}
2172
2173 //
2174 // At this point, destatep->enc_prob[] is an initial probability vector based
2175 // on the given hints/default. In general, it spreads out least-likely
2176 // encodings to be about 2**-25 below the most-likely encoding.
2177 // For input text with lots of bigrams, an unlikely encoding can rise to
2178 // the top at a rate of about 2**6 per bigram, and more commonly 2**2 per
2179 // bigram. So more than 4 bigrams and commonly more than 12 are
2180 // needed to overcome the initial hints when the least-likely encoding
2181 // is in fact the correct answer. So if the entire text has very few bigrams
2182 // (as a two-word query might), it can be impossible for the correct
2183 // encoding to win.
2184 //
2185 // To compensate for this, we take the initial hint vector and effectively
2186 // apply it at the rate of 1/16 every bigram for the first 16 bigrams. The
2187 // actual mechanism is done just before the last prune.
2188 //
2189
2190 // Remember Initial hint probabilities
2191 memcpy(destatep->hint_prob, destatep->enc_prob, sizeof(destatep->enc_prob));
2192 }
2193
2194 // Look for specific high-value patterns in the first 4 bytes
2195 // Byte order marks (BOM)
2196 // EFBBBF UTF-8
2197 // FEFF UTF-16 BE
2198 // FFFE UTF-16 LE
2199 // FFFE0000 UTF-32 BE
2200 // 0000FEFF UTF-32 LE
2201 //
2202 // Likely UTF-x of seven-bit ASCII
2203 // 00xx UTF-16 BE xx printable ASCII
2204 // xx00 UTF-16 LE
2205 // 000000xx UTF-32 BE
2206 // xx000000 UTF-32 LE
2207 //
2208 void InitialBytesBoost(const uint8* src,
2209 int text_length,
2210 DetectEncodingState* destatep) {
2211 if (text_length < 4) {return;}
2212
2213 char32 pair01 = (src[0] << 8) | src[1];
2214 char32 pair23 = (src[2] << 8) | src[3];
2215 char32 quad0123 = (pair01 << 16) | pair23;
2216
2217 bool utf_16_indication = false;
2218 bool utf_32_indication = false;
2219 int best_enc = -1;
2220
2221 // Byte order marks
2222 // UTF-8
2223 if ((quad0123 & 0xffffff00) == 0xEFBBBF00) {
2224 destatep->bom_hint = UTF8;
2225 Boost(destatep, F_UTF8, kBoostInitial * 2);
2226 Boost(destatep, F_UTF8UTF8, kBoostInitial * 2);
2227 best_enc = F_UTF8;
2228 // UTF-32 (test before UTF-16)
2229 } else if (quad0123 == 0x0000FEFF) {
2230 destatep->bom_hint = UTF32BE;
2231 Boost(destatep, F_UTF_32BE, kBoostInitial * 2);
2232 best_enc = F_UTF_32BE;
2233 } else if (quad0123 == 0xFFFE0000) {
2234 destatep->bom_hint = UTF32LE;
2235 Boost(destatep, F_UTF_32LE, kBoostInitial * 2);
2236 best_enc = F_UTF_32LE;
2237 // UTF-16
2238 } else if (pair01 == 0xFEFF) {
2239 destatep->bom_hint = UTF16BE;
2240 Boost(destatep, F_UTF_16BE, kBoostInitial * 3);
2241 best_enc = F_UTF_16BE;
2242 } else if (pair01 == 0xFFFE) {
2243 destatep->bom_hint = UTF16LE;
2244 Boost(destatep, F_UTF_16LE, kBoostInitial * 3);
2245 best_enc = F_UTF_16LE;
2246
2247 // Possible seven-bit ASCII encoded as UTF-16/32
2248 // UTF-32 (test before UTF-16)
2249 } else if (((quad0123 & 0xffffff00) == 0) &&
2250 (kIsPrintableAscii[src[3]] != 0)) {
2251 Boost(destatep, F_UTF_32BE, kBoostInitial);
2252 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal char
2253 best_enc = F_UTF_32BE;
2254 } else if (((quad0123 & 0x00ffffff) == 0) &&
2255 (kIsPrintableAscii[src[0]] != 0)) {
2256 Boost(destatep, F_UTF_32LE, kBoostInitial);
2257 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char
2258 best_enc = F_UTF_32LE;
2259 } else if ((src[0] == 0x00) && (kIsPrintableAscii[src[1]] != 0)) {
2260 Boost(destatep, F_UTF_16BE, kBoostInitial);
2261 best_enc = F_UTF_16BE;
2262 } else if ((src[1] == 0x00) && (kIsPrintableAscii[src[0]] != 0)) {
2263 Boost(destatep, F_UTF_16LE, kBoostInitial);
2264 best_enc = F_UTF_16LE;
2265
2266 // Whack if 0000 or FFFF
2267 // UTF-32 (test before UTF-16)
2268 } else if (quad0123 == 0x00000000) {
2269 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char
2270 Whack(destatep, F_UTF_32LE, kBadPairWhack);
2271 Whack(destatep, F_UTF_16BE, kBadPairWhack);
2272 Whack(destatep, F_UTF_16LE, kBadPairWhack);
2273 best_enc = -1;
2274 } else if (quad0123 == 0xffffffff) {
2275 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char
2276 Whack(destatep, F_UTF_32LE, kBadPairWhack);
2277 Whack(destatep, F_UTF_16BE, kBadPairWhack);
2278 Whack(destatep, F_UTF_16LE, kBadPairWhack);
2279 best_enc = -1;
2280 } else if (pair01 == 0x0000) {
2281 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char
2282 Whack(destatep, F_UTF_16LE, kBadPairWhack);
2283 best_enc = -1;
2284 } else if (pair01 == 0xffff) {
2285 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char
2286 Whack(destatep, F_UTF_16LE, kBadPairWhack);
2287 best_enc = -1;
2288
2289
2290 // These are the first four bytes of some known binary file formats
2291
2292 // Boost BINARY bigtime if JPEG FFD8FFxx
2293 // Boost BINARY bigtime if png 89504E47 (.PNG)
2294 // Boost BINARY bigtime if gif 47494638 (GIF8)
2295 // Boost BINARY bigtime if zip 504B0304 (PK..)
2296 // Boost BINARY bigtime if gzip 1F8B08xx
2297 // Boost BINARY bigtime if gzip 78DAxxxx
2298 // Boost BINARY if PDF 25504446 (%PDF)
2299 // Boost BINARY if SWF (FWSx or CWSx where x <= 0x1f)
2300 } else if ((quad0123 & 0xffffff00) == 0xFFD8FF00) { // JPEG FFD8FFxx
2301 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2302 } else if (quad0123 == 0x89504E47) { // Hex 89 P N G
2303 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2304 } else if (quad0123 == 0x47494638) { // Hex GIF8
2305 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2306 } else if (quad0123 == 0x504B0304) { // Hex P K 03 04
2307 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2308 } else if ((quad0123 & 0xffffff00) == 0x1F8B0800) { // gzip 1F8B08xx
2309 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2310 } else if (pair01 == 0x78DA) { // gzip 78DAxxxx
2311 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2312 } else if (quad0123 == 0x25504446) { // Hex %PDF
2313 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2314 } else if ((quad0123 & 0xffffff1f) == 0x66535700) { // Hex FWSx
2315 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2316 } else if ((quad0123 & 0xffffff1f) == 0x63535700) { // Hex CWSx
2317 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2318
2319 // More binary detect prefixes
2320 // 7F E L F Executable and linking format
2321 // M M 00 * TIFF (little-endian)
2322 // * 00 M M TIFF (big-endian)
2323 // 01 f c p Final cut pro
2324 } else if (quad0123 == 0x7F454C46) { // Hex 7F E L F
2325 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2326 } else if (quad0123 == 0x4D4D002A) { // Hex M M 00 *
2327 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2328 } else if (quad0123 == 0x2A004D4D) { // Hex * 00 M M
2329 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2330 } else if (quad0123 == 0x01666370) { // Hex 01 f c p
2331 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2332
2333 // More binary detect prefixes; all-ASCII names; heavy weight to avoid ASCII
2334 // prefix overcoming binary
2335 // C C S D USGS ISIS 3-D cube files
2336 // S I M P FITS image header "SIMPLE "
2337 } else if (quad0123 == 0x43435344) { // Hex C C S D
2338 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2339 } else if (quad0123 == 0x53494D50) { // Hex S I M P
2340 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2341
2342 // More binary detect prefixes; all-ASCII names; lighter weight
2343 // H W P Hangul word processor
2344 // 8 B P S Photoshop
2345 // P D S _ xx "PDS_VERSION_ID "
2346 } else if (quad0123 == 0x48575020) { // Hex H W P
2347 if ((19 <= text_length) &&
2348 (memcmp(src, "HWP.Document.File.V", 19) == 0)) {
2349 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2350 } else if ((19 <= text_length) &&
2351 (memcmp(src, "HWP Document File V", 19) == 0)) {
2352 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2353 } else {
2354 Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);
2355 }
2356 } else if (quad0123 == 0x38425053) { // Hex 8 B P S
2357 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2358 } else if (quad0123 == 0x5044535F) { // Hex P D S _
2359 if ((14 <= text_length) && (memcmp(src, "PDS_VERSION_ID", 14) == 0)) {
2360 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2361 } else {
2362 Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);
2363 }
2364 }
2365
2366 // There are several main Windows EXE file formats.
2367 // Not examined here (prefix too short; never see them in Google pipeline)
2368 // M Z DOS .exe Mark Zbikowski
2369 // N E DOS 4.0 16-bit
2370 // L E OS/2 VxD drivers
2371 // L X OS/2
2372 // P E Windows NT
2373
2374
2375 // More user-defined
2376 // http://www.freenet.am/armscii/ Armenian
2377
2378 // If any hints or BOM, etc. keep UTF 16/32 around
2379 if ((destatep->enc_prob[F_UTF_16BE] > 0) ||
2380 (destatep->enc_prob[F_UTF_16LE] > 0)) {
2381 utf_16_indication = true;
2382 }
2383 if ((destatep->enc_prob[F_UTF_32BE] > 0) ||
2384 (destatep->enc_prob[F_UTF_32LE] > 0)) {
2385 utf_32_indication = true;
2386 }
2387
2388
2389 // Kill UTF16/32 right now if no positive indication of them
2390 // Otherwise, they tend to rise to the top in 7-bit files with an
2391 // occasional 0x02 byte in some comment or javascript
2392 if (!utf_16_indication) {
2393 Whack(destatep, F_UTF_16BE, kBadPairWhack * 8);
2394 Whack(destatep, F_UTF_16LE, kBadPairWhack * 8);
2395 Whack(destatep, F_Unicode, kBadPairWhack * 8);
2396 }
2397 if (!utf_32_indication) {
2398 Whack(destatep, F_UTF_32BE, kBadPairWhack * 8);
2399 Whack(destatep, F_UTF_32LE, kBadPairWhack * 8);
2400 }
2401
2402 // Usually kill mixed encodings
2403 if (!FLAGS_ced_allow_utf8utf8) {
2404 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);
2405 }
2406 // 2011.11.07 never use UTF8CP1252 -- answer will be UTF8 instead
2407 Whack(destatep, F_UTF8CP1252, kBadPairWhack * 8);
2408
2409 if (destatep->debug_data != NULL) {
2410 // Show first four bytes of the input
2411 char buff[16];
2412 snprintf(buff, sizeof(buff), "%04x%04x", pair01, pair23);
2413 SetDetailsEncProb(destatep, 0, best_enc, buff);
2414 }
2415 }
2416
2417
2418
// qsort-style comparator over ints that yields DESCENDING order:
// returns 1 if *v1 < *v2, -1 if *v1 > *v2, and 0 if they are equal.
int IntCompare(const void* v1, const void* v2) {
  const int lhs = *reinterpret_cast<const int*>(v1);
  const int rhs = *reinterpret_cast<const int*>(v2);
  // Each comparison is 0 or 1, so the difference is exactly -1, 0, or 1
  return static_cast<int>(lhs < rhs) - static_cast<int>(lhs > rhs);
}
2427
2428 bool Base64Char(uint8 c) {
2429 if (('A' <= c) && (c <= 'Z')) {return true;}
2430 if (('a' <= c) && (c <= 'z')) {return true;}
2431 if (('0' <= c) && (c <= '9')) {return true;}
2432 if ('+' == c) {return true;}
2433 if ('/' == c) {return true;}
2434 return false;
2435 }
2436
2437 int Base64ScanLen(const uint8* start, const uint8* limit) {
2438 // We have a plausible beginning; scan entire base64 string
2439 const uint8* ib64str = start;
2440 const uint8* b64str = ib64str;
2441 const uint8* b64strlimit = limit;
2442 // if starts with + +++, assume it is drawing, so bogus
2443 if (((limit - start) > 3) && (start[0] == '+') &&
2444 (start[1] == '+') && (start[2] == '+')) {
2445 return 81;
2446 }
2447 // Scan over base64
2448 while ((b64str < b64strlimit) && (kBase64Value[*b64str++] >= 0)) {
2449 }
2450 b64str--; // We overshot by 1
2451 return b64str - ib64str;
2452 }
2453
2454 // Input is at least 8-character legal base64 string after +.
2455 // But might be say + "Presse+Termine"
2456 bool GoodUnicodeFromBase64(const uint8* start, const uint8* limit) {
2457 // Reject base64 string len N if density of '+' is > 1 + N/16 (expect 1/64)
2458 // Reject base64 string len N if density of A-Z is < 1 + N/16 (expect 26/64)
2459 // Reject base64 string len N if density of a-z is < 1 + N/16 (expect 26/64)
2460 // Reject base64 string len N if density of 0-9 is < 1 + N/32 (expect 10/64)
2461 // NOTE: this requires at least one lower AND one upper AND one digit to pass
2462 //
2463 int plus_count = 0;
2464 int lower_count = 0;
2465 int upper_count = 0;
2466 int digit_count = 0;
2467 int len = limit - start;
2468 for (const uint8* src = start; src < limit; ++src) {
2469 uint8 c = *src;
2470 if (('a' <= c) && (c <= 'z')) {
2471 ++lower_count;
2472 } else if (('A' <= c) && (c <= 'Z')) {
2473 ++upper_count;
2474 } else if (('0' <= c) && (c <= '0')) {
2475 ++digit_count;
2476 } else if (*src == '+') {
2477 ++plus_count;
2478 }
2479 }
2480
2481 if (plus_count > (1 + (len >> 4))) {return false;}
2482 if (lower_count < (1 + (len >> 4))) {return false;}
2483 if (upper_count < (1 + (len >> 4))) {return false;}
2484 if (digit_count < (1 + (len >> 5))) {return false;}
2485
2486 // checking the last character to reduce false positive
2487 // since the last character may be padded to 0 bits at the end.
2488 // refer to http://en.wikipedia.org/wiki/UTF-7
2489 int nmod8 = len & 7;
2490 const uint8 last = *(start+len-1);
2491 // When UTF-7 string length%8=3, the last two bits must be padded as 0
2492 if ((nmod8 == 3) && (kBase64Value[last] & 3)) {return false;}
2493 // When UTF-7 string length%8=6, the last four bits must be padded as 0
2494 if ((nmod8 == 6) && (kBase64Value[last] & 15)) {return false;}
2495 return true;
2496 }
2497
2498 // Prune here after N bytes
2499 // Boost here for seven-bit sequences (at every prune)
2500 // if (sevenbitrankedencoding)
2501 // + UTF7 scan and boost/demote len mod 8 = 0 3 6
2502 // ~ Hz scan and boost/demote len mod 8 = 0 2 4 6
2503 // 1B 2022 scan and boost/demote len mod 8 = 0 2 4 6
2504 // 0E 2022 scan and boost/demote len mod 8 = 0 2 4 6
2505 // [0F 2022 boost/demote]
2506 // 00 UTF16/32 scan and boost/demote offset = even/odd
2507 //
2508 // If still some seven-bit possibilities > pure ASCII,
2509 // scan each possibility for clearer prob, s.t. about
2510 // two good sequences is a clear win
2511 // A-Z 00-19 00xx-64xx (B = 04xx)
2512 // a-z 1A-33 68xx-CCxx (f = 7Cxx)
2513 // 0-9 34-3D D0xx-F4xx (1 = D4xx)
2514 // + 3E F8xx
2515 // / 3F FCxx
2516 // do another chunk with slow scan
2517
2518
2519 // Boost, whack, or leave alone UTF-7 probablilty
2520 void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) {
2521 int off = destatep->interesting_offsets[AsciiPair][next_pair];
2522 if (off >= destatep->prior_utf7_offset) {
2523 // Not part of a previous successful UTF-7 string
2524 ++destatep->utf7_starts;
2525
2526 if (byte2 == '-') {
2527 // +- encoding for '+' neutral
2528 } else if (!Base64Char(byte2)) {
2529 // Not base64 -- not UTF-7, whack
2530 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal pair
2531 } else {
2532 // Starts with base64 byte, might be a good UTF7 sequence
2533 const uint8* start = destatep->initial_src + off + 1; // over the +
2534 int n = Base64ScanLen(start, destatep->limit_src);
2535 int nmod8 = n & 7;
2536 if ((n == 3) || (n == 6)) {
2537 // short but legal -- treat as neutral
2538 } else if ((nmod8 == 0) | (nmod8 == 3) | (nmod8 == 6)) {
2539 // Good length. Check for good Unicode.
2540 if (GoodUnicodeFromBase64(start, start + n)) {
2541 // Good length and Unicode, boost
2542 Boost(destatep, F_UTF7, kBoostOnePair); // Found good
2543 destatep->prior_utf7_offset = off + n + 1;
2544 } else {
2545 // Bad Unicode. Whack
2546 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length
2547 }
2548 } else {
2549 // Bad length. Whack
2550 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length
2551 }
2552 }
2553 }
2554 }
2555
2556 // Boost, whack, or leave alone HZ probablilty
2557 void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
2558 if ((byte2 == '{') || (byte2 == '}')) {
2559 Boost(destatep, F_HZ_GB_2312, kBoostOnePair); // Found ~{ or ~}
2560 } else if ((byte2 == '~') || (byte2 == '\n')) {
2561 destatep->enc_prob[F_HZ_GB_2312] += 0; // neutral
2562 } else {
2563 Whack(destatep, F_HZ_GB_2312, kBadPairWhack); // Illegal pair
2564 }
2565 }
2566
2567 // Boost, whack, or leave alone BINARY probablilty
2568 void BinaryBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
2569 int quadrant = ((byte1 & 0x80) >> 6) | ((byte2 & 0x80) >> 7);
2570 int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6);
2571 uint32 quad_mask = 1 << quadrant;
2572 uint32 bucket8x4_mask = 1 << bucket8x4;
2573 if ((destatep->binary_quadrants_seen & quad_mask) == 0) {
2574 destatep->binary_quadrants_seen |= quad_mask;
2575 destatep->binary_quadrants_count += 1;
2576 if (destatep->binary_quadrants_count == 4) {
2577 Boost(destatep, F_BINARY, kBoostOnePair * 2); // Found all 4 quadrants,
2578 // boost 2 pairs
2579 }
2580 }
2581 if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {
2582 destatep->binary_8x4_seen |= bucket8x4_mask;
2583 destatep->binary_8x4_count += 1;
2584 if (destatep->binary_8x4_count >= 11) {
2585 Boost(destatep, F_BINARY, kBoostOnePair * 4); // Found 11+/20 buckets,
2586 // boost 4 pairs each time
2587 }
2588 }
2589 }
2590
2591
2592 // Demote UTF-16/32 on 0000 or FFFF, favoring Binary
2593 void UTF1632BoostWhack(DetectEncodingState* destatep, int offset, uint8 byte1) {
2594 if (byte1 == 0) { // We have 0000
2595 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair
2596 Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair
2597 switch (offset & 3) {
2598 case 0: // We get called with 0 4 8, etc. for ASCII/BMP as UTF-32BE
2599 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair
2600 Boost(destatep, F_UTF_32BE, kSmallInitDiff); // Good pair
2601 break;
2602 case 1: // We get called with 1 5 9, etc. for ASCII as UTF-32LE
2603 case 2: // We get called with 2 6 10, etc. for BMP as UTF-32LE
2604 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair
2605 Boost(destatep, F_UTF_32LE, kSmallInitDiff); // Good pair
2606 break;
2607 case 3: // ambiguous
2608 break;
2609 }
2610 } else { // We have ffff
2611 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair
2612 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair
2613 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair
2614 Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair
2615 }
2616 }
2617
2618 // Make even offset
2619 void UTF16MakeEven(DetectEncodingState* destatep, int next_pair) {
2620 destatep->interesting_offsets[OtherPair][next_pair] &= ~1;
2621 }
2622
2623 bool ConsecutivePair(DetectEncodingState* destatep, int i) {
2624 if (i <= 0) {
2625 return false;
2626 }
2627 return destatep->interesting_offsets[OtherPair][i] ==
2628 (destatep->interesting_offsets[OtherPair][i - 1] + 2);
2629 }
2630
2631 // boost, whack, or leave alone UTF-8 probablilty
2632 // Any whacks are also applied to UTF8UTF8; CheckUTF8UTF8Seq assumes good UTF8
2633 // Returns total boost
2634 int CheckUTF8Seq(DetectEncodingState* destatep, int weightshift) {
2635 int startcount = destatep->prior_interesting_pair[OtherPair];
2636 int endcount = destatep->next_interesting_pair[OtherPair];
2637
2638 int demotion_count = 0;
2639 for (int i = startcount; i < endcount; ++i) {
2640 int sub;
2641 char* s = &destatep->interesting_pairs[OtherPair][i * 2];
2642 // Demote four byte patterns that are more likely Latin1 than UTF-8
2643 // C9AE, DF92, DF93, DFAB. See note at top.
2644 // Demotion also boosts Latin1 and CP1252
2645 uint8 s0 = static_cast<uint8>(s[0]);
2646 uint8 s1 = static_cast<uint8>(s[1]);
2647 if ((s0 == 0xc9) && (s1 == 0xae)) {++demotion_count;}
2648 if ((s0 == 0xdf) && (s1 == 0x92)) {++demotion_count;}
2649 if ((s0 == 0xdf) && (s1 == 0x93)) {++demotion_count;}
2650 if ((s0 == 0xdf) && (s1 == 0xab)) {++demotion_count;}
2651
2652 if (!ConsecutivePair(destatep, i)) {
2653 // Insert a blank into the sequence; avoid wrong splices
2654 sub = (' ' >> 4) & 0x0f;
2655 ++destatep->utf8_minicount[
2656 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_m inistate)][sub])];
2657 destatep->next_utf8_ministate =
2658 kMiniUTF8State[destatep->next_utf8_ministate][sub];
2659 }
2660 // Byte 0
2661 sub = (s0 >> 4) & 0x0f;
2662 ++destatep->utf8_minicount[
2663 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_min istate)][sub])];
2664 destatep->next_utf8_ministate =
2665 kMiniUTF8State[destatep->next_utf8_ministate][sub];
2666 // Byte 1
2667 sub = (s1 >> 4) & 0x0f;
2668 ++destatep->utf8_minicount[
2669 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_min istate)][sub])];
2670 destatep->next_utf8_ministate =
2671 kMiniUTF8State[destatep->next_utf8_ministate][sub];
2672 DCHECK((0 <= destatep->next_utf8_ministate) &&
2673 (destatep->next_utf8_ministate < 8));
2674 }
2675
2676
2677 // For the four specific byte combinations above, Latin1/CP1252 is more likely
2678 if (demotion_count > 0) {
2679 Boost(destatep, F_Latin1, kGentleOnePair * demotion_count);
2680 Boost(destatep, F_CP1252, kGentleOnePair * demotion_count);
2681 }
2682
2683 // Boost UTF8 for completed good sequences
2684 int total_boost = 2 * destatep->utf8_minicount[2] +
2685 3 * destatep->utf8_minicount[3] +
2686 4 * destatep->utf8_minicount[4];
2687 // But not so much for demoted bytes
2688 total_boost -= (3 * demotion_count);
2689
2690 total_boost *= kGentleOnePair;
2691 total_boost >>= weightshift;
2692 // Design: boost both UTF8 and UTF8UTF8 for each good sequence
2693 Boost(destatep, F_UTF8, total_boost);
2694 Boost(destatep, F_UTF8UTF8, total_boost);
2695
2696 destatep->utf8_minicount[5] += destatep->utf8_minicount[2]; // total chars
2697 destatep->utf8_minicount[5] += destatep->utf8_minicount[3]; // total chars
2698 destatep->utf8_minicount[5] += destatep->utf8_minicount[4]; // total chars
2699 destatep->utf8_minicount[2] = 0;
2700 destatep->utf8_minicount[3] = 0;
2701 destatep->utf8_minicount[4] = 0;
2702
2703 // Whack (2 bytes) for errors
2704 int error_whack = 2 * destatep->utf8_minicount[1];
2705 error_whack *= kGentlePairWhack;
2706 error_whack >>= weightshift;
2707 Whack(destatep, F_UTF8, error_whack);
2708 Whack(destatep, F_UTF8UTF8, error_whack);
2709 destatep->utf8_minicount[1] = 0;
2710
2711 return total_boost - error_whack;
2712 }
2713
2714
2715 // Boost, whack, or leave alone UTF8UTF8 probablilty
2716 //
2717 // We are looking for
2718 // (1) chars ONLY in set UTF8(0080)..UTF8(00FF), including for 80..9F the
2719 // MS CP1252 mappings, and
2720 // (2) sequences of 2 or more such characters
2721 //
2722 // If so, we could be looking at some non-7-bit encoding extra-converted
2723 // to UTF-8. The most common observed is CP1252->UTF8 twice,
2724 // 1252=>UTF8 : 1252=>UTF8
2725 // where the colon means "take those bytes and pretend that they are 1252".
2726 // We have a couple of examples of BIG5 bytes converted as though
2727 // they were 1252,
2728 // BIG5 : 1252=>UTF8
2729 //
2730 // Of course, we don't want correctly converted 1252 to be flagged here
2731 // 1252=>UTF8
2732 // So we want the input high bytes to be in pairs or longer, hence the
2733 // output UTF8 in groups of four bytes or more
2734 //
2735 // Good chars: C2xx, C3xx,
2736 // Good chars: C592, C593, C5A0, C5A1, C5B8, C5BD, C5BE, C692, CB86, CB9C
2737 // Good chars: E280xx E282AC E284A2
2738 // C2xx 1100001x 10xxxxxx (128/128)
2739 // C5xx 11000101 10xx00xx (16/4)
2740 // C5xx 11000101 10111xxx (8/3)
2741 // C692 11000110 10010010 (1/1)
2742 // CBxx 11001011 100xx1x0 (8/2)
2743 // E28x 11100010 10000xx0 (4/3)
2744 //
2745 // Returns total boost
2746 int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) {
2747 int this_pair = destatep->prior_interesting_pair[OtherPair];
2748 int startbyteoffset = this_pair * 2;
2749 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
2750 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
2751 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
2752
2753 int pair_number = this_pair;
2754 for (char* s = startbyte; s < endbyte; s += 2) {
2755 int next = destatep->next_utf8utf8_ministate;
2756 if (!ConsecutivePair(destatep, pair_number)) {
2757 // Insert two blanks into the sequence to avoid wrong splices
2758 // go back to no odd-byte offset
2759 destatep->utf8utf8_odd_byte = 0;
2760 int sub = UTF88Sub(' ', ' ');
2761 ++destatep->utf8utf8_minicount[static_cast<int>(kMiniUTF8UTF8Count[next][s ub])];
2762 next = kMiniUTF8UTF8State[next][sub];
2763 }
2764
2765 int odd = destatep->utf8utf8_odd_byte;
2766 if (s + 1 + odd >= endbyte) continue;
2767 int sub = UTF88Sub(s[0 + odd], s[1 + odd]);
2768 destatep->utf8utf8_odd_byte ^= kMiniUTF8UTF8Odd[next][sub];
2769 ++destatep->utf8utf8_minicount[
2770 static_cast<int>(kMiniUTF8UTF8Count[next][sub])];
2771 destatep->next_utf8utf8_ministate = kMiniUTF8UTF8State[next][sub];
2772 ++pair_number;
2773 }
2774
2775 // Boost for completed good sequences; each count covers two chars.
2776 // Design: boost UTF8UTF8 above UTF8 for each good sequence
2777 int total_boost = (2) * destatep->utf8utf8_minicount[2] +
2778 (2) * destatep->utf8utf8_minicount[3] +
2779 (2) * destatep->utf8utf8_minicount[4];
2780 total_boost *= kGentleOnePair;
2781 total_boost >>= weightshift;
2782 Boost(destatep, F_UTF8UTF8, total_boost);
2783
2784 // Track total characters
2785 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[2];
2786 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[3];
2787 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[4];
2788 destatep->utf8utf8_minicount[2] = 0;
2789 destatep->utf8utf8_minicount[3] = 0;
2790 destatep->utf8utf8_minicount[4] = 0;
2791
2792 // Design: Do not whack UTF8UTF8 below UTF8 for each bad sequence
2793
2794 destatep->utf8utf8_minicount[1] = 0;
2795 return total_boost;
2796 }
2797
2798
2799 // boost, whack, or leave alone UTF-32 probablilty
2800 // Expecting 0000PPxx 0000QQxx where PP mostly = QQ (UTF-32BE)
2801 // Expecting xxPP0000 xxQQ0000 where PP mostly = QQ (UTF-32LE)
2802 void CheckUTF32ActiveSeq(DetectEncodingState* destatep) {
2803 // Not needed
2804 return;
2805 }
2806
2807 // We give a gentle boost for each paired SO ... SI, whack others
2808 void CheckIso2022ActiveSeq(DetectEncodingState* destatep) {
2809 int this_pair = destatep->prior_interesting_pair[OtherPair];
2810 int startbyteoffset = this_pair * 2;
2811 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
2812 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
2813 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
2814
2815 // Initial <esc> char must precede SO/SI
2816 // HZ_GB_2312 has no alternation constraint on 1- and 2-byte segments
2817 // ISO-2022-JP (JIS) has no alternation constraint on 1- and 2-byte segments
2818 // ISO-2022-CN has no alternation constraint on 1- and 2-byte segments
2819 // ISO-2022-KR requires alternation between 1- and 2-byte segments
2820 // JIS:
2821 // <esc> ( B ISO-2022-JP [1b 28 42] SI to ASCII
2822 // <esc> ( J ISO-2022-JP [1b 28 4a] SI to X0201
2823 // <esc> $ @ ISO-2022-JP [1b 24 40] SO to X0208-78 twobyte
2824 // <esc> $ B ISO-2022-JP [1b 24 42] SO to X0208-83 twobyte
2825 for (char* s = startbyte; s < endbyte; s += 2) {
2826 if (s[0] == 0x1b) {
2827 if (s[1] == 0x24) {
2828 // <esc> $ is SO
2829 destatep->next_2022_state = SOSI_TWOBYTE; // SO to two-byte
2830 } else if (s[1] == 0x28) {
2831 if (destatep->next_2022_state == SOSI_TWOBYTE) {
2832 Boost(destatep, F_JIS, kGentlePairBoost);
2833 } else if (destatep->next_2022_state == SOSI_ONEBYTE) {
2834 Whack(destatep, F_JIS, kGentlePairWhack);
2835 }
2836 destatep->next_2022_state = SOSI_ONEBYTE; // JIS SI to one-byte
2837 } else {
2838 Whack(destatep, F_JIS, kBadPairWhack);
2839 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
2840 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
2841 destatep->next_2022_state = SOSI_ERROR; // not 2022
2842 }
2843 } else if (s[0] == 0x0e) {
2844 // <so>
2845 Whack(destatep, F_JIS, kBadPairWhack);
2846 if (destatep->next_2022_state != SOSI_NONE) {
2847 destatep->next_2022_state = SOSI_TWOBYTE; // SO to two-byte
2848 } else {
2849 // ESC required before SO/SI
2850 Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4);
2851 Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4);
2852 destatep->next_2022_state = SOSI_ERROR; // SO not after SI
2853 }
2854 } else if (s[0] == 0x0f) {
2855 // <si>
2856 Whack(destatep, F_JIS, kBadPairWhack);
2857 if (destatep->next_2022_state != SOSI_NONE) {
2858 if (destatep->next_2022_state == SOSI_TWOBYTE) {
2859 Boost(destatep, F_ISO_2022_CN, kGentlePairBoost);
2860 Boost(destatep, F_ISO_2022_KR, kGentlePairBoost);
2861 } else if (destatep->next_2022_state == SOSI_ONEBYTE) {
2862 Whack(destatep, F_ISO_2022_CN, kGentlePairWhack);
2863 Whack(destatep, F_ISO_2022_KR, kGentlePairWhack);
2864 }
2865 destatep->next_2022_state = SOSI_ONEBYTE; // SI to one-byte
2866 } else {
2867 // ESC required before SO/SI
2868 Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4);
2869 Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4);
2870 destatep->next_2022_state = SOSI_ERROR; // SI not after SO
2871 }
2872 } else if (s[0] <= 0x1f) {
2873 // Some other control code. Allow ht lf [ff] cr
2874 if ((s[0] != 0x09) && (s[0] != 0x0a) &&
2875 (s[0] != 0x0c) && (s[0] != 0x0d)) {
2876 // Otherwise these can float to the top on bad bytes
2877 Whack(destatep, F_JIS, kBadPairWhack);
2878 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
2879 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
2880 }
2881 }
2882 }
2883
2884 // If no start, keep the probability pinned at zero (or below)
2885 if (destatep->next_2022_state == SOSI_NONE) {
2886 destatep->enc_prob[F_ISO_2022_CN] =
2887 minint(0, destatep->enc_prob[F_ISO_2022_CN]);
2888 destatep->enc_prob[F_ISO_2022_KR] =
2889 minint(0, destatep->enc_prob[F_ISO_2022_KR]);
2890 destatep->enc_prob[F_JIS] =
2891 minint(0, destatep->enc_prob[F_JIS]);
2892 }
2893 }
2894
2895 // We give a gentle boost for each paired ~{ ... ~}, whack others
2896 void CheckHzActiveSeq(DetectEncodingState* destatep) {
2897 int this_pair = destatep->prior_interesting_pair[AsciiPair];
2898 int startbyteoffset = this_pair * 2;
2899 int endbyteoffset = destatep->next_interesting_pair[AsciiPair] * 2;
2900 char* startbyte = &destatep->interesting_pairs[AsciiPair][startbyteoffset];
2901 char* endbyte = &destatep->interesting_pairs[AsciiPair][endbyteoffset];
2902
2903 for (char* s = startbyte; s < endbyte; s += 2) {
2904 // Look for initial ~{ pair
2905 if ((s[0] == '~') && (s[1] == '{')) {
2906 destatep->next_hz_state = SOSI_TWOBYTE; // SO to two-byte
2907 }
2908 // Also look for closing ~} pair
2909 if ((s[0] == '~') && (s[1] == '}')) {
2910 if (destatep->next_hz_state == SOSI_TWOBYTE) {
2911 Boost(destatep, F_HZ_GB_2312, kGentlePairBoost);
2912 } else if (destatep->next_hz_state == SOSI_ONEBYTE) {
2913 Whack(destatep, F_HZ_GB_2312, kGentlePairWhack);
2914 }
2915 destatep->next_hz_state = SOSI_ONEBYTE; // SI to one-byte
2916 }
2917 }
2918
2919 // If no start, keep the probability pinned at zero (or below)
2920 if (destatep->next_hz_state == SOSI_NONE) {
2921 destatep->enc_prob[F_HZ_GB_2312] =
2922 minint(0, destatep->enc_prob[F_HZ_GB_2312]);
2923 }
2924 }
2925
2926 // We give a gentle boost after an odd number of 8Fxxxx triples, which
2927 // put subsequent bigrams out of phase until a low byte or another 8Fxxxx
2928 void CheckEucJpSeq(DetectEncodingState* destatep) {
2929 int this_pair = destatep->prior_interesting_pair[OtherPair];
2930 int startbyteoffset = this_pair * 2;
2931 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
2932 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
2933 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
2934
2935 for (char* s = startbyte; s < endbyte; s += 2) {
2936 // Boost if out of phase (otherwise, EUC-JP will score badly after 8Fxxxx)
2937 if (destatep->next_eucjp_oddphase) {
2938 //printf(" EucJp boost[%02x%02x]\n", s[0], s[1]); // TEMP
2939 Boost(destatep, F_EUC_JP, kGentlePairBoost * 2);
2940 }
2941
2942 uint8 s0 = static_cast<uint8>(s[0]);
2943 uint8 s1 = static_cast<uint8>(s[1]);
2944 // Look for phase flip at 8F
2945 if ((s0 & 0x80) == 0x00) {
2946 destatep->next_eucjp_oddphase = false;
2947 } else if (s0 == 0x8f) {
2948 destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase;
2949 }
2950 if ((s1 & 0x80) == 0x00) {
2951 destatep->next_eucjp_oddphase = false;
2952 } else if (s1 == 0x8f) {
2953 destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase;
2954 }
2955 }
2956 }
2957
2958 // Boost, whack, or leave alone BINARY probablilty
2959 // Also called if UTF 16/32 active
2960 void CheckBinaryDensity(const uint8* src, DetectEncodingState* destatep,
2961 int delta_otherpairs) {
2962 // No change if not much gathered information
2963 if (delta_otherpairs == 0) {
2964 // Only ASCII pairs this call
2965 return;
2966 }
2967 int next_pair = destatep->next_interesting_pair[OtherPair];
2968
2969 // Look at density of interesting pairs [0..src)
2970 int delta_offset = static_cast<int>(src - destatep->initial_src); // actual
2971
2972 // Look at density of interesting pairs [0..next_interesting)
2973 int low_byte = destatep->interesting_offsets[OtherPair][0];
2974 //int high_byte = destatep->interesting_offsets[OtherPair][next_pair - 1] + 2;
2975 //int byte_span = high_byte - low_byte;
2976 int byte_span = delta_offset - low_byte;
2977
2978 // If all ASCII for the first 4KB, reject
2979 // If mostly ASCII in the first 5KB, reject
2980 if ((low_byte >= kBinaryHardAsciiLimit) || (delta_offset >= kBinarySoftAsciiLi mit)) {
2981 // Not binary early enough in text
2982 Whack(destatep, F_BINARY, kBadPairWhack * 4);
2983 Whack(destatep, F_UTF_32BE, kBadPairWhack * 4);
2984 Whack(destatep, F_UTF_32LE, kBadPairWhack * 4);
2985 Whack(destatep, F_UTF_16BE, kBadPairWhack * 4);
2986 Whack(destatep, F_UTF_16LE, kBadPairWhack * 4);
2987 return;
2988 }
2989
2990 // Density 1.0 for N pairs takes 2*N bytes
2991 // Whack if < 1/16 after first non_ASCII pair
2992 if ((next_pair * 2 * 16) < byte_span) {
2993 // Not dense enough
2994 Whack(destatep, F_BINARY, kBadPairWhack * 4);
2995 Whack(destatep, F_UTF_32BE, kBadPairWhack * 4);
2996 Whack(destatep, F_UTF_32LE, kBadPairWhack * 4);
2997 Whack(destatep, F_UTF_16BE, kBadPairWhack * 4);
2998 Whack(destatep, F_UTF_16LE, kBadPairWhack * 4);
2999 }
3000
3001 if (next_pair < 8) {
3002 // Fewer than 8 non-ASCII total; too soon to boost
3003 return;
3004 }
3005
3006 // Density 1.0 for N pairs takes 2*N bytes
3007 // Boost if density >= 1/4, whack if < 1/16
3008 if ((next_pair * 2 * 4) >= byte_span) {
3009 // Very dense
3010 // Only boost if at least 2 quadrants seen
3011 if (destatep->binary_quadrants_count >= 2) {
3012 Boost(destatep, F_BINARY, kSmallInitDiff);
3013 Boost(destatep, F_UTF_32BE, kSmallInitDiff);
3014 Boost(destatep, F_UTF_32LE, kSmallInitDiff);
3015 Boost(destatep, F_UTF_16BE, kSmallInitDiff);
3016 Boost(destatep, F_UTF_16LE, kSmallInitDiff);
3017 }
3018 }
3019 }
3020
3021
3022 // Look at a number of special-case encodings whose reliable detection depends
3023 // on sequencing or other properties
3024 // AsciiPair probibilities (UTF7 and HZ) are all done here
3025 void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
3026 int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] -
3027 destatep->prior_interesting_pair[AsciiPair];
3028 int delta_otherpairs = destatep->next_interesting_pair[OtherPair] -
3029 destatep->prior_interesting_pair[OtherPair];
3030
3031 // The two pure ASCII encodings
3032 if (UTF7OrHzActive(destatep) && (delta_asciipairs > 0)) {
3033 // Adjust per pair
3034 for (int i = 0; i < delta_asciipairs; ++i) {
3035 int next_pair = destatep->prior_interesting_pair[AsciiPair] + i;
3036 uint8 byte1 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 0];
3037 uint8 byte2 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 1];
3038 if (byte1 == '+') {
3039 // Boost, whack, or leave alone UTF-7 probablilty
3040 UTF7BoostWhack(destatep, next_pair, byte2);
3041 if (destatep->debug_data != NULL) {
3042 // Show UTF7 entry
3043 char buff[16];
3044 snprintf(buff, sizeof(buff), "%02x%02x+", byte1, byte2);
3045 SetDetailsEncProb(destatep,
3046 destatep->interesting_offsets[AsciiPair][next_pair],
3047 kMostLikelyEncoding[(byte1 << 8) + byte2],
3048 buff);
3049 }
3050 } else if (byte1 == '~') {
3051 // Boost, whack, or leave alone HZ probablilty
3052 HzBoostWhack(destatep, byte1, byte2);
3053 if (destatep->debug_data != NULL) {
3054 // Show Hz entry
3055 char buff[16];
3056 snprintf(buff, sizeof(buff), "%02x%02x~", byte1, byte2);
3057 SetDetailsEncProb(destatep,
3058 destatep->interesting_offsets[AsciiPair][next_pair],
3059 kMostLikelyEncoding[(byte1 << 8) + byte2],
3060 buff);
3061 }
3062 }
3063 }
3064
3065 // Kill UTF-7 now if at least 8 + pairs and not confirmed valid UTF-7
3066 if ((destatep->utf7_starts >= 8) && (destatep->prior_utf7_offset == 0)) {
3067 Whack(destatep, F_UTF7, kBadPairWhack * 8); // flush
3068 }
3069 }
3070
3071
3072
3073 // All the other encodings
3074 if (OtherActive(destatep) && (delta_otherpairs > 0)) {
3075 // Adjust per pair
3076 int biggest_weightshift = 0;
3077 for (int i = 0; i < delta_otherpairs; ++i) {
3078 int next_pair = destatep->prior_interesting_pair[OtherPair] + i;
3079 uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0];
3080 uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1];
3081 int off = destatep->interesting_offsets[OtherPair][next_pair];
3082 int weightshift = destatep->interesting_weightshift[OtherPair][next_pair];
3083 biggest_weightshift = maxint(biggest_weightshift, weightshift);
3084
3085 if (byte1 == 0x00) {
3086 if (byte2 == 0x00) {
3087 UTF1632BoostWhack(destatep, off, byte1);
3088 } else if ((kIsPrintableAscii[byte2] != 0) && ((off & 1) != 0)) {
3089 // We have 00xx at an odd offset. Turn into preceding even offset
3090 // for possible Ascii text in UTF-16LE or UTF-32LE (vs BE)
3091 // This will cascade into caller's probability update
3092 // 00 is illegal for all other encodings, so it doesn't matter to them
3093 UTF16MakeEven(destatep, next_pair);
3094 }
3095 if (destatep->debug_data != NULL) {
3096 // Show 0000 detail entry for this bigram
3097 char buff[16];
3098 snprintf(buff, sizeof(buff), "%02x%02xZ", byte1, byte2);
3099 SetDetailsEncProb(destatep,
3100 destatep->interesting_offsets[OtherPair][next_pair],
3101 kMostLikelyEncoding[(byte1 << 8) + byte2],
3102 buff);
3103 }
3104 }
3105 if (byte1 == 0xff) {
3106 if (byte2 == 0xff) {
3107 UTF1632BoostWhack(destatep, off, byte1);
3108 }
3109 if (destatep->debug_data != NULL) {
3110 // Show FFFF detail entry for this bigram
3111 char buff[16];
3112 snprintf(buff, sizeof(buff), "%02x%02xF", byte1, byte2);
3113 SetDetailsEncProb(destatep,
3114 destatep->interesting_offsets[OtherPair][next_pair],
3115 kMostLikelyEncoding[(byte1 << 8) + byte2],
3116 buff);
3117 }
3118 }
3119 if (BinaryActive(destatep)) {
3120 BinaryBoostWhack(destatep, byte1, byte2);
3121 }
3122 } // End for i
3123
3124 // Adjust per entire-pair-span
3125 int utf8_boost = 0;
3126 int utf8utf8_boost = 0;
3127 if (UTF8Active(destatep)) {
3128 utf8_boost = CheckUTF8Seq(destatep, biggest_weightshift);
3129 }
3130
3131 if (UTF8UTF8Active(destatep)) {
3132 utf8utf8_boost = CheckUTF8UTF8Seq(destatep, biggest_weightshift);
3133 }
3134
3135 if (UTF1632Active(destatep)) {
3136 CheckUTF32ActiveSeq(destatep);
3137 }
3138
3139 if (Iso2022Active(destatep)) {
3140 CheckIso2022ActiveSeq(destatep);
3141 }
3142
3143 if (HzActive(destatep)) {
3144 CheckHzActiveSeq(destatep);
3145 }
3146
3147 if (EUCJPActive(destatep)) {
3148 CheckEucJpSeq(destatep);
3149 }
3150
3151 if (BinaryActive(destatep) || UTF1632Active(destatep)) {
3152 CheckBinaryDensity(src, destatep, delta_otherpairs);
3153 }
3154 }
3155 // ISO-2022 do OK on their own, using stright probabilities? Not on bad bytes
3156
3157 if (destatep->debug_data != NULL) {
3158 // Show sequencing result
3159 SetDetailsEncLabel(destatep, "seq");
3160 }
3161 }
3162
3163
3164 void PrintTopEnc(DetectEncodingState* destatep, int n) {
3165 // Print top n or fewer
3166 int temp_sort[NUM_RANKEDENCODING];
3167 for (int j = 0; j < destatep->rankedencoding_list_len; ++j) {
3168 int rankedencoding = destatep->rankedencoding_list[j];
3169 temp_sort[j] = destatep->enc_prob[rankedencoding];
3170 }
3171
3172 qsort(temp_sort, destatep->rankedencoding_list_len,
3173 sizeof(temp_sort[0]), IntCompare);
3174
3175 int top_n = minint(n, destatep->rankedencoding_list_len);
3176 int showme = temp_sort[top_n - 1]; // Print this value and above
3177
3178 printf("rankedencodingList top %d: ", top_n);
3179 for (int j = 0; j < destatep->rankedencoding_list_len; ++j) {
3180 int rankedencoding = destatep->rankedencoding_list[j];
3181 if (showme <= destatep->enc_prob[rankedencoding]) {
3182 printf("%s=%d ",
3183 MyEncodingName(kMapToEncoding[rankedencoding]),
3184 destatep->enc_prob[rankedencoding]);
3185 }
3186 }
3187 printf("\n\n");
3188 }
3189
3190 // If the same bigram repeats, don't boost its best encoding too much
3191 bool RepeatedBigram(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
3192 int this_bigram = (byte1 << 8) | byte2;
3193 // If 00xx 01xx 02xx ... 1fxx, take out bottom 4 bits of xx.
3194 // This ignores parts of Yahoo 0255 0254 0243 0247 0245 0243 0250 0255 ...
3195 // It may screw up UTF-16BE
3196 // It may screw up ISO-2022 (1b24 suppresses 1b28)
3197 if (byte1 < 0x20) {
3198 this_bigram &= 0xfff0;
3199 }
3200 if (this_bigram == destatep->prior_bigram[0]) {return true;}
3201 if (this_bigram == destatep->prior_bigram[1]) {return true;}
3202 if (this_bigram == destatep->prior_bigram[2]) {return true;}
3203 if (this_bigram == destatep->prior_bigram[3]) {return true;}
3204 // Round-robin replacement
3205 destatep->prior_bigram[destatep->next_prior_bigram] = this_bigram;
3206 destatep->next_prior_bigram = (destatep->next_prior_bigram + 1) & 3;
3207 return false;
3208 }
3209
3210 // Sometimes illegal bytes are used as markers between text that Javascript
3211 // is going to decode. Don't overboost the Binary encoding for markers 01-FF.
3212 // Just count first pair per 8x4 bucket
3213 bool RepeatedBinary(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
3214 int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6);
3215 uint32 bucket8x4_mask = 1 << bucket8x4;
3216 if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {
3217 destatep->binary_8x4_seen |= bucket8x4_mask;
3218 destatep->binary_8x4_count += 1;
3219 return false;
3220 }
3221 return true;
3222 }
3223
3224
3225
3226
3227 // Find current top two rankedencoding probabilities
3228 void ReRank(DetectEncodingState* destatep) {
3229 destatep->top_prob = -1;
3230 destatep->second_top_prob = -1;
3231 // Leave unchanged
3232 //destatep->top_rankedencoding =
3233 // destatep->rankedencoding_list[0]; // Just to make well-defined
3234 //destatep->second_top_rankedencoding =
3235 // destatep->rankedencoding_list[1]; // Just to make well-defined
3236 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3237 int rankedencoding = destatep->rankedencoding_list[j];
3238 if (destatep->top_prob < destatep->enc_prob[rankedencoding]) {
3239 // Make sure top 2 are in different superset groups
3240 if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] !=
3241 kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) {
3242 destatep->second_top_prob =
3243 destatep->top_prob; // old top to second
3244 destatep->second_top_rankedencoding =
3245 destatep->top_rankedencoding; // old top to second
3246 }
3247 destatep->top_prob = destatep->enc_prob[rankedencoding];
3248 destatep->top_rankedencoding = rankedencoding;
3249 } else if (destatep->second_top_prob < destatep->enc_prob[rankedencoding]) {
3250 if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] !=
3251 kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) {
3252 destatep->second_top_prob = destatep->enc_prob[rankedencoding];
3253 destatep->second_top_rankedencoding = rankedencoding;
3254 }
3255 }
3256 }
3257 }
3258
3259 void SimplePrune(DetectEncodingState* destatep, int prune_diff) {
3260 // Prune the list of active encoding families
3261 int keep_prob = destatep->top_prob - prune_diff;
3262
3263 destatep->active_special = 0;
3264 int k = 0;
3265 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3266 bool keep = true;
3267 int rankedencoding = destatep->rankedencoding_list[j];
3268
3269 // If count is too low, ditch it
3270 if (destatep->enc_prob[rankedencoding] < keep_prob) {keep = false;}
3271
3272 // Keep it. This will always keep at least top_prob rankedencoding
3273 if (keep) {
3274 destatep->active_special |= kSpecialMask[kMapToEncoding[rankedencoding]];
3275 destatep->rankedencoding_list[k++] = rankedencoding;
3276 }
3277 }
3278
3279 destatep->rankedencoding_list_len = k;
3280 }
3281
3282 // Recalculate reliable
3283 void CalcReliable(DetectEncodingState* destatep) {
3284 // Encoding result is reliable if big difference in top two, or if
3285 // only Ascii7 ever encountered
3286 // Also reliable if exactly one OtherPair and it's best encoding matches top
3287 destatep->reliable = false;
3288 if (destatep->next_interesting_pair[OtherPair] == 0) {
3289 // Only 7-bit ASCII
3290 destatep->reliable = true;
3291 return;
3292 }
3293 if ((destatep->top_prob - destatep->second_top_prob) >=
3294 FLAGS_ced_reliable_difference) {
3295 destatep->reliable = true;
3296 return;
3297 }
3298 if (destatep->next_interesting_pair[OtherPair] == 1) {
3299 uint8 byte1 = destatep->interesting_pairs[OtherPair][0];
3300 uint8 byte2 = destatep->interesting_pairs[OtherPair][1];
3301 int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
3302 if (best_enc == destatep->top_rankedencoding) {
3303 destatep->reliable = true;
3304 return;
3305 }
3306 }
3307
3308 // If we pruned to one encoding, we are done
3309 if (destatep->rankedencoding_list_len == 1) {
3310 destatep->reliable = true;
3311 destatep->done = true;
3312 return;
3313 }
3314
3315 // If we pruned to two or three encodings in the same *superset/subset
3316 // rankedencoding* and enough pairs, we are done. Else keep going
3317 if (destatep->rankedencoding_list_len == 2) {
3318 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
3319 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
3320 if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) {
3321 if (destatep->prune_count >= 3) {
3322 destatep->reliable = true;
3323 destatep->done = true;
3324 return;
3325 }
3326 }
3327 } else if (destatep->rankedencoding_list_len == 3) {
3328 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
3329 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
3330 Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]];
3331 Encoding base0 = kMapEncToBaseEncoding[enc0];
3332 Encoding base1 = kMapEncToBaseEncoding[enc1];
3333 Encoding base2 = kMapEncToBaseEncoding[enc2];
3334
3335 if ((base0 == base1) && (base0 == base2)) {
3336 if (destatep->prune_count >= 3) {
3337 destatep->reliable = true;
3338 destatep->done = true;
3339 return;
3340 }
3341 }
3342 }
3343
3344 }
3345
3346
3347 // Find current top two rankedencoding probabilities
3348 void FindTop2(DetectEncodingState* destatep,
3349 int* first_renc, int* second_renc,
3350 int* first_prob, int* second_prob) {
3351 *first_prob = -1;
3352 *second_prob = -1;
3353 *first_renc = 0;
3354 *second_renc = 0;
3355 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3356 int rankedencoding = destatep->rankedencoding_list[j];
3357 if (*first_prob < destatep->enc_prob[rankedencoding]) {
3358 *second_prob = *first_prob; // old top to second
3359 *second_renc = *first_renc; // old top to second
3360 *first_prob = destatep->enc_prob[rankedencoding];
3361 *first_renc = rankedencoding;
3362 } else if (*second_prob < destatep->enc_prob[rankedencoding]) {
3363 *second_prob = destatep->enc_prob[rankedencoding];
3364 *second_renc = rankedencoding;
3365 }
3366 }
3367 }
3368
3369
3370 void PrintRankedEncodingList(DetectEncodingState* destatep, const char* str) {
3371 printf("Current ranked encoding list %s\n", str);
3372 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3373 int rankedencoding = destatep->rankedencoding_list[j];
3374 if ((rankedencoding < 0) || (rankedencoding > NUM_RANKEDENCODING)) {
3375 printf(" [%d] BOGUS rankedencoding = %d\n", j, rankedencoding);
3376 } else {
3377 printf(" [%d] rankedencoding = %d %-12.12s enc_prob = %d\n",
3378 j, rankedencoding, MyRankedEncName(rankedencoding),
3379 destatep->enc_prob[rankedencoding]);
3380 }
3381 }
3382 printf("End current ranked encoding list\n\n");
3383 }
3384
3385
3386
3387
// Map unencoded bytes down to five bits, largely preserving letters
// This design struggles to put 33 values into 5 bits.
// The table is indexed by the raw byte value (0x00-0xFF), 16 entries per row.
// ASCII letters map to 1..26 regardless of case; HU shares value 30 with HO
// so that everything fits in 0..31.
// The helper macros below exist only while the table is being defined and
// are #undef'd immediately afterwards.
#define XX 0    // Punctuation (00-7F range); also used below for the
                // 00-3F rows (control bytes, punctuation, digits)
#define HA 27   // High vowel a in Latin1/2/sometimes7
#define HE 28   // High vowel e
#define HI 29   // High vowel i
#define HO 30   // High vowel o
#define HU 30   // High vowel u on top of HO
#define Hc 31   // High consonant (80-FF range)
static const char kMapToFiveBits[256] = {
  XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX,    // 0x00-0x0F
  XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX,    // 0x10-0x1F
  XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX,    // 0x20-0x2F
  XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX,    // 0x30-0x3F

  XX, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,     // 0x40-0x4F  @ A-O
  16,17,18,19,20,21,22,23, 24,25,26,XX,XX,XX,XX,XX,    // 0x50-0x5F  P-Z ...
  XX, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,     // 0x60-0x6F  ` a-o
  16,17,18,19,20,21,22,23, 24,25,26,XX,XX,XX,XX,XX,    // 0x70-0x7F  p-z ...

  Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,    // 0x80-0x8F
  Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,    // 0x90-0x9F
  Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,    // 0xA0-0xAF
  Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,    // 0xB0-0xBF

  Hc,HA,HA,HA,HA,Hc,Hc,Hc, Hc,HE,HE,HE,HI,HI,HI,Hc,    // 0xC0-0xCF
  Hc,Hc,Hc,HO,HO,HO,HO,Hc, Hc,HU,HU,HU,HU,Hc,Hc,Hc,    // 0xD0-0xDF
  Hc,HA,HA,HA,HA,Hc,Hc,Hc, Hc,HE,HE,HE,HI,HI,HI,Hc,    // 0xE0-0xEF
  Hc,Hc,Hc,HO,HO,HO,HO,Hc, Hc,HU,HU,HU,HU,Hc,Hc,Hc,    // 0xF0-0xFF

};
#undef XX
#undef HA
#undef HE
#undef HI
#undef HO
#undef HU
#undef Hc
3426
// Two-bit trigram classifications. BoostLatin127Trigrams compares the value
// returned by TrigramValue against these to decide which encodings to boost.
static const int kTriNoneLikely = 0;    // Dont-care: no boost
static const int kTriLatin1Likely = 1;  // Latin1 likely
static const int kTriLatin2Likely = 2;  // Latin2 likely
static const int kTriLatin7Likely = 3;  // Latin7 (ISO-8859-13) likely
3431
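// Each 64-bit table entry packs 32 two-bit values; the five-bit class of
// byte[2] (via kMapToFiveBits) selects which two bits to read.
// The entry subscript is built from the five-bit classes of byte[0] (high
// five bits) and byte[1] (low five bits), giving 1024 entries.
// Latin1/2/7 boost vector, generated 2007.09.26 by postproc-enc-detect-short.cc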
3435 static const uint64 kLatin127Trigrams[1024] = {
3436 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,
3437 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,
3438 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,
3439 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,
3440 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,
3441 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,
3442 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,
3443 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,
3444 0x0000000000000000ULL, 0x304080c0402c3330ULL, 0x0008400004000000ULL, 0x082800000 c200000ULL,
3445 0x23a0000420800030ULL, 0x00000000000ccc00ULL, 0x0500100100100000ULL, 0x038840000 0200010ULL,
3446 0x0000000000000c00ULL, 0xd0f0300740f0cf00ULL, 0x2aa0a2a22882a2acULL, 0x081d80000 0000080ULL,
3447 0x0c82000020000000ULL, 0x200a03c000a00000ULL, 0x0008400400290000ULL, 0x040087000 0000000ULL,
3448 0x00f040c00000c080ULL, 0x0008004000000410ULL, 0x0020300000000030ULL, 0x00a030002 c300000ULL,
3449 0x0c8030c020a00000ULL, 0x15410030f0f4c000ULL, 0x3000000300a00000ULL, 0xa2880980a 0880a88ULL,
3450 0x0900300000000000ULL, 0x0000040100300000ULL, 0x0888820020a00000ULL, 0xc04400224 2010000ULL,
3451 0x000000121d300040ULL, 0x40100040440c0d54ULL, 0x00008423102f8144ULL, 0x0b4080840 0000280ULL,
3452 0x0000000000000000ULL, 0x0680a000000c0000ULL, 0x0880008020aa0000ULL, 0x2aaa01410 10a4940ULL,
3453 0xcb80000000010000ULL, 0x2280000000000000ULL, 0x5248000001800000ULL, 0x800040100 4040010ULL,
3454 0x1540010201001010ULL, 0x0080080400000000ULL, 0x5a00044040000108ULL, 0x028800028 2080008ULL,
3455 0x4800008002200000ULL, 0x4a00000000010100ULL, 0x8a88040080000800ULL, 0x014080000 0000400ULL,
3456 0x40010050000c0000ULL, 0x0000008000000000ULL, 0x0028000020140040ULL, 0x862040140 1005308ULL,
3457 0xc082000000000400ULL, 0x05c0b004c0240600ULL, 0x0288000080000000ULL, 0x000001400 0000000ULL,
3458 0x00000000040000c0ULL, 0x8001861008004280ULL, 0x0200000000000300ULL, 0x000024024 2288620ULL,
3459 0x801000c05434c200ULL, 0x9020162040a2d2b4ULL, 0x0021840000240704ULL, 0x2a8028008 0084908ULL,
3460 0x0000000000000000ULL, 0x0500004000000040ULL, 0x0080000000040000ULL, 0x010805810 4440000ULL,
3461 0x0900000000040000ULL, 0x00c0000000208008ULL, 0x2000005000000000ULL, 0x008000000 0050000ULL,
3462 0x0808000000001080ULL, 0x9880810100308000ULL, 0x2285480080081a08ULL, 0x8a8000008 0080000ULL,
3463 0x1450000000600010ULL, 0x2210000100000000ULL, 0x8a88000100011000ULL, 0x154180400 0000010ULL,
3464 0xc084011140040100ULL, 0x0000000000000800ULL, 0x0400000000000030ULL, 0x2a800000a 0890128ULL,
3465 0x1140a00054000104ULL, 0x1440000101200404ULL, 0x028800400400d800ULL, 0x000000000 0000000ULL,
3466 0x0000000000002330ULL, 0x0020820228a02280ULL, 0xa2888a02aa8008a8ULL, 0xd0040a004 4202500ULL,
3467 0x8000044104a29424ULL, 0xc000100178b2c5b4ULL, 0x0000810100241504ULL, 0xd04003000 0380008ULL,
3468 0x0000000000000000ULL, 0x26c08c0000200130ULL, 0x4a08000110080000ULL, 0x2aa000400 1080800ULL,
3469 0x0aac000000004000ULL, 0x2000000000200000ULL, 0x4240000100020000ULL, 0x410000008 0000000ULL,
3470 0x4900040000000000ULL, 0x0800000400300040ULL, 0x6a80000000040800ULL, 0x2a0818200 0588008ULL,
3471 0x0a00000c81000008ULL, 0x0a000c0010000000ULL, 0x8a88001080280808ULL, 0x002000020 0300600ULL,
3472 0xaac00000900a0000ULL, 0x0000100004000000ULL, 0x0020081020000000ULL, 0x822010501 0084110ULL,
3473 0x4a80800000004000ULL, 0x050000c0c0200000ULL, 0x288c000084000000ULL, 0xa04808228 0000000ULL,
3474 0x0000000000000000ULL, 0x8000900000032080ULL, 0xee889e81b8880820ULL, 0xc2200a814 2800424ULL,
3475 0xc020141543361010ULL, 0x10a000204a801634ULL, 0x3a808800802a00a0ULL, 0x28808b008 03d0800ULL,
3476 0x0000000000000000ULL, 0x0020000000000030ULL, 0x0808400121010040ULL, 0x0c2824010 0200040ULL,
3477 0x2008200028800000ULL, 0xc10004c80f30c030ULL, 0x0400440114100000ULL, 0x220820028 0a22220ULL,
3478 0x0600000030c01000ULL, 0x1201001040c00000ULL, 0x0aa02ea22aa22aa0ULL, 0x300080000 00200a0ULL,
3479 0x20c8400400800000ULL, 0x08280b0420800000ULL, 0x0800100000210000ULL, 0x10000300c 0100400ULL,
3480 0xc8c0000420000000ULL, 0x1000000010000000ULL, 0x0420000400000000ULL, 0x022000050 0204000ULL,
3481 0x2200000420000000ULL, 0x0000540400000000ULL, 0x0000000020000000ULL, 0x00080c00a 0810080ULL,
3482 0x1540000000043000ULL, 0x0000000000100000ULL, 0x2e88a22220200a20ULL, 0xc06030e34 ea503a0ULL,
3483 0x0001100204048500ULL, 0x000000e0000c0d54ULL, 0x3000820310a31400ULL, 0x13088c032 0e00280ULL,
3484 0x0000000000000000ULL, 0x0480000000200000ULL, 0x4000200100000000ULL, 0x000030004 0040000ULL,
3485 0x4400000000000000ULL, 0x0401000002240000ULL, 0x0540000000040000ULL, 0x400401000 0000000ULL,
3486 0x4001111001100000ULL, 0x2880000000300040ULL, 0x4040004040002404ULL, 0x020000000 0000000ULL,
3487 0x0140040000100000ULL, 0x4040010040040080ULL, 0x0a00140000041004ULL, 0x0000a0040 0808000ULL,
3488 0x1010200000430040ULL, 0x0010000000000000ULL, 0x0540000000104000ULL, 0x140011400 5000000ULL,
3489 0x0000204000440010ULL, 0x0500000000004400ULL, 0x4500000018000400ULL, 0x000040000 0000000ULL,
3490 0x000000300000cc00ULL, 0x0100001011300000ULL, 0x0040000000000000ULL, 0xc0e000024 8a00444ULL,
3491 0x0000040020340144ULL, 0x0000046445105454ULL, 0x32a0a80280880128ULL, 0x088004000 0100100ULL,
3492 0x0000000000000000ULL, 0x14003000030c0004ULL, 0x4a04001100000000ULL, 0x0a0010801 0000000ULL,
3493 0x28a8004000200248ULL, 0x0100040000b00000ULL, 0x42000000000008c0ULL, 0x600804401 0550010ULL,
3494 0x0800401000010400ULL, 0x080080040cf80000ULL, 0x5080000001001010ULL, 0x2a8010000 0000000ULL,
3495 0xcc8010010d401100ULL, 0x0200000001001000ULL, 0x0480001004001000ULL, 0x8d0080004 0b40210ULL,
3496 0x6200800000300000ULL, 0x0000010000000000ULL, 0x0428004100010000ULL, 0x432010514 1501100ULL,
3497 0xe28c0000000c1000ULL, 0xd5c000c3c0e00300ULL, 0x0001000000100200ULL, 0x100401020 2400008ULL,
3498 0x0000000000003000ULL, 0x2aa038a0800aab08ULL, 0x2a88038000000000ULL, 0xc22004024 2f09720ULL,
3499 0x8020200200ba0420ULL, 0x0020106105101004ULL, 0x0480800000220400ULL, 0x228010008 0000008ULL,
3500 0x0000000000000000ULL, 0x9000000000200000ULL, 0x0001000000100000ULL, 0x2aa40c000 0080800ULL,
3501 0x0040000040010000ULL, 0x0040000000c01000ULL, 0x4000000040000400ULL, 0x000000100 0200000ULL,
3502 0x0000010000000000ULL, 0x05808004000c0000ULL, 0x50400c0000000400ULL, 0x020040008 f000040ULL,
3503 0x0800000000100000ULL, 0x0000000000000000ULL, 0x0a08440000004000ULL, 0x006400040 0008200ULL,
3504 0x0010010010034170ULL, 0x0000000010000000ULL, 0x0100204021000000ULL, 0x022000d00 0010100ULL,
3505 0x0840300000c00000ULL, 0x1400000040204400ULL, 0x09800c0040000000ULL, 0x020970800 0000000ULL,
3506 0x000000000000c040ULL, 0x90000c50204040a0ULL, 0x0000000000000000ULL, 0x00e150004 0200004ULL,
3507 0x8020260540204494ULL, 0x0020026150201054ULL, 0x0281800380105634ULL, 0x088490048 1105000ULL,
3508 0x0000000000000000ULL, 0x84203c00002c0200ULL, 0xc089040000000000ULL, 0xc2a810004 0200004ULL,
3509 0xe00c1c0000000000ULL, 0x0ce1330080200080ULL, 0x0000000000200000ULL, 0xc40011000 0404010ULL,
3510 0x0088400000000000ULL, 0x00083cc00c00c00cULL, 0xcac01c00c000580cULL, 0xe300b0f00 0100000ULL,
3511 0x0300000000000000ULL, 0xc0000f0000000000ULL, 0xc3c01c0400000000ULL, 0x81008004c 0f40000ULL,
3512 0xc3d8003000000440ULL, 0x0000000000000000ULL, 0xc430000000000000ULL, 0x006000000 0001000ULL,
3513 0x0800000000000000ULL, 0x00c03300f0fc0008ULL, 0x3000000400200010ULL, 0xa2a80892a 0880a28ULL,
3514 0x0500000040000004ULL, 0x0000000000000000ULL, 0xc80032070c200020ULL, 0x022082006 0a296a0ULL,
3515 0x802084021db486a0ULL, 0x00000d60080c0080ULL, 0xb281803313a32428ULL, 0x180830032 0300000ULL,
3516 0x0000000000000000ULL, 0x85208cc0ccac1f20ULL, 0x2081000186100808ULL, 0x22a808800 00a0808ULL,
3517 0xaaa8086880000000ULL, 0x802084800a2e9200ULL, 0xa280000000002008ULL, 0xa00000008 0080400ULL,
3518 0x2080010000000008ULL, 0x802020c00c028c80ULL, 0x2080000000140810ULL, 0x2a8008608 0080008ULL,
3519 0x2a800000a8000800ULL, 0xaa881800a2080800ULL, 0xaa98004080280808ULL, 0x004483d0c 0300000ULL,
3520 0xa280002080080000ULL, 0x0000000000300000ULL, 0x22a1030000000008ULL, 0xa8a030108 8880880ULL,
3521 0xaa80002080222808ULL, 0x85400c03fc030400ULL, 0x8a88000000000008ULL, 0xa00800801 0080008ULL,
3522 0x0000000000010000ULL, 0x0040100000301040ULL, 0x28800000a0002008ULL, 0x122482306 cbc0eacULL,
3523 0x8020224222b8c6a0ULL, 0x802002004a82c284ULL, 0x0aa08fc440a41c80ULL, 0x888080d18 1385098ULL,
3524 0x0000000000000000ULL, 0x00c0b000000c0080ULL, 0x2208001000000800ULL, 0x0a2800000 0200000ULL,
3525 0x0000000300000000ULL, 0x00c1040000200000ULL, 0x0203020000000000ULL, 0x024800000 0020000ULL,
3526 0x0000840000100000ULL, 0x0a808c00c000008cULL, 0x5200040040000004ULL, 0x02000c000 00080a0ULL,
3527 0x0b0c000020000000ULL, 0x0b04000001000000ULL, 0x088c0010002000c0ULL, 0x80e08b00c 0030c20ULL,
3528 0x0280000200014040ULL, 0x0000000000000000ULL, 0x0e20a0a008000020ULL, 0x0e280fd03 f00111cULL,
3529 0x200080c020001000ULL, 0x8cc00c02c02f0400ULL, 0x480c0001000c404cULL, 0x020801428 1080808ULL,
3530 0x000000000000fcfcULL, 0x004403300cf00030ULL, 0x2200000000004400ULL, 0x02202000c 08c0c20ULL,
3531 0x02202022683a80a0ULL, 0x4020228028008c00ULL, 0x32208cc0002c0200ULL, 0x3ec00c008 0304008ULL,
3532 0x0000000000000000ULL, 0x34000c00002c0000ULL, 0x0b00000100100030ULL, 0x082301800 0000000ULL,
3533 0x0e8c001c01e00000ULL, 0x1200800600330000ULL, 0x4000110000000000ULL, 0x008000030 0000000ULL,
3534 0x0800000000000000ULL, 0x08c08c04000c0000ULL, 0x0080400000880000ULL, 0x0a0800008 0c00008ULL,
3535 0x0800000304400000ULL, 0x0208000000c00000ULL, 0x2888300080400800ULL, 0x8dc020440 0000000ULL,
3536 0xc0000000c0800000ULL, 0x0000c10000000000ULL, 0x24000c4010c00000ULL, 0x272000541 d811000ULL,
3537 0x0200400000001000ULL, 0x0400000400001004ULL, 0xc08c007004001000ULL, 0x204800400 0000000ULL,
3538 0x000000000003fcfcULL, 0x2aa030000cf8c800ULL, 0xe280000000000000ULL, 0x0a2100814 2000340ULL,
3539 0x0021002000b61040ULL, 0x800004064006d444ULL, 0x3aa0800300230008ULL, 0x0b0003000 0300000ULL,
3540 0x0000000000000000ULL, 0x01c080000000040cULL, 0x0100000000004000ULL, 0x0aa801801 0001000ULL,
3541 0x0800000000100000ULL, 0x3000000000008c00ULL, 0x5400000013000000ULL, 0x02c0c0000 4004010ULL,
3542 0x5241100010000c00ULL, 0x0e00080000000808ULL, 0x5281000000000800ULL, 0x0a0810802 0000800ULL,
3543 0x0a80000000005210ULL, 0x0100000041000000ULL, 0x2a88000002080110ULL, 0x852080000 0c00080ULL,
3544 0x01000010108c0100ULL, 0x0000000000000000ULL, 0x42a0420080000000ULL, 0x002000100 4010010ULL,
3545 0xc4000000000c0000ULL, 0x01000c00c0200400ULL, 0x4600000100000000ULL, 0x000000000 0000000ULL,
3546 0x0010001000000010ULL, 0x910400900820d030ULL, 0x2280000000000000ULL, 0xc22120044 00040e4ULL,
3547 0x8001000000b61420ULL, 0xa00002a248e810b4ULL, 0x32008000002c0008ULL, 0x0c0100348 03c5010ULL,
3548 0x0000000000000000ULL, 0x85008002002c0000ULL, 0x0204001000004010ULL, 0x012000800 0200000ULL,
3549 0x000010000c2000c0ULL, 0xccc0000000200000ULL, 0x0400000c00100040ULL, 0x000330010 0004100ULL,
3550 0x4000551040000004ULL, 0x0e0080000c820808ULL, 0xc000000000080800ULL, 0xc80300000 0000000ULL,
3551 0x0a4000c000200000ULL, 0x0040000000c00000ULL, 0x0918145000405000ULL, 0x81400000c 0300400ULL,
3552 0x0050000000000000ULL, 0xd000045000000000ULL, 0x0400004000400000ULL, 0x042010401 0000110ULL,
3553 0x0700000000203000ULL, 0x34800300c0e00704ULL, 0x4440100044000400ULL, 0x004000004 0000000ULL,
3554 0x0030000044000000ULL, 0xeaaca0008808c880ULL, 0x0a01000000200000ULL, 0x1220a3004 03ccf20ULL,
3555 0x002024c200b61044ULL, 0x802014346aa2d434ULL, 0x30008c00c0820c44ULL, 0x0a0000000 00c4800ULL,
3556 0x0000000000000000ULL, 0x0000404000340c90ULL, 0x08a8a10820800280ULL, 0x812800902 2201000ULL,
3557 0x0020808228a000a0ULL, 0x0020400100410000ULL, 0x0400000110000000ULL, 0xa60900000 0200000ULL,
3558 0x8008330000d00000ULL, 0x8060100040404010ULL, 0xeaa00ea0ea00808cULL, 0x200c8020a 0000020ULL,
3559 0x0408800020200000ULL, 0x0189001403200000ULL, 0xc00800000000c000ULL, 0x200430c00 c300000ULL,
3560 0x0100300100004000ULL, 0x0000040000000000ULL, 0x2420000400001000ULL, 0x89a120040 0000000ULL,
3561 0x20c8a000208c0000ULL, 0x8080000000000000ULL, 0x28a0108020210080ULL, 0xa2a84800a 0880988ULL,
3562 0x258008000400c000ULL, 0x0140000000100000ULL, 0xa028a222a0aa0228ULL, 0xc06001205 4044040ULL,
3563 0x0010010400000000ULL, 0x00000050150c0114ULL, 0x0000008010c20010ULL, 0xaa088000a 0200880ULL,
3564 0x0000000000000000ULL, 0x0700b0c0000c0000ULL, 0x2200040000080030ULL, 0x2aa880804 0240800ULL,
3565 0x08b0500000000100ULL, 0x1000830400200000ULL, 0x4204000010000000ULL, 0x40c220005 0040050ULL,
3566 0x0104404001010000ULL, 0x1a808c8103c00030ULL, 0x30900010c0000b00ULL, 0x200812b28 3000008ULL,
3567 0x000c000020e00000ULL, 0x2140000000400000ULL, 0x0288000080200000ULL, 0x8060a200c 8a20280ULL,
3568 0x0400114010215000ULL, 0x0000000000000000ULL, 0x082b200002000010ULL, 0x22a003000 0031000ULL,
3569 0x008100001000000cULL, 0x05400c00c0230400ULL, 0xca3000003c080100ULL, 0x000000002 0000004ULL,
3570 0x0000000100000000ULL, 0x8004320813f5c000ULL, 0xa280080200000800ULL, 0xc22000044 e334c20ULL,
3571 0x000004146e361024ULL, 0x800126806aa0d584ULL, 0xb000a0040023c41cULL, 0x0a0830008 03053d8ULL,
3572 0x0000000000000000ULL, 0x0000100000020000ULL, 0x0000000010000010ULL, 0x000000004 5040004ULL,
3573 0x0000000000100000ULL, 0x0000020400000010ULL, 0x0003015000000000ULL, 0x040000000 0000000ULL,
3574 0x0000000400000000ULL, 0x0100000000000800ULL, 0x0000001000000000ULL, 0x000000000 0000000ULL,
3575 0x0000000040000000ULL, 0x0000000000000000ULL, 0x0004001000000000ULL, 0x000800100 0000000ULL,
3576 0x0010000000000004ULL, 0x0000010100001000ULL, 0x0004000000000004ULL, 0x000001404 0050014ULL,
3577 0x0014000000000040ULL, 0x5540000000041000ULL, 0x0000000000000000ULL, 0x000004000 0000d00ULL,
3578 0x0000000000000000ULL, 0x0000000000100000ULL, 0x0001000000000000ULL, 0x000000000 0000000ULL,
3579 0x0000000000000000ULL, 0x0000000000000000ULL, 0x4500000000040400ULL, 0x000080000 0000400ULL,
3580 0x0000000000000000ULL, 0x13e080000020000cULL, 0xcf00001005100000ULL, 0x04a800800 0200300ULL,
3581 0x00280100100000c0ULL, 0x1c8c000040200000ULL, 0x0600005000100000ULL, 0x050800000 c104000ULL,
3582 0x4c10101000110000ULL, 0x0c00000000300000ULL, 0x22040c00100000c0ULL, 0x080070001 0100000ULL,
3583 0x0000000000001000ULL, 0x0a08000010000040ULL, 0x0800034004210010ULL, 0x04e000040 0000000ULL,
3584 0x0800030020000000ULL, 0x0000005000000000ULL, 0x0400110101304110ULL, 0x042800001 0a01000ULL,
3585 0x060b000000800010ULL, 0x35810c00c020c000ULL, 0x00800c4321800000ULL, 0x420808802 0000080ULL,
3586 0x040000111003ff00ULL, 0x0020900020202080ULL, 0x22888180a8000888ULL, 0x022520054 2005420ULL,
3587 0x2020040400340020ULL, 0x10300424500cc444ULL, 0x3081a00400e00200ULL, 0x33001300c 0300000ULL,
3588 0x0000000000000000ULL, 0x04003c0000000000ULL, 0x0a04001000100100ULL, 0x140800000 1000000ULL,
3589 0x1800000044100000ULL, 0x3400040400000300ULL, 0x5000040801000040ULL, 0x408840104 0000040ULL,
3590 0x1010110130100000ULL, 0xca800c3000300000ULL, 0x5a01000000080100ULL, 0x020280000 cd01300ULL,
3591 0x0302000410200010ULL, 0x0000102000300000ULL, 0x0b09000000000000ULL, 0x20008004c 4800004ULL,
3592 0x28c0410010000000ULL, 0x0004015041000050ULL, 0x0a01006000200200ULL, 0x0020d0000 0100040ULL,
3593 0x0010a00100900000ULL, 0x3500bf00c0030300ULL, 0x080c010000200d00ULL, 0x224800000 4020010ULL,
3594 0x0000c00000000000ULL, 0x8044b00200e08000ULL, 0xaaa82aa2aa8a2aa8ULL, 0x022000224 1c08604ULL,
3595 0x4200260440328444ULL, 0x68001226103008b4ULL, 0x3a0080c0b0000400ULL, 0x2a8048048 03c4008ULL,
3596 0x0000000000000000ULL, 0x04008c0300000400ULL, 0x008000c0000c0000ULL, 0x088001000 000001cULL,
3597 0x0840000001000010ULL, 0x0400000000200c00ULL, 0x4244000101040000ULL, 0x423800701 1100000ULL,
3598 0x1000d00100000010ULL, 0x1d00800400300000ULL, 0x4204080c00000000ULL, 0x2a8808008 0000008ULL,
3599 0x08001c0200001000ULL, 0x0a00000400000000ULL, 0x8a88003080080000ULL, 0x052180040 0300000ULL,
3600 0x3200051000201000ULL, 0x0000000000000000ULL, 0x0020801404000000ULL, 0x322010401 c0c101cULL,
3601 0x0c01100013000000ULL, 0x04003000c0204000ULL, 0x088c0020a0cc0000ULL, 0x220000008 0000018ULL,
3602 0x0404000044000000ULL, 0x82a0b000008820b0ULL, 0x0000040020440000ULL, 0xc26500044 03f1420ULL,
3603 0x0021340241b64464ULL, 0x8020040242c2d474ULL, 0x32018c0480288000ULL, 0x00800b008 0300000ULL,
3604 0x0000000000000000ULL, 0x05008c0000040130ULL, 0xc0d8000000800000ULL, 0x002000002 0200200ULL,
3605 0x23a2000120204000ULL, 0x5052100550104150ULL, 0x1000101100040000ULL, 0xc40001c30 1000000ULL,
3606 0x8288000000c00000ULL, 0x5150040144d01404ULL, 0xea8c0ea028ae088cULL, 0xc31010c00 0000c80ULL,
3607 0x0002000060000000ULL, 0xc80800f030000000ULL, 0x0000000400300000ULL, 0xc00080c00 ff0c344ULL,
3608 0x00080001200c0000ULL, 0x0000050080000000ULL, 0x0328000300300000ULL, 0x082030000 cc01040ULL,
3609 0xeb08800100004000ULL, 0x8030003300c80f00ULL, 0xfb0d0000e4ac0000ULL, 0x002000608 0000008ULL,
3610 0x0500100100040000ULL, 0x1140000000000000ULL, 0xcb883330a0e00000ULL, 0xc00001005 0000080ULL,
3611 0x0010104005b54150ULL, 0x40111d5155001554ULL, 0x80000070140f0004ULL, 0x0b0830c3a 0003380ULL,
3612 0x0000000000000000ULL, 0x04c13000000f830cULL, 0x2808000000000000ULL, 0x281000000 0000800ULL,
3613 0x08c0080004400000ULL, 0x04c0240300801c20ULL, 0x4040000080000004ULL, 0x000040010 0100010ULL,
3614 0x020001008000c0c0ULL, 0x1d008c000c3c0000ULL, 0x0080003000000800ULL, 0x228808008 0000008ULL,
3615 0x0a84004020220000ULL, 0x0800080000100000ULL, 0xaa80004080400008ULL, 0x802400040 0c01660ULL,
3616 0x80841c2001000104ULL, 0x0001000000000000ULL, 0x0020028020020280ULL, 0x086040401 1900100ULL,
3617 0xec80080200000000ULL, 0x010103c100200400ULL, 0x0200004000000000ULL, 0x000000000 0400400ULL,
3618 0x000010000003fcfcULL, 0x8040083238c20000ULL, 0x08800220a0920a00ULL, 0x082100044 83c0c24ULL,
3619 0xc020240740b0a200ULL, 0x802006014a201494ULL, 0x3201233070ac0e00ULL, 0x080028060 33a48a0ULL,
3620 0x0000000000000000ULL, 0x8020820028a00680ULL, 0x2000002000000104ULL, 0x22a808011 00a0808ULL,
3621 0xa2a8002080000000ULL, 0xa000800008a08000ULL, 0x0000100000400000ULL, 0x800000210 0000000ULL,
3622 0x0000010000004404ULL, 0xa2a0088080000888ULL, 0x0000000010400800ULL, 0xa28008208 0080008ULL,
3623 0x2280000080010008ULL, 0x2000000000000000ULL, 0x228800008c080808ULL, 0x802182800 2a98200ULL,
3624 0xa200002000080000ULL, 0x0000040000000000ULL, 0x22a0000080000000ULL, 0x202882c20 0800080ULL,
3625 0xa000000001004000ULL, 0x000000c808a00600ULL, 0x0000000010000000ULL, 0x000001000 000040cULL,
3626 0x0000000000000000ULL, 0x802002a2a8aa82a0ULL, 0x20000024a8088228ULL, 0x802082000 1000000ULL,
3627 0x8020000000808280ULL, 0x8000000000000000ULL, 0x0020800000200280ULL, 0x208008228 0a00888ULL,
3628 0x0000000000000000ULL, 0x0000015000000040ULL, 0x0000040000040000ULL, 0x010001001 0001000ULL,
3629 0x0000003210008000ULL, 0x0000000404000000ULL, 0x0000000000000400ULL, 0x020000000 0000000ULL,
3630 0x0000000000000100ULL, 0x5180014400004050ULL, 0x1000000014000000ULL, 0x420000000 0000000ULL,
3631 0x0040200000000000ULL, 0x0201004000000000ULL, 0x0a00000000000010ULL, 0x004020000 0800000ULL,
3632 0x0040051000000500ULL, 0x0000000100800400ULL, 0x6000000000000000ULL, 0x000000000 0000000ULL,
3633 0x280000c1400040ccULL, 0x4180001000000000ULL, 0x00000000c1000104ULL, 0x000000000 0000000ULL,
3634 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0080000000c00000ULL, 0x000400606 6004000ULL,
3635 0x0000005000040440ULL, 0x0000106005804044ULL, 0x0000a10511004440ULL, 0x000000000 0000110ULL,
3636 0x0000000000000000ULL, 0x0000000000080000ULL, 0xeb0808a020800080ULL, 0x29a800810 02a1800ULL,
3637 0x0b2c000202100100ULL, 0x0001000000888000ULL, 0x2280102010000000ULL, 0x020000602 a004110ULL,
3638 0x8a800160a6108100ULL, 0x0280000000000020ULL, 0x8a8000a0a8808208ULL, 0x028088208 0500308ULL,
3639 0x0b18010020804100ULL, 0xeb080000c0080080ULL, 0x2b08000000810130ULL, 0x000000000 8040020ULL,
3640 0xaa0a08e082894140ULL, 0x0000000000000000ULL, 0x202081409010001cULL, 0x8aa880508 2806000ULL,
3641 0xeb082900289c0000ULL, 0x0000000000008000ULL, 0xf80c2e20002e0000ULL, 0xa28808042 0880888ULL,
3642 0x0000010000000000ULL, 0x0000000000102000ULL, 0x22880000a8a80808ULL, 0x022022a22 aa880a0ULL,
3643 0x0000222222aa0620ULL, 0x0000022002800000ULL, 0x208080004028a000ULL, 0x2b8888008 01c0828ULL,
3644 0x0000000000000000ULL, 0x22e0828280a08028ULL, 0xaa88002082080308ULL, 0x0ea800804 10a0040ULL,
3645 0x2a28222000a00000ULL, 0x8aa2808028a0a2a0ULL, 0x0200001000000000ULL, 0x82080000a 0000000ULL,
3646 0x8800000082000808ULL, 0x2a008a0000300888ULL, 0x0a80080080080808ULL, 0xaa8828008 40b0808ULL,
3647 0x0a80000080000040ULL, 0xea080820a0000000ULL, 0xaa88080080080808ULL, 0x8040a2800 a8024a0ULL,
3648 0xaa800020a0080808ULL, 0x0000040000000000ULL, 0x2a280a0080080880ULL, 0x2a2008108 0008a00ULL,
3649 0x2a88882088aa0008ULL, 0x81800202c0a01480ULL, 0xea88082082200000ULL, 0xaa8800208 0080008ULL,
3650 0x0000100000000000ULL, 0x802082a22aa0a2a0ULL, 0x2e80000000000000ULL, 0x0220a2a26 aa0a2a8ULL,
3651 0x800022a2228a22a0ULL, 0x880002212e82c0b0ULL, 0x02a0aa0002a82228ULL, 0x2d808b008 0380008ULL,
3652 0x0000000000000000ULL, 0x000407551c154244ULL, 0x2a00208088a02228ULL, 0x12a82182a 2402a88ULL,
3653 0xe32821e020826d00ULL, 0x801130100ccc1330ULL, 0x028010c000841008ULL, 0x88a08002a 0a664a0ULL,
3654 0x0048270080000100ULL, 0x00001f010cd10f30ULL, 0xe2242ce22aaea2a0ULL, 0xc2c00cc20 ae22460ULL,
3655 0xe208003128021c10ULL, 0x2a2021c010821080ULL, 0x2a88202082202020ULL, 0x401011110 4941410ULL,
3656 0xc80c02c182b00080ULL, 0x0000040000000000ULL, 0xe28030068002c300ULL, 0x2aa02024a 2a22228ULL,
3657 0xe20889328aa22080ULL, 0x0000000000210100ULL, 0xaa0028e0a9b221a0ULL, 0x200000808 0400000ULL,
3658 0x0000010041150404ULL, 0x0000105114410100ULL, 0xeaa82aa6aaaaaaa8ULL, 0x000000f44 300c434ULL,
3659 0x0000222222b00020ULL, 0x0000002000000000ULL, 0x0000004014000000ULL, 0x0039b3f73 fbcd3fcULL,
3660 0x0000000000000000ULL, 0x0000104015045040ULL, 0x20a80490a08800a0ULL, 0x40a825841 0a909a0ULL,
3661 0xe0a8a2022aa2e2a0ULL, 0xc111010014000500ULL, 0x2080044041840004ULL, 0x28a820022 0a2aba0ULL,
3662 0x008400a0a2840800ULL, 0x0101015451009464ULL, 0x20000ea0e02c2c2cULL, 0xe2a828a2a ca2aaa8ULL,
3663 0x682020a228a222a0ULL, 0xe8882ae22aa2a2a0ULL, 0xe9a80e6022a24140ULL, 0x001105500 5001040ULL,
3664 0x2aa8208229a0aaa4ULL, 0x0000040000000000ULL, 0x28a0228026a62260ULL, 0xe2a020a42 2a2a020ULL,
3665 0xe808a0022aa1a220ULL, 0x0000010014000100ULL, 0x28ac22802aa2a020ULL, 0x002000000 0000000ULL,
3666 0x0100010100040000ULL, 0x0000000000000000ULL, 0x22a822a22a8aaaa0ULL, 0x000000000 0000000ULL,
3667 0x0000102410800100ULL, 0x0000000000000000ULL, 0x0000000002000000ULL, 0x00000fb2a 08c0aa8ULL,
3668 0x0000000000000000ULL, 0x4010005015440140ULL, 0x18c81c00b180001cULL, 0x280004802 1820800ULL,
3669 0x8ab820c06a802580ULL, 0x00100170f4040000ULL, 0x4000144041041404ULL, 0x0ac800d00 02e440cULL,
3670 0x20880820a2000808ULL, 0x400000f03f300c00ULL, 0xaa000ea22aa22aa0ULL, 0xa2880ac0a 8942a20ULL,
3671 0xaa880a81a1804188ULL, 0xeea022a0aaa02080ULL, 0xaaa820a2aaa66120ULL, 0x000000511 5800150ULL,
3672 0x2a880920a0840040ULL, 0x0000040000000000ULL, 0xaea82222aaa22a28ULL, 0x8a2804126 0055150ULL,
3673 0xa28824008aa28880ULL, 0x0000025014019000ULL, 0xea882ae02aa200a0ULL, 0x000000000 0000000ULL,
3674 0x0000000040000400ULL, 0x0000000000000000ULL, 0xaaa82aa22aaaaaa0ULL, 0x000000000 0000000ULL,
3675 0x0000000000000000ULL, 0x002003003c80c000ULL, 0x0000020014000000ULL, 0x00200010a 0980a20ULL,
3676 0x0000000000000000ULL, 0x0020001200801240ULL, 0x0a88000089800020ULL, 0xcaa00080a 1000000ULL,
3677 0x0a200c0020a04080ULL, 0x4002034003840880ULL, 0x4690500190000050ULL, 0x222800400 0601000ULL,
3678 0x0a803f00803f400cULL, 0x400033e24dd0cf34ULL, 0xaa80a2a229a220a0ULL, 0x0a2240000 02c0000ULL,
3679 0x028000202000008cULL, 0x0a08000070000030ULL, 0x00800c040020000cULL, 0x000000000 2850000ULL,
3680 0x02881cc310200000ULL, 0x0000040004000000ULL, 0xcba8000400000080ULL, 0xcaa02c068 0000000ULL,
3681 0xcc880002008c4080ULL, 0x300000f007f0cf0cULL, 0x0a80001080a00000ULL, 0x820880802 a880a80ULL,
3682 0x0000050001040004ULL, 0x0000011000000000ULL, 0x0a8020a2a0202000ULL, 0x000002220 2008000ULL,
3683 0x0000222212808000ULL, 0x0020226010000000ULL, 0x000033f33ff3c33cULL, 0x00288002a 08c02a8ULL,
3684 0x0000000000000000ULL, 0x04408e0000008200ULL, 0x0808004000900000ULL, 0x0aa820001 0ca00c0ULL,
3685 0x0ba80101005d4010ULL, 0x00018604802c8288ULL, 0x00049400101c0000ULL, 0x000c10111 0505010ULL,
3686 0x0000000000100000ULL, 0x30000c00c022000cULL, 0xd0c00dd0d51d431cULL, 0x000800001 0100000ULL,
3687 0x000c1001a0280000ULL, 0x0bc80000c0000000ULL, 0x0a00000080280000ULL, 0x8000a0022 0308420ULL,
3688 0x0808000010301000ULL, 0x0000040000000000ULL, 0x0d00031480100000ULL, 0x072000001 08c0300ULL,
3689 0x0bc0a0c000004000ULL, 0x8000b002c0208480ULL, 0x340c0100118c111cULL, 0x800800802 0890000ULL,
3690 0x0000000000040010ULL, 0x0020b00320c1d0b0ULL, 0x00002000000c0000ULL, 0x0020be226 e2008a0ULL,
3691 0x002010c03fb0a6a0ULL, 0x00202e222aaec284ULL, 0x00008f0000208400ULL, 0x000000000 0300000ULL,
3692 };
3693 // Latin1 6%, Latin2 11%, Latin7 3%
3694
3695
3696
// Just for debugging. Not thread-safe: the result lives in the single static
// buffer below, which every call overwrites.
// Renders a 15-bit trigram subscript (three five-bit fields, most significant
// field first) as a three-character NUL-terminated string. Field value 0
// prints as '_', 1..26 as 'a'..'z', and 27..31 as 'A','E','I','O','C'.
static char tri_string[4];
char* Latin127Str(int trisub) {
  static const char kFiveBitChars[] = "_abcdefghijklmnopqrstuvwxyzAEIOC";
  for (int pos = 0; pos < 3; ++pos) {
    const int shift = 10 - (5 * pos);     // 10, 5, 0
    tri_string[pos] = kFiveBitChars[(trisub >> shift) & 0x1f];
  }
  tri_string[3] = '\0';
  return tri_string;
}
3706
3707 // Returns two bits per three-byte trigram, indicating
3708 // dont-care, Latin1 likely, Latin2 likely, and Latin7 (ISO-8859-13) likely
3709 int TrigramValue(const uint8* trisrc) {
3710 int byte0_p = kMapToFiveBits[trisrc[0]];
3711 int byte1_p = kMapToFiveBits[trisrc[1]];
3712 int byte2_p = kMapToFiveBits[trisrc[2]];
3713 int subscr = ((byte0_p) << 5) | byte1_p;
3714 int temp = static_cast<int>((kLatin127Trigrams[subscr] >> (byte2_p * 2)));
3715 //printf("%s=%d ", Latin127Str((subscr << 5) | byte2_p), temp & 3);
3716 return temp & 3;
3717 }
3718
3719
3720 // Put out trigrams for surrounding 32 bytes for Latin encodings
3721 // Return true if more Latin2 & 7 than Latin1
3722 bool BoostLatin127Trigrams(int tri_block_offset,
3723 DetectEncodingState* destatep) {
3724 //printf("BoostLatin127Trigrams[%06x]\n", tri_block_offset);
3725 int excess_latin27 = 0;
3726 int srclen = destatep->limit_src - destatep->initial_src;
3727 int hi_limit = minint(tri_block_offset + 32, srclen - 2);
3728 const uint8* trisrc = &destatep->initial_src[tri_block_offset];
3729 const uint8* trisrclimit = &destatep->initial_src[hi_limit];
3730 while (trisrc < trisrclimit) {
3731 // Selectively boost Latin1, Latin2, or Latin7 and friends
3732 int trigram_val = TrigramValue(trisrc);
3733 if (trigram_val != 0) {
3734 if (FLAGS_enc_detect_source) {
3735 PsHighlight(trisrc, destatep->initial_src, trigram_val, 1);
3736 }
3737 if (trigram_val == kTriLatin1Likely) {
3738 Boost(destatep, F_Latin1, kTrigramBoost);
3739 Boost(destatep, F_CP1252, kTrigramBoost);
3740 // We don't want to upset the relative rank of a declared 8859-15
3741 Boost(destatep, F_ISO_8859_15, kTrigramBoost);
3742 --excess_latin27;
3743 } else if (trigram_val == kTriLatin2Likely) {
3744 Boost(destatep, F_Latin2, kTrigramBoost);
3745 Boost(destatep, F_CP1250, kTrigramBoost);
3746 ++excess_latin27;
3747 } else if (trigram_val == kTriLatin7Likely) {
3748 Boost(destatep, F_ISO_8859_13, kTrigramBoost);
3749 Boost(destatep, F_CP1257, kTrigramBoost);
3750 // We don't want to upset the relative rank of a declared 8859-4 or -6
3751 // for Estonian
3752 Boost(destatep, F_Latin4, kTrigramBoost);
3753 Boost(destatep, F_Latin6, kTrigramBoost);
3754 ++excess_latin27;
3755 }
3756 }
3757
3758 ++trisrc;
3759 }
3760 //printf("\n");
3761
3762 return (0 < excess_latin27);
3763 }
3764
3765
3766
3767 // Boost any encodings that need extra detection help, then prune
3768 // src is first unscanned byte
3769 // slowend means extra pruning when dropping out of initial slow scan
3770 // final means last call -- no bigram at src
3771 void BoostPrune(const uint8* src, DetectEncodingState* destatep,
3772 int prunereason) {
3773 int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] -
3774 destatep->prior_interesting_pair[AsciiPair];
3775 int delta_otherpairs = destatep->next_interesting_pair[OtherPair] -
3776 destatep->prior_interesting_pair[OtherPair];
3777
3778 if (prunereason == PRUNE_FINAL) {
3779 // We are about done
3780 // If we get here with very little accumulated data, the initial hints
3781 // were too strong, so we derate them to n+1 / 12 for n bigrams
3782 if (!destatep->hints_derated &&
3783 (destatep->next_interesting_pair[OtherPair] < kDerateHintsBelow)) {
3784 int n = destatep->next_interesting_pair[OtherPair];
3785
3786 // Map N pairs to (N+1)/12 portions of the initial hints, etc.
3787 // Floor of 3/12 -- 1/12 and 2/12 are too easy to overcome
3788 int m = maxint(3, (n + 1));
3789 for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
3790 int original_delta = destatep->hint_prob[i];
3791 int scaled_delta = (original_delta * m) / kDerateHintsBelow;
3792 destatep->enc_prob[i] -= original_delta;
3793 destatep->enc_prob[i] += scaled_delta;
3794 }
3795 destatep->hints_derated = true;
3796 if (destatep->debug_data != NULL) {
3797 // Show derated-hint result
3798 char buff[32];
3799 snprintf(buff, sizeof(buff), "Hints %d/%d", m, kDerateHintsBelow);
3800 SetDetailsEncLabel(destatep, buff);
3801 }
3802 }
3803 }
3804
3805
3806 ++destatep->prune_count;
3807
3808 if (prunereason != PRUNE_FINAL) {
3809 // Early outs
3810 if (destatep->rankedencoding_list_len <= 1) { // nothing to prune
3811 destatep->done = true;
3812 return;
3813 }
3814
3815 if ((destatep->prune_count > 0) &&
3816 (delta_asciipairs + delta_otherpairs) == 0) {
3817 // Nothing to do; must have just been called earlier
3818 return;
3819 }
3820 }
3821
3822
3823
3824 // INCREMENT
3825 // ====================
3826 // Accumulate OtherPair probibilities over all active families
3827 // AsciiPair probibilities are all done in ActiveSpecialBoostWhack
3828 uint8 prior_bad_byte1 = ' '; // won't match first bad pair
3829 uint8 prior_bad_byte2 = ' '; // won't match first bad pair
3830 uint8 or_byte1 = 0; // Track if any current pair has a high bit
3831 int counted_otherpairs = 0;
3832 uint8 prior_byte1x2x = 0;
3833 for (int i = 0; i < delta_otherpairs; ++i) {
3834 int watch1_incr = 0;
3835 int watch2_incr = 0;
3836 int next_pair = destatep->prior_interesting_pair[OtherPair] + i;
3837
3838 uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0];
3839 uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1];
3840 uint8 byte1x2x = (byte1 & 0xf0) | ((byte2 >> 4) & 0x0f);
3841 int weightshift = destatep->interesting_weightshift[OtherPair][next_pair];
3842
3843 int offset_byte12 = destatep->interesting_offsets[OtherPair][next_pair];
3844
3845 // To help distinguish some Cyrillic, Arabic, Greek, Hebrew, Thai
3846 // Remember if this is a CDEF pair immediately following the previous pair
3847 // 8xxx CxCx or CxCx 8xxx
3848 bool next_pair_consec_hi = false;
3849 if (ConsecutivePair(destatep, next_pair)) {
3850 if ((byte1x2x & 0xcc) == 0xcc) { // 8xxx CxCx
3851 next_pair_consec_hi = true;
3852 } else if ((prior_byte1x2x & 0xcc) == 0xcc) { // CxCx 8xxx
3853 next_pair_consec_hi = true;
3854 }
3855 }
3856 //printf("prior/cur/consec %02x %02x %d\n",
3857 // prior_byte1x2x, byte1x2x, next_pair_consec_hi);
3858 prior_byte1x2x = byte1x2x;
3859
3860 or_byte1 |= byte1;
3861 uint8 byte1f = byte1;
3862 // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebrew)
3863 byte1f ^= (byte2 & 0x80);
3864
3865 // If the same bigram occurred recently, don't increment again
3866 bool pair_used = false;
3867 if (!RepeatedBigram(destatep, byte1, byte2)) {
3868 ++counted_otherpairs;
3869 pair_used = true;
3870 // Boost both charset= declared encodings, so
3871 // Nearly-same probability nearby encoding doesn't drift to the top
3872 if (!FLAGS_demo_nodefault) {
3873 destatep->enc_prob[destatep->declared_enc_1] += kDeclaredEncBoost >> wei ghtshift;
3874 destatep->enc_prob[destatep->declared_enc_2] += kDeclaredEncBoost >> wei ghtshift;
3875 }
3876 bool was_bad_pair = false;
3877 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3878 int incr_shift = 0;
3879 int rankedencoding = destatep->rankedencoding_list[j];
3880 Encoding enc = kMapToEncoding[rankedencoding];
3881
3882 // For binary, Skip over repeated marker bytes, such as 02, FF, etc.
3883 if ((rankedencoding == F_BINARY) &&
3884 RepeatedBinary(destatep, byte1, byte2)) {
3885 incr_shift = 2; // count 1/4 as much if repeated
3886 }
3887
3888 // If byte 1x2x for this encoding is exactly zero, illegal byte pair
3889 // Don't increment, but instead penalize
3890 const UnigramEntry* ue = &unigram_table[rankedencoding];
3891 if (ue->b12[byte1x2x] == 0) {
3892 // Don't whack consecutive duplicate bad pairs -- overkill
3893 if ((byte1 != prior_bad_byte1) || (byte2 != prior_bad_byte2)) {
3894 // Extra whack for illegal pair in this encoding
3895 Whack(destatep, rankedencoding, kBadPairWhack >> weightshift);
3896 was_bad_pair = true;
3897 }
3898 } else {
3899 // OK to do the real increment
3900 int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x];
3901 if ((ue->b12[byte1x2x] & 0x01) != 0) {
3902 // Use a more-precise table
3903 int byte32x32 = ((byte1 & 0x1f) << 5) | (byte2 & 0x1f);
3904 int hiressub = (byte2 & 0x60) >> 5; // select w/bits 5&6 of byte 2
3905 DCHECK(ue->hires[hiressub] != NULL);
3906 incr += ue->hires[hiressub][byte32x32];
3907 } else {
3908 // Default final offset
3909 incr += ue->so;
3910 }
3911 incr >>= incr_shift;
3912
3913 incr >>= weightshift;
3914 destatep->enc_prob[rankedencoding] += incr; // The actual increment
3915
3916 if (FLAGS_enc_detect_detail2) {
3917 if (watch1_rankedenc == rankedencoding) {watch1_incr = incr;}
3918 if (watch2_rankedenc == rankedencoding) {watch2_incr = incr;}
3919 }
3920 }
3921
3922
3923 // If consecutive pair of high bytes, give slight boost to one-byte
3924 // encodings that have a full alphabet in the high bytes
3925 if (next_pair_consec_hi && HighAlphaEncoding(enc)) {
3926 Boost(destatep, rankedencoding, kDeclaredEncBoost >> weightshift);
3927 }
3928 } // End for j < rankedencoding_list_len
3929
3930 if (was_bad_pair) {
3931 prior_bad_byte1 = byte1;
3932 prior_bad_byte2 = byte2;
3933 }
3934
3935 // Fold in per-bigram most likely encoding for first N bigrams
3936 if (next_pair < kBestPairsCount) {
3937 int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
3938 Boost(destatep, best_enc, kBestEncBoost >> weightshift);
3939 }
3940
3941 // Possibly score 32 trigrams around a bigram to better separate
3942 // Latin1 from Latin2 and Latin7. Especially helpful for detecting
3943 // mis-labelled Hungarian latin2.
3944 // If looking and at bigram 0,8,16,... do full scoring, else just 1 tri
3945 if (destatep->do_latin_trigrams ||
3946 destatep->looking_for_latin_trigrams) {
3947 // If just looking, do full scan every 8 times
3948 // Just look up one trigram the other 7 and do full scan if Latin2,7
3949 bool scan32 = false;
3950 const uint8* trisrc = &destatep->initial_src[offset_byte12 - 1];
3951 if (!destatep->do_latin_trigrams) {
3952 if ((i & 7) == 0 || trisrc + 3 > destatep->limit_src) {
3953 scan32 = true;
3954 } else {
3955 scan32 = (kTriLatin1Likely < TrigramValue(trisrc));
3956 }
3957 }
3958 if (destatep->do_latin_trigrams || scan32) {
3959 // Just score each block of 32 bytes once
3960 int tri_block_offset = offset_byte12 & ~0x1f;
3961 if (destatep->trigram_highwater_mark <= tri_block_offset) {
3962 bool turnon = BoostLatin127Trigrams(tri_block_offset, destatep);
3963 if (FLAGS_counts && !destatep->do_latin_trigrams && turnon) {
3964 ++doing_used; // First time
3965 }
3966 if (FLAGS_enc_detect_source) {
3967 if (!destatep->do_latin_trigrams && turnon) {
3968 // First time
3969 PsHighlight(trisrc, destatep->initial_src, 0, 2);
3970 }
3971 }
3972 destatep->do_latin_trigrams |= turnon;
3973 destatep->trigram_highwater_mark = tri_block_offset + 32;
3974 }
3975 }
3976 }
3977
3978 } // end if RepeatedBigram()
3979
3980 // Keep track of initial byte high 3 bits
3981 ++destatep->byte32_count[byte1 >> 5];
3982
3983
3984 // TODO: boost subset/superset also
3985 // Boost(destatep, kRelatedEncoding[best_enc], kBestEncBoost);
3986
3987 if (destatep->debug_data != NULL) {
3988 // Show detail entry for this bigram
3989 char buff[16];
3990 snprintf(buff, sizeof(buff), "%c%02x%02x%c%c",
3991 pair_used ? ' ' : '[',
3992 byte1,
3993 byte2,
3994 pair_used ? ' ' : ']',
3995 (weightshift == 0) ? ' ' : '-');
3996
3997 SetDetailsEncProb(destatep,
3998 destatep->interesting_offsets[OtherPair][next_pair],
3999 kMostLikelyEncoding[(byte1 << 8) + byte2],
4000 buff);
4001 }
4002 if (FLAGS_enc_detect_detail2) {
4003 if ((watch1_incr != 0) || (watch2_incr != 0)) {
4004 // Show increment detail for this encoding
4005 char buff[32];
4006 snprintf(buff, sizeof(buff), "%c%d %c%d",
4007 (watch1_incr < 0) ? '-' : '+', watch1_incr,
4008 (watch2_incr < 0) ? '-' : '+', watch2_incr);
4009 SetDetailsEncLabel(destatep, buff);
4010 }
4011 }
4012 } // End for i
4013
4014
4015 // If no high bit on, demote all the two-byte codes
4016 // WAS BUG. This was inside the loop above and should be outside
4017 if ((counted_otherpairs > 0) && ((or_byte1 & 0x80) == 0)) {
4018 // No high bit in this group (just 02xx, etc.). Whack 2-byte codes
4019 // This keeps SJS from creeping past Latin1 on illegal C0 bytes
4020 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4021 int rankedencoding = destatep->rankedencoding_list[j];
4022 Encoding enc = kMapToEncoding[rankedencoding];
4023 if (TwoByteEncoding(enc)) {
4024 Whack(destatep, rankedencoding, kGentlePairWhack * counted_otherpairs);
4025 }
4026 }
4027 }
4028
4029
4030 // BOOST
4031 // ====================
4032 if (AnyActive(destatep)) {
4033 ActiveSpecialBoostWhack(src, destatep);
4034 }
4035
4036 // Update for next time
4037 destatep->prior_src = src;
4038 destatep->prior_interesting_pair[AsciiPair] =
4039 destatep->next_interesting_pair[AsciiPair];
4040 destatep->prior_interesting_pair[OtherPair] =
4041 destatep->next_interesting_pair[OtherPair];
4042
4043
4044 // Do any pre-prune final adjustments
4045 // ====================
4046 if (prunereason == PRUNE_FINAL) {
4047 // If UTF8 not in base state, whack
4048 if (destatep->next_utf8_ministate != 0) {
4049 Whack(destatep, F_UTF8, kGentlePairWhack * 2 * 1);
4050 }
4051 // If UTF8UTF8 not in base state, whack
4052 if (destatep->next_utf8utf8_ministate != 0) {
4053 Whack(destatep, F_UTF8UTF8, kGentlePairWhack * 2 * 1);
4054 }
4055
4056 // If no valid UTF-8 char ever seen, whack
4057 if (destatep->utf8_minicount[5] == 0) {
4058 Whack(destatep, F_UTF8, kBadPairWhack * 8); // No sequence
4059 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8); // No sequence
4060 }
4061
4062 // If no valid UTF8UTF8 char ever seen, whack
4063 if (destatep->utf8utf8_minicount[5] == 0) {
4064 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8); // No sequence
4065 }
4066
4067 // If not all four binary quadrants, whack BINARY;
4068 // worth 2 pair if 3 quads, 4 pair if 1 or 2 quads
4069 if (destatep->binary_quadrants_count < 4) {
4070 if (destatep->binary_quadrants_count == 3) {
4071 Whack(destatep, F_BINARY, kBadPairWhack * 2);
4072 } else {
4073 Whack(destatep, F_BINARY, kBadPairWhack * 4);
4074 }
4075 }
4076
4077 // If 1st pair is 1b24, choose between ISO-2022-xx
4078 // <esc> $ ) C ISO-2022-KR [1b 24 29 43]
4079 // <esc> $ ) A ISO-2022-CN [1b 24 29 41]
4080 // <esc> $ ) G ISO-2022-CN [1b 24 29 47]
4081 // <esc> $ * H ISO-2022-CN [1b 24 2a 48]
4082 // <esc> ( B ISO-2022-JP [1b 28 42] to ASCII
4083 // <esc> ( J ISO-2022-JP [1b 28 4a] to X0201
4084 // <esc> $ @ ISO-2022-JP [1b 24 40] to X0208-78 twobyte
4085 // <esc> $ B ISO-2022-JP [1b 24 42] to X0208-83 twobyte
4086 if ((destatep->next_interesting_pair[OtherPair] >= 1) &&
4087 Iso2022Active(destatep)) {
4088 if ((destatep->interesting_pairs[OtherPair][0] == 0x1b) &&
4089 (destatep->interesting_pairs[OtherPair][1] == 0x24)) {
4090 int offset = destatep->interesting_offsets[OtherPair][0];
4091 const uint8* esc_src = destatep->initial_src + offset;
4092 if ((destatep->initial_src + offset) < (destatep->limit_src - 3)) {
4093 if ((esc_src[2] == ')') && (esc_src[3] == 'C')) {
4094 Boost(destatep, F_ISO_2022_KR, kBoostOnePair);
4095 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
4096 Whack(destatep, F_JIS, kBadPairWhack);
4097 } else if ((esc_src[2] == ')') && ((esc_src[3] == 'A') ||
4098 (esc_src[3] == 'G'))) {
4099 Boost(destatep, F_ISO_2022_CN, kBoostOnePair);
4100 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
4101 Whack(destatep, F_JIS, kBadPairWhack);
4102 } else if ((esc_src[2] == '@') || (esc_src[2] == 'B')) {
4103 Boost(destatep, F_JIS, kBoostOnePair);
4104 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
4105 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
4106 }
4107 } else {
4108 // Incomplete escape sequence. Whack them all
4109 Whack(destatep, F_JIS, kBadPairWhack);
4110 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
4111 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
4112 }
4113 }
4114 }
4115 if (destatep->debug_data != NULL) {
4116 SetDetailsEncLabel(destatep, "pre-final");
4117 }
4118 }
4119
4120 // PRUNE
4121 // ====================
4122 // Find current top two rankedencoding probabilities
4123 ReRank(destatep);
4124
4125 if (prunereason == PRUNE_SLOWEND) {
4126 if (destatep->debug_data != NULL) {
4127 SetDetailsEncLabel(destatep, "slow-end");
4128 }
4129 }
4130
4131 // Keep every rankedencoding with probablity >= top_prob - prune_difference
4132 int prune_diff = destatep->prune_difference;
4133 // If the top encoding is BINARY, it might be overstated, and we might
4134 // therefore prune away the real encoding. Make the pruning delta
4135 // twice as big.
4136 if (destatep->top_rankedencoding == F_BINARY) {
4137 prune_diff *= 2;
4138 }
4139 int keep_prob = destatep->top_prob - prune_diff;
4140
4141 // Tighten pruning difference (we start wide) for next time
4142 if (destatep->prune_difference > kFinalPruneDifference) {
4143 int decrement = kPruneDiffDecrement;
4144 // If only ASCII pairs, small tighten; if some non-ASCII, full tighten
4145 if (counted_otherpairs == 0) {
4146 decrement >>= 1;
4147 }
4148 destatep->prune_difference -= decrement;
4149 }
4150
4151 // Prune the list of active encoding families
4152 destatep->active_special = 0;
4153 int k = 0;
4154 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4155 bool keep = true;
4156 int rankedencoding = destatep->rankedencoding_list[j];
4157
4158 // If count is too low, ditch it
4159 if (destatep->enc_prob[rankedencoding] < keep_prob) {
4160 keep = false;
4161 }
4162
4163 // If at end of slow section, ditch any 7-bit with zero evidence so far
4164 if ((prunereason == PRUNE_SLOWEND) &&
4165 SevenBitEncoding(kMapToEncoding[rankedencoding]) &&
4166 (destatep->enc_prob[rankedencoding] <= 0) &&
4167 (rankedencoding != destatep->top_rankedencoding)) {
4168 keep = false;
4169 }
4170
4171 // Keep it. This will always keep at least top_prob rankedencoding
4172 if (keep) {
4173 destatep->active_special |= kSpecialMask[kMapToEncoding[rankedencoding]];
4174 destatep->rankedencoding_list[k++] = rankedencoding;
4175 }
4176 }
4177
4178 if (destatep->debug_data != NULL) {
4179 char buff[32];
4180 snprintf(buff, sizeof(buff), "%d prune", prune_diff / XLOG2);
4181 SetDetailsEncLabel(destatep, buff);
4182 }
4183 destatep->rankedencoding_list_len = k;
4184
4185
4186
4187 // Force final result in some cases
4188 // Do any post-prune final adjustments
4189 if (prunereason == PRUNE_FINAL) {
4190 // If no high-byte pairs, result is ASCII7, BINARY, UTF7, 2022, or HZ
4191 if (destatep->next_interesting_pair[OtherPair] == 0) {
4192 if ((destatep->top_rankedencoding != F_BINARY) &&
4193 (destatep->top_rankedencoding != F_UTF7) &&
4194 (destatep->top_rankedencoding != F_ISO_2022_CN) &&
4195 (destatep->top_rankedencoding != F_ISO_2022_KR) &&
4196 (destatep->top_rankedencoding != F_JIS) &&
4197 (destatep->top_rankedencoding != F_HZ_GB_2312)) {
4198 destatep->top_rankedencoding = F_ASCII_7_bit;
4199 Boost(destatep, F_ASCII_7_bit, kBoostOnePair * 2);
4200 }
4201 }
4202
4203 // If some 89 pairs, not ISO_8859_x and vice versa
4204 if (destatep->byte32_count[4] > 0) {
4205 switch (destatep->top_rankedencoding) {
4206 case F_ASCII: // ISO-8859-1
4207 destatep->top_rankedencoding = F_CP1252;
4208 // Better: destatep->enc_prob[F_ASCII] <==> destatep->enc_prob[F_CP1252]
4209 Boost(destatep, F_CP1252, kBoostOnePair * 2);
4210 break;
4211 case F_Latin2: // ISO-8859-2
4212 // Don't swap back; not superset
4213 //destatep->top_rankedencoding = F_CP1250;
4214 //Boost(destatep, F_CP1250, kBoostOnePair * 2);
4215 break;
4216 case F_Arabic: // ISO-8859-6
4217 destatep->top_rankedencoding = F_CP1256;
4218 Boost(destatep, F_CP1256, kBoostOnePair * 2);
4219 break;
4220 case F_Greek: // ISO-8859-7
4221 // Don't swap -- not proper superset
4222 // Capital Alpha tonos at 0xB6 in ISO-8859-7, 0xA2 in CP1253
4223 //destatep->top_rankedencoding = F_CP1253;
4224 //Boost(destatep, F_CP1253, kBoostOnePair * 2);
4225 break;
4226 case F_Hebrew: // ISO-8859-8
4227 // Don't swap -- visual vs. logical
4228 //destatep->top_rankedencoding = F_CP1255;
4229 //Boost(destatep, F_CP1255, kBoostOnePair * 2);
4230 break;
4231 case F_Latin5: // ISO-8859-9
4232 destatep->top_rankedencoding = F_CP1254;
4233 Boost(destatep, F_CP1254, kBoostOnePair * 2);
4234 break;
4235 case F_ISO_8859_11: // ISO-8859-11
4236 destatep->top_rankedencoding = F_CP874;
4237 Boost(destatep, F_CP874, kBoostOnePair * 2);
4238 break;
4239 }
4240 } else {
4241 switch (destatep->top_rankedencoding) {
4242 case F_CP1252: // ISO-8859-1
4243 destatep->top_rankedencoding = F_ASCII;
4244 Boost(destatep, F_ASCII, kBoostOnePair * 2);
4245 break;
4246 case F_CP1250: // ISO-8859-2
4247 // Don't swap back; not superset
4248 //destatep->top_rankedencoding = F_Latin2;
4249 //Boost(destatep, F_Latin2, kBoostOnePair * 2);
4250 break;
4251 case F_CP1256: // ISO-8859-6
4252 // Don't swap back -- not proper superset
4253 //destatep->top_rankedencoding = F_Arabic;
4254 //Boost(destatep, F_Arabic, kBoostOnePair * 2);
4255 break;
4256 case F_CP1253: // ISO-8859-7
4257 // Don't swap back -- not proper superset
4258 //destatep->top_rankedencoding = F_Greek;
4259 //Boost(destatep, F_Greek, kBoostOnePair * 2);
4260 break;
4261 case F_CP1255: // ISO-8859-8
4262 // Don't swap back -- not proper superset
4263 //destatep->top_rankedencoding = F_Hebrew;
4264 //Boost(destatep, F_Hebrew, kBoostOnePair * 2);
4265 break;
4266 case F_CP1254: // ISO-8859-9
4267 destatep->top_rankedencoding = F_Latin5;
4268 Boost(destatep, F_Latin5, kBoostOnePair * 2);
4269 break;
4270 case F_CP874: // ISO-8859-11
4271 destatep->top_rankedencoding = F_ISO_8859_11;
4272 Boost(destatep, F_ISO_8859_11, kBoostOnePair * 2);
4273 break;
4274 }
4275 }
4276
4277 if (destatep->debug_data != NULL) {
4278 char buff[32];
4279 snprintf(buff, sizeof(buff), "final %d",
4280 static_cast<int>(src - destatep->initial_src));
4281 SetDetailsEncLabel(destatep, buff);
4282
4283 // Show winning encoding and its delta log base2 from 2nd-best
4284 // Divide delta by XLOG2 to get log base 2
4285 int delta = destatep->top_prob - destatep->second_top_prob;
4286 if (delta < (2 * XLOG2)) {
4287 delta /= XDECILOG2;
4288 snprintf(buff, sizeof(buff), "+%d.%d %s ",
4289 delta / 10, delta % 10,
4290 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
4291 } else if (delta < (50 * XLOG2)) {
4292 delta /= XLOG2;
4293 snprintf(buff, sizeof(buff), "+%d %s",
4294 delta,
4295 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
4296 } else {
4297 snprintf(buff, sizeof(buff), "%s",
4298 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
4299 }
4300 SetDetailsEncProbCopyOffset(destatep, destatep->top_rankedencoding, buff);
4301 }
4302 }
4303
4304
4305 // FINISH
4306 // ====================
4307 // Eventual encoding result is reliable if big difference in top two, or if
4308 // only Ascii7 ever encountered
4309 // Also reliable if exactly one OtherPair and it's best encoding matches top
4310 destatep->reliable = false;
4311 if (destatep->next_interesting_pair[OtherPair] == 0) {
4312 // Only 7-bit ASCII
4313 destatep->reliable = true;
4314 }
4315 if ((destatep->top_prob - destatep->second_top_prob) >=
4316 FLAGS_ced_reliable_difference) {
4317 destatep->reliable = true;
4318 }
4319 if (destatep->next_interesting_pair[OtherPair] == 1) {
4320 uint8 byte1 = destatep->interesting_pairs[OtherPair][0];
4321 uint8 byte2 = destatep->interesting_pairs[OtherPair][1];
4322 int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
4323 if (best_enc == destatep->top_rankedencoding) {
4324 destatep->reliable = true;
4325 }
4326 }
4327
4328 // If we pruned to one encoding, we are done
4329 if (destatep->rankedencoding_list_len == 1) {
4330 destatep->reliable = true;
4331 destatep->done = true;
4332 }
4333
4334 // If we pruned to two or three encodings in the same *superset/subset
4335 // rankedencoding* and enough pairs, we are done. Else keep going
4336 if (destatep->rankedencoding_list_len == 2) {
4337 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
4338 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
4339 if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) {
4340 if (destatep->prune_count >= 3) {
4341 destatep->reliable = true;
4342 destatep->done = true;
4343 }
4344 }
4345 } else if (destatep->rankedencoding_list_len == 3) {
4346 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
4347 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
4348 Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]];
4349 Encoding base0 = kMapEncToBaseEncoding[enc0];
4350 Encoding base1 = kMapEncToBaseEncoding[enc1];
4351 Encoding base2 = kMapEncToBaseEncoding[enc2];
4352
4353 if ((base0 == base1) && (base0 == base2)) {
4354 if (destatep->prune_count >= 3) {
4355 destatep->reliable = true;
4356 destatep->done = true;
4357 }
4358 }
4359 }
4360 }
4361
4362
4363 // Accumulate aligned byte-pair at src
4364 // Occasionally, calc boost for some encodings and then prune the active list
4365 // weightshift is used to give low weight some text, such as inside tags
4366 // Returns true if pruning occurred
4367 bool IncrementAndBoostPrune(const uint8* src,
4368 int remaining_length,
4369 DetectEncodingState* destatep,
4370 int weightshift,
4371 int exit_reason) {
4372 destatep->last_pair = src;
4373 // Pick up byte pair, or very last byte plus 0x20
4374 uint8 byte1 = src[0];
4375 uint8 byte2 = 0x20;
4376 if (1 < remaining_length) {byte2 = src[1];}
4377
4378 // whatset=0 for Ascii + ~, 1 for all others; see kTestPrintableAsciiTildePlus
4379 int whatset = exit_reason - 1;
4380 int next_pair = destatep->next_interesting_pair[whatset];
4381
4382 if (next_pair > 16) {
4383 // If not clear by 16 bigrams, stop accumulating + ~ 00
4384 if (byte1 == '+') {return false;}
4385 if (byte1 == '~') {return false;}
4386 if (byte1 == 0x00) {return false;}
4387 }
4388
4389 // Remember pair in appropriate list
4390 if (next_pair >= kMaxPairs) {
4391 // We have filled up our alloted space for interesting pairs with no
4392 // decision. If ASCII pairs full, just skip until end of slow loop; if
4393 // non-Ascii pairs full, force done
4394 if (whatset == OtherPair) {
4395 destatep->done = true;
4396 }
4397 } else {
4398 int offset = static_cast<int>(src - destatep->initial_src);
4399 destatep->interesting_pairs[whatset][next_pair * 2 + 0] = byte1;
4400 destatep->interesting_pairs[whatset][next_pair * 2 + 1] = byte2;
4401 destatep->interesting_offsets[whatset][next_pair] = offset;
4402 destatep->interesting_weightshift[whatset][next_pair] = weightshift;
4403 ++destatep->next_interesting_pair[whatset];
4404 ++next_pair;
4405 }
4406
4407 // Prune now and then , but always if forced to be done
4408 if (destatep->done || ((next_pair & kPruneMask) == 0)) { // Prune every M
4409 BoostPrune(src + 2, destatep, PRUNE_NORMAL); // src+2 first unscanned byte
4410 // may be off end of input
4411 return true;
4412 }
4413 return false;
4414 }
4415
4416 void DumpSummary(DetectEncodingState* destatep, int whatset, int n) {
4417 printf(" %sSummary[%2d]: ", kWhatSetName[whatset],
4418 destatep->next_interesting_pair[whatset]);
4419 int limit = minint(n, destatep->next_interesting_pair[whatset]);
4420 for (int i = 0; i < limit; ++i) {
4421 printf("%02x%02x ",
4422 destatep->interesting_pairs[whatset][i * 2 + 0],
4423 destatep->interesting_pairs[whatset][i * 2 + 1]);
4424 if ((i & 7) == 7) {printf(" ");}
4425 }
4426 printf("\n");
4427 }
4428
4429 void BeginDetail(DetectEncodingState* destatep) {
4430 fprintf(stderr, "%d [", NUM_RANKEDENCODING);
4431 for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
4432 fprintf(stderr, "(%s)", MyRankedEncName(e));
4433 if ((e % 10) == 9) {fprintf(stderr, "\n ");}
4434 }
4435 fprintf(stderr, "] size-detail\n");
4436 destatep->next_detail_entry = 0;
4437 }
4438
// Single character to represent (printable ASCII) gap between bigrams
//   0            ' '
//   <= 2         '='   (this bucket also catches negative deltas)
//   3 .. 15      '_'
//   16 .. 31     '+'
//   anything larger  ' '
char DetailOffsetChar(int delta) {
  return (delta == 0)  ? ' ' :
         (delta <= 2)  ? '=' :
         (delta <= 15) ? '_' :
         (delta <= 31) ? '+' :
                         ' ';
}
4447
4448 void DumpDetail(DetectEncodingState* destatep) {
4449 // Turn all counts into delta from previous entry
4450 fprintf(stderr, "%d count-detail\n", destatep->next_detail_entry);
4451 // Rewrite, recording deltas
4452 for (int z = destatep->next_detail_entry - 1; z > 0; --z) {
4453 destatep->debug_data[z].offset -= destatep->debug_data[z - 1].offset;
4454 for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
4455 destatep->debug_data[z].detail_enc_prob[e] -=
4456 destatep->debug_data[z - 1].detail_enc_prob[e];
4457 }
4458 }
4459 // Now print
4460 for (int z = 0; z < destatep->next_detail_entry; ++z) {
4461 // Highlight some entries ending in '!' with light red underbar
4462 int len = destatep->debug_data[z].label.size();
4463 if (destatep->debug_data[z].label[len - 1] == '!') {
4464 fprintf(stderr, "1 0.9 0.9 do-flag\n");
4465 }
4466 fprintf(stderr, "(%c%s) %d [",
4467 DetailOffsetChar(destatep->debug_data[z].offset),
4468 destatep->debug_data[z].label.c_str(),
4469 destatep->debug_data[z].best_enc);
4470 for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
4471 fprintf(stderr, "%d ", destatep->debug_data[z].detail_enc_prob[e]);
4472 if ((e % 10) == 9) {fprintf(stderr, " ");}
4473 }
4474 fprintf(stderr, "] do-detail-e\n");
4475 }
4476 // Get ready for next time,if any
4477 destatep->next_detail_entry = 0;
4478 }
4479
// Debug aid: writes an "end-detail" marker followed by a "start-detail"
// marker labelled with buff to stderr.
void PsRecurse(const char* buff) {
  static const char kRecurseFormat[] = "() end-detail (%s) start-detail\n\n";
  fprintf(stderr, kRecurseFormat, buff);
}
4483
4484 void DumpReliable(DetectEncodingState* destatep) {
4485 printf("Not reliable: ");
4486
4487 // Find center of gravity of OtherPair list
4488 int x_sum = 0;
4489 int y_sum = 0;
4490 int count = destatep->next_interesting_pair[OtherPair];
4491 for (int i = 0; i < count; ++i) {
4492 uint8 byte1 = destatep->interesting_pairs[OtherPair][i * 2 + 0];
4493 uint8 byte2 = destatep->interesting_pairs[OtherPair][i * 2 + 1];
4494 x_sum += byte2;
4495 y_sum += byte1;
4496 }
4497 if (count == 0) {count = 1;} // adoid zdiv
4498 int x_bar = x_sum / count;
4499 int y_bar = y_sum / count;
4500 printf("center %02X,%02X\n", x_bar, y_bar);
4501
4502 double closest_dist = 999.0;
4503 int closest = 0;
4504 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4505 int rankedencoding = destatep->rankedencoding_list[j];
4506 const UnigramEntry* ue = &unigram_table[rankedencoding];
4507 printf(" %8s = %4d at %02x,%02x +/- %02X,%02X ",
4508 MyEncodingName(kMapToEncoding[rankedencoding]),
4509 destatep->enc_prob[rankedencoding],
4510 ue->x_bar, ue->y_bar,
4511 ue->x_stddev, ue->y_stddev);
4512 double x_diff = x_bar - ue->x_bar;
4513 double y_diff = y_bar - ue->y_bar;
4514 double dist = sqrt((x_diff * x_diff) + (y_diff * y_diff));
4515 printf("(%3.1f)\n", dist);
4516
4517 if (closest_dist > dist) {
4518 closest_dist = dist;
4519 closest = rankedencoding;
4520 }
4521 }
4522 printf("Closest=%s (%3.1f)\n",
4523 MyEncodingName(kMapToEncoding[closest]), closest_dist);
4524
4525 for (int i = 0; i < 8; ++i) {
4526 // Demote by distance to CG and see if that helps, or just quit
4527 }
4528 }
4529
4530 // Scan short single lines quickly for all printable ASCII
4531 // Return true if all bytes are in [20..7F], false otherwise
4532 bool QuickPrintableAsciiScan(const char* text, int text_length) {
4533 const uint8* src = reinterpret_cast<const uint8*>(text);
4534 const uint8* srclimit = src + text_length;
4535 const uint8* srclimit8 = srclimit - 7;
4536 while (src < srclimit8) {
4537 const uint32* s = reinterpret_cast<const uint32*>(src);
4538 uint32 tmp1 = s[0];
4539 uint32 tmp2 = s[1];
4540 src += 8;
4541 // Exits on any byte outside [0x20..0x7E] range (HT LF CR exit)
4542 uint32 byte_outside_range_mask = ((tmp1 - 0x20202020U) |
4543 (tmp1 + 0x01010101U) |
4544 (tmp2 - 0x20202020U) |
4545 (tmp2 + 0x01010101U));
4546 if ((byte_outside_range_mask & 0x80808080U) != 0) {
4547 src -= 8;
4548 break;
4549 }
4550 }
4551 while (src < srclimit) {
4552 uint8 uc = *src++;
4553 if (kIsPrintableAscii[uc] == 0) {return false;}
4554 }
4555 return true;
4556 }
4557
4558 static const int kMaxScanBack = 192;
4559 static const int kMaxScanForward = 64;
4560
4561 // Return true if text is inside a tag or JS comment
4562 bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
4563 const uint8* srcbacklimit = src - kMaxScanBack;
4564 if (srcbacklimit < isrc) {
4565 srcbacklimit = isrc;
4566 }
4567 const uint8* ss = src - 1;
4568 while (srcbacklimit <= ss) {
4569 uint8 c = *ss--;
4570 if ((c & ~0x02) == '<') {
4571 // We found preceding < 3C or > 3E nearby
4572 // Even cheaper: if inside a tag, we don't care what tag; return true
4573 if (c == '<') {
4574 return true;
4575 }
4576 // See if we are just after <title>...
4577 if ((c == '>') && (isrc <= (ss - 5)) &&
4578 (ss[-5] == '<') &&
4579 ((ss[-4] | 0x20) == 't') &&
4580 ((ss[-3] | 0x20) == 'i') &&
4581 ((ss[-2] | 0x20) == 't') &&
4582 ((ss[-1] | 0x20) == 'l') &&
4583 ((ss[-0] | 0x20) == 'e')) {
4584 return true;
4585 }
4586 // See if we are just after <SCRIPT language=javascript>...
4587 if ((c == '>') && (isrc <= (ss - 5)) &&
4588 (ss[-5] == 's') &&
4589 ((ss[-4] | 0x20) == 'c') &&
4590 ((ss[-3] | 0x20) == 'r') &&
4591 ((ss[-2] | 0x20) == 'i') &&
4592 ((ss[-1] | 0x20) == 'p') &&
4593 ((ss[-0] | 0x20) == 't')) {
4594 return true;
4595 }
4596 // Not in a tag
4597 return false;
4598 // See if we are just after JavaScript comment /* ...
4599 } else if (c == '/') {
4600 if (((ss + 2) < srclimit) && (ss[2] == '*')) {
4601 // We backscanned to /*
4602 return true;
4603 }
4604 }
4605 }
4606
4607 return false;
4608 }
4609
4610 const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srcl imit) {
4611 const uint8* ss = src + 1;
4612 while (ss <= srclimit) {
4613 uint8 c = *ss++;
4614 if ((c == '<') || (c == '>')) {
4615 return ss;
4616 }
4617 }
4618 return src + 2; // Always make progress, Otherwise we get an infinite loop
4619 }
4620
4621
4622 // Take a watch string and map to a ranked encoding. If no match, return -1
4623 int LookupWatchEnc(const string& watch_str) {
4624 int watchval = -1;
4625 // Mixed encoding maps to enc=UTF8UTF8
4626 if (watch_str == "UTF8UTF8") {
4627 watchval = F_UTF8UTF8;
4628 } else {
4629 Encoding enc;
4630 if (EncodingFromName(watch_str.c_str(), &enc)) {
4631 watchval = CompactEncDet::BackmapEncodingToRankedEncoding(enc);
4632 }
4633 }
4634 return watchval;
4635 }
4636
4637 // Return true if enc and enc2 are equal or one is a subset of the other
4638 // or either is UNKNOWN
4639 // also UTF8UTF8 is compatible with both Latin1 and UTF8
4640 bool CompatibleEnc(Encoding enc, Encoding enc2) {
4641 if (enc < 0) {return false;}
4642 if (NUM_ENCODINGS <= enc) {return false;}
4643 if (enc2 < 0) {return false;}
4644 if (NUM_ENCODINGS <= enc2) {return false;}
4645 if (enc == enc2) {return true;}
4646 if (kMapEncToBaseEncoding[enc] == kMapEncToBaseEncoding[enc2]) {return true;}
4647
4648 if (enc == ASCII_7BIT) {return true;}
4649 if (enc2 == ASCII_7BIT) {return true;}
4650 if (enc == UNKNOWN_ENCODING) {return true;}
4651 if (enc2 == UNKNOWN_ENCODING) {return true;}
4652 if (enc == UTF8UTF8) {
4653 if (enc2 == UTF8) {return true;}
4654 if (kMapEncToBaseEncoding[enc2] == ISO_8859_1) {return true;}
4655 }
4656 if (enc2 == UTF8UTF8) {
4657 if (enc == UTF8) {return true;}
4658 if (kMapEncToBaseEncoding[enc] == ISO_8859_1) {return true;}
4659 }
4660
4661 return false;
4662 }
4663
4664 // Return superset of enc and enc2, which must be compatible
4665 Encoding SupersetEnc(Encoding enc, Encoding enc2) {
4666 //printf(" SupersetEnc (%s, ", MyEncodingName(enc)); // TEMP
4667 //printf("%s) ", MyEncodingName(enc2));
4668 //printf("= %s\n",
4669 // MyEncodingName(kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2] ?
4670 // enc :enc2));
4671 if (kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2]) {
4672 return enc;
4673 }
4674 return enc2;
4675 }
4676
4677
4678 // If unreliable, try rescoring to separate some encodings
4679 Encoding Rescore(Encoding enc, const uint8* isrc,
4680 const uint8* srctextlimit, DetectEncodingState* destatep) {
4681 if (FLAGS_counts) {++rescore_used;}
4682 Encoding new_enc = enc;
4683
4684 bool rescore_change = false;
4685
4686 int count = destatep->next_interesting_pair[OtherPair];
4687 int text_length = srctextlimit - isrc;
4688 for (int i = 0; i < count; ++i) {
4689 int bigram_offset = destatep->interesting_offsets[OtherPair][i];
4690 uint8 byte0 = (0 < bigram_offset) ?
4691 isrc[bigram_offset - 1] : 0x20;
4692 uint8 byte1 = isrc[bigram_offset + 0]; // Known to have high bit on
4693 uint8 byte2 = ((bigram_offset + 1) < text_length) ?
4694 isrc[bigram_offset + 1] : 0x20;
4695 uint8 byte3 = ((bigram_offset + 2) < text_length) ?
4696 isrc[bigram_offset + 2] : 0x20;
4697 int high_hash = ((byte0 & 0xc0) >> 0) |
4698 ((byte1 & 0xc0) >> 1) |
4699 ((byte2 & 0xc0) >> 4) |
4700 ((byte3 & 0xc0) >> 6); // 00112233
4701
4702 // Boost HighAccent encodings for Ascii bit patterns
4703 // 0x1x 0x0x
4704 // 1010 1010
4705 // 0010 0000
4706 //
4707 if ((high_hash & 0xaa) == 0x20) {
4708 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4709 int rankedencoding = destatep->rankedencoding_list[j];
4710 if (HighAccentEncoding(kMapToEncoding[rankedencoding])) {
4711 // TODO: also want to boost Shift-JIS here if byte1 is Ax..Dx
4712 // TEMP
4713 //printf(" Rescore[%02x] %s +%d\n",
4714 // high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost) ;
4715 Boost(destatep, rankedencoding, kGentlePairBoost);
4716 rescore_change = true;
4717 }
4718 }
4719 }
4720
4721 // Whack HighAccent encodings for high bit patterns
4722 // 1x1x 1x1x
4723 // 1010 1010
4724 // 1010 1010
4725 //
4726 if ((high_hash & 0xaa) == 0xaa) {
4727 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4728 int rankedencoding = destatep->rankedencoding_list[j];
4729 if (HighAccentEncoding(kMapToEncoding[rankedencoding])) {
4730 // TEMP
4731 //printf(" Rescore[%02x] %s -%d\n",
4732 // high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost) ;
4733 Whack(destatep, rankedencoding, kGentlePairBoost);
4734 rescore_change = true;
4735 }
4736 }
4737 }
4738
4739 }
4740
4741 if (rescore_change) {
4742 ReRank(destatep);
4743 new_enc = kMapToEncoding[destatep->top_rankedencoding];
4744
4745 if (destatep->debug_data != NULL) {
4746 char buff[32];
4747 snprintf(buff, sizeof(buff), "=Rescore %s", MyEncodingName(new_enc));
4748 SetDetailsEncProb(destatep,
4749 0,
4750 CompactEncDet::BackmapEncodingToRankedEncoding(new_enc),
4751 buff);
4752 //// DumpDetail(destatep);
4753 }
4754
4755 SimplePrune(destatep, kFinalPruneDifference);
4756 CalcReliable(destatep);
4757 }
4758
4759 //if (new_enc != enc) {
4760 // // TEMP
4761 // printf(" Rescore new top encoding = %s\n",
4762 // MyRankedEncName(destatep->top_rankedencoding));
4763 //}
4764
4765 return new_enc;
4766 }
4767
4768
4769 // Given an encoding, add its corresponding ranked encoding to the set
4770 void AddToSet(Encoding enc, int* list_len, int* list) {
4771 // TEMP print
4772 int item = CompactEncDet::BackmapEncodingToRankedEncoding(enc);
4773 for (int i = 0; i < *list_len; ++i) {
4774 if (list[i] == item) {
4775 return; // Already in the set; don't add again
4776 }
4777 }
4778 list[(*list_len)++] = item;
4779 }
4780
4781
4782 static const int kMinRobustBigramCount = 1000;
4783 static const int kMinKBToRobustScan = 64;
4784 static const int kMaxKBToRobustScan = 256;
4785
4786 // Scan the first 64K or so, just doing raw bigram increments on given
4787 // probability list.
4788 // No fancy duplicate filtering or anything else here.
4789 // Returns number of bigrams counted
4790 int RobustScan(const char* text,
4791 int text_length,
4792 int robust_renc_list_len,
4793 int* robust_renc_list,
4794 int* robust_renc_probs) {
4795 if (FLAGS_counts) {++robust_used;}
4796 // Zero all the result probabilities
4797 for (int i = 0; i < robust_renc_list_len; ++i) {
4798 robust_renc_probs[i] = 0;
4799 }
4800 int max_fast_len = minint(text_length, (kMaxKBToRobustScan << 10));
4801 const uint8* isrc = reinterpret_cast<const uint8*>(text);
4802 const uint8* src = isrc;
4803 const uint8* srclimitfast2 = isrc + max_fast_len - 1;
4804 const uint8* srclimitfast4 = isrc + max_fast_len - 3;
4805
4806 int min_fast_len = minint(text_length, (kMinKBToRobustScan << 10));
4807 const uint8* srclimitmin = isrc + min_fast_len - 1;
4808
4809 int bigram_count = 0;
4810
4811 if (FLAGS_enc_detect_source) {
4812 PsSourceInit(kPsSourceWidth);
4813 fprintf(stderr, "(RobustScan) do-src\n");
4814 }
4815
4816 // Sum over a big chunk of the input
4817 // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec
4818 //====================================
4819 while (src < srclimitfast2) {
4820 // Skip to next interesting bigram
4821 while (src < srclimitfast4) {
4822 uint32 u32 = *reinterpret_cast<const uint32*>(src);
4823 src+= 4;
4824 if ((u32 & 0x80808080) != 0) {src -= 4; break;}
4825 }
4826 while (src < srclimitfast2) {
4827 uint8 uc = *src++;
4828 if (static_cast<signed char>(uc) < 0) {src--; break;}
4829 }
4830
4831 if (src < srclimitfast2) {
4832 // We found a bigram with high bit on
4833 // Next 5 lines commented out so we don't show all the source.
4834 //const uint8* srctextlimit = isrc + text_length;
4835 //if (FLAGS_enc_detect_source) {
4836 // PsSource(src, isrc, srctextlimit);
4837 // PsMark(src, 2, isrc, 0);
4838 //}
4839
4840 uint8 byte1 = src[0];
4841 uint8 byte2 = src[1];
4842 uint8 byte1x2x = (byte1 & 0xf0) | ((byte2 >> 4) & 0x0f);
4843 uint8 byte1f = byte1;
4844 // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebre w)
4845 byte1f ^= (byte2 & 0x80);
4846
4847 // The real increments
4848 for (int j = 0; j < robust_renc_list_len; ++j) {
4849 int rankedencoding = robust_renc_list[j];
4850 const UnigramEntry* ue = &unigram_table[rankedencoding];
4851 int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x];
4852 if ((ue->b12[byte1x2x] & 0x01) != 0) {
4853 // Use a more-precise table
4854 int byte32x32 = ((byte1 & 0x1f) << 5) | (byte2 & 0x1f);
4855 int hiressub = (byte2 & 0x60) >> 5; // select w/bits 5&6 of byte 2
4856 DCHECK(ue->hires[hiressub] != NULL);
4857 incr += ue->hires[hiressub][byte32x32];
4858 } else {
4859 // Default final offset
4860 incr += ue->so;
4861 }
4862 robust_renc_probs[j] += incr;
4863 }
4864
4865 src += 2; // Continue after this bigram
4866 ++bigram_count;
4867
4868 // Stop after 1000 bigrams reached, if at least 64KB scanned
4869 if ((bigram_count > kMinRobustBigramCount) && (src > srclimitmin)) {
4870 break;
4871 }
4872 }
4873 }
4874
4875 if (FLAGS_enc_detect_source) {
4876 fprintf(stderr, "( bigram_count = %d) do-src\n", bigram_count);
4877 if (bigram_count == 0) {bigram_count = 1;} // zdiv
4878 for (int i = 0; i < robust_renc_list_len; ++i) {
4879 fprintf(stderr, "( enc[%-12.12s] = %7d (avg %d)) do-src\n",
4880 MyRankedEncName(robust_renc_list[i]), robust_renc_probs[i],
4881 robust_renc_probs[i] / bigram_count);
4882 }
4883 PsSourceFinish();
4884 }
4885
4886 return bigram_count;
4887 }
4888
4889 // If unreliable, rescan middle of document to see if we can get a better
4890 // answer. Rescan is only worthwhile if there are ~200 bytes or more left,
4891 // since the detector takes as much as 96 bytes of bigrams to decide.
4892 Encoding Rescan(Encoding enc,
4893 const uint8* isrc,
4894 const uint8* src,
4895 const uint8* srctextlimit,
4896 const char* url_hint,
4897 const char* http_charset_hint,
4898 const char* meta_charset_hint,
4899 const int encoding_hint,
4900 const Language language_hint,
4901 const CompactEncDet::TextCorpusType corpus_type,
4902 bool ignore_7bit_mail_encodings,
4903 DetectEncodingState* destatep) {
4904 bool enc_is_reliable = destatep->reliable;
4905 Encoding new_enc = enc;
4906 Encoding second_best_enc =
4907 kMapToEncoding[destatep->second_top_rankedencoding];
4908
4909 if (FLAGS_counts) {++rescan_used;}
4910
4911 int scanned_bytes = src - isrc;
4912 int unscanned_bytes = srctextlimit - src;
4913 int text_length = srctextlimit - isrc;
4914 bool empty_rescan = true;
4915
4916 // See if enough bytes left to bother doing rescan
4917 if (kMinRescanLength < unscanned_bytes) {
4918 const char* text = reinterpret_cast<const char*>(isrc);
4919
4920 Encoding one_hint = destatep->http_hint;
4921 if ((one_hint == UNKNOWN_ENCODING) &&
4922 (destatep->meta_hint != UNKNOWN_ENCODING)) {
4923 one_hint = destatep->meta_hint;
4924 }
4925 if ((one_hint == UNKNOWN_ENCODING) &&
4926 (destatep->bom_hint != UNKNOWN_ENCODING)) {
4927 one_hint = destatep->bom_hint;
4928 }
4929
4930 // Go to an even offset to keep UTF-16 in synch
4931 int middle_offset = (scanned_bytes + (unscanned_bytes / 2)) & ~1;
4932 CHECK(middle_offset <= text_length);
4933
4934 // Look back a bit for a low byte to synchronize, else hope for the best.
4935 const uint8* srcbacklimit = isrc + middle_offset - kMaxScanBack;
4936 if (srcbacklimit < src) {
4937 srcbacklimit = src;
4938 }
4939 const uint8* ss = isrc + middle_offset - 1;
4940 while (srcbacklimit <= ss) {
4941 if ((*ss & 0x80) == 0) {break;}
4942 --ss;
4943 }
4944 // Leave middle offset unchanged unless we found a low byte
4945 if (srcbacklimit <= ss) {
4946 // Align to low byte or high byte just after it, whichever is even
4947 middle_offset = (ss - isrc + 1) & ~1; // Even to keep UTF-16 in sync
4948 }
4949 CHECK(middle_offset <= text_length);
4950
4951 if (destatep->debug_data != NULL) {
4952 SetDetailsEncLabel(destatep, ">> Rescan");
4953 // Print the current chart before recursive call
4954 DumpDetail(destatep);
4955
4956 char buff[32];
4957 snprintf(buff, sizeof(buff), ">> Rescan[%d..%d]",
4958 middle_offset, text_length);
4959 PsRecurse(buff);
4960 }
4961
4962 int mid_bytes_consumed;
4963 bool mid_is_reliable;
4964 Encoding mid_second_best_enc;
4965 CEDInternalFlags newflags = static_cast<CEDInternalFlags>(
4966 kCEDRescanning + kCEDForceTags);
4967 // Recursive call for rescan of half of remaining
4968 Encoding mid_enc = InternalDetectEncoding(
4969 newflags,
4970 text + middle_offset,
4971 text_length - middle_offset,
4972 url_hint,
4973 http_charset_hint,
4974 meta_charset_hint,
4975 encoding_hint,
4976 language_hint, // User interface lang
4977 corpus_type,
4978 ignore_7bit_mail_encodings,
4979 &mid_bytes_consumed,
4980 &mid_is_reliable,
4981 &mid_second_best_enc);
4982 destatep->reliable = mid_is_reliable;
4983
4984 empty_rescan = (mid_enc == ASCII_7BIT);
4985
4986 // Not the right decision if, e.g. enc=Greek, mid=ASCII7, one=KSC
4987 // hence the !empty_rescan term
4988 if (!empty_rescan && CompatibleEnc(one_hint, mid_enc)) {
4989 // Encoding we just found is compatible with the
4990 // single hint (if any); return superset
4991 new_enc = SupersetEnc(one_hint, mid_enc);
4992 }
4993
4994 // If original and mid are compatible, and both reliable,
4995 // return new_enc = SupersetEnc(enc, mid_enc)
4996 //
4997 // This avoids too much weight on a bogus hint causing a RobustScan
4998 // that gets the wrong answer
4999 if (!empty_rescan && mid_is_reliable && enc_is_reliable &&
5000 CompatibleEnc(enc, mid_enc)) {
5001 new_enc = SupersetEnc(enc, mid_enc);
5002 return new_enc;
5003 }
5004
5005 // if mid unreliable, robustscan
5006 // if mid empty, robustscan
5007 // if original and mid not compatible, robustscan
5008 // if mid and one_hint not compatible, robustscan
5009
5010 // If we found conflicting data, drop back and do a robust scan of a big
5011 // chunk of the input over a set of candidate encodings
5012 //
5013 if (!mid_is_reliable ||
5014 empty_rescan ||
5015 !CompatibleEnc(enc, mid_enc) ||
5016 !CompatibleEnc(one_hint, mid_enc)) {
5017 int robust_renc_list_len; // Number of active encodings
5018 int robust_renc_list[NUM_RANKEDENCODING]; // List of ranked encodings
5019 int robust_renc_probs[NUM_RANKEDENCODING]; // List of matching probs
5020
5021 robust_renc_list_len = 0;
5022 AddToSet(enc, &robust_renc_list_len, robust_renc_list);
5023 AddToSet(second_best_enc, &robust_renc_list_len, robust_renc_list);
5024 AddToSet(mid_enc, &robust_renc_list_len, robust_renc_list);
5025 AddToSet(mid_second_best_enc, &robust_renc_list_len, robust_renc_list);
5026 if (destatep->http_hint != UNKNOWN_ENCODING) {
5027 AddToSet(destatep->http_hint, &robust_renc_list_len, robust_renc_list);
5028 }
5029 if (destatep->meta_hint != UNKNOWN_ENCODING) {
5030 AddToSet(destatep->meta_hint, &robust_renc_list_len, robust_renc_list);
5031 }
5032 if (destatep->bom_hint != UNKNOWN_ENCODING) {
5033 AddToSet(destatep->bom_hint, &robust_renc_list_len, robust_renc_list);
5034 }
5035 if (destatep->tld_hint != UNKNOWN_ENCODING) {
5036 AddToSet(destatep->tld_hint, &robust_renc_list_len, robust_renc_list);
5037 }
5038
5039 // Separate simple scan
5040 // =====================
5041 if (destatep->debug_data != NULL) {
5042 SetDetailsEncLabel(destatep, ">> RobustScan");
5043 // Print the current chart before recursive call
5044 DumpDetail(destatep);
5045
5046 char buff[32];
5047 snprintf(buff, sizeof(buff), ">> RobustScan[0..%d]", text_length);
5048 PsRecurse(buff);
5049 }
5050
5051 int bigram_count = RobustScan(text, text_length,
5052 robust_renc_list_len, robust_renc_list, robust_renc_probs);
5053
5054 // Default to new_enc and update if something better was found
5055 int best_prob = -1;
5056 // TEMP print
5057 for (int i = 0; i < robust_renc_list_len; ++i) {
5058 if (best_prob < robust_renc_probs[i]) {
5059 best_prob = robust_renc_probs[i];
5060 new_enc = kMapToEncoding[robust_renc_list[i]];
5061 }
5062 }
5063
5064 if (destatep->debug_data != NULL) {
5065 char buff[32];
5066 snprintf(buff, sizeof(buff), "=Robust[%d] %s",
5067 bigram_count, MyEncodingName(new_enc));
5068 SetDetailsEncProb(destatep,
5069 0,
5070 CompactEncDet::BackmapEncodingToRankedEncoding(new_enc ),
5071 buff);
5072 }
5073 }
5074 } // End if enough bytes
5075
5076 return new_enc;
5077 }
5078
5079 // With no hints at all, and perhaps on rescan, we relax our pickiness
5080 // and go ahead and accept the top multibyte encodings, even though
5081 // strictly their web pages should have declared an explicit encoding to
5082 // avoid the HTML standard's default ISO-8859-1.
5083 bool NoHintsCloseEnoughCompatible(Encoding top_enc) {
5084 // First test accepts degenerate cases plus UTF8 and UTF8UTF8
5085 if (CompatibleEnc(UTF8, top_enc)) {return true;}
5086
5087 // The rest look for exact match of base encoding
5088 Encoding base_enc = kMapEncToBaseEncoding[top_enc];
5089 if (base_enc == JAPANESE_EUC_JP) {return true;}
5090 if (base_enc == JAPANESE_SHIFT_JIS) {return true;}
5091 if (base_enc == CHINESE_BIG5) {return true;}
5092 if (base_enc == CHINESE_GB) {return true;}
5093 if (base_enc == KOREAN_EUC_KR) {return true;}
5094 return false;
5095 }
5096
5097
5098
5099 // Scan raw bytes and detect most likely encoding
5100 // Design goals:
5101 // Skip over big initial stretches of seven-bit ASCII bytes very quickly
5102 // Thread safe
5103 // Works equally well on
5104 // 50-byte queries,
5105 // 5000-byte email and
5106 // 50000-byte web pages
5107 // Length 0 input returns ISO_8859_1 (ASCII) encoding
5108 // Setting ignore_7bit_mail_encodings effectively turns off detection of
5109 // UTF-7, HZ, and ISO-2022-xx
5110 Encoding InternalDetectEncoding(
5111 CEDInternalFlags flags, const char* text, int text_length,
5112 const char* url_hint, const char* http_charset_hint,
5113 const char* meta_charset_hint, const int encoding_hint,
5114 const Language language_hint, // User interface lang
5115 const CompactEncDet::TextCorpusType corpus_type,
5116 bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
5117 Encoding* second_best_enc) {
5118 *bytes_consumed = 0;
5119 *is_reliable = false;
5120 *second_best_enc = ASCII_7BIT;
5121
5122 if (text_length == 0) {
5123 // Follow the spec. Text might be NULL.
5124 *is_reliable = true;
5125 return ISO_8859_1;
5126 }
5127
5128 // For very short (20-50 byte) input strings that are highly likely to be
5129 // all printable ASCII, our startup overhead might dominate. We have to do the
5130 // full detection if the ISO-2022-xx, HZ, or UTF-7 encodings are possible.
5131 // Otherwise, we can do a quick scan for printable ASCII.
5132 if ((text_length <= 500) && ignore_7bit_mail_encodings &&
5133 QuickPrintableAsciiScan(text, text_length)) {
5134 *is_reliable = true;
5135 return ASCII_7BIT;
5136 }
5137
5138 // Go for the full boat detection
5139 DetectEncodingState destate;
5140 InitDetectEncodingState(&destate);
5141
5142 std::unique_ptr<DetailEntry[]> scoped_debug_data;
5143 if (FLAGS_enc_detect_detail) {
5144 // Allocate max 10 details per bigram
5145 scoped_debug_data.reset(new DetailEntry[kMaxPairs * 10]);
5146 destate.debug_data = scoped_debug_data.get();
5147 // NOTE: destate and scoped_debug_data have exactly the same scope
5148 // All other FLAGS_enc_detect_detail tests use destate.debug_data != NULL
5149 }
5150
5151 // Get text length limits
5152 // Typically, we scan the first 16KB looking for all encodings, then
5153 // scan the rest (up to 256KB) a bit faster by no longer looking for
5154 // interesting bytes below 0x80. This allows us to skip over runs of
5155 // 7-bit-ASCII much more quickly.
5156 int slow_len = minint(text_length, (FLAGS_enc_detect_slow_max_kb << 10));
5157 int fast_len = minint(text_length, (FLAGS_enc_detect_fast_max_kb << 10));
5158
5159 // Initialize pointers.
5160 // In general, we do not look at last 3 bytes of input in the fast scan
5161 // We do, however want to look at the last byte or so in the slow scan,
5162 // especially in the case of a very short text whose only interesting
5163 // information is a 3-byte UTF-8 character in the last three bytes.
5164 // If necessary, we fake a last bigram with 0x20 space as a pad byte.
5165 const uint8* isrc = reinterpret_cast<const uint8*>(text);
5166 const uint8* src = isrc;
5167 const uint8* srctextlimit = isrc + text_length;
5168 const uint8* srclimitslow2 = isrc + slow_len - 1;
5169 const uint8* srclimitfast2 = isrc + fast_len - 1;
5170 const uint8* srclimitfast4 = isrc + fast_len - 3;
5171 if (srclimitslow2 > srclimitfast2) {
5172 srclimitslow2 = srclimitfast2;
5173 }
5174 destate.initial_src = isrc;
5175 destate.limit_src = srclimitfast2 + 1; // May include last byte
5176 destate.prior_src = isrc;
5177 destate.last_pair = isrc - 2;
5178
5179 const char* scan_table = kTestPrintableAsciiTildePlus;
5180 if (ignore_7bit_mail_encodings) {
5181 // Caller wants to ignore UTF-7, HZ, ISO-2022-xx
5182 // Don't stop on + (for UTF-7), nor on ~ (for HZ)
5183 scan_table = kTestPrintableAscii;
5184 }
5185 int exit_reason = 0;
5186
5187 if (destate.debug_data != NULL) {
5188 BeginDetail(&destate);
5189 // Take any incoming watch encoding name and backmap to the corresponding
5190 // ranked enum value
5191 watch1_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch1);
5192 if (watch1_rankedenc >= 0) {
5193 fprintf(stderr, "/track-me %d def\n", watch1_rankedenc);
5194 }
5195
5196 watch2_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch2);
5197 if (watch2_rankedenc >= 0) {
5198 fprintf(stderr, "/track-me2 %d def\n", watch2_rankedenc);
5199 }
5200
5201 fprintf(stderr, "%% kDerateHintsBelow = %d\n", kDerateHintsBelow);
5202 }
5203 if (FLAGS_enc_detect_source) {
5204 PsSourceInit(kPsSourceWidth);
5205 PsSource(src, isrc, srctextlimit);
5206 PsMark(src, 4, isrc, 0);
5207 }
5208
5209 // Apply hints, if any, to probabilities
5210 // NOTE: Encoding probabilities are all zero at this point
5211 ApplyHints(url_hint,
5212 http_charset_hint,
5213 meta_charset_hint,
5214 encoding_hint,
5215 language_hint,
5216 corpus_type,
5217 &destate);
5218
5219 // NOTE: probabilities up to this point are subject to derating for
5220 // small numbers of bigrams.
5221 // Probability changes after this point are not derated.
5222
5223 // Do first 4 bytes to pick off strong markers
5224 InitialBytesBoost(isrc, text_length, &destate);
5225
5226 bool ignored_some_tag_text = false;
5227 int tag_text_bigram_count = 0;
5228
5229 // Slower loop, approx 500 MB/sec (2.8 GHz P4)
5230 // ASSERT(srclimitslow2 <= srclimitfast2);
5231 //====================================
5232 DoMoreSlowLoop:
5233 while (src < srclimitslow2) {
5234 // Skip to next interesting byte (this is the slower part)
5235 while (src < srclimitslow2) {
5236 uint8 uc = *src++;
5237 if (scan_table[uc] != 0) {exit_reason = scan_table[uc]; src--; break;}
5238 }
5239
5240 if (src < srclimitslow2) {
5241 if (FLAGS_enc_detect_source) {
5242 PsSource(src, isrc, srctextlimit); // don't mark yet
5243 }
5244
5245 int weightshift = 0;
5246 // In the first 16KB, derate new text run inside <title>...</title> and
5247 // inside <!-- ... -->
5248 if (////((destate.last_pair + 6) <= src) && // if beyond last one
5249 ////(tag_text_bigram_count < kMaxBigramsTagTitleText) &&
5250 (corpus_type == CompactEncDet::WEB_CORPUS) && // and web page
5251 !CEDFlagForceTags(flags)) { // and OK to skip
5252 ////if (TextInsideTag(destate.last_pair + 2, src, srclimitslow2)) {
5253 if (TextInsideTag(isrc, src, srclimitslow2)) {
5254 if (tag_text_bigram_count >= kMaxBigramsTagTitleText) {
5255 ignored_some_tag_text = true;
5256 src = SkipToTagEnd(destate.last_pair + 2, src, srclimitslow2);
5257 continue;
5258 } else {
5259 weightshift = kWeightshiftForTagTitleText;
5260 ++tag_text_bigram_count;
5261 }
5262 }
5263 }
5264 if (FLAGS_enc_detect_source) {
5265 PsMark(src, 2, isrc, weightshift);
5266 }
5267 // Saves byte pair and offset
5268 bool pruned = IncrementAndBoostPrune(src, srctextlimit - src,
5269 &destate, weightshift, exit_reason);
5270 // Advance; if inside tag, advance to end of tag
5271 if (weightshift == 0) {
5272 src += exit_reason; // 1 Ascii, 2 other
5273 } else {
5274 src += exit_reason; // 1 Ascii, 2 other
5275 //// src = SkipToTagEnd(destate.last_pair, src, srclimitslow2);
5276 }
5277
5278 if (pruned) {
5279 // Scoring and active encodings have been updated
5280 if (destate.done) {break;}
5281 // Check if all the reasons for the slow loop have been pruned
5282 // If so, go to fast loop
5283 if (!SevenBitActive(&destate)) {break;}
5284 }
5285 }
5286 }
5287 //====================================
5288
5289 // We reached the end of a slow scan, possibly because no more SevenBitActive,
5290 // or possibly are at end of source.
5291 // If we are exactly at the end of the source, make sure we look at the very
5292 // last byte.
5293 bool very_last_byte_incremented = false;
5294 if (src == (srctextlimit - 1)) {
5295 exit_reason = scan_table[*src];
5296 if (exit_reason != 0) {
5297 // The very last byte is an interesting byte
5298 // Saves byte pair and offset
5299 //printf("Interesting very last slow byte = 0x%02x\n", *src);
5300 IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason);
5301 very_last_byte_incremented = true;
5302 }
5303 }
5304
5305 if (FLAGS_enc_detect_source) {
5306 PsSource(src, isrc, srctextlimit);
5307 PsMark(src, 2, isrc, 0);
5308 }
5309 // Force a pruning based on whatever we have
5310 // Delete the seven-bit encodings if there is no evidence of them so far
5311 BoostPrune(src, &destate, PRUNE_SLOWEND);
5312
5313 if (!destate.done) {
5314 // If not clear yet on 7-bit-encodings and more bytes, do more slow
5315 if (SevenBitActive(&destate) && (src < srclimitfast2)) {
5316 // Increment limit by another xxxK
5317 slow_len += (FLAGS_enc_detect_slow_max_kb << 10);
5318 srclimitslow2 = isrc + slow_len - 1;
5319 if (srclimitslow2 > srclimitfast2) {
5320 srclimitslow2 = srclimitfast2;
5321 }
5322 if (!UTF7OrHzActive(&destate)) {
5323 // We can switch to table that does not stop on + ~
5324 scan_table = kTestPrintableAscii;
5325 }
5326 goto DoMoreSlowLoop;
5327 }
5328
5329
5330 exit_reason = 2;
5331 // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec
5332 //====================================
5333 while (src < srclimitfast2) {
5334 // Skip to next interesting byte (this is the faster part)
5335 while (src < srclimitfast4) {
5336 uint32 u32 = *reinterpret_cast<const uint32*>(src);
5337 src+= 4;
5338 if ((u32 & 0x80808080) != 0) {src -= 4; break;}
5339 }
5340 while (src < srclimitfast2) {
5341 uint8 uc = *src++;
5342 if (static_cast<signed char>(uc) < 0) {src--; break;}
5343 }
5344
5345 if (src < srclimitfast2) {
5346 if (FLAGS_enc_detect_source) {
5347 PsSource(src, isrc, srctextlimit);
5348 PsMark(src, 2, isrc, 0);
5349 }
5350 // saves byte pair and offset
5351 bool pruned = IncrementAndBoostPrune(src, srctextlimit - src,
5352 &destate, 0, exit_reason);
5353 src += exit_reason; // 1 Ascii, 2 other
5354 if (pruned) {
5355 // Scoring and active encodings have been updated
5356 if (destate.done) {break;}
5357 }
5358 }
5359 }
5360 //====================================
5361 // We reached the end of fast scan
5362
5363 // If we are exactly at the end of the source, make sure we look at the very
5364 // last byte.
5365 if (src == (srctextlimit - 1) && !very_last_byte_incremented) {
5366 exit_reason = scan_table[*src];
5367 if (exit_reason != 0) {
5368 // The very last byte is an interesting byte
5369 // Saves byte pair and offset
5370 //printf("Interesting very last fast byte = 0x%02x\n", *src);
5371 IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason );
5372 very_last_byte_incremented = true;
5373 }
5374 }
5375
5376 } // End if !done
5377
5378 if (FLAGS_enc_detect_source) {
5379 PsSource(src, isrc, srctextlimit);
5380 PsMark(src, 2, isrc, 0);
5381 }
5382 // Force a pruning based on whatever we have
5383 BoostPrune(src, &destate, PRUNE_FINAL);
5384
5385 if (FLAGS_enc_detect_summary) {
5386 DumpSummary(&destate, AsciiPair, 32);
5387 DumpSummary(&destate, OtherPair, 32);
5388 }
5389 if (FLAGS_enc_detect_source) {
5390 PsSourceFinish();
5391 }
5392 if (destate.debug_data != NULL) {
5393 //// DumpDetail(&destate);
5394 }
5395
5396
5397 if (ignored_some_tag_text &&
5398 (kMapToEncoding[destate.top_rankedencoding] == ASCII_7BIT)) {
5399 // There were some interesting bytes, but only in tag text.
5400 // Recursive call to reprocess looking at the tags this time.
5401
5402 if (destate.debug_data != NULL) {
5403 SetDetailsEncLabel(&destate, ">> Recurse/tags");
5404 // Print the current chart before recursive call
5405 DumpDetail(&destate);
5406
5407 char buff[32];
5408 snprintf(buff, sizeof(buff), ">> Recurse for tags");
5409 PsRecurse(buff);
5410 }
5411
5412 // Recursive call for high bytes in tags [no longer used, 1/16 tag score]
5413 Encoding enc2 = InternalDetectEncoding(
5414 kCEDForceTags, // force
5415 text,
5416 text_length,
5417 url_hint,
5418 http_charset_hint,
5419 meta_charset_hint,
5420 encoding_hint,
5421 language_hint,
5422 corpus_type,
5423 ignore_7bit_mail_encodings,
5424 bytes_consumed,
5425 is_reliable,
5426 second_best_enc);
5427
5428 if (destate.debug_data != NULL) {
5429 // Show winning encoding and dump PostScript
5430 char buff[32];
5431 snprintf(buff, sizeof(buff), "=2 %s", MyEncodingName(enc2));
5432 SetDetailsEncProb(&destate,
5433 0,
5434 CompactEncDet::BackmapEncodingToRankedEncoding(enc2),
5435 buff);
5436 DumpDetail(&destate);
5437 }
5438
5439 return enc2;
5440 }
5441
5442
5443 // If the detected encoding does not match default/hints, or if the hints
5444 // conflict with each other, mark as unreliable. This can be used to trigger
5445 // further scoring.
5446 // Three buckets of input documents;
5447 // ~19% of the web no hints, and top == 7bit, Latin1, or CP1252
5448 // ~79% of the web one or more hints, all same encoding X and top == X
5449 // ~ 2% of the web one or more hints that are inconsistent
5450
5451 Encoding top_enc = kMapToEncoding[destate.top_rankedencoding];
5452 Encoding one_hint = destate.http_hint;
5453 if ((one_hint == UNKNOWN_ENCODING) &&
5454 (destate.meta_hint != UNKNOWN_ENCODING)) {
5455 one_hint = destate.meta_hint;
5456 }
5457 if ((one_hint == UNKNOWN_ENCODING) &&
5458 (destate.bom_hint != UNKNOWN_ENCODING)) {
5459 one_hint = destate.bom_hint;
5460 }
5461
5462 bool found_compatible_encoding = true;
5463 if (one_hint == UNKNOWN_ENCODING) {
5464 // [~14% of the web] No hints, and top == 7bit, Latin1, or CP1252
5465 if (!CompatibleEnc(ISO_8859_1, top_enc)) {
5466 found_compatible_encoding = false;
5467 // If there is nothing but a TLD hint and its top encoding matches, OK
5468 if ((destate.tld_hint != UNKNOWN_ENCODING) &&
5469 CompatibleEnc(destate.tld_hint, top_enc)) {
5470 found_compatible_encoding = true;
5471 }
5472 }
5473 } else if (CompatibleEnc(one_hint, destate.http_hint) &&
5474 CompatibleEnc(one_hint, destate.meta_hint) &&
5475 CompatibleEnc(one_hint, destate.bom_hint)) {
5476 // [~83% of the web] One or more hints, all same encoding X and top == X
5477 if (!CompatibleEnc(one_hint, top_enc)) {
5478 // [~ 2% of the web] Oops, not the declared encoding
5479 found_compatible_encoding = false;
5480 }
5481 } else {
5482 // [~ 3% of the web] Two or more hints that are inconsistent
5483 one_hint = UNKNOWN_ENCODING;
5484 found_compatible_encoding = false;
5485 }
5486
5487 // If we turned Latin1 into Latin2 or 7 via trigrams, don't fail it here
5488 if (destate.do_latin_trigrams) {
5489 if (CompatibleEnc(kMapToEncoding[F_Latin1], top_enc) ||
5490 CompatibleEnc(kMapToEncoding[F_Latin2], top_enc) ||
5491 CompatibleEnc(kMapToEncoding[F_CP1250], top_enc) ||
5492 CompatibleEnc(kMapToEncoding[F_ISO_8859_13], top_enc)) {
5493 found_compatible_encoding = true;
5494 destate.reliable = true;
5495 }
5496 }
5497
5498 // If top encoding is not compatible with the hints, but it is reliably
5499 // UTF-8, accept it anyway.
5500 // This will perform badly with mixed UTF-8 prefix plus another encoding in
5501 // the body if done too early, so we want to be rescanning.
5502 if (!found_compatible_encoding &&
5503 destate.reliable &&
5504 NoHintsCloseEnoughCompatible(top_enc) &&
5505 (destate.next_interesting_pair[OtherPair] >= kStrongPairs) &&
5506 CEDFlagRescanning(flags)) {
5507 found_compatible_encoding = true;
5508 }
5509
5510 // Hold off on this so Rescan() can see if the original encoding was reliable
5511 //if (!found_compatible_encoding) {
5512 // destate.reliable = false;
5513 //}
5514
5515 // If unreliable, try rescoring to separate some encodings
5516 if (!destate.reliable || !found_compatible_encoding) {
5517 top_enc = Rescore(top_enc, isrc, srctextlimit, &destate);
5518 }
5519
5520 *second_best_enc = kMapToEncoding[destate.second_top_rankedencoding];
5521
5522 // If unreliable, and not already rescanning,
5523 // rescan middle of document to see if we can get a better
5524 // answer. Rescan is only worthwhile if there are ~200 bytes or more left,
5525 // since the detector takes as much as 96 bytes of bigrams to decide.
5526 //
5527 // CANNOT retry ISO-2022-xx HZ etc. because no declaration escape at the front
5528 // or we may land in the middle of some partial state. Skip them all.
5529 //
5530 if ((!destate.reliable || !found_compatible_encoding) &&
5531 !CEDFlagRescanning(flags) &&
5532 !SevenBitEncoding(top_enc)) {
5533 top_enc = Rescan(top_enc,
5534 isrc,
5535 src,
5536 srctextlimit,
5537 url_hint,
5538 http_charset_hint,
5539 meta_charset_hint,
5540 encoding_hint,
5541 language_hint,
5542 corpus_type,
5543 ignore_7bit_mail_encodings,
5544 &destate);
5545 } else {
5546 if (!found_compatible_encoding) {
5547 destate.reliable = false;
5548 }
5549 }
5550
5551 if (destate.debug_data != NULL) {
5552 // Dump PostScript
5553 DumpDetail(&destate);
5554 }
5555
5556 *bytes_consumed = src - isrc + 1; // We looked 1 byte beyond src
5557 *is_reliable = destate.reliable;
5558 return top_enc;
5559 }
5560
5561 Encoding CompactEncDet::DetectEncoding(
5562 const char* text, int text_length, const char* url_hint,
5563 const char* http_charset_hint, const char* meta_charset_hint,
5564 const int encoding_hint,
5565 const Language language_hint, // User interface lang
5566 const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
5567 int* bytes_consumed, bool* is_reliable) {
5568 if (FLAGS_ced_echo_input) {
5569 string temp(text, text_length);
5570 fprintf(stderr, "CompactEncDet::DetectEncoding()\n%s\n\n", temp.c_str());
5571 }
5572
5573 if (FLAGS_counts) {
5574 encdet_used = 0;
5575 rescore_used = 0;
5576 rescan_used = 0;
5577 robust_used = 0;
5578 looking_used = 0;
5579 doing_used = 0;
5580 ++encdet_used;
5581 }
5582 if (FLAGS_dirtsimple) {
5583 // Just count first 64KB bigram encoding probabilities for each encoding
5584 int robust_renc_list_len; // Number of active encodings
5585 int robust_renc_list[NUM_RANKEDENCODING]; // List of ranked encodings
5586 int robust_renc_probs[NUM_RANKEDENCODING]; // List of matching probs
5587
5588 for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
5589 robust_renc_list[i] = i;
5590 }
5591 robust_renc_list_len = NUM_RANKEDENCODING;
5592
5593 RobustScan(text, text_length,
5594 robust_renc_list_len, robust_renc_list, robust_renc_probs);
5595
5596 // Pick off best encoding
5597 int best_prob = -1;
5598 Encoding enc = UNKNOWN_ENCODING;
5599 for (int i = 0; i < robust_renc_list_len; ++i) {
5600 if (best_prob < robust_renc_probs[i]) {
5601 best_prob = robust_renc_probs[i];
5602 enc = kMapToEncoding[robust_renc_list[i]];
5603 }
5604 }
5605
5606 *bytes_consumed = minint(text_length, (kMaxKBToRobustScan << 10));
5607 *is_reliable = true;
5608 if (FLAGS_counts) {
5609 printf("CEDcounts ");
5610 while (encdet_used--) {printf("encdet ");}
5611 while (rescore_used--) {printf("rescore ");}
5612 while (rescan_used--) {printf("rescan ");}
5613 while (robust_used--) {printf("robust ");}
5614 while (looking_used--) {printf("looking ");}
5615 while (doing_used--) {printf("doing ");}
5616 printf("\n");
5617 }
5618
5619 return enc;
5620 }
5621
5622 Encoding second_best_enc;
5623 Encoding enc = InternalDetectEncoding(kCEDNone,
5624 text,
5625 text_length,
5626 url_hint,
5627 http_charset_hint,
5628 meta_charset_hint,
5629 encoding_hint,
5630 language_hint, // User interface lang
5631 corpus_type,
5632 ignore_7bit_mail_encodings,
5633 bytes_consumed,
5634 is_reliable,
5635 &second_best_enc);
5636 if (FLAGS_counts) {
5637 printf("CEDcounts ");
5638 while (encdet_used--) {printf("encdet ");}
5639 while (rescore_used--) {printf("rescore ");}
5640 while (rescan_used--) {printf("rescan ");}
5641 while (robust_used--) {printf("robust ");}
5642 while (looking_used--) {printf("looking ");}
5643 while (doing_used--) {printf("doing ");}
5644 printf("\n");
5645 }
5646 return enc;
5647 }
5648
5649
5650 // Return top encoding hint for given string
5651 Encoding CompactEncDet::TopEncodingOfLangHint(const char* name) {
5652 string normalized_lang = MakeChar8(string(name));
5653 int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,
5654 normalized_lang.c_str());
5655 if (n < 0) {return UNKNOWN_ENCODING;}
5656
5657 // Charset is eight bytes, probability table is eight bytes
5658 int toprankenc =
5659 TopCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey],
5660 kMaxLangVector);
5661 return kMapToEncoding[toprankenc];
5662 }
5663
5664 // Return top encoding hint for given string
5665 Encoding CompactEncDet::TopEncodingOfTLDHint(const char* name) {
5666 string normalized_tld = MakeChar4(string(name));
5667 int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
5668 normalized_tld.c_str());
5669 if (n < 0) {return UNKNOWN_ENCODING;}
5670
5671 // TLD is four bytes, probability table is 12 bytes
5672 int toprankenc =
5673 TopCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey],
5674 kMaxTldVector);
5675 return kMapToEncoding[toprankenc];
5676 }
5677
5678 // Return top encoding hint for given string
5679 Encoding CompactEncDet::TopEncodingOfCharsetHint(const char* name) {
5680 string normalized_charset = MakeChar44(string(name));
5681 int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,
5682 normalized_charset.c_str());
5683 if (n < 0) {return UNKNOWN_ENCODING;}
5684
5685 // Charset is eight bytes, probability table is eight bytes
5686 int toprankenc =
5687 TopCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharsetKey],
5688 kMaxCharsetVector);
5689 return kMapToEncoding[toprankenc];
5690 }
5691
5692 const char* CompactEncDet::Version(void) {
5693 return kVersion;
5694 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698